In [2]:
import numpy as np, numpy.random as npr, random as r
import tensorflow as tf  
from NavTask import NavigationTask
import tensorflow.contrib.rnn as rnn

In [39]:
#Getting data from the env 
data = NavigationTask.generateRandomRewardTrajectories(20000,10,verbose=False)

In [40]:
def layer(x, size, name, initializer=None, bias_init=0):
    w = tf.get_variable(name + "/w", [x.get_shape()[1], size], initializer=initializer)
    b = tf.get_variable(name + "/b", [size], initializer=tf.constant_initializer(bias_init))
    return tf.nn.relu(tf.matmul(x, w) + b)

def linear(x, size, name, initializer=None, bias_init=0):
    w = tf.get_variable(name + "/w", [x.get_shape()[1], size], initializer=initializer)
    b = tf.get_variable(name + "/b", [size], initializer=tf.constant_initializer(bias_init))
    return tf.matmul(x, w) + b

def normalized_columns_initializer(std=1.0):
    def _initializer(shape, dtype=None, partition_info=None):
        out = np.random.randn(*shape).astype(np.float32)
        out *= std / np.sqrt(np.square(out).sum(axis=0, keepdims=True))
        return tf.constant(out)
    return _initializer


In [45]:
#State ---> Reward 
class GreedyRewardFunction(object):
    
    def __init__(self, obs_space, h_size=256):
        
        # Input: [Batch observations_space]
        self.input = tf.placeholder(tf.float32, [None] + list(obs_space))
        
        hidden1 = layer(self.input, h_size, "hidden1",  normalized_columns_initializer(1.0))
        hidden2 = layer(hidden1, h_size, "hidden2",  normalized_columns_initializer(1.0))
        self.output = linear(hidden2, 1, "linear", normalized_columns_initializer(1.0))
        # output: [Batch reward[1]]
        
        self.targetRewards = tf.placeholder(tf.float32, [None])
        
        self.loss = tf.reduce_mean(tf.square(self.output - self.targetRewards))
        self.optimizer = tf.train.AdamOptimizer(learning_rate=0.001).minimize(self.loss)
        
    def train(self, obs, rewards):
        sess = tf.get_default_session()
        batch_feed = {self.input: obs, self.targetRewards: rewards}
        return sess.run([self.loss, self.optimizer], feed_dict=batch_feed)
        
    def inference(self, obs):
        sess = tf.get_default_session()
        batch_feed = {self.input: [obs]}
        return sess.run([self.output], feed_dict=batch_feed)
        

In [46]:
def main():
    # hyper parameters
    env = NavigationTask()
    input_size = np.shape(env.getStateRep())
    hparams = {
            'input_size': input_size,
    }

    # environment params
    eparams = {
            'num_batches': 1,
            'ep_per_batch': 1
    }


    with tf.Graph().as_default(), tf.Session() as sess:

        v = GreedyRewardFunction(hparams['input_size'])

        sess.run(tf.initialize_all_variables())
        
        for batch in range(0, eparams['num_batches']):
            #print('=====\nBATCH {}\n===='.format(batch))
            dataset = data[batch]
            for i in range(0, eparams['ep_per_batch']):
                obs, rewards = dataset
                v.train(obs, rewards)
#             print("loss",pi.trai(n_step(obs, acts, rews))
            #print("loss", v.train(obs, rewards))
        print("Goal state", v.inference([14,14,0,1,0,0,14,14]))
        #print(policyRollout(pi, hparams))

In [47]:
obs, rewards = data[0]
print(obs)

[array([  6.,   1.,   0.,   0.,   1.,   0.,  14.,  14.]), array([  6.,   1.,   0.,   1.,   0.,   0.,  14.,  14.]), array([  6.,   1.,   0.,   0.,   0.,   1.,  14.,  14.]), array([  6.,   1.,   1.,   0.,   0.,   0.,  14.,  14.]), array([  6.,   4.,   1.,   0.,   0.,   0.,  14.,  14.]), array([  6.,   9.,   1.,   0.,   0.,   0.,  14.,  14.]), array([  6.,  12.,   1.,   0.,   0.,   0.,  14.,  14.]), array([  6.,  12.,   1.,   0.,   0.,   0.,  14.,  14.])]


In [48]:
main()

Instructions for updating:
Use `tf.global_variables_initializer` instead.
Goal state [array([[ 5.51589012]], dtype=float32)]
