In [2]:
import numpy as np, numpy.random as npr, random as r
import tensorflow as tf  
from NavTask import NavigationTask
import tensorflow.contrib.rnn as rnn

In [72]:
def linear(x, size, name, initializer=None, bias_init=0):
    print("x shape",x.get_shape()[1])
    print("size", size)
    w = tf.get_variable(name + "/w", [x.get_shape()[1], size], initializer=initializer)
    b = tf.get_variable(name + "/b", [size], initializer=tf.constant_initializer(bias_init))
    return tf.matmul(x, w) + b

def normalized_columns_initializer(std=1.0):
    def _initializer(shape, dtype=None, partition_info=None):
        out = np.random.randn(*shape).astype(np.float32)
        out *= std / np.sqrt(np.square(out).sum(axis=0, keepdims=True))
        return tf.constant(out)
    return _initializer

def categorical_sample(logits, d):
    value = tf.squeeze(tf.multinomial(logits - tf.reduce_max(logits, [1], keep_dims=True), 1), [1])
    return tf.one_hot(value, d)

class LSTMPolicy(object):

    def __init__(self, ob_space, ac_space):
        
        print("obs space", ob_space)
        # x is the observations/states for the length of the episode
        self.x = x = tf.placeholder(tf.float32,[None] + list(ob_space), name="x")
        print("x shape", x)
        size = 256
        
        # introduce a "fake" batch dimension of 1 to do LSTM over time dim
        x = tf.expand_dims(x, [0])
        
        print("x shape", x)
        lstm = rnn.BasicLSTMCell(size, state_is_tuple=True)
        self.state_size = lstm.state_size

        #Step size for truncated backprop using the ob_space, basically [batch_size]
        self.step_size = step_size = tf.shape(self.x)[:1]
        print("step_size", step_size)
        
        # defining the cell state and output state of the LSTM
        c_init = np.zeros((1, lstm.state_size.c), np.float32)
        h_init = np.zeros((1, lstm.state_size.h), np.float32)
        self.state_init = [c_init, h_init]
        
        #defining placeholders so that we can input during training and inference, Example: during rollout you want to input these values 
        c_in = tf.placeholder(tf.float32, [1, lstm.state_size.c], name='c_in')
        h_in = tf.placeholder(tf.float32, [1, lstm.state_size.h], name='h_in')
        self.state_in = [c_in, h_in]
        
        state_in = rnn.LSTMStateTuple(c_in, h_in)
        
        lstm_outputs, lstm_state = tf.nn.dynamic_rnn(
            lstm, x, initial_state=state_in, sequence_length=step_size,
            time_major=False)
        lstm_c, lstm_h = lstm_state
        
        print(lstm_outputs)
        x = tf.reshape(lstm_outputs, [-1, size])
        print("x as output", x)
        
        # vf == value-function?? is one-dimenstion, so basically value for the given state? 
        self.vf = tf.reshape(linear(x, 1, "value", normalized_columns_initializer(1.0)), [-1])
        
        # can be used to later to get the values 
        self.state_out = [lstm_c[:1, :], lstm_h[:1, :]]

        # [0, :] means pick action of first state from batch. Hardcoded b/c
        # batch=1 during rollout collection. Its not used during batch training.
        
        self.logits = linear(x, ac_space, "action", normalized_columns_initializer(0.01))
        print("logits", self.logits)
        self.sample = categorical_sample(self.logits, ac_space)[0, :]
        print("sample", self.sample)
        self.probs = tf.nn.softmax(self.logits, dim=-1)[0, :]
        print("self.probs", self.probs)
        
     
        self.log_prob = log_prob = tf.nn.log_softmax(self.logits,  dim=-1)
        self.prob_tf = tf.nn.softmax(self.logits)
        
        # training part of graph
        self.ac = tf.placeholder(tf.float32, [None, ac_space], name="ac")
        self.adv = tf.placeholder(tf.float32, [None], name="adv")
        self.cumaltiveReward = tf.reduce_sum(self.adv)
        self.entropy =  tf.reduce_mean(tf.reduce_sum(self.prob_tf  * log_prob, 1))
  
        self.cross_entropy_loss = tf.nn.softmax_cross_entropy_with_logits(logits=self.log_prob, labels=self.ac)
        self.loss =  tf.reduce_mean(self.cross_entropy_loss) + 0.1*self.entropy 
        self.gradients = tf.train.AdamOptimizer(0.001).compute_gradients(self.loss)
        for i, (grad, var) in enumerate(self.gradients):
            if grad is not None:
                self.gradients[i] = (grad * self.cumaltiveReward, var)
                
        self._train = tf.train.AdamOptimizer(0.001).apply_gradients(self.gradients)
       
    
    def get_initial_features(self):
        # Call this function to get reseted lstm memory cells
        return self.state_init

    def act(self, ob, c, h):
        sess = tf.get_default_session()
        return sess.run([self.sample, self.vf] + self.state_out,
                        {self.x: [ob], self.state_in[0]: c, self.state_in[1]: h})
        return sess.run(self.pred, {self.input:x})

    def act_inference(self, ob, c, h):
        sess = tf.get_default_session()
        return sess.run([self.probs, self.sample, self.vf] + self.state_out,
                        {self.x: [ob], self.state_in[0]: c, self.state_in[1]: h})

    def value(self, ob, c, h):
        sess = tf.get_default_session()
        return sess.run(self.vf, {self.x: [ob], self.state_in[0]: c, self.state_in[1]: h})[0]

    def train_step(self, obs, acts, advantages, c, h):
        sess = tf.get_default_session()
        batch_feed = { self.x: obs, self.ac: acts, self.adv: advantages, self.state_in[0]: c, self.state_in[1]: h}
        return sess.run([self._train, self.loss, self.logits], feed_dict=batch_feed)


In [73]:
def inference(agent, hparams):
    #"Runs one episode"
    episode_length = hparams['epiode_length']
    env = NavigationTask(3,3)
    obs, acts, rews = [], [], []
    c, h = agent.get_initial_features()
    for i in range(0, episode_length): 
        state = env.getStateRep()
        obs.append(state)
        
        actionProb, sampleAction , _ , c, h  = agent.act_inference(state, c, h)
        
        action = actionProb.argmax()
        sampleActionIndex = sampleAction.argmax()
        
        env.performAction(action)
        newState  = env.getStateRep()
        reward = env.getReward(distance_based=True) 
        
        acts.append(action)
        rews.append(reward)
        
    return obs, acts, rews  

In [74]:
def policyRollout(agent, hparams):
    
    #"Runs one episode"
    episode_length = hparams['epiode_length']
    env = NavigationTask(3,3)
    obs, acts, rews = [], [], []
    c, h = agent.get_initial_features()
    
    for i in range(0, episode_length): 
        
        state = env.getStateRep()
        obs.append(state)
        actionProb, sampleAction , _ , c, h  = agent.act_inference(state, c, h)
      
        action = actionProb.argmax()
        sampleActionIndex = sampleAction.argmax()
        
        env.performAction(sampleActionIndex)
        newState  = env.getStateRep()
        reward = env.getReward(distance_based=True) 
        
        acts.append(sampleAction)
        rews.append(reward)
        
    return obs, acts, rews 

In [77]:
def main():
    # hyper parameters
    env = NavigationTask()
    input_size = np.shape(env.getStateRep())
    hparams = {
            'input_size': input_size,
            'num_actions': 10,
            'learning_rate': 0.1,
            'epiode_length': 6
    }

    # environment params
    eparams = {
            'num_batches': 10,
            'ep_per_batch': 1000
    }

    with tf.Graph().as_default(), tf.Session() as sess:

        pi = LSTMPolicy(hparams['input_size'], hparams['num_actions'])

        sess.run(tf.initialize_all_variables())
        
        for batch in range(0, eparams['num_batches']):
            print('=====\nBATCH {}\n===='.format(batch))
            num = 0
            for i in range(0, eparams['ep_per_batch']):
                obs, acts, rews = policyRollout(pi, hparams)
                c, h = pi.get_initial_features()
                num += 1 if 1 in rews else 0
                pi.train_step(obs, acts, rews, c, h)
            print("number of times reward", num)
            c, h = pi.get_initial_features()
            obs, acts, rews = policyRollout(pi, hparams)
            print("loss",pi.train_step(obs, acts, rews, c, h))
            print("Observation", obs)
            print("acts", [np.argmax(a) for a in acts])
            print("rews", rews)
        print(inference(pi, hparams))
        

In [78]:
main()

obs space (8,)
x shape Tensor("x:0", shape=(?, 8), dtype=float32)
x shape Tensor("ExpandDims:0", shape=(1, ?, 8), dtype=float32)
step_size Tensor("strided_slice:0", shape=(1,), dtype=int32)
Tensor("rnn/transpose:0", shape=(1, ?, 256), dtype=float32)
x as output Tensor("Reshape:0", shape=(?, 256), dtype=float32)
x shape 256
size 1
x shape 256
size 10
logits Tensor("add_1:0", shape=(?, 10), dtype=float32)
sample Tensor("strided_slice_3:0", shape=(10,), dtype=float32)
self.probs Tensor("strided_slice_4:0", shape=(10,), dtype=float32)
Instructions for updating:
Use `tf.global_variables_initializer` instead.
=====
BATCH 0
====
number of times reward 140
loss [None, 2.1798012, array([[ -2.35246614e-01,  -2.35674791e-02,   2.49333575e-01,
         -7.00309455e-01,  -4.68993247e-01,  -1.22918971e-01,
          2.02588990e-01,   3.78602207e-01,   1.22639745e-01,
          2.67338604e-01],
       [ -2.59991080e-01,   9.79101062e-02,   5.88574409e-01,
         -1.17616677e+00,  -1.00992298e+00,  