In [1]:
import numpy as np, numpy.random as npr, random as r
import tensorflow as tf  
from NavTask import NavigationTask
import tensorflow.contrib.rnn as rnn

In [23]:
def linear(x, size, name, initializer=None, bias_init=0):
    """Apply an affine layer ``x @ w + b`` to a 2-D tensor.

    Two variables are fetched/created through ``tf.get_variable``:
    ``<name>/w`` with shape ``[x.get_shape()[1], size]`` and ``<name>/b``
    with shape ``[size]``.

    Args:
        x: input tensor; its second static dimension is used as the input width.
        size: number of output units.
        name: prefix for the weight and bias variable names.
        initializer: initializer passed to the weight variable (may be None).
        bias_init: constant value used to initialise the bias.

    Returns:
        The tensor ``tf.matmul(x, w) + b``.
    """
    in_dim = x.get_shape()[1]
    # Debug output of the layer dimensions at graph-construction time.
    print("x shape", in_dim)
    print("size", size)
    weights = tf.get_variable(name + "/w", [in_dim, size], initializer=initializer)
    bias = tf.get_variable(
        name + "/b", [size], initializer=tf.constant_initializer(bias_init))
    projected = tf.matmul(x, weights)
    return projected + bias

def normalized_columns_initializer(std=1.0):
    """Build an initializer whose columns each have L2 norm ``std``.

    Args:
        std: target L2 norm of every column (norm taken over axis 0).

    Returns:
        A callable ``(shape, dtype=None, partition_info=None)`` that returns
        a ``tf.constant`` holding the float32 weights.
    """
    def _initializer(shape, dtype=None, partition_info=None):
        # dtype and partition_info are accepted for signature compatibility
        # but are not used: the result is always float32.
        weights = np.random.randn(*shape).astype(np.float32)
        column_norms = np.sqrt(np.square(weights).sum(axis=0, keepdims=True))
        # Rescale in place so that each column ends up with norm `std`.
        weights *= std / column_norms
        return tf.constant(weights)
    return _initializer

def categorical_sample(logits, d):
    """Sample one category per row of ``logits`` and return it one-hot encoded.

    Args:
        logits: 2-D tensor of unnormalised log-probabilities, one row per sample.
        d: depth of the one-hot encoding that is returned.

    Returns:
        A tensor with one one-hot row of depth ``d`` per row of ``logits``.
    """
    # Subtracting the per-row maximum does not change the sampled distribution
    # (softmax is shift-invariant); presumably done for numerical stability.
    row_max = tf.reduce_max(logits, [1], keep_dims=True)
    shifted = logits - row_max
    draws = tf.multinomial(shifted, 1)
    value = tf.squeeze(draws, [1])
    return tf.one_hot(value, d)

class LSTMPolicy(object):
    """LSTM policy network with a policy-gradient training op (TF1 graph mode).

    The graph feeds a sequence of observations through a single
    ``BasicLSTMCell`` (using a fake batch dimension of 1, so the time axis is
    the leading axis of ``x``), then applies two linear heads: ``vf`` (one
    scalar per timestep) and ``logits`` (``ac_space`` values per timestep).

    All ``sess.run`` helpers use ``tf.get_default_session()``, so they must be
    called while a session is installed as default.
    """

    def __init__(self, ob_space, ac_space, size=256, learning_rate=0.1):
        """Build the inference and training graph.

        Args:
            ob_space: shape of a single observation (any iterable of ints).
            ac_space: number of discrete actions.
            size: number of LSTM units (default 256, the previous fixed value).
            learning_rate: Adam learning rate (default 0.1, the previous
                fixed value).
        """
        print("obs space", ob_space)
        # x is the observations/states for the length of the episode
        self.x = x = tf.placeholder(tf.float32, [None] + list(ob_space), name="x")
        print("x shape", x)

        # introduce a "fake" batch dimension of 1 to do LSTM over time dim
        x = tf.expand_dims(x, [0])

        print("x shape", x)
        lstm = rnn.BasicLSTMCell(size, state_is_tuple=True)
        self.state_size = lstm.state_size

        # Sequence length fed to dynamic_rnn: the leading dimension of self.x,
        # kept as a length-1 vector to match the fake batch of 1.
        self.step_size = step_size = tf.shape(self.x)[:1]
        print("step_size", step_size)

        # Zero-valued cell state and output state used to reset the LSTM.
        c_init = np.zeros((1, lstm.state_size.c), np.float32)
        h_init = np.zeros((1, lstm.state_size.h), np.float32)
        self.state_init = [c_init, h_init]

        # Placeholders so the recurrent state can be fed during training and
        # inference, e.g. carried from step to step during a rollout.
        c_in = tf.placeholder(tf.float32, [1, lstm.state_size.c], name='c_in')
        h_in = tf.placeholder(tf.float32, [1, lstm.state_size.h], name='h_in')
        self.state_in = [c_in, h_in]

        state_in = rnn.LSTMStateTuple(c_in, h_in)

        lstm_outputs, lstm_state = tf.nn.dynamic_rnn(
            lstm, x, initial_state=state_in, sequence_length=step_size,
            time_major=False)
        lstm_c, lstm_h = lstm_state

        print(lstm_outputs)
        # Drop the fake batch dimension: one row of LSTM features per timestep.
        x = tf.reshape(lstm_outputs, [-1, size])
        print("x as output", x)

        # One scalar per timestep from the "value" head (presumably the
        # value-function estimate).
        # NOTE(review): vf does not appear in self.loss below, so train_step
        # never updates the "value" head variables.
        self.vf = tf.reshape(linear(x, 1, "value", normalized_columns_initializer(1.0)), [-1])

        # Final recurrent state, to be fed back in through state_in.
        self.state_out = [lstm_c[:1, :], lstm_h[:1, :]]

        self.logits = linear(x, ac_space, "action", normalized_columns_initializer(0.01))
        print("logits", self.logits)
        # [0, :] picks the action of the first row. Hardcoded because a single
        # observation is fed during rollout collection; not used in training.
        self.sample = categorical_sample(self.logits, ac_space)[0, :]
        print("sample", self.sample)
        self.probs = tf.nn.softmax(self.logits, dim=-1)[0, :]
        print("self.probs", self.probs)

        # Log-probabilities / probabilities over all timesteps, for training.
        self.log_prob = log_prob = tf.nn.log_softmax(self.logits, dim=-1)
        self.prob_tf = tf.nn.softmax(self.logits)

        # training part of graph
        self.ac = tf.placeholder(tf.float32, [None, ac_space], name="ac")
        self.adv = tf.placeholder(tf.float32, [None], name="adv")

        # With one-hot rows in self.ac this selects the log-probability of the
        # action taken at each timestep.
        self.crossEntropy = tf.reduce_sum(log_prob * self.ac, 1)
        self.entropy = entropy = - tf.reduce_mean(tf.reduce_sum(self.prob_tf * log_prob, 1))
        # Policy-gradient loss with a small entropy bonus.
        self.loss = -tf.reduce_mean(self.crossEntropy * self.adv) - 0.001 * entropy

        # update
        optimizer = tf.train.AdamOptimizer(learning_rate)
        self._train = optimizer.minimize(self.loss)

    def get_initial_features(self):
        """Return the zeroed ``[c, h]`` LSTM state used to reset the memory."""
        return self.state_init

    def act(self, ob, c, h):
        """Run one step on a single observation.

        Returns:
            ``[sample, vf, c_out, h_out]`` as evaluated by the default session.
        """
        sess = tf.get_default_session()
        return sess.run([self.sample, self.vf] + self.state_out,
                        {self.x: [ob], self.state_in[0]: c, self.state_in[1]: h})

    def act_inference(self, ob, c, h):
        """Like ``act`` but also returns the action probabilities first.

        Returns:
            ``[probs, sample, vf, c_out, h_out]``.
        """
        sess = tf.get_default_session()
        return sess.run([self.probs, self.sample, self.vf] + self.state_out,
                        {self.x: [ob], self.state_in[0]: c, self.state_in[1]: h})

    def value(self, ob, c, h):
        """Return the ``vf`` output for a single observation."""
        sess = tf.get_default_session()
        return sess.run(self.vf, {self.x: [ob], self.state_in[0]: c, self.state_in[1]: h})[0]

    def train_step(self, obs, acts, advantages, c, h):
        """Run one optimizer step over an episode.

        Args:
            obs: sequence of observations, fed to ``x``.
            acts: sequence of action vectors of width ``ac_space``, fed to ``ac``.
            advantages: one weight per timestep, fed to ``adv``.
            c, h: initial LSTM state for the sequence.

        Returns:
            ``[None, loss]`` (the train op evaluates to ``None``).
        """
        sess = tf.get_default_session()
        batch_feed = {self.x: obs, self.ac: acts, self.adv: advantages,
                      self.state_in[0]: c, self.state_in[1]: h}
        return sess.run([self._train, self.loss], feed_dict=batch_feed)


In [28]:
def inference(agent, hparams):
    """Run one episode acting greedily and return its trajectory.

    At every step the action with the highest probability is both executed in
    the environment and recorded, so the returned actions describe exactly
    what was done. (Previously the sampled action was executed while the
    argmax action was recorded, so the trajectory did not match the episode.)

    Args:
        agent: object exposing ``get_initial_features()`` and
            ``act_inference(state, c, h)`` returning
            ``(probs, sample, value, c, h)``.
        hparams: dict providing the episode length under ``'epiode_length'``.

    Returns:
        ``(obs, acts, rews)``: the state seen before each step, the index of
        the action taken, and the reward read after each step.
    """
    # NOTE: the key spelling matches the hparams dict built in main().
    episode_length = hparams['epiode_length']
    env = NavigationTask(3, 3)
    obs, acts, rews = [], [], []

    c, h = agent.get_initial_features()

    for _ in range(episode_length):
        state = env.getStateRep()
        obs.append(state)

        # The sampled action and the value output are not needed here.
        actionProb, _sample, _value, c, h = agent.act_inference(state, c, h)
        action = actionProb.argmax()
        env.performAction(action)

        acts.append(action)
        rews.append(env.getReward())
    return obs, acts, rews
    

def policyRollout(agent, hparams):
    """Roll out one episode by sampling actions from the policy.

    Args:
        agent: object exposing ``get_initial_features()`` and
            ``act_inference(state, c, h)`` returning
            ``(probs, sample, value, c, h)``.
        hparams: dict providing the episode length under ``'epiode_length'``.

    Returns:
        ``(obs, acts, rews)``: the state seen before each step, the sampled
        action vector returned by the agent (executed via its argmax), and the
        reward read after each step.
    """
    num_steps = hparams['epiode_length']
    task = NavigationTask(3, 3)
    observations, sampled_actions, rewards = [], [], []

    cell_state, hidden_state = agent.get_initial_features()

    for _step in range(0, num_steps):
        current = task.getStateRep()
        observations.append(current)

        step_out = agent.act_inference(current, cell_state, hidden_state)
        probs, action_vector, _value, cell_state, hidden_state = step_out

        greedy_index = probs.argmax()  # computed but not used in the rollout
        sampled_index = action_vector.argmax()

        # Execute the sampled action (not the greedy one).
        task.performAction(sampled_index)
        next_state = task.getStateRep()  # fetched but not used
        step_reward = task.getReward()

        # The action is stored as the vector returned by the agent, not as an index.
        sampled_actions.append(action_vector)
        rewards.append(step_reward)

    return observations, sampled_actions, rewards


def process_rewards(rews):
    """Turn one episode's rewards into per-step advantages.

    Every step receives the same value, the number of rewards in the episode;
    the individual reward values are ignored.
    """
    episode_len = len(rews)
    return [episode_len for _ in range(episode_len)]

def main():
    """Train an LSTMPolicy on NavigationTask rollouts and print a summary.

    For each of ``num_batches * ep_per_batch`` sampled rollouts, a training
    step is run only when the episode contains a reward of 1. Afterwards one
    ``inference`` episode and the number of such successful rollouts are
    printed.
    """
    # hyper parameters
    env = NavigationTask()
    input_size = np.shape(env.getStateRep())
    hparams = {
            'input_size': input_size,
            'num_actions': 10,
            # NOTE(review): 'learning_rate' is not forwarded to LSTMPolicy
            # below; the policy uses its own optimizer setting.
            'learning_rate': 0.1,
            'epiode_length': 6
    }

    # environment params
    eparams = {
            'num_batches': 100,
            'ep_per_batch': 10
    }

    with tf.Graph().as_default(), tf.Session() as sess:

        pi = LSTMPolicy(hparams['input_size'], hparams['num_actions'])

        # tf.initialize_all_variables() is deprecated in favour of this call.
        sess.run(tf.global_variables_initializer())

        num = 0  # number of rollouts that contained a reward of 1
        for batch in range(eparams['num_batches']):
            for i in range(eparams['ep_per_batch']):
                c, h = pi.get_initial_features()
                obs, acts, rews = policyRollout(pi, hparams)
                # Only rollouts that reached a reward of 1 are trained on; the
                # raw rewards are fed as the per-step advantage weights.
                if 1 in rews:
                    num += 1
                    print("loss", pi.train_step(obs, acts, rews, c, h))
        print(inference(pi, hparams))
        print(num)



In [29]:
# Run the training loop defined above: prints the loss of each training step,
# then one inference episode and the number of rollouts containing a reward of 1.
main()

obs space (6,)
x shape Tensor("x:0", shape=(?, 6), dtype=float32)
x shape Tensor("ExpandDims:0", shape=(1, ?, 6), dtype=float32)
step_size Tensor("strided_slice:0", shape=(1,), dtype=int32)
Tensor("rnn/transpose:0", shape=(1, ?, 256), dtype=float32)
x as output Tensor("Reshape:0", shape=(?, 256), dtype=float32)
x shape 256
size 1
x shape 256
size 10
logits Tensor("add_1:0", shape=(?, 10), dtype=float32)
sample Tensor("strided_slice_3:0", shape=(10,), dtype=float32)
self.probs Tensor("strided_slice_4:0", shape=(10,), dtype=float32)
Instructions for updating:
Use `tf.global_variables_initializer` instead.
loss [None, 1.5326692]
loss [None, 0.29812384]
loss [None, 0.29669872]
([array([ 0.,  0.,  1.,  0.,  0.,  0.]), array([ 0.,  2.,  1.,  0.,  0.,  0.]), array([ 0.,  2.,  1.,  0.,  0.,  0.]), array([ 0.,  2.,  1.,  0.,  0.,  0.]), array([ 0.,  2.,  1.,  0.,  0.,  0.]), array([ 0.,  2.,  1.,  0.,  0.,  0.])], [9, 9, 9, 9, 9, 9], [0, 0, 0, 0, 0, 0])
3


In [14]:
# Scratch check: one-hot encode an action index by selecting rows of a
# 10x10 identity matrix, then dropping the singleton leading axis.
values = [3]
n_values = 1 + np.max(values)
np.eye(10)[values].squeeze()

array([ 0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.])