In [10]:
import numpy as np, numpy.random as npr, random as r
import tensorflow as tf  
from NavTask import NavigationTask
import tensorflow.contrib.rnn as rnn
import tensorflow.contrib.slim as slim
from forwardModel3 import *

In [3]:
#### Node --> Children  

# From: https://github.com/ericjang/gumbel-softmax
class GumbelSoftmax(object):
    """Namespace for Gumbel-Softmax sampling helpers.

    Adapted from https://github.com/ericjang/gumbel-softmax. All helpers are
    static methods, so they can be called either through the class
    (``GumbelSoftmax.gumbel_softmax(...)``) or through an instance.
    """

    @staticmethod
    def sample_gumbel(shape, eps=1e-20):
        """Sample from Gumbel(0, 1).

        Args:
          shape: shape of the noise tensor to draw.
          eps: small constant added inside both logarithms to avoid log(0).
        Returns:
          A tensor of the requested shape holding Gumbel(0, 1) noise.
        """
        U = tf.random_uniform(shape, minval=0, maxval=1)
        return -tf.log(-tf.log(U + eps) + eps)

    @staticmethod
    def gumbel_softmax_sample(logits, temperature):
        """Draw a (soft) sample from the Gumbel-Softmax distribution.

        Args:
          logits: [batch_size, n_class] unnormalized log-probs
          temperature: positive scalar; the perturbed logits are divided by it
        Returns:
          [batch_size, n_class] softmax of the noise-perturbed logits.
        """
        y = logits + GumbelSoftmax.sample_gumbel(tf.shape(logits))
        return tf.nn.softmax(y / temperature)

    @staticmethod
    def gumbel_softmax(logits, temperature, hard=False):
        """Sample from the Gumbel-Softmax distribution and optionally discretize.

        Args:
          logits: [batch_size, n_class] unnormalized log-probs
          temperature: positive scalar
          hard: if True, take argmax, but differentiate w.r.t. soft sample y
        Returns:
          [batch_size, n_class] sample from the Gumbel-Softmax distribution.
          If hard=True, then the returned sample will be one-hot, otherwise it
          will be a probability distribution that sums to 1 across classes.
        """
        y = GumbelSoftmax.gumbel_softmax_sample(logits, temperature)
        if hard:
            # Mark every entry equal to its row maximum. An exact tie would set
            # more than one entry, but ties between continuous noisy values are
            # not expected in practice.
            y_hard = tf.cast(tf.equal(y, tf.reduce_max(y, 1, keep_dims=True)), y.dtype)
            # Straight-through estimator: the forward value is y_hard, while the
            # gradient flows only through the soft sample y.
            y = tf.stop_gradient(y_hard - y) + y
        return y

#sess = tf.Session()
#sess.run(GumbelSoftmax.gumbel_softmax(tf.constant([[0.5, 0.5]]), 0.5, hard=True))

In [6]:
class SimulationPolicy(object):
    """Two-layer policy network that emits a one-hot action via Gumbel-Softmax.

    The network variables live under the ``softaction`` variable scope and are
    created when the instance is constructed; later calls to ``sample`` /
    ``getSoftAction`` on new inputs should pass ``reuse=True`` to share them.
    """
    
    def __init__(self, obs_space, act_space, h_size=100):
        """Build the input placeholder and the sampling op.

        Args:
          obs_space: shape of a single observation (an iterable of ints).
          act_space: number of discrete actions (width of the output layer).
          h_size: width of the hidden layer.
        """
        print("Observation Space: " , obs_space)
        print("Action Space: ", act_space)
        self.h_size = h_size
        # Input space: [Episode_length, observations], output:[Episode_Length,action_space]
        self.input = tf.placeholder(tf.float32, [None] + list(obs_space))
        self.act_space = act_space
        self.sampleOutput = self.sample(self.input)
        # NOTE(review): this collects every trainable variable under the
        # *current* variable scope, not only the ones under "softaction"; at the
        # root scope that would presumably include other models built earlier
        # in the same graph -- confirm before using it for optimisation.
        self.trainable_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, tf.get_variable_scope().name) 
    
    def getSoftAction(self,observation, reuse=None):
        """Return action probabilities (softmax output) for ``observation``.

        Args:
          observation: float tensor fed to the first fully connected layer.
          reuse: forwarded to ``tf.variable_scope``; pass True to share the
            variables created by an earlier call.
        Returns:
          Tensor whose last dimension has ``act_space`` entries produced by a
          softmax activation.
        """
        with tf.variable_scope("softaction", reuse=reuse):
            hidden = slim.fully_connected(observation, self.h_size, biases_initializer=None, activation_fn=tf.nn.relu)
            return slim.fully_connected(hidden, self.act_space, activation_fn=tf.nn.softmax, biases_initializer=None)
        
    def sample(self,observation,temperature=0.5, reuse=None):
        """Sample a one-hot action from the policy for ``observation``.

        Args:
          observation: float tensor passed to ``getSoftAction``.
          temperature: Gumbel-Softmax temperature.
          reuse: forwarded to ``getSoftAction``.
        Returns:
          The hard (one-hot forward value) Gumbel-Softmax sample.
        """
        softAction = self.getSoftAction(observation, reuse=reuse)
        # gumbel_softmax expects unnormalized *log*-probabilities, whereas
        # getSoftAction returns probabilities; take the log so that the sample
        # follows the policy's categorical distribution. The epsilon guards
        # against log(0).
        logits = tf.log(softAction + 1e-20)
        return GumbelSoftmax.gumbel_softmax(logits, temperature, hard=True)
        

In [8]:
# Smoke test: build a policy for 2-d observations with 3 actions inside a fresh
# graph/session and draw one sample for a constant observation.
with tf.Graph().as_default(), tf.Session() as sess:
    simpolicy = SimulationPolicy( np.shape([1,1]),3)
    sess.run(tf.global_variables_initializer())
    
    # reuse=True shares the variables the constructor already created.
    sample = simpolicy.sample(tf.constant([[5.0,1.0]]), reuse=True) 
    
    writer = tf.summary.FileWriter('logs', sess.graph)
    # A single tensor is fetched, so sess.run returns a single array.
    x = sess.run(sample)
    # Close the writer so the graph event file is flushed to disk.
    writer.close()
    print(x)

Observation Space:  (2,)
Action Space:  3
[[ 1.  0.  0.]]


In [None]:
class Node(object):
    """A vertex of the search tree: links to its parent and keeps an ordered child list."""

    def __init__(self, parent_node, state, action):
        """Create a childless node.

        Args:
          parent_node: the node this one hangs from (None for the root).
          state: the state stored at this node.
          action: the action stored at this node (None for the root).
        """
        self.state, self.action = state, action
        self.parent, self.children = parent_node, []

    def addChild(self, child):
        """Attach ``child`` as the last entry of ``self.children``."""
        self.children += [child]
        
class Tree(object):
    """Look-ahead tree built by sampling actions and rolling a forward model.

    Starting from ``initialState``, every node gets ``branchingFactor``
    children down to depth ``maxDepth``; each child stores the sampled action
    and the forward model's predicted next state.
    """
    
    def __init__(self,initialState,forwardModel,simPolicy,maxDepth=5,branchingFactor=3):
        """Grow the tree and collect its leaves.

        Args:
          initialState: state stored at the root node.
          forwardModel: object providing ``get_initial_features`` and
            ``dynamic_cell`` (used to predict the next state).
          simPolicy: object providing ``sample(observation, reuse=...)``.
          maxDepth: depth at which growth stops.
          branchingFactor: number of children sampled per expanded node.
        """
        self.simPolicy = simPolicy
        self.maxDepth, self.branchFactor = maxDepth, branchingFactor
        self.forwardModel = forwardModel
        parent = Node(None,initialState,None)
        # Generate Tree
        self.tree_head = self.grow(parent,0,self.branchFactor)
        # Get leaves: depth-first walk; only children are inspected, so a root
        # without children (maxDepth == 0) yields an empty leaf list.
        q, self.leaves = [ parent ], []
        while q:
            currNode = q.pop()
            for child in currNode.children:
                if child.children:
                    q.append( child )
                else:
                    self.leaves.append( child )
        print(self.leaves)
        
    def grow(self, node, d, b):
        """Recursively attach ``b`` sampled children to ``node`` until ``maxDepth``.

        Args:
          node: the node to expand.
          d: depth of ``node`` (the root is at depth 0).
          b: number of children to sample for every expanded node.
        Returns:
          ``node`` itself, with its subtree attached.
        """
        if d == self.maxDepth:
            return node
        # Flatten the state so the same code handles the rank-1 root state and
        # the batched output of the forward model stored in child nodes.
        # NOTE(review): assumes a node's state holds exactly one state vector.
        state_vec = tf.reshape(tf.cast(node.state, dtype=tf.float32), [-1])
        for _ in range(b):
            # Sample the current action. The policy's layers need a
            # [batch, features] input, and reuse=True shares the variables the
            # policy created when it was constructed.
            a_s = self.simPolicy.sample(tf.reshape(state_vec, [1, -1]), reuse=True)
            # Compute the predicted forward state
            concat_vec = tf.concat([state_vec, tf.reshape(a_s, [-1])], axis=0)
            concat_vec = tf.reshape(concat_vec,[1,1,-1]) #[batch size, sequence length, size of concat_vec]
            state_out = self.forwardModel.get_initial_features(1)
            current_state,state_out = self.forwardModel.dynamic_cell(concat_vec, tf.constant([1]), state_out)
            # Build the next subtree
            node.addChild( self.grow( Node(node, current_state, a_s), d+1, b) )
        # Return only after all b children have been attached.
        return node
        
    

In [32]:
env = NavigationTask() 
state_i=env.getStateRep()
observation_space = np.shape(state_i)
# Build the goal state: start from a fresh state representation and overwrite
# the first two one-hot fields with the goal position.
state_f=env.getStateRep()
# Boundaries of the five one-hot fields that make up the state vector.
inds = np.cumsum([0,env.w,env.h,len(env.oriens),env.w,env.h])
state_f[inds[0]:inds[1]] = env._intToOneHot(env.goal_pos[0],env.w)
state_f[inds[1]:inds[2]] = env._intToOneHot(env.goal_pos[1],env.h)
num_of_actions = 10
print(state_i, state_f)

with tf.Graph().as_default(), tf.Session() as sess:
    # NOTE(review): the constructor arguments look like state size, state +
    # action size, action count and hidden size -- confirm against ForwardModel.
    forwardModel=ForwardModel(64,74,10, 100)
    forwardModel.load_model('abcd.ckpt')
    simpolicy = SimulationPolicy( observation_space ,num_of_actions)
    reshape_state = np.reshape(state_i,(1,-1))
    # Initialise only the variables that are still uninitialised. Running the
    # global initializer here would also re-initialise the forward-model
    # weights that load_model has just restored from the checkpoint.
    uninit_names = set(sess.run(tf.report_uninitialized_variables()))
    sess.run(tf.variables_initializer(
        [v for v in tf.global_variables() if tf.compat.as_bytes(v.op.name) in uninit_names]))
    a_s = sess.run([simpolicy.sampleOutput], {simpolicy.input: reshape_state})
    print(a_s)
    concat_vec = tf.concat([tf.cast(state_i,dtype=tf.float32),tf.squeeze(a_s[0])], axis=0)
    concat_vec = tf.reshape(concat_vec,[1,1,-1]) #[batch size, sequence length, size of concat_vec]
    state_out = forwardModel.get_initial_features(1)
    current_state,state_out = forwardModel.dynamic_cell(concat_vec, tf.constant([1]), state_out)
    output = sess.run([current_state])[0][0]
    print(output)
    # Print the arg-max of every one-hot field, first for the predicted state
    # and then for the initial state, using the field boundaries in `inds`.
    for vec in (output, state_i):
        for lo, hi in zip(inds[:-1], inds[1:]):
            print(np.argmax(vec[lo:hi]))
    

[ 1.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  1.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  1.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  1.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  1.] [ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  1.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  1.  1.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  1.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  1.]
INFO:tensorflow:Restoring parameters from abcd.ckpt
Observation Space:  (64,)
Action Space:  10
[array([[ 1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.]], dtype=float32)]
[-2.03846502 -0.85695219 -0.51081717 -1.80624509 -1.36497927  0.55826956
 -1.94655108 -0.51467937 -0.2117053   0.13349709  0.72574234  0.02418317
 -0.29246455 -0.19109452 -0.53288925 -0.76872003  1.33117223  1.5037179
 -0.31478378 -1.63553715 -2.1405561   0.34129989 -0.10810138  1.10110307
  0.5648883  -