In [1]:
#NAVI_HARD_CODE_DOMAIN

In [2]:
%matplotlib inline
import matplotlib.pyplot as plt
import tensorflow as tf
import numpy as np
from sklearn.metrics import confusion_matrix
import time
from datetime import timedelta
import math
import os
import pandas as pd
#Functional coding
import functools
from functools import partial
from tensorflow.python.ops import array_ops 

In [3]:
# Relative locations of the navigation data, label and reward text files.
# They are plain relative strings; ReadData()/PathFinder() resolve such paths
# against the current working directory via os.path.abspath.
# NOTE(review): nothing in the visible part of the notebook reads these three
# names -- confirm whether they are still needed.
Datapath="DATA/Navigation/linear/Navigation_Data.txt"
Labelpath="DATA/Navigation/linear/Navigation_Label.txt"
Rewardpath="DATA/Navigation/linear/Navigation_Reward.txt"

In [4]:
#Given local path, find full path
def PathFinder(path):
    """Turn ``path`` into an absolute path, echo it, and hand it back.

    Relative paths are resolved against the current working directory
    (``os.path.abspath``); the resolved path is printed as a side effect so
    the notebook shows which file is about to be used.

    Args:
        path: relative or absolute filesystem path.

    Returns:
        The absolute, normalized form of ``path``.
    """
    resolved = os.path.abspath(path)
    print(resolved)
    return resolved

#Read Data for Deep Learning
def ReadData(path):
    """Load a comma-separated file into a pandas DataFrame.

    The path is first resolved (and printed) by ``PathFinder``; the first
    line of the file is treated as the column header.

    Args:
        path: relative or absolute path of the CSV file.

    Returns:
        The ``pandas.DataFrame`` produced by ``pd.read_csv``.
    """
    return pd.read_csv(PathFinder(path), header=0, sep=',')

In [5]:
# Domain constants for the navigation problem. NAVI copies every entry onto
# itself as an attribute, so the keys double as attribute names.
default_settings = dict(
    # Number of state coordinates.
    dims=2,
    # Per-coordinate limits that NAVI.Transition clamps the next state to.
    min_maze_bound=tf.constant(0.0, dtype=tf.float32),
    max_maze_bound=tf.constant(10.0, dtype=tf.float32),
    # Action limits, exposed through NAVI.MINACTIONBOUND / MAXACTIONBOUND.
    min_act_bound=tf.constant(-1.0, dtype=tf.float32),
    max_act_bound=tf.constant(1.0, dtype=tf.float32),
    # Scalars subtracted from every state coordinate: `goal` in NAVI.Reward,
    # `centre` in the distance computed by NAVI.Transition.
    goal=tf.constant(8.0, dtype=tf.float32),
    centre=tf.constant(5.0, dtype=tf.float32),
)

In [6]:
class NAVI(object):
    """Navigation domain written as TensorFlow ops (transition + reward).

    Every entry of ``default_settings`` is copied onto the instance as an
    attribute (``min_maze_bound``, ``max_maze_bound``, ``min_act_bound``,
    ``max_act_bound``, ``goal``, ``centre`` and, optionally, ``dims``).

    Args:
        batch_size: number of states/actions processed together; the helper
            constants built here have this fixed leading dimension.
        default_settings: mapping of domain constants (see above).
    """

    def __init__(self, 
                 batch_size,
                 default_settings):
        self.__dict__.update(default_settings)
        self.batch_size = batch_size
        # The state width used to be hard-coded to 2 even though the settings
        # carry a "dims" entry; honour it, and fall back to 2 when it is
        # absent so existing settings dictionaries keep working.
        state_dims = getattr(self, "dims", 2)
        # [batch, dims] zeros, used to broadcast the scalar maze bounds.
        self.zero = tf.constant(0,shape=[batch_size,state_dims], dtype=tf.float32)
        # Distance (from the centre) below which movement is scaled down.
        self.two = tf.constant(2.0,dtype=tf.float32)
        # Per-sample scale factor applied outside that region.
        self.one = tf.constant(1.0,shape=[batch_size],dtype=tf.float32)
    
    def MINMAZEBOUND(self):
        """Return the lower maze bound from the settings."""
        return self.min_maze_bound
    
    def MAXMAZEBOUND(self):
        """Return the upper maze bound from the settings."""
        return self.max_maze_bound
    
    def MINACTIONBOUND(self):
        """Return the lower action bound from the settings."""
        return self.min_act_bound
    
    def MAXACTIONBOUND(self):
        """Return the upper action bound from the settings."""
        return self.max_act_bound
    
    def GOAL(self):
        """Return the goal value from the settings."""
        return self.goal
    
    def CENTER(self):
        """Return the centre value from the settings."""
        return self.centre
    
    def Transition(self, states, actions):
        """Compute the next states for a batch of (state, action) pairs.

        The action is scaled by ``distance / 2`` when the L1 distance of the
        state from the centre is below 2, and applied unscaled otherwise.
        The result is then clamped, coordinate by coordinate, to the maze
        bounds.

        Args:
            states: tensor of shape [batch_size, dims].
            actions: tensor of shape [batch_size, dims].

        Returns:
            Tensor of shape [batch_size, dims] with the next states.
        """
        # L1 distance of every state from the centre -> shape [batch].
        distance = tf.reduce_sum(tf.abs(states-self.CENTER()),1)
        scalefactor = tf.select(tf.less(distance,self.two),distance/self.two,self.one)
        # Transpose so the [batch] scale factor broadcasts over the action
        # components, then transpose back to [batch, dims].
        proposedLoc = states + tf.matrix_transpose(scalefactor*tf.matrix_transpose(actions))
        upper = self.zero+self.MAXMAZEBOUND()
        lower = self.zero+self.MINMAZEBOUND()
        inside = tf.logical_and(tf.less_equal(proposedLoc,self.MAXMAZEBOUND()),
                                tf.greater_equal(proposedLoc,self.MINMAZEBOUND()))
        # Coordinates outside the maze snap to the bound they crossed.
        clamped = tf.select(tf.greater(proposedLoc,self.MAXMAZEBOUND()),upper,lower)
        new_states = tf.select(inside,proposedLoc,clamped)
        return new_states

    def Reward(self, states,actions):
        """Return the negative L1 distance of each state from the goal.

        Args:
            states: tensor of shape [batch_size, dims].
            actions: unused; kept so the signature mirrors ``Transition``.

        Returns:
            Tensor of shape [batch_size, 1].
        """
        new_reward = -tf.reduce_sum(tf.abs(states-self.GOAL()),1,keep_dims=True)
        return new_reward

In [7]:
# Graph inputs for a fixed batch of 30 two-component states and actions.
# NOTE(review): nothing in the visible part of the notebook feeds or reads
# these placeholders -- confirm whether they are still needed.
states = tf.placeholder(dtype=tf.float32, shape=[30, 2], name="States")
actions = tf.placeholder(dtype=tf.float32, shape=[30, 2], name="Actions")


In [8]:
class NAVICell(tf.nn.rnn_cell.RNNCell):
    """RNN cell whose recurrence is one step of the NAVI domain.

    The recurrent state is the 2-component position; the per-step input is
    the action. Each step emits a 3-wide row: the reward followed by the two
    coordinates of the next state.
    """

    def __init__(self, batch_size, default_settings):
        """Build the wrapped NAVI domain for a fixed ``batch_size``."""
        self.navi = NAVI(batch_size, default_settings)
        self._num_state_units = 2
        self._num_reward_units = 3

    @property
    def state_size(self):
        """Width of the recurrent state (the position)."""
        return self._num_state_units

    @property
    def output_size(self):
        """Width of the per-step output (reward + next state)."""
        return self._num_reward_units

    def __call__(self, inputs, state, scope=None):
        """Advance the domain by one step.

        Args:
            inputs: actions for this step.
            state: current positions.
            scope: unused.

        Returns:
            ``(output, next_state)`` where ``output`` is the reward
            concatenated with the next state along axis 1. The reward is
            evaluated on the state *before* the action is applied.
        """
        successor = self.navi.Transition(state, inputs)
        step_reward = self.navi.Reward(state, inputs)
        emitted = tf.concat(1, [step_reward, successor])
        return emitted, successor

In [9]:
class ActionOptimizer(object):
    """Plan by gradient descent: tune a batch of action sequences through the NAVI RNN.

    The action variable is reshaped to [batch, num_step, num_act], unrolled
    through ``NAVICell`` with ``tf.nn.dynamic_rnn`` and optimized with
    RMSProp. After every gradient step the actions are clipped back into
    [-1, 1].

    Constructing an instance builds the graph, opens a
    ``tf.InteractiveSession`` (kept in ``self.sess``) and runs the global
    variable initializer.

    Args:
        a: ``tf.Variable`` holding the actions. It is reshaped to
            ``[-1, num_step, num_act]``, so it should contain
            ``batch_size * num_step * num_act`` values.
        num_step: number of RNN steps (fixed-length sequence).
        num_act: number of action components per step.
        batch_size: number of action sequences handled in parallel.
        learning_rate: RMSProp learning rate.
    """

    def __init__(self,
                a, # Actions
                num_step, # Number of RNN step, this is a fixed step RNN sequence, 12 for navigation
                num_act, # Number of actions
                batch_size, #Batch Size
                learning_rate=0.01): 
        # Keep the variable on the instance: the loss and the clipping op must
        # act on the variable that was passed in, not on whatever module-level
        # name `a` happens to be bound to when they are built or run.
        self.action_var = a
        self.action = tf.reshape(a,[-1,num_step,num_act]) # [batch, step, action]
        print(self.action)
        self.batch_size = batch_size
        self.num_step = num_step
        self.learning_rate = learning_rate
        self._p_create_rnn_graph()
        self._p_Q_loss()
        # Build the projection and arg-max ops once. Creating them inside
        # Optimize() would add new nodes to the graph on every iteration.
        self.clip_action = tf.assign(self.action_var,
                                     tf.clip_by_value(self.action_var, -1, 1))
        self.best_index = tf.argmax(self.pred,0)
        self.sess = tf.InteractiveSession()
        self.sess.run(tf.global_variables_initializer())
    
    def _p_create_rnn_graph(self):
        """Unroll NAVICell over the action sequences and expose the results.

        Sets ``outputs`` (per-step rewards, [batch, step, 1]),
        ``intern_states`` (per-step next states, [batch, step, 2]),
        ``last_state``, ``pred`` (summed reward per sequence, [batch, 1]) and
        ``average_pred`` (mean of ``pred``).
        """
        cell = NAVICell(self.batch_size,default_settings)
        initial_state = cell.zero_state(self.batch_size, dtype=tf.float32)
        print('action batch size:{0}'.format(array_ops.shape(self.action)[0]))
        print('Initial_state shape:{0}'.format(initial_state))
        rnn_outputs, state = tf.nn.dynamic_rnn(cell, self.action, dtype=tf.float32,initial_state=initial_state)
        #need output intermediate states as well
        concated = tf.concat(0,rnn_outputs)
        print('concated shape:{0}'.format(concated.get_shape()))
        # Split the 3-wide cell output: channel 0 is the reward, channels 1
        # and 2 are the coordinates of the next state.
        something_unpacked =  tf.unpack(concated, axis=2)
        self.outputs = tf.reshape(something_unpacked[0],[-1,self.num_step,1])
        print(' self.outputs:{0}'.format(self.outputs.get_shape()))
        self.intern_states = tf.pack([something_unpacked[1],something_unpacked[2]], axis=2)
        self.last_state = state
        self.pred = tf.reduce_sum(self.outputs,1)
        self.average_pred = tf.reduce_mean(self.pred)
        print("self.pred:{0}".format(self.pred))
            
    def _p_create_loss(self):
        """Alternative loss: mean squared summed reward (not used by __init__)."""
        objective = tf.reduce_mean(tf.square(self.pred)) 
        self.loss = objective
        print(self.loss.get_shape())
        self.optimizer = tf.train.RMSPropOptimizer(self.learning_rate).minimize(
            self.loss, var_list=[self.action_var])
        
    def _p_Q_loss(self):
        """Build the loss used for optimization and its RMSProp train op.

        For every step ``i`` the term
        ``(R_i * sum_{j>i} R_j + 0.5 * R_i**2) / (num_step - i)`` is
        accumulated per sequence; the loss is the batch mean of the squared
        accumulated value.
        """
        objective = tf.constant(0.0, shape=[self.batch_size, 1])
        for i in range(self.num_step):
            Rt = self.outputs[:,i]
            if i<(self.num_step-1):
                # Sum of the rewards of all later steps, shape [batch, 1].
                SumRj = tf.reduce_sum(self.outputs[:,i+1:],1)
            else:
                # The last step has no later rewards.
                SumRj = tf.constant(0.0, shape=[self.batch_size, 1])
            objective+=(Rt*SumRj+0.5*tf.square(Rt))/(self.num_step-i)
        self.loss = tf.reduce_mean(tf.square(objective))
        self.optimizer = tf.train.RMSPropOptimizer(self.learning_rate).minimize(
            self.loss, var_list=[self.action_var])
        
    def Optimize(self,epoch=100):
        """Run ``epoch`` gradient steps, then print the best sequence found.

        After each step the actions are clipped to [-1, 1] and the mean
        summed reward is printed. Note that the value printed under the
        label "Loss" is ``average_pred`` (mean summed reward), not
        ``self.loss``.

        Args:
            epoch: number of optimization steps; 0 only evaluates and
                reports the current actions.
        """
        new_loss = self.sess.run([self.average_pred])
        print('Loss in epoch {0}: {1}'.format("Initial", new_loss)) 
        for step in range(epoch):
            self.sess.run([self.optimizer])
            self.sess.run(self.clip_action)
            new_loss = self.sess.run([self.average_pred])
            print('Loss in epoch {0}: {1}'.format(step, new_loss))  
        # One fetch for the whole report: the variables do not change any
        # more, so the values are the same as fetching them one by one.
        (minimum_costs_id, all_actions, all_preds,
         all_last_states, all_rewards, all_states) = self.sess.run(
            [self.best_index, self.action, self.pred,
             self.last_state, self.outputs, self.intern_states])
        print(minimum_costs_id)
        # Index of the sequence with the highest summed reward.
        best = minimum_costs_id[0]
        best_action = np.round(all_actions[best],4)
        print('Optimal Action Squence:{0}'.format(best_action))
        print('Best Cost: {0}'.format(all_preds[best]))
        print('The last state:{0}'.format(all_last_states[best]))
        print('Rewards each time step:{0}'.format(all_rewards[best]))
        print('Intermediate states:{0}'.format(all_states[best]))
        

In [10]:
# Problem size of the batched planning run. The flat action variable must hold
# exactly BATCH_SIZE * NUM_STEPS * NUM_ACTIONS values, so its length is derived
# from these constants instead of being typed in by hand.
BATCH_SIZE = 100   # action sequences optimized in parallel
NUM_STEPS = 12     # RNN steps per sequence
NUM_ACTIONS = 2    # action components per step
ACTION_SEED = 0    # fixed op-level seed so a fresh run starts from the same actions

a = tf.Variable(tf.truncated_normal(shape=[BATCH_SIZE * NUM_STEPS * NUM_ACTIONS],
                                    mean=0.0, stddev=0.5, seed=ACTION_SEED),
                name="action")
rnn_inst = ActionOptimizer(a, NUM_STEPS, NUM_ACTIONS, BATCH_SIZE)

Tensor("Reshape:0", shape=(100, 12, 2), dtype=float32)
action batch size:Tensor("strided_slice:0", shape=(), dtype=int32)
Initial_state shape:Tensor("zeros:0", shape=(100, 2), dtype=float32)
concated shape:(100, 12, 3)
 self.outputs:(100, 12, 1)
self.pred:Tensor("Sum:0", shape=(100, 1), dtype=float32)


In [11]:
# Run 1000 optimization steps on the batched action sequences.
rnn_inst.Optimize(epoch=1000)

Loss in epoch Initial: [-177.06371]
Loss in epoch 0: [-174.74911]
Loss in epoch 1: [-172.94203]
Loss in epoch 2: [-171.39449]
Loss in epoch 3: [-169.97644]
Loss in epoch 4: [-168.63158]
Loss in epoch 5: [-167.38614]
Loss in epoch 6: [-166.19824]
Loss in epoch 7: [-165.06105]
Loss in epoch 8: [-163.97281]
Loss in epoch 9: [-162.92691]
Loss in epoch 10: [-161.88525]
Loss in epoch 11: [-160.87341]
Loss in epoch 12: [-159.89127]
Loss in epoch 13: [-158.89503]
Loss in epoch 14: [-157.91255]
Loss in epoch 15: [-156.9588]
Loss in epoch 16: [-156.03314]
Loss in epoch 17: [-155.12317]
Loss in epoch 18: [-154.22881]
Loss in epoch 19: [-153.33989]
Loss in epoch 20: [-152.43556]
Loss in epoch 21: [-151.56151]
Loss in epoch 22: [-150.70926]
Loss in epoch 23: [-149.84656]
Loss in epoch 24: [-148.99004]
Loss in epoch 25: [-148.1564]
Loss in epoch 26: [-147.33751]
Loss in epoch 27: [-146.51863]
Loss in epoch 28: [-145.70789]
Loss in epoch 29: [-144.90488]
Loss in epoch 30: [-144.11099]
Loss in epoch 3

In [12]:
# Hand-written 12-step plan, one [first, second] action pair per row.
action = np.array([
    [1.0, 1.0],
    [1.0, 1.0],
    [1.0, 1.0],
    [1.0, 1.0],
    [1.0, -1.0],
    [1.0, 1.0],
    [1.0, 1.0],
    [1.0, 1.0],
    [1.0, 1.0],
    [0.4, 1.0],
    [0.0, 1.0],
    [0.0, 0.0],
])

In [13]:
# Rebind `a` to a variable initialized with the hand-written plan (float32).
a = tf.Variable(initial_value=tf.constant(action, dtype=tf.float32), name="action")

In [14]:
# Evaluate the hand-written plan: a batch of one sequence and zero
# optimization steps, so Optimize only reports rewards and visited states.
rnn_inst = ActionOptimizer(a, num_step=12, num_act=2, batch_size=1)
rnn_inst.Optimize(epoch=0)

Tensor("Reshape_2:0", shape=(1, 12, 2), dtype=float32)
action batch size:Tensor("strided_slice_24:0", shape=(), dtype=int32)
Initial_state shape:Tensor("zeros_2:0", shape=(1, 2), dtype=float32)
concated shape:(1, 12, 3)
 self.outputs:(1, 12, 1)
self.pred:Tensor("Sum_12:0", shape=(1, 1), dtype=float32)
Loss in epoch Initial: [-85.800003]
[0]
Optimal Action Squence:[[ 1.          1.        ]
 [ 1.          1.        ]
 [ 1.          1.        ]
 [ 1.          1.        ]
 [ 1.         -1.        ]
 [ 1.          1.        ]
 [ 1.          1.        ]
 [ 1.          1.        ]
 [ 1.          1.        ]
 [ 0.40000001  1.        ]
 [ 0.          1.        ]
 [ 0.          0.        ]]
Best Cost: [-85.80000305]
The last state:[ 9.39999962  9.        ]
Rewards each time step:[[-16.        ]
 [-14.        ]
 [-12.        ]
 [-10.        ]
 [ -8.        ]
 [ -8.        ]
 [ -6.        ]
 [ -4.        ]
 [ -2.        ]
 [ -2.        ]
 [ -1.39999962]
 [ -2.39999962]]
Intermediate states:[[ 1. 