In [14]:
import torch, torch.autograd as autograd
import torch.nn as nn, torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable as avar
    
from SimpleTask import SimpleGridTask
from TransportTask import TransportTask
from NavTask import NavigationTask
from SeqData import SeqData
from Henaff_pt2 import ForwardModelFFANN

import os, sys, pickle, numpy as np, numpy.random as npr, random as r

In [2]:
def sample_gumbel(shape, eps=1e-20):
    U = torch.rand(shape) #.cuda()
    return -avar(torch.log(-torch.log(U + eps) + eps))

def gumbel_softmax_sample(logits, temperature):
    y = logits + sample_gumbel(logits.size())
    return F.softmax(y / temperature, dim=-1)

def gumbel_softmax(logits, temperature):
    """
    input: [*, n_class]
    return: [*, n_class] an one-hot vector
    """
    y = gumbel_softmax_sample(logits, temperature)
    shape = y.size()
    _, ind = y.max(dim=-1)
    y_hard = torch.zeros_like(y).view(-1, shape[-1])
    y_hard.scatter_(1, ind.view(-1, 1), 1)
    y_hard = y_hard.view(*shape)
    return (y_hard - y).detach() + y

In [12]:
#Testing Simulation Policy
env = NavigationTask() 
simpolicy = SimulationPolicy(env)
simpolicy.sample(avar(torch.FloatTensor([env.getStateRep()])))

State Size:  64
Action Size:  10


Variable containing:
    0     0     1     0     0     0     0     0     0     0
[torch.FloatTensor of size 1x10]

In [16]:
#Loading and testing Forward Model
f_model_name = 'forward-ffann-noisy-wan-1.pt' # 6 gets 99% on 0.1% noise
exampleEnv = NavigationTask()
f = ForwardModelFFANN(exampleEnv)
f.load_state_dict( torch.load(f_model_name) )
start = np.zeros(74, dtype=np.float32)
start[0+4] = 1
start[15+6] = 1
start[15+15+0] = 1
start[15+15+4+8] = 1
start[15+15+4+15+7] = 1
start[15+15+4+15+15+4] = 1.0
f.test(start)
print('-----\n','Starting manualTest loop')
def softmax(x):
    e_x = np.exp(x - np.max(x))
    return e_x / e_x.sum()
for i in range(5):
    width, height = 15, 15
    p_0 = np.array([npr.randint(0,width),npr.randint(0,height)])
    start_pos = [p_0, r.choice(NavigationTask.oriens)]
    goal_pos = np.array([ npr.randint(0,width), npr.randint(0,height) ])
    checkEnv = NavigationTask(
        width=width, height=height, agent_start_pos=start_pos, goal_pos=goal_pos,
        track_history=True, stochasticity=0.0, maxSteps=10)
    s_0 = checkEnv.getStateRep()
    #a1, a2 = np.zeros(10), np.zeros(10)
    #a1[ npr.randint(0,10) ] = 1
    #a2[ npr.randint(0,10) ] = 1
    numActions = 3
    currState = avar( torch.FloatTensor(s_0).unsqueeze(0) )
    print('Start State')
    f.printState( currState[0] )
    actionSet = []
    for j in range(numActions):
        action = np.zeros( 10 )
        action[ npr.randint(0,10) ] = 1
        action += npr.randn( 10 )*0.1
        action = softmax( action )
        print('\tSoft Noisy Action ',j,'=',action)
        #### Apply Gumbel Softmax ####
        temperature = 0.01
        logProbAction = torch.log( avar(torch.FloatTensor(action)) ) 
        actiong = gumbel_softmax(logProbAction, temperature)
        ##############################
        print('\tGumbel Action ',j,'=',actiong.data.numpy())
        actionSet.append( actiong )
        checkEnv.performAction( np.argmax(action) )
        a = actiong  # avar( torch.FloatTensor(actiong) )
        currState = f.forward( torch.cat([currState[0],a]).unsqueeze(0) )
        print("Intermediate State",j)
        f.printState( currState[0] )
    #checkEnv.performAction(np.argmax(a1))
    #checkEnv.performAction(np.argmax(a2))
    s_1 = checkEnv.getStateRep()
    #inval = np.concatenate( (s_0,a1) )
    #outval1 = f.forward( avar(torch.FloatTensor(inval).unsqueeze(0)) )
    #print(outval1.shape)
    #print(a2.shape)
    #inval2 = np.concatenate( (outval1[0].data.numpy(),a2) )
    #outval2 = f.forward( avar(torch.FloatTensor(inval2).unsqueeze(0)) )
    for action in actionSet:
        f.printAction(action)
    print('Predicted')
    f.printState( currState[0] )
    print('Actual')
    s1 = avar( torch.FloatTensor( s_1 ).unsqueeze(0) )
    f.printState( s1[0] ) 
    print("Rough accuracy", torch.sum( (currState - s1).pow(2) ).data[0] )
    #print('Predicted',currState.data[0].numpy())
    #print('Actual',s_1)
    #outval1 = f.test(inval,s_1)
    print('----\n')

Input State
	px: 4   [0.0000,0.0000,0.0000,0.0000,1.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000]
	py: 6   [0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,1.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000]
	orien: 0   [1.0000,0.0000,0.0000,0.0000]
	gx: 8   [0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,1.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000]
	gy: 7   [0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,1.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000]
Input Action
	a: Face_west (4)
Predicted Final State
	px: 4   [0.0000,0.0000,0.0000,0.0000,1.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000]
	py: 6   [0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,1.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000]
	orien: 3   [0.0000,0.0000,0.0000,1.0000]
	gx: 8   [0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,1.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000]
	gy: 7   [0.0000,0.0000,0.0000,0.0000,0

	Gumbel Action  0 = [0. 1. 0. 0. 0. 0. 0. 0. 0. 0.]
Intermediate State 0
px: 8   [0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,1.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000]
py: 11   [0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,1.0000,0.0000,0.0000,0.0000]
orien: 0   [1.0000,0.0000,0.0000,0.0000]
gx: 9   [0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,1.0000,0.0000,0.0000,0.0000,0.0000,0.0000]
gy: 6   [0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,1.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000]
	Soft Noisy Action  1 = [0.0782 0.071  0.0902 0.0855 0.09   0.2321 0.0931 0.0755 0.0856 0.0988]
	Gumbel Action  1 = [0. 0. 0. 0. 0. 1. 0. 0. 0. 0.]
Intermediate State 1
px: 8   [0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,1.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000]
py: 12   [0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,1.0000,0.0000,0.0000]
orien: 0   [1.0000,0.0000

In [268]:
def greedy_valueF(state):
    state = state.squeeze()
    #ForwardModel.printState(state)
    vx = torch.sum((state[0:15]-state[34:49]).pow(2))
    #print('vx',vx)
    vy = torch.sum((state[15:30]-state[49:64]).pow(2))
    #print('vy',vy)
    value = -( vx + vy ) 
    return value

def greedy_cont_valueF(state):
    state = state.squeeze()
    _,ix = state[0:15].max(0)
    _,gx = state[34:49].max(0)
    _,iy = state[15:30].max(0)
    _,gy = state[49:64].max(0)
    #ForwardModel.printState(state)
    vx = torch.sum((ix - gx)*(ix - gx))
    #print('vx',vx)
    vy = torch.sum((iy - gy)*(iy - gy))
    #print('vy',vy)
    value = -( vx + vy ) 
    return value


In [292]:
def generateTask(px,py,orien,gx,gy):
    direction = NavigationTask.oriens[orien]
    gs = np.array([gx, gy])
    env = NavigationTask(agent_start_pos=[np.array([px,py]), direction],goal_pos=gs)
    return env

class SimulationPolicy(nn.Module):
    def __init__(self,  env, layerSizes=[100,100]):
        super(SimulationPolicy, self).__init__()
        self.actionSize = len(env.actions)
        self.stateSize = len(env.getStateRep(oneHotOutput=True))
        print("State Size: " , self.stateSize)
        print("Action Size: ", self.actionSize)
        
        # Input space: [Batch, observations], output:[Batch, action_space]
        self.layer1 = nn.Linear(self.stateSize, layerSizes[0])
        self.layer2 = nn.Linear(layerSizes[0], layerSizes[1])
        self.layer3 = nn.Linear(layerSizes[1], self.actionSize)
        
    def sample(self,state,temperature=0.5):
        output = F.relu( self.layer1(state) )
        output = F.relu( self.layer2(output) ) # F.sigmoid
        output = self.layer3(output)
        m = nn.LogSoftmax(dim=1)
        output = m(output)
        return gumbel_softmax(output, temperature)
    
    def forward(self, state):
        output = F.relu( self.layer1(state) )
        output = F.relu( self.layer2(output) ) # F.sigmoid
        output = self.layer3(output) 
        output = F.softmax(output,dim=1)
        return output
    
    def trainSad(self, forwardModel):
        
        optimizer = optim.Adam(self.parameters(), lr = 0.25 )

        maxDepth = 1
        treeBreadth = 10
        
#         p = npr.randint(0,15,2)
#         orien = npr.randint(0,4,1)
#         g = npr.randint(0,15,2)
        cenv = generateTask(0,0,0,0,4)
#       cenv = generateTask(p[0],p[1],orien,g[0],g[1])
        s0 = avar(torch.FloatTensor([testEnv.getStateRep()]), requires_grad=False)
        for i in range(0,1000):
            tree = Tree(s0,forwardModel,self,greedy_valueF,maxDepth,treeBreadth)
            loss = tree.getLossFromLeaves()
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()
            
            if i % 50 == 0: print('Loss',i,":",loss.data[0])
        
# POSSIBLE IDEA
# Dont just consider the leaves; consider all the nodes as possible leaves (consider all subpaths too)

class Node(object):
    
    def __init__(self, parent_node, state, action):
        self.parent = parent_node
        self.children = []
        self.state = state
        self.action = action
        
    def addChild(self, child):
        self.children.append(child)
        
class Tree(object):
    
    def __init__(self, initialState, forwardModel, simPolicy, valueF, maxDepth=5, branchingFactor=3):
        self.simPolicy = simPolicy
        self.maxDepth, self.branchFactor = maxDepth, branchingFactor
        self.forwardModel = forwardModel
        self.valueF = valueF
        parent = Node(None,initialState,None)
        self.allStates = [initialState]
        self.allActions = []

#         print('Generating growth')
        # Generate Tree
        self.tree_head = self.grow(parent,0,self.branchFactor)
        #self.tAllStates = tf.stack(self.allStates)
        # Get leaves
#         print('Getting leaves')
        q, self.leaves = [ parent ], []
        while len(q) >= 1:
            currNode = q.pop()
            for child in currNode.children:
                if len( child.children ) == 0: self.leaves.append( child )
                else: q.append( child )
        #print(self.leaves)
    
    def getPathFromLeaf(self,leafNumber):
        leaf = self.leaves[leafNumber]
        path = [leaf.state]
        actions = [leaf.action]
        currNode = leaf
        while not currNode.parent is None:
            #print(currNode.state)
            path.append(currNode.parent.state)
            if not currNode.parent.action is None:
                actions.append(currNode.parent.action)
            currNode = currNode.parent
        return (list(reversed(path)),list(reversed(actions)))
    
    def grow(self,node,d,b,verbose=False):
        if verbose: print('Grow depth: ',d)
        if verbose: self.forwardModel.printState(node.state[0])
        if d == self.maxDepth : return node
        for i in range(b):
            # Sample the current action
            a_s =  self.simPolicy.sample(node.state)
            #print(a_s)
            concat_vec = torch.cat([node.state, a_s], 1)
            #print("concat_vec",concat_vec.data.numpy(),d)
            current_state = self.forwardModel(concat_vec)
            # Build the next subtre
            self.allStates.append(current_state)
            self.allActions.append(a_s)
            if verbose: print("int_state at depth",d)
            if verbose: self.forwardModel.printState(node.state[0])
            if verbose: print("a_s at depth ",d," and breath",i)
            if verbose: self.forwardModel.printAction(a_s[0])
            if verbose: print("curr_state at depth",d)
            if verbose: self.forwardModel.printState(current_state[0])
            node.addChild( self.grow( Node(node, current_state, a_s), d+1, b) )
        return node
    
    def getBestPlan(self):
        bestInd, bestVal = 0, avar(torch.FloatTensor( [float('-inf')])) #float('-inf')
        for i, leaf in enumerate(self.leaves):
            currVal = self.valueF(leaf.state)
            #print('State')
            #self.forwardModel.printState(leaf.state[0])
            #print('Value',currVal)
            if currVal.data.numpy() > bestVal.data.numpy():
                bestInd = i
                bestVal = currVal
        #print(bestVal)
        return self.getPathFromLeaf( bestInd )
    
    def getLossFromLeaves(self):
        totalLosses = avar(torch.FloatTensor([0.0]))
        for leaf in self.leaves:
            totalLosses += -self.valueF( leaf.state )
        return totalLosses
        
    

In [293]:
f_model_name = 'forward-ffann-noisy-wan-1.pt' # 6 gets 99% on 0.1% noise
exampleEnv = NavigationTask()
ForwardModel = ForwardModelFFANN(exampleEnv)
ForwardModel.load_state_dict( torch.load(f_model_name) )
#ForwardModel.printState(avar(torch.FloatTensor(testEnv.getStateRep()), requires_grad=False))
#SimPolicy = SimulationPolicy(exampleEnv)

In [None]:
SimPolicy = SimulationPolicy(exampleEnv)
SimPolicy.trainSad(ForwardModel)


State Size:  64
Action Size:  10
Loss 0 : 38.0
Loss 50 : 38.0
Loss 100 : 40.0
Loss 150 : 40.0
Loss 200 : 40.0
Loss 250 : 40.0
Loss 300 : 40.0
Loss 350 : 40.0
Loss 400 : 40.0
Loss 450 : 40.0
Loss 500 : 40.0
Loss 550 : 40.0
Loss 600 : 40.0


In [266]:
testEnv = generateTask(0,0,0,2,3)
print(testEnv.getStateRep())
s0 = avar(torch.FloatTensor([testEnv.getStateRep()]), requires_grad=False)
#greedy_valueF(s0)
tree = Tree(s0,ForwardModel,SimPolicy,greedy_cont_valueF,4,5)
nodes,actions = tree.getBestPlan()
actions

[1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
Generating growth
Getting leaves


[Variable containing:
     1     0     0     0     0     0     0     0     0     0
 [torch.FloatTensor of size 1x10], Variable containing:
     0     0     0     0     0     0     0     1     0     0
 [torch.FloatTensor of size 1x10], Variable containing:
     0     0     1     0     0     0     0     0     0     0
 [torch.FloatTensor of size 1x10], Variable containing:
     0     0     0     0     0     0     1     0     0     0
 [torch.FloatTensor of size 1x10]]

In [221]:
actions

[Variable containing:
     1     0     0     0     0     0     0     0     0     0
 [torch.FloatTensor of size 1x10]]