In [1]:
import numpy as np
import matplotlib.pyplot as plt
import math
from IPython.display import display, clear_output
import time
%matplotlib inline
%load_ext autoreload
%autoreload 2
%matplotlib notebook

In [None]:
def GridLine(col,row):
    """Decorate the current axes with a white cell grid and 1-based labels.

    For each axis, major ticks are placed at the integers ``0 .. n-1`` and
    labelled ``1 .. n`` (``n`` is ``col`` for x and ``row`` for y).  Minor
    ticks are placed at the half-integer positions starting from ``-0.5`` and
    carry the grid lines.

    Returns the axes object obtained from ``plt.gca()``.
    """
    axes = plt.gca()
    per_axis = (
        (axes.set_xticks, axes.set_xticklabels, col),
        (axes.set_yticks, axes.set_yticklabels, row),
    )
    for set_ticks, set_labels, count in per_axis:
        # Major ticks at integer positions, labelled starting from 1.
        set_ticks(np.arange(0, count, 1))
        set_labels(np.arange(1, count+1, 1))
        # Minor ticks half a unit off the major ones.
        set_ticks(np.arange(-.5, count, 1), minor=True)
    # Only the minor ticks draw grid lines.
    axes.grid(which='minor', color='w', linestyle='-', linewidth=2)
    return axes

### Setup

In [2]:
# Integer codes for the four grid actions.
aUp, aDown, aLeft, aRight = 0, 1, 2, 3
class smallWorld():
    """Parameter container for a 4x4 grid world.

    All settings are stored in the ``paramSet`` dictionary that the
    constructor creates.
    """
    def __init__(self):
        self.paramSet = {
            # Grid dimensions.
            'rowCount': 4,
            'colCount': 4,
            # Cells given as (row, col) pairs.
            'obsSet': np.array([[1,1],[2,1],[1,2]]),
            'startState': np.array([0,0]),
            'goalState': np.array([3,3]),
            'badSet': np.array([]),
            # Transition noise settings.
            'pGoodTrans': .8,
            'bias': .5,
            # Rewards and discount.
            'rStep': -1,
            'rGoal': 10,
            'rBad': -6,
            'gamma': .9,
        }
# Instantiate the small world and show its parameter dictionary.
world = smallWorld()
print(world.paramSet)

{'rowCount': 4, 'colCount': 4, 'obsSet': array([[1, 1],
       [2, 1],
       [1, 2]]), 'startState': array([0, 0]), 'goalState': array([3, 3]), 'badSet': array([], dtype=float64), 'pGoodTrans': 0.8, 'bias': 0.5, 'rStep': -1, 'rGoal': 10, 'rBad': -6, 'gamma': 0.9}


In [3]:
def getDir(a,ang):
    """Return the direction obtained by turning relative to action ``a``.

    Parameters
    ----------
    a : int
        Direction code: 0 = up, 1 = down, 2 = left, 3 = right.
    ang : int
        Turn to apply: ``0`` keeps ``a``, ``-1`` turns left, ``1`` turns right.

    Returns
    -------
    int
        The resulting direction code (same encoding as ``a``).

    Raises
    ------
    ValueError
        If ``ang`` is not one of ``-1``, ``0`` or ``1``.  Previously such a
        value fell through every branch and failed with an
        ``UnboundLocalError`` at the ``return``.
    """
    # Index = current direction, value = direction after a 90 degree turn.
    leftLUT = np.array([2,3,1,0])
    rightLUT = np.array([3,2,0,1])
    if ang == 0:
        return a
    if ang == -1:
        return leftLUT[a]
    if ang == 1:
        return rightLUT[a]
    raise ValueError("ang must be -1, 0 or 1, got %r" % (ang,))

In [4]:
def getState(s,Dir,paramSet):
    """Return the state reached by moving one cell from ``s`` in direction ``Dir``.

    Parameters
    ----------
    s : int
        Flat row-major index of the current cell on the
        ``rowCount`` x ``colCount`` grid.
    Dir : int
        Direction code: 0 = up (row - 1), 1 = down (row + 1),
        2 = left (col - 1), 3 = right (col + 1).
    paramSet : dict
        Must provide ``'rowCount'``, ``'colCount'`` and ``'obsSet'``
        (obstacle cells as (row, col) pairs; may be empty).

    Returns
    -------
    int
        Flat index of the destination cell, or ``s`` itself when the move
        would leave the grid or enter an obstacle cell.
    """
    cLUT = np.array([0,0,-1,1])
    rLUT = np.array([-1,1,0,0])
    # The grid shape is (rows, cols).  Using (cols, cols) here only works for
    # square grids and raises for valid states when rowCount > colCount.
    gridShape = (paramSet['rowCount'], paramSet['colCount'])

    rc = np.array(np.unravel_index(s, gridShape))
    rc[0] = rc[0] + rLUT[Dir]
    rc[1] = rc[1] + cLUT[Dir]

    # Moves that leave the grid keep the agent in place.
    if np.any(rc < 0) or np.any(rc >= np.array(gridShape)):
        return s

    # reshape(-1, 2) lets an empty obstacle set (e.g. np.array([])) through.
    obstacles = np.asarray(paramSet['obsSet']).reshape(-1, 2)
    if np.any(np.all(obstacles == rc, axis=1)):
        return s

    return np.ravel_multi_index(tuple(rc), gridShape)

In [5]:
def initGridworld(paramSet):
    """Build the MDP model (rewards and transition probabilities) of the grid.

    Parameters
    ----------
    paramSet : dict
        Grid description with the keys ``rowCount``, ``colCount``, ``obsSet``,
        ``startState``, ``goalState``, ``badSet``, ``pGoodTrans``, ``bias``,
        ``rStep``, ``rGoal``, ``rBad`` and ``gamma``.  Cells are (row, col)
        pairs; ``obsSet`` and ``badSet`` may be empty.

    Returns
    -------
    dict
        ``stateCount``  number of grid cells plus one extra absorbing state
                        (the last index);
        ``gamma``       copied from ``paramSet``;
        ``startState``, ``goalState``  flat row-major cell indices;
        ``R``           array (stateCount, 4): ``rStep`` everywhere, ``rGoal``
                        in the goal row, ``rBad`` in bad-cell rows and 0 in
                        the absorbing state's row;
        ``P``           array (stateCount, stateCount, 4) where
                        ``P[s, s2, a]`` is the probability of reaching ``s2``
                        from ``s`` under action ``a``.
    """
    # The grid shape is (rows, cols); (cols, cols) is only right for squares.
    gridShape = (paramSet['rowCount'], paramSet['colCount'])
    pGood = paramSet['pGoodTrans']
    bias = paramSet['bias']

    model = dict()
    model['stateCount'] = paramSet['colCount']*paramSet['rowCount']+1
    terminal = model['stateCount']-1
    model['gamma'] = paramSet['gamma']
    model['startState'] = np.ravel_multi_index(
                            tuple(paramSet['startState']), gridShape)
    model['goalState'] = np.ravel_multi_index(
                            tuple(paramSet['goalState']), gridShape)

    # Pick a dtype wide enough for every reward, so a float rGoal/rBad is not
    # truncated into an integer array created from an integer rStep.
    rewardDtype = np.array([paramSet['rStep'], paramSet['rGoal'],
                            paramSet['rBad']]).dtype
    model['R'] = np.full((model['stateCount'], 4), paramSet['rStep'],
                         dtype=rewardDtype)
    model['R'][terminal,:] = 0
    model['R'][model['goalState'],:] = paramSet['rGoal']
    # reshape(-1, 2) accepts an empty set as well as a single (row, col) pair.
    badSet = np.asarray(paramSet['badSet']).reshape(-1, 2).astype(int)
    for badCell in badSet:
        badState = np.ravel_multi_index(tuple(badCell), gridShape)
        model['R'][badState,:] = paramSet['rBad']

    model['P'] = np.zeros((model['stateCount'],model['stateCount'],4))
    # The extra state is absorbing under every action.
    model['P'][terminal,terminal,:] = 1

    # Obstacle cells and the goal cell lead straight to the absorbing state.
    endSet = np.concatenate((np.asarray(paramSet['obsSet']).reshape(-1,2),
                             np.asarray(paramSet['goalState']).reshape(-1,2)))
    # (turn, probability): left turn, intended direction, right turn.
    outcomes = ((-1, (1-pGood)*bias),
                (0, pGood),
                (1, (1-pGood)*(1-bias)))

    for s in range(terminal):
        rc = np.array(np.unravel_index(s, gridShape))
        if np.any(np.all(endSet == rc, axis=1)):
            model['P'][s,terminal,:] = 1
            continue
        for a in range(4):
            for ang, p in outcomes:
                sout = getState(s, getDir(a, ang), paramSet)
                # Several outcomes may end in the same state (e.g. at walls).
                model['P'][s,sout,a] = model['P'][s,sout,a]+p

    return model
# Build the reward and transition arrays for the small world.
model = initGridworld(world.paramSet)

In [6]:
# Transition matrix for action 0: row = current state, column = next state.
print(model['P'][:,:,0])

[[0.9 0.1 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0. ]
 [0.1 0.8 0.1 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0. ]
 [0.  0.1 0.8 0.1 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0. ]
 [0.  0.  0.1 0.9 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0. ]
 [0.8 0.  0.  0.  0.2 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0. ]
 [0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  1. ]
 [0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  1. ]
 [0.  0.  0.  0.8 0.  0.  0.  0.2 0.  0.  0.  0.  0.  0.  0.  0.  0. ]
 [0.  0.  0.  0.  0.8 0.  0.  0.  0.2 0.  0.  0.  0.  0.  0.  0.  0. ]
 [0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  1. ]
 [0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.9 0.1 0.  0.  0.  0.  0. ]
 [0.  0.  0.  0.  0.  0.  0.  0.8 0.  0.  0.1 0.1 0.  0.  0.  0.  0. ]
 [0.  0.  0.  0.  0.  0.  0.  0.  0.8 0.  0.  0.  0.1 0.1 0.  0.  0. ]
 [0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.1 0.8 0.1 0.  0. ]
 [0.  

### Value Iteration

In [7]:
def valueIteration(model,maxIter):
    """Run value iteration on ``model`` and return state values and a policy.

    Parameters
    ----------
    model : dict
        Must provide ``stateCount``, ``gamma``, ``P`` (array indexed
        ``[s, s2, a]``) and ``R`` (array indexed ``[s, a]``).
    maxIter : int
        Maximum number of sweeps over the state space.

    Returns
    -------
    v : ndarray, shape (stateCount, 1)
        Values from the last sweep performed (zeros if ``maxIter`` is 0).
    pi : ndarray of int, shape (stateCount, 1)
        Greedy action index per state from the last sweep (all zeros if
        ``maxIter`` is 0).

    Notes
    -----
    Iteration stops once the largest absolute change of any state value in a
    sweep drops below 0.01.  The earlier signed test ``(v_-v).max()`` stopped
    at once when values only decreased, and returning before any policy had
    been stored raised ``UnboundLocalError``.
    """
    stateCount = model['stateCount']
    gamma = model['gamma']
    P = model['P']
    R = model['R']
    actionCount = P.shape[2]
    tol = .01

    v = np.zeros((stateCount,1))
    # Defined up front so a result exists even when no sweep is performed.
    pi = np.zeros((stateCount,1), dtype=int)

    for i in range(maxIter):
        v_ = np.zeros((stateCount,1))
        pi_ = np.zeros((stateCount,1), dtype=int)

        for s in range(stateCount):
            q = np.zeros(actionCount)
            for a in range(actionCount):
                # Expected reward plus discounted value of the next state.
                q[a] = np.dot(P[s,:,a], R[s,a] + gamma*v)[0]
            pi_[s] = q.argmax()
            v_[s] = q.max()

        delta = np.abs(v_ - v).max()
        # Keep the newest sweep before testing for convergence.
        v, pi = v_, pi_
        if delta < tol:
            break
    return v,pi

# Solve the grid world with value iteration, allowing at most 10000 sweeps.
v,pi = valueIteration(model,10000)