In [1]:
import numpy as np
import MDP
import sys

''' Construct a simple maze MDP

  Grid world layout:

  ---------------------
  |  0 |  1 |  2 |  3 |
  ---------------------
  |  4 |  5 |  6 |  7 |
  ---------------------
  |  8 |  9 | 10 | 11 |
  ---------------------
  | 12 | 13 | 14 | 15 |
  ---------------------

  Goal state: 15
  Bad state: 9
  End state: 16

  The end state is an absorbing state that the agent transitions
  to after visiting the goal state.

  There are 17 states in total (including the end state)
  and 4 actions (up, down, left, right).'''

# Transition function: |A| x |S| x |S'| array, T[action, s, s'] = P(s' | s, action)
# Action indices: 0 = up, 1 = down, 2 = left, 3 = right.
# NOTE: `a` and `b` below are probabilities, not action indices.
a = 0.8  # intended move
b = 0.1  # lateral move (applies to each of the two perpendicular directions)


def _build_maze_transitions(p_intended, p_lateral, side=4):
    """Build the transition tensor of the square maze described above.

    Cells are numbered row-major from 0. The last cell (side*side - 1) is the
    goal and index side*side is the absorbing end state.

    From every non-goal cell the agent moves in the chosen direction with
    probability `p_intended` and slips to each of the two perpendicular
    directions with probability `p_lateral`. A move that would leave the grid
    keeps the agent in its current cell. The goal cell always leads to the end
    state and the end state always loops onto itself, whatever the action.

    Parameters
    ----------
    p_intended : float
        Probability of moving in the chosen direction.
    p_lateral : float
        Probability of slipping to each perpendicular direction.
    side : int, optional
        Number of cells per grid side (default 4, i.e. the 4 x 4 maze).

    Returns
    -------
    numpy.ndarray
        Array of shape (4, side*side + 1, side*side + 1).
    """
    n_cells = side * side
    goal, end = n_cells - 1, n_cells
    # (row delta, column delta) per action index: up, down, left, right
    moves = [(-1, 0), (1, 0), (0, -1), (0, 1)]
    trans = np.zeros([len(moves), n_cells + 1, n_cells + 1])

    def step(cell, d_row, d_col):
        """Return the cell reached from `cell`, or `cell` itself at a wall."""
        row, col = divmod(cell, side)
        row, col = row + d_row, col + d_col
        if 0 <= row < side and 0 <= col < side:
            return row * side + col
        return cell

    for action, (d_row, d_col) in enumerate(moves):
        # Cells 0 .. goal-1 are the ordinary (non-goal) grid cells.
        for cell in range(goal):
            # `+=` because a blocked move and a blocked slip can both land on
            # the current cell (e.g. "up" from a top corner).
            trans[action, cell, step(cell, d_row, d_col)] += p_intended
            # Swapping the deltas yields the two perpendicular directions.
            trans[action, cell, step(cell, d_col, d_row)] += p_lateral
            trans[action, cell, step(cell, -d_col, -d_row)] += p_lateral
        trans[action, goal, end] = 1
        trans[action, end, end] = 1
    return trans


T = _build_maze_transitions(a, b)

# Sanity check: every (action, state) row must be a probability distribution.
assert np.allclose(T.sum(axis=2), 1.0), "rows of T must sum to 1"

# Reward function: |A| x |S| array.
# Every (action, state) pair starts at -1; the three special states below
# get the same overriding reward for all four actions.
R = np.full([4, 17], -1.0)

# set rewards: goal state (15), bad state (9), end state (16)
R[:, [15, 9, 16]] = [100, -70, 0]

# Discount factor: scalar in [0,1)
discount = 0.95

# MDP object: built from the transition tensor T, the reward table R and the
# discount factor; the solver cells below are all invoked through it.
mdp = MDP.MDP(T,R,discount)

### Question 1:

In [2]:
'''Test each procedure'''
# Run value iteration starting from an all-zero value vector, then extract a
# policy from the values it returns.
V, nIterations, epsilon = mdp.valueIteration(
    initialV=np.zeros(mdp.nStates),
    tolerance=0.01,
)
policy = mdp.extractPolicy(V)

# Report the iteration count, the final epsilon, the policy and the values.
print("----------------------------------------------------------------")
print("[DEBUG] valueIteration nIterations: {}, epsilon: {}".format(nIterations, epsilon))
print("policy:", policy, sep="\n")
print("V:", V, sep="\n")

----------------------------------------------------------------
[DEBUG] valueIteration nIterations: 20, epsilon: 0.008079508521525725
policy:
[3 3 3 1 3 3 3 1 1 3 3 1 3 3 3 0 0]
V:
[  60.62388836   66.03486523   71.80422632   77.09196339   59.81429704
   65.18237783   77.83066489   84.14118981   58.09361039    7.98780239
   84.86704922   91.78159355   69.49584217   76.80962081   91.78159355
  100.            0.        ]


### Question 2:

In [3]:
# Run policy iteration, passing an all-zero integer vector with one entry per
# state (presumably the initial policy, i.e. action 0 everywhere — confirm
# against MDP.policyIteration).
policy, V, nIterations = mdp.policyIteration(np.zeros(mdp.nStates, dtype=int))

# Report the iteration count, the resulting policy and its values.
print("----------------------------------------------------------------")
print("[DEBUG] policyIteration nIterations: {}".format(nIterations))
print("policy:", policy, sep="\n")
print("V:", V, sep="\n")

----------------------------------------------------------------
[DEBUG] policyIteration nIterations: 5
policy:
[3 3 3 1 3 3 3 1 1 3 3 1 3 3 3 0 0]
V:
[  60.63256172   66.03897428   71.8062328    77.09295576   59.81945165
   65.18457679   77.83151901   84.14149059   58.0955782     7.98862928
   84.86730581   91.78165089   69.4968138    76.80991653   91.78165089
  100.            0.        ]


### Question 3:

In [4]:
# Run modified policy iteration with its default nEvalIterations. The two
# positional arguments are an all-zero integer vector and an all-zero float
# vector, one entry per state (presumably the initial policy and the initial
# value estimate — confirm against MDP.modifiedPolicyIteration).
policy, V, nIterations, epsilon = mdp.modifiedPolicyIteration(
    np.zeros(mdp.nStates, dtype=int),
    np.zeros(mdp.nStates),
    tolerance=0.01,
)

# Report the iteration count, the final epsilon, the policy and the values.
print("----------------------------------------------------------------")
print("[DEBUG] modifiedPolicyIteration nIterations: {}, epsilon: {}".format(nIterations, epsilon))
print("policy:", policy, sep="\n")
print("V:", V, sep="\n")

----------------------------------------------------------------
[DEBUG] modifiedPolicyIteration nIterations: 5, epsilon: 0.03999550518275896
policy:
[3 3 3 1 3 3 3 1 1 3 3 1 3 3 3 0 0]
V:
[  60.55565103   66.00541979   71.78950745   77.08500938   59.77621736
   65.16570516   77.8247663    84.13898151   58.07978244    7.98208916
   84.86513537   91.78120374   69.48912236   76.80733671   91.78120374
  100.            0.        ]


In [5]:
# Repeat modified policy iteration for nEvalIterations = 1..10, restarting
# each run from all-zero inputs, and print the outcome of every run.
# NOTE(review): in the recorded run the reported epsilon (e.g. ~2.51 for
# nEvalIterations=1) is far above the requested tolerance of 0.01 — confirm
# the stopping rule of modifiedPolicyIteration in MDP.py.
for nEvalIterations in range(1, 11):
    policy, V, nIterations, epsilon = mdp.modifiedPolicyIteration(
        np.zeros(mdp.nStates, dtype=int),
        np.zeros(mdp.nStates),
        nEvalIterations=nEvalIterations,
        tolerance=0.01,
    )
    print("---------------------- nEvalIterations: {} ---------------------".format(nEvalIterations))
    print("[DEBUG] modifiedPolicyIteration nIterations: {}, epsilon: {}".format(nIterations, epsilon))
    print("policy:", policy, sep="\n")
    print("V:", V, sep="\n")

---------------------- nEvalIterations: 1 ---------------------
[DEBUG] modifiedPolicyIteration nIterations: 6, epsilon: 2.5124590505248747
policy:
[3 3 3 1 3 3 3 1 1 3 3 1 3 3 3 0 0]
V:
[  55.70972755   63.82696405   70.74863933   76.58226105   57.06599793
   63.99308755   77.39484437   83.98897815   57.1337347     7.56465366
   84.7369633    91.75321171   69.02356996   76.66530748   91.75321171
  100.            0.        ]
---------------------- nEvalIterations: 2 ---------------------
[DEBUG] modifiedPolicyIteration nIterations: 6, epsilon: 0.19266242376422582
policy:
[3 3 3 1 3 3 3 1 1 3 3 1 3 3 3 0 0]
V:
[  60.24319471   65.85970324   71.7232826    77.05003039   59.57241234
   65.09507114   77.79405884   84.1293218    58.01666457    7.95113392
   84.85712515   91.77913881   69.45452201   76.797928     91.77913881
  100.            0.        ]
---------------------- nEvalIterations: 3 ---------------------
[DEBUG] modifiedPolicyIteration nIterations: 5, epsilon: 0.4996394094650114