In [1]:
from MDP import *

''' Construct simple MDP as described in Lecture 2a Slides 13-14'''
# Transition function, indexed as T[action][state][next state]
# (|A| x |S| x |S'|). Each inner row is the next-state distribution
# for one (action, state) pair.
T = np.array([
    # action 0
    [[0.5, 0.5, 0.0, 0.0],
     [0.0, 1.0, 0.0, 0.0],
     [0.5, 0.5, 0.0, 0.0],
     [0.0, 1.0, 0.0, 0.0]],
    # action 1
    [[1.0, 0.0, 0.0, 0.0],
     [0.5, 0.0, 0.0, 0.5],
     [0.5, 0.0, 0.5, 0.0],
     [0.0, 0.0, 0.5, 0.5]],
])
# Reward function, indexed as R[action][state] (|A| x |S|).
# Both actions share the same per-state rewards.
R = np.array([
    [0, 0, 10, 10],
    [0, 0, 10, 10],
])
# Discount factor: scalar in [0,1)
discount = 0.9
# Bundle the model into the MDP object used by every cell below.
mdp = MDP(T, R, discount)

### Value Iteration with Extract Policy

In [3]:
'''Test each procedure'''
# Each entry lists the extra keyword arguments for one valueIteration run;
# the empty dict exercises the method's defaults. Every run starts from an
# all-zero value vector and is followed by policy extraction from the result.
_valueIterationRuns = (
    {},
    {'tolerance': 0.0},
    {'nIterations': 100},
    {'nIterations': 100, 'tolerance': 0.0},
)
for _runOptions in _valueIterationRuns:
    [V,nIterations,epsilon] = mdp.valueIteration(initialV=np.zeros(mdp.nStates), **_runOptions)
    print ("[DEBUG] valueIteration: {}".format([V,nIterations,epsilon]))
    policy = mdp.extractPolicy(V)
    print ("[DEBUG] extractPolicy: {}".format(policy))

[DEBUG] valueIteration: [array([ 31.49636306,  38.51527513,  43.935435  ,  54.1128575 ]), 58, 0.0098601388198389373]
[DEBUG] extractPolicy: [0 1 1 1]
[DEBUG] valueIteration: [array([ 31.58510431,  38.60401638,  44.02417625,  54.20159875]), 336, 0.0]
[DEBUG] extractPolicy: [0 1 1 1]
[DEBUG] valueIteration: [array([ 31.49636306,  38.51527513,  43.935435  ,  54.1128575 ]), 58, 0.0098601388198389373]
[DEBUG] extractPolicy: [0 1 1 1]
[DEBUG] valueIteration: [array([ 31.58404185,  38.60295392,  44.0231138 ,  54.2005363 ]), 100, 0.00011805066172598799]
[DEBUG] extractPolicy: [0 1 1 1]


### Evaluate Policy

In [5]:
# Evaluate several fixed policies (one action index per state) and print
# the value vector returned for each.
_policiesToEvaluate = (
    [1,0,1,0],
    [1,0,0,0],
    [1,1,1,0],
    [0,1,1,1],
)
for _actions in _policiesToEvaluate:
    V = mdp.evaluatePolicy(np.array(_actions))
    print ("[DEBUG] evaluatePolicy: {}".format(V))

[DEBUG] evaluatePolicy: [  0.           0.          18.18181818  10.        ]
[DEBUG] evaluatePolicy: [  0.   0.  10.  10.]
[DEBUG] evaluatePolicy: [  0.           7.56302521  18.18181818  16.80672269]
[DEBUG] evaluatePolicy: [ 31.58510431  38.60401638  44.02417625  54.20159875]


### Policy Iteration

In [7]:
# Run policy iteration from three different starting policies and print
# the returned policy, value vector and iteration count for each.
_startingPolicies = (
    [0,0,0,0],
    [1,0,0,0],
    [1,1,1,0],
)
for _actions in _startingPolicies:
    [policy,V,iterId] = mdp.policyIteration(np.array(_actions))
    print ("[DEBUG] policyIteration: {}".format([policy,V,iterId]))

[DEBUG] policyIteration: [array([0, 1, 1, 1], dtype=int64), array([ 31.58510431,  38.60401638,  44.02417625,  54.20159875]), 2]
[DEBUG] policyIteration: [array([0, 1, 1, 1], dtype=int64), array([ 31.58510431,  38.60401638,  44.02417625,  54.20159875]), 2]
[DEBUG] policyIteration: [array([0, 1, 1, 1], dtype=int64), array([ 31.58510431,  38.60401638,  44.02417625,  54.20159875]), 2]


### Partial Policy Iteration

In [9]:
# Each case pairs a policy with the second positional argument of
# evaluatePolicyPartially (presumably the initial value estimate -- confirm
# against the MDP class).
_partialEvaluationCases = (
    ([1,0,1,0], [0,10,0,13]),
    ([1,1,1,0], [10,0,15,1]),
)
for _actions, _startValues in _partialEvaluationCases:
    [V,iterId,epsilon] = mdp.evaluatePolicyPartially(np.array(_actions), np.array(_startValues))
    print ("[DEBUG] evaluatePolicyPartially: {}".format([V,iterId,epsilon]))

[DEBUG] evaluatePolicyPartially: [array([  0.        ,   0.08727964,  18.18181818,  10.08727964]), 45, 0.0096977372978752641]
[DEBUG] evaluatePolicyPartially: [array([  0.08727964,   7.65030482,  18.26909782,  16.89400229]), 45, 0.0096977372978752363]


### Modified Policy Iteration
The following code shows results for running modified policy iteration with `nEvalIterations` set to $\infty$ to achieve the same behavior as policy iteration. The results match: the number of iterations needed to converge to the optimal policy is $2$ in both cases. I also set `nEvalIterations` to $0$, and the number of iterations until convergence then matches the results from value iteration: $336$ and $58$ for $\epsilon=0.0$ and $\epsilon=0.01$, respectively.

In [12]:
# Same starting policy and vector, first with the method's default settings
# and then with a single evaluation iteration per improvement step.
for _evalOptions in ({}, {'nEvalIterations': 1}):
    [policy,V,iterId,tolerance] = mdp.modifiedPolicyIteration(
        np.array([1,0,1,0]), np.array([0,10,0,13]), **_evalOptions)
    print ("[DEBUG] modifiedPolicyIteration: {}".format([policy,V,iterId,tolerance]))

# Policy Iteration using Modified Policy Iteration (nEvalIterations=infinite)
print ("POLICY ITERATION")
[policy,V,iterId,tolerance] = mdp.modifiedPolicyIteration(
    np.array([1,0,1,0]), np.array([0,10,0,13]), nEvalIterations=np.inf)
print ("[DEBUG] modifiedPolicyIteration: {}".format([policy,V,iterId,tolerance]))
[policy,V,iterId,tolerance] = mdp.modifiedPolicyIteration(
    np.zeros(mdp.nStates,dtype=int), np.zeros(mdp.nStates), nEvalIterations=np.inf, tolerance=0.0)
print ("[DEBUG] modifiedPolicyIteration: {}".format([policy,V,iterId,tolerance]))

# Value Iteration using Modified Policy Iteration (nEvalIterations=0):
# first with zero tolerance, then with the method's default tolerance.
print ("VALUE ITERATION")
for _stopOptions in ({'tolerance': 0.0}, {}):
    [policy,V,iterId,tolerance] = mdp.modifiedPolicyIteration(
        np.zeros(mdp.nStates,dtype=int), np.zeros(mdp.nStates), nEvalIterations=0, **_stopOptions)
    print ("[DEBUG] modifiedPolicyIteration: {}".format([policy,V,iterId,tolerance]))

[DEBUG] modifiedPolicyIteration: [array([0, 1, 1, 1], dtype=int64), array([ 31.49727025,  38.51618232,  43.9363422 ,  54.1137647 ]), 11, 0.0087834054981215104]
[DEBUG] modifiedPolicyIteration: [array([0, 1, 1, 1], dtype=int64), array([ 31.49636306,  38.51527513,  43.935435  ,  54.1128575 ]), 29, 0.0088741249378472276]
POLICY ITERATION
[DEBUG] modifiedPolicyIteration: [array([0, 1, 1, 1], dtype=int64), array([ 31.50048281,  38.51939488,  43.93955476,  54.11697725]), 2, 0.00846214972925452]
[DEBUG] modifiedPolicyIteration: [array([0, 1, 1, 1], dtype=int64), array([ 31.58510431,  38.60401638,  44.02417625,  54.20159875]), 2, 0.0]
VALUE ITERATION
[DEBUG] modifiedPolicyIteration: [array([0, 1, 1, 1], dtype=int64), array([ 31.58510431,  38.60401638,  44.02417625,  54.20159875]), 336, 0.0]
[DEBUG] modifiedPolicyIteration: [array([0, 1, 1, 1], dtype=int64), array([ 31.48650292,  38.50541499,  43.92557486,  54.10299736]), 58, 0.0098601388198389373]
