Part 6. MDP model solving

In [13]:
import mdptoolbox, mdptoolbox.example
import pandas as pd
import numpy as np

In [None]:
# Load the two transition-probability tables produced in part 5.
def _read_transition_tensor(csv_path):
    """Read one CSV and return its values as a 3 x 4 x 4 array."""
    table = pd.read_csv(csv_path)
    values = table.iloc[:, 1:]  # drop the first column (the saved index)
    # Reshape to (actions × clusters × clusters)
    return np.array(values).reshape(3, 4, 4)


P = _read_transition_tensor("dataset/P.csv")
P2 = _read_transition_tensor("dataset/P2.csv")


In [None]:
#define the reward function
def Reward(k, n_states=4, n_actions=3, penalized_action=2, penalized_state=1):
    """Build the (state × action) reward matrix passed to the MDP solver.

    Every entry starts at 0. The column ``penalized_action`` is lowered by
    ``k`` and the row ``penalized_state`` is lowered by 1, so the entry where
    they intersect ends up at ``-1 - k``.

    With the default arguments the result is::

        [[ 0,  0, -k  ],   # Cluster 0
         [-1, -1, -1-k],   # Cluster 1
         [ 0,  0, -k  ],   # Cluster 2
         [ 0,  0, -k  ]]   # Cluster 3

    Parameters
    ----------
    k : float
        Amount subtracted from every reward in column ``penalized_action``.
    n_states : int, optional
        Number of rows of the matrix (default 4).
    n_actions : int, optional
        Number of columns of the matrix (default 3).
    penalized_action : int, optional
        Column index that receives the ``-k`` term (default 2).
    penalized_state : int, optional
        Row index that receives the ``-1`` term (default 1).

    Returns
    -------
    numpy.ndarray
        Float array of shape ``(n_states, n_actions)``.

    Raises
    ------
    IndexError
        If ``penalized_action`` or ``penalized_state`` lies outside the matrix.
    """
    R = np.zeros((n_states, n_actions))
    R[:, penalized_action] -= k
    R[penalized_state, :] -= 1
    return R

In [None]:
## Value iteration with transition probabilities = P
k = 0.1
discount = 0.99  # used by both the solver and the action-value table below

R = Reward(k)
## epsilon -> Convergence threshold
## NOTE(review): with discount < 1 pymdptoolbox documents that it computes its
## own bound on the number of iterations, so max_iter=10 may not be the
## effective cap -- confirm against the library documentation.
mdp_model = mdptoolbox.mdp.ValueIteration(P, R, discount, epsilon=0.01, max_iter=10)
mdp_model.run()
print(f'optimal policy when reward is -{k} is {mdp_model.policy}', "\n")

# Convert the solver's value function to an array once instead of per entry.
V = np.array(mdp_model.V)
n_states, n_actions = R.shape
action_value = np.zeros((n_states, n_actions))
for i in range(n_states):
    for j in range(n_actions):
        ## @ -> Matrix multiplication (equivalent to np.dot())
        ## Q(i,j) = V(i) + [ Σ P(i'|i,j)*V(i') + R(i,j) ] * γ
        ## [V1, V2, V3, V4] @ [P1, P2, P3, P4] = V1*P1 + V2*P2 + V3*P3 + V4*P4
        ## NOTE(review): this is not the usual Bellman backup
        ## R(i,j) + γ * Σ P(i'|i,j)*V(i'): V(i) is added and the reward is
        ## discounted as well. Kept as written -- confirm it is intended.
        action_value[i, j] = V[i] + (V @ P[j, i, :] + R[i, j]) * discount

print(f'State - Action has policy values as follows \n {action_value}')

optimal policy when reward is -0.1 is (0, 1, 0, 1) 

State - Action has policy values as follows 
 [[-1.63161999 -1.63761482 -1.72905749]
 [-4.17485407 -4.06360818 -4.22286958]
 [-1.53555504 -1.55681276 -1.68182917]
 [-1.59913236 -1.53673146 -1.64524798]]


In [None]:
## Value iteration with transition probabilities = P2

k = 0.1
discount = 0.99  # used by both the solver and the action-value table below

R = Reward(k)
## epsilon -> Convergence threshold
## NOTE(review): with discount < 1 pymdptoolbox documents that it computes its
## own bound on the number of iterations, so max_iter=10 may not be the
## effective cap -- confirm against the library documentation.
mdp_model = mdptoolbox.mdp.ValueIteration(P2, R, discount, epsilon=0.01, max_iter=10)
mdp_model.run()
print(f'optimal policy when reward is -{k} is {mdp_model.policy}', "\n")

# Convert the solver's value function to an array once instead of per entry.
V = np.array(mdp_model.V)
n_states, n_actions = R.shape
action_value = np.zeros((n_states, n_actions))
for i in range(n_states):
    for j in range(n_actions):
        ## Q(i,j) = V(i) + [ Σ P2(i'|i,j)*V(i') + R(i,j) ] * γ
        ## NOTE(review): this is not the usual Bellman backup
        ## R(i,j) + γ * Σ P2(i'|i,j)*V(i'): V(i) is added and the reward is
        ## discounted as well. Kept as written -- confirm it is intended.
        action_value[i, j] = V[i] + (V @ P2[j, i, :] + R[i, j]) * discount

print(f'State - Action has policy values as follows \n {action_value}')

optimal policy when reward is -0.1 is (1, 0, 0, 1) 

State - Action has policy values as follows 
 [[-1.85455622 -1.85239429 -2.01545719]
 [-4.31171793 -4.35041276 -4.49935547]
 [-1.80006791 -1.8290447  -1.93274651]
 [-1.77668853 -1.74108697 -1.86804464]]


In [17]:
# Export the results: write the action-value table to a CSV file.
# NOTE(review): `action_value` is reassigned by each value-iteration cell, so
# on a top-to-bottom run only the P2 table is saved -- confirm this is intended.
results_frame = pd.DataFrame(action_value)
results_frame.to_csv("action_va.csv")