In [1]:
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import random

In [2]:
class MDP:
    """Square gridworld whose state values are computed by synchronous sweeps.

    States are ``(row, col)`` cells of a ``size x size`` grid and actions are
    ``(d_row, d_col)`` offsets added to the state. Two cells in the top row
    are special: every action taken from ``(0, 1)`` jumps to ``(4, 1)`` with
    reward 10 and every action from ``(0, 3)`` jumps to ``(2, 3)`` with
    reward 5. A move that would leave the grid keeps the agent in place and
    gives reward -1; every other move gives reward 0.

    NOTE(review): the method names and the special cells presumably refer to
    the gridworld of Figures 3.2 / 3.5 in Sutton & Barto -- confirm. Because
    the jump targets are hard-coded, the grid needs ``size >= 5`` for them to
    be valid indices.

    Parameters
    ----------
    size : int
        Number of rows (and columns) of the grid.
    policy : float
        Probability with which each action is selected in every state; it is
        applied as one scalar weight to every action's return.
    discount : float
        Discount factor applied to the value of the successor state.
    actions : iterable of array-like
        The ``(d_row, d_col)`` offsets available in every state.
    """

    def __init__(self, size, policy, discount, actions):
        self.size = size
        self.policy = policy
        self.discount = discount
        self.actions = actions

    def getNextState(self, state, action):
        """Return ``(next_state, reward)`` for taking ``action`` in ``state``.

        The two special cells ignore ``action`` entirely. When the move would
        leave the grid, the very same ``state`` object is returned unchanged.
        """
        if np.array_equal(state, np.array([0, 1])):
            return np.array([4, 1]), 10
        elif np.array_equal(state, np.array([0, 3])):
            return np.array([2, 3]), 5
        new_state = state + action
        new_i, new_j = new_state
        if new_i < 0 or new_i > self.size - 1 or new_j < 0 or new_j > self.size - 1:
            # Bumping into the wall: stay put and pay the penalty.
            return state, -1
        return new_state, 0

    def _action_returns(self, value_func, i, j):
        """One-step returns ``reward + discount * V(next)`` for each action from ``(i, j)``."""
        returns = []
        for a in self.actions:
            (new_i, new_j), reward = self.getNextState(np.array([i, j]), a)
            returns.append(reward + self.discount * value_func[new_i, new_j])
        return returns

    def _expected_return(self, returns):
        """Weight every action return by ``self.policy`` and add them up in order."""
        total = 0.0
        for q in returns:
            total += self.policy * q
        return total

    def _iterate(self, backup, tol):
        """Sweep the grid until the summed absolute change is no longer above ``tol``.

        ``backup`` maps the list of per-action returns of one state to that
        state's new value. Each sweep reads only the previous sweep's values
        (synchronous update). At least one sweep is always performed.
        """
        value_func = np.zeros((self.size, self.size))
        while True:
            updated_values = np.zeros_like(value_func)
            for i in range(self.size):
                for j in range(self.size):
                    updated_values[i, j] = backup(self._action_returns(value_func, i, j))
            error = np.sum(np.abs(value_func - updated_values))
            value_func = updated_values
            # Same stop rule as ``while error > tol`` (a NaN error also stops).
            if not error > tol:
                break
        return np.round(value_func, decimals=1)

    def generateFig32(self, tol=1e-4):
        """State values under the fixed policy (iterative policy evaluation).

        Parameters
        ----------
        tol : float, optional
            Sweeping stops once the sum of absolute value changes over the
            grid is no longer greater than ``tol``.

        Returns
        -------
        numpy.ndarray
            ``size x size`` array of state values rounded to one decimal.
        """
        return self._iterate(self._expected_return, tol)

    def generateFig35(self, tol=1e-4):
        """Optimal state values (value iteration: best action return per state).

        Parameters
        ----------
        tol : float, optional
            Sweeping stops once the sum of absolute value changes over the
            grid is no longer greater than ``tol``.

        Returns
        -------
        numpy.ndarray
            ``size x size`` array of state values rounded to one decimal.
        """
        return self._iterate(np.max, tol)
            

In [3]:
# 5x5 grid; each of the four (d_row, d_col) moves is weighted 0.25, discount 0.9.
mdp = MDP(
    size=5,
    policy=0.25,
    discount=0.9,
    actions=np.array([[0, -1], [0, 1], [1, 0], [-1, 0]]),
)

# State values under that fixed policy, rounded to one decimal place.
valueFig32 = mdp.generateFig32()
print(valueFig32)

[[ 3.3  8.8  4.4  5.3  1.5]
 [ 1.5  3.   2.3  1.9  0.5]
 [ 0.1  0.7  0.7  0.4 -0.4]
 [-1.  -0.4 -0.4 -0.6 -1.2]
 [-1.9 -1.3 -1.2 -1.4 -2. ]]


In [4]:
# Values obtained by taking the best action return in every state
# (rounded to one decimal place); reuses the `mdp` built in the cell above.
valueFig35 = mdp.generateFig35()

print(valueFig35)

[[22.  24.4 22.  19.4 17.5]
 [19.8 22.  19.8 17.8 16. ]
 [17.8 19.8 17.8 16.  14.4]
 [16.  17.8 16.  14.4 13. ]
 [14.4 16.  14.4 13.  11.7]]
