In [1]:
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import random

In [2]:
class MDP:
    """Deterministic square gridworld evaluated with synchronous sweeps.

    States are ``(row, col)`` cells of a ``size`` x ``size`` grid. Cells listed
    in ``SPECIAL_TRANSITIONS`` ignore the chosen action and jump to a fixed
    target with a fixed reward. A move that would leave the grid keeps the
    agent where it is and yields ``OFF_GRID_REWARD``; every other move
    yields 0.

    NOTE(review): the method names suggest this reproduces Figures 3.2 and 3.5
    of the Sutton & Barto gridworld example -- confirm against the text.

    NOTE: the jump targets are the fixed cells (4, 1) and (2, 3). The value
    table is indexed with them, so grids smaller than 5 x 5 that contain a
    special cell will fail with an ``IndexError`` during a sweep.
    """

    # (row, col) of a special cell -> ((row, col) it jumps to, reward received).
    SPECIAL_TRANSITIONS = {
        (0, 1): ((4, 1), 10),
        (0, 3): ((2, 3), 5),
    }
    # Reward for an action that would step outside the grid.
    OFF_GRID_REWARD = -1

    def __init__(self, size, policy, discount, actions):
        """Store the problem definition.

        Args:
            size: Number of rows (and columns) of the square grid.
            policy: Scalar weight applied to every action's backed-up value in
                ``generateFig32`` (0.25 in the notebook, i.e. four equally
                likely actions).
            discount: Discount factor multiplying the successor state's value.
            actions: Iterable of ``(d_row, d_col)`` offsets, one per action.
        """
        self.size = size
        self.policy = policy
        self.discount = discount
        self.actions = actions

    def getNextState(self, state, action):
        """Return the successor state and reward for taking ``action`` in ``state``.

        Args:
            state: Length-2 array-like ``(row, col)``.
            action: Length-2 offset added element-wise to ``state``.

        Returns:
            Tuple ``(next_state, reward)``. ``next_state`` is a NumPy array; for
            an off-grid move it is the (unchanged) input state.
        """
        position = np.asarray(state)
        # Only a 1-D state can equal one of the 2-element special cells; the
        # tuple lookup replaces two array allocations and comparisons per call.
        if position.ndim == 1:
            special = self.SPECIAL_TRANSITIONS.get(tuple(position.tolist()))
            if special is not None:
                target, reward = special
                return np.array(target), reward

        new_state = state + action
        new_i, new_j = new_state
        last = self.size - 1
        if new_i < 0 or new_i > last or new_j < 0 or new_j > last:
            # Bumping into the wall: stay put and pay the penalty.
            return position, self.OFF_GRID_REWARD
        return new_state, 0

    def _solve(self, backup, tol):
        """Sweep the grid until the value table stops changing.

        Every sweep builds a fresh table from the previous one (the table is
        not updated in place within a sweep). Iteration stops once the summed
        absolute change between two consecutive tables is at most ``tol``.

        Args:
            backup: Callable mapping the list of per-action targets
                ``reward + discount * V(next_state)`` (in ``self.actions``
                order) to the new value of the state.
            tol: Convergence threshold on the summed absolute change.

        Returns:
            The final value table rounded to one decimal place.
        """
        values = np.zeros((self.size, self.size))
        error = np.inf
        while error > tol:
            swept = np.zeros_like(values)
            for i in range(self.size):
                for j in range(self.size):
                    cell = np.array([i, j])
                    targets = []
                    for a in self.actions:
                        (new_i, new_j), reward = self.getNextState(cell, a)
                        targets.append(reward + self.discount * values[new_i, new_j])
                    swept[i, j] = backup(targets)
            error = np.sum(np.abs(values - swept))
            values = swept
        return np.round(values, decimals=1)

    def generateFig32(self, tol=1e-4):
        """Evaluate the fixed policy that weights every action by ``self.policy``.

        Each state's new value is the sum over actions of
        ``self.policy * (reward + discount * V(next_state))``.

        Args:
            tol: Convergence threshold on the summed absolute change per sweep.

        Returns:
            ``size`` x ``size`` array of state values rounded to one decimal.
        """
        return self._solve(lambda targets: sum(self.policy * t for t in targets), tol)

    def generateFig35(self, tol=1e-4):
        """Compute state values using the best action in every state.

        Each state's new value is the maximum over actions of
        ``reward + discount * V(next_state)``.

        Args:
            tol: Convergence threshold on the summed absolute change per sweep.

        Returns:
            ``size`` x ``size`` array of state values rounded to one decimal.
        """
        return self._solve(max, tol)
            

In [3]:
# Build the 5x5 grid: each of the four moves gets weight 0.25 and future
# values are discounted by 0.9. Actions are (row, col) offsets:
# column -1, column +1, row +1, row -1.
mdp = MDP(
    size=5,
    policy=0.25,
    discount=0.9,
    actions=np.array([[0, -1], [0, 1], [1, 0], [-1, 0]]),
)

# State values under the equally weighted policy, rounded to one decimal.
valueFig32 = mdp.generateFig32()
print(valueFig32)

[[-0.5  10.   -0.25  5.   -0.5 ]
 [-0.25  0.    0.    0.   -0.25]
 [-0.25  0.    0.    0.   -0.25]
 [-0.25  0.    0.    0.   -0.25]
 [-0.5  -0.25 -0.25 -0.25 -0.5 ]]
[[ 1.46875  9.775    3.06875  5.       0.34375]
 [-0.475    2.19375 -0.05625  1.06875 -0.475  ]
 [-0.41875 -0.05625  0.      -0.05625 -0.41875]
 [-0.475   -0.1125  -0.05625 -0.1125  -0.475  ]
 [-0.8375  -0.475   -0.41875 -0.475   -0.8375 ]]
[[ 2.2534375   9.5725      3.7521875   4.949375    0.6728125 ]
 [ 0.37296875  2.0671875   1.42453125  0.9928125  -0.13328125]
 [-0.570625    0.3740625  -0.050625    0.1209375  -0.570625  ]
 [-0.66484375 -0.2390625  -0.14484375 -0.2390625  -0.66484375]
 [-1.090625   -0.66484375 -0.570625   -0.66484375 -1.090625  ]]
[[ 2.75177734  9.40164063  4.18218359  5.10884375  0.88638672]
 [ 0.67766797  2.64241406  1.52135156  1.43135156 -0.03361328]
 [-0.35989844  0.27154687  0.39930469  0.0298125  -0.53075781]
 [-0.82716016 -0.24760547 -0.24735938 -0.30455859 -0.82716016]
 [-1.28996094 -0.82716016

In [4]:
# Reuse the `mdp` built in the previous cell: back up each state with the
# maximum over actions and show the resulting table (rounded to one decimal).
valueFig35 = mdp.generateFig35()

print(valueFig35)

[[22.  24.4 22.  19.4 17.5]
 [19.8 22.  19.8 17.8 16. ]
 [17.8 19.8 17.8 16.  14.4]
 [16.  17.8 16.  14.4 13. ]
 [14.4 16.  14.4 13.  11.7]]
