In [1]:
#Import Libraries
import numpy as np

In [2]:
#Defining MDP
class Problem:
    """Three-state, two-action Markov decision process with in-place value iteration.

    Attributes:
        states: maps state index -> state name.
        state_vals: current value estimate for each state index (starts at zero).
        actions: maps action index -> action name.
        p: maps action name -> transition matrix; ``p[a][s, s2]`` is the
            probability of moving from state index ``s`` to ``s2`` under ``a``.
        policy: maps state name -> greedy action name; filled in by ``value_iter``.
        gamma: discount factor applied to successor-state values.
    """

    def __init__(self):
        # State space
        self.states = {0: "low", 1: "average", 2: "high"}
        # State value function, one entry per state
        self.state_vals = np.zeros(len(self.states))
        # Action space
        self.actions = {0: "keep", 1: "replace"}
        # Transition matrices, keyed by action name (each row sums to 1)
        self.p = {
            self.actions[0]: np.array([[1, 0, 0], [0.8, 0.2, 0], [0.6, 0.4, 0]]),
            self.actions[1]: np.array([[0.6, 0.4, 0], [0, 0.4, 0.6], [0, 0, 1]]),
        }
        self.policy = {}
        # Discount factor
        self.gamma = 0.99

    def get_reward(self, state, action):
        """Return the immediate reward for a state name and an action name.

        "keep" always yields 0. "replace" yields -1 when ``state`` is "high"
        and 1 for every other state.

        Raises:
            KeyError: if ``action`` is not one of the names in ``self.actions``.
        """
        if action == self.actions[0]:
            return 0
        if action == self.actions[1]:
            return -1 if state == self.states[2] else 1
        raise KeyError(f"unknown action: {action!r}")

    def value_iter(self, state):
        """Apply one Bellman optimality backup to the state at index ``state``.

        Computes the expected discounted return of every action from ``state``
        using the current ``self.state_vals``, then stores the best value in
        ``self.state_vals[state]`` and the maximising action name in
        ``self.policy``. Ties go to the lowest action index. Returns ``None``.
        """
        # One slot per action, so only real actions compete in the max below.
        action_values = np.zeros(len(self.actions))
        for i in range(len(self.actions)):
            action = self.actions[i]
            for j in range(len(self.states)):
                # The reward is looked up on the successor state j.
                r = self.get_reward(self.states[j], action)
                action_values[i] += self.p[action][state, j] * (
                    r + self.gamma * self.state_vals[j]
                )
        # Greedy choice: keep the best value and remember which action gave it.
        best = int(np.argmax(action_values))
        self.state_vals[state] = action_values[best]
        self.policy[self.states[state]] = self.actions[best]

In [3]:
problem = Problem()
# Number of completed sweeps over the state space
iters = 0
# Convergence threshold: stop once no state value moved by more than eps in a sweep
eps = 0.01
# Seed above any threshold so at least one sweep always runs
delta = float("inf")
while delta > eps:
    delta = 0
    iters += 1
    for j in range(len(problem.states)):
        oldvalue = problem.state_vals[j]
        # In-place backup: later states in this sweep already see the new value
        problem.value_iter(j)
        # Track the largest change seen in this sweep
        delta = max(delta, abs(oldvalue - problem.state_vals[j]))
            


In [4]:
# Report the greedy policy, the value estimates and the sweep counter, one per line.
summary = (
    ("Optimal Policy:", problem.policy),
    ("Final state value function:", problem.state_vals),
    ("Iterations:", iters),
)
for label, result in summary:
    print(label, result)

Optimal Policy: {'high': 'walk', 'low': 'recharge', 'out_of_charge': 'recharge'}
Final state value function: [66.22653404 65.39834343 65.23630522]
Iterations: 332
