In [1]:
#Import Libraries
import numpy as np

In [2]:
class Problem():
    def __init__(self):
        #State Space
        self.states = {0:'high',1:'low',2:'out_of_charge'}
        #State Value function
        self.state_vals = np.zeros(len(self.states))
        #Action Space
        self.actions = {0:'recharge',1:'walk',2:'stay'}
        #Probability matrices
        self.p = {self.actions[0]:np.array([[1,0,0],[0.8,0.2,0],[0.6,0.4,0]]), \
                  self.actions[1]:np.array([[0.6,0.4,0],[0,0.4,0.6],[0,0,1]]), \
                  self.actions[2]:np.array([[0.8,0.2,0],[0.2,0.4,0.4],[0,0.2,0.8]])}
        #Inital Random Policy
        self.policy = {'high':'recharge','low':'recharge','out_of_charge':'recharge'}
        #Discount factor
        self.gamma = 0.99
    #Reward for different states
    def get_reward(self,state,action):
        if action == self.actions[0]:
            return 0
        if action == self.actions[1]:
            if state == self.states[2]:
                return -1
            else:
                return 1
        if action == self.actions[2]:
            return 0
    #State-Action value function
    def action_val_func(self,state):
        value = np.zeros(len(self.states))
        for i in range(len(self.actions)):
            for j in range(len(self.states)):
                r = self.get_reward(self.states[j],self.actions[i])
                value[i] += self.p[self.actions[i]][state,j]*(r + \
                                                              self.gamma*self.state_vals[j])
        return self.actions[np.argmax(value)]
    #Policy Evaluation
    def policy_eval(self):
        eps = 0.01
        while True:
            delta = 0
            values = self.state_vals
            for i in range(len(self.states)):
                for j in range(len(self.states)):
                    old_val = self.state_vals[i]
                    r = self.get_reward(self.states[i],self.policy[self.states[i]])
                    self.state_vals[i] += self.p[self.policy[self.states[i]]][i,j]*(r + \
                                                                               self.gamma*values[j])
                delta = max(delta, abs(self.state_vals[i] - old_val))
            if delta < eps:
                break
    #Policy Improvement            
    def policy_improv(self):
        policy_stable = True
        for i in range(len(self.policy)):
            old_action = self.policy[self.states[i]]
            
            new_action = self.action_val_func(i)
            
            self.policy[self.states[i]] = new_action
            
            if old_action != new_action:
                policy_stable = False
        return policy_stable

In [3]:
problem = Problem()
iters = 1
#Policy Iteration
while(True):
    iters += 1
    problem.policy_eval()
    done = problem.policy_improv()
    #Exit if Policy is stable
    if done == True:
        break
print("Optimal Policy:",problem.policy)
print("Optimal state value function:",problem.state_vals)
print("Iterations:",iters)

Optimal Policy: {'high': 'walk', 'low': 'recharge', 'out_of_charge': 'recharge'}
Optimal state value function: [1.         0.948816   0.96973114]
Iterations: 3
