In [1]:
import numpy as np
import pandas as pd

In [2]:
class env:
    """Hard-coded toy MDP with three states and three actions.

    Every instance carries the same fixed tables; the constructor takes
    no arguments.
    """

    def __init__(self):
        # Number of states and number of actions.
        self.nS, self.nA = 3, 3

        # 3x3 reward table; value_iteration reads it as reward[state, action].
        reward_rows = [
            [1, 0, 0],
            [0.5, 0, 0.5],
            [0.5, 0.5, 0],
        ]
        self.reward = np.matrix(reward_rows)

        # One 3x3 table per state key (0, 1, 2). value_iteration reads
        # P[state][action, next_state]; each row below sums to 1.
        transition_rows = {
            0: [[0.8, 0.1, 0.1], [0.5, 0, 0.5], [0.2, 0.4, 0.4]],
            1: [[0.3, 0.3, 0.4], [0.2, 0.1, 0.7], [0.4, 0.4, 0.2]],
            2: [[0.1, 0.5, 0.4], [0.3, 0.5, 0.2], [0, 0.1, 0.9]],
        }
        self.P = {state: np.matrix(rows) for state, rows in transition_rows.items()}

In [3]:
def value_iteration(env, theta=0.0001, discount_factor=0.5):
    """
    Value Iteration Algorithm.

    Sweeps over all states, replacing each state's value estimate (in place)
    with the best one-step lookahead value, until the largest change seen in
    a sweep is below ``theta``. A deterministic greedy policy is then read
    off the final value estimates.

    Args:
        env: the environment.
            env.P[s] is a matrix with (i,j)th element representing P(j|s,i),
                i.e. the probability of moving to state j when action i is
                taken in state s.
            env.reward[s, a] is the reward credited for taking action a in
                state s.
            env.nS is the number of states in the environment.
            env.nA is the number of actions in the environment.
        theta: We stop evaluation once our value function change is less than
            theta for all states. Must be strictly positive.
        discount_factor: Gamma discount factor. The sweep loop has no
            iteration cap, so a value of 1 or more may keep the stopping
            test from ever being met.

    Returns:
        A tuple (policy, V). ``policy`` is an (env.nS, env.nA) array whose
        row s holds 1.0 at the greedy action for state s and 0.0 elsewhere;
        ``V`` is a length-env.nS array of value estimates.

    Raises:
        ValueError: if ``theta`` is not strictly positive (including NaN).
    """
    # The stopping test is `delta < theta` with delta >= 0, so a zero,
    # negative or NaN threshold could never be met and the loop would hang.
    if not theta > 0:
        raise ValueError("theta must be strictly positive, got {!r}".format(theta))

    def one_step_lookahead(state, V):
        """Return the expected value of every action taken from ``state`` under ``V``."""
        transitions = env.P[state]
        A = np.zeros(env.nA)
        for a in range(env.nA):
            # The reward depends only on (state, action); look it up once per action.
            immediate = env.reward[state, a]
            for j in range(env.nS):
                A[a] += transitions[a, j] * (immediate + discount_factor * V[j])
        return A

    V = np.zeros(env.nS)
    while True:
        # Largest absolute change of any state's value during this sweep.
        delta = 0
        for s in range(env.nS):
            best_action_value = np.max(one_step_lookahead(s, V))
            delta = max(delta, np.abs(best_action_value - V[s]))
            # In-place update: later states in the same sweep see the new value.
            V[s] = best_action_value
        if delta < theta:
            break

    # Greedy policy extraction; np.argmax picks the lowest-index action on ties.
    policy = np.zeros([env.nS, env.nA])
    for s in range(env.nS):
        best_action = np.argmax(one_step_lookahead(s, V))
        policy[s, best_action] = 1.0

    return policy, V

In [4]:
# Instantiate the toy MDP defined by the `env` class for the solver call below.
env1 = env()

In [5]:
# Solve the MDP: op_policy gets the greedy one-hot policy, V the value estimates.
op_policy, V = value_iteration(env=env1, discount_factor=0.5)