In [1]:
import numpy as np
import pandas as pd

In [29]:
class env:
    """Small hard-coded MDP with three states and three actions.

    The tables are stored as ``np.matrix`` objects and are read by the
    evaluation code with two-index lookups (``reward[s, a]``, ``P[s][a, j]``).
    """

    def __init__(self):
        # Number of states and number of actions.
        self.nS, self.nA = 3, 3

        # reward[s, a]: one row per state, one column per action.
        reward_rows = [
            [1, 0, 0],
            [0.5, 0, 0.5],
            [0.5, 0.5, 0],
        ]
        self.reward = np.matrix(reward_rows)

        # One transition table per state; inside a table, row = action and
        # column = next state, i.e. P[s][a, j] = P(j | s, a).
        transition_rows = (
            [[0.8, 0.1, 0.1], [0.5, 0, 0.5], [0.2, 0.4, 0.4]],
            [[0.3, 0.3, 0.4], [0.2, 0.1, 0.7], [0.4, 0.4, 0.2]],
            [[0.1, 0.5, 0.4], [0.3, 0.5, 0.2], [0, 0.1, 0.9]],
        )
        self.P = {
            state: np.matrix(rows) for state, rows in enumerate(transition_rows)
        }
        
    

In [63]:
def policy_eval(policy, env, discount_factor=0.5, theta=0.00001):
    """
    Iterative policy evaluation using in-place sweeps over the states.

    Args:
        policy: [S, A] shaped matrix; policy[s][a] is the probability of
            taking action a in state s.
        env: Environment description exposing
            env.nS: the number of states,
            env.P[s]: a matrix whose (a, j) entry is P(j | s, a),
            env.reward: a matrix whose (s, a) entry is the reward credited
                for taking action a in state s.
        discount_factor: Gamma discount factor.
        theta: Sweeping stops after the first sweep in which every state's
            value changed by less than theta.

    Returns:
        A new array of length env.nS holding the estimated value of each state.
    """
    n_states = env.nS
    values = np.zeros(n_states)

    converged = False
    while not converged:
        biggest_change = 0
        for state in range(n_states):
            transitions = env.P[state]
            backed_up = 0
            for action, prob in enumerate(policy[state]):
                immediate = env.reward[state, action]
                for nxt in range(n_states):
                    # Expected one-step return, weighted by the policy and
                    # by the transition probability.
                    backed_up += prob * transitions[action, nxt] * (
                        immediate + discount_factor * values[nxt]
                    )
            # Track the largest change seen for any state in this sweep.
            biggest_change = max(biggest_change, np.abs(backed_up - values[state]))
            # In-place update: later states in this sweep see the new value.
            values[state] = backed_up
        converged = biggest_change < theta

    return np.array(values)

In [74]:
# Evaluate three fixed policies on one shared environment instance,
# using policy_eval's default discount factor and threshold.
env1 = env()

# Each row is a state, each column the probability of picking that action.
policy1 = np.array([[1, 0, 0], [1, 0, 0], [0, 1, 0]])
policy2 = np.array([[0.5, 0, 0.5], [0, 0.5, 0.5], [0, 0.5, 0.5]])
policy3 = np.array([[1, 0, 0], [1, 0, 0], [1, 0, 0]])

V1, V2, V3 = (
    policy_eval(candidate, env1) for candidate in (policy1, policy2, policy3)
)

In [88]:
def policy_improvement(env, policy_eval_fn=policy_eval, discount_factor=0.5, initial_policy=None):
    """
    Policy iteration: alternately evaluates the current policy and makes it
    greedy with respect to the resulting value function, until the greedy
    step leaves every row of the policy unchanged.

    Args:
        env: The environment. This function reads env.nS, env.nA,
            env.P[s][a, j] (probability of reaching state j after taking
            action a in state s) and env.reward[s, a].
        policy_eval_fn: Policy Evaluation function that takes 3 arguments:
            policy, env, discount_factor.
        discount_factor: gamma discount factor.
        initial_policy: Optional [S, A] shaped starting policy. It is copied,
            so the caller's array is never modified. When omitted, the
            uniform random policy sized from env.nS and env.nA is used, so
            the function works for environments of any size.

    Returns:
        A tuple (policy, V).
        policy is a matrix of shape [S, A] whose rows are one-hot: each state
        puts probability 1 on its greedy action.
        V is the value function that policy_eval_fn computed for exactly that
        returned policy.

    Raises:
        ValueError: if initial_policy is given and its shape is not
            (env.nS, env.nA).
    """

    def one_step_lookahead(state, V):
        """Return, for every action, the expected return of taking it in `state` under V."""
        A = np.zeros(env.nA)
        for a in range(env.nA):
            for j in range(env.nS):
                A[a] += env.P[state][a, j] * (env.reward[state, a] + discount_factor * V[j])
        return A

    if initial_policy is None:
        policy = np.ones((env.nS, env.nA)) / env.nA
    else:
        # Float copy: rows are overwritten below and must not alias the input.
        policy = np.array(initial_policy, dtype=float)
        if policy.shape != (env.nS, env.nA):
            raise ValueError(
                "initial_policy must have shape (%d, %d), got %s"
                % (env.nS, env.nA, policy.shape)
            )

    # Row k is the one-hot distribution that always picks action k.
    one_hot_rows = np.eye(env.nA)

    while True:
        V = policy_eval_fn(policy, env, discount_factor)
        policy_stable = True
        for s in range(env.nS):
            best_a = np.argmax(one_step_lookahead(s, V))
            greedy_row = one_hot_rows[best_a]

            # Compare whole rows, not argmax: a stochastic row whose argmax
            # happens to equal best_a still changes when it is made greedy,
            # and V must then be re-evaluated for the new policy.
            if not np.array_equal(policy[s], greedy_row):
                policy_stable = False
                policy[s] = greedy_row

        if policy_stable:
            break
    return policy, V

In [89]:
# Run policy iteration on env1 with gamma = 0.5; keep the resulting policy
# and the value function returned alongside it.
op_policy, V = policy_improvement(
    env=env1,
    discount_factor=0.5,
)