In [29]:
# Import necessary libraries

import gym
import numpy as np

In [30]:
# Initialize the FrozenLake-v0 gym environment
env = gym.make('FrozenLake-v0')


In [31]:
# Take a quick look at the environment by calling the render function
env.render()



[41mS[0mFFF
FHFH
FFFH
HFFG


### **POLICY EVALUATION**
<img src="./images/policy_eval.png" alt="Policy Evaluation" style="width:700px">

In [32]:
# Function to compute the value function
# This function takes as input the policy and discount factor

def policy_evaluation(policy, gamma=1.0):
    
    # Initialize the value table with zeros
    # The size of the value table is the equal to the number of states
    value_table = np.zeros(env.observation_space.n)
    
    # Set the threshold
    threshold = 1e-10
    
    # Initialize a loop
    while True:
        
        # Save the value table to the updated_value_table
        updated_value_table = value_table

        # For each state in the environment, 
        # select the action according to the policy and compute the value table
        for state in range(env.nS):
            
            # Get the action from the policy based on the state
            action = policy[state]
            
            # For the selected action, compute the value function based on the equation in the instructions
            value_table[state] = sum([trans_prob * (reward_prob + gamma * updated_value_table[next_state]) 
                        for trans_prob, next_state, reward_prob, _ in env.P[state][action]])
            
        # Compare the change in value function
        # End the loop if the change is lower than the threshold
        if (np.sum((np.fabs(updated_value_table - value_table))) <= threshold):
            break
            
    # Return the value table
    return value_table


### **POLICY IMPROVEMENT**
<img src="./images/policy_improv.png" alt="Policy Improvement" style="width:700px">

In [33]:
# Function to extract the optimal policy based on the optimal value function
# The function takes as input the value table and the discount factor

def policy_improvement(value_table, gamma = 1.0):
 
    # Initialize the policy with zeros
    # The size of the policy is equivalent to the number of states
    policy = np.zeros(env.observation_space.n) 
    
    # Loop for each state 
    for state in range(env.observation_space.n):
        
        # Initialize the Q table for a state
        # with zeroes and the number of possible actions
        Q_table = np.zeros(env.action_space.n)
        
        # Loop for each action
        for action in range(env.action_space.n):
            
            # For the given state and action, loop over the next states
            for next_sr in env.P[state][action]: 
                
                # Get the transition probability, next state, reward from the the environment
                trans_prob, next_state, reward, _ = next_sr 
                
                # Update the q-table the particular action based on the equation in the instructions
                Q_table[action] += (trans_prob * (reward + gamma * value_table[next_state]))
        
        # Select the action which has maximum Q value as an optimal action of the state
        policy[state] = np.argmax(Q_table)
    
    # Return the policy
    return policy


### **POLICY ITERATION**
<img src="./images/policy_iter.png" alt="Policy Iteration" style="width:700px">

In [38]:
# Function to perform policy iteration
# This function takes the environment and discount factor

def policy_iteration(env,gamma = 1.0):
    
    # Initialize policy with zeros for the dimension of the number of state
    old_policy = np.zeros(env.observation_space.n)  
    
    # Specify the number of iterations
    no_of_iterations = 200
    
    # Loop over the number of iterations
    for i in range(no_of_iterations):
        
        # Compute the value function calling the policy_evaluation 
        # by passing the policy and discount factor
        new_value_function = policy_evaluation(old_policy, gamma)
        
        # Extract the new policy by calling the policy_improvement function with the 
        # new value function and the discount factor
        new_policy = policy_improvement(new_value_function, gamma)
   
        # Check whether we have reached convergence i.e whether we found the optimal
        # policy by comparing old_policy and new policy.
        # If the policies are the same, break the loop, else update the old policy

        if (np.all(old_policy == new_policy)):
            print ('Policy-Iteration converged at step %d.' %(i+1))
            break
        old_policy = new_policy
        
    return new_policy



In [39]:
# Call the policy_iteration function by passing the environment as a parameter
print (policy_iteration(env, 0.9))

Policy-Iteration converged at step 3.
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 1. 0.]
