# Predicting the value of states in the Frozen Lake Environment

In [1]:
import gym
import pandas as pd 

In [2]:
env = gym.make('FrozenLake-v1')

In [3]:
def random_policy():
    return env.action_space.sample()

In [8]:
V = {}
for s in range(env.observation_space.n):
    V[s] = 0.0
    alpha = 0.85
    gamma = 0.90
    num_episodes = 50000
    num_timesteps = 1000
    for i in range(num_episodes):
        s = env.reset()
        for t in range(num_timesteps):
            a = random_policy()
            s_, r, done, _ = env.step(a)
            V[s_] = s_
            V[s] += alpha * (r + gamma * V[s_] -V[s])
            s = s_
            if done:
                break

In [13]:
env.reset()
x = env.step(1)

In [21]:
s_, r, done, _, t = x

In [22]:
t

{'prob': 0.3333333333333333}

In [9]:
df = pd.DataFrame(list(V.items()), columns=['state', 'value'])

In [10]:
df

Unnamed: 0,state,value
0,0,0.765
1,4,0.6
2,5,5.0
3,8,4.26
4,12,12.0
5,9,9.0
6,1,1.68
7,2,2.595
8,6,4.725
9,3,5.805


# Computing Optimal Policy using SARSA

In [1]:
import gym 
import pandas as pd
import random

In [2]:
env = gym.make('FrozenLake-v1')

In [3]:
Q = {}
for s in range(env.observation_space.n):
    for a in range(env.action_space.n):
        Q[(s, a)] = 0.0

In [4]:
def epsilon_greedy(state, epsilon):
    if random.uniform(0, 1) < epsilon:
        return env.action_space.sample()
    else:
        return max(list(range(env.action_space.n)), key = lambda x: Q[(state, x)])

In [5]:
alpha = 0.85
gamma = 0.90
epsilon = 0.8

In [6]:
num_episodes = 5000
num_timesteps = 1000

In [7]:
# for each episode 
for i in range(20000):
    
    # initialize the state by reseting the environment
    s = env.reset()
    
    # select the action using the epsilon-greedy policy
    a = epsilon_greedy(s, epsilon)
    
    # for each step in the episode
    for t in range(num_timesteps):
        
        # perform the selected action and store the next state information
        s_, r, done, _ = env.step(a)
        
        # select the action a dash in the next state using the epsilon greedy policy
        a_ = epsilon_greedy(s_, epsilon)
        
        # compute the Q value of the state-action pair
        Q[(s, a)] = Q[(s, a)] + alpha * (r + gamma * Q[(s_, a_)] - Q[(s, a)])
        
        # update next state to current state
        s_ = s
        
        # update next action to current action
        a_ = a
        
        # if the current state is the terminal state then break
        if done:
            break

# Computing the Optimal Policy using Q Learning

In [8]:
import gym
import pandas as pd
import random
import numpy as np

In [9]:
env = gym.make('FrozenLake-v1')

In [10]:
Q = {}
for s in range(env.observation_space.n):
    for a in range(env.action_space.n):
        Q[(s, a)] = 0.0

In [11]:
def epsilon_greedy(state, epsilon):
    if random.uniform(0, 1) < epsilon:
        return env.action_space.sample()
    else:
        return max(list(range(env.action_space.n)), key = lambda x: Q[(state, x)])

In [12]:
alpha = 0.85
gamma = 0.90
epsilon = 0.80

In [13]:
num_episodes = 5000
num_steps = 1000

In [17]:
# for each episode:
for i in range(num_episodes):
    
    # initialize the state by resetting the environment
    s = env.reset()
    
    # for each step in the episode
    for t in range(num_steps):
        
        # select the action using the epsilon-greedy policy
        a = epsilon_greedy(s, epsilon)
        
        # perform the selected action and store the next state information
        s_, r, done, _ = env.step(a)
        
        # first, select the action a dash which has a maximum Q value in the next state
        a_ = np.argmax([Q[s, a] for a in range(env.action_space.n)])
        
        # we calculate the value of previous state using our update rule 
        Q[(s, a)] = Q[(s, a)] + alpha * (r + gamma * Q[(s_, a_)] - Q[(s, a)])
        
        # update current state to next state
        s = s_
        
        # if current state is the terminal state then break
        if done:
            break
        
        