# Q* Learning with FrozenLake
The goal of this game is to go from the starting state (S) to the goal state (G) by walking only on frozen tiles (F) and avoid holes (H). However, the ice is slippery, so you won't always move in the direction you intend (stochastic environment).

In [1]:
# Bit of formatting because inline code is not styled very good by default:
from IPython.core.display import HTML
HTML("""<style> .rendered_html code { 
    padding: 2px 4px;
    color: #c7254e;
    background-color: #f9f2f4;
    border-radius: 4px;
} </style>""")

In [2]:
# Get necessary libraries
import numpy as np
import gym
import random

## Step 1: Create Environment from OpenAI Gym

In [3]:
env = gym.make("FrozenLake-v0")
env.reset()                    
env.render()

print("Action space: ", env.action_space)
print("Observation space: ", env.observation_space)


[41mS[0mFFF
FHFH
FFFH
HFFG
Action space:  Discrete(4)
Observation space:  Discrete(16)


## Step 2: Initialize Q table

In [4]:
action_size = env.action_space.n
state_size = env.observation_space.n

qtable = np.zeros((state_size, action_size))
print(qtable)

[[0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]]


## Step 3: Set Hyperparameters

In [5]:
total_episodes = 20000        # Total episodes
learning_rate = 0.5           # Learning rate
max_steps = 99                # Max steps per episode
gamma = 0.95                  # Discounting rate

epsilon = 1.0                # Exploration rate
max_epsilon = 1.0             # Exploration probability at start
min_epsilon = 0.01            # Minimum exploration probability 
decay_rate = 0.005            # Exponential decay rate for exploration prob

## Step 4: Implement Q Learning Algorithm
Initialize Q-values arbitrarily for all state-action pairs <br>
Repeat for each episode:
- Choose an action in the current world state based on current Q-value estimates
- Take action and observe the outcome state and reward
- Update Q function based on the Bellman Equation

In [6]:
rewards = []

for episode in range(total_episodes):
    # Reset the environment
    state = env.reset()
    step = 0
    done = False
    total_rewards = 0
    
    for step in range(max_steps):
        # Choose an action a in the current world state (s)
        ## First we randomize a number
        exp_exp_tradeoff = random.uniform(0, 1)
        
        ## If this number > greater than epsilon --> exploitation (taking the biggest Q value for this state)
        if exp_exp_tradeoff > epsilon:
            action = np.argmax(qtable[state,:])

        # Else doing a random choice --> exploration
        else:
            action = env.action_space.sample()

        # Take the action (a) and observe the outcome state(s') and reward (r)
        new_state, reward, done, info = env.step(action)

        # Update Q(s,a):= Q(s,a) + lr [R(s,a) + gamma * max Q(s',a') - Q(s,a)]
        # qtable[new_state,:] : all the actions we can take from new state
        qtable[state, action] = qtable[state, action] + learning_rate * (reward + gamma * np.max(qtable[new_state, :]) - qtable[state, action])
        
        total_rewards += reward
        
        # Our new state is state
        state = new_state
        
        # If done (if we're dead) : finish episode
        if done == True: 
            break
        
    # Reduce epsilon (because we need less and less exploration)
    epsilon = min_epsilon + (max_epsilon - min_epsilon)*np.exp(-decay_rate*episode) 
    rewards.append(total_rewards)

print ("Score over time: " +  str(sum(rewards)/total_episodes))
print(qtable)

Score over time: 0.51935
[[2.54826851e-01 1.17586286e-01 8.70184184e-02 2.13959630e-01]
 [4.81078203e-03 8.27149832e-02 2.61398402e-02 1.92709431e-01]
 [4.15775095e-02 1.05407985e-01 3.13103616e-02 9.51092966e-02]
 [2.00643268e-02 1.29606446e-02 1.89107883e-02 9.00621156e-02]
 [2.91949963e-01 1.72353181e-01 1.69459458e-02 2.01183074e-01]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00]
 [8.94618075e-02 4.00011730e-03 2.84212837e-03 2.48555951e-04]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00]
 [1.54852861e-01 3.09231757e-01 7.63662925e-02 3.60177599e-01]
 [1.56544861e-01 4.45118136e-01 3.46663453e-01 4.08928828e-02]
 [3.39396860e-01 1.13113458e-02 3.06983923e-02 1.07235886e-01]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00]
 [2.78646091e-01 4.32473121e-01 4.57361209e-01 3.79530165e-01]
 [4.13796472e-01 7.72236735e-01 3.80813016e-01 4.16149366e-01]
 [0.00000000e+00 0.00000000e+0

## Step 5: Play FrozenLake

In [7]:
env.reset()
rewards = []
total_test_episodes = 10

for episode in range(total_test_episodes):
    state = env.reset()
    step = 0
    done = False
    total_rewards = 0
    print("****************************************************")
    print("EPISODE ", episode)

    for step in range(max_steps):
        
        #env.render()
        
        # Take the action (index) that have the maximum expected future reward given that state
        action = np.argmax(qtable[state,:])
        
        new_state, reward, done, info = env.step(action)
        
        total_rewards += reward
        
        if done:
            # Here, we decide to only print the last state (to see if our agent is on the goal or fall into an hole)
            env.render()
            rewards.append(total_rewards)
            # We print the number of step it took.
            print("Number of steps", step)
            break
        state = new_state
env.close()
print ("Score over time: " +  str(sum(rewards)/total_test_episodes))

****************************************************
EPISODE  0
  (Down)
SFFF
FHFH
FFFH
HFF[41mG[0m
Number of steps 60
****************************************************
EPISODE  1
  (Down)
SFFF
FHFH
FFFH
HFF[41mG[0m
Number of steps 23
****************************************************
EPISODE  2
  (Down)
SFFF
FHFH
FFFH
HFF[41mG[0m
Number of steps 52
****************************************************
EPISODE  3
  (Left)
SFFF
F[41mH[0mFH
FFFH
HFFG
Number of steps 38
****************************************************
EPISODE  4
  (Down)
SFFF
FHFH
FFFH
HFF[41mG[0m
Number of steps 26
****************************************************
EPISODE  5
  (Down)
SFFF
FHFH
FFFH
HFF[41mG[0m
Number of steps 19
****************************************************
EPISODE  6
****************************************************
EPISODE  7
  (Left)
SFFF
F[41mH[0mFH
FFFH
HFFG
Number of steps 6
****************************************************
EPISODE  8
  (Down)
SFFF
FHFH
FFFH
HF