In [1]:
import numpy as np
import gym
import random

Q* Learning with FrozenLake.

In this Notebook,  an agent is implemented that plays FrozenLake.
The goal of this game is to go from the starting state (S) to the goal state (G) by walking only on frozen tiles (F) and avoid holes (H).

In [2]:
env = gym.make("FrozenLake-v0")

In [3]:
action_size = env.action_space.n
state_size = env.observation_space.n

In [4]:
# Create a Q table with state_size rows and action_size columns (64x4)
qtable = np.zeros((state_size, action_size))
print(qtable)

[[0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]]


In [5]:
total_episodes = 20000       # Total episodes
learning_rate = 0.1         # Learning rate
max_steps = 100               # Max steps per episode
gamma = 0.99                 # Discounting rate

# Exploration parameters
epsilon = 1.0                 # Exploration rate
max_epsilon = 1.0             # Exploration probability at start
min_epsilon = 0.01            # Minimum exploration probability 
decay_rate = 0.001            # Exponential decay rate for exploration prob

In [6]:
# List of rewards
rewards = []

# 2 For life or until learning is stopped
for episode in range(total_episodes):
    # Reset the environment
    state = env.reset()
    step = 0
    done = False
    total_rewards = 0
    
    for step in range(max_steps):
        # 3. Choose an action a in the current world state (s)
        ## First we randomize a number
        exp_exp_tradeoff = random.uniform(0, 1)
        
        ## If this number > greater than epsilon --> exploitation (taking the biggest Q value for this state)
        if exp_exp_tradeoff > epsilon:
            action = np.argmax(qtable[state,:])
            #print(exp_exp_tradeoff, "action", action)

        # Else doing a random choice --> exploration
        else:
            action = env.action_space.sample()
            #print("action random", action)
            
        
        # Take the action (a) and observe the outcome state(s') and reward (r)
        new_state, reward, done, info = env.step(action)

        # Update Q(s,a):= Q(s,a) + lr [R(s,a) + gamma * max Q(s',a') - Q(s,a)]
        # qtable[new_state,:] : all the actions we can take from new state
        qtable[state, action] = qtable[state, action] + learning_rate * (reward + gamma * np.max(qtable[new_state, :]) - qtable[state, action])
        
        total_rewards += reward
        
        # Our new state is state
        state = new_state
        
        # If done (if we're dead) : finish episode
        if done == True: 
            break
        
    # Reduce epsilon (because we need less and less exploration)
    epsilon = min_epsilon + (max_epsilon - min_epsilon)*np.exp(-decay_rate*episode) 
    rewards.append(total_rewards)
    

print ("Score over time: " +  str(sum(rewards)/total_episodes))
print(qtable)

Score over time: 0.60175
[[0.51811561 0.49188085 0.48575644 0.48779304]
 [0.32036463 0.42641569 0.40783518 0.47075478]
 [0.40959059 0.40984869 0.38095944 0.44328973]
 [0.35006952 0.18294938 0.25807465 0.43167614]
 [0.53751008 0.36882591 0.25007102 0.38051508]
 [0.         0.         0.         0.        ]
 [0.26177478 0.16989351 0.16159283 0.13004725]
 [0.         0.         0.         0.        ]
 [0.35695585 0.44286583 0.47796661 0.56659524]
 [0.46359951 0.59988115 0.41795242 0.41011341]
 [0.5569533  0.4135092  0.41535018 0.33123417]
 [0.         0.         0.         0.        ]
 [0.         0.         0.         0.        ]
 [0.37679905 0.64895357 0.67823121 0.43786923]
 [0.65811283 0.75565115 0.74281936 0.7572835 ]
 [0.         0.         0.         0.        ]]


In [7]:
env.reset()

for episode in range(5):
    state = env.reset()
    step = 0
    done = False
    print("****************************************************")
    print("EPISODE ", episode)

    for step in range(max_steps):
        
        # Take the action (index) that have the maximum expected future reward given that state
        action = np.argmax(qtable[state,:])
        
        new_state, reward, done, info = env.step(action)
        
        if done:
            # Here, we decide to only print the last state (to see if our agent is on the goal or fall into an hole)
            env.render()
            if new_state == 15:
                print("We reached our Goal 🏆")
            else:
                print("We fell into a hole ☠️")
            
            # We print the number of step it took.
            print("Number of steps", step)
            
            break
        state = new_state
env.close()

****************************************************
EPISODE  0
  (Up)
SFFF
FHFH
FFFH
HFF[41mG[0m
We reached our Goal 🏆
Number of steps 39
****************************************************
EPISODE  1
  (Right)
SFFF
FHFH
FFFH
HF[41mF[0mG
We fell into a hole ☠️
Number of steps 99
****************************************************
EPISODE  2
  (Up)
SFFF
FHFH
FFFH
HFF[41mG[0m
We reached our Goal 🏆
Number of steps 21
****************************************************
EPISODE  3
  (Up)
SFFF
FHFH
FFFH
HFF[41mG[0m
We reached our Goal 🏆
Number of steps 47
****************************************************
EPISODE  4
  (Up)
SFFF
FHFH
FFFH
HFF[41mG[0m
We reached our Goal 🏆
Number of steps 57
