In [1]:
!pip install gym



In [13]:
import numpy as np
import gym
import random
import time
import pickle

In [2]:
# Build the FrozenLake environment that the rest of the notebook trains and plays on.
env_id = "FrozenLake-v0"
env = gym.make(env_id)

In [3]:
# Sizes of the discrete state and action spaces; they give the Q-table its shape.
state_size = env.observation_space.n
action_size = env.action_space.n

In [4]:
# Show the number of actions followed by the number of states.
print(f"{action_size} {state_size}")

4 16


In [5]:
# Start from an all-zero Q-table: one row per state, one column per action.
qtable = np.zeros(shape=(state_size, action_size))
print(qtable)

[[0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]]


In [11]:
# ---------- Hyperparameters ----------
# Training schedule
total_episodes = 15000    # number of training episodes
max_steps = 99            # cap on the actions taken within one episode

# Q-learning update
learning_rate = 0.8       # step size applied to the temporal-difference error
gamma = 0.95              # discount factor for future rewards

# Epsilon-greedy exploration
max_epsilon = 1.0         # exploration probability at the start of training
min_epsilon = 0.01        # floor for the exploration probability
decay_rate = 0.005        # exponential decay rate of the exploration probability
epsilon = max_epsilon     # current exploration rate; updated during training

In [15]:
# Train the Q-table with epsilon-greedy Q-learning.
# NOTE: `qtable` is updated in place, so re-running this cell continues from the
# current table; re-run the cell that creates `qtable` first for a fresh start.

# Restart the exploration schedule so that a re-run of this cell does not begin
# with the already-decayed epsilon left over from a previous run.
epsilon = max_epsilon

# Total reward collected in each episode
rewards = []
# perf_counter is a monotonic clock intended for measuring elapsed time
start = time.perf_counter()

for episode in range(total_episodes):
    # Reset the environment at the start of every episode
    state = env.reset()
    total_rewards = 0

    for step in range(max_steps):
        # Draw a number in [0, 1] to decide between exploring and exploiting
        exp_exp_tradeoff = random.uniform(0, 1)

        if exp_exp_tradeoff > epsilon:
            # Exploitation: take the action with the largest Q-value for this state
            action = np.argmax(qtable[state, :])
        else:
            # Exploration: take a random action
            action = env.action_space.sample()

        # Take the action (a) and observe the outcome state (s') and reward (r)
        new_state, reward, done, info = env.step(action)

        # Q(s,a) := Q(s,a) + lr * [r + gamma * max_a' Q(s',a') - Q(s,a)]
        # qtable[new_state, :] holds the values of all actions available from s'
        td_target = reward + gamma * np.max(qtable[new_state, :])
        qtable[state, action] += learning_rate * (td_target - qtable[state, action])

        total_rewards += reward
        state = new_state

        # The environment reports the episode as finished: stop stepping
        if done:
            break

    # Decay epsilon exponentially with the episode index (less and less exploration)
    epsilon = min_epsilon + (max_epsilon - min_epsilon) * np.exp(-decay_rate * episode)
    rewards.append(total_rewards)

end = time.perf_counter()
print(f'Time taken to train {end - start} seconds')

# Average reward per episode over the whole training run (exploration included)
print("Score over time: " + str(sum(rewards) / total_episodes))


Time taken to train 13.640382766723633 seconds
Score over time: 0.48186666666666667


In [10]:
# Persist the Q-table to disk as a separate pickle file.
# (`pickle` is already imported in the notebook's import cell.)
# NOTE(review): the recorded execution counts put this cell before the training
# cell -- run it after training so that the saved table is the trained one.
with open("frozenLake_qTable.pkl", 'wb') as f:
    pickle.dump(qtable, f)

In [16]:
# Inspect the Q-table (one row per state, one column per action).
print(qtable)

[[2.26127637e-01 6.15859042e-02 6.93802152e-03 4.24656816e-02]
 [1.83698115e-03 8.46959086e-04 2.87504678e-03 1.68169531e-01]
 [3.12322294e-02 6.05280689e-03 2.36310463e-03 9.86331554e-02]
 [8.70160880e-04 6.88264261e-04 1.08704534e-04 3.29878339e-02]
 [2.09021394e-01 4.48432407e-02 7.38982555e-03 4.84321951e-03]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00]
 [2.47043191e-05 8.95274624e-06 1.18236637e-01 1.32712567e-05]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00]
 [4.96064998e-03 1.41242739e-03 1.81877858e-01 1.24723588e-01]
 [5.98897758e-02 2.46142162e-01 3.99899111e-03 4.63813115e-03]
 [6.79643678e-01 3.37343498e-03 2.62651604e-03 1.33906973e-05]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00]
 [8.51736175e-02 1.73866555e-02 7.99869058e-01 1.01596012e-01]
 [2.85452047e-01 9.87140828e-01 2.80620109e-01 1.45079978e-01]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 0.000000

In [None]:
# Parameters used in training
# In this environment the agent has 4 possible moves, encoded as 0, 1, 2, 3 for left, down, right, up respectively

# Parameters

# epsilon is the exploration rate for the epsilon-greedy approach,
# gamma is the discount factor: how much to discount the future reward,
# total_episodes is the number of times to run the game,
# max_steps is the maximum number of steps run in every episode,
# learning_rate is the learning rate: how quickly a Q-value abandons its former estimate for the new one


In [None]:
# Approach

# A central dilemma of reinforcement learning is the trade-off between exploration and exploitation: the agent wants to exploit what it has already experienced in order to obtain a reward.

# But in order to do that well, it first has to explore, so that it can choose better actions in the future.

# The epsilon-greedy strategy is one way to balance the two.
# In the beginning, the epsilon rate is high, so the bot explores the environment by choosing actions at random.
# The logic behind this is that the bot does not know anything about the environment yet.
# However, as training progresses, the epsilon rate decreases and the bot increasingly exploits what it has learned.

# At every step we randomly generate a number between 0 and 1 and compare it with epsilon.
# If it is not greater than epsilon, a random action is chosen using env.action_space.sample();
# if it is greater, we choose the action with the maximum value in the Q-table for the current state.

In [None]:
# Time taken to train model 13.64 seconds
# Score over time is 0.48

In [None]:
# Using Q-table to play Frozen lake

In [17]:
# Play a few episodes greedily (no exploration) with the current Q-table.

# With the default map shown in the rendered output, the goal tile 'G' is the
# last state (bottom-right corner).
# NOTE(review): this would not hold for a custom map with the goal elsewhere.
goal_state = state_size - 1

for episode in range(5):
    # Each episode starts from a freshly reset environment
    state = env.reset()
    print("****************************************************")
    print("EPISODE ", episode)

    for step in range(max_steps):
        # Take the action (index) that has the maximum expected future reward given that state
        action = np.argmax(qtable[state, :])

        new_state, reward, done, info = env.step(action)

        if done:
            # Only render the last state (to see whether the agent is on the goal or fell into a hole)
            env.render()
            if new_state == goal_state:
                print("We reached our Goal 🏆")
            else:
                print("We fell into a hole ☠️")

            # `step` is a 0-based loop index, so the number of actions taken is step + 1
            print("Number of steps", step + 1)

            break
        state = new_state
    else:
        # The step budget ran out before the environment reported `done`
        print("Episode stopped after", max_steps, "steps without finishing")
env.close()

****************************************************
EPISODE  0
  (Right)
SFFF
FHFH
FFFH
[41mH[0mFFG
We fell into a hole ☠️
Number of steps 17
****************************************************
EPISODE  1
  (Right)
SFFF
FHFH
FFFH
[41mH[0mFFG
We fell into a hole ☠️
Number of steps 5
****************************************************
EPISODE  2
  (Right)
SFFF
FHFH
FFFH
[41mH[0mFFG
We fell into a hole ☠️
Number of steps 14
****************************************************
EPISODE  3
  (Right)
SFFF
FHFH
FFFH
[41mH[0mFFG
We fell into a hole ☠️
Number of steps 6
****************************************************
EPISODE  4
  (Down)
SFFF
FHFH
FFFH
HFF[41mG[0m
We reached our Goal 🏆
Number of steps 15
