In [5]:
import numpy as np
import matplotlib.pyplot as plt
import gym
import random

# CREATE THE ENVIRONMENT
env = gym.make("Taxi-v3")
action_size = env.action_space.n
state_size = env.observation_space.n
print("Action space size: ", action_size)
print("State space size: ", state_size)


Action space size:  6
State space size:  500


In [6]:

# INITIALISE Q TABLE TO ZERO
Q = np.zeros((state_size, action_size))

In [7]:

# HYPERPARAMETERS
train_episodes = 2000         # Total train episodes
test_episodes = 100           # Total test episodes
max_steps = 100               # Max steps per episode
alpha = 0.7                   # Learning rate
gamma = 0.618                 # Discounting rate

# EXPLORATION / EXPLOITATION PARAMETERS
epsilon = 1                   # Exploration rate
max_epsilon = 1               # Exploration probability at start
min_epsilon = 0.01            # Minimum exploration probability 
decay_rate = 0.01             # Exponential decay rate for exploration prob

# TRAINING PHASE
training_rewards = []   # list of rewards

for episode in range(train_episodes):
    state = env.reset()    # Reset the environment
    cumulative_training_rewards = 0
    
    for step in range(max_steps):
        # Choose an action (a) among the possible states (s)
        exp_exp_tradeoff = random.uniform(0, 1)   # choose a random number
        
        # If this number > epsilon, select the action corresponding to the biggest Q value for this state (Exploitation)
        if exp_exp_tradeoff > epsilon:
            action = np.argmax(Q[state,:])        
        # Else choose a random action (Exploration)
        else:
            action = env.action_space.sample()
        
        # Perform the action (a) and observe the outcome state(s') and reward (r)
        new_state, reward, done, info = env.step(action)

        # Update the Q table using the Bellman equation: Q(s,a):= Q(s,a) + lr [R(s,a) + gamma * max Q(s',a') - Q(s,a)]
        Q[state, action] = Q[state, action] + alpha * (reward + gamma * np.max(Q[new_state, :]) - Q[state, action]) 
        cumulative_training_rewards += reward  # increment the cumulative reward        
        state = new_state         # Update the state
        
        # If we reach the end of the episode
        if done == True:
            print ("Cumulative reward for episode {}: {}".format(episode, cumulative_training_rewards))
            break
    
    # Reduce epsilon (because we need less and less exploration)
    epsilon = min_epsilon + (max_epsilon - min_epsilon)*np.exp(-decay_rate*episode)
    
    # append the episode cumulative reward to the list
    training_rewards.append(cumulative_training_rewards)

print ("Training score over time: " + str(sum(training_rewards)/train_episodes))

Cumulative reward for episode 6: -214
Cumulative reward for episode 36: -126
Cumulative reward for episode 56: -215
Cumulative reward for episode 74: -294
Cumulative reward for episode 79: -73
Cumulative reward for episode 82: -214
Cumulative reward for episode 83: -120
Cumulative reward for episode 84: -150
Cumulative reward for episode 90: -17
Cumulative reward for episode 94: -83
Cumulative reward for episode 104: -49
Cumulative reward for episode 111: -80
Cumulative reward for episode 114: -26
Cumulative reward for episode 117: -150
Cumulative reward for episode 125: -168
Cumulative reward for episode 129: -165
Cumulative reward for episode 130: -154
Cumulative reward for episode 132: -73
Cumulative reward for episode 135: -102
Cumulative reward for episode 145: -34
Cumulative reward for episode 147: -42
Cumulative reward for episode 149: -125
Cumulative reward for episode 150: -22
Cumulative reward for episode 151: -142
Cumulative reward for episode 153: -42
Cumulative reward for 

Cumulative reward for episode 542: 10
Cumulative reward for episode 543: -1
Cumulative reward for episode 544: 10
Cumulative reward for episode 545: 0
Cumulative reward for episode 546: 7
Cumulative reward for episode 547: 8
Cumulative reward for episode 548: 6
Cumulative reward for episode 549: 8
Cumulative reward for episode 550: 8
Cumulative reward for episode 551: 11
Cumulative reward for episode 552: 9
Cumulative reward for episode 553: 5
Cumulative reward for episode 554: 2
Cumulative reward for episode 555: 5
Cumulative reward for episode 556: 9
Cumulative reward for episode 557: 7
Cumulative reward for episode 558: 11
Cumulative reward for episode 559: 9
Cumulative reward for episode 560: 8
Cumulative reward for episode 561: 10
Cumulative reward for episode 562: 5
Cumulative reward for episode 563: 10
Cumulative reward for episode 564: 5
Cumulative reward for episode 565: 12
Cumulative reward for episode 566: 0
Cumulative reward for episode 567: 9
Cumulative reward for episode 

Cumulative reward for episode 962: 8
Cumulative reward for episode 963: 9
Cumulative reward for episode 964: 7
Cumulative reward for episode 965: 8
Cumulative reward for episode 966: 7
Cumulative reward for episode 967: 6
Cumulative reward for episode 968: 5
Cumulative reward for episode 969: 10
Cumulative reward for episode 970: 7
Cumulative reward for episode 971: 7
Cumulative reward for episode 972: 6
Cumulative reward for episode 973: 4
Cumulative reward for episode 974: 4
Cumulative reward for episode 975: 8
Cumulative reward for episode 976: 12
Cumulative reward for episode 977: 6
Cumulative reward for episode 978: 10
Cumulative reward for episode 979: 8
Cumulative reward for episode 980: 11
Cumulative reward for episode 981: 5
Cumulative reward for episode 982: 5
Cumulative reward for episode 983: 6
Cumulative reward for episode 984: 6
Cumulative reward for episode 985: 5
Cumulative reward for episode 986: 11
Cumulative reward for episode 987: 3
Cumulative reward for episode 988

Cumulative reward for episode 1396: 10
Cumulative reward for episode 1397: 8
Cumulative reward for episode 1398: 10
Cumulative reward for episode 1399: 5
Cumulative reward for episode 1400: 6
Cumulative reward for episode 1401: 5
Cumulative reward for episode 1402: 5
Cumulative reward for episode 1403: 5
Cumulative reward for episode 1404: 8
Cumulative reward for episode 1405: 10
Cumulative reward for episode 1406: 12
Cumulative reward for episode 1407: 11
Cumulative reward for episode 1408: 6
Cumulative reward for episode 1409: 8
Cumulative reward for episode 1410: 11
Cumulative reward for episode 1411: 6
Cumulative reward for episode 1412: 5
Cumulative reward for episode 1413: 4
Cumulative reward for episode 1414: 3
Cumulative reward for episode 1415: 4
Cumulative reward for episode 1416: 7
Cumulative reward for episode 1417: 7
Cumulative reward for episode 1418: 10
Cumulative reward for episode 1419: 5
Cumulative reward for episode 1420: 9
Cumulative reward for episode 1421: 10
Cumu

Cumulative reward for episode 1804: 8
Cumulative reward for episode 1805: 5
Cumulative reward for episode 1806: 6
Cumulative reward for episode 1807: 6
Cumulative reward for episode 1808: 6
Cumulative reward for episode 1809: -4
Cumulative reward for episode 1810: 9
Cumulative reward for episode 1811: 15
Cumulative reward for episode 1812: 4
Cumulative reward for episode 1813: 6
Cumulative reward for episode 1814: 9
Cumulative reward for episode 1815: 3
Cumulative reward for episode 1816: 8
Cumulative reward for episode 1817: 7
Cumulative reward for episode 1818: 5
Cumulative reward for episode 1819: -5
Cumulative reward for episode 1820: 6
Cumulative reward for episode 1821: 10
Cumulative reward for episode 1822: 4
Cumulative reward for episode 1823: 5
Cumulative reward for episode 1824: 5
Cumulative reward for episode 1825: 11
Cumulative reward for episode 1826: 4
Cumulative reward for episode 1827: 9
Cumulative reward for episode 1828: 7
Cumulative reward for episode 1829: 13
Cumula