In [1]:
import gym
import numpy as np
import  random

# One seed shared by every random number generator the notebook relies on,
# so that Restart & Run All reproduces the same training run.
SEED = 1234

random.seed(SEED)     # drives the epsilon-greedy coin flip (random.uniform) in training
np.random.seed(SEED)  # NumPy's global generator; seeded defensively

# `.env` unwraps the object returned by gym.make -- presumably to drop the
# TimeLimit wrapper, since the training loop enforces its own max_steps
# (TODO confirm against the installed gym version).
env = gym.make("Taxi-v3").env

# Seeding `random` alone is not enough: the environment and its action space
# own separate generators, used by env.reset() and env.action_space.sample().
env.seed(SEED)
env.action_space.seed(SEED)

env.render()


+---------+
|R: | : :[35mG[0m|
| : | : : |
| : : : : |
| | : | : |
|Y| :[43m [0m|[34;1mB[0m: |
+---------+



In [2]:
# Number of discrete actions and discrete states exposed by the environment;
# together they determine the dimensions of the Q-table built in the next cell.
action_size, state_size = env.action_space.n, env.observation_space.n
(action_size, state_size)

(6, 500)

In [3]:
# Q-table: one row per state, one column per action, every estimate starting at 0.
qtable = np.zeros(shape=(state_size, action_size))
qtable

array([[0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.],
       ...,
       [0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.]])

Hyperparameters

In [4]:
# --- Q-learning update rule ---
gamma = 0.618          # discount rate applied to the best next-state value
learning_rate = 0.7    # step size used when moving Q(s, a) toward its target

# --- Training budget ---
max_steps = 25         # maximum number of steps per epoch (episode)
total_epochs = 100000  # number of training epochs (episodes)

Exploration parameters 

In [5]:
# Bounds and speed of the epsilon-greedy exploration schedule.
max_epsilon = 1.0      # exploration probability at the start of training
min_epsilon = 0.01     # exploration probability at the end of training
decay_rate = 0.01      # exponential decay rate of the exploration probability

# Current exploration rate; training starts fully exploratory.
epsilon = max_epsilon

In [6]:
# Tabular Q-learning with an epsilon-greedy behaviour policy.
#
# Each epoch is one episode: reset the environment, then act for at most
# `max_steps` steps (or until the environment reports `done`), updating
# `qtable` in place after every transition. After each episode `epsilon`
# is decayed exponentially from `max_epsilon` toward `min_epsilon`.
for epoch in range(total_epochs):
    state = env.reset()
    done = False

    for step in range(max_steps):
        # Epsilon-greedy choice: exploit the current estimates when the draw
        # exceeds epsilon, otherwise explore with a uniformly random action.
        if random.uniform(0, 1) > epsilon:
            action = np.argmax(qtable[state, :])
        else:
            action = env.action_space.sample()

        # Apply the action and observe the transition.
        new_state, reward, done, info = env.step(action)

        # Q-learning update:
        #   Q(s, a) <- Q(s, a) + lr * (r + gamma * max_a' Q(s', a') - Q(s, a))
        td_target = reward + gamma * np.max(qtable[new_state, :])
        qtable[state, action] += learning_rate * (td_target - qtable[state, action])

        state = new_state
        if done:
            break

    # Reduce the exploration probability. The floor must be `min_epsilon`:
    # starting the sum from `max_epsilon` would keep epsilon >= 1 forever, so
    # the exploit branch above would never run. `epoch + 1` is the number of
    # episodes completed so far.
    epsilon = min_epsilon + (max_epsilon - min_epsilon) * np.exp(-decay_rate * (epoch + 1))
            

In [7]:
from IPython.display import clear_output
from time import sleep

# Replay ten trips with the greedy policy read from the learned Q-table,
# redrawing the rendered board in place after every step.
for tripnum in range(1, 11):
    state = env.reset()
    trip_length, done = 0, False

    # Stop when the environment reports completion or the step cap is reached.
    while not (done or trip_length >= max_steps):
        # Always take the highest-valued action for the current state.
        action = np.argmax(qtable[state])
        next_state, reward, done, info = env.step(action)

        # wait=True defers clearing until new output arrives, avoiding flicker.
        clear_output(wait=True)
        print("Trip number " + str(tripnum) + " Step " + str(trip_length))
        print(env.render(mode='ansi'))
        sleep(0.5)

        state = next_state
        trip_length += 1

    # Hold the final frame briefly before the next trip starts.
    sleep(2)

Trip number 10 Step 12
+---------+
|[35m[34;1m[43mR[0m[0m[0m: | : :G|
| : | : : |
| : : : : |
| | : | : |
|Y| : |B: |
+---------+
  (Dropoff)

