Implementation
Installing gym, cmake and pygame is required
OpenAI gym has the exactly environment we need, it is called "Taxi-v3"

In [1]:
# !pip install cmake
# !pip install pygame
# !pip install gym[atari]

Import library

In [2]:
import gym
import numpy as np
import random
from IPython.display import clear_output
from tqdm import tqdm
from time import sleep

Build emvironment by call Taxi-v3 in gym library

In [3]:
env = gym.make('Taxi-v3').env

Some example to show how taxi-v3 work

In [4]:
env.reset() # get random state
env.render()

+---------+
|R: | : :[34;1mG[0m|
| : | :[43m [0m: |
| : : : : |
| | : | : |
|[35mY[0m| : |B: |
+---------+



In [5]:
display(env.P[328]) # reward table
#From left to right: [(probability, nextstate, reward, done)]

{0: [(1.0, 428, -1, False)],
 1: [(1.0, 228, -1, False)],
 2: [(1.0, 348, -1, False)],
 3: [(1.0, 328, -1, False)],
 4: [(1.0, 328, -10, False)],
 5: [(1.0, 328, -10, False)]}

In [6]:
# function use to show the frame that discribes how the taxi work
def print_frames(frames):
    for i, frame in enumerate(frames):
        clear_output(wait=True)
        
#         print(f"Sample: {x}")
        
        print(frame['frame'])
        print(f"Timestep: {i + 1}")
        print(f"State: {frame['state']}")
        print(f"Action: {frame['action']}")
        print(f"Reward: {frame['reward']}")
        print(f"Done: {frame['done']}")
        sleep(0)# increase 0 to print frames slower

Now let's make a function that will give a random move, break when completed.

In [7]:
env.s = 328  # set environment to illustration's state

epochs = 0
penalties, reward = 0, 0

frames = [] # for animation

done = False

while not done:
    action = env.action_space.sample()
    state, reward, done, info = env.step(action)

    if reward == -10:
        penalties += 1
    
    # Put each rendered frame into dict for animation
    frames.append({
        'frame': env.render(mode='ansi'),
        'state': state,
        'action': action,
        'reward': reward,
        'done': done
        }
    )

    epochs += 1
    
    
print("Timesteps taken: {}".format(epochs))
print("Penalties incurred: {}".format(penalties))

Timesteps taken: 240
Penalties incurred: 81


In [8]:
print_frames(frames)

+---------+
|[35m[34;1m[43mR[0m[0m[0m: | : :G|
| : | : : |
| : : : : |
| | : | : |
|Y| : |B: |
+---------+
  (Dropoff)

Timestep: 240
State: 0
Action: 5
Reward: 20
Done: True


Training Q table

In [9]:
# %%time
# """Training the agent"""
def training_Qtable():    
    q_table = np.zeros([env.observation_space.n, env.action_space.n])

    # Picking up hyperparameters, you can use random.uniform(0, 1) instead
    alpha = random.random()
    gamma = random.random()
    epsilon = 0.1
    
    print(f"alpha = {alpha} và gamma = {gamma} và epsilon = {epsilon}")

    # For plotting metrics
    all_epochs = []
    all_penalties = []

    for i in tqdm(range(1, 80001), colour = 'BLUE'):
        state = env.reset()

        epochs, penalties, reward, = 0, 0, 0
        done = False

        while not done:
            # Determind what to do next

            if random.uniform(0, 1) < epsilon:
                action = env.action_space.sample() # Explore action space
                # env.action_space.sample() automatically selects one random action from set of all possible actions.
            else:
                action = np.argmax(q_table[state]) # Exploit learned values
                # Returning a state with the highest score.

            # Simulating the next action

            next_state, reward, done, info = env.step(action) 

            # Calculation using above equation

            old_value = q_table[state, action]
            next_max = np.max(q_table[next_state])

            new_value = (1 - alpha) * old_value + alpha * (reward + gamma * next_max)
            q_table[state, action] = new_value

            # Determind if we have a big penalty

            if reward == -10:
                penalties += 1

            # Moving to the next state

            state = next_state
            epochs += 1

    print("Training finished.\n")
    return q_table

q_table = training_Qtable()    

alpha = 0.8422045099733166 và gamma = 0.7423951992359493 và epsilon = 0.1


100%|[34m██████████████████████████████████████████████████████████████████████████[0m| 80000/80000 [00:25<00:00, 3157.48it/s][0m

Training finished.






Strategy function

In [10]:
frames = [] # for animation
location = {'R' : 0, 'G' : 1, 'Y' : 2, 'B' : 3}
R, G, Y, B = 'R', 'G', 'Y', 'B'
def strategy(x, y, passenger, destination):
    env.s = env.encode(x, y, location[passenger], location[destination])  # set environment
    
    frames.clear()
    result = []
    
    print(f"Taxi's position = ({x},{y})")
    print("Passenger: " + passenger)
    print("Destination: " + destination)
    
    step, reward = 0, 0
    
    done = False

    while not done:
        if (q_table[env.s] == 0.).all():
            break
        
        action = np.where(q_table[env.s] == q_table[env.s].max())[0][0]
        state, r, done, info = env.step(action)
        
        reward += r
        
        # Save action list
        result.append(action) 
        
        # Put each rendered frame into dict for animation
        frames.append({
            'frame' : env.render(mode='ansi'),
            'action': action,
            'reward': reward,
            'state' : state,
            'done'  : done
        })

        step += 1

    print("Steps taken: {}".format(step))
    print("Result = {}".format(result))
    
    if len(frames) == 0:
        state, reward, done, info = env.step(5)
        frames.append({
            'frame' : env.render(mode='ansi'),
            'action': 5,
            'reward': 0,
            'state' : state,
            'done'  : True
        })
    
    return result

Now let's check the result

In [11]:
sample = []
f = [R,G,Y,B]
w = []
for x in range(0,10):    
    x_taxi = random.randint(0,4)
    y_taxi = random.randint(0,4)
    passenger = random.choice(f)
    destination = random.choice(f)
    
    #check if passenger and destination is the same
#     while passenger == destination:
#         destination = random.randint(0,3)

    sample.append([x_taxi, y_taxi, passenger, destination])

i = 0
for x in sample:
    print(f"Sample {i}")
    print(x)
    i+=1
    result = strategy(x[0], x[1], x[2], x[3])
    print(f"Reward: {frames[len(frames)-1]['reward']}")
    print()

Sample 0
[3, 4, 'G', 'G']
Taxi's position = (3,4)
Passenger: G
Destination: G
Steps taken: 0
Result = []
Reward: 0

Sample 1
[2, 0, 'R', 'G']
Taxi's position = (2,0)
Passenger: R
Destination: G
Steps taken: 12
Result = [1, 1, 4, 0, 0, 2, 2, 1, 1, 2, 2, 5]
Reward: 9

Sample 2
[1, 3, 'R', 'R']
Taxi's position = (1,3)
Passenger: R
Destination: R
Steps taken: 0
Result = []
Reward: 0

Sample 3
[3, 2, 'Y', 'R']
Taxi's position = (3,2)
Passenger: Y
Destination: R
Steps taken: 11
Result = [1, 3, 3, 0, 0, 4, 1, 1, 1, 1, 5]
Reward: 10

Sample 4
[1, 4, 'B', 'G']
Taxi's position = (1,4)
Passenger: B
Destination: G
Steps taken: 11
Result = [0, 0, 0, 3, 4, 1, 1, 1, 1, 2, 5]
Reward: 10

Sample 5
[1, 3, 'B', 'R']
Taxi's position = (1,3)
Passenger: B
Destination: R
Steps taken: 12
Result = [0, 0, 0, 4, 1, 1, 3, 3, 1, 1, 3, 5]
Reward: 9

Sample 6
[0, 0, 'R', 'G']
Taxi's position = (0,0)
Passenger: R
Destination: G
Steps taken: 10
Result = [4, 0, 0, 2, 2, 1, 1, 2, 2, 5]
Reward: 11

Sample 7
[0, 0, 'G', '

In [12]:
x_taxi = random.randint(0,4)
y_taxi = random.randint(0,4)
passenger = random.choice(f)
destination = random.choice(f)
result = strategy(x_taxi, y_taxi, passenger, destination)
print(f"Reward: {frames[len(frames)-1]['reward']}")

Taxi's position = (3,4)
Passenger: G
Destination: B
Steps taken: 10
Result = [1, 1, 1, 4, 0, 0, 0, 0, 3, 5]
Reward: 11


In [13]:
result = strategy(4,4,Y,B)
print(f"Reward: {frames[len(frames)-1]['reward']}")

Taxi's position = (4,4)
Passenger: Y
Destination: B
Steps taken: 17
Result = [1, 1, 3, 3, 3, 3, 0, 0, 4, 1, 1, 2, 2, 2, 0, 0, 5]
Reward: 4


Call print_frames function to show frames that discribes how the taxi take the passenger. 

In [14]:
print_frames(frames)

+---------+
|R: | : :G|
| : | : : |
| : : : : |
| | : | : |
|Y| : |[35m[34;1m[43mB[0m[0m[0m: |
+---------+
  (Dropoff)

Timestep: 17
State: 475
Action: 5
Reward: 4
Done: True
