In [1]:
import gym
import numpy as np
import random

from IPython.display import display, clear_output
from time import sleep

In [2]:
# Build the Taxi-v3 environment in 'ansi' render mode; later cells print
# whatever env.render() returns.
env = gym.make(
    "Taxi-v3",
    render_mode="ansi",
)

In [3]:
# Tabular Q-learning on the Taxi environment.
#
# Q-table layout: one row per discrete state, one column per action,
# initialised to zero.
state_size = env.observation_space.n
action_size = env.action_space.n
qtable = np.zeros((state_size, action_size))

# Hyperparameters
learning_rate = 0.9   # weight of the TD error in each Q update
discount_rate = 0.8   # weight of the best next-state value in the TD target
epsilon = 1.0         # probability of taking a random action (starts fully random)
decay_rate = 0.005    # controls how fast epsilon shrinks as episodes go by

# Training variables
num_episodes = 2000
max_steps = 99  # per episode

print("Training the agent...")

for episode in range(num_episodes):

    # reset() returns (observation, info); only the observation is needed.
    state = env.reset()[0]
    done = False

    for step in range(max_steps):

        # Exploration-exploitation tradeoff
        if random.uniform(0, 1) < epsilon:
            # Explore: uniformly random action
            action = env.action_space.sample()
        else:
            # Exploit: greedy action for the current state
            action = np.argmax(qtable[state, :])

        # step() returns a 5-tuple in this API. The fourth element is the
        # `truncated` flag, not `info`, so unpack all five by name.
        new_state, reward, terminated, truncated, info = env.step(action)

        # The episode is over on a true terminal state OR a time-limit cut-off.
        done = terminated or truncated

        # Q-learning update. No future value is bootstrapped from a genuinely
        # terminal transition; a truncated episode still bootstraps because
        # the underlying task has not actually ended.
        best_next_value = 0.0 if terminated else np.max(qtable[new_state, :])
        td_target = reward + discount_rate * best_next_value
        qtable[state, action] += learning_rate * (td_target - qtable[state, action])

        # Update to our new state
        state = new_state

        # If done, finish the episode; the env must be reset before stepping again.
        if done:
            break

    # Decrease epsilon: hyperbolic decay in the episode index (1.0 at episode 0).
    epsilon = 1.0 / (1.0 + decay_rate * episode)

print(f"Trained Q-table: {qtable}")
print(f"Training completed after {num_episodes} episodes")

Training the agent...


  if not isinstance(terminated, (bool, np.bool8)):


Trained Q-table: [[  0.           0.           0.           0.           0.
    0.        ]
 [ -2.85394489  -2.31752643  -2.86464051  -2.31564717  -1.6445568
  -11.31566284]
 [  0.24288      1.55357757   0.24277779   1.54836485   3.192
   -7.44640005]
 ...
 [ -2.74682857   1.50973178  -2.842129    -2.6860032  -11.72839608
  -11.31762096]
 [ -3.82845378  -3.78799993  -3.60279663  -0.805696   -11.3558004
  -12.48238226]
 [  9.80099988   2.14426471  10.76282784  15.           0.
    1.98876689]]
Training completed after 2000 episodes


In [4]:
def visualize_agent(env, qtable, episodes=5, max_steps=100):
    """Animate greedy rollouts of a Q-table in the notebook output area.

    For each episode the environment is reset, then on every step the current
    frame from ``env.render()`` is printed (replacing the previous frame), the
    action with the highest Q-value for the current state is taken, and the
    loop stops as soon as the environment reports the episode is over.

    Args:
        env: Environment whose ``reset()`` returns a tuple with the observation
            first, whose ``step(action)`` returns the 5-tuple
            ``(observation, reward, terminated, truncated, info)``, and whose
            ``render()`` returns something printable.
        qtable: 2-D array indexed as ``qtable[state, action]``.
        episodes: Number of episodes to play.
        max_steps: Upper bound on steps per episode; an episode that has not
            ended by then is simply abandoned without a summary line.

    Returns:
        None. The function only prints and sleeps.
    """
    for episode in range(episodes):
        state = env.reset()[0]
        print(f"Episode {episode + 1}\n")
        sleep(1)

        for step in range(max_steps):
            # wait=True defers the clear until new output arrives, avoiding flicker.
            clear_output(wait=True)
            print(env.render())
            sleep(0.5)  # Adjust the speed of the animation

            # Choose the greedy action from the Q-table.
            action = np.argmax(qtable[state, :])

            # The fourth element of step()'s result is `truncated`, not `info`;
            # unpack by name so both end-of-episode flags are honoured.
            state, _reward, terminated, truncated, _info = env.step(action)

            if terminated or truncated:
                print(f"Episode finished after {step + 1} timesteps\n")
                sleep(2)
                clear_output(wait=True)
                break

# Play back five greedy episodes using the Q-table learned above.
visualize_agent(
    env=env,
    qtable=qtable,
    episodes=5,
)

+---------+
|[35m[42mR[0m[0m: | : :G|
| : | : : |
| : : : : |
| | : | : |
|Y| : |B: |
+---------+
  (West)

Episode finished after 14 timesteps

