In [1]:
import gym
import numpy as np
import random

# used to help with visualizing in Colab
from IPython.display import display, clear_output
from time import sleep

In [2]:
class bcolors:
    RED= '\u001b[31m'
    GREEN= '\u001b[32m'
    RESET= '\u001b[0m'

In [3]:
# create Taxi environment
env = gym.make('Taxi-v3', render_mode='ansi')

In [4]:


# initialize q-table
state_size 			= env.observation_space.n
action_size 		= env.action_space.n
qtable 				= np.zeros((state_size, action_size))

# hyperparameters
learning_rate 		= 0.9
discount_rate 		= 0.8
epsilon 			= 1.0
decay_rate			= 0.005

# training variables
num_episodes 		= 2000
max_steps 			= 99 # per episode

print("AGENT IS TRAINING...")

for episode in range(num_episodes):

    # Reset the environment
    state 		= env.reset()
    state 			= state[0]
    step 			= 0
    done 			= False

    for step in range(max_steps):

        # Exploration-exploitation tradeoff
        if random.uniform(0,1) < epsilon:
            # Explore
            action = env.action_space.sample()
        else:
            # Exploit
            action = np.argmax(qtable[state,:])

        # Take an action and observe the reward
        #new_state, reward, done, info = env.step(action)
        output 		= env.step(action)
        new_state 	= output[0]
        reward 		= output[1]
        done 		= output[2]
        info 		= output[3]

        # Q-learning algorithm
        qtable[state,action] = qtable[state,action] + learning_rate * (reward + discount_rate * np.max(qtable[new_state,:])-qtable[state,action])

        # Update to our new state
        state 		= new_state

        # if done, finish episode
        if done == True:
            break

    # Decrease epsilon
    epsilon = np.exp(-decay_rate*episode)

# Get ready to watch our trained agent
clear_output()
print(f"Our Q-table: {qtable}")
print(f"Training completed over {num_episodes} episodes")
#input("Press Enter to see our trained taxi agent")
#sleep(1)
#clear_output()

def visualize_agent(env, qtable, episodes=5, max_steps=100):
    for episode in range(episodes):
        state = env.reset()[0]
        done = False
        print(f"Episode {episode + 1}\n")
        sleep(1)

        for step in range(max_steps):
            clear_output(wait=True)
            print(env.render())
            sleep(0.5)  # Adjust the speed of the animation

            # Choose action based on Q-table
            action = np.argmax(qtable[state, :])
            output = env.step(action)
            new_state 	= output[0]
            reward 		= output[1]
            done 		= output[2]
            info 		= output[3]            
            
            state = new_state

            if done:
                print(f"Episode finished after {step + 1} timesteps\n")
                sleep(2)
                clear_output(wait=True)
                break

# Visualize the trained agent
visualize_agent(env, qtable, episodes=5)

+---------+
|[34;1mR[0m: | : :G|
|[43m [0m: | : : |
| : : : : |
| | : | : |
|Y| : |[35mB[0m: |
+---------+
  (North)



KeyboardInterrupt: 

NameError: name 'env' is not defined

In [4]:
print(env.render())

+---------+
|R: | : :G|
| : | : : |
| : : : : |
| | : | : |
|Y| : |[35m[34;1m[43mB[0m[0m[0m: |
+---------+
  (Dropoff)



In [6]:
env.render()

'+---------+\n|R: | : :\x1b[34;1mG\x1b[0m|\n| : | : : |\n|\x1b[43m \x1b[0m: : : : |\n| | : | : |\n|\x1b[35mY\x1b[0m| : |B: |\n+---------+\n\n'

In [5]:
episodes_to_preview = 3
for episode in range(episodes_to_preview):

	# Reset the environment
	state = env.reset()
	step = 0
	done = False
	episode_rewards = 0

	for step in range(num_steps):
		# clear screen
		clear_output(wait=True)

		print(f"TRAINED AGENT")
		print(f"+++++EPISODE {episode+1}+++++")
		print(f"Step {step+1}")

		# Exploit
		action = np.argmax(qtable[state,:])

		# Take an action and observe the reward
		new_state, reward, done, info = env.step(action)

		# Accumulate our rewards
		episode_rewards += reward

		print(env.render())
		print("")
		if episode_rewards < 0:
			print(f"Score: {bcolors.RED}{episode_rewards}{bcolors.RESET}")
		else:
			print(f"Score: {bcolors.GREEN}{episode_rewards}{bcolors.RESET}")
		sleep(0.5)

		# Update to our new state
		state = new_state

		# if done, finish episode
		if done == True:
			break

# Close the Taxi environment
env.close()

NameError: name 'num_steps' is not defined