# In [None]:
import gym
import numpy as np
import random

# Build the Taxi-v3 environment, requesting the "ansi" render mode
env = gym.make("Taxi-v3", render_mode="ansi")

# Zero-initialised Q-table: one row per discrete state, one column per action
n_states = env.observation_space.n
n_actions = env.action_space.n
q_table = np.zeros((n_states, n_actions))

# Hyperparameters read by the training loop below
learning_rate = 0.1    # step size applied to each Q-value correction
discount_factor = 0.9  # weight given to the best next-state value
epsilon = 0.1          # probability of picking a random action
epochs = 1000          # number of training episodes

# Training the agent: tabular Q-learning with an epsilon-greedy behaviour policy
for epoch in range(epochs):
    state, _ = env.reset()
    done = False
    total_reward = 0

    while not done:
        # Explore with probability epsilon, otherwise act greedily on the Q-table
        if random.random() < epsilon:
            action = env.action_space.sample()  # Exploration
        else:
            action = np.argmax(q_table[state])  # Exploitation

        # step() reports two distinct end-of-episode flags: `terminated` (the
        # task itself ended) and `truncated` (the episode was cut off, e.g. by
        # a step limit).
        next_state, reward, terminated, truncated, _ = env.step(action)

        # Q-learning update: move Q(s, a) towards reward + gamma * max_a' Q(s', a')
        td_target = reward + discount_factor * np.max(q_table[next_state])
        q_table[state, action] += learning_rate * (td_target - q_table[state, action])

        state = next_state
        total_reward += reward

        # Stop on either flag. Checking only `terminated` keeps stepping an
        # environment that has already signalled the end of the episode.
        done = terminated or truncated

    if (epoch + 1) % 1000 == 0:
        print(f"Epoch {epoch + 1}/{epochs} completed")

print("Training finished.")

# Displaying possible actions (list index == action id passed to env.step)
print("\nPossible Actions:")
actions = ["South", "North", "East", "West", "Pick up", "Drop off"]
for action_id, action_name in enumerate(actions):
    print(f"{action_id}: {action_name}")

# Displaying possible states
print("\nPossible States:")
# decode() hands back an iterator, so it is materialised into a tuple before
# printing; otherwise the f-string shows the iterator object, not its values.
# `unwrapped` reaches the underlying Taxi environment directly instead of
# relying on the wrappers added by gym.make() to forward the attribute.
# NOTE(review): Gym's Taxi docs describe the decoded fields as
# (taxi_row, taxi_col, passenger_location, destination) - confirm for the
# installed version.
taxi_env = env.unwrapped
for state_id in range(10):  # Displaying only the first 10 states for brevity
    print(f"State {state_id}: {tuple(taxi_env.decode(state_id))}")

# Testing the trained agent: run one episode following the greedy policy
state, _ = env.reset()
done = False
total_reward = 0

print("\nTesting the trained agent:")
while not done:
    action = np.argmax(q_table[state])  # Choose the best action from Q-table
    next_state, reward, terminated, truncated, _ = env.step(action)

    # The environment was created with render_mode="ansi", where render()
    # returns the frame as text instead of drawing it, so print it explicitly.
    print(env.render())

    state = next_state
    total_reward += reward

    # A purely greedy policy can cycle without ever completing the task, so
    # the loop must also stop when the environment truncates the episode;
    # checking only `terminated` could spin forever.
    done = terminated or truncated

print(f"Total reward during testing: {total_reward}")

# Show the action that achieves the best reward
# NOTE(review): `state` is the last state of the test episode. If that episode
# terminated, training never took an action from this state within an episode,
# so its Q-table row may still be all zeros and argmax would return index 0 -
# confirm this is the report that is actually wanted.
best_action = np.argmax(q_table[state])
print(f"\nThe action that achieves the best reward in the final state is: {actions[best_action]}")
