# Phase 1: Training

In [None]:
import gym
import random
import numpy as np
from IPython.display import clear_output
from time import sleep

# Create the Taxi environment
# gym.make builds the registered "Taxi-v3" environment; `.env` then takes the
# environment held inside the returned object (presumably to drop gym's outer
# wrapper, e.g. its episode step limit -- confirm against the installed gym version).
streets = gym.make("Taxi-v3").env
# Put the environment into an initial state; the returned value is not used here.
streets.reset()

In [None]:
# Q-table: one row per environment state, one column per action, all starting at zero.
q_table = np.zeros([streets.observation_space.n, streets.action_space.n])  # 500 x 6

# Hyperparameters
learning_rate = 0.1  # weight given to the new estimate when a Q-value is updated
discount_factor = 0.5  # multiplier on the best next-state Q-value in the update
exploration = 0.1  # probability of taking a random action instead of the greedy one
epochs_list = [1000, 5000, 10000]  # training lengths (in epochs) to run and compare

# Function to train the model for the given number of epochs
def train_model(epochs):
    """Run Q-learning on the Taxi environment for ``epochs`` episodes.

    Operates on the module-level ``streets`` environment and updates the
    module-level ``q_table`` in place, using the module-level
    ``learning_rate``, ``discount_factor`` and ``exploration`` settings.

    Args:
        epochs: Number of episodes to play. Each episode runs until the
            environment reports that it is done.

    Returns:
        A ``(rewards, steps)`` pair of lists with one entry per episode:
        the summed reward of the episode and the number of steps it took.
    """
    episode_rewards = []
    episode_steps = []

    for _episode in range(epochs):
        state = streets.reset()
        reward_sum = 0
        step_count = 0

        while True:
            # Epsilon-greedy selection: a random action with probability
            # `exploration`, otherwise the best-valued action for this state.
            if random.random() < exploration:
                action = streets.action_space.sample()
            else:
                action = q_table[state].argmax()

            next_state, reward, done, _ = streets.step(action)

            # One-step target: immediate reward plus the discounted value of
            # the best action available from the next state.
            target = reward + discount_factor * q_table[next_state].max()
            # Blend the previous estimate with the target.
            q_table[state, action] = (
                (1 - learning_rate) * q_table[state, action]
                + learning_rate * target
            )

            reward_sum += reward
            step_count += 1
            state = next_state

            if done:
                break

        episode_rewards.append(reward_sum)
        episode_steps.append(step_count)

    return episode_rewards, episode_steps


# Function to evaluate the policy for a given number of trips
def evaluate_policy(trips):
    """Animate ``trips`` greedy runs of the learned policy and average their length.

    Each trip resets the module-level ``streets`` environment and always takes
    the highest-valued action from the module-level ``q_table``. After every
    step the notebook output is cleared and the current frame is printed. A
    trip ends when the environment reports done or after 25 steps.

    Args:
        trips: Number of trips to run.

    Returns:
        The mean number of steps per trip.
    """
    lengths = []

    for trip_no in range(1, trips + 1):
        state = streets.reset()
        finished = False
        step_idx = 0

        while step_idx < 25 and not finished:
            # Greedy action only: no exploration during evaluation.
            state, _, finished, _ = streets.step(np.argmax(q_table[state]))

            # Redraw the frame in place, then pause so it is visible.
            clear_output(wait=True)
            print(f"Trip number {trip_no} Step {step_idx}")
            print(streets.render(mode='ansi'))
            sleep(.2)

            step_idx += 1

        lengths.append(step_idx)
        sleep(.2)

    return sum(lengths) / trips


# Train and evaluate the model for different epoch settings
for epochs in epochs_list:
    # Start each setting from an untrained table so that "N epochs" really means
    # N epochs, not N on top of whatever the earlier settings already learned.
    # Zeroing in place keeps the same array object that train_model and
    # evaluate_policy read.
    q_table.fill(0)

    print("\nTraining for", epochs, "epochs:")
    rewards, steps = train_model(epochs)

    # Print rewards and steps for each epoch
    print("Epoch\tReward\tSteps")
    for i, (epoch_reward, epoch_steps) in enumerate(zip(rewards, steps)):
        print(i, "\t", epoch_reward, "\t", epoch_steps)

    # Watch the greedy policy on 10 trips and report the mean trip length.
    avg_trip_length = evaluate_policy(10)
    print("Average Trip Length ({} epochs):".format(epochs), avg_trip_length)

# Phase 2: Testing and Evaluation


In [None]:
# Start Phase 2 from an untrained Q-table: one row per state, one column per action.
q_table = np.zeros([streets.observation_space.n, streets.action_space.n])  # 500 x 6

# Hyperparameters
learning_rate = 0.1  # weight given to the new estimate when a Q-value is updated
# Probability of taking a random action during training. Defined here as well
# so that this cell does not depend on the Phase 1 cell having been run.
exploration = 0.1
discount_factors = [0.3, 0.5, 0.7]  # Different discount factors to compare
epochs = 10000  # number of training epochs used for every discount factor

# Function to train the model for the given number of epochs and discount factor
def train_model(epochs, discount_factor):
    """Run Q-learning for ``epochs`` episodes with the given discount factor.

    Operates on the module-level ``streets`` environment and updates the
    module-level ``q_table`` in place. The step size and exploration
    probability come from the module-level ``learning_rate`` and
    ``exploration``.

    Args:
        epochs: Number of episodes to play. Each episode runs until the
            environment reports that it is done.
        discount_factor: Multiplier applied to the best next-state Q-value
            when forming the update target.

    Returns:
        A ``(rewards, steps)`` pair of lists with one entry per episode:
        the summed reward of the episode and the number of steps it took.
    """
    reward_history = []
    step_history = []

    for _episode in range(epochs):
        state = streets.reset()
        episode_reward = 0
        episode_length = 0
        finished = False

        while not finished:
            # Draw first, then decide: explore with probability `exploration`,
            # otherwise exploit the current table.
            explore = random.random() < exploration
            action = (
                streets.action_space.sample()
                if explore
                else q_table[state].argmax()
            )

            next_state, reward, finished, _ = streets.step(action)

            # Move the stored value towards reward + discounted best next value.
            old_value = q_table[state, action]
            best_next = q_table[next_state].max()
            q_table[state, action] = (
                (1 - learning_rate) * old_value
                + learning_rate * (reward + discount_factor * best_next)
            )

            episode_reward += reward
            episode_length += 1
            state = next_state

        reward_history.append(episode_reward)
        step_history.append(episode_length)

    return reward_history, step_history


# Function to evaluate the policy for a given number of trips
def evaluate_policy(trips):
    """Run ``trips`` greedy trips and report their mean length and reward.

    Each trip resets the module-level ``streets`` environment and always takes
    the highest-valued action from the module-level ``q_table``. A trip ends
    when the environment reports done or after 25 steps.

    Args:
        trips: Number of trips to run.

    Returns:
        A ``(avg_trip_length, avg_trip_reward)`` pair: the mean number of
        steps and the mean summed reward per trip.
    """
    lengths, totals = [], []

    for _trip in range(trips):
        state = streets.reset()
        finished = False
        n_steps = 0
        reward_sum = 0

        while n_steps < 25 and not finished:
            # Greedy action only: no exploration during evaluation.
            state, reward, finished, _ = streets.step(np.argmax(q_table[state]))
            n_steps += 1
            reward_sum += reward

        lengths.append(n_steps)
        totals.append(reward_sum)

    return sum(lengths) / trips, sum(totals) / trips


# Train and evaluate the model for different discount factors
results = []

for discount_factor in discount_factors:
    # Each discount factor must learn from scratch; otherwise later factors
    # start from the table trained under the earlier ones and the comparison
    # is confounded. Zeroing in place keeps the same array object that
    # train_model and evaluate_policy read.
    q_table.fill(0)

    print("\nTraining for", epochs, "epochs with discount factor =", discount_factor)
    rewards, steps = train_model(epochs, discount_factor)

    # Print rewards and steps for each epoch
    print("Epoch\tReward\tSteps")
    for i, (epoch_reward, epoch_steps) in enumerate(zip(rewards, steps)):
        print(i, "\t", epoch_reward, "\t", epoch_steps)

    # Score the greedy policy learned under this discount factor on 10 trips.
    avg_trip_length, avg_trip_reward = evaluate_policy(10)
    print("Average Trip Length ({} epochs):".format(epochs), avg_trip_length)
    print("Average Trip Reward ({} epochs):".format(epochs), avg_trip_reward)

    results.append((discount_factor, avg_trip_length, avg_trip_reward))

# Print the results for each discount factor
print("\nResults:")
print("Discount Factor\tAverage Trip Length\tAverage Trip Reward")
for factor, mean_length, mean_reward in results:
    print(factor, "\t\t", mean_length, "\t\t\t", mean_reward)
