Hello again, world!

In [None]:
import numpy as np
import gym

# Build the Taxi-v3 environment that the agent interacts with.
env = gym.make("Taxi-v3")

# Q-table: one row per discrete state, one column per discrete action,
# with every entry starting at zero.
state_size, action_size = env.observation_space.n, env.action_space.n
q_table = np.zeros(shape=(state_size, action_size))

# Run lengths
total_episodes = 50000       # number of training episodes
total_test_episodes = 100    # number of evaluation episodes
max_steps = 99               # upper bound on steps taken in a single episode

# Q-learning update coefficients
learning_rate = 0.7          # weight given to each new temporal-difference error
gamma = 0.618                # discount applied to the best next-state Q-value

# Exploration schedule (epsilon-greedy)
max_epsilon = 1.0            # exploration probability at start
min_epsilon = 0.01           # minimum exploration probability
decay_rate = 0.01            # exponential decay rate for exploration prob
epsilon = max_epsilon        # current exploration rate; starts at the maximum

# Q-learning training loop: play epsilon-greedy episodes and update the
# Q-table after every transition.
for episode in range(total_episodes):
    # Newer Gym/Gymnasium releases return (observation, info) from reset(),
    # older ones return the observation alone. The Q-table must be indexed
    # by the bare state value, so unwrap the tuple when present.
    reset_result = env.reset()
    state = reset_result[0] if isinstance(reset_result, tuple) else reset_result
    done = False

    for step in range(max_steps):
        # Exploration-exploitation trade-off: exploit the best known action
        # when the random draw exceeds epsilon, otherwise act randomly.
        exp_exp_tradeoff = np.random.uniform(0, 1)

        if exp_exp_tradeoff > epsilon:
            action = np.argmax(q_table[state, :])
        else:
            action = env.action_space.sample()

        # Take the action and observe the outcome. The five-value form is
        # (obs, reward, terminated, truncated, info); the legacy four-value
        # form is (obs, reward, done, info). Either kind of ending stops
        # the episode.
        step_result = env.step(action)
        if len(step_result) == 5:
            new_state, reward, terminated, truncated, info = step_result
            done = terminated or truncated
        else:
            new_state, reward, done, info = step_result

        # Move Q(state, action) toward the one-step target
        # reward + gamma * max_a Q(new_state, a).
        q_table[state, action] += learning_rate * (
            reward + gamma * np.max(q_table[new_state, :]) - q_table[state, action]
        )

        state = new_state

        if done:
            break

    # Reduce exploration rate exponentially with the episode index, never
    # going below min_epsilon.
    epsilon = min_epsilon + (max_epsilon - min_epsilon) * np.exp(-decay_rate * episode)

# Test the trained agent: act greedily on the learned Q-table (no exploration,
# no Q-table updates) and accumulate the rewards over all test episodes.
total_reward = 0
for episode in range(total_test_episodes):
    # reset() returns (observation, info) on newer Gym/Gymnasium releases and
    # the bare observation on older ones; keep only the state value.
    reset_result = env.reset()
    state = reset_result[0] if isinstance(reset_result, tuple) else reset_result
    done = False

    for step in range(max_steps):
        action = np.argmax(q_table[state, :])

        # Accept both the five-value (obs, reward, terminated, truncated, info)
        # and the legacy four-value (obs, reward, done, info) step results.
        step_result = env.step(action)
        if len(step_result) == 5:
            new_state, reward, terminated, truncated, info = step_result
            done = terminated or truncated
        else:
            new_state, reward, done, info = step_result
        total_reward += reward

        # Advance to the observed state; without this the policy would be
        # queried for the episode's initial state on every step.
        state = new_state

        if done:
            break

average_reward = total_reward / total_test_episodes
print("Average reward over {} test episodes: {}".format(total_test_episodes, average_reward))
