<a href="https://colab.research.google.com/github/vanakema/reinforcement_learning_course_solutions/blob/main/q-learning/Taxi_Q_Learning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# OpenAI Gym Taxi Q-Learning solution
This notebook is my Q-learning solution to the Taxi-v3 environment from OpenAI Gym. It is a basic tabular Q-learning approach, nothing fancy like deep neural networks.

Install pip packages

In [None]:
!pip install numpy
!pip install gym
!pip install wandb -qqq



Import libs needed, and log into W&B

In [None]:
import numpy as np
import gym
import random
import wandb
# Log this session in to Weights & Biases before any run is created.
wandb.login()



True

Make the environment and show what it looks like

In [None]:
# Create the Taxi-v3 environment; `env` is reused by the training and test cells.
env = gym.make("Taxi-v3")
# Show the environment's current state in the cell output.
env.render()

+---------+
|[34;1mR[0m: | : :[43mG[0m|
| : | : : |
| : : : : |
| | : | : |
|Y| : |[35mB[0m: |
+---------+



Get the observation/action space and initialize the Q-Table

In [None]:
# `.n` is the size of each space (500 states and 6 actions for Taxi-v3).
observation_space = env.observation_space.n
action_space = env.action_space.n

# One row per state, one column per action; every value estimate starts at zero.
q_table = np.zeros(shape=(observation_space, action_space))

print(f"Observation space: {observation_space}")
print(f"Action space: {action_space}")
print(f"Q-Table\n{q_table}")

Observation space: 500
Action space: 6
Q-Table
[[0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]
 ...
 [0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]]


W&B initialization and hyper params

In [None]:
# Start the W&B run and register the hyperparameters; the training and test
# cells read them back through `config`.
wandb.init(
    project="taxiv3-qlearning",
    config=dict(
        # Loop sizes
        episodes=1_000_000,    # number of training episodes
        test_episodes=100,     # number of greedy evaluation episodes
        max_steps=100,         # step cap per episode (training and testing)

        # Q-value update
        learning_rate=0.7,     # weight of the temporal-difference error
        discount_rate=0.9,     # weight of the next state's best Q-value

        # Epsilon-greedy exploration schedule:
        #   min + (max - min) * exp(-exploration_decay_rate * episode)
        # NOTE(review): with a decay of 1e-6 over 1,000,000 episodes the schedule
        # ends near 0.01 + 0.99 * exp(-1) ~= 0.37, far from the minimum; confirm
        # this is intended.
        exploration_rate=1.0,  # starting probability of taking a random action
        max_exploration_rate=1.0,
        min_exploration_rate=0.01,
        exploration_decay_rate=1e-6,
    ),
)

config = wandb.config

VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
episode,▁▁█
exploration_rate,██▁

0,1
episode,32500
exploration_rate,0.99679
phase,train


Train the model (Q-learning loop)

In [None]:
# Train the Q-table with epsilon-greedy Q-learning.
#
# Each episode runs for at most config.max_steps steps. Every 500th episode is
# logged to W&B whether it terminated or was cut off at the step cap (previously
# cut-off episodes were silently skipped).
#
# Epsilon is tracked in a local variable rather than written back to wandb.config
# on every episode; the decayed value is still reported through wandb.log.
exploration_rate = config.exploration_rate

for episode in range(config.episodes):
  state = env.reset()
  optimal_action_rewards = []  # rewards collected on greedy (non-random) steps only

  for step in range(config.max_steps):
    # Exploit the Q-table when the uniform draw exceeds epsilon, otherwise explore.
    is_optimal_action_step = random.uniform(0, 1) > exploration_rate
    if is_optimal_action_step:
      action = np.argmax(q_table[state, :])
    else:
      action = env.action_space.sample()

    state_1, reward, done, info = env.step(action)

    if is_optimal_action_step:
      optimal_action_rewards.append(reward)

    # Q(s, a) <- Q(s, a) + lr * (r + gamma * max_a' Q(s', a') - Q(s, a))
    q_table[state, action] = q_table[state, action] + config.learning_rate * (
        reward + config.discount_rate * np.max(q_table[state_1, :]) - q_table[state, action]
    )

    state = state_1

    if done:
      break

  if episode % 500 == 0:
    metrics = {"phase": "train", "episode": episode, "exploration_rate": exploration_rate}
    # Average over greedy steps only. The guard is "no greedy steps were taken",
    # not "the sum is positive", so negative averages are reported as well.
    if optimal_action_rewards:
      metrics["episode_average_optimal_reward"] = sum(optimal_action_rewards) / len(optimal_action_rewards)
    wandb.log(metrics)

  # Exponential decay from max_exploration_rate towards min_exploration_rate.
  exploration_rate = config.min_exploration_rate + (
      config.max_exploration_rate - config.min_exploration_rate
  ) * np.exp(-config.exploration_decay_rate * episode)

Test the model

In [None]:
# Evaluate the learned Q-table: always take the greedy action, no exploration
# and no Q-table updates.
total_rewards = []

for episode in range(config.test_episodes):
  state = env.reset()
  total_episode_reward = 0

  for step in range(config.max_steps):
    env.render()

    action = np.argmax(q_table[state, :])

    state_1, reward, done, info = env.step(action)

    total_episode_reward += reward
    state = state_1

    if done:
      break

  # Record every episode, including ones cut off at max_steps, so the average
  # below is taken over the same episodes whose rewards were summed.
  total_rewards.append(total_episode_reward)
  wandb.log({ "phase": "test", "episode": episode, "episode_reward": total_episode_reward})
  # Print this episode's reward (previously the whole running list was printed).
  print(f"Episode reward: {total_episode_reward}")

env.close()

# Guard against an empty evaluation (test_episodes == 0).
average_reward = sum(total_rewards) / len(total_rewards) if total_rewards else float("nan")
print("Avg reward over episodes: " + str(average_reward))


[1;30;43mStreaming output truncated to the last 5000 lines.[0m

+---------+
|R: | : :[34;1mG[0m|
| : | : : |
| : : : : |
| | :[43m [0m| : |
|[35mY[0m| : |B: |
+---------+
  (North)
+---------+
|R: | : :[34;1mG[0m|
| : | : : |
| : :[43m [0m: : |
| | : | : |
|[35mY[0m| : |B: |
+---------+
  (North)
+---------+
|R: | : :[34;1mG[0m|
| : |[43m [0m: : |
| : : : : |
| | : | : |
|[35mY[0m| : |B: |
+---------+
  (North)
+---------+
|R: |[43m [0m: :[34;1mG[0m|
| : | : : |
| : : : : |
| | : | : |
|[35mY[0m| : |B: |
+---------+
  (North)
+---------+
|R: | :[43m [0m:[34;1mG[0m|
| : | : : |
| : : : : |
| | : | : |
|[35mY[0m| : |B: |
+---------+
  (East)
+---------+
|R: | : :[34;1m[43mG[0m[0m|
| : | : : |
| : : : : |
| | : | : |
|[35mY[0m| : |B: |
+---------+
  (East)
+---------+
|R: | : :[42mG[0m|
| : | : : |
| : : : : |
| | : | : |
|[35mY[0m| : |B: |
+---------+
  (Pickup)
+---------+
|R: | : :G|
| : | : :[42m_[0m|
| : : : : |
| | : | : |
|[35mY[0m| : |