#### Optimal Opponent Experiments
Author: Yemi Kelani

##### Google Drive Setup (Skip if running locally)

> To run this notebook, follow these steps:
> 1. Download the latest version of the [repository](https://github.com/yemi-kelani/artificial-intelligence/tree/master).
> 2. Upload the repository files to your Google Drive account under the path `Projects/artificial-intelligence`.
> 3. Open this file (`train.ipynb`) from your Google Drive and run the experiments.

In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Colab paths. They start with "drive/", so they resolve against the working
# directory that contains the Drive mount point.
# ROOT_FOLDER carries no trailing slash: every consumer appends "/<name>", and a
# trailing slash here would produce "//" in the derived paths.
ROOT_FOLDER = "drive/MyDrive/Projects/artificial-intelligence/models/ReinforcementLearning"
PROJECT_PATH = f"{ROOT_FOLDER}/DeepQ_TicTacToe_v2"
NOTEBOOK_LOCATION = f"{PROJECT_PATH}/experiments"

In [None]:
!cp {PROJECT_PATH}/DeepQAgent.py .
!cp {PROJECT_PATH}/TicTacToeGame.py .
!cp {ROOT_FOLDER}/Utils.py .

from DeepQAgent import DeepQAgent
from TicTacToeGame import TicTacToeGame, OPPONENT_LEVEL
from Utils import (
    train_agent,
    test_agent
)
MODEL_PATH = "drive/MyDrive/Projects/artificial-intelligence/trained_models/ReinforcementLearning/TicTacToeV2"

##### Local Setup (Skip if running remotely)
> 1. Run the following cells

In [None]:
# Local imports of the agent, the game environment and the train/test helpers.
# NOTE(review): these absolute `models.…` imports only resolve when the
# repository root is on sys.path, while MODEL_PATH below is relative to the
# working directory (four levels up) — confirm both hold for how this notebook
# is launched.
from models.ReinforcementLearning.DeepQ_TicTacToe_v2.DeepQAgent import DeepQAgent
from models.ReinforcementLearning.DeepQ_TicTacToe_v2.TicTacToeGame import TicTacToeGame, OPPONENT_LEVEL
from models.ReinforcementLearning.Utils import (
    train_agent,
    test_agent
)
# Directory that holds the saved ".pt" checkpoints used throughout the notebook.
MODEL_PATH = "../../../../trained_models/ReinforcementLearning/TicTacToeV2"

##### Experiments

In [None]:
import os
import torch

DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(DEVICE)

# DeepQ parameters.
# supply_model forwards these to DeepQAgent (and LEARNING_RATE to Adam);
# NUM_EPISODES is passed to train_agent by the training cells.
BATCH_SIZE     = 256
NUM_EPISODES   = 10000
STATE_SPACE    = 9
ACTION_SPACE   = 9
HIDDEN_SIZE    = 256
EPSILON        = 1.0
GAMMA          = 0.90
LEARNING_RATE  = 0.001
DROPOUT        = 0.15
TRAIN_START    = 1000 # must be <= 2000, the maxlen of the replay memory (per the original note; confirm in DeepQAgent)
NEGATIVE_SLOPE = 0.01

# Model name roots: checkpoint file names are built from these
# (get_full_model_path appends ".pt"; the cells below add suffixes such as "-2K").
BASELINE = "TicTacToe-v2-BASELINE"
NAIVE = "TicTacToe-v2-NAIVE"
AGENT = "TicTacToe-v2-AGENT"
OPTIMAL = "TicTacToe-v2-OPTIMAL"
SELF = "TicTacToe-v2-SELF"


def get_full_model_path(agent_name):
  """Return the checkpoint path for `agent_name`: MODEL_PATH/<agent_name>.pt."""
  checkpoint_file = agent_name + ".pt"
  return os.path.join(MODEL_PATH, checkpoint_file)

def supply_model(load_if_exists: bool = True, agent_name: str = None):
  """Build a DeepQAgent together with the optimizer and loss used to train it.

  The agent is configured from the module-level hyperparameters (EPSILON,
  GAMMA, TRAIN_START, ...). They are read when this function is called, so
  reassigning them in a later cell affects agents built afterwards.

  Args:
    load_if_exists: When True and `agent_name` is given, load the saved
      parameters from `get_full_model_path(agent_name)` if that file exists.
    agent_name: Checkpoint name without the ".pt" extension, or None to
      always return a freshly initialized agent.

  Returns:
    Tuple `(agent, optimizer, criterion)`: the DeepQAgent, an Adam optimizer
    over `agent.parameters()` with learning rate LEARNING_RATE, and a
    SmoothL1Loss criterion.
  """
  agent = DeepQAgent(
      device         = DEVICE,
      epsilon        = EPSILON,
      gamma          = GAMMA,
      state_space    = STATE_SPACE,
      action_space   = ACTION_SPACE,
      hidden_size    = HIDDEN_SIZE,
      dropout        = DROPOUT,
      train_start    = TRAIN_START,
      batch_size     = BATCH_SIZE,
      negative_slope = NEGATIVE_SLOPE
  )

  # Only build a checkpoint path when a name was supplied: the default
  # agent_name=None cannot be concatenated with ".pt" and used to raise a
  # TypeError before the existence check was even reached.
  if load_if_exists and agent_name is not None:
    full_model_path = get_full_model_path(agent_name)
    if os.path.exists(full_model_path):
      print("Loading Model Parameters...")
      agent.load_model(filepath=full_model_path)
    else:
      # Make the fallback visible so a mistyped checkpoint name is not
      # mistaken for a trained model during evaluation.
      print(f"No saved parameters at {full_model_path}; using a freshly initialized model.")

  optimizer = torch.optim.Adam(agent.parameters(), lr=LEARNING_RATE)
  criterion = torch.nn.SmoothL1Loss() # Huber Loss

  return agent, optimizer, criterion

def compare_to_naive(agent_name: str, num_episodes: int = 10000):
  """Evaluate the `agent_name` checkpoint against the NAIVE opponent.

  The agent comes from supply_model (parameters are loaded only when the
  checkpoint file exists). test_agent is then called with the agent, a
  TicTacToeGame created with start_as_X=False, and `num_episodes`.
  """
  contender, _optimizer, _criterion = supply_model(
      agent_name=agent_name,
      load_if_exists=True,
  )
  naive_game = TicTacToeGame(
      DEVICE,
      None,
      OPPONENT_LEVEL.NAIVE,
      start_as_X=False,
  )
  test_agent(contender, naive_game, num_episodes)

def compare_to_optimal(agent_name: str, num_episodes: int = 100):
  """Evaluate the `agent_name` checkpoint against the OPTIMAL opponent.

  The agent comes from supply_model (parameters are loaded only when the
  checkpoint file exists). test_agent is then called with the agent, a
  TicTacToeGame created with start_as_X=False, and `num_episodes`.
  """
  contender, _optimizer, _criterion = supply_model(
      agent_name=agent_name,
      load_if_exists=True,
  )
  optimal_game = TicTacToeGame(
      DEVICE,
      None,
      OPPONENT_LEVEL.OPTIMAL,
      start_as_X=False,
  )
  test_agent(contender, optimal_game, num_episodes)

def compare_to_model(agent_name: str, model_name: str, num_episodes: int = 10000):
  """Evaluate the `agent_name` checkpoint against the `model_name` checkpoint.

  Both agents come from supply_model (parameters are loaded only when the
  respective checkpoint file exists). The second agent is handed to
  TicTacToeGame together with OPPONENT_LEVEL.AGENT and start_as_X=False, and
  test_agent is called with the first agent, that game, and `num_episodes`.
  """
  contender, _optimizer, _criterion = supply_model(
      agent_name=agent_name,
      load_if_exists=True,
  )
  rival, _rival_optimizer, _rival_criterion = supply_model(
      agent_name=model_name,
      load_if_exists=True,
  )
  versus_game = TicTacToeGame(
      DEVICE,
      rival,
      OPPONENT_LEVEL.AGENT,
      start_as_X=False,
  )
  test_agent(contender, versus_game, num_episodes)


In [None]:
# Create an untrained agent and save it as the BASELINE checkpoint.
# An explicit name is passed so the checkpoint path is built from a real name
# rather than None, and load_if_exists=False guarantees the saved baseline is
# a freshly initialized model even if a BASELINE file is already present.
agent, _, _ = supply_model(load_if_exists=False, agent_name=BASELINE)
agent.save_model(MODEL_PATH, BASELINE)

In [None]:
compare_to_naive(BASELINE)

In [None]:
# Overrides for the training runs below. supply_model reads TRAIN_START at
# call time, so agents built after this cell use the new value.
NUM_EPISODES = 10000
# NOTE(review): 2500 is above the limit of 2000 (replay memory maxlen) noted
# next to the first TRAIN_START definition — confirm training still starts.
TRAIN_START  = 2500

In [None]:
# Start from the BASELINE parameters (loaded if the file exists) and train
# against the NAIVE opponent. train_agent receives MODEL_PATH, the NAIVE name
# root and save_every=2000 (presumably one checkpoint per 2000 episodes —
# confirm in Utils.train_agent).
agent, optimizer, criterion = supply_model(load_if_exists=True, agent_name=BASELINE)
environment = TicTacToeGame(DEVICE, None, OPPONENT_LEVEL.NAIVE)
reward_history = train_agent(
    agent,
    environment,
    NUM_EPISODES,
    optimizer,
    criterion,
    DEVICE,
    MODEL_PATH,
    model_name = NAIVE,
    save_every = 2000,
    epsilon_min_value = 0.30,
    epsilon_max_value = 0.75,
)

In [None]:
compare_to_naive(f"{NAIVE}-2K")

In [None]:
compare_to_naive(f"{NAIVE}-4K")

In [None]:
compare_to_naive(f"{NAIVE}-6K")

In [None]:
compare_to_naive(f"{NAIVE}-8K")

In [None]:
compare_to_naive(f"{NAIVE}-10K")

In [None]:
# The NAIVE-10K+ model was trained for 2000 extra episodes to make up for
# the episodes that were played before training began (see TRAIN_START).
compare_to_naive(f"{NAIVE}-10K+")

In [None]:
# Continue training against the NAIVE opponent from the "-12K" checkpoint,
# using the "-12K+" name root for this run.
# NOTE(review): supply_model only loads parameters when the file exists, and
# how the "-12K" file is produced is not visible in this notebook — confirm it
# is present before running.
agent, optimizer, criterion = supply_model(load_if_exists=True, agent_name=f"{NAIVE}-12K")
environment = TicTacToeGame(DEVICE, None, OPPONENT_LEVEL.NAIVE)
reward_history = train_agent(
    agent,
    environment,
    NUM_EPISODES,
    optimizer,
    criterion,
    DEVICE,
    MODEL_PATH,
    model_name = f"{NAIVE}-12K+",
    save_every = 2000,
    epsilon_min_value = 0.40,
    epsilon_max_value = 0.80,
)

In [None]:
compare_to_naive(f"{NAIVE}-12K")

In [None]:
compare_to_naive(f"{NAIVE}-14K")

In [None]:
compare_to_naive(f"{NAIVE}-16K")

In [None]:
compare_to_naive(f"{NAIVE}-18K")

In [None]:
compare_to_naive(f"{NAIVE}-20K")

In [None]:
compare_to_naive(f"{NAIVE}-22K")

In [None]:
# Continue training against the NAIVE opponent from the "-22K" checkpoint
# (loaded if the file exists), using the "-22K+" name root and an epsilon
# range of 0.20 to 0.80.
agent, optimizer, criterion = supply_model(load_if_exists=True, agent_name=f"{NAIVE}-22K")
environment = TicTacToeGame(DEVICE, None, OPPONENT_LEVEL.NAIVE)
reward_history = train_agent(
    agent,
    environment,
    NUM_EPISODES,
    optimizer,
    criterion,
    DEVICE,
    MODEL_PATH,
    model_name = f"{NAIVE}-22K+",
    save_every = 2000,
    epsilon_min_value = 0.20,
    epsilon_max_value = 0.80,
)

In [None]:
compare_to_naive(f"{NAIVE}-24K")

In [None]:
compare_to_naive(f"{NAIVE}-26K")

In [None]:
compare_to_naive(f"{NAIVE}-28K")

In [None]:
compare_to_naive(f"{NAIVE}-30K")

In [None]:
compare_to_naive(f"{NAIVE}-32K")

In [None]:
compare_to_optimal(f"{NAIVE}-32K")

In [None]:
# Continue training against the NAIVE opponent from the "-32K" checkpoint
# (loaded if the file exists), using the "-32K+" name root. Only
# epsilon_min_value is set here; epsilon_max_value is left at train_agent's
# default.
agent, optimizer, criterion = supply_model(load_if_exists=True, agent_name=f"{NAIVE}-32K")
environment = TicTacToeGame(DEVICE, None, OPPONENT_LEVEL.NAIVE)
reward_history = train_agent(
    agent,
    environment,
    NUM_EPISODES,
    optimizer,
    criterion,
    DEVICE,
    MODEL_PATH,
    model_name = f"{NAIVE}-32K+",
    save_every = 2000,
    epsilon_min_value = 0.1,
)

In [None]:
compare_to_naive(f"{NAIVE}-34K")

In [None]:
compare_to_naive(f"{NAIVE}-36K")

In [None]:
compare_to_naive(f"{NAIVE}-38K")

In [None]:
compare_to_naive(f"{NAIVE}-40K")

In [None]:
compare_to_naive(f"{NAIVE}-42K")

In [None]:
# Continue training against the NAIVE opponent from the "-42K" checkpoint
# (loaded if the file exists), using the "-42K+" name root.
# This run passes the literal 20500 where the other training cells pass
# NUM_EPISODES.
agent, optimizer, criterion = supply_model(load_if_exists=True, agent_name=f"{NAIVE}-42K")
environment = TicTacToeGame(DEVICE, None, OPPONENT_LEVEL.NAIVE)
reward_history = train_agent(
    agent,
    environment,
    20500,
    optimizer,
    criterion,
    DEVICE,
    MODEL_PATH,
    model_name = f"{NAIVE}-42K+",
    save_every = 2000,
    epsilon_min_value = 0.1,
)

In [None]:
compare_to_naive(f"{NAIVE}-44K")

In [None]:
compare_to_naive(f"{NAIVE}-46K")

In [None]:
compare_to_naive(f"{NAIVE}-48K")

In [None]:
# NOTE(review): same call as the preceding cell — the "-48K" checkpoint is
# evaluated twice; one of the two cells can likely be removed.
compare_to_naive(f"{NAIVE}-48K")

In [None]:
compare_to_naive(f"{NAIVE}-50K")

In [None]:
compare_to_naive(f"{NAIVE}-52K")

In [None]:
compare_to_naive(f"{NAIVE}-54K")

In [None]:
# Switch opponents: start from the NAIVE "-54K" checkpoint (loaded if the
# file exists) and train against the OPTIMAL opponent, using the OPTIMAL name
# root for this run.
agent, optimizer, criterion = supply_model(load_if_exists=True, agent_name=f"{NAIVE}-54K")
environment = TicTacToeGame(DEVICE, None, OPPONENT_LEVEL.OPTIMAL)
reward_history = train_agent(
    agent,
    environment,
    NUM_EPISODES,
    optimizer,
    criterion,
    DEVICE,
    MODEL_PATH,
    model_name = OPTIMAL,
    save_every = 2000,
    epsilon_min_value = 0.30,
    epsilon_max_value = 0.75
)

In [None]:
compare_to_naive(f"{OPTIMAL}-2K")

In [None]:
compare_to_naive(f"{OPTIMAL}-4K")

In [None]:
compare_to_naive(f"{OPTIMAL}-6K")

In [None]:
compare_to_naive(f"{OPTIMAL}-8K")

In [None]:
compare_to_naive(f"{OPTIMAL}-10K")

In [None]:
compare_to_optimal(f"{OPTIMAL}-10K")

In [None]:
# Continue training against the OPTIMAL opponent from the "-10K" checkpoint
# (loaded if the file exists), using the "-10K+" name root and an epsilon
# range of 0.20 to 0.50.
agent, optimizer, criterion = supply_model(load_if_exists=True, agent_name=f"{OPTIMAL}-10K")
environment = TicTacToeGame(DEVICE, None, OPPONENT_LEVEL.OPTIMAL)
reward_history = train_agent(
    agent,
    environment,
    NUM_EPISODES,
    optimizer,
    criterion,
    DEVICE,
    MODEL_PATH,
    model_name = f"{OPTIMAL}-10K+",
    save_every = 2000,
    epsilon_min_value = 0.20,
    epsilon_max_value = 0.50
)

In [None]:
compare_to_naive(f"{OPTIMAL}-12K")

In [None]:
compare_to_naive(f"{OPTIMAL}-14K")

In [None]:
compare_to_naive(f"{OPTIMAL}-16K")

In [None]:
compare_to_optimal(f"{OPTIMAL}-16K")

In [None]:
# Continue training against the OPTIMAL opponent from the "-16K" checkpoint
# (loaded if the file exists), using the "-16K+" name root. No epsilon bounds
# are passed, so train_agent's defaults apply.
agent, optimizer, criterion = supply_model(load_if_exists=True, agent_name=f"{OPTIMAL}-16K")
environment = TicTacToeGame(DEVICE, None, OPPONENT_LEVEL.OPTIMAL)
reward_history = train_agent(
    agent,
    environment,
    NUM_EPISODES,
    optimizer,
    criterion,
    DEVICE,
    MODEL_PATH,
    model_name = f"{OPTIMAL}-16K+",
    save_every = 2000,
)

In [None]:
compare_to_naive(f"{OPTIMAL}-18K")
compare_to_optimal(f"{OPTIMAL}-18K")

In [None]:
compare_to_naive(f"{OPTIMAL}-20K")
compare_to_optimal(f"{OPTIMAL}-20K")

In [None]:
compare_to_naive(f"{OPTIMAL}-22K")
compare_to_optimal(f"{OPTIMAL}-22K")

In [None]:
# Continue training against the OPTIMAL opponent from the "-22K" checkpoint
# (loaded if the file exists), using the "-22K+" name root.
# NOTE(review): prep_cosine_anneal(0.5, 1.0, NUM_EPISODES) is called on the
# agent before training — presumably an epsilon schedule between 0.5 and 1.0;
# confirm the argument meaning in DeepQAgent and that train_agent does not
# reset it, since no epsilon bounds are passed to train_agent here.
NUM_EPISODES = 10000
agent, optimizer, criterion = supply_model(load_if_exists=True, agent_name=f"{OPTIMAL}-22K")
agent.prep_cosine_anneal(0.5, 1.0, NUM_EPISODES)
environment = TicTacToeGame(DEVICE, None, OPPONENT_LEVEL.OPTIMAL)
reward_history = train_agent(
    agent,
    environment,
    NUM_EPISODES,
    optimizer,
    criterion,
    DEVICE,
    MODEL_PATH,
    model_name = f"{OPTIMAL}-22K+",
    save_every = 2000
)

In [None]:
# Continue training against the OPTIMAL opponent from the "-32K" checkpoint
# (loaded if the file exists), using the "-32K+" name root.
# prep_cosine_anneal(0.1, 1.0, NUM_EPISODES) is called on the agent before
# training (presumably an epsilon schedule — confirm in DeepQAgent).
NUM_EPISODES = 10000
agent, optimizer, criterion = supply_model(load_if_exists=True, agent_name=f"{OPTIMAL}-32K")
agent.prep_cosine_anneal(0.1, 1.0, NUM_EPISODES)
environment = TicTacToeGame(DEVICE, None, OPPONENT_LEVEL.OPTIMAL)
reward_history = train_agent(
    agent,
    environment,
    NUM_EPISODES,
    optimizer,
    criterion,
    DEVICE,
    MODEL_PATH,
    model_name = f"{OPTIMAL}-32K+",
    save_every = 2000
)

In [None]:
# Continue training against the OPTIMAL opponent from the "-42K" checkpoint
# (loaded if the file exists), using the "-42K+" name root.
# prep_cosine_anneal(0.1, 1.0, NUM_EPISODES) is called on the agent before
# training (presumably an epsilon schedule — confirm in DeepQAgent).
NUM_EPISODES = 10000
agent, optimizer, criterion = supply_model(load_if_exists=True, agent_name=f"{OPTIMAL}-42K")
agent.prep_cosine_anneal(0.1, 1.0, NUM_EPISODES)
environment = TicTacToeGame(DEVICE, None, OPPONENT_LEVEL.OPTIMAL)
reward_history = train_agent(
    agent,
    environment,
    NUM_EPISODES,
    optimizer,
    criterion,
    DEVICE,
    MODEL_PATH,
    model_name = f"{OPTIMAL}-42K+",
    save_every = 2000
)

In [None]:
# Continue training against the OPTIMAL opponent from the "-52K" checkpoint
# (loaded if the file exists), using the "-52K+" name root.
# prep_cosine_anneal(0.1, 1.0, NUM_EPISODES) is called on the agent before
# training (presumably an epsilon schedule — confirm in DeepQAgent).
NUM_EPISODES = 10000
agent, optimizer, criterion = supply_model(load_if_exists=True, agent_name=f"{OPTIMAL}-52K")
agent.prep_cosine_anneal(0.1, 1.0, NUM_EPISODES)
environment = TicTacToeGame(DEVICE, None, OPPONENT_LEVEL.OPTIMAL)
reward_history = train_agent(
    agent,
    environment,
    NUM_EPISODES,
    optimizer,
    criterion,
    DEVICE,
    MODEL_PATH,
    model_name = f"{OPTIMAL}-52K+",
    save_every = 2000
)

In [None]:
# Evaluate the OPTIMAL "-50K" checkpoint against both the NAIVE and the
# OPTIMAL opponent.
compare_to_naive(f"{OPTIMAL}-50K")
compare_to_optimal(f"{OPTIMAL}-50K")