#### Optimal Opponent Experiments
Author: Yemi Kelani

##### Google Drive Setup (Skip if running locally)

> To run this notebook, follow these steps:
> 1. Download the latest version of the [repository](https://github.com/yemi-kelani/artificial-intelligence/tree/master).
> 2. Upload the repository files to your Google Drive account under the path `Projects/artificial-intelligence`.
> 3. Open this file (`optimal-train.ipynb`) from your Google Drive and run the experiments.

In [None]:
# Colab only: mount Google Drive at /content/drive so the repository files
# uploaded there can be read by the cells below.
from google.colab import drive
drive.mount('/content/drive')

In [9]:
# Locations of the uploaded repository inside the mounted Drive. The paths are
# relative (presumably to Colab's default working directory /content — confirm).
# Every consumer appends "/<name>" to these values, so none of them may end
# with a slash; a trailing slash on ROOT_FOLDER produced "ReinforcementLearning//...".
ROOT_FOLDER = "drive/MyDrive/Projects/artificial-intelligence/models/ReinforcementLearning"
PROJECT_PATH = f"{ROOT_FOLDER}/DeepQ_TicTacToe_v2"
NOTEBOOK_LOCATION = f"{PROJECT_PATH}/experiments"

In [10]:
# Copy the project modules into the current working directory so the imports
# below can load them by bare module name (DeepQAgent, TicTacToeGame, Utils).
!cp {PROJECT_PATH}/DeepQAgent.py .
!cp {PROJECT_PATH}/TicTacToeGame.py .
!cp {ROOT_FOLDER}/Utils.py .

from DeepQAgent import DeepQAgent
from  TicTacToeGame import TicTacToeGame, OPPONENT_LEVEL
from Utils import (
    train_agent,
    test_agent,
    set_seed
)
# Directory (inside the mounted Drive) where the "<name>.pt" checkpoints are
# looked up by get_full_model_path() and handed to train_agent().
MODEL_PATH = "drive/MyDrive/Projects/artificial-intelligence/trained_models/ReinforcementLearning/TicTacToeV2"

##### Local Setup (Skip if running remotely)
> 1. Run the following cells

In [None]:
from models.ReinforcementLearning.DeepQ_TicTacToe_v2.DeepQAgent import DeepQAgent
from models.ReinforcementLearning.DeepQ_TicTacToe_v2.TicTacToeGame import TicTacToeGame, OPPONENT_LEVEL
from models.ReinforcementLearning.Utils import (
    train_agent,
    test_agent,
    set_seed
)
# Directory where the "<name>.pt" checkpoints are looked up by
# get_full_model_path() and handed to train_agent() when running locally.
# NOTE(review): the imports above use package paths rooted at the repository
# (models.ReinforcementLearning...), while this relative path climbs four
# levels up; confirm both resolve from the same working directory / sys.path.
MODEL_PATH = "../../../../trained_models/ReinforcementLearning/TicTacToeV2"

##### Experiments

In [17]:
import os
import torch

# Use the GPU when PyTorch reports CUDA as available, otherwise the CPU.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(DEVICE)

# Fixed seed handed to the project's set_seed helper
# (presumably seeds every RNG used during training — confirm in Utils.py).
SEED = 100
set_seed(SEED)

# DeepQ parameters
# Except for NUM_EPISODES and COMPUTER_LEVEL, these module-level values are read
# by supply_model() every time it is called, so reassigning one in a later cell
# changes every agent/optimizer built after that point.
BATCH_SIZE     = 128
NUM_EPISODES   = 1000 if torch.cuda.is_available() else 100  # fewer episodes when no GPU is available
STATE_SPACE    = 9
ACTION_SPACE   = 9
HIDDEN_SIZE    = 128
EPSILON        = 1.0  # passed to DeepQAgent as `epsilon`; the training log prints the current value as `e`
GAMMA          = 0.99
LEARNING_RATE  = 0.001  # learning rate of the Adam optimizer created in supply_model()
DROPOUT        = 0.25
TRAIN_START = 1000  # passed to DeepQAgent as `train_start` (presumably the replay size required before updates begin — confirm)
NEGATIVE_SLOPE = 0.01
COMPUTER_LEVEL = OPPONENT_LEVEL.NAIVE  # NOTE(review): not referenced by any cell visible in this notebook

# save path
# MODEL_NAME is the fallback file stem used by get_full_model_path() when no
# agent_name is supplied. It is empty here, so that fallback resolves to ".pt".
MODEL_NAME = ""

# Checkpoint file stems (without the ".pt" extension) stored under MODEL_PATH.
# The middle token appears to name the opponent used for training and the suffix
# the cumulative number of training episodes — confirm against how each file was produced.
BASELINE = "TicTacToev2-Baseline-Untrained"
NAIVE_2K = "TicTacToev2-NAIVE-2K"
NAIVE_4K = "TicTacToev2-NAIVE-4K"
NAIVE_6K = "TicTacToev2-NAIVE-6K"
NAIVE_8K = "TicTacToev2-NAIVE-8K"
NAIVE_10K = "TicTacToev2-NAIVE-10K"
AGENT_1K = "TicTacToev2-AGENT-1K"
AGENT_2K = "TicTacToev2-AGENT-2K"
AGENT_3K = "TicTacToev2-AGENT-3K"
AGENT_4K = "TicTacToev2-AGENT-4K"
OPTIMAL_1K = "TicTacToev2-OPTIMAL-1K"
OPTIMAL_2K = "TicTacToev2-OPTIMAL-2K"
OPTIMAL_4K = "TicTacToev2-OPTIMAL-4K"
OPTIMAL_6K = "TicTacToev2-OPTIMAL-6K"
OPTIMAL_8K = "TicTacToev2-OPTIMAL-8K"


def get_full_model_path(agent_name: str = None):
  """Build the checkpoint path for a model name.

  Args:
    agent_name: File stem of the checkpoint (no extension). When None, the
      module-level MODEL_NAME is used instead.

  Returns:
    MODEL_PATH joined with "<stem>.pt".
  """
  file_stem = MODEL_NAME if agent_name is None else agent_name
  return os.path.join(MODEL_PATH, file_stem + ".pt")

def supply_model(load_if_exists: bool = True, agent_name: str = None):
  """Create a DeepQAgent with its optimizer and loss, optionally restoring weights.

  The agent is built from the module-level hyperparameter constants as they
  are at call time. If `load_if_exists` is true and the checkpoint file for
  `agent_name` exists, its parameters are loaded into the agent. If the file
  is missing, a notice is printed and the freshly initialized agent is
  returned, so an evaluation of a misspelled or not-yet-trained checkpoint
  cannot be mistaken for a result of the trained model.

  Args:
    load_if_exists: Whether to try restoring parameters from disk.
    agent_name: Checkpoint file stem, resolved by get_full_model_path().
      None falls back to that function's default name.

  Returns:
    A tuple (agent, optimizer, criterion): the DeepQAgent, an Adam optimizer
    over agent.parameters() with learning rate LEARNING_RATE, and a
    SmoothL1Loss (Huber loss) instance.
  """
  agent = DeepQAgent(
      device         = DEVICE,
      epsilon        = EPSILON,
      gamma          = GAMMA,
      state_space    = STATE_SPACE,
      action_space   = ACTION_SPACE,
      hidden_size    = HIDDEN_SIZE,
      dropout        = DROPOUT,
      train_start    = TRAIN_START,
      batch_size     = BATCH_SIZE,
      negative_slope = NEGATIVE_SLOPE
  )

  full_model_path = get_full_model_path(agent_name)
  if load_if_exists:
    if os.path.exists(full_model_path):
      print("Loading Model Parameters...")
      agent.load_model(filepath=full_model_path)
    else:
      # Make the fallback visible instead of silently returning random weights.
      print(f"No checkpoint found at '{full_model_path}'. Using freshly initialized parameters.")

  # The optimizer is created after the (optional) load, over the agent's current parameters.
  optimizer = torch.optim.Adam(agent.parameters(), lr=LEARNING_RATE)
  criterion = torch.nn.SmoothL1Loss() # Huber Loss

  return agent, optimizer, criterion

def compare_to_naive(agent_name: str, num_episodes: int = 25000):
  """Evaluate a saved agent against the NAIVE built-in opponent.

  Args:
    agent_name: Checkpoint file stem to load via supply_model().
    num_episodes: Number of evaluation episodes passed to test_agent().

  The environment is created with start_as_X=False and no opponent model.
  Results are reported by test_agent(); nothing is returned.
  """
  contender, _optimizer, _criterion = supply_model(load_if_exists=True, agent_name=agent_name)
  arena = TicTacToeGame(DEVICE, None, OPPONENT_LEVEL.NAIVE, start_as_X=False)
  test_agent(contender, arena, num_episodes)

def compare_to_optimal(agent_name: str, num_episodes: int = 100):
  """Evaluate a saved agent against the OPTIMAL built-in opponent.

  Args:
    agent_name: Checkpoint file stem to load via supply_model().
    num_episodes: Number of evaluation episodes passed to test_agent().

  The environment is created with start_as_X=False and no opponent model.
  Results are reported by test_agent(); nothing is returned.
  """
  contender, _optimizer, _criterion = supply_model(load_if_exists=True, agent_name=agent_name)
  arena = TicTacToeGame(DEVICE, None, OPPONENT_LEVEL.OPTIMAL, start_as_X=False)
  test_agent(contender, arena, num_episodes)

def compare_to_model(agent_name: str, model_name: str, num_episodes: int = 10000):
  """Evaluate one saved agent against another saved agent.

  Args:
    agent_name: Checkpoint file stem of the agent under evaluation.
    model_name: Checkpoint file stem of the agent used as the opponent.
    num_episodes: Number of evaluation episodes passed to test_agent().

  Both agents are loaded via supply_model(). The opponent is handed to the
  environment together with OPPONENT_LEVEL.AGENT and start_as_X=False.
  Results are reported by test_agent(); nothing is returned.
  """
  challenger, _optimizer, _criterion = supply_model(load_if_exists=True, agent_name=agent_name)
  rival, _rival_optimizer, _rival_criterion = supply_model(load_if_exists=True, agent_name=model_name)
  arena = TicTacToeGame(DEVICE, rival, OPPONENT_LEVEL.AGENT, start_as_X=False)
  test_agent(challenger, arena, num_episodes)


cuda


In [5]:
# Start from the AGENT_1K checkpoint (loaded only if the file exists) and train
# for NUM_EPISODES episodes against the OPTIMAL opponent.
# train_agent receives MODEL_PATH and model_name=OPTIMAL_1K — presumably the
# location/name the result is saved under (the evaluation cells later load
# OPTIMAL_1K from MODEL_PATH); confirm in Utils.py.
agent, optimizer, criterion = supply_model(load_if_exists=True, agent_name=AGENT_1K)
environment = TicTacToeGame(DEVICE, None, OPPONENT_LEVEL.OPTIMAL)
reward_history = train_agent(
    agent,
    environment,
    NUM_EPISODES,
    optimizer,
    criterion,
    DEVICE,
    MODEL_PATH,
    model_name = OPTIMAL_1K
)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
‾‾‾‾‾‾‾
episode: 168/1000, steps: 4, reward_total: -2, e: 1.0, time: 05:22:40
_______
|O|X|X|
|O|O|O|
|X| |X|
‾‾‾‾‾‾‾
episode: 169/1000, steps: 3, reward_total: -2, e: 1.0, time: 05:22:46
_______
|X|O|X|
|X| |O|
|X| |O|
‾‾‾‾‾‾‾
episode: 170/1000, steps: 4, reward_total: -2, e: 1.0, time: 05:22:46
_______
|X|O|X|
|O|O|O|
| |X|X|
‾‾‾‾‾‾‾
episode: 171/1000, steps: 3, reward_total: -2, e: 1.0, time: 05:22:54
_______
|X|O|X|
|O|O|X|
| | |X|
‾‾‾‾‾‾‾
episode: 172/1000, steps: 4, reward_total: -2, e: 1.0, time: 05:22:55
_______
|O|X|O|
|X|X|O|
|X| |O|
‾‾‾‾‾‾‾
episode: 173/1000, steps: 2, reward_total: -2, e: 1.0, time: 05:23:01
_______
|X|X|X|
| |O| |
|O| | |
‾‾‾‾‾‾‾
episode: 174/1000, steps: 4, reward_total: 1, e: 1.0, time: 05:23:02
_______
|O|X|O|
|O|X| |
|X|X| |
‾‾‾‾‾‾‾
episode: 175/1000, steps: 3, reward_total: -2, e: 1.0, time: 05:23:10
_______
|X|X|O|
| |X| |
|O|X|O|
‾‾‾‾‾‾‾
episode: 176/1000, steps: 3, reward_total: -2, e

In [7]:
# Continue from the OPTIMAL_1K checkpoint (loaded only if the file exists) and
# train for another NUM_EPISODES episodes against the OPTIMAL opponent.
# train_agent receives MODEL_PATH and model_name=OPTIMAL_2K — presumably the
# location/name the result is saved under; confirm in Utils.py.
# Note: supply_model() builds a fresh agent and optimizer from the module-level
# constants on every call, so this run again starts with epsilon=EPSILON.
agent, optimizer, criterion = supply_model(load_if_exists=True, agent_name=OPTIMAL_1K)
environment = TicTacToeGame(DEVICE, None, OPPONENT_LEVEL.OPTIMAL)
reward_history = train_agent(
    agent,
    environment,
    NUM_EPISODES,
    optimizer,
    criterion,
    DEVICE,
    MODEL_PATH,
    model_name = OPTIMAL_2K
)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
‾‾‾‾‾‾‾
episode: 168/1000, steps: 4, reward_total: -2, e: 1.0, time: 06:54:43
_______
|O|X|X|
|O|O|O|
|X| |X|
‾‾‾‾‾‾‾
episode: 169/1000, steps: 3, reward_total: -2, e: 1.0, time: 06:54:48
_______
|X|O|X|
|X| |O|
|X| |O|
‾‾‾‾‾‾‾
episode: 170/1000, steps: 4, reward_total: -2, e: 1.0, time: 06:54:48
_______
|X|O|X|
|O|O|O|
| |X|X|
‾‾‾‾‾‾‾
episode: 171/1000, steps: 3, reward_total: -2, e: 1.0, time: 06:54:57
_______
|X|O|X|
|O|O|X|
| | |X|
‾‾‾‾‾‾‾
episode: 172/1000, steps: 4, reward_total: -2, e: 1.0, time: 06:54:58
_______
|O|X|O|
|X|X|O|
|X| |O|
‾‾‾‾‾‾‾
episode: 173/1000, steps: 2, reward_total: -2, e: 1.0, time: 06:55:04
_______
|X|X|X|
| |O| |
|O| | |
‾‾‾‾‾‾‾
episode: 174/1000, steps: 4, reward_total: 1, e: 1.0, time: 06:55:05
_______
|O|X|O|
|O|X| |
|X|X| |
‾‾‾‾‾‾‾
episode: 175/1000, steps: 3, reward_total: -2, e: 1.0, time: 06:55:12
_______
|X|X|O|
| |X| |
|O|X|O|
‾‾‾‾‾‾‾
episode: 176/1000, steps: 3, reward_total: -2, e

In [8]:
# OPTIMAL_1K vs. the NAIVE opponent (25000 episodes, the function default).
compare_to_naive(OPTIMAL_1K)

Loading Model Parameters...
Model loaded from 'drive/MyDrive/Projects/artificial-intelligence/trained_models/ReinforcementLearning/TicTacToeV2/TicTacToev2-OPTIMAL-1K.pt'.


100%|██████████| 25000/25000 [03:14<00:00, 128.50it/s]


Win rate:  27.912%
Draw rate: 30.744%
Loss rate: 41.344%





In [9]:
# OPTIMAL_2K vs. the NAIVE opponent (25000 episodes, the function default).
compare_to_naive(OPTIMAL_2K)

Loading Model Parameters...
Model loaded from 'drive/MyDrive/Projects/artificial-intelligence/trained_models/ReinforcementLearning/TicTacToeV2/TicTacToev2-OPTIMAL-2K.pt'.


100%|██████████| 25000/25000 [02:49<00:00, 147.74it/s]


Win rate:  56.516%
Draw rate: 5.36%
Loss rate: 38.124%





In [10]:
# OPTIMAL_1K vs. the OPTIMAL opponent (100 episodes, the function default).
compare_to_optimal(OPTIMAL_1K)

Loading Model Parameters...
Model loaded from 'drive/MyDrive/Projects/artificial-intelligence/trained_models/ReinforcementLearning/TicTacToeV2/TicTacToev2-OPTIMAL-1K.pt'.


100%|██████████| 100/100 [06:23<00:00,  3.84s/it]


Win rate:  0.0%
Draw rate: 40.0%
Loss rate: 60.0%





In [11]:
# OPTIMAL_2K vs. the OPTIMAL opponent (100 episodes, the function default).
compare_to_optimal(OPTIMAL_2K)

Loading Model Parameters...
Model loaded from 'drive/MyDrive/Projects/artificial-intelligence/trained_models/ReinforcementLearning/TicTacToeV2/TicTacToev2-OPTIMAL-2K.pt'.


100%|██████████| 100/100 [06:19<00:00,  3.80s/it]


Win rate:  8.0%
Draw rate: 0.0%
Loss rate: 92.0%





In [13]:
# OPTIMAL_1K vs. the saved NAIVE_2K agent (10000 episodes, the function default).
compare_to_model(OPTIMAL_1K, NAIVE_2K)

Loading Model Parameters...
Model loaded from 'drive/MyDrive/Projects/artificial-intelligence/trained_models/ReinforcementLearning/TicTacToeV2/TicTacToev2-OPTIMAL-1K.pt'.
Loading Model Parameters...
Model loaded from 'drive/MyDrive/Projects/artificial-intelligence/trained_models/ReinforcementLearning/TicTacToeV2/TicTacToev2-NAIVE-2K.pt'.


100%|██████████| 10000/10000 [01:28<00:00, 113.43it/s]


Win rate:  31.01%
Draw rate: 23.0%
Loss rate: 45.99%





In [14]:
# OPTIMAL_2K vs. the saved NAIVE_2K agent (10000 episodes, the function default).
compare_to_model(OPTIMAL_2K, NAIVE_2K)

Loading Model Parameters...
Model loaded from 'drive/MyDrive/Projects/artificial-intelligence/trained_models/ReinforcementLearning/TicTacToeV2/TicTacToev2-OPTIMAL-2K.pt'.
Loading Model Parameters...
Model loaded from 'drive/MyDrive/Projects/artificial-intelligence/trained_models/ReinforcementLearning/TicTacToeV2/TicTacToev2-NAIVE-2K.pt'.


100%|██████████| 10000/10000 [01:16<00:00, 131.56it/s]


Win rate:  55.45%
Draw rate: 16.78%
Loss rate: 27.77%





In [12]:
# Overrides for the next training run. These rebind the module-level constants,
# so every later supply_model()/train_agent() call in this kernel sees them.
# NOTE(review): TRAIN_START is already 1000 in the configuration cell, so the
# first line changes nothing on a fresh top-to-bottom run.
TRAIN_START = 1000
# Unconditionally 2000 here, unlike the GPU-dependent value set in the configuration cell.
NUM_EPISODES = 2000

In [9]:
# Continue from the OPTIMAL_2K checkpoint (loaded only if the file exists) and
# train for NUM_EPISODES episodes against the OPTIMAL opponent.
# train_agent receives MODEL_PATH and model_name=OPTIMAL_4K — presumably the
# location/name the result is saved under; confirm in Utils.py.
agent, optimizer, criterion = supply_model(load_if_exists=True, agent_name=OPTIMAL_2K)
environment = TicTacToeGame(DEVICE, None, OPPONENT_LEVEL.OPTIMAL)
reward_history = train_agent(
    agent,
    environment,
    NUM_EPISODES,
    optimizer,
    criterion,
    DEVICE,
    MODEL_PATH,
    model_name = OPTIMAL_4K
)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
‾‾‾‾‾‾‾
episode: 1168/2000, steps: 3, reward_total: 1, e: 0.062, time: 18:44:26
_______
|O| | |
|O| | |
|X|X|X|
‾‾‾‾‾‾‾
episode: 1169/2000, steps: 2, reward_total: -2, e: 0.062, time: 18:44:34
_______
|X|O| |
|X| | |
|X|O| |
‾‾‾‾‾‾‾
episode: 1170/2000, steps: 3, reward_total: 1, e: 0.061, time: 18:44:36
_______
|O| | |
| | |O|
|X|X|X|
‾‾‾‾‾‾‾
episode: 1171/2000, steps: 4, reward_total: -2, e: 0.061, time: 18:44:46
_______
|X|X|O|
|O|X|X|
|O|X|O|
‾‾‾‾‾‾‾
episode: 1172/2000, steps: 3, reward_total: 1, e: 0.061, time: 18:44:48
_______
|O| |O|
| | | |
|X|X|X|
‾‾‾‾‾‾‾
episode: 1173/2000, steps: 3, reward_total: -2, e: 0.061, time: 18:44:57
_______
|X|O|X|
|X| | |
|X|O|O|
‾‾‾‾‾‾‾
episode: 1174/2000, steps: 3, reward_total: 1, e: 0.061, time: 18:45:00
_______
|O| | |
| | |O|
|X|X|X|
‾‾‾‾‾‾‾
episode: 1175/2000, steps: 4, reward_total: -2, e: 0.06, time: 18:45:10
_______
|X|X|X|
|O|O|X|
|X|O|O|
‾‾‾‾‾‾‾
episode: 1176/2000, steps: 5

In [5]:
# OPTIMAL_4K vs. the NAIVE opponent (25000 episodes, the function default).
compare_to_naive(OPTIMAL_4K)

Loading Model Parameters...
Model loaded from 'drive/MyDrive/Projects/artificial-intelligence/trained_models/ReinforcementLearning/TicTacToeV2/TicTacToev2-OPTIMAL-4K.pt'.


100%|██████████| 25000/25000 [02:37<00:00, 158.69it/s]


Win rate:  52.392%
Draw rate: 17.216%
Loss rate: 30.392%





In [6]:
# OPTIMAL_4K vs. the OPTIMAL opponent (100 episodes, the function default).
compare_to_optimal(OPTIMAL_4K)


Loading Model Parameters...
Model loaded from 'drive/MyDrive/Projects/artificial-intelligence/trained_models/ReinforcementLearning/TicTacToeV2/TicTacToev2-OPTIMAL-4K.pt'.


100%|██████████| 100/100 [06:31<00:00,  3.91s/it]


Win rate:  27.0%
Draw rate: 35.0%
Loss rate: 38.0%





In [7]:
# OPTIMAL_4K vs. the saved NAIVE_2K agent (10000 episodes, the function default).
compare_to_model(OPTIMAL_4K, NAIVE_2K)

Loading Model Parameters...
Model loaded from 'drive/MyDrive/Projects/artificial-intelligence/trained_models/ReinforcementLearning/TicTacToeV2/TicTacToev2-OPTIMAL-4K.pt'.
Loading Model Parameters...
Model loaded from 'drive/MyDrive/Projects/artificial-intelligence/trained_models/ReinforcementLearning/TicTacToeV2/TicTacToev2-NAIVE-2K.pt'.


100%|██████████| 10000/10000 [01:09<00:00, 143.06it/s]


Win rate:  70.92%
Draw rate: 11.91%
Loss rate: 17.17%





In [8]:
# OPTIMAL_4K vs. its own earlier checkpoint OPTIMAL_2K (10000 episodes, the function default).
compare_to_model(OPTIMAL_4K, OPTIMAL_2K)

Loading Model Parameters...
Model loaded from 'drive/MyDrive/Projects/artificial-intelligence/trained_models/ReinforcementLearning/TicTacToeV2/TicTacToev2-OPTIMAL-4K.pt'.
Loading Model Parameters...
Model loaded from 'drive/MyDrive/Projects/artificial-intelligence/trained_models/ReinforcementLearning/TicTacToeV2/TicTacToev2-OPTIMAL-2K.pt'.


100%|██████████| 10000/10000 [01:16<00:00, 130.83it/s]


Win rate:  54.89%
Draw rate: 17.28%
Loss rate: 27.83%





The evaluation results for OPTIMAL_4K look promising. That said, the final boards printed by the training loop suggest that the agent fixates on reaching certain states (e.g. all Xs on the bottom row).