#### Optimal Opponent Experiments
Author: Yemi Kelani

##### Google Drive Setup (Skip if running locally)

> To run this notebook, follow these steps:
> 1. Download the latest version of the [repository](https://github.com/yemi-kelani/artificial-intelligence/tree/master).
> 2. Upload the repository files to your Google Drive account under the path `Projects/artificial-intelligence`.
> 3. Open this file (`train.ipynb`) from your Google Drive and run the experiments.

In [None]:
# Colab only: mount Google Drive at /content/drive so the repository files
# uploaded there can be reached from this notebook.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Drive-relative locations of the project sources (consumed by the `cp` cell below).
# ROOT_FOLDER deliberately has no trailing slash: the paths derived from it append
# "/<name>" themselves, so a trailing slash here would yield "//" in those paths.
ROOT_FOLDER = "drive/MyDrive/Projects/artificial-intelligence/models/ReinforcementLearning"
PROJECT_PATH = f"{ROOT_FOLDER}/DeepQ_TicTacToe_v2"
NOTEBOOK_LOCATION = f"{PROJECT_PATH}/experiments"

In [None]:
# Copy the project modules from Drive into the current working directory so
# they can be imported by their bare module names below.
!cp {PROJECT_PATH}/DeepQAgent.py .
!cp {PROJECT_PATH}/TicTacToeGame.py .
!cp {ROOT_FOLDER}/Utils.py .

from DeepQAgent import DeepQAgent
from TicTacToeGame import TicTacToeGame, OPPONENT_LEVEL
from Utils import (
    train_agent,
    test_agent
)
# Directory (Drive-relative) where model checkpoints are saved to and loaded from.
MODEL_PATH = "drive/MyDrive/Projects/artificial-intelligence/trained_models/ReinforcementLearning/TicTacToeV2"

##### Local Setup (Skip if running remotely)

> 1. Run the following cells

In [1]:
# Package-style imports of the project modules.
# NOTE(review): presumably requires the repository root to be importable
# (on sys.path) — confirm how the kernel is launched.
from models.ReinforcementLearning.DeepQ_TicTacToe_v2.DeepQAgent import DeepQAgent
from models.ReinforcementLearning.DeepQ_TicTacToe_v2.TicTacToeGame import TicTacToeGame, OPPONENT_LEVEL
from models.ReinforcementLearning.Utils import (
    train_agent,
    test_agent
)
# Directory where model checkpoints are saved to and loaded from. This is a
# relative path, so it is resolved against the kernel's working directory.
MODEL_PATH = "../../../../trained_models/ReinforcementLearning/TicTacToeV2"

##### Experiments

In [2]:
import os
import torch

# Run on the GPU when one is available, otherwise fall back to the CPU.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(DEVICE)

# DeepQ parameters
BATCH_SIZE     = 256
NUM_EPISODES   = 10000
STATE_SPACE    = 9
ACTION_SPACE   = 9
HIDDEN_SIZE    = 128
EPSILON        = 1.0
GAMMA          = 0.95
LEARNING_RATE  = 0.001
MOMENTUM       = 0.90
MOEMENTUM      = MOMENTUM  # legacy misspelling, kept as an alias for existing references
DROPOUT        = 0.20
TRAIN_START    = 1000  # must be <= 2000 (maxlen of replay memory)

# model roots (checkpoint names, without the ".pt" suffix)
BASELINE = "TicTacToe-v2-BASELINE"
NAIVE = "TicTacToe-v2-NAIVE"
AGENT = "TicTacToe-v2-AGENT"
OPTIMAL = "TicTacToe-v2-OPTIMAL"
SELF = "TicTacToe-v2-SELF"

def get_full_model_path(agent_name: str = None):
  """Return the checkpoint file path for `agent_name` inside MODEL_PATH.

  The file name is `agent_name` followed by ".pt". When `agent_name` is None
  the name part is empty, so the result is MODEL_PATH joined with ".pt".
  """
  file_stem = "" if agent_name is None else agent_name
  return os.path.join(MODEL_PATH, file_stem + ".pt")

def supply_model(
  load_if_exists: bool = True, 
  agent_name: str = None,
  optimizer_type = ""
  ):

  agent = DeepQAgent(
      device         = DEVICE,
      epsilon        = EPSILON,
      gamma          = GAMMA,
      state_space    = STATE_SPACE,
      action_space   = ACTION_SPACE,
      hidden_size    = HIDDEN_SIZE,
      dropout        = DROPOUT,
      train_start    = TRAIN_START,
      batch_size     = BATCH_SIZE,
  )

  full_model_path = get_full_model_path(agent_name)
  if load_if_exists and os.path.exists(full_model_path):
    print("Loading Model Parameters...")
    agent.load_model(filepath=full_model_path)

  match optimizer_type.upper():
    case "SGD":
      optimizer = torch.optim.SGD(
        agent.parameters(), 
        lr=LEARNING_RATE, 
        momentum=MOEMENTUM
      )
    case "RMS":
      optimizer = torch.optim.RMSprop(agent.parameters(), lr=LEARNING_RATE)
    case "ADAM":
      optimizer = torch.optim.Adam(agent.parameters(), lr=LEARNING_RATE)
    case _:
      optimizer = torch.optim.Adam(agent.parameters(), lr=LEARNING_RATE)
      
  # criterion = torch.nn.SmoothL1Loss() # Huber Loss
  # criterion = torch.nn.MSELoss()
  criterion = torch.nn.CrossEntropyLoss()
  
  return agent, optimizer, criterion

def compare_to_naive(agent_name: str, num_episodes: int = 10000):
  """Evaluate the checkpoint `agent_name` against the NAIVE opponent.

  Args:
    agent_name: Checkpoint name (without ".pt"), resolved by `get_full_model_path`.
    num_episodes: Number of episodes passed to `test_agent`.
  """
  checkpoint_path = get_full_model_path(agent_name)
  if not os.path.exists(checkpoint_path):
    # supply_model() only loads parameters when the file exists, so without
    # this notice a wrong name would silently benchmark an unloaded agent.
    print(f"Warning: no checkpoint found at '{checkpoint_path}'; evaluating a freshly constructed agent.")
  _agent, _, _ = supply_model(load_if_exists=True, agent_name=agent_name)
  _environment = TicTacToeGame(DEVICE, None, OPPONENT_LEVEL.NAIVE, start_as_X=False)
  test_agent(_agent, _environment, num_episodes)

def compare_to_optimal(agent_name: str, num_episodes: int = 100):
  """Evaluate the checkpoint `agent_name` against the OPTIMAL opponent.

  Args:
    agent_name: Checkpoint name (without ".pt"), resolved by `get_full_model_path`.
    num_episodes: Number of episodes passed to `test_agent`.
  """
  checkpoint_path = get_full_model_path(agent_name)
  if not os.path.exists(checkpoint_path):
    # supply_model() only loads parameters when the file exists, so without
    # this notice a wrong name would silently benchmark an unloaded agent.
    print(f"Warning: no checkpoint found at '{checkpoint_path}'; evaluating a freshly constructed agent.")
  _agent, _, _ = supply_model(load_if_exists=True, agent_name=agent_name)
  _environment = TicTacToeGame(DEVICE, None, OPPONENT_LEVEL.OPTIMAL, start_as_X=False)
  test_agent(_agent, _environment, num_episodes)

def compare_to_model(agent_name: str, model_name: str, num_episodes: int = 10000):
  """Evaluate the checkpoint `agent_name` against another checkpoint.

  Args:
    agent_name: Checkpoint name (without ".pt") of the agent under test.
    model_name: Checkpoint name (without ".pt") of the agent handed to the
      environment as the opponent.
    num_episodes: Number of episodes passed to `test_agent`.
  """
  for checkpoint_name in (agent_name, model_name):
    checkpoint_path = get_full_model_path(checkpoint_name)
    if not os.path.exists(checkpoint_path):
      # supply_model() only loads parameters when the file exists, so without
      # this notice a wrong name would silently use an unloaded agent.
      print(f"Warning: no checkpoint found at '{checkpoint_path}'; using a freshly constructed agent.")
  _agent, _, _ = supply_model(load_if_exists=True, agent_name=agent_name)
  _enemy, _, _ = supply_model(load_if_exists=True, agent_name=model_name)
  _environment = TicTacToeGame(DEVICE, _enemy, OPPONENT_LEVEL.AGENT, start_as_X=False)
  test_agent(_agent, _environment, num_episodes)


cpu


In [3]:
# Build a freshly constructed agent and save it as the BASELINE checkpoint.
# load_if_exists=False states the intent explicitly instead of relying on no
# file named ".pt" existing in MODEL_PATH (what an agent_name of None resolves to).
agent, _, _ = supply_model(load_if_exists=False)
agent.save_model(MODEL_PATH, BASELINE)

Model saved to '../../../../trained_models/ReinforcementLearning/TicTacToeV2/TicTacToe-v2-BASELINE.pt'.


'../../../../trained_models/ReinforcementLearning/TicTacToeV2/TicTacToe-v2-BASELINE.pt'

In [4]:
# Evaluate the saved BASELINE checkpoint against the NAIVE opponent
# (default of 10000 episodes).
compare_to_naive(BASELINE)

Loading Model Parameters...
Model loaded from '../../../../trained_models/ReinforcementLearning/TicTacToeV2/TicTacToe-v2-BASELINE.pt'.


100%|██████████| 10000/10000 [00:14<00:00, 702.72it/s]



Win rate:  44.6%
Draw rate: 7.42%
Loss rate: 47.98%





In [None]:
# Train against the NAIVE opponent, starting from the BASELINE checkpoint
# (when it exists on disk) and optimising with SGD.
agent, optimizer, criterion = supply_model(
    load_if_exists=True, 
    agent_name=BASELINE,
    optimizer_type="SGD"
)
# NOTE(review): presumably sets up a cosine-annealed epsilon schedule between
# 0.0 and 1.0 over NUM_EPISODES — confirm against DeepQAgent.prep_cosine_anneal.
agent.prep_cosine_anneal(0.0, 1.0, NUM_EPISODES)
environment = TicTacToeGame(DEVICE, None, OPPONENT_LEVEL.NAIVE)
reward_history = train_agent(
    agent,
    environment,
    NUM_EPISODES,
    optimizer,
    criterion,
    DEVICE,
    MODEL_PATH,
    model_name = NAIVE,  # presumably the name root for saved checkpoints — confirm in Utils.train_agent
    save_every = 2000,
    # Optional epsilon bounds, currently disabled:
    # epsilon_min_value = 0.30,
    # epsilon_max_value = 0.75,
)
agent.plot_loss_history()

Loading Model Parameters...
Model loaded from '../../../../trained_models/ReinforcementLearning/TicTacToeV2/TicTacToe-v2-BASELINE.pt'.
Copied weights from policy network to target network.
episode: 1/10000, steps: 3, reward_total: -1, loss_avg: n/a, e: 1.0, time: 12:22:42
_______
|X|X|X|
|O|O| |
|X|O| |
‾‾‾‾‾‾‾
episode: 2/10000, steps: 4, reward_total: 1, loss_avg: n/a, e: 1.0, time: 12:22:42
_______
|O| | |
|X|X|X|
|O|O|X|
‾‾‾‾‾‾‾
episode: 3/10000, steps: 3, reward_total: 1, loss_avg: n/a, e: 1.0, time: 12:22:42
_______
| |O|X|
|X|O| |
|X|O| |
‾‾‾‾‾‾‾
episode: 4/10000, steps: 4, reward_total: 1, loss_avg: n/a, e: 1.0, time: 12:22:42
_______
|O|O|X|
| |X| |
|X|O|X|
‾‾‾‾‾‾‾
episode: 5/10000, steps: 3, reward_total: -1, loss_avg: n/a, e: 1.0, time: 12:22:42
_______
|O| |X|
| |X|X|
|O|O|X|
‾‾‾‾‾‾‾
episode: 6/10000, steps: 3, reward_total: -1, loss_avg: n/a, e: 1.0, time: 12:22:42
_______
| |O|X|
|X|O|X|
| |O| |
‾‾‾‾‾‾‾
episode: 7/10000, steps: 4, reward_total: 1, loss_avg: n/a, e: 1.0, ti

KeyboardInterrupt: 

In [10]:
# Evaluate the `{OPTIMAL}-2K` checkpoint against the NAIVE and OPTIMAL opponents.
# NOTE(review): no cell visible in this notebook saves checkpoints under the
# OPTIMAL name root (the training cell above uses NAIVE). Confirm the file
# exists, because supply_model() does not treat a missing file as an error.
compare_to_naive(f"{OPTIMAL}-2K")
compare_to_optimal(f"{OPTIMAL}-2K")

100%|██████████| 10000/10000 [00:11<00:00, 894.35it/s]




Win rate:  57.11%
Draw rate: 9.71%
Loss rate: 33.18%


100%|██████████| 100/100 [01:01<00:00,  1.62it/s]



Win rate:  0.0%
Draw rate: 0.0%
Loss rate: 100.0%





In [11]:
# Evaluate the `{OPTIMAL}-4K` checkpoint against the NAIVE and OPTIMAL opponents.
compare_to_naive(f"{OPTIMAL}-4K")
compare_to_optimal(f"{OPTIMAL}-4K")

100%|██████████| 10000/10000 [00:11<00:00, 892.58it/s]




Win rate:  50.32%
Draw rate: 12.92%
Loss rate: 36.76%


100%|██████████| 100/100 [01:03<00:00,  1.57it/s]



Win rate:  0.0%
Draw rate: 0.0%
Loss rate: 100.0%





In [12]:
# Evaluate the `{OPTIMAL}-6K` checkpoint against the NAIVE and OPTIMAL opponents.
compare_to_naive(f"{OPTIMAL}-6K")
compare_to_optimal(f"{OPTIMAL}-6K")

100%|██████████| 10000/10000 [00:11<00:00, 882.20it/s]




Win rate:  47.17%
Draw rate: 15.62%
Loss rate: 37.21%


100%|██████████| 100/100 [01:02<00:00,  1.59it/s]



Win rate:  0.0%
Draw rate: 4.0%
Loss rate: 96.0%





In [13]:
# Evaluate the `{OPTIMAL}-8K` checkpoint against the NAIVE and OPTIMAL opponents.
compare_to_naive(f"{OPTIMAL}-8K")
compare_to_optimal(f"{OPTIMAL}-8K")

100%|██████████| 10000/10000 [00:11<00:00, 889.11it/s]




Win rate:  54.6%
Draw rate: 8.04%
Loss rate: 37.36%


100%|██████████| 100/100 [01:05<00:00,  1.52it/s]



Win rate:  0.0%
Draw rate: 0.0%
Loss rate: 100.0%





In [14]:
# Evaluate the `{OPTIMAL}-10K` checkpoint against the NAIVE and OPTIMAL opponents.
compare_to_naive(f"{OPTIMAL}-10K")
compare_to_optimal(f"{OPTIMAL}-10K")

100%|██████████| 10000/10000 [00:10<00:00, 912.72it/s]




Win rate:  68.63%
Draw rate: 5.3%
Loss rate: 26.07%


100%|██████████| 100/100 [01:02<00:00,  1.59it/s]



Win rate:  0.0%
Draw rate: 14.0%
Loss rate: 86.0%





In [8]:
# Scratch check: torch.flatten collapses a (2, 3, 3) tensor into a single
# dimension of 18 elements.
torch.flatten(torch.tensor([[[0,1,2],[3,4,5],[6,7,8]], [[0,1,2],[3,4,5],[6,7,8]]]))

tensor([0, 1, 2, 3, 4, 5, 6, 7, 8, 0, 1, 2, 3, 4, 5, 6, 7, 8])

In [23]:
# Scratch check: stacking four (1, 9) rows gives a (4, 1, 9) batch, and
# squeeze() drops the size-1 dimension to leave (4, 9).
batch = torch.stack([
    torch.arange(1, 10).reshape((1, 9))
    for _ in range(4)
])

batch.squeeze().shape

# Equivalent explicit reshape:
# batch.reshape((len(batch), len(batch[0]))).shape

torch.Size([4, 9])

In [21]:
# Scratch check: a (3, 3) board reshapes to (1, action_space).
# Depends on `agent` having been created by an earlier cell.
torch.zeros((3, 3)).reshape((1, agent.action_space)).shape

torch.Size([1, 9])

In [13]:
# Scratch check: len() of a tensor is the size of its first dimension (3 here).
len(torch.zeros((3, 3)))

3

In [5]:
# Scratch check: squeeze() turns a (1, 9) tensor into shape (9,).
torch.tensor([1,2,3,4,5,6,7,8,9]).reshape((1, 9)).squeeze().shape

torch.Size([9])

In [5]:
# Scratch check: a negative slice start keeps the last three items.
[1,2,3,4,5][-3:]

[3, 4, 5]