In [3]:
import os

import torch

# Use the CUDA device when one is available, otherwise the CPU; echo the choice.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(DEVICE)

from models.ReinforcementLearning.DeepQ_TicTacToe_v2.DeepQAgent import DeepQAgent
from models.ReinforcementLearning.DeepQ_TicTacToe_v2.TicTacToeGame import TicTacToeGame, OPPONENT_LEVEL
from models.ReinforcementLearning.Utils import (
    train_agent,
    test_agent,
    set_seed
)

# Seed passed to the project's set_seed helper before any agent is built.
SEED = 100
set_seed(SEED)

# DeepQ parameters (forwarded to DeepQAgent / Adam in supply_model below).
BATCH_SIZE     = 128
# 1000 episodes per training call when CUDA is available, 100 otherwise.
# NOTE(review): the recorded training logs show "episode: n/1000" although this
# cell printed "cpu" -- the stored outputs appear to come from a different run.
NUM_EPISODES   = 1000 if torch.cuda.is_available() else 100
# presumably one entry per cell of the 3x3 board -- TODO confirm in DeepQAgent
STATE_SPACE    = 9
ACTION_SPACE   = 9
HIDDEN_SIZE    = 128
EPSILON        = 1.0
GAMMA          = 0.99
LEARNING_RATE  = 0.001
DROPOUT        = 0.25
TRAIN_START    = 1500
NEGATIVE_SLOPE = 0.01
# NOTE(review): COMPUTER_LEVEL is not referenced by any cell visible in this notebook.
COMPUTER_LEVEL = OPPONENT_LEVEL.NAIVE

# Directory holding the saved ".pt" checkpoints, relative to the working directory.
# NOTE(review): the recorded outputs show '../../../trained_models/...' (three
# levels up) while this constant climbs four -- confirm which one is current.
MODEL_PATH = "../../../../trained_models/ReinforcementLearning/TicTacToeV2"
# Fallback checkpoint stem used by get_full_model_path() when no name is given;
# while it is empty that fallback resolves to a file literally named ".pt".
MODEL_NAME = ""

# Checkpoint file-name stems (get_full_model_path appends ".pt").
BASELINE = "TicTacToev2-Baseline-Untrained"
NAIVE_2K = "TicTacToev2-NAIVE-2K"
NAIVE_4K = "TicTacToev2-NAIVE-4K"
NAIVE_6K = "TicTacToev2-NAIVE-6K"
NAIVE_8K = "TicTacToev2-NAIVE-8K"
NAIVE_10K = "TicTacToev2-NAIVE-10K"
AGENT_1K = "TicTacToev2-AGENT-1K"
AGENT_2K = "TicTacToev2-AGENT-2K"
AGENT_3K = "TicTacToev2-AGENT-3K"
AGENT_4K = "TicTacToev2-AGENT-4K"


def get_full_model_path(agent_name: str = None):
  """Build the checkpoint path ``<MODEL_PATH>/<name>.pt``.

  Args:
    agent_name: Checkpoint stem (without extension). When ``None`` the
      module-level ``MODEL_NAME`` is used instead.

  Returns:
    The joined path as a string.
  """
  stem = MODEL_NAME if agent_name is None else agent_name
  return os.path.join(MODEL_PATH, stem + ".pt")

def supply_model(load_if_exists: bool = True, agent_name: str = None):
  """Create a DeepQAgent together with its optimizer and loss function.

  The agent is always constructed from the module-level hyperparameters
  (EPSILON, GAMMA, HIDDEN_SIZE, ...). If ``load_if_exists`` is true and a
  checkpoint exists at ``get_full_model_path(agent_name)``, its parameters are
  loaded via ``agent.load_model``. If the checkpoint is requested but missing,
  a notice is printed and the freshly constructed agent is returned, so that
  an evaluation never silently runs on untrained weights.

  Args:
    load_if_exists: Attempt to load saved parameters when the file exists.
    agent_name: Checkpoint stem; ``None`` falls back to ``MODEL_NAME``.

  Returns:
    Tuple ``(agent, optimizer, criterion)`` where ``optimizer`` is Adam over
    ``agent.parameters()`` with ``LEARNING_RATE`` and ``criterion`` is
    ``torch.nn.SmoothL1Loss``.
  """
  agent = DeepQAgent(
      device         = DEVICE,
      epsilon        = EPSILON,
      gamma          = GAMMA,
      state_space    = STATE_SPACE,
      action_space   = ACTION_SPACE,
      hidden_size    = HIDDEN_SIZE,
      dropout        = DROPOUT,
      train_start    = TRAIN_START,
      batch_size     = BATCH_SIZE,
      negative_slope = NEGATIVE_SLOPE
  )

  full_model_path = get_full_model_path(agent_name)
  if load_if_exists:
    if os.path.exists(full_model_path):
      print("Loading Model Parameters...")
      agent.load_model(filepath=full_model_path)
    else:
      # Make the fallback visible: a mistyped name would otherwise be
      # indistinguishable from a successfully loaded checkpoint.
      print(f"No saved parameters found at '{full_model_path}'; using a freshly initialised agent.")

  # The optimizer is created after loading so it binds to the final parameters.
  optimizer = torch.optim.Adam(agent.parameters(), lr=LEARNING_RATE)
  criterion = torch.nn.SmoothL1Loss() # Huber Loss

  return agent, optimizer, criterion

def compare_to_naive(agent_name: str, num_episodes: int = 25000):
  """Run ``test_agent`` for a saved checkpoint against the NAIVE opponent.

  Args:
    agent_name: Checkpoint stem handed to ``supply_model``.
    num_episodes: Number of episodes forwarded to ``test_agent``.
  """
  # Only the agent is needed here; optimizer and loss are discarded.
  contender, _optimizer, _criterion = supply_model(True, agent_name)
  # No opponent model is supplied (None); the game is built with start_as_X=False.
  arena = TicTacToeGame(DEVICE, None, OPPONENT_LEVEL.NAIVE, start_as_X=False)
  test_agent(contender, arena, num_episodes)

def compare_to_optimal(agent_name: str, num_episodes: int = 100):
  """Run ``test_agent`` for a saved checkpoint against the OPTIMAL opponent.

  Args:
    agent_name: Checkpoint stem handed to ``supply_model``.
    num_episodes: Number of episodes forwarded to ``test_agent``.
  """
  # Only the agent is needed here; optimizer and loss are discarded.
  contender, _optimizer, _criterion = supply_model(True, agent_name)
  # No opponent model is supplied (None); the game is built with start_as_X=False.
  arena = TicTacToeGame(DEVICE, None, OPPONENT_LEVEL.OPTIMAL, start_as_X=False)
  test_agent(contender, arena, num_episodes)

def compare_to_model(agent_name: str, model_name: str, num_episodes: int = 10000):
  """Run ``test_agent`` for one saved checkpoint against another.

  Args:
    agent_name: Checkpoint stem of the agent under test.
    model_name: Checkpoint stem of the agent used as the opponent.
    num_episodes: Number of episodes forwarded to ``test_agent``.
  """
  # Load both sides; their optimizers and loss functions are not used.
  contender, _optimizer, _criterion = supply_model(True, agent_name)
  rival, _rival_optimizer, _rival_criterion = supply_model(True, model_name)
  # The rival agent is handed to the game as the AGENT-level opponent.
  arena = TicTacToeGame(DEVICE, rival, OPPONENT_LEVEL.AGENT, start_as_X=False)
  test_agent(contender, arena, num_episodes)


cpu


The NAIVE_2K and NAIVE_4K models seem to perform well against the Naive opponent, and better than the other NAIVE models against the Optimal opponent.

In [2]:
# NAIVE_2K checkpoint vs. the BASELINE checkpoint as opponent (10000 episodes by default).
compare_to_model(NAIVE_2K, BASELINE)

Loading Model Parameters...
Model loaded from '../../../trained_models/ReinforcementLearning/TicTacToeV2\TicTacToev2-NAIVE-2K.pt'.
Loading Model Parameters...
Model loaded from '../../../trained_models/ReinforcementLearning/TicTacToeV2\TicTacToev2-Baseline-Untrained.pt'.


100%|██████████| 10000/10000 [00:47<00:00, 210.71it/s]


Win rate:  94.51%
Draw rate: 0.0%
Loss rate: 5.49%





In [3]:
# NAIVE_4K checkpoint vs. the BASELINE checkpoint as opponent (10000 episodes by default).
compare_to_model(NAIVE_4K, BASELINE)

Loading Model Parameters...
Model loaded from '../../../trained_models/ReinforcementLearning/TicTacToeV2\TicTacToev2-NAIVE-4K.pt'.
Loading Model Parameters...
Model loaded from '../../../trained_models/ReinforcementLearning/TicTacToeV2\TicTacToev2-Baseline-Untrained.pt'.


100%|██████████| 10000/10000 [00:51<00:00, 195.92it/s]


Win rate:  60.57%
Draw rate: 0.0%
Loss rate: 39.43%





The NAIVE_2K model performs better against the BASELINE model than the NAIVE_4K model does (94.51% vs. 60.57% wins). It also performs better against the NAIVE opponent (see naive-train.ipynb). We'll focus on this model for agent training.

In [2]:
# Stage 1: learner and opponent are both loaded from the NAIVE_2K checkpoint.
agent, optimizer, criterion = supply_model(load_if_exists=True, agent_name=NAIVE_2K)
enemy, _, _ = supply_model(load_if_exists=True, agent_name=NAIVE_2K)
# The opponent agent is passed to the game at OPPONENT_LEVEL.AGENT.
environment = TicTacToeGame(DEVICE, enemy, OPPONENT_LEVEL.AGENT)
# model_name is presumably the stem the trained weights are saved under inside
# MODEL_PATH -- confirm in Utils.train_agent.
reward_history = train_agent(
    agent,
    environment,
    NUM_EPISODES,
    optimizer,
    criterion,
    DEVICE,
    MODEL_PATH,
    model_name = AGENT_1K
)

Loading Model Parameters...
Model loaded from '../../../trained_models/ReinforcementLearning/TicTacToeV2\TicTacToev2-NAIVE-2K.pt'.
Loading Model Parameters...
Model loaded from '../../../trained_models/ReinforcementLearning/TicTacToeV2\TicTacToev2-NAIVE-2K.pt'.
episode: 1/1000, steps: 3, reward_total: -2, e: 1.0, time: 12:15:41
_______
|O|O|X|
| |X| |
|X|O|X|
‾‾‾‾‾‾‾
episode: 2/1000, steps: 4, reward_total: -2, e: 1.0, time: 12:15:41
_______
|O|X|X|
|O|O| |
|O|X|X|
‾‾‾‾‾‾‾
episode: 3/1000, steps: 2, reward_total: -2, e: 1.0, time: 12:15:41
_______
|X| | |
| |X|O|
|O| |X|
‾‾‾‾‾‾‾
episode: 4/1000, steps: 4, reward_total: -2, e: 1.0, time: 12:15:41
_______
|O|X| |
|X|O|X|
|O|X|O|
‾‾‾‾‾‾‾
episode: 5/1000, steps: 3, reward_total: -2, e: 1.0, time: 12:15:41
_______
|O|X|X|
| |X|O|
|X| |O|
‾‾‾‾‾‾‾
episode: 6/1000, steps: 3, reward_total: 1, e: 1.0, time: 12:15:41
_______
|X|X|X|
| |O| |
| | |O|
‾‾‾‾‾‾‾
episode: 7/1000, steps: 4, reward_total: -1, e: 1.0, time: 12:15:41
_______
|X|O|X|
|X|O|O|

In [3]:
# Stage 2: learner and opponent are both loaded from the AGENT_1K checkpoint.
agent, optimizer, criterion = supply_model(load_if_exists=True, agent_name=AGENT_1K)
enemy, _, _ = supply_model(load_if_exists=True, agent_name=AGENT_1K)
# The opponent agent is passed to the game at OPPONENT_LEVEL.AGENT.
environment = TicTacToeGame(DEVICE, enemy, OPPONENT_LEVEL.AGENT)
# model_name is presumably the stem the trained weights are saved under inside
# MODEL_PATH -- confirm in Utils.train_agent.
reward_history = train_agent(
    agent,
    environment,
    NUM_EPISODES,
    optimizer,
    criterion,
    DEVICE,
    MODEL_PATH,
    model_name = AGENT_2K
)

Loading Model Parameters...
Model loaded from '../../../trained_models/ReinforcementLearning/TicTacToeV2\TicTacToev2-AGENT-1K.pt'.
Loading Model Parameters...
Model loaded from '../../../trained_models/ReinforcementLearning/TicTacToeV2\TicTacToev2-AGENT-1K.pt'.
episode: 1/1000, steps: 4, reward_total: -1, e: 1.0, time: 12:25:11
_______
|X|X|O|
|O|O|X|
|X|X|O|
‾‾‾‾‾‾‾
episode: 2/1000, steps: 4, reward_total: -2, e: 1.0, time: 12:25:11
_______
|O|X|X|
|X|O|O|
|X| |O|
‾‾‾‾‾‾‾
episode: 3/1000, steps: 3, reward_total: -2, e: 1.0, time: 12:25:11
_______
| | |X|
|O|O|X|
|X|O|X|
‾‾‾‾‾‾‾
episode: 4/1000, steps: 3, reward_total: -2, e: 1.0, time: 12:25:11
_______
|O| | |
|X|O|X|
|X| |O|
‾‾‾‾‾‾‾
episode: 5/1000, steps: 3, reward_total: -2, e: 1.0, time: 12:25:11
_______
|O| |X|
|O|X| |
|X|O|X|
‾‾‾‾‾‾‾
episode: 6/1000, steps: 3, reward_total: 1, e: 1.0, time: 12:25:11
_______
|X| |O|
|X| | |
|X| |O|
‾‾‾‾‾‾‾
episode: 7/1000, steps: 3, reward_total: -2, e: 1.0, time: 12:25:11
_______
| | |X|
|O|X|O|

In [4]:
# Stage 3: learner and opponent are both loaded from the AGENT_2K checkpoint.
agent, optimizer, criterion = supply_model(load_if_exists=True, agent_name=AGENT_2K)
enemy, _, _ = supply_model(load_if_exists=True, agent_name=AGENT_2K)
# The opponent agent is passed to the game at OPPONENT_LEVEL.AGENT.
environment = TicTacToeGame(DEVICE, enemy, OPPONENT_LEVEL.AGENT)
# model_name is presumably the stem the trained weights are saved under inside
# MODEL_PATH -- confirm in Utils.train_agent.
reward_history = train_agent(
    agent,
    environment,
    NUM_EPISODES,
    optimizer,
    criterion,
    DEVICE,
    MODEL_PATH,
    model_name = AGENT_3K
)

Loading Model Parameters...
Model loaded from '../../../trained_models/ReinforcementLearning/TicTacToeV2\TicTacToev2-AGENT-2K.pt'.
Loading Model Parameters...
Model loaded from '../../../trained_models/ReinforcementLearning/TicTacToeV2\TicTacToev2-AGENT-2K.pt'.
episode: 1/1000, steps: 3, reward_total: -2, e: 1.0, time: 12:34:24
_______
|X| |X|
|X|O| |
|X|O|O|
‾‾‾‾‾‾‾
episode: 2/1000, steps: 4, reward_total: 1, e: 1.0, time: 12:34:24
_______
|X|X|X|
|O|O| |
|O| |X|
‾‾‾‾‾‾‾
episode: 3/1000, steps: 4, reward_total: -2, e: 1.0, time: 12:34:24
_______
|O|X|X|
|X|X|O|
|O|X|O|
‾‾‾‾‾‾‾
episode: 4/1000, steps: 5, reward_total: -1, e: 1.0, time: 12:34:24
_______
|X|X|O|
|O|O|X|
|X|X|O|
‾‾‾‾‾‾‾
episode: 5/1000, steps: 2, reward_total: -2, e: 1.0, time: 12:34:24
_______
| | |X|
| |X|O|
|X|O| |
‾‾‾‾‾‾‾
episode: 6/1000, steps: 3, reward_total: -2, e: 1.0, time: 12:34:24
_______
|X| |O|
| |O| |
|O|X|X|
‾‾‾‾‾‾‾
episode: 7/1000, steps: 4, reward_total: -2, e: 1.0, time: 12:34:24
_______
|O|X|X|
|O|O|X|

In [5]:
# Stage 4: learner and opponent are both loaded from the AGENT_3K checkpoint.
agent, optimizer, criterion = supply_model(load_if_exists=True, agent_name=AGENT_3K)
enemy, _, _ = supply_model(load_if_exists=True, agent_name=AGENT_3K)
# The opponent agent is passed to the game at OPPONENT_LEVEL.AGENT.
environment = TicTacToeGame(DEVICE, enemy, OPPONENT_LEVEL.AGENT)
# model_name is presumably the stem the trained weights are saved under inside
# MODEL_PATH -- confirm in Utils.train_agent.
reward_history = train_agent(
    agent,
    environment,
    NUM_EPISODES,
    optimizer,
    criterion,
    DEVICE,
    MODEL_PATH,
    model_name = AGENT_4K
)

Loading Model Parameters...
Model loaded from '../../../trained_models/ReinforcementLearning/TicTacToeV2\TicTacToev2-AGENT-3K.pt'.
Loading Model Parameters...
Model loaded from '../../../trained_models/ReinforcementLearning/TicTacToeV2\TicTacToev2-AGENT-3K.pt'.
episode: 1/1000, steps: 3, reward_total: -2, e: 1.0, time: 12:44:05
_______
|X| |O|
|X|X|X|
|O|O| |
‾‾‾‾‾‾‾
episode: 2/1000, steps: 3, reward_total: 1, e: 1.0, time: 12:44:05
_______
| | |X|
|O|X|O|
|X| | |
‾‾‾‾‾‾‾
episode: 3/1000, steps: 4, reward_total: 1, e: 1.0, time: 12:44:05
_______
|O|O|O|
|O|X|X|
| |X|X|
‾‾‾‾‾‾‾
episode: 4/1000, steps: 5, reward_total: 1, e: 1.0, time: 12:44:05
_______
|X|X|X|
|O|O|X|
|O|X|O|
‾‾‾‾‾‾‾
episode: 5/1000, steps: 2, reward_total: -2, e: 1.0, time: 12:44:05
_______
|X| |O|
|X|O| |
|X| | |
‾‾‾‾‾‾‾
episode: 6/1000, steps: 3, reward_total: -2, e: 1.0, time: 12:44:05
_______
|O| |X|
|X|O| |
| |X|O|
‾‾‾‾‾‾‾
episode: 7/1000, steps: 4, reward_total: -1, e: 1.0, time: 12:44:05
_______
|X|O|X|
|X|O|X|
|

##### Comparing the AGENT models to a Naive opponent.

In [21]:
# Reference point: BASELINE checkpoint vs. the NAIVE opponent (25000 episodes by default).
compare_to_naive(BASELINE)

Loading Model Parameters...
Model loaded from '../../../trained_models/ReinforcementLearning/TicTacToeV2\TicTacToev2-Baseline-Untrained.pt'.


100%|██████████| 25000/25000 [02:08<00:00, 194.31it/s]


Win rate:  55.3%
Draw rate: 9.4%
Loss rate: 35.3%





In [6]:
# NAIVE_2K checkpoint (the starting point for agent training) vs. the NAIVE opponent.
compare_to_naive(NAIVE_2K)

Loading Model Parameters...
Model loaded from '../../../trained_models/ReinforcementLearning/TicTacToeV2\TicTacToev2-NAIVE-2K.pt'.


100%|██████████| 25000/25000 [01:54<00:00, 218.72it/s]


Win rate:  72.432%
Draw rate: 7.204%
Loss rate: 20.364%





In [7]:
# AGENT_1K checkpoint vs. the NAIVE opponent.
compare_to_naive(AGENT_1K)

Loading Model Parameters...
Model loaded from '../../../trained_models/ReinforcementLearning/TicTacToeV2\TicTacToev2-AGENT-1K.pt'.


100%|██████████| 25000/25000 [01:44<00:00, 238.10it/s]


Win rate:  69.548%
Draw rate: 4.164%
Loss rate: 26.288%





In [8]:
# AGENT_2K checkpoint vs. the NAIVE opponent.
compare_to_naive(AGENT_2K)

Loading Model Parameters...
Model loaded from '../../../trained_models/ReinforcementLearning/TicTacToeV2\TicTacToev2-AGENT-2K.pt'.


100%|██████████| 25000/25000 [01:49<00:00, 228.08it/s]


Win rate:  54.968%
Draw rate: 7.524%
Loss rate: 37.508%





In [9]:
# AGENT_3K checkpoint vs. the NAIVE opponent.
compare_to_naive(AGENT_3K)

Loading Model Parameters...
Model loaded from '../../../trained_models/ReinforcementLearning/TicTacToeV2\TicTacToev2-AGENT-3K.pt'.


100%|██████████| 25000/25000 [02:12<00:00, 189.37it/s]


Win rate:  48.048%
Draw rate: 5.544%
Loss rate: 46.408%





In [10]:
# AGENT_4K checkpoint vs. the NAIVE opponent.
compare_to_naive(AGENT_4K)

Loading Model Parameters...
Model loaded from '../../../trained_models/ReinforcementLearning/TicTacToeV2\TicTacToev2-AGENT-4K.pt'.


100%|██████████| 25000/25000 [02:17<00:00, 181.44it/s]


Win rate:  52.012%
Draw rate: 7.32%
Loss rate: 40.668%





The 2K, 3K, and 4K AGENT models perform worse than BASELINE...

##### Comparing the AGENT models to NAIVE_2K.

In [11]:
# Self-match: the NAIVE_2K checkpoint is loaded for both the agent and its opponent.
compare_to_model(NAIVE_2K, NAIVE_2K)

Loading Model Parameters...
Model loaded from '../../../trained_models/ReinforcementLearning/TicTacToeV2\TicTacToev2-NAIVE-2K.pt'.
Loading Model Parameters...
Model loaded from '../../../trained_models/ReinforcementLearning/TicTacToeV2\TicTacToev2-NAIVE-2K.pt'.


100%|██████████| 10000/10000 [01:03<00:00, 157.60it/s]


Win rate:  5.31%
Draw rate: 24.33%
Loss rate: 70.36%





In [12]:
# AGENT_1K checkpoint vs. the NAIVE_2K checkpoint as opponent.
compare_to_model(AGENT_1K, NAIVE_2K)

Loading Model Parameters...
Model loaded from '../../../trained_models/ReinforcementLearning/TicTacToeV2\TicTacToev2-AGENT-1K.pt'.
Loading Model Parameters...
Model loaded from '../../../trained_models/ReinforcementLearning/TicTacToeV2\TicTacToev2-NAIVE-2K.pt'.


100%|██████████| 10000/10000 [00:53<00:00, 188.18it/s]


Win rate:  59.31%
Draw rate: 11.91%
Loss rate: 28.78%





In [13]:
# AGENT_2K checkpoint vs. the NAIVE_2K checkpoint as opponent.
compare_to_model(AGENT_2K, NAIVE_2K)

Loading Model Parameters...
Model loaded from '../../../trained_models/ReinforcementLearning/TicTacToeV2\TicTacToev2-AGENT-2K.pt'.
Loading Model Parameters...
Model loaded from '../../../trained_models/ReinforcementLearning/TicTacToeV2\TicTacToev2-NAIVE-2K.pt'.


100%|██████████| 10000/10000 [00:56<00:00, 178.52it/s]


Win rate:  0.0%
Draw rate: 5.57%
Loss rate: 94.43%





In [14]:
# AGENT_3K checkpoint vs. the NAIVE_2K checkpoint as opponent.
compare_to_model(AGENT_3K, NAIVE_2K)

Loading Model Parameters...
Model loaded from '../../../trained_models/ReinforcementLearning/TicTacToeV2\TicTacToev2-AGENT-3K.pt'.
Loading Model Parameters...
Model loaded from '../../../trained_models/ReinforcementLearning/TicTacToeV2\TicTacToev2-NAIVE-2K.pt'.


100%|██████████| 10000/10000 [00:56<00:00, 177.45it/s]


Win rate:  12.81%
Draw rate: 0.0%
Loss rate: 87.19%





In [15]:
# AGENT_4K checkpoint vs. the NAIVE_2K checkpoint as opponent.
compare_to_model(AGENT_4K, NAIVE_2K)

Loading Model Parameters...
Model loaded from '../../../trained_models/ReinforcementLearning/TicTacToeV2\TicTacToev2-AGENT-4K.pt'.
Loading Model Parameters...
Model loaded from '../../../trained_models/ReinforcementLearning/TicTacToeV2\TicTacToev2-NAIVE-2K.pt'.


100%|██████████| 10000/10000 [00:57<00:00, 174.04it/s]


Win rate:  54.87%
Draw rate: 0.0%
Loss rate: 45.13%





##### Comparing the AGENT models to an Optimal opponent.

In [16]:
# Reference point: NAIVE_2K checkpoint vs. the OPTIMAL opponent (100 episodes by default).
compare_to_optimal(NAIVE_2K)

Loading Model Parameters...
Model loaded from '../../../trained_models/ReinforcementLearning/TicTacToeV2\TicTacToev2-NAIVE-2K.pt'.


100%|██████████| 100/100 [09:22<00:00,  5.63s/it]


Win rate:  31.0%
Draw rate: 3.0%
Loss rate: 66.0%





In [17]:
# AGENT_1K checkpoint vs. the OPTIMAL opponent.
compare_to_optimal(AGENT_1K)

Loading Model Parameters...
Model loaded from '../../../trained_models/ReinforcementLearning/TicTacToeV2\TicTacToev2-AGENT-1K.pt'.


100%|██████████| 100/100 [10:10<00:00,  6.11s/it]


Win rate:  13.0%
Draw rate: 4.0%
Loss rate: 83.0%





In [18]:
# AGENT_2K checkpoint vs. the OPTIMAL opponent.
compare_to_optimal(AGENT_2K)

Loading Model Parameters...
Model loaded from '../../../trained_models/ReinforcementLearning/TicTacToeV2\TicTacToev2-AGENT-2K.pt'.


100%|██████████| 100/100 [10:43<00:00,  6.44s/it]


Win rate:  0.0%
Draw rate: 0.0%
Loss rate: 100.0%





In [19]:
# AGENT_3K checkpoint vs. the OPTIMAL opponent.
compare_to_optimal(AGENT_3K)

Loading Model Parameters...
Model loaded from '../../../trained_models/ReinforcementLearning/TicTacToeV2\TicTacToev2-AGENT-3K.pt'.


100%|██████████| 100/100 [11:58<00:00,  7.19s/it]


Win rate:  0.0%
Draw rate: 0.0%
Loss rate: 100.0%





In [20]:
# AGENT_4K checkpoint vs. the OPTIMAL opponent.
compare_to_optimal(AGENT_4K)

Loading Model Parameters...
Model loaded from '../../../trained_models/ReinforcementLearning/TicTacToeV2\TicTacToev2-AGENT-4K.pt'.


100%|██████████| 100/100 [10:14<00:00,  6.15s/it]


Win rate:  8.0%
Draw rate: 5.0%
Loss rate: 87.0%



