In [5]:
import os

import torch

from models.ReinforcementLearning.DeepQ_TicTacToe_v2.DeepQAgent import DeepQAgent
from models.ReinforcementLearning.DeepQ_TicTacToe_v2.TicTacToeGame import TicTacToeGame, OPPONENT_LEVEL
from models.ReinforcementLearning.Utils import (
    test_agent,
    set_seed
)

DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(DEVICE)

# Fixed seed handed to the project's set_seed helper
# (imported from models.ReinforcementLearning.Utils).
SEED = 100
set_seed(SEED)

# DeepQ parameters
# supply_model() forwards most of these to the DeepQAgent constructor and uses
# LEARNING_RATE for the Adam optimizer. NUM_EPISODES and COMPUTER_LEVEL are not
# read by any code visible in this cell.
BATCH_SIZE     = 128
# 1000 episodes when CUDA is available, otherwise 100.
NUM_EPISODES   = 1000 if torch.cuda.is_available() else 100
STATE_SPACE    = 9
ACTION_SPACE   = 9
HIDDEN_SIZE    = 128
EPSILON        = 1.0
GAMMA          = 0.99
LEARNING_RATE  = 0.001
DROPOUT        = 0.25
TRAIN_START    = 1500
NEGATIVE_SLOPE = 0.01
COMPUTER_LEVEL = OPPONENT_LEVEL.NAIVE

# save path
# Relative directory that get_full_model_path() joins checkpoint names onto.
MODEL_PATH = "../../../trained_models/ReinforcementLearning/TicTacToeV2"
# Fallback checkpoint name used by get_full_model_path() when no agent_name is
# given; it is empty here, so the fallback resolves to "<MODEL_PATH>/.pt".
MODEL_NAME = ""

# Checkpoint names (file stem without ".pt") that carry the "TicTacToev2-" prefix.
BASELINE = "TicTacToev2-Baseline-Untrained"
NAIVE_2K = "TicTacToev2-NAIVE-2K"
NAIVE_4K = "TicTacToev2-NAIVE-4K"
NAIVE_6K = "TicTacToev2-NAIVE-6K"
NAIVE_8K = "TicTacToev2-NAIVE-8K"
NAIVE_10K = "TicTacToev2-NAIVE-10K"
AGENT_1K = "TicTacToev2-AGENT-1K"
AGENT_2K = "TicTacToev2-AGENT-2K"
AGENT_3K = "TicTacToev2-AGENT-3K"
AGENT_4K = "TicTacToev2-AGENT-4K"

# Checkpoint names that carry the plain "TicTacToe-" prefix.
# NOTE(review): NAIVE_10K2 presumably has the "2" suffix only because NAIVE_10K
# is already bound to the "TicTacToev2-" checkpoint above; the two are easy to
# confuse, so double-check which one an evaluation cell intends.
NAIVE_5K = "TicTacToe-NAIVE-5000"
NAIVE_10K2 = "TicTacToe-NAIVE-10000"
NAIVE_15K = "TicTacToe-NAIVE-15000(overfit)"
AGENT_5K = "TicTacToe-NAIVE-10K-AGENT-5K"
AGENT_7500 = "TicTacToe-NAIVE-10K-AGENT-7500"
AGENT_10K = "TicTacToe-NAIVE-10K-AGENT-10K"

def get_full_model_path(agent_name: str = None):
  """Return the checkpoint path "<MODEL_PATH>/<name>.pt".

  Args:
    agent_name: Checkpoint name without the ".pt" extension. When None, the
      module-level MODEL_NAME is used instead.

  Returns:
    The name with ".pt" appended, joined onto MODEL_PATH with os.path.join.
  """
  # Pick the stem first so the join/extension logic is written only once.
  stem = MODEL_NAME if agent_name is None else agent_name
  return os.path.join(MODEL_PATH, stem + ".pt")

def supply_model(load_if_exists: bool = True, agent_name: str = None):
  """Build a DeepQAgent together with the optimizer and loss used with it.

  The agent is constructed from the module-level hyperparameter constants.
  When load_if_exists is true and the checkpoint file resolved by
  get_full_model_path(agent_name) exists, its parameters are loaded into the
  agent. When the file is missing the agent keeps its freshly initialised
  parameters and a notice is printed, so an evaluation of an untrained network
  cannot be mistaken for an evaluation of the named checkpoint.

  Args:
    load_if_exists: Whether to try loading saved parameters.
    agent_name: Checkpoint name without ".pt"; None falls back to MODEL_NAME
      (handled by get_full_model_path).

  Returns:
    A tuple (agent, optimizer, criterion): the DeepQAgent, an Adam optimizer
    over agent.parameters() with lr=LEARNING_RATE, and a SmoothL1Loss instance.
  """
  agent = DeepQAgent(
      device         = DEVICE,
      epsilon        = EPSILON,
      gamma          = GAMMA,
      state_space    = STATE_SPACE,
      action_space   = ACTION_SPACE,
      hidden_size    = HIDDEN_SIZE,
      dropout        = DROPOUT,
      train_start    = TRAIN_START,
      batch_size     = BATCH_SIZE,
      negative_slope = NEGATIVE_SLOPE
  )

  full_model_path = get_full_model_path(agent_name)
  if load_if_exists:
    if os.path.exists(full_model_path):
      print("Loading Model Parameters...")
      agent.load_model(filepath=full_model_path)
    else:
      # Still best-effort (no exception), but no longer silent: callers such as
      # the compare_* helpers would otherwise report results for random weights.
      print(f"WARNING: no saved model found at '{full_model_path}'; using freshly initialised parameters.")

  # The optimizer is created after the optional load so it is bound to the
  # agent's parameters as they stand when this function returns.
  optimizer = torch.optim.Adam(agent.parameters(), lr=LEARNING_RATE)
  criterion = torch.nn.SmoothL1Loss() # Huber Loss

  return agent, optimizer, criterion

def compare_to_naive(agent_name: str, num_episodes: int = 25000):
  """Run test_agent for a saved agent in a game set to OPPONENT_LEVEL.NAIVE.

  Args:
    agent_name: Checkpoint name (without ".pt") passed to supply_model.
    num_episodes: Episode count forwarded to test_agent.
  """
  # Only the agent is needed here; the optimizer and loss are discarded.
  contender, _, _ = supply_model(load_if_exists=True, agent_name=agent_name)
  # The second argument is None here; compare_to_model passes an opponent
  # agent in that position.
  arena = TicTacToeGame(DEVICE, None, OPPONENT_LEVEL.NAIVE, start_as_X=False)
  test_agent(contender, arena, num_episodes)

def compare_to_optimal(agent_name: str, num_episodes: int = 100):
  """Run test_agent for a saved agent in a game set to OPPONENT_LEVEL.OPTIMAL.

  Args:
    agent_name: Checkpoint name (without ".pt") passed to supply_model.
    num_episodes: Episode count forwarded to test_agent.
  """
  # Only the agent is needed here; the optimizer and loss are discarded.
  contender, _, _ = supply_model(load_if_exists=True, agent_name=agent_name)
  # The second argument is None here; compare_to_model passes an opponent
  # agent in that position.
  arena = TicTacToeGame(DEVICE, None, OPPONENT_LEVEL.OPTIMAL, start_as_X=False)
  test_agent(contender, arena, num_episodes)

def compare_to_model(agent_name: str, model_name: str, num_episodes: int = 10000):
  """Run test_agent for one saved agent in a game driven by another saved agent.

  Args:
    agent_name: Checkpoint name (without ".pt") of the agent given to test_agent.
    model_name: Checkpoint name (without ".pt") of the agent handed to the
      TicTacToeGame environment together with OPPONENT_LEVEL.AGENT.
    num_episodes: Episode count forwarded to test_agent.
  """
  # Load the tested agent first, then the opponent; the optimizer and loss
  # returned by supply_model are not needed for either.
  contender, _, _ = supply_model(load_if_exists=True, agent_name=agent_name)
  opponent, _, _ = supply_model(load_if_exists=True, agent_name=model_name)
  arena = TicTacToeGame(DEVICE, opponent, OPPONENT_LEVEL.AGENT, start_as_X=False)
  test_agent(contender, arena, num_episodes)


cuda


In [2]:
# Baseline: evaluate the BASELINE checkpoint against the NAIVE opponent.
# NOTE(review): the recorded output below shows "TicTacToe-Baseline-Untrained.pt"
# being loaded, while BASELINE currently names "TicTacToev2-Baseline-Untrained";
# the output appears to predate the constant's current value - re-run to refresh.
compare_to_naive(BASELINE)

Loading Model Parameters...
Model loaded from '../../../trained_models/ReinforcementLearning/TicTacToeV2\TicTacToe-Baseline-Untrained.pt'.


100%|██████████| 25000/25000 [01:56<00:00, 213.90it/s]


Win rate:  55.316%
Draw rate: 9.456%
Loss rate: 35.228%





In [3]:
# Evaluate the "TicTacToe-NAIVE-5000" checkpoint against the NAIVE opponent.
compare_to_naive(NAIVE_5K)

Loading Model Parameters...
Model loaded from '../../../trained_models/ReinforcementLearning/TicTacToeV2\TicTacToe-NAIVE-5000.pt'.


100%|██████████| 25000/25000 [02:03<00:00, 202.27it/s]


Win rate:  68.352%
Draw rate: 4.444%
Loss rate: 27.204%





In [4]:
# Evaluate the "TicTacToe-NAIVE-10000" checkpoint against the NAIVE opponent.
# NAIVE_10K2 is the constant bound to that name; NAIVE_10K refers to
# "TicTacToev2-NAIVE-10K", which is not the checkpoint whose results are
# recorded in this cell's output.
compare_to_naive(NAIVE_10K2)

Loading Model Parameters...
Model loaded from '../../../trained_models/ReinforcementLearning/TicTacToeV2\TicTacToe-NAIVE-10000.pt'.


100%|██████████| 25000/25000 [02:09<00:00, 193.50it/s]


Win rate:  72.24%
Draw rate: 7.108%
Loss rate: 20.652%





In [5]:
# Evaluate the "TicTacToe-NAIVE-15000(overfit)" checkpoint against the NAIVE opponent.
compare_to_naive(NAIVE_15K)

Loading Model Parameters...
Model loaded from '../../../trained_models/ReinforcementLearning/TicTacToeV2\TicTacToe-NAIVE-15000(overfit).pt'.


100%|██████████| 25000/25000 [02:08<00:00, 194.53it/s]


Win rate:  69.332%
Draw rate: 5.78%
Loss rate: 24.888%





In [3]:
# Evaluate the "TicTacToe-NAIVE-10K-AGENT-5K" checkpoint against the NAIVE opponent.
compare_to_naive(AGENT_5K)


Loading Model Parameters...
Model loaded from '../../../trained_models/ReinforcementLearning/TicTacToeV2\TicTacToe-NAIVE-10K-AGENT-5K.pt'.


100%|██████████| 25000/25000 [01:58<00:00, 210.37it/s]


Win rate:  65.104%
Draw rate: 4.516%
Loss rate: 30.38%





In [6]:
# Evaluate the "TicTacToe-NAIVE-10K-AGENT-7500" checkpoint against the NAIVE opponent.
compare_to_naive(AGENT_7500)

Loading Model Parameters...
Model loaded from '../../../trained_models/ReinforcementLearning/TicTacToeV2\TicTacToe-NAIVE-10K-AGENT-7500.pt'.


100%|██████████| 25000/25000 [02:17<00:00, 181.29it/s]


Win rate:  55.452%
Draw rate: 10.7%
Loss rate: 33.848%





In [4]:
# Evaluate the "TicTacToe-NAIVE-10K-AGENT-10K" checkpoint against the NAIVE opponent.
# NOTE(review): the recorded output has no "Loading Model Parameters..." line.
# supply_model only loads when the checkpoint file exists, so these figures may
# describe a freshly initialised agent rather than the saved one - verify the
# file is present and re-run.
compare_to_naive(AGENT_10K)

100%|██████████| 25000/25000 [01:59<00:00, 209.58it/s]


Win rate:  46.736%
Draw rate: 8.94%
Loss rate: 44.324%





In [3]:
# Test the "TicTacToev2-NAIVE-2K" agent in a game whose opponent is the
# "TicTacToe-NAIVE-5000" agent.
compare_to_model(NAIVE_2K, NAIVE_5K)

Loading Model Parameters...
Model loaded from '../../../trained_models/ReinforcementLearning/TicTacToeV2\TicTacToev2-NAIVE-2K.pt'.
Loading Model Parameters...
Model loaded from '../../../trained_models/ReinforcementLearning/TicTacToeV2\TicTacToe-NAIVE-5000.pt'.


100%|██████████| 10000/10000 [00:50<00:00, 198.73it/s]


Win rate:  82.87%
Draw rate: 0.0%
Loss rate: 17.13%





In [4]:
# Test the "TicTacToev2-AGENT-1K" agent in a game whose opponent is the
# "TicTacToe-NAIVE-5000" agent.
compare_to_model(AGENT_1K, NAIVE_5K)

Loading Model Parameters...
Model loaded from '../../../trained_models/ReinforcementLearning/TicTacToeV2\TicTacToev2-AGENT-1K.pt'.
Loading Model Parameters...
Model loaded from '../../../trained_models/ReinforcementLearning/TicTacToeV2\TicTacToe-NAIVE-5000.pt'.


100%|██████████| 10000/10000 [00:53<00:00, 186.51it/s]


Win rate:  66.0%
Draw rate: 0.0%
Loss rate: 34.0%





In [6]:
# Test two "TicTacToev2-" agents (NAIVE-2K, then AGENT-1K) in games whose
# opponent is the "TicTacToe-NAIVE-10000" agent.
compare_to_model(NAIVE_2K, NAIVE_10K2)
compare_to_model(AGENT_1K, NAIVE_10K2)

Loading Model Parameters...
Model loaded from '../../../trained_models/ReinforcementLearning/TicTacToeV2\TicTacToev2-NAIVE-2K.pt'.
Loading Model Parameters...
Model loaded from '../../../trained_models/ReinforcementLearning/TicTacToeV2\TicTacToe-NAIVE-10000.pt'.


100%|██████████| 10000/10000 [00:56<00:00, 178.44it/s]



Win rate:  60.37%
Draw rate: 5.95%
Loss rate: 33.68%
Loading Model Parameters...
Model loaded from '../../../trained_models/ReinforcementLearning/TicTacToeV2\TicTacToev2-AGENT-1K.pt'.
Loading Model Parameters...
Model loaded from '../../../trained_models/ReinforcementLearning/TicTacToeV2\TicTacToe-NAIVE-10000.pt'.


100%|██████████| 10000/10000 [00:56<00:00, 176.37it/s]


Win rate:  53.9%
Draw rate: 6.57%
Loss rate: 39.53%





In [8]:
# Test two "TicTacToev2-" agents (NAIVE-2K, then NAIVE-4K) in games whose
# opponent is the "TicTacToe-NAIVE-10K-AGENT-5K" agent.
compare_to_model(NAIVE_2K, AGENT_5K)
compare_to_model(NAIVE_4K, AGENT_5K)

Loading Model Parameters...
Model loaded from '../../../trained_models/ReinforcementLearning/TicTacToeV2\TicTacToev2-NAIVE-2K.pt'.
Loading Model Parameters...
Model loaded from '../../../trained_models/ReinforcementLearning/TicTacToeV2\TicTacToe-NAIVE-10K-AGENT-5K.pt'.


100%|██████████| 10000/10000 [00:48<00:00, 207.19it/s]



Win rate:  71.68%
Draw rate: 0.0%
Loss rate: 28.32%
Loading Model Parameters...
Model loaded from '../../../trained_models/ReinforcementLearning/TicTacToeV2\TicTacToev2-NAIVE-4K.pt'.
Loading Model Parameters...
Model loaded from '../../../trained_models/ReinforcementLearning/TicTacToeV2\TicTacToe-NAIVE-10K-AGENT-5K.pt'.


100%|██████████| 10000/10000 [00:50<00:00, 199.11it/s]


Win rate:  65.2%
Draw rate: 0.0%
Loss rate: 34.8%



