# REINFORCE

## Setup

In [None]:
import torch
from network.PolicyNetwork import PolicyNetwork
from train import train, test
from agents_tictactoe.ReinforceAgent import ReinforceAgent
from torch.utils.tensorboard import SummaryWriter
from env.TicTacToeEnvironment import TicTacToeEnvironment

# Pick the compute device: CUDA when available, otherwise the MPS backend,
# otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"

# Hyperparameters passed to the PolicyNetwork constructors in the cells below.
num_hidden_units = 64
num_layers = 1
dropout_probability = 0.5

# Game environment shared by the training cells. The cell count is the
# product of the first two entries of the environment's board size.
env = TicTacToeEnvironment()
num_cells = env.board_size[0] * env.board_size[1]

## Train
Training has already been completed. Run the cells in this section only if you want to retrain the models.

In [None]:
# Train one policy-network agent against an opponent built without a network,
# evaluate it, and save the trained network.
import os

# torch.save does not create missing directories. Create the checkpoint
# folder up front so a path problem surfaces before the long training run
# instead of after it.
os.makedirs('models/tictactoe/reinforce', exist_ok=True)

net = PolicyNetwork(
    num_cells, num_hidden_units, num_layers, dropout_probability, num_cells
).to(device)

agent_a = ReinforceAgent(env, net, lr=0.002, weight_decay=0.01)
# No network is passed here; presumably this makes the agent pick moves at
# random (the log and checkpoint names say "random") -- confirm in
# ReinforceAgent.
agent_b = ReinforceAgent(env)

# NOTE(review): the run and checkpoint names say "8k" while the run uses
# 80000 episodes; the names are left unchanged so existing logs still match.
# The context manager closes the writer when training ends (also on error),
# which flushes any buffered TensorBoard events to disk.
with SummaryWriter('runs/tictactoe_8k/reinforce/random') as writer:
    train(env, agent_a, agent_b, episodes=80000, log_interval=1000, writer=writer)

test(env, agent_a, agent_b)
torch.save(net, 'models/tictactoe/reinforce/random_8k_0.002.pth')

In [None]:
# Train two policy-network agents against each other, evaluate them, and
# save the first agent's network.
import os

# torch.save does not create missing directories. Create the checkpoint
# folder up front so a path problem surfaces before the long training run
# instead of after it.
os.makedirs('models/tictactoe/reinforce', exist_ok=True)

# Both networks are moved to the selected device, matching the
# single-network training cell.
net_a = PolicyNetwork(
    num_cells, num_hidden_units, num_layers, dropout_probability, num_cells
).to(device)
net_b = PolicyNetwork(
    num_cells, num_hidden_units, num_layers, dropout_probability, num_cells
).to(device)
agent_a = ReinforceAgent(env, net_a, lr=0.002, weight_decay=0.01)
agent_b = ReinforceAgent(env, net_b, lr=0.002, weight_decay=0.01)

# NOTE(review): the run and checkpoint names say "8k" while the run uses
# 80000 episodes; the names are left unchanged so existing logs still match.
# The context manager closes the writer when training ends (also on error),
# which flushes any buffered TensorBoard events to disk.
with SummaryWriter('runs/tictactoe_8k/reinforce/agents') as writer:
    train(env, agent_a, agent_b, episodes=80000, log_interval=1000, writer=writer)

test(env, agent_a, agent_b)
# Only net_a is persisted; net_b is discarded when the session ends.
torch.save(net_a, 'models/tictactoe/reinforce/agents_8k_0.002.pth')

# TEST

In [None]:
def train_with_random(policy_net, draw_board: bool = False, episodes: int = 10000):
    """Evaluate ``policy_net`` against an agent that has no network.

    Despite the ``train_`` prefix, nothing is trained here: the function
    builds a fresh environment and two agents and hands them to ``test``.

    Args:
        policy_net: Network given to the first ``ReinforceAgent``.
        draw_board: Forwarded to ``test`` as ``draw_board``.
        episodes: Forwarded to ``test`` as ``episodes``.
    """
    board = TicTacToeEnvironment()
    test(
        board,
        ReinforceAgent(board, policy_net),
        # Built without a network; presumably plays random moves -- confirm
        # in ReinforceAgent.
        ReinforceAgent(board),
        draw_board=draw_board,
        episodes=episodes,
    )


def train_with_agents(policy_net_1, policy_net_2, draw_board: bool = True, episodes: int = 1):
    """Evaluate two policy networks against each other.

    Despite the ``train_`` prefix, nothing is trained here: the function
    builds a fresh environment, wraps each network in a ``ReinforceAgent``
    and hands both to ``test``.

    Args:
        policy_net_1: Network given to the first agent.
        policy_net_2: Network given to the second agent.
        draw_board: Forwarded to ``test`` as ``draw_board``.
        episodes: Number of episodes forwarded to ``test``. Defaults to 1,
            the value that used to be hard-coded, so existing callers are
            unaffected.
    """
    env = TicTacToeEnvironment()
    agent_a = ReinforceAgent(env, policy_net_1)
    agent_b = ReinforceAgent(env, policy_net_2)
    test(env, agent_a, agent_b, draw_board=draw_board, episodes=episodes)


# NOTE(review): the checkpoints loaded below (40k episodes, lr 0.001) have
# different names from the ones the training cells in this notebook write
# (8k, lr 0.002) -- make sure the intended files exist.
# NOTE(review): torch.load unpickles the file, so only load checkpoints from
# a trusted source. The files are presumably whole pickled modules, like the
# ones written by torch.save(net, ...) in the training cells; recent PyTorch
# releases default to weights_only=True, which rejects such files -- pass
# weights_only=False if loading fails on your version.

## Test the agent trained with random sampling
# Load the network. map_location remaps the stored tensors onto the device
# selected during setup, so a checkpoint saved on a GPU still loads on a
# machine without one.
policy_net_random = torch.load(
    'models/tictactoe/reinforce/random_40k_0.001.pth', map_location=device
)
# Test against the agent that has no network
train_with_random(policy_net_random)

## Test the agent trained with dual agents
# Load the network onto the same device
policy_net_agents = torch.load(
    'models/tictactoe/reinforce/agents_40k_0.001.pth', map_location=device
)
# Test against the agent that has no network
train_with_random(policy_net_agents)

## Random Sampling vs. Dual Agents
# Play the two loaded networks against each other
train_with_agents(policy_net_random, policy_net_agents)