In [1]:
from env.env import BigtwoEnv
import numpy as np
import torch
import random

from torch import nn


class BigtwoModel(nn.Module):
    """Three-layer MLP that scores candidate rows and can pick one of them.

    Every input row has 52 + 52 + 52 = 156 features and is mapped to a single
    scalar.  (Presumably three 52-card indicator vectors -- TODO confirm
    against the environment's encoder.)
    """

    def __init__(self, device):
        """Create the layers and move the module onto ``device``.

        Args:
            device: anything accepted by ``torch.device`` (e.g. ``"cpu"``).
        """
        super().__init__()
        self.device = torch.device(device)
        # Layer attribute names and creation order are kept stable: they
        # determine the state_dict keys and the order of weight initialisation.
        self.dense1 = nn.Linear(52 + 52 + 52, 256)
        self.dense2 = nn.Linear(256, 128)
        self.dense3 = nn.Linear(128, 1)
        self.to(self.device)

    def forward(self, x, return_value=False, flags=None):
        """Score each row of ``x`` and optionally select one row index.

        Args:
            x: tensor whose last dimension is 156.  Action selection indexes
                the result of an argmax over dim 0, so it assumes a 2-D
                ``(N, 156)`` batch with one row per candidate.
            return_value: when true, return the raw scores instead of an action.
            flags: optional object with an ``exp_epsilon`` attribute; when it
                is positive, a uniformly random row index is returned with
                that probability instead of the greedy one.

        Returns:
            ``{"values": scores}`` if ``return_value`` is true, otherwise
            ``{"action": index}`` where ``index`` is a 0-d integer tensor.
        """
        hidden = torch.relu(self.dense1(x))
        hidden = torch.relu(self.dense2(hidden))
        scores = self.dense3(hidden)

        if return_value:
            return dict(values=scores)

        # Short-circuiting matters: the NumPy RNG is only consulted when
        # exploration is actually enabled.
        explore = (
            flags is not None
            and flags.exp_epsilon > 0
            and np.random.rand() < flags.exp_epsilon
        )
        if explore:
            chosen = torch.randint(scores.shape[0], (1,))[0]
        else:
            # argmax over the candidate dimension yields shape (1,); take the scalar.
            chosen = torch.argmax(scores, dim=0)[0]
        return dict(action=chosen)

# One shared environment instance drives every self-play game below.
env = BigtwoEnv()

# Two separately initialised networks on the same GPU, one per learning seat
# (model0 is constructed first, then model1).
model0, model1 = BigtwoModel("cuda:0"), BigtwoModel("cuda:0")

# Each network gets its own Adam optimizer with an identical learning rate.
optimizer0, optimizer1 = (
    torch.optim.Adam(net.parameters(), lr=0.001) for net in (model0, model1)
)



In [2]:
def _model_move(model, obs, trajectory):
    """Let ``model`` pick one of the legal actions and log the chosen row.

    Args:
        model: network called as ``model(obs.x_batch)``; must return a dict
            with an ``"action"`` index.
        obs: current observation exposing ``x_batch`` and ``legal_actions``
            (assumed to be index-aligned -- TODO confirm against the env).
        trajectory: list that receives the feature row of the chosen action.

    Returns:
        The selected entry of ``obs.legal_actions``.
    """
    with torch.no_grad():
        output = model(obs.x_batch)
    action_index = output["action"]
    trajectory.append(obs.x_batch[action_index])
    return obs.legal_actions[action_index]


def _fit_to_outcomes(model, optimizer, record, result):
    """Take one optimizer step per recorded game, regressing values onto the outcome.

    Args:
        model: network to train; called with ``return_value=True``.
        optimizer: optimizer bound to ``model``'s parameters.
        record: list of per-game trajectories (lists of feature rows).
        result: list of per-game win flags, aligned with ``record``.
    """
    for row, won in zip(record, result):
        if not row:
            # The player never acted in this game; torch.stack would raise on
            # an empty list, and there is nothing to learn from anyway.
            continue
        optimizer.zero_grad()
        x_batch = torch.stack(row)
        output = model(x_batch, return_value=True)
        # Every recorded move of a game shares the same target: 1 if won, else 0.
        y_batch = torch.ones(x_batch.shape[0], 1).to("cuda:0") * won
        loss = torch.nn.functional.mse_loss(output["values"], y_batch)
        loss.backward()
        optimizer.step()


# Outer loop: alternate between a batch of self-play games and a training pass.
for _ in range(1000):
    game_num = 100
    record0, result0 = [], []
    record1, result1 = [], []
    obs = env.reset().to_tensor("cuda:0")
    for i in range(game_num):
        row0 = []
        row1 = []
        while not obs.done:
            if obs.player_to_act == 0:
                action = _model_move(model0, obs, row0)
            # This must be `elif`: with a plain `if`, the trailing `else` also
            # ran on player 0's turn and replaced model0's choice with a random
            # action, while row0 still logged the move that was never played.
            elif obs.player_to_act == 1:
                action = _model_move(model1, obs, row1)
            else:
                # Every other seat plays uniformly at random.
                action = random.choice(obs.legal_actions)
            obs = env.step(action).to_tensor("cuda:0")
        result0.append(obs.winner == 0)
        record0.append(row0)
        result1.append(obs.winner == 1)
        record1.append(row1)
        obs = env.reset().to_tensor("cuda:0")
    # Win rate of each learning seat over this batch of games.
    print(np.mean(result0), np.mean(result1))
    _fit_to_outcomes(model0, optimizer0, record0, result0)
    _fit_to_outcomes(model1, optimizer1, record1, result1)

0.26 0.05
0.39 0.05


KeyboardInterrupt: 

In [3]:
# NOTE(review): `game` is not assigned in any cell visible in this notebook, so
# this line relies on leftover kernel state and would raise NameError after
# Restart & Run All. Possibly `obs.winner` was intended -- confirm or remove.
game.winner

0

In [3]:
from utils.checkpoint import checkpoint


# Save model1 together with its optimizer under the tag "bigtwo_model3"; the
# recorded output shows the file landing in checkpoints/ with a timestamp suffix.
# NOTE(review): only model1/optimizer1 are saved in the visible cells;
# model0/optimizer0 are never checkpointed -- confirm that is intentional.
checkpoint(model1, optimizer1, "bigtwo_model3")

Model and optimizer saved as checkpoints/bigtwo_model3_20240923_004149.pth
