In [1]:
import numpy as np
import torch
from torch import nn
from env.game import Bigtwo

# Fix the PyTorch RNG seeds (CPU generator and current CUDA device) so that
# weight initialisation is repeatable from one kernel restart to the next.
SEED = 0
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)


class Bigtwo156(nn.Module):
    """Three-layer fully connected network mapping a 156-wide input to one scalar.

    The input width is 3 * 52. In this notebook the rows fed to the network are
    built by ``Agent156.observe`` as the concatenation of the acting player's
    holding, the other players' combined holding and one candidate action code
    (presumably 52 entries each -- confirm against the environment).

    Args:
        device: Anything accepted by ``torch.device``; the module's parameters
            are moved there at construction time.
    """

    def __init__(self, device):
        super().__init__()
        self.device = torch.device(device)
        # Layers are created in the same order as before so that a seeded
        # initialisation yields the same weights.
        self.dense1 = nn.Linear(3 * 52, 256)
        self.dense2 = nn.Linear(256, 128)
        self.dense3 = nn.Linear(128, 1)
        self.to(self.device)

    def forward(self, x):
        """Return the network output for ``x``; the last dimension of ``x`` must be 156.

        Two ReLU-activated hidden layers are followed by a linear output layer
        with a single unit (no output activation).
        """
        hidden = torch.relu(self.dense1(x))
        hidden = torch.relu(self.dense2(hidden))
        return self.dense3(hidden)


class Agent156:
    """Greedy player that scores every legal action with a ``Bigtwo156`` network.

    For each move the agent builds one input row per legal action, plays the
    action with the highest network output and remembers the chosen row.
    ``learn`` then regresses the network output for every remembered row of a
    game towards that game's reward (MSE loss, one optimizer step per game).

    Attributes:
        histories: One list per game holding the input rows of the actions
            this agent chose in that game.
        rewards: One reward per finished game, appended by the caller;
            ``rewards[i]`` belongs to ``histories[i]``.
        device: The ``torch.device`` used for the model and all tensors.
    """

    def __init__(self, device=None):
        """Create the network and its Adam optimizer.

        Args:
            device: Anything accepted by ``torch.device``. When omitted,
                ``"cuda:0"`` is used if CUDA is available and ``"cpu"``
                otherwise, so the notebook also runs on machines without a GPU.
        """
        if device is None:
            device = "cuda:0" if torch.cuda.is_available() else "cpu"
        self.device = torch.device(device)
        self.histories = []
        self.rewards = []
        self.model = Bigtwo156(self.device)
        self.optimizer = torch.optim.Adam(self.model.parameters(), lr=0.001)

    def act(self, game):
        """Play the highest-scoring legal action for the player to act.

        Side effects: appends the chosen input row to the current game's
        history and advances ``game`` via ``game.step``.
        """
        # A new game has started once every existing history has a reward.
        # `while` (not `if`) also inserts empty placeholder histories for games
        # in which this agent never acted, keeping histories[i] aligned with
        # rewards[i].
        while len(self.histories) < len(self.rewards) + 1:
            self.histories.append([])
        obs = self.observe(game)
        with torch.no_grad():
            output = self.model(obs["x_batch"])
        # output has one row per legal action; take the row index of the max.
        action_index = int(torch.argmax(output, dim=0)[0])
        # Indexing returns a view that would keep the whole candidate batch
        # alive until learn(); clone so only the chosen row is retained.
        self.histories[-1].append(obs["x_batch"][action_index].clone())
        action = game.players[game.player_to_act].legal_actions[action_index]
        game.step(action)

    def learn(self):
        """Train on all recorded games, then clear the buffers.

        Returns:
            float: Mean MSE loss over the games trained on, or ``0.0`` when
            there was nothing to train on.
        """
        losses = []
        # zip() ignores a trailing history that has no reward yet (e.g. a game
        # that was interrupted) instead of raising IndexError.
        for history, reward in zip(self.histories, self.rewards):
            if not history:
                # Placeholder for a game in which this agent made no move.
                continue
            self.optimizer.zero_grad()
            x_batch = torch.stack(history)
            output = self.model(x_batch.float())
            # Every move of a game shares that game's final reward as target.
            y_batch = torch.full_like(output, float(reward))
            loss = torch.nn.functional.mse_loss(output, y_batch)
            loss.backward()
            self.optimizer.step()
            losses.append(loss.detach())
        self.histories = []
        self.rewards = []
        if not losses:
            return 0.0
        return torch.stack(losses).mean().cpu().item()

    def observe(self, game: Bigtwo):
        """Build the network input for every legal action of the player to act.

        Each row is ``[own holding | other players' holdings OR-ed together |
        action code]``. The holdings and action codes are presumably 52-entry
        card masks (the network expects 156 inputs) -- confirm against the
        environment.

        Returns:
            dict: ``{"x_batch": float tensor with one row per legal action}``,
            located on ``self.device``.
        """
        players = game.players
        player_to_act = game.player_to_act
        player = players[player_to_act]
        action_codes = torch.tensor(
            np.array([a.code for a in player.legal_actions]), device=self.device
        )
        holding = torch.tensor(player.holding, device=self.device)
        # The three other seats, in turn order after the acting player.
        other_indices = [(player_to_act + offset) % 4 for offset in range(1, 4)]
        others_holding = np.bitwise_or.reduce(
            [players[i].holding for i in other_indices], axis=0
        )
        others_holding = torch.tensor(others_holding, device=self.device)
        state = torch.cat([holding, others_holding], dim=0)
        # Same state for every candidate; only the action code differs per row.
        x_batch = state.repeat(len(action_codes), 1)
        x_batch = torch.cat([x_batch, action_codes], dim=1).float()
        return dict(
            x_batch=x_batch,
        )


class RandomAgent:
    """Baseline player that picks a legal action with the game's own RNG.

    It exposes the same ``histories`` / ``rewards`` / ``act`` / ``learn``
    surface as ``Agent156`` so both kinds of player can share one game loop,
    but it never records moves and never trains.
    """

    def __init__(self):
        self.histories = []
        self.rewards = []

    def act(self, game: Bigtwo):
        """Draw an index with ``game.np_random.choice`` and play that legal action."""
        n_options = len(game.players[game.player_to_act].legal_actions)
        pick = game.np_random.choice(n_options)
        chosen = game.players[game.player_to_act].legal_actions[pick]
        game.step(chosen)

    def learn(self):
        """Discard any collected rewards/histories; always returns 0."""
        self.rewards = []
        self.histories = []
        return 0


# Single environment instance; the loops in the following cells call
# game.reset() before every game instead of constructing a new one.
game = Bigtwo()

# One independent learner per seat: agents[i] plays whenever
# game.player_to_act == i.
agents = [Agent156() for _ in range(4)]

In [2]:
# Self-play training: 1000 rounds of 100 games each. After every round each
# agent trains on the games it just played and the round's win counts and
# mean losses are printed.
for _ in range(1000):
    winners = [0, 0, 0, 0]
    losses = [0, 0, 0, 0]
    for _game_number in range(100):
        game.reset()
        # Identity check is the idiomatic test for None.
        while game.winner is None:
            agents[game.player_to_act].act(game)
        # +1 for the winning seat, -1 for everyone else.
        for index, agent in enumerate(agents):
            agent.rewards.append(1 if game.winner == index else -1)
        winners[game.winner] += 1

    for index, agent in enumerate(agents):
        losses[index] = agent.learn()
    print(winners)
    print(losses)

[29, 27, 23, 21]
[0.8672682046890259, 0.8304236531257629, 0.8398073315620422, 0.7300475239753723]
[33, 22, 25, 20]
[0.9081538915634155, 0.7182988524436951, 0.7991465926170349, 0.6800662875175476]
[32, 36, 20, 12]
[0.9330984354019165, 0.9467445611953735, 0.692852258682251, 0.44227370619773865]
[27, 26, 35, 12]
[0.8586874604225159, 0.8003907799720764, 0.9206987023353577, 0.448481947183609]
[22, 19, 47, 12]
[0.7223523855209351, 0.6292674541473389, 1.0040183067321777, 0.4561153054237366]
[21, 23, 40, 16]
[0.7124958634376526, 0.6792768836021423, 0.9506361484527588, 0.5684380531311035]
[30, 11, 38, 21]
[0.8432789444923401, 0.4355664551258087, 0.8600237369537354, 0.6897258758544922]
[32, 14, 35, 19]
[0.8644799590110779, 0.5708885788917542, 0.8646455407142639, 0.5962798595428467]
[28, 11, 27, 34]
[0.8094014525413513, 0.3815540075302124, 0.7870721220970154, 0.8613006472587585]
[30, 14, 27, 29]
[0.7937235236167908, 0.4971553385257721, 0.7724397778511047, 0.8228410482406616]
[39, 7, 18, 36]
[0.89

KeyboardInterrupt: 

In [3]:
from utils.checkpoint import checkpoint


# NOTE(review): this call previously passed `model1` / `optimizer1`, names that
# are never defined in this notebook, so the cell raised NameError on a fresh
# kernel. agents[0] is the agent evaluated against random opponents later in
# the notebook, so its model and optimizer are saved here -- confirm this is
# the agent that was meant.
checkpoint(agents[0].model, agents[0].optimizer, "bigtwo_model3")

Model and optimizer saved as checkpoints/bigtwo_model3_20240923_004149.pth


In [3]:
# Evaluation: the first trained agent sits in seat 0 against three random
# opponents. Each printed list holds the per-seat win counts of 100 games.
players = [
    agents[0],
    RandomAgent(),
    RandomAgent(),
    RandomAgent(),
]

for _ in range(100):
    winners = [0, 0, 0, 0]
    for _game_number in range(100):
        game.reset()
        # Identity check is the idiomatic test for None.
        while game.winner is None:
            players[game.player_to_act].act(game)
        winners[game.winner] += 1
        # No rewards are recorded and learn() is never called here, so the
        # trained agent would otherwise keep appending every move to its last
        # history for the entire evaluation. Drop the buffers after each game.
        for player in players:
            player.histories = []
            player.rewards = []
    print(winners)

[58, 15, 14, 13]
[58, 17, 10, 15]
[65, 10, 10, 15]
[71, 7, 9, 13]
[67, 12, 9, 12]
[68, 11, 11, 10]
[56, 19, 12, 13]
[67, 17, 6, 10]
[62, 11, 16, 11]


KeyboardInterrupt: 