In [1]:
import torch
from torch.functional import F
from torch import nn
from torch.nn import init
from torch.utils.tensorboard import SummaryWriter

In [2]:
def init_weights(layer):
    name = layer.__class__.__name__
    if name.find('Linear') != -1:
        init.xavier_uniform_(layer.weight.data)
        layer.bias.data.fill_(0)

class MancalaModel(nn.Module):

    def __init__(self, n_inputs=16, n_outputs=16):
        super().__init__()

        n_neurons = 512

        self.linear1 = nn.Linear(n_inputs, n_neurons)
        self.linear2 = nn.Linear(n_neurons, n_neurons)
        self.linear3 = nn.Linear(n_neurons, n_neurons)

        self.actor = nn.Linear(n_neurons, n_outputs)
        self.critics = nn.Linear(n_neurons, 1)

        self.apply(init_weights)

    def forward(self, x):
        x = F.dropout(F.relu(self.linear1(x)), p=0.1)
        x = F.dropout(F.relu(self.linear2(x)), p=0.1)
        x = F.relu(self.linear3(x))
        return F.softmax(self.actor(x), -1), self.critics(x)

In [3]:
from torchsummary import summary
model = MancalaModel()
summary(model, (1, 16), device='cpu')

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Linear-1               [-1, 1, 512]           8,704
            Linear-2               [-1, 1, 512]         262,656
            Linear-3               [-1, 1, 512]         262,656
            Linear-4                [-1, 1, 16]           8,208
            Linear-5                 [-1, 1, 1]             513
Total params: 542,737
Trainable params: 542,737
Non-trainable params: 0
----------------------------------------------------------------
Input size (MB): 0.00
Forward/backward pass size (MB): 0.01
Params size (MB): 2.07
Estimated Total Size (MB): 2.08
----------------------------------------------------------------


In [4]:
import re
from datetime import datetime
import random
import numpy as np

log_dir = "runs/init_" + re.sub(r'[^\d]', '-', str(datetime.now().time()))
writer = SummaryWriter(log_dir)
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
seed = 313
n_holes = 7
lr = 0.01
max_game_length = 100
reward_discount = 0.9
eps = 1e-7

torch.manual_seed(seed)
random.seed(seed)
np.random.seed(seed)

In [5]:
from game.mancalaenv import MancalaEnv
from torch import optim

env = MancalaEnv()
model = MancalaModel(n_inputs=n_holes*2+2, n_outputs=n_holes)
optimizer = optim.Adam(model.parameters(), lr=0.01)


In [6]:
import torch.distributions as dist
def select_action(model_output):
    outputs = dist.Categorical(model_output)
    action = outputs.sample()
    return outputs.log_prob(action), action.item() + 1

In [7]:
def get_random_move(side, env):
    return np.random.choice(env.get_valid_moves(side), 1, replace=False)[0]

In [8]:
def get_losses(rewards, log_probabilities, values):
    discounted_rewards = []
    accumulated_rewards = 0
    for current_reward in rewards[::-1]:
        accumulated_rewards = reward_discount * accumulated_rewards + current_reward
        discounted_rewards.append(accumulated_rewards)

    discounted_rewards = discounted_rewards[::-1]
    discounted_rewards = torch.tensor(discounted_rewards).float().to(device)
    normalized_rewards = (discounted_rewards - discounted_rewards.mean()) / (discounted_rewards.std() + eps)

    policy_loss = []
    value_loss = []
    for reward, log_probability, value in zip(normalized_rewards, log_probabilities, values):
        policy_loss.append((reward - value.item()) * -log_probability)
        reward = reward.unsqueeze(0).unsqueeze(0)
        # print("value", value.shape)
        # print("value", value)
        # print("reward", reward.shape)
        # print("reward", reward)
        value_loss.append(F.smooth_l1_loss(value, reward.to(device)))

    return policy_loss, value_loss

In [9]:
def train_one_game(model: nn.Module):
    game_finished = False
    env.reset()
    model.to(device)
    rewards = []
    values = []
    log_probabilities = []

    while not game_finished:
        # print(env)
        x = torch.unsqueeze(torch.from_numpy(env.board), 0).float().to(device)
        # print("x", x)
        distribution, value = model(x)
        # print("distribution", distribution)
        # print("value", value)
        log_prob, hole = select_action(distribution)
        # print("hole", hole)
        # print("log_prob", log_prob)
        # print("action.item()", action.item())
        _, reward, done = env.step('north', hole)
        model.train()

        # opponent move
        env.step('south', get_random_move('south', env))
        # print(env)

        rewards.append(reward)
        log_probabilities.append(log_prob)
        values.append(value)

        if done:
            game_finished = True

    # print("rewards", rewards)
    # print("log_probabilities", log_probabilities)
    # print("values", values)
    policy_loss, value_loss = get_losses(rewards, log_probabilities, values)
    # print("p losses", policy_loss)
    # print("v losses", value_loss)
    optimizer.zero_grad()
    total_loss = torch.stack(policy_loss).sum() + torch.stack(value_loss).sum()
    total_loss.backward()
    optimizer.step()
    return total_loss.detach()

In [10]:
interval = 5000

for i in range(1, 100001):
    if i % interval == 0:
        loss = train_one_game(model)
        print(f'i={i:10d} loss: {loss:4f}')

i=      5000 loss: -10.678265
i=     10000 loss: 160.569931
i=     15000 loss: 37.291012
i=     20000 loss: 34.475227
i=     25000 loss: 0.627771
i=     30000 loss: 12.128546
i=     35000 loss: 12.050107
i=     40000 loss: 2.160525
i=     45000 loss: 1.344607
i=     50000 loss: 0.902831
i=     55000 loss: 1.484973
i=     60000 loss: 0.595250
i=     65000 loss: 1.299363
i=     70000 loss: 0.345010
i=     75000 loss: 7.288711
i=     80000 loss: 0.237803
i=     85000 loss: 2.549199
i=     90000 loss: 1.868187
i=     95000 loss: 0.239661
i=    100000 loss: 7.185636


In [11]:
torch.save(model.state_dict(), "saved_models")
