In [1]:
import torch
from torch.functional import F
from torch import nn
from torch.utils.tensorboard import SummaryWriter
from agents.model_agent import MancalaModel

import re
from datetime import datetime
import random
import numpy as np

from game.mancalaenv import MancalaEnv
from torch import optim
import torch.distributions as dist
import os

from game.play import play
from agents.model_agent import ModelAgent


In [2]:
time_tag = re.sub(r'[^\d]', '-', str(datetime.now().time()))
log_dir = "runs/" + time_tag
writer = SummaryWriter(log_dir)
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
seed = 1234
n_holes = 7
lr = 0.0001
hidden_size = 1024
max_game_length = 100
reward_discount = 0.99
epochs = 500000
print_interval = 2000
writer_interval = 2000
solved_win_rate = 0.99
eps = np.finfo(np.float32).eps.item()

torch.manual_seed(seed)
random.seed(seed)
np.random.seed(seed)
if torch.cuda.is_available():
    torch.cuda.manual_seed(seed)

In [3]:
from torchsummary import summary
model = MancalaModel(hidden_size=hidden_size, n_inputs=n_holes*2, n_outputs=n_holes)
env = MancalaEnv()
optimizer = optim.Adam(model.parameters(), lr=lr)

# summary(model, [(1, n_holes*2), ((hidden_size,), (hidden_size,))], device='cpu')

In [4]:
def get_random_move(side, env):
    return np.random.choice(env.get_valid_moves(side), 1, replace=False)[0]

In [5]:
def calculate_loss(rewards, log_probabilities, values):
    discounted_rewards = []
    accumulated_rewards = 0
    for current_reward in rewards[::-1]:
        accumulated_rewards = reward_discount * accumulated_rewards + current_reward
        discounted_rewards.append(accumulated_rewards)

    discounted_rewards = torch.tensor(discounted_rewards[::-1]).float().to(device)
    normalized_rewards = (discounted_rewards - discounted_rewards.mean()) / (discounted_rewards.std() + eps)

    policy_loss = []
    value_loss = []
    for reward, log_probability, value in zip(normalized_rewards, log_probabilities, values):
        policy_loss.append((reward - value) * -log_probability)
        value = value.squeeze(0).squeeze(0)
        value_loss.append(F.smooth_l1_loss(value, reward))

    return torch.stack(policy_loss).sum() + 0.5 * torch.stack(value_loss).sum()

In [6]:

def record(total_loss, env, epoch, model_side):
    writer.add_scalar('loss', total_loss, epoch)
    match_record_dir = log_dir + '/_' + str(epoch) + '.txt'
    torch.save(model.state_dict(), log_dir + '_' + str(epoch) + '.msd')
    with open(match_record_dir, 'w') as file:
        file.write(f'model side is: {model_side}' + os.linesep)
        for side, move, score, board in env.history:
            file.write(f'{side} moved {move}, score: {score}' + os.linesep)
            file.write(board + os.linesep)

In [7]:
from agents.simpleagent import SimpleAgent

def evaluation(model, epoch):
    wins = 0
    n_games = 100
    model.eval()
    model.to('cpu')
    simple_agent = SimpleAgent()
    model_agent = ModelAgent(hidden_size=hidden_size, model=model)
    for _ in range(n_games):
        winner = play(model_agent, simple_agent, max_game_length=max_game_length)
        if winner == model_agent:
            wins += 1
    writer.add_scalar(f'wins against simple agent / {n_games} games', wins, epoch)
    print(f'wins against simple agent: {wins} / {n_games}, {wins/n_games*100:.2f}% @ epoch={epoch}/{epochs}')
    return wins/n_games

In [8]:
def select_action(env, side, model, hidden):
    x = ModelAgent.get_model_input(env, side).to(device)
    distribution, value, hidden = model.train().to(device)(x, hidden)
    outputs = dist.Categorical(distribution)
    action = outputs.sample()
    return outputs.log_prob(action), action.item() + 1, value, hidden

In [9]:

def train_one_game(model: nn.Module, epoch):
    game_finished = False
    env.reset()
    rewards = []
    values = []
    log_probabilities = []
    game_step = 0
    win_rate = -1
    next_player = random.choice(['north', 'south'])
    hx = torch.zeros((1, hidden_size), dtype=torch.float).to(device)
    cx = torch.zeros((1, hidden_size), dtype=torch.float).to(device)
    hidden = (hx, cx)
    opponent = SimpleAgent()
    while not game_finished:
        if next_player == 'north':
            # model move
            log_prob, action, value, hidden = select_action(env, 'north', model, hidden)
            next_player, reward, done = env.step('north', action)
            rewards.append(reward)
            log_probabilities.append(log_prob)
            values.append(value)
        else:
            # opponent move
            with torch.no_grad():
                next_player, _, done = env.step('south', opponent.get_move(env, 'south'))

        game_step += 1
        if done or game_step > max_game_length:
            game_finished = True

    optimizer.zero_grad()
    total_loss = calculate_loss(rewards, log_probabilities, values)
    total_loss.backward()
    optimizer.step()

    if epoch % writer_interval == 0:
        record(total_loss, env, epoch, 'north')
        win_rate = evaluation(model, epoch)

    return total_loss.detach(), win_rate

In [10]:
import time, datetime

def get_time_elapsed(start):
    return str(datetime.timedelta(seconds=int(time.time() - start)))

start_time = time.time()
for i in range(0, epochs):
    loss, win_rate = train_one_game(model, i)
    time_elapsed = int(time.time() - start_time)
    if i % print_interval == 0:
        print(f'i={i:8d} loss={loss:6f} elapsed={get_time_elapsed(start_time)}')
    if win_rate >= solved_win_rate:
        print(f'Game solved with win rate: {win_rate} >= {solved_win_rate}')
        break
print(f'Finished Training, total time take: {get_time_elapsed(start_time)}')

wins against simple agent: 55 / 100, 55.00% @ epoch=0/500000
i=       0 loss=6.086492 elapsed=0:00:07
wins against simple agent: 50 / 100, 50.00% @ epoch=2000/500000
i=    2000 loss=0.014216 elapsed=0:02:11
wins against simple agent: 47 / 100, 47.00% @ epoch=4000/500000
i=    4000 loss=0.005759 elapsed=0:04:17
wins against simple agent: 57 / 100, 57.00% @ epoch=6000/500000
i=    6000 loss=0.002096 elapsed=0:06:27
wins against simple agent: 55 / 100, 55.00% @ epoch=8000/500000
i=    8000 loss=0.002480 elapsed=0:08:34
wins against simple agent: 46 / 100, 46.00% @ epoch=10000/500000
i=   10000 loss=0.000817 elapsed=0:10:45
wins against simple agent: 41 / 100, 41.00% @ epoch=12000/500000
i=   12000 loss=0.000786 elapsed=0:12:53
wins against simple agent: 47 / 100, 47.00% @ epoch=14000/500000
i=   14000 loss=0.001557 elapsed=0:15:00
wins against simple agent: 47 / 100, 47.00% @ epoch=16000/500000
i=   16000 loss=0.000309 elapsed=0:17:10
wins against simple agent: 53 / 100, 53.00% @ epoch=18

In [None]:
torch.save(model.state_dict(), 'final_model.msd')



In [11]:
torch.save(model.state_dict(), 'final_model.msd')

