In [1]:
game_spec = dict(
    n_players=3,
    n_territories=8,
    baseline_reinforcements=3,
    n_attacks_per_turn=10,
)

In [2]:
import time
import copy
import numpy as np
import risk_ext

import torch
import torch.nn as nn
from torch.distributions.categorical import Categorical
from torch.optim import Adam
torch.manual_seed(0)

def vec_to_matrix(game, v):
    return v.reshape((game.n_max_territories, 1 + game.n_max_players))

def start_game(seed):
    return risk_ext.start_game(game_spec['n_players'], game_spec['n_territories'], game_spec['baseline_reinforcements'], game_spec['n_attacks_per_turn'], 0)

def get_state_dim():
    tmp = start_game(0)
    return tmp.n_max_territories * (1 + tmp.n_max_players)

def play_game(players, verbose=False, max_turns=None):
    env.reset()
    while True:
        action = players[env.game.player_idx].act(env.game, env.game.board_state)
        if verbose:
            print(f'turn={env.game.turn_idx}, player={env.game.player_idx}, phase={env.game.phase}, action={action}, board={env.game.board_state}')
        _,_,done = env.step(action)
        if done:
            break
        if max_turns is not None:
            if env.game.turn_idx > max_turns:
                break
    return env.game.player_idx

def faceoff(players, n_games):
    winners = np.empty(n_games, dtype = np.int32)
    for i in range(n_games):
        winners[i] = play_game(players)
    winner_counts = np.unique(winners, return_counts=True)
    return winners, winner_counts[0], winner_counts[1] / n_games

class NNPlayer:
    def __init__(self):
        self.state_dim = get_state_dim()
        self.n_actions = game_spec['n_territories']
        self.lr = 0.01
        
        def mlp(sizes, activation=nn.Tanh, output_activation=nn.Identity):
            # Build a feedforward neural network.
            layers = []
            for j in range(len(sizes)-1):
                act = activation if j < len(sizes)-2 else output_activation
                layers += [nn.Linear(sizes[j], sizes[j+1]), act()]
            return nn.Sequential(*layers)
        
        self.logits_net = mlp(sizes=[self.state_dim]+[32]+[self.n_actions])
        self.optimizer = Adam(self.logits_net.parameters(), lr=self.lr)

    def get_policy(self, obs):
        logits = self.logits_net(obs)
        return Categorical(logits=logits)

    def get_action(self, obs):
        return self.get_policy(obs).sample().item()

    def compute_loss(self, obs, act, weights):
        logp = self.get_policy(obs).log_prob(act)
        return -(logp * weights).mean()
    
    def act(self, game, state_vec):
        owner_col = vec_to_matrix(game, state_vec)[:, game.player_idx + 1]
        attack_from = (owner_col == 1).argmax()
        attack_to = self.get_action(torch.as_tensor(state_vec, dtype=torch.float32))
        return attack_from, attack_to
    
    def learn(self, obs, actions, weights):
        self.optimizer.zero_grad()
        batch_loss = self.compute_loss(obs=torch.as_tensor(obs, dtype=torch.float32),
                                  act=torch.as_tensor(actions, dtype=torch.int32),
                                  weights=torch.as_tensor(weights, dtype=torch.float32)
                                  )
        batch_loss.backward()
        self.optimizer.step()
        return batch_loss

class DumbPlayer:
    def act(self, game, state_vec):
        owner_col = vec_to_matrix(game, state_vec)[:, game.player_idx + 1]
        attack_from = (owner_col == 1).argmax()
        attack_to = (owner_col != 1).argmax()
        return attack_from, attack_to

    def learn(self, obs, actions, weights):
        return 0

In [3]:
class Env:
    def __init__(self):
        self.n_games = 0
        
    def reset(self, seed=None):
        self.n_games += 1
        if seed is None:
            seed = self.n_games
        self.game = start_game(seed)
        return self.game.board_state
    
    def step(self, action):
        self.game.step(*action)
        done = self.game.phase == 3
        return self.game.board_state, float(done), done

class PlayerBatch:
    def __init__(self):
        self.obs = []
        self.actions = []
        self.weights = []
        self.returns = []
        self.lengths = []
        self.start_episode()
    
    def record(self, obs, action):      
        self.obs.append(obs.copy())
        self.actions.append(action[1])
        self.ep_length += 1
    
    def finish_episode(self, reward):
        self.ep_returns = reward
        self.returns.append(self.ep_returns)
        self.lengths.append(self.ep_length)
        self.weights += [self.ep_returns] * self.ep_length
        self.start_episode()
    
    def start_episode(self):
        self.ep_length = 0
        self.ep_returns = 0

In [4]:
def train_one_epoch(env, players, batch_size):
    batches = [PlayerBatch() for p in players]
    obs = env.reset()

    go = True
    while go:
        #print(env.game.turn_idx, env.game.player_idx)
        action = players[env.game.player_idx].act(env.game, obs)
        batches[env.game.player_idx].record(obs, action)
        obs, reward, done = env.step(action)
        if done:
            for i in range(len(players)):
                player_reward = reward if i == env.game.player_idx else 0
                b = batches[i]
                b.finish_episode(player_reward)
                if len(b.obs) > batch_size:
                    go = False
            obs = env.reset()
            
    loss = []
    for i in range(len(players)):
        if len(batches[i].obs) == 0:
            loss.append(0)
        else:
            loss.append(players[i].learn(batches[i].obs, batches[i].actions, batches[i].weights))
    return loss, [b.returns for b in batches], [b.lengths for b in batches]

In [5]:
def train(env, players, n_batches, batch_size, print_players):
    for i in range(n_batches):
        loss, rets, lens = train_one_epoch(env, players, batch_size)
        for j in print_players:
            win_percentage = np.mean(rets[j])
            game_length = np.mean(lens[j])
            print(f'epoch: {i}, player: {j}, loss: {loss[j]:.3f} return: {win_percentage:.3f} ep_len: {game_length:.3f}')

In [6]:
env = Env()
players = [NNPlayer()] + [DumbPlayer() for i in range(game_spec['n_players'])]
nn_player_idx = 0

In [8]:
train(env, players, 50, 3000, [0])

epoch: 0, player: 0, loss: 0.744 return: 0.500 ep_len: 330.700
epoch: 1, player: 0, loss: 1.299 return: 0.900 ep_len: 363.800
epoch: 2, player: 0, loss: 1.109 return: 0.833 ep_len: 508.000
epoch: 3, player: 0, loss: 1.141 return: 0.889 ep_len: 392.778
epoch: 4, player: 0, loss: 1.237 return: 0.750 ep_len: 446.500
epoch: 5, player: 0, loss: 1.205 return: 0.800 ep_len: 649.400
epoch: 6, player: 0, loss: 0.934 return: 0.800 ep_len: 671.600
epoch: 7, player: 0, loss: 1.190 return: 0.833 ep_len: 521.667
epoch: 8, player: 0, loss: 1.206 return: 0.714 ep_len: 487.143
epoch: 9, player: 0, loss: 1.273 return: 1.000 ep_len: 614.800
epoch: 10, player: 0, loss: 1.305 return: 1.000 ep_len: 750.250
epoch: 11, player: 0, loss: 1.279 return: 1.000 ep_len: 747.000
epoch: 12, player: 0, loss: 1.092 return: 0.667 ep_len: 526.000
epoch: 13, player: 0, loss: 1.288 return: 1.000 ep_len: 1881.000
epoch: 14, player: 0, loss: 1.233 return: 1.000 ep_len: 1192.000
epoch: 15, player: 0, loss: 1.257 return: 1.000 

In [None]:
selfplayer = copy.deepcopy(players[0])
selfplayers = [selfplayer] * 3
train(env, selfplayers, 10, 5000, [0,1,2])

epoch: 0, player: 0, loss: 0.574 return: 0.667 ep_len: 1723.667
epoch: 0, player: 1, loss: -0.000 return: 0.000 ep_len: 1690.000
epoch: 0, player: 2, loss: 0.444 return: 0.333 ep_len: 808.667
epoch: 1, player: 0, loss: -0.000 return: 0.000 ep_len: 2005.000
epoch: 1, player: 1, loss: 0.441 return: 0.500 ep_len: 2994.500
epoch: 1, player: 2, loss: 0.648 return: 0.500 ep_len: 1676.500


In [None]:
train(env, [DumbPlayer(), NNPlayer(), DumbPlayer()], 50, 1000)

In [None]:
train(env, [DumbPlayer(), copy.deepcopy(players[0]), DumbPlayer()], 50, 1000)

In [None]:
nn_player_idx = 0
for i in range(50):
    loss, rets, lens = train_one_epoch(env, players, 4000)
    for pi in range(len(players)):
        print('epoch: {%3d} \t loss0: %.3f \t return0: %.3f \t ep_len: %.3f'%
                (i, loss[pi], np.mean(rets[pi]), np.mean(lens[pi])))

In [None]:
faceoff(selfplayers, 100)[1:]

In [9]:
faceoff(players, 100)[1:]

(array([0, 1], dtype=int32), array([0.95, 0.05]))

In [None]:
faceoff([DumbPlayer() for i in range(game_spec['n_players'])], 1)

In [7]:
play_game(players, verbose=True)

turn=0, player=0, phase=1, action=(1, 1), board=[8. 0. 0. 1. 8. 1. 0. 0. 3. 1. 0. 0. 8. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
turn=0, player=0, phase=1, action=(1, 1), board=[8. 0. 0. 1. 8. 1. 0. 0. 3. 1. 0. 0. 8. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
turn=0, player=0, phase=1, action=(1, 1), board=[8. 0. 0. 1. 8. 1. 0. 0. 3. 1. 0. 0. 8. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
turn=0, player=0, phase=1, action=(1, 2), board=[8. 0. 0. 1. 8. 1. 0. 0. 3. 1. 0. 0. 8. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
turn=0, player=0, phase=1, action=(1, 1), board=[8. 0. 0. 1. 8. 1. 0. 0. 3. 1. 0. 0. 8. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
turn=0, player=1, phase=1, action=(3, 0), board=[ 8.  0.  0.  1.  8.  1.  0.  0.  3.  1.  0.  0. 11.  0.  1.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.]
turn=0, player=1, phase=1, action=(3, 0), board=[ 6.  0.  0.  1.  8.  1.  0.  0.  3.  1.  0.

  attack_to = self.get_action(torch.as_tensor(state_vec, dtype=torch.float32))


2