In [1]:
#importing libraries
import numpy as np
import pyspiel
import random
from pathlib import Path

from open_spiel.python.algorithms import mcts
from open_spiel.python.algorithms.alpha_zero import evaluator as evaluator_lib
from open_spiel.python.algorithms.alpha_zero import model as model_lib

In [2]:
class TrajectoryState:
    """A particular point along a trajectory.

    Plain record that stores the six values handed to the constructor as
    same-named attributes: observation, current_player, legals_mask, action,
    policy and value.
    """

    def __init__(self, observation, current_player, legals_mask, action, policy, value):
        # What was observed and who was to move.
        self.observation, self.current_player = observation, current_player
        # Which moves were legal and which one was taken.
        self.legals_mask, self.action = legals_mask, action
        # Search outputs recorded for this position.
        self.policy, self.value = policy, value


class Trajectory:
    """A sequence of observations, actions and policies, and the outcomes."""

    def __init__(self):
        # `returns` stays None until a caller assigns the final outcome.
        self.returns = None
        self.states = []

    def add(self, information_state, action, policy):
        """Append one step, stored as an (information_state, action, policy) tuple."""
        # NOTE(review): play_one_game appends TrajectoryState objects to
        # `states` directly instead of calling this method, so entries added
        # here have a different shape - confirm before mixing both paths.
        step = (information_state, action, policy)
        self.states.append(step)
        
class Buffer(object):
    """A fixed size buffer that keeps the newest values.

    Items are stored in insertion order in `data`; once more than `max_size`
    items are held, the oldest ones are dropped.

    Attributes:
        max_size: Maximum number of items retained.
        data: List of the retained items, oldest first.
        total_seen: The number of items that have passed through, including
            those already evicted.
    """

    def __init__(self, max_size):
        self.max_size = max_size
        self.data = []
        self.total_seen = 0  # The number of items that have passed through.

    def __len__(self):
        """Return the number of items currently retained."""
        return len(self.data)

    def __bool__(self):
        """Return True when at least one item is retained."""
        return bool(self.data)

    def append(self, val):
        """Add a single item, evicting the oldest one if the buffer is full."""
        return self.extend([val])

    def extend(self, batch):
        """Add every item of `batch`, then drop the oldest items beyond `max_size`."""
        batch = list(batch)
        self.total_seen += len(batch)
        self.data.extend(batch)
        # Trim by explicit count: the slice `[:-max_size]` is empty when
        # max_size == 0, which would let a zero-capacity buffer grow forever.
        overflow = len(self.data) - self.max_size
        if overflow > 0:
            del self.data[:overflow]

    def sample(self, count):
        """Return `count` distinct retained items chosen at random.

        Raises:
            ValueError: If `count` exceeds the number of retained items
                (raised by `random.sample`).
        """
        return random.sample(self.data, count)

In [3]:
def build_model(game, model_type, nn_width, nn_depth, learning_rate, weight_decay, model_saves_path):
    """Build a model for `game` via `model_lib.Model.build_model`.

    The observation tensor shape and the number of distinct actions are read
    from `game`; every other argument is forwarded unchanged
    (`model_saves_path` is passed as `path`).
    """
    observation_shape = game.observation_tensor_shape()
    action_count = game.num_distinct_actions()
    return model_lib.Model.build_model(
        model_type,
        observation_shape,
        action_count,
        nn_width=nn_width,
        nn_depth=nn_depth,
        weight_decay=weight_decay,
        learning_rate=learning_rate,
        path=model_saves_path,
    )

In [4]:
def initialize_bot(game, model, uct_c, max_simulations, policy_epsilon, policy_alpha):
    """Create an MCTS bot whose evaluator is backed by `model`.

    Args:
        game: Game object handed to the evaluator and to the bot.
        model: Model wrapped in an `AlphaZeroEvaluator`.
        uct_c: Exploration constant passed to `MCTSBot`.
        max_simulations: Simulation budget passed to `MCTSBot`.
        policy_epsilon: Dirichlet noise epsilon, or None to disable noise.
        policy_alpha: Dirichlet noise alpha, or None to disable noise.

    Returns:
        The configured `mcts.MCTSBot`.
    """
    # Noise is only enabled when both parameters are supplied. `is None`
    # is an identity test, so it cannot be fooled by types that overload `==`.
    if policy_epsilon is None or policy_alpha is None:
        noise = None
    else:
        noise = (policy_epsilon, policy_alpha)

    az_evaluator = evaluator_lib.AlphaZeroEvaluator(game, model)

    bot = mcts.MCTSBot(
          game,
          uct_c,
          max_simulations,
          az_evaluator,
          solve=False,
          dirichlet_noise=noise,
          child_selection_fn=mcts.SearchNode.puct_value,
          verbose=False)

    return bot

In [5]:
def play_one_game(game, bots, temperature, temperature_drop):
    """Play one game between `bots` and return the recorded Trajectory.

    Args:
        game: Game object providing `new_initial_state()` and
            `num_distinct_actions()`.
        bots: Sequence indexed by the current player; each entry must offer
            `mcts_search(state)`.
        temperature: Exponent control for the visit-count policy; the counts
            are raised to `1 / temperature`, so it must be non-zero.
        temperature_drop: Number of moves after which the most-visited child
            is played instead of sampling from the policy.

    Returns:
        A Trajectory whose `states` holds one TrajectoryState per move and
        whose `returns` is `state.returns()` of the terminal state.
    """
    trajectory = Trajectory()
    actions = []
    state = game.new_initial_state()
    while not state.is_terminal():
        root = bots[state.current_player()].mcts_search(state)

        # Turn the root's child visit counts into a probability distribution.
        policy = np.zeros(game.num_distinct_actions())
        for c in root.children:
            policy[c.action] = c.explore_count
        policy = policy ** (1 / temperature)
        policy /= policy.sum()

        if len(actions) >= temperature_drop:
            action = root.best_child().action
        else:
            action = np.random.choice(len(policy), p=policy)

        trajectory.states.append(TrajectoryState(state.observation_tensor(), state.current_player(),
                                                 state.legal_actions_mask(), action,
                                                 policy, root.total_reward / root.explore_count))

        # Record the move: without this the list stays empty and the
        # temperature-drop branch above is never taken.
        actions.append(action)
        state.apply_action(action)

    trajectory.returns = state.returns()
    return trajectory

In [6]:
def collect_trajectories(game, bots, replay_buffer, learn_rate, temperature, temperature_drop):
    """Play games until at least `learn_rate` states were added to `replay_buffer`.

    Each state of a finished game becomes one `model_lib.TrainInput` built
    from its observation, legal-action mask and policy, with the game's
    return for player 0 as the value. At least one game is always played.
    """
    states_collected = 0
    keep_playing = True

    while keep_playing:
        game_record = play_one_game(game, bots, temperature, temperature_drop)
        first_player_return = game_record.returns[0]
        training_rows = [
            model_lib.TrainInput(step.observation, step.legals_mask, step.policy, first_player_return)
            for step in game_record.states
        ]
        replay_buffer.extend(training_rows)
        states_collected += len(game_record.states)
        keep_playing = states_collected < learn_rate

In [7]:
def evaluate_model(game, model, best_model, uct_c, max_simulations, n_evaluations):
    """Play `model` against `best_model` and report how often `model` wins.

    Args:
        game: Game object used to build the bots and play the games.
        model: Challenger model; its bot is always player 0.
        best_model: Reference model; its bot is always player 1.
        uct_c: Exploration constant for both bots.
        max_simulations: Simulation budget for both bots.
        n_evaluations: Number of games to play.

    Returns:
        Tuple `(total_wins, win_rate)`, where a win is a game whose return
        for player 0 equals 1. `(0, 0.0)` when `n_evaluations` is not positive.
    """
    # Nothing to play: avoid dividing by zero below.
    if n_evaluations <= 0:
        return 0, 0.0

    # Neither model changes during evaluation, so the bots are built once
    # rather than once per game. Dirichlet noise is disabled (None, None).
    # NOTE(review): the challenger always moves first; consider alternating
    # seats if the game favours one side - confirm the intended protocol.
    bots = [
        initialize_bot(game, model, uct_c, max_simulations, None, None),
        initialize_bot(game, best_model, uct_c, max_simulations, None, None),
    ]

    total_wins = 0
    for _ in range(n_evaluations):
        trajectory = play_one_game(game, bots, temperature=1, temperature_drop=10)
        if trajectory.returns[0] == 1:
            total_wins += 1

    return total_wins, total_wins / n_evaluations
            
    
    

In [8]:
def _clear_evaluator_caches(bots):
    """Call `clear_cache()` on each bot's `evaluator` when both exist."""
    for bot in bots:
        evaluator = getattr(bot, "evaluator", None)
        clear_cache = getattr(evaluator, "clear_cache", None)
        if callable(clear_cache):
            clear_cache()


def train_and_evaluate_model(game, model, best_model, bots, iterations, temperature, temperature_drop,
                             train_batch_size,
                             replay_buffer_size, replay_buffer_reuse, uct_c,
                             max_simulations, n_evaluations):
    """Alternate self-play, training and evaluation for `iterations` rounds.

    Each round collects self-play states into a replay buffer, updates
    `model` on sampled batches, prints the mean loss, and evaluates `model`
    against `best_model`. When the win rate exceeds 0.55 a checkpoint is
    written and the reloaded copy becomes the new `best_model`.

    Args:
        game: Game object used for self-play and evaluation.
        model: Model being trained (updated in place via `model.update`).
        best_model: Current reference model for evaluation.
        bots: Bots used for self-play.
        iterations: Number of rounds to run.
        temperature: Self-play temperature passed to `collect_trajectories`.
        temperature_drop: Self-play temperature drop passed to
            `collect_trajectories`.
        train_batch_size: Number of states per training batch.
        replay_buffer_size: Capacity of the replay buffer.
        replay_buffer_reuse: Divisor of the buffer size that gives the number
            of new states collected per round.
        uct_c: Exploration constant for the evaluation bots.
        max_simulations: Simulation budget for the evaluation bots.
        n_evaluations: Number of evaluation games per round.

    Returns:
        The reference model in effect after the last round.
    """
    replay_buffer = Buffer(replay_buffer_size)
    learn_rate = replay_buffer_size // replay_buffer_reuse

    for i in range(iterations):

        losses = []
        collect_trajectories(game, bots, replay_buffer, learn_rate, temperature, temperature_drop)
        for _ in range(len(replay_buffer) // train_batch_size):
            data = replay_buffer.sample(train_batch_size)
            losses.append(model.update(data))

        if losses:
            losses = sum(losses, model_lib.Losses(0, 0, 0)) / len(losses)
            print(f"Loss at iteration {i}: {losses}")
        else:
            # Fewer buffered states than one batch: nothing was trained, and
            # averaging an empty list would divide by zero.
            print(f"Loss at iteration {i}: no training batch "
                  f"(buffer holds {len(replay_buffer)} states, batch size is {train_batch_size})")

        # The weights just changed; evaluations cached by the self-play bots'
        # evaluators would otherwise be reused for the next round of games.
        # NOTE(review): relies on MCTSBot exposing `evaluator` and on
        # AlphaZeroEvaluator offering `clear_cache()` - verify against the
        # installed OpenSpiel version (the helper is a no-op otherwise).
        _clear_evaluator_caches(bots)

        total_wins, rate = evaluate_model(game, model, best_model, uct_c, max_simulations, n_evaluations)
        print(f'  - Challenger won {total_wins} games ({rate:.2%} win rate)')
        if rate > 0.55:
            print(f"  - Model at iteration {i} supersedes previous model")
            best_model = copy_and_create_checkpoint(i, model)
        print("\n")

    return best_model

In [9]:
def load_model(iteration, model, game_name="connect_four",
               model_saves_path='../model_saves/connect_four',
               nn_width=10, nn_depth=5, learning_rate=0.001,
               weight_decay=0.0001, model_type='mlp'):
    """Build a fresh model and restore checkpoint `iteration` of `model` into it.

    The checkpoint is read from `"{model._path}/checkpoint-{iteration}"`.
    The keyword arguments describe the network to rebuild; their defaults
    are the values this function previously hard-coded.

    Args:
        iteration: Checkpoint number to restore.
        model: Model whose `_path` locates the checkpoint files.
        game_name: Name passed to `pyspiel.load_game`.
        model_saves_path: Save path given to the newly built model.
        nn_width: Network width of the rebuilt model.
        nn_depth: Network depth of the rebuilt model.
        learning_rate: Learning rate of the rebuilt model.
        weight_decay: L2 regularization strength of the rebuilt model.
        model_type: Model type of the rebuilt model.

    Returns:
        The newly built model with the checkpoint loaded.
    """
    # NOTE(review): the architecture arguments presumably have to match the
    # model that wrote the checkpoint - pass them explicitly whenever the
    # training configuration differs from the defaults.
    game = pyspiel.load_game(game_name)

    new_model = build_model(game, model_type, nn_width, nn_depth, learning_rate, weight_decay, model_saves_path)
    new_model.load_checkpoint(f"{model._path}/checkpoint-{iteration}")
    return new_model

def copy_and_create_checkpoint(iteration, model):
    """Save `model` as checkpoint `iteration` and return a reloaded copy.

    Args:
        iteration: Checkpoint number passed to `model.save_checkpoint`.
        model: Model to snapshot.

    Returns:
        A separate model object produced by `load_model` from that checkpoint.
    """
    # Create the directory the model itself points at (the same `_path` that
    # load_model reads from) instead of a notebook-level global, which is
    # only defined in a later cell and may differ from the path given to main.
    output_path = Path(model._path)
    output_path.mkdir(parents=True, exist_ok=True)
    model.save_checkpoint(iteration)
    return load_model(iteration, model)

In [10]:
def main(game_name, model_saves_path, nn_width, nn_depth, learning_rate, weight_decay, model_type,
        uct_c, max_simulations, policy_epsilon, policy_alpha, temperature, temperature_drop,
        train_batch_size, replay_buffer_size, replay_buffer_reuse, n_evaluations, iterations):
    """Load the game, build the model, snapshot it and start the training loop.

    Checkpoint 0 of the freshly built model serves as the first reference
    model. Two self-play bots are created, both backed by the model being
    trained, and handed to `train_and_evaluate_model`.
    """
    game = pyspiel.load_game(game_name)

    model = build_model(game, model_type, nn_width, nn_depth, learning_rate, weight_decay, model_saves_path)
    best_model = copy_and_create_checkpoint(0, model)

    # One independent bot per seat, both searching with the same model.
    bots = [
        initialize_bot(game, model, uct_c, max_simulations, policy_epsilon, policy_alpha)
        for _ in range(2)
    ]

    train_and_evaluate_model(
        game=game,
        model=model,
        best_model=best_model,
        bots=bots,
        iterations=iterations,
        temperature=temperature,
        temperature_drop=temperature_drop,
        train_batch_size=train_batch_size,
        replay_buffer_size=replay_buffer_size,
        replay_buffer_reuse=replay_buffer_reuse,
        uct_c=uct_c,
        max_simulations=max_simulations,
        n_evaluations=n_evaluations,
    )
    

In [11]:
# --- Game and model ---------------------------------------------------------
# Name of the game.
game_name = "connect_four"
# Where to save checkpoints.
model_saves_path = '../model_saves/connect_four'
# Network width and depth.
nn_width, nn_depth = 10, 5
# Optimizer learning rate.
learning_rate = 0.001
# L2 regularization strength.
weight_decay = 0.0001
# What type of model should be used.
model_type = 'mlp'

# --- Search and self-play ---------------------------------------------------
# UCT's exploration constant.
uct_c = 2
# How many simulations to run.
max_simulations = 25
# Dirichlet noise: epsilon and alpha.
policy_epsilon, policy_alpha = 0.25, 1
# Temperature for final move selection.
temperature = 1
# Drop the temperature to 0 after this many moves.
temperature_drop = 10

# --- Training ---------------------------------------------------------------
# Batch size for learning.
train_batch_size = 2 ** 10
# How many states to store in the replay buffer.
replay_buffer_size = 2 ** 16
# How many times to learn from each state.
replay_buffer_reuse = 3

# --- Evaluation and run length ----------------------------------------------
# How many games to average results over.
n_evaluations = 100
# How many learn iterations before exiting.
iterations = 15


In [12]:
# Start the run; keyword arguments keep the 18 settings from being misordered.
main(
    game_name=game_name,
    model_saves_path=model_saves_path,
    nn_width=nn_width,
    nn_depth=nn_depth,
    learning_rate=learning_rate,
    weight_decay=weight_decay,
    model_type=model_type,
    uct_c=uct_c,
    max_simulations=max_simulations,
    policy_epsilon=policy_epsilon,
    policy_alpha=policy_alpha,
    temperature=temperature,
    temperature_drop=temperature_drop,
    train_batch_size=train_batch_size,
    replay_buffer_size=replay_buffer_size,
    replay_buffer_reuse=replay_buffer_reuse,
    n_evaluations=n_evaluations,
    iterations=iterations,
)

Instructions for updating:
If using Keras pass *_constraint arguments to layers.
INFO:tensorflow:Restoring parameters from ../model_saves/connect_four/checkpoint-0
Loss at iteration 0: Losses(total: 2.962, policy: 1.927, value: 1.031, l2: 0.004)
  - Challenger won 55 games (55.00% win rate)


Loss at iteration 1: Losses(total: 2.913, policy: 1.929, value: 0.980, l2: 0.004)
  - Challenger won 62 games (62.00% win rate)
  - Model at iteration 1 supersedes previous model
INFO:tensorflow:Restoring parameters from ../model_saves/connect_four/checkpoint-1




KeyboardInterrupt: 