In [None]:
# Change the working directory to the parent folder before importing project code
# (presumably so that `alpha_one` and `env` resolve from the project root — confirm).
# NOTE: `%cd ..` is relative, so re-running this cell without restarting the kernel
# moves one more level up each time.
%cd ..
# Enable autoreload (mode 2) so edited modules are re-imported before each cell runs.
%reload_ext autoreload
%autoreload 2

# Import Libraries

In [None]:
#importing libraries
import numpy as np
import pyspiel
import math

from statistics import mean


from open_spiel.python.algorithms.alpha_zero import model as model_lib
from open_spiel.python.algorithms.alpha_zero import evaluator as evaluator_lib
from open_spiel.python.algorithms import mcts

from alpha_one.metrics import MatchOutcome, EloRatingSystem, TrueSkillRatingSystem, calculate_entropy
from alpha_one.game.trajectory import GameTrajectory
from alpha_one.game.buffer import ReplayBuffer
from alpha_one.game.observer import get_observation_tensor_shape
from alpha_one.utils.mcts import initialize_bot, compute_mcts_policy, play_one_game, mcts_inference
from alpha_one.utils.mcts_II import IIGMCTSConfig
from alpha_one.utils.logging import TensorboardLogger, generate_run_name
from alpha_one.utils.play import GameMachine
from alpha_one.model.model_manager import OpenSpielCheckpointManager, OpenSpielModelManager
from alpha_one.model.evaluation import EvaluationManager, ParallelEvaluationManager
from alpha_one.model.config import OpenSpielModelConfig
from alpha_one.train import AlphaOneTrainManager, MCTSConfig
from alpha_one.data.replay import ReplayDataManager
from env import MODEL_SAVES_DIR, LOGS_DIR

from alpha_one.utils.state_to_value import state_to_value

The training is done in a similar way as AlphaZero.

Some Remarks:
1. Variables whose names start with "observation" are associated with the observation NN model
2. Variables whose names start with "game" are associated with the game (after guessing a state) NN model
3. alpha_one in MCTSConfig should be set to "True" while training AlphaOne
4. For the observation model, set output shape as well in model config (see below)
5. See AlphaOneTrainManager and utils/mcts_II for debugging
6. Currently, evaluation doesn't support parallel evaluation
7. Also pass state_to_value in the MCTS config: the observation NN uses it to compute the state mask, analogous to the legal action mask

In [None]:
# Name of the game to train on; passed to pyspiel.load_game and state_to_value below
# and used as a sub-directory of LOGS_DIR for this run's logs.
game_name = 'leduc_poker'
# Prefix handed to generate_run_name when the run name is created.
game_prefix = 'LP-local'

In [None]:
# Build the state -> id lookup for the selected game.
#
# The factory is imported here under an alias so that binding its result to the
# name `state_to_value` (which the rest of the notebook relies on) no longer
# shadows the factory itself. With the original one-liner, running this cell a
# second time in the same kernel called the lookup instead of the factory and failed.
from alpha_one.utils.state_to_value import state_to_value as build_state_to_value

state_to_value = build_state_to_value(game_name)

In [None]:
n_iterations = 50                     # How often the whole procedure is repeated. Also corresponds to the number of evaluations

# Train samples generation
n_games_train = 100             # Number of games passed to generate_training_data for the training set (original note: has to be larger than batch_size)
n_games_valid = 10              # Number of games passed to generate_training_data for the validation set
store_replays_every = 10        # NOTE(review): only logged as a hyperparameter in this notebook, not passed to any call here

# Model update
n_most_recent_train_samples = 50000    # Among which training samples to choose to train current model
n_most_recent_valid_samples = 50000    # Same window for the validation samples (passed to AlphaOneTrainManager)
n_train_steps_obs = 1000                     # Gradient updates for observation model
n_train_steps_game = 40                     # Gradient updates for game model
n_valid_steps = 10                          # Validation steps, passed to train_model
batch_size = 128                            # Batch size, passed to train_model

# Evaluation
n_evaluations = 10                    # How many games should be played to measure which model is better
evaluation_strategy = 'mcts'           # 'best_response' -- NOTE(review): not referenced by any other cell visible in this notebook
win_ratio_needed = 0.55                # Minimum win ratio that the challenger model needs in order to supersede the current best model

# MCTS config
UCT_C = 10                              # Exploration constant. Should be higher if absolute rewards are higher in a game
max_mcts_simulations = 10               # Simulation budget handed to IIGMCTSConfig (presumably per search -- confirm)
optimism = 0.1                         # Whether guessing states is biased towards good outcomes

policy_epsilon = None #0.25            # What noise epsilon to use
policy_alpha = None #1                 # What dirichlet noise alpha to use

# Passed positionally to IIGMCTSConfig for self-play; the evaluation config uses 0 / None instead.
# Presumably the sampling temperature and the point after which it is dropped -- confirm in IIGMCTSConfig.
temperature = 1
temperature_drop = 10

alpha_one = True                       # Should be True while training AlphaOne (see the remarks at the top of the notebook)
omniscient_observer = True             # Whether the game model should have total information of the state it guessed
use_reward_policy = True               # Whether the total rewards of nodes should be taken into account when constructing policies, or only the explore_counts
use_teacher_forcing = True             # Whether the true game states should be used as label for the observation model, or the guessing policy of the IIG-MCTS
n_previous_observations = 3            # How many previous observations the observation model should use

In [None]:
# Keyword settings shared by the self-play search and the evaluation search.
shared_mcts_settings = dict(
    alpha_one=alpha_one,
    state_to_value=state_to_value,
    use_reward_policy=use_reward_policy,
    optimism=optimism,
    n_previous_observations=n_previous_observations,
)

# Search configuration used while generating training data.
mcts_config = IIGMCTSConfig(
    UCT_C,
    max_mcts_simulations,
    temperature,
    temperature_drop,
    policy_epsilon,
    policy_alpha,
    **shared_mcts_settings,
)

# Search configuration used for evaluation games: identical, except that the
# temperature / temperature-drop / noise positionals are 0, None, None, None.
evaluation_mcts_config = IIGMCTSConfig(
    UCT_C,
    max_mcts_simulations,
    0,
    None,
    None,
    None,
    **shared_mcts_settings,
)

In [None]:
# Model Hyperparameters

# Observation model: these values are passed to OpenSpielModelConfig for the
# observation network (weight_decay_obs is additionally passed to train_model).
model_type_obs = 'mlp'
nn_width_obs = 128
nn_depth_obs = 4
weight_decay_obs = 1e-5
learning_rate_obs = 1e-2

# Game model: same set of settings for the game network
# (weight_decay_game is additionally passed to train_model).
model_type_game = 'mlp'
nn_width_game = 64
nn_depth_game = 2
weight_decay_game = 1e-5
learning_rate_game = 1e-5

In [None]:
# Everything that is logged as the run's hyperparameters, grouped by concern.
hyperparameters = {
    # Game and search
    "game_name": game_name,
    "UCT_C": UCT_C,
    "max_mcts_simulations": max_mcts_simulations,
    "n_iterations": n_iterations,

    # Sample generation
    "n_games_train": n_games_train,
    "n_games_valid": n_games_valid,
    "store_replays_every": store_replays_every,

    # Model update
    "n_most_recent_train_samples": n_most_recent_train_samples,
    "n_most_recent_valid_samples": n_most_recent_valid_samples,
    "n_train_steps_obs": n_train_steps_obs,
    "n_train_steps_game": n_train_steps_game,
    "n_valid_steps": n_valid_steps,
    "batch_size": batch_size,

    # Evaluation
    "n_evaluations": n_evaluations,
    "win_ratio_needed": win_ratio_needed,

    # Policy noise
    "policy_epsilon": policy_epsilon,
    "policy_alpha": policy_alpha,

    # Temperature schedule
    "temperature": temperature,
    "temperature_drop": temperature_drop,

    # Observation network
    "model_type_obs": model_type_obs,
    "nn_width_obs": nn_width_obs,
    "nn_depth_obs": nn_depth_obs,
    "weight_decay_obs": weight_decay_obs,
    "learning_rate_obs": learning_rate_obs,

    # Game network
    "model_type_game": model_type_game,
    "nn_width_game": nn_width_game,
    "nn_depth_game": nn_depth_game,
    "weight_decay_game": weight_decay_game,
    "learning_rate_game": learning_rate_game,

    # AlphaOne specific switches
    "omniscient_observer": omniscient_observer,
    "use_reward_policy": use_reward_policy,
    "optimism": optimism,
    "use_teacher_forcing": use_teacher_forcing,
    "n_previous_observations": n_previous_observations,
}

In [None]:
def mean_total_loss(losses):
    """Return the arithmetic mean of the ``total`` attribute over ``losses``.

    Args:
        losses: iterable of loss objects exposing a numeric ``total`` attribute.

    Returns:
        The mean of all ``total`` values, or ``nan`` when ``losses`` is empty.
        ``statistics.mean`` raises ``StatisticsError`` on empty input, which
        would abort a training run merely because a progress line was printed
        for a slice that happens to contain no losses.
    """
    totals = [loss.total for loss in losses]
    if not totals:
        return float("nan")
    return mean(totals)

In [None]:
# Create a run name under this game's log directory and load the game.
run_name = generate_run_name(
    f'{LOGS_DIR}/{game_name}',
    game_prefix,
    match_arbitrary_suffixes=True,
)
print(f"Starting run: {run_name}")

game = pyspiel.load_game(game_name)

# Observation model: its input is the flat observation size multiplied by the
# number of previous observations, and its output has one entry per known state.
stacked_observation_size = game.observation_tensor_shape()[0] * n_previous_observations
observation_model_config = OpenSpielModelConfig(
    game,
    model_type_obs,
    [stacked_observation_size],
    nn_width_obs,
    nn_depth_obs,
    weight_decay_obs,
    learning_rate_obs,
    omniscient_observer=False,
    output_shape=len(state_to_value),
)
observation_model_manager = OpenSpielCheckpointManager(game_name, f"{run_name}-observation_model")
observation_model_manager.store_config(observation_model_config)

# Game model: its input shape comes from get_observation_tensor_shape and
# depends on the omniscient_observer flag.
game_input_shape = get_observation_tensor_shape(game, omniscient_observer)
game_model_config = OpenSpielModelConfig(
    game,
    model_type_game,
    game_input_shape,
    nn_width_game,
    nn_depth_game,
    weight_decay_game,
    learning_rate_game,
    omniscient_observer=omniscient_observer,
)
game_model_manager = OpenSpielCheckpointManager(game_name, f"{run_name}-game_model")
game_model_manager.store_config(game_model_config)

# Both checkpoint managers, keyed the way AlphaOneTrainManager receives them.
model_manager = dict(
    game_model_manager=game_model_manager,
    observation_model_manager=observation_model_manager,
)

In [None]:
# Evaluation games are played with the evaluation MCTS configuration.
evaluation_manager = EvaluationManager(game, n_evaluations, evaluation_mcts_config)

# Two rating systems track the models across iterations.
elo_rating_system = EloRatingSystem(40)
true_skill_rating_system = TrueSkillRatingSystem()
rating_systems = [elo_rating_system, true_skill_rating_system]

# The train manager ties together game, models, evaluation and ratings.
train_manager = AlphaOneTrainManager(
    game,
    model_manager,
    evaluation_manager,
    n_most_recent_train_samples,
    n_most_recent_valid_samples,
    rating_systems,
)

# Report the size of both challenger networks.
observation_challenger = train_manager.observation_model_challenger
print("Observation Model: Num variables:", observation_challenger.num_trainable_variables)
observation_challenger.print_trainable_variables()
print("")
game_challenger = train_manager.game_model_challenger
print("Game Model: Num variables:", game_challenger.num_trainable_variables)
game_challenger.print_trainable_variables()

In [None]:
# Tensorboard logger for the observation model; the hyperparameters are logged once up front.
observation_log_dir = f"{LOGS_DIR}/{game_name}/{run_name}-observation_model"
observation_tensorboard = TensorboardLogger(observation_log_dir)
observation_tensorboard.log_hyperparameters(hyperparameters)

In [None]:
# Tensorboard logger for the game model; the hyperparameters are logged once up front.
game_log_dir = f"{LOGS_DIR}/{game_name}/{run_name}-game_model"
game_tensorboard = TensorboardLogger(game_log_dir)
game_tensorboard.log_hyperparameters(hyperparameters)

In [None]:
def _format_loss_trend(losses, n_chunks=4):
    """Summarise how the mean total loss developed over consecutive chunks of ``losses``.

    ``losses`` is cut into ``n_chunks`` consecutive slices at ``int(i * len / n_chunks)``
    and the mean total loss of each slice is joined with `` -> ``. A slice that
    contains no losses (fewer losses than chunks) is shown as ``n/a`` instead of
    raising.
    """
    n_losses = len(losses)
    bounds = [int(i * n_losses / n_chunks) for i in range(n_chunks + 1)]
    return " -> ".join(
        f"{mean_total_loss(losses[start:stop]):.2f}" if stop > start else "n/a"
        for start, stop in zip(bounds, bounds[1:])
    )


def _loss_scalars(train_losses, valid_losses):
    """Build the ``Loss`` scalar dict (total/policy/value for train and valid) for tensorboard."""
    scalars = {}
    for split, losses in (("train", train_losses), ("valid", valid_losses)):
        for component in ("total", "policy", "value"):
            scalars[f"{component}/{split}"] = mean([getattr(loss, component) for loss in losses])
    return scalars


for iteration in range(1, n_iterations + 1):
    print(f"Iteration {iteration}")

    # 1 Generate training data with current best model
    (new_train_observation_samples,
     new_valid_observation_samples,
     new_train_game_samples,
     new_valid_game_samples) = train_manager.generate_training_data(
        n_games_train, n_games_valid, mcts_config, use_teacher_forcing=use_teacher_forcing)
    print(f'  - Generated {len(new_train_observation_samples)} additional training observation samples and {len(new_valid_observation_samples)} additional validation observation samples')
    print(f'  - Generated {len(new_train_game_samples)} additional training game samples and {len(new_valid_game_samples)} additional validation game samples')
    observation_tensorboard.log_scalar("n_training_observation_samples", train_manager.replay_buffer_observation.get_total_samples(), iteration)
    game_tensorboard.log_scalar("n_training_game_samples", train_manager.replay_buffer_model.get_total_samples(), iteration)

    # 2 Repeatedly sample from training set and update weights on current model
    (train_observation_losses,
     valid_observation_losses,
     train_game_losses,
     valid_game_losses) = train_manager.train_model(
        n_train_steps_obs, n_train_steps_game, n_valid_steps, batch_size, weight_decay_obs, weight_decay_game)

    # Mean total loss per quarter of the training steps. The previous version used
    # backslash continuations inside the f-strings, which injected the source
    # indentation into the printed line.
    print(f'  - Training Observation Model: {_format_loss_trend(train_observation_losses)}')
    print(f'  - Training Game Model: {_format_loss_trend(train_game_losses)}')

    observation_tensorboard.log_scalars(
        "Loss", _loss_scalars(train_observation_losses, valid_observation_losses), iteration)
    game_tensorboard.log_scalars(
        "Loss", _loss_scalars(train_game_losses, valid_game_losses), iteration)

    # 3 Evaluate trained model against current best model
    challenger_win_rate, challenger_policies, match_outcomes, challenger_average_reward = train_manager.evaluate_challenger_model()

    player_name_current_best = train_manager.get_player_name_current_best()
    player_name_challenger = train_manager.get_player_name_challenger()

    true_skill_rating_system.update_ratings(match_outcomes)
    elo_rating_system.update_ratings(match_outcomes)
    print(f"  - Ratings current best: {true_skill_rating_system.get_rating(player_name_current_best)}, {elo_rating_system.get_rating(player_name_current_best):0.3f}")
    print(f"  - Ratings challenger: {true_skill_rating_system.get_rating(player_name_challenger)}, {elo_rating_system.get_rating(player_name_challenger):0.3f}")

    elo_ratings = {
        "current_best": elo_rating_system.get_rating(player_name_current_best),
        "challenger": elo_rating_system.get_rating(player_name_challenger),
    }
    true_skill_ratings = {
        "current_best": true_skill_rating_system.get_rating(player_name_current_best).mu,
        "challenger": true_skill_rating_system.get_rating(player_name_challenger).mu,
    }

    print(f'  - Challenger won {int(round(challenger_win_rate * n_evaluations))}/{n_evaluations} games ({challenger_win_rate:.2%} win rate)')

    # Evaluation metrics go to both loggers. The average reward used to be written
    # to the observation logger only, unlike every other evaluation metric.
    for tensorboard in (game_tensorboard, observation_tensorboard):
        tensorboard.log_scalars("elo_rating", elo_ratings, iteration)
        tensorboard.log_scalars("true_skill_rating", true_skill_ratings, iteration)
        tensorboard.log_scalar("challenger_win_rate", challenger_win_rate, iteration)
        tensorboard.log_scalar("challenger_average_reward", challenger_average_reward, iteration)

    # 4 Let the train manager replace the current best model if the challenger was good enough
    train_manager.replace_model_with_challenger(challenger_win_rate, win_ratio_needed)
    # NOTE(review): this message uses a strict `>`; the comparison performed inside
    # replace_model_with_challenger is not visible here -- confirm both agree.
    if challenger_win_rate > win_ratio_needed:
        print(f"  - Model at iteration {iteration} supersedes previous model ({challenger_win_rate:.2%} win rate)")

    game_tensorboard.flush()
    observation_tensorboard.flush()

## Optionally, Train Observation Model a bit more

In [None]:
# Additional update steps on the observation challenger only: each step samples
# a batch from the observation replay buffer and applies one update. The mean
# policy loss over all steps is printed at the end.
observation_buffer = train_manager.replay_buffer_observation
observation_challenger = train_manager.observation_model_challenger

losses = []
for _ in range(100):
    minibatch = observation_buffer.sample(100, n_most_recent=100000)
    losses.append(observation_challenger.update(minibatch))

print(mean(step_loss.policy for step_loss in losses))

# 4. Investigation of Game scenarios

In [None]:
# Replay a fixed action sequence to reach the scenario under investigation.
game_machine = GameMachine(game)
game_machine.new_game()
for action in (0, 1, 2, 2, 1, 5):
    game_machine.play_action(action)

# Boolean mask over all state ids, marking the states of the current information set.
# The builtin `bool` is used as dtype: the `np.bool` alias was deprecated in
# NumPy 1.20 and removed in 1.24, where it raises AttributeError.
guess_state_mask = np.zeros(len(state_to_value), dtype=bool)
for s in game_machine.information_set_generator.calculate_information_set():
    guess_state_mask[state_to_value[str(s)]] = True

# NOTE(review): the observation model was configured with an input size of
# observation_tensor_shape()[0] * n_previous_observations, but a single
# observation tensor is passed here -- confirm that inference accepts this.
value, policy = train_manager.observation_model_challenger.inference(
    [game_machine.state.observation_tensor()],
    [guess_state_mask],
)
# Show the predicted probabilities for the states inside the information set.
policy[0][guess_state_mask]