In [None]:
# Switch to the parent directory before importing (presumably so that the project-local
# packages `alpha_one` and `env` imported below are found - confirm).
# NOTE: `%cd ..` is relative to the current working directory, so re-running this cell
# without restarting the kernel moves up one more level each time.
%cd ..
# Enable the autoreload extension in mode 2: all modules are reloaded before code is executed,
# so edits to project modules are picked up without restarting the kernel.
%reload_ext autoreload
%autoreload 2

In [None]:
#importing libraries
import numpy as np
import pyspiel
import math
import matplotlib.pyplot as plt
import torch
import copy
import tensorflow as tf
import pickle
import os
import ray
from datetime import datetime
from statistics import mean
from torch.distributions import Categorical
from torch.utils.tensorboard import SummaryWriter
from pathlib import Path

from scipy.stats import entropy


from open_spiel.python.algorithms.alpha_zero import model as model_lib
from open_spiel.python.algorithms.alpha_zero import evaluator as evaluator_lib
from open_spiel.python.algorithms import mcts

from alpha_one.metrics import MatchOutcome, EloRatingSystem, TrueSkillRatingSystem, calculate_entropy
from alpha_one.game.trajectory import GameTrajectory
from alpha_one.game.buffer import ReplayBuffer
from alpha_one.game.observer import get_observation_tensor_shape
from alpha_one.utils.mcts import initialize_bot, compute_mcts_policy, play_one_game, mcts_inference
from alpha_one.utils.logging import TensorboardLogger, generate_run_name
from alpha_one.model.model_manager import OpenSpielCheckpointManager, OpenSpielModelManager
from alpha_one.model.evaluation import EvaluationManager, ParallelEvaluationManager
from alpha_one.model.config import OpenSpielModelConfig
from alpha_one.train import AlphaZeroTrainManager, MCTSConfig
from alpha_one.data.replay import ReplayDataManager
from env import MODEL_SAVES_DIR, LOGS_DIR

First attempt at imitating the training procedure of AlphaZero. It comprises three main parts:  
 1. Generating training data using MCTS and the current best model
 2. Updating weights of a challenger model using the generated training data
 3. Evaluating the challenger model against the current best model. If it can beat it by a significant margin, the challenger model will from then on be used for generating the training data

# 1. Parameters

In [None]:
# Name of the game; passed to pyspiel.load_game and used in the log/model paths below.
game_name = 'leduc_poker'
# Prefix handed to generate_run_name when the run name is created.
game_prefix = 'LP-local'

In [None]:
n_iterations = 50                     # How often the whole procedure is repeated. Also corresponds to the number of evaluations

# Train samples generation
n_games_train = 100             # Number of games passed to generate_training_data per iteration to create new training samples. Original note: has to be larger than batch_size (presumably this refers to the resulting number of samples - verify)
n_games_valid = 10              # Number of games passed to generate_training_data per iteration to create new validation samples
store_replays_every = 10        # The replay buffer is stored whenever the iteration number is a multiple of this value

# Model update
n_most_recent_train_samples = 50000    # Among which training samples to choose to train current model
n_most_recent_valid_samples = 50000    # Counterpart of the above for validation samples
n_train_steps = 40                     # After how many gradient updates the new model tries to beat the current best
n_valid_steps = 10                     # Passed to train_model next to n_train_steps; presumably the number of validation batches - verify
batch_size = 8                         # Batch size passed to train_model; the blind-model loop samples batch_size*10 per update

# Evaluation
n_evaluations = 100                    # How many games should be played to measure which model is better
evaluation_strategy = 'mcts'           # Alternative: 'best_response'. NOTE(review): not referenced by any other visible cell
win_ratio_needed = None #0.55                # Minimum win ratio that the challenger model needs in order to supersede the current best model
average_reward_needed = 0.2            # Minimum average reward over current best model that the challenger model needs in order to supersede the current best model. Mutually exclusive with win_ratio_needed 

# MCTS config
UCT_C = 3                              # Amount of exploration. Apparently, for games with higher absolute rewards (e.g., Poker) this should be higher
max_mcts_simulations = 100             # Passed to MCTSConfig and initialize_bot as the simulation budget

policy_epsilon = None #0.25            # What noise epsilon to use
policy_alpha = None #1                 # What dirichlet noise alpha to use

temperature = 1                        # Passed to MCTSConfig for self-play; the evaluation config uses 0 in the same position
temperature_drop = 10                  # Passed to MCTSConfig for self-play; exact semantics are defined in MCTSConfig (not visible here)
omniscient_observer = True             # Whether the observation tensor input to the model is the total information (omniscient) or only the player's observation
use_reward_policy = True               # Whether the MCTS policy should be weighted by reward or only the explore counts are taken into account

In [None]:
# Exactly one of the two promotion criteria must be configured (the other must be None).
assert (win_ratio_needed is None) != (average_reward_needed is None), f"win_ratio_needed and average_reward_needed are mutually exclusive"

In [None]:
# MCTS settings used when generating self-play training data (passed to generate_training_data).
mcts_config = MCTSConfig(
    UCT_C, 
    max_mcts_simulations, 
    temperature, 
    temperature_drop, 
    policy_epsilon, 
    policy_alpha, 
    omniscient_observer=omniscient_observer, 
    use_reward_policy=use_reward_policy)

# MCTS settings used by the evaluation manager. Same exploration constant and simulation
# budget as above; the remaining positional slots are overridden with fixed values.
evaluation_mcts_config = MCTSConfig(
    UCT_C, 
    max_mcts_simulations, 
    0,  # same positional slot that receives `temperature` above
    None,  # same positional slot that receives `temperature_drop` above
    None,  # same positional slot that receives `policy_epsilon` above
    None,  # same positional slot that receives `policy_alpha` above
    omniscient_observer=omniscient_observer,
    use_reward_policy=use_reward_policy)

In [None]:
# Model Hyperparameters
model_type = 'mlp'
nn_width = 64
nn_depth = 2
weight_decay = 1e-5
learning_rate = 1e-5

In [None]:
# Snapshot of the run configuration; logged once via tensorboard.log_hyperparameters.
hyperparameters = {
    # Game and search
    "game_name": game_name,
    "UCT_C": UCT_C,
    "max_mcts_simulations": max_mcts_simulations,
    "n_iterations": n_iterations,

    # Data generation
    "n_games_train": n_games_train,
    "n_games_valid": n_games_valid,
    "store_replays_every": store_replays_every,

    # Model update
    "n_most_recent_train_samples": n_most_recent_train_samples,
    "n_most_recent_valid_samples": n_most_recent_valid_samples,
    "n_train_steps": n_train_steps,
    "n_valid_steps": n_valid_steps,
    "batch_size": batch_size,

    # Evaluation / promotion
    "n_evaluations": n_evaluations,
    "win_ratio_needed": win_ratio_needed,
    "average_reward_needed": average_reward_needed,

    # Policy noise
    "policy_epsilon": policy_epsilon,
    "policy_alpha": policy_alpha,

    # Temperature schedule
    "temperature": temperature,
    "temperature_drop": temperature_drop,

    # Network
    "model_type": model_type,
    "nn_width": nn_width,
    "nn_depth": nn_depth,
    "weight_decay": weight_decay,
    "learning_rate": learning_rate,

    # Observation / policy target flags
    "omniscient_observer": omniscient_observer,
    "use_reward_policy": use_reward_policy,
}

# 2. Functions

In [None]:
def mean_total_loss(losses):
    """Return the arithmetic mean of the ``total`` attribute over ``losses``.

    Args:
        losses: Iterable of objects exposing a numeric ``total`` attribute.

    Returns:
        The mean of all ``total`` values, or ``float('nan')`` when ``losses``
        is empty. ``statistics.mean`` raises ``StatisticsError`` on empty
        input, which the quarter-wise slices in the training loop would hit
        whenever fewer than four losses are available.
    """
    totals = [loss.total for loss in losses]
    return mean(totals) if totals else float("nan")

# 3. Training

In [None]:
# Optional: uncomment to (re)start a local Ray instance. The setup cell below selects
# ParallelEvaluationManager instead of EvaluationManager when ray.is_initialized() is true.
#ray.shutdown()
#ray.init(num_cpus=2)

In [None]:
# --- Run identity and game ---
run_name = generate_run_name(f'{LOGS_DIR}/{game_name}', game_prefix, match_arbitrary_suffixes=True)
print(f"Starting run: {run_name}")

game = pyspiel.load_game(game_name)

# --- Model configuration and checkpoint manager ---
model_config = OpenSpielModelConfig(
    game, model_type,
    get_observation_tensor_shape(game, omniscient_observer),
    nn_width, nn_depth,
    weight_decay, learning_rate,
    omniscient_observer=omniscient_observer,
)
model_manager = OpenSpielCheckpointManager(game_name, run_name)
model_manager.store_config(model_config)

# --- Evaluation manager: the parallel variant is used only if Ray has been initialized ---
evaluation_manager = (
    ParallelEvaluationManager(game, model_manager, n_evaluations, evaluation_mcts_config)
    if ray.is_initialized()
    else EvaluationManager(game, n_evaluations, evaluation_mcts_config)
)

# --- Replay storage lives next to the model checkpoints ---
replay_data_manager = ReplayDataManager(model_manager.model_store_path)

# --- Rating systems that track the models across evaluations ---
elo_rating_system = EloRatingSystem(40)
true_skill_rating_system = TrueSkillRatingSystem()
rating_systems = [elo_rating_system, true_skill_rating_system]

# --- Training manager tying everything together ---
train_manager = AlphaZeroTrainManager(
    game, model_manager, evaluation_manager,
    n_most_recent_train_samples, n_most_recent_valid_samples,
    rating_systems,
)

print("Num variables:", train_manager.model_challenger.num_trainable_variables)
train_manager.model_challenger.print_trainable_variables()

In [None]:
# Log this run under <LOGS_DIR>/<game_name>/<run_name> and record the configuration once.
tensorboard_log_dir = f"{LOGS_DIR}/{game_name}/{run_name}"
tensorboard = TensorboardLogger(tensorboard_log_dir)
tensorboard.log_hyperparameters(hyperparameters)

In [None]:
# Training loop: generate data -> train challenger -> evaluate -> possibly promote challenger.
for iteration in range(1, n_iterations + 1):
    print(f"Iteration {iteration}")
    
    # 1 Generate training data with current best model
    new_train_samples, new_valid_samples = train_manager.generate_training_data(n_games_train, n_games_valid, mcts_config)
    print(f'  - Generated {len(new_train_samples)} additional training samples and {len(new_valid_samples)} additional validation samples')
    tensorboard.log_scalar("n_training_samples", train_manager.replay_buffer.get_total_samples(), iteration)
    
    # 2 Repeatedly sample from training set and update weights on current model
    train_losses, valid_losses = train_manager.train_model(n_train_steps, n_valid_steps, batch_size, weight_decay)
    # Mean total loss over each quarter of the training steps, to show the trend within this iteration.
    # Built with join() rather than backslash continuations inside an f-string, which would embed
    # the source indentation of each continuation line in the printed message.
    quarter_bounds = [len(train_losses) * k // 4 for k in range(5)]
    quarter_means = [mean_total_loss(train_losses[start:stop]) for start, stop in zip(quarter_bounds, quarter_bounds[1:])]
    print("  - Training: " + " -> ".join(f"{quarter_mean:.2f}" for quarter_mean in quarter_means))
    tensorboard.log_scalars("Loss", {
        "total/train": mean([loss.total for loss in train_losses]),
        "policy/train": mean([loss.policy for loss in train_losses]),
        "value/train": mean([loss.value for loss in train_losses]),
        "total/valid": mean([loss.total for loss in valid_losses]),
        "policy/valid": mean([loss.policy for loss in valid_losses]),
        "value/valid": mean([loss.value for loss in valid_losses])
    }, iteration)
    
    # 3 Evaluate trained model against current best model
    challenger_win_rate, challenger_policies, match_outcomes, challenger_average_reward = train_manager.evaluate_challenger_model()
    
    # Player names are read before step 4, i.e. before a possible replacement of the best model.
    player_name_current_best = train_manager.get_player_name_current_best()
    player_name_challenger = train_manager.get_player_name_challenger()
    
    true_skill_rating_system.update_ratings(match_outcomes)
    elo_rating_system.update_ratings(match_outcomes)
    print(f"  - Ratings current best: {true_skill_rating_system.get_rating(player_name_current_best)}, {elo_rating_system.get_rating(player_name_current_best):0.3f}")
    print(f"  - Ratings challenger: {true_skill_rating_system.get_rating(player_name_challenger)}, {elo_rating_system.get_rating(player_name_challenger):0.3f}")
    tensorboard.log_scalars("elo_rating", {
        "current_best": elo_rating_system.get_rating(player_name_current_best),
        "challenger": elo_rating_system.get_rating(player_name_challenger)
    }, iteration)
    tensorboard.log_scalars("true_skill_rating", {
        "current_best": true_skill_rating_system.get_rating(player_name_current_best).mu,
        "challenger": true_skill_rating_system.get_rating(player_name_challenger).mu
    }, iteration)
    
    print(f'  - Challenger won {int(round(challenger_win_rate * n_evaluations))}/{n_evaluations} games ({challenger_win_rate:.2%} win rate)')
    tensorboard.log_scalar("challenger_win_rate", challenger_win_rate, iteration)
    tensorboard.log_scalar("challenger_average_reward", challenger_average_reward, iteration)
    
    # 4 Replace current best model with challenger model if it is better
    train_manager.replace_model_with_challenger(challenger_win_rate, win_ratio_needed, challenger_average_reward, average_reward_needed)
    # NOTE(review): the messages below re-evaluate the promotion condition locally; presumably this
    # mirrors the rule inside replace_model_with_challenger - confirm (e.g. > vs >=).
    if win_ratio_needed is not None:
        if challenger_win_rate > win_ratio_needed:
            print(f"  - Model at iteration {iteration} supersedes previous model ({challenger_win_rate:.2%} win rate)")
    elif average_reward_needed is not None:
        if challenger_average_reward > average_reward_needed:
            print(f"  - Model at iteration {iteration} supersedes previous model ({challenger_average_reward:.2f} average reward)")
        
    challenger_entropy = calculate_entropy(challenger_policies)
    print(f"  - Challenger entropy: {challenger_entropy:0.3f}")
    label_entropy = calculate_entropy([sample.policy for sample in new_train_samples])
    print(f"  - Label entropy: {label_entropy:0.3f}")
    
    tensorboard.log_scalars("entropy", {
        "current_best": label_entropy,
        "challenger": challenger_entropy}, iteration)
    tensorboard.log_scalar("best_model_generation", player_name_current_best, iteration)
    
    if iteration % store_replays_every == 0:
        # Report success only after the buffer has actually been stored.
        replay_data_manager.store_replays(train_manager.replay_buffer, iteration)
        print("Replay buffer stored")
    tensorboard.flush()

# Store the final replay buffer unless the last iteration already stored it inside the loop.
# Using n_iterations (equal to the last value of `iteration`) also avoids a NameError when the
# loop body never ran because n_iterations == 0.
if n_iterations > 0 and n_iterations % store_replays_every != 0:
    replay_data_manager.store_replays(train_manager.replay_buffer, n_iterations)

# 4. Train Blind Model

In [None]:
# Reload the replays stored by the training loop; the blind model below is trained on them.
replay_buffer = replay_data_manager.load_replays()

In [None]:
# Model config for the "blind" model: its input shape is taken from the 'player_observation'
# entry of the first stored sample instead of the omniscient observation shape.
# By position (compare model_config in the setup cell), 128 and 4 fill the nn_width / nn_depth slots.
blind_model_config = OpenSpielModelConfig(game, 'mlp', replay_buffer.data[0].observation['player_observation'].shape, 128, 4, weight_decay=1e-5, learning_rate = 1e-5)

In [None]:
# Separate model manager for the blind model, under the main run's name with a "-blind" suffix.
blind_model_manager = OpenSpielModelManager(game_name, f"{run_name}-blind").new_run()

In [None]:
# Persist the blind model's config, then build a fresh model from it.
blind_model_manager.store_config(blind_model_config)
blind_model = blind_model_manager.build_model(blind_model_config)

In [None]:
# Dedicated logger for the blind model, under the run name reported by its model manager.
tensorboard_blind = TensorboardLogger(f"{LOGS_DIR}/{game_name}/{blind_model_manager.get_run_name()}")

In [None]:
# Train the blind model: 100 rounds of 100 updates each, with a checkpoint and a loss log per round.
for iteration in range(100):
    # Each update draws batch_size*10 'player_observation' samples from the 500 most recent entries.
    blind_losses = [
        blind_model.update(
            replay_buffer.sample(batch_size*10, 'player_observation', n_most_recent=500)
        )
        for _ in range(100)
    ]
    blind_model_manager.store_checkpoint(blind_model, iteration)

    # Average each loss component over this round's updates before logging.
    round_loss_summary = {
        "total/train": mean([loss.total for loss in blind_losses]),
        "policy/train": mean([loss.policy for loss in blind_losses]),
        "value/train": mean([loss.value for loss in blind_losses])
    }
    tensorboard_blind.log_scalars("Loss", round_loss_summary, iteration)
    tensorboard_blind.flush()

# 5. Investigation of specific game scenarios

## 5.1. Kuhn Poker Comparison of Policies

In [None]:
# Build the scenario for this section by applying a fixed action sequence to a fresh state.
# NOTE(review): `game` is whatever pyspiel.load_game(game_name) returned in the setup cell
# (game_name defaults to 'leduc_poker' above), while this section is titled Kuhn Poker -
# confirm the sequence is legal for the loaded game.
state = game.new_initial_state()
for action in (1, 0, 0):
    state.apply_action(action)

In [None]:
from alpha_one.game.observer import OmniscientObserver
from alpha_one.alg.imperfect_information import AlphaZeroOmniscientMCTSEvaluator
from alpha_one.utils.mcts import compute_mcts_policy_reward

In [None]:
# Objects used by the comparison cells below: an observer, an untrained reference model and an MCTS bot.
# NOTE(review): this rebinds `omniscient_observer`, which the parameter cell defines as a boolean
# flag, to an observer object. Re-running the parameter-dependent cells afterwards (MCTSConfig,
# OpenSpielModelConfig, get_observation_tensor_shape) would pass this object instead of the flag.
omniscient_observer = OmniscientObserver(game)
# Freshly built from model_config; the section below labels it the untrained model.
random_model = model_manager.build_model(model_config)
# MCTS bot driven by the current best model, with the same exploration constant and simulation budget as training.
mcts_bot = initialize_bot(game, train_manager.model_current_best, uct_c=UCT_C, max_simulations=max_mcts_simulations, omniscient_observer=True)

### 5.1.1. Trained Omniscient Model

In [None]:
# Run the current best model on the omniscient observation of `state`,
# passing the state's legal-actions mask alongside it.
train_manager.model_current_best.inference([omniscient_observer.get_observation_tensor(state)], [state.legal_actions_mask()])

### 5.1.2. Untrained Omniscient Model

In [None]:
# Same query as for the trained model, but on the freshly built model, as a baseline.
random_model.inference([omniscient_observer.get_observation_tensor(state)], [state.legal_actions_mask()])

### 5.1.3. MCTS with trained Omniscient Model

In [None]:
# Run an MCTS search from `state` and print the root's average reward per visit.
root = mcts_bot.mcts_search(state)
print(root.total_reward / root.explore_count)

# Average reward per action; actions that are not children of the root keep the value 0.
policy = np.zeros(game.num_distinct_actions())
for c in root.children:
    if c.outcome is not None:
        visits = c.explore_count
    else:
        visits = c.explore_count - 1  # If node is not a leaf, one explore count is used to unfold it. To get a proper average, we have to subtract that here
    # A child without any counted visit (e.g. a non-terminal child visited exactly once) has no
    # average yet; leave it at 0 instead of dividing by zero.
    if visits > 0:
        policy[c.action] = c.total_reward / visits

# Softmax over the per-action averages.
policy = np.exp(policy) / np.sum(np.exp(policy))
print(policy)

### 5.1.4. Trained Blind Model

In [None]:
# Query the blind model on the acting player's own observation tensor of `state`.
# NOTE(review): the blind model was built with the shape of the replay buffer's
# 'player_observation' entries; presumably state.observation_tensor(player) has the same layout - confirm.
blind_model.inference([state.observation_tensor(state.current_player())], [state.legal_actions_mask()])

## 5.2. Leduc Poker

In [None]:
# Build the scenario for this section by applying a fixed action sequence to a fresh state.
# NOTE(review): the sequence is only meaningful for the game this section is titled after;
# `game` is whatever the setup cell loaded from game_name.
state = game.new_initial_state()
for action in (0, 4, 1):
    state.apply_action(action)

In [None]:
# Objects used by the comparison cells below: an observer, an untrained reference model and an MCTS bot.
# NOTE(review): this rebinds `omniscient_observer`, which the parameter cell defines as a boolean
# flag, to an observer object. Re-running the parameter-dependent cells afterwards would pass
# this object instead of the flag.
omniscient_observer = OmniscientObserver(game)
# Freshly built from model_config; the section below labels it the untrained model.
random_model = model_manager.build_model(model_config)
# MCTS bot driven by the current best model, with the same exploration constant and simulation budget as training.
mcts_bot = initialize_bot(game, train_manager.model_current_best, uct_c=UCT_C, max_simulations=max_mcts_simulations, omniscient_observer=True)

### 5.2.1. Trained Omniscient Model

In [None]:
# Run the current best model on the omniscient observation of `state`,
# passing the state's legal-actions mask alongside it.
train_manager.model_current_best.inference([omniscient_observer.get_observation_tensor(state)], [state.legal_actions_mask()])

### 5.2.2. Untrained Omniscient Model

In [None]:
# Same query as for the trained model, but on the freshly built model, as a baseline.
random_model.inference([omniscient_observer.get_observation_tensor(state)], [state.legal_actions_mask()])

### 5.2.3. MCTS with trained Omniscient Model

In [None]:
# Run an MCTS search from `state` and turn the resulting root node into a policy.
# The third argument (1) is passed positionally; presumably it is the temperature - confirm
# against compute_mcts_policy.
root = mcts_bot.mcts_search(state)
compute_mcts_policy(game, root, 1)

### 5.2.4. Trained Blind Model

In [None]:
# Query the blind model on the acting player's own observation tensor of `state`.
# NOTE(review): presumably this tensor matches the 'player_observation' layout the blind
# model was trained on - confirm.
blind_model.inference([state.observation_tensor(state.current_player())], [state.legal_actions_mask()])

## 5.3. Connect Four

In [None]:
# Build the scenario for this section by applying a fixed opening to a fresh state and printing it.
# NOTE(review): the sequence is only meaningful for the game this section is titled after;
# `game` is whatever the setup cell loaded from game_name.
# Further moves the author left disabled, in order: 2, 3, 2.
state = game.new_initial_state()
for action in (3, 3, 2):
    state.apply_action(action)
print(state.observation_string())

In [None]:
# Query the challenger model on the state's default observation tensor with its legal-actions mask.
train_manager.model_challenger.inference([state.observation_tensor()], [state.legal_actions_mask()])

In [None]:
# Same query on the current best model, for comparison with the challenger.
train_manager.model_current_best.inference([state.observation_tensor()], [state.legal_actions_mask()])

In [None]:
# MCTS-based inference with the challenger model on `state`, using the training search settings.
mcts_inference(game, train_manager.model_challenger, state, uct_c=UCT_C, max_simulations=max_mcts_simulations, temperature=temperature)

In [None]:
# Load a stored model and query it on the same state.
# NOTE(review): 427 is hard-coded (presumably a checkpoint/iteration id); the training loop above
# runs only n_iterations iterations - confirm that such a checkpoint exists for this run.
model_loaded = model_manager.load_model(427)
model_loaded.inference([state.observation_tensor()], [state.legal_actions_mask()])

In [None]:
# Compare the loaded model against the current best model.
# NOTE(review): neither `evaluate_challenger_model` nor `model_current_best` is defined or imported
# in any visible cell, so this raises NameError on a fresh kernel. The visible counterparts are
# train_manager.evaluate_challenger_model() (called without arguments in the training loop) and
# train_manager.model_current_best.
_ = evaluate_challenger_model(model_current_best, model_loaded)