In [1]:
%cd ..
%reload_ext autoreload
%autoreload 2

/mnt/d/ownCloud/Uni/Semester Ma 5/Advanced Deep Learning for Robotics (IN2349)/Project/tum-adlr-ws20-9


In [2]:
#importing libraries
import numpy as np
import pyspiel
import math
import matplotlib.pyplot as plt
import torch
import copy
import tensorflow as tf
import pickle
import os
import ray
from datetime import datetime
from statistics import mean
from torch.distributions import Categorical
from torch.utils.tensorboard import SummaryWriter
from pathlib import Path

from scipy.stats import entropy


from open_spiel.python.algorithms.alpha_zero import model as model_lib
from open_spiel.python.algorithms.alpha_zero import evaluator as evaluator_lib
from open_spiel.python.algorithms import mcts

from alpha_one.metrics import MatchOutcome, EloRatingSystem, TrueSkillRatingSystem, calculate_entropy
from alpha_one.game.trajectory import GameTrajectory
from alpha_one.game.buffer import ReplayBuffer
from alpha_one.utils.mcts import initialize_bot, compute_mcts_policy, play_one_game, mcts_inference
from alpha_one.utils.logging import TensorboardLogger, generate_run_name
from alpha_one.model.model_manager import OpenSpielModelManager
from alpha_one.model.evaluation import EvaluationManager, ParallelEvaluationManager
from alpha_one.model.config import OpenSpielModelConfig
from alpha_one.train import AlphaZeroTrainManager, MCTSConfig
from env import MODEL_SAVES_DIR, LOGS_DIR

First attempt at imitating the training procedure of AlphaZero. It is comprised of 3 main parts:  
 1. Generating training data using MCTS and the current best model
 2. Updating weights of a challenger model using the generated training data
 3. Evaluating the challenger model against the current best model. If it can beat it by a significant margin, the challenger model will from then on be used for generating the training data

# 1. Parameters

In [3]:
game_name = 'connect_four'
game_prefix = 'C4'

In [4]:
n_iterations = 1000                     # How often the whole procedure is repeated. Also corresponds to the number of evaluations

# Train samples generation
n_games_train = 100             # How many new states will be generated by the best model via self-play for training (Training set size delta). Has to be larger than batch_size
n_games_valid = 10

# Model update
n_most_recent_train_samples = 50000    # Among which training samples to choose to train current model
n_most_recent_valid_samples = 50000
n_train_steps = 50                     # After how many gradient updates the new model tries to beat the current best
n_valid_steps = 5
batch_size = 256

# Evaluation
n_evaluations = 100                     # How many games should be played to measure which model is better
evaluation_strategy = 'mcts'           # 'best_response'
win_ratio_needed = 0.55                # Minimum win ratio that the challenger model needs in order to supersede the current best model

# MCTS config
UCT_C = math.sqrt(2)
max_mcts_simulations = 100

policy_epsilon = None #0.25            # What noise epsilon to use
policy_alpha = None #1                 # What dirichlet noise alpha to use

temperature = 1
temperature_drop = 10

In [5]:
mcts_config = MCTSConfig(UCT_C, max_mcts_simulations, temperature, temperature_drop, policy_epsilon, policy_alpha)

In [6]:
# Model Hyperparameters
model_type = 'mlp'
nn_width = 64
nn_depth = 4
weight_decay = 1e-5
learning_rate = 5e-4

In [7]:
hyperparameters = dict(
    game_name=game_name,
    UCT_C=UCT_C,
    max_mcts_simulations=max_mcts_simulations,
    n_iterations=n_iterations,
    
    n_games_train=n_games_train,
    n_games_valid=n_games_valid,
    
    n_most_recent_train_samples=n_most_recent_train_samples,
    n_most_recent_valid_samples=n_most_recent_valid_samples,
    n_train_steps=n_train_steps,
    n_valid_steps=n_valid_steps,
    batch_size=batch_size,
    
    n_evaluations=n_evaluations,
    win_ratio_needed=win_ratio_needed,
    
    policy_epsilon=policy_epsilon,
    policy_alpha=policy_alpha,
    
    temperature=temperature,
    temperature_drop=temperature_drop,
    
    model_type=model_type,
    nn_width=nn_width,
    nn_depth=nn_depth,
    weight_decay=weight_decay,
    learning_rate=learning_rate
)

# 2. Functions

In [8]:
def mean_total_loss(losses):
    return mean([loss.total for loss in losses])

# 3. Training

In [9]:
ray.shutdown()
ray.init(num_cpus=2)

2021-01-08 21:25:14,457	INFO services.py:1171 -- View the Ray dashboard at [1m[32mhttp://127.0.0.1:8265[39m[22m


{'node_ip_address': '192.168.188.23',
 'raylet_ip_address': '192.168.188.23',
 'redis_address': '192.168.188.23:6379',
 'object_store_address': '/tmp/ray/session_2021-01-08_21-25-11_759585_13841/sockets/plasma_store',
 'raylet_socket_name': '/tmp/ray/session_2021-01-08_21-25-11_759585_13841/sockets/raylet',
 'webui_url': '127.0.0.1:8265',
 'session_dir': '/tmp/ray/session_2021-01-08_21-25-11_759585_13841',
 'metrics_export_port': 58900,
 'node_id': 'd2779ed438e24506e1eb36176772ae26b26812ed'}

In [10]:
# Setup model and game
run_name = generate_run_name(f'{LOGS_DIR}/{game_name}', game_prefix)
print(f"Starting run: {run_name}")

game = pyspiel.load_game(game_name)

model_config = OpenSpielModelConfig(game, model_type, nn_width, nn_depth, weight_decay, learning_rate)
model_manager = OpenSpielModelManager(f"{game_name}/{run_name}")
model_manager.store_config(model_config)

if ray.is_initialized():
    evaluation_manager = ParallelEvaluationManager(game, model_manager, n_evaluations, mcts_config)
else:
    evaluation_manager = EvaluationManager(game, n_evaluations, mcts_config)
train_manager = AlphaZeroTrainManager(game, model_manager, evaluation_manager, n_most_recent_train_samples, n_most_recent_valid_samples)

print("Num variables:", train_manager.model_challenger.num_trainable_variables)
train_manager.model_challenger.print_trainable_variables()

Starting run: C4-27
Instructions for updating:
If using Keras pass *_constraint arguments to layers.
INFO:tensorflow:Restoring parameters from /home/tobias/Uni/Semester Ma 5/Advanced Deep Learning for Robotics (IN2349)/Project/model_saves/connect_four/C4-27/checkpoint-0
AlphaZero Train manager will use parallelism
Num variables: 29448
torso_0_dense/kernel:0: (126, 64)
torso_0_dense/bias:0: (64,)
torso_1_dense/kernel:0: (64, 64)
torso_1_dense/bias:0: (64,)
torso_2_dense/kernel:0: (64, 64)
torso_2_dense/bias:0: (64,)
torso_3_dense/kernel:0: (64, 64)
torso_3_dense/bias:0: (64,)
policy_dense/kernel:0: (64, 64)
policy_dense/bias:0: (64,)
policy/kernel:0: (64, 7)
policy/bias:0: (7,)
value_dense/kernel:0: (64, 64)
value_dense/bias:0: (64,)
value/kernel:0: (64, 1)
value/bias:0: (1,)


In [11]:
# Setup rating systems for evaluation
elo_rating_system = EloRatingSystem(40)
true_skill_rating_system = TrueSkillRatingSystem()

In [12]:
tensorboard = TensorboardLogger(f"{LOGS_DIR}/{game_name}/{run_name}")
tensorboard.log_hyperparameters(hyperparameters)

In [None]:
# Training loop
for iteration in range(1, n_iterations + 1):
    print(f"Iteration {iteration}")
    
    # 1 Generate training data with current best model
    new_train_samples, new_valid_samples = train_manager.generate_training_data(n_games_train, n_games_train, mcts_config)
    print(f'  - Generated {len(new_train_samples)} additional training samples and {len(new_valid_samples)} additional validation samples')
    tensorboard.log_scalar("n_training_samples", train_manager.replay_buffer.get_total_samples(), iteration)
    
    # 2 Repeatedly sample from training set and update weights on current model
    train_losses, valid_losses = train_manager.train_model(n_train_steps, n_valid_steps, batch_size, weight_decay)
    print(f'  - Training: {mean_total_loss(train_losses[:int(len(train_losses)/4)]):.2f} \
            -> {mean_total_loss(train_losses[int(len(train_losses)/4):int(2 * len(train_losses)/4)]):.2f} \
            -> {mean_total_loss(train_losses[int(2 * len(train_losses)/4):int(3 * len(train_losses)/4)]):.2f} \
            -> {mean_total_loss(train_losses[int(3 * len(train_losses)/4):]):.2f}')
    tensorboard.log_scalars("Loss", {
        "total/train": mean([loss.total for loss in train_losses]),
        "policy/train": mean([loss.policy for loss in train_losses]),
        "value/train": mean([loss.value for loss in train_losses]),
        "total/valid": mean([loss.total for loss in valid_losses]),
        "policy/valid": mean([loss.policy for loss in valid_losses]),
        "value/valid": mean([loss.value for loss in valid_losses])
    }, iteration)
    
    # 3 Evaluate trained model against current best model
    challenger_win_rate, challenger_policies, match_outcomes = train_manager.evaluate_challenger_model()
    
    player_name_current_best = train_manager.get_player_name_current_best()
    player_name_challenger = train_manager.get_player_name_challenger()
    
    true_skill_rating_system.update_ratings(match_outcomes)
    elo_rating_system.update_ratings(match_outcomes)
    print(f"  - Ratings current best: {true_skill_rating_system.get_rating(player_name_current_best)}, {elo_rating_system.get_rating(player_name_current_best):0.3f}")
    print(f"  - Ratings challenger: {true_skill_rating_system.get_rating(player_name_challenger)}, {elo_rating_system.get_rating(player_name_challenger):0.3f}")
    tensorboard.log_scalars("elo_rating", {
        "current_best": elo_rating_system.get_rating(player_name_current_best),
        "challenger": elo_rating_system.get_rating(player_name_challenger)
    }, iteration)
    tensorboard.log_scalars("true_skill_rating", {
        "current_best": true_skill_rating_system.get_rating(player_name_current_best).mu,
        "challenger": true_skill_rating_system.get_rating(player_name_challenger).mu
    }, iteration)
    
    print(f'  - Challenger won {int(round(challenger_win_rate * n_evaluations))}/{n_evaluations} games ({challenger_win_rate:.2%} win rate)')
    tensorboard.log_scalar("challenger_win_rate", challenger_win_rate, iteration)
    
    # 4 Replace current best model with challenger model if it is better
    train_manager.replace_model_with_challenger(challenger_win_rate, win_ratio_needed, iteration)
    if challenger_win_rate > win_ratio_needed:
        print(f"  - Model at iteration {iteration} supersedes previous model ({challenger_win_rate:.2%} win rate)")
        true_skill_rating_system.add_player(train_manager.get_player_name_challenger(), true_skill_rating_system.get_rating(player_name_challenger))
        elo_rating_system.add_player(train_manager.get_player_name_challenger(), elo_rating_system.get_rating(player_name_challenger))
        
    challenger_entropy = calculate_entropy(challenger_policies)
    print(f"  - Challenger entropy: {challenger_entropy:0.3f}")
    label_entropy = calculate_entropy([sample.policy for sample in new_train_samples])
    print(f"  - Label entropy: {label_entropy:0.3f}")
    
    tensorboard.log_scalars("entropy", {
        "current_best": label_entropy,
        "challenger": challenger_entropy}, iteration)
    tensorboard.log_scalar("best_model_generation", player_name_current_best, iteration)
    
    tensorboard.flush()

Iteration 1


[2m[36m(pid=13942)[0m Instructions for updating:
[2m[36m(pid=13942)[0m If using Keras pass *_constraint arguments to layers.
[2m[36m(pid=13943)[0m Instructions for updating:
[2m[36m(pid=13943)[0m If using Keras pass *_constraint arguments to layers.
[2m[36m(pid=13943)[0m 2021-01-08 21:25:20.443946: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcuda.so.1
[2m[36m(pid=13943)[0m 2021-01-08 21:25:20.444956: E tensorflow/stream_executor/cuda/cuda_driver.cc:313] failed call to cuInit: CUDA_ERROR_UNKNOWN: unknown error
[2m[36m(pid=13943)[0m 2021-01-08 21:25:20.445034: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (DESKTOP-113SD1T): /proc/driver/nvidia/version does not exist
[2m[36m(pid=13943)[0m 2021-01-08 21:25:20.445352: I tensorflow/core/platform/cpu_feature_guard.cc:143] Your CPU supports instructions that this TensorFlow binary was not comp

  - Generated 2510 additional training samples and 2512 additional validation samples
  - Training: 2.70             -> 2.65             -> 2.59             -> 2.53
  - Ratings current best: trueskill.Rating(mu=23.986, sigma=0.896), -40.283
  - Ratings challenger: trueskill.Rating(mu=26.014, sigma=0.896), 40.283
  - Challenger won 62/100 games (62.00% win rate)
INFO:tensorflow:Restoring parameters from /home/tobias/Uni/Semester Ma 5/Advanced Deep Learning for Robotics (IN2349)/Project/model_saves/connect_four/C4-27/checkpoint-1
  - Model at iteration 1 supersedes previous model (62.00% win rate)
  - Challenger entropy: 0.653
  - Label entropy: 0.711
Iteration 2


# 4. Investigation of specific game scenarios

In [17]:
state = game.new_initial_state()
state.apply_action(3)
state.apply_action(3)
state.apply_action(2)
#state.apply_action(2)
#state.apply_action(3)
#state.apply_action(2)
print(state.observation_string())

.......
.......
.......
.......
...o...
..xx...



In [19]:
train_manager.model_challenger.inference([state.observation_tensor()], [state.legal_actions_mask()])

[array([[0.20870192]], dtype=float32),
 array([[0.00884751, 0.09215064, 0.7334334 , 0.04794201, 0.06036935,
         0.00714639, 0.05011073]], dtype=float32)]

In [20]:
train_manager.model_current_best.inference([state.observation_tensor()], [state.legal_actions_mask()])

[array([[0.5216639]], dtype=float32),
 array([[0.04359566, 0.02505189, 0.75428474, 0.02382881, 0.14099368,
         0.00186914, 0.01037611]], dtype=float32)]

In [25]:
mcts_inference(game, train_manager.model_challenger, state, uct_c=UCT_C, max_simulations=max_mcts_simulations, temperature=temperature)

array([0.01010101, 0.22222222, 0.72727273, 0.01010101, 0.01010101,
       0.01010101, 0.01010101])

In [39]:
model_loaded = model_manager.load_model(427)
model_loaded.inference([state.observation_tensor()], [state.legal_actions_mask()])

INFO:tensorflow:Restoring parameters from /home/tobias/Uni/Semester Ma 5/Advanced Deep Learning for Robotics (IN2349)/Project/model_saves/connect_four/C4-13/checkpoint-427


[array([[0.5216639]], dtype=float32),
 array([[0.04359566, 0.02505189, 0.75428474, 0.02382881, 0.14099368,
         0.00186914, 0.01037611]], dtype=float32)]

In [None]:
_ = evaluate_challenger_model(model_current_best, model_loaded)