In [1]:
%cd ..
%reload_ext autoreload
%autoreload 2

/mnt/d/ownCloud/Uni/Semester Ma 5/Advanced Deep Learning for Robotics (IN2349)/Project/tum-adlr-ws20-9


In [2]:
#importing libraries
import numpy as np
import pyspiel
import math
import matplotlib.pyplot as plt
import torch
import copy
import tensorflow as tf
import pickle
from statistics import mean
from torch.distributions import Categorical
from scipy.stats import entropy

from open_spiel.python.algorithms.alpha_zero import model as model_lib
from open_spiel.python.algorithms.alpha_zero import evaluator as evaluator_lib
from open_spiel.python.algorithms import mcts

from alpha_one.metrics import MatchOutcome, EloRatingSystem, TrueSkillRatingSystem
from alpha_one.utils.logging import TensorboardLogger

First attempt at imitating the training procedure of AlphaZero. It is comprised of 3 main parts:  
 1. Generating training data using MCTS and the current best model
 2. Updating weights of a challenger model using the generated training data
 3. Evaluating the challenger model against the current best model. If it can beat it by a significant margin, the challenger model will from then on be used for generating the training data

# 1. Parameters

In [10]:
game_name = 'connect_four'

model_saves_path = f'../model_saves/{game_name}'
tensorboard_log_dir = f'../tensorboard-logs/{game_name}'

UCT_C = math.sqrt(2)
max_mcts_simulations = 100
n_selfplay_simulations = 10           # How many play throughs should be generated by best model for training. (Training set size)
n_train_steps = 50                     # After how many gradient updates the new model tries to beat the current best
n_iterations = 100                     # How often the whole procedure is repeated. Also corresponds to the number of evaluations
n_evaluations = 50                     # How many games should be played to measure which model is better
batch_size = 256
evaluation_strategy = 'mcts'           # 'best_response'
n_most_recent_train_samples = 50000    # Among which training samples to choose to train current model

policy_epsilon = None #0.25                             # What noise epsilon to use
policy_alpha = None #1                                  # What dirichlet noise alpha to use

In [11]:
model_type = 'mlp'
nn_width = 10
nn_depth = 4
weight_decay = 1e-5
learning_rate = 5e-4

# 2. Functions

In [12]:
# build the tensorflow model
def build_model(game):
    return model_lib.Model.build_model(
      model_type, game.observation_tensor_shape(), game.num_distinct_actions(),
      nn_width=nn_width, nn_depth=nn_depth, weight_decay=weight_decay, learning_rate=learning_rate, path=model_saves_path)

## 2.1 Main methods

In [13]:
def initialize_bot(game, model, uct_c, max_simulations, policy_epsilon=None, policy_alpha=None):
    
    if policy_epsilon == None or policy_alpha == None:
        noise = None
    else:
        noise = (policy_epsilon, policy_alpha)

    az_evaluator = evaluator_lib.AlphaZeroEvaluator(game, model)

    bot = mcts.MCTSBot(
          game,
          uct_c,
          max_simulations,
          az_evaluator,
          solve=False,
          dirichlet_noise=noise,
          child_selection_fn=mcts.SearchNode.puct_value,
          verbose=False)
    
    return bot

In [14]:
def executeEpisode(game, temperature):

    rng = np.random.RandomState(42)
    train_inputs = []
    state = game.new_initial_state()
    
    mcts_bot = initialize_bot(game, model_current_best, UCT_C, max_mcts_simulations, policy_epsilon, policy_alpha)
    
    observations = []
    action_masks = []
    policies = []
        
    while not state.is_terminal():
        root = mcts_bot.mcts_search(state)
        policy = np.zeros(game.num_distinct_actions())
        
        for c in root.children:
            policy[c.action] = c.explore_count
        policy = policy ** (1 / temperature)
        policy /= policy.sum()
        action = np.random.choice(len(policy), p=policy)
        obs = state.observation_tensor()
        act_mask = state.legal_actions_mask()
        
        observations.append(obs)
        action_masks.append(act_mask)
        policies.append(policy)
    
        # train_inputs.append(model_lib.TrainInput(obs, act_mask, policy, value=1))              
   
        state.apply_action(action) 
    
    final_game_reward = state.player_reward(0)
    train_inputs = [model_lib.TrainInput(obs, act_mask, policy, value=final_game_reward) for obs, act_mask, policy in zip(observations, action_masks, policies)]
    
    return train_inputs

In [15]:
def generate_training_data():
    train_inputs = []
    for i in range(n_selfplay_simulations):
        train_inputs.extend(executeEpisode(game, 1))
    return train_inputs

In [16]:
def train_model(train_inputs):
    losses = []   
    for _ in range(n_train_steps): 
        train_set_idx = np.random.choice(range(len(train_inputs)), batch_size)
        loss = model.update([train_inputs[i] for i in train_set_idx])
        losses.append(loss)
    return losses
    

In [17]:
def evaluate_challenger_model(model_challenger, model_current_best):
    rng = np.random.RandomState(42)
    
    challenger_results = []
    challenger_policies = []
    match_outcomes = []
    for _ in range(n_evaluations):
        if evaluation_strategy == 'mcts':
            mcts_bot_best_model = initialize_bot(game, model_current_best, UCT_C, max_mcts_simulations, policy_epsilon, policy_alpha)
            mcts_bot_challenger = initialize_bot(game, model_challenger, UCT_C, max_mcts_simulations, policy_epsilon, policy_alpha)
        
        model_challenger_player = np.random.choice([0, 1]) # ensure that each model will play as each player
        state = game.new_initial_state()
        current_turn = 0
        while not state.is_terminal():
            # model_current_turn = model if state.current_player() == model_challenger_player else model_current_best
            
            if evaluation_strategy == 'mcts':
                mcts_bot_current_turn = mcts_bot_challenger if state.current_player() == model_challenger_player else mcts_bot_best_model
                root = mcts_bot_current_turn.mcts_search(state)
                if current_turn < 30:
                    policy = compute_mcts_policy(root, 1) # Choose action proportional to visit count (Exploration)
                else:
                    policy = compute_mcts_policy(root, 0) # Always choose action with highest visit count
            else:
                pass
                # obs = state.observation_tensor()
                # act_mask = state.legal_actions_mask()
                # value, policy = model_current_turn.inference([obs], [act_mask])
                # TODO: implement
            action = np.random.choice(range(len(policy)), p=policy)
            state.apply_action(action)
            current_turn += 1
            if state.current_player() == model_challenger_player:
                challenger_policies.append(policy)
        
        challenger_reward = state.player_reward(model_challenger_player)
        challenger_results.append(challenger_reward)
        match_outcomes.append(
            MatchOutcome.win(player_id_challenger, player_id_current_best) 
            if challenger_reward == 1 else 
            MatchOutcome.defeat(player_id_challenger, player_id_current_best))
    
    n_challenger_wins = (np.array(challenger_results) == 1).sum()
    challenger_win_rate = n_challenger_wins / n_evaluations
    return challenger_win_rate, challenger_policies, match_outcomes


## 2.2 Helper methods

In [18]:
def compute_mcts_policy(root, temperature):
    policy = np.zeros(game.num_distinct_actions())
        
    for c in root.children:
        policy[c.action] = c.explore_count
    if temperature == 0 or temperature is None:
        # Create probability distribution with peak at most likely action
        new_policy = np.zeros(game.num_distinct_actions())
        new_policy[policy.argmax(-1)] = 1
        policy = new_policy
    else:
        policy = policy ** (1 / temperature)
        policy /= policy.sum()
    return policy

In [19]:
def mean_total_loss(losses):
    return mean([loss.total for loss in losses])

def load_model(iteration):
    new_model = build_model(game)
    new_model.load_checkpoint(f"{model._path}/checkpoint-{iteration}")
    return new_model

def copy_and_create_checkpoint(iteration):
    # Generate checkpoint
    model.save_checkpoint(iteration)
    return load_model(iteration)

In [20]:
import numpy as np
from scipy.stats import entropy
from statistics import mean

def calculate_entropy(policies):
    return mean([entropy(policy) for policy in policies])

# 3. Training

In [21]:
# Setup model and game
game = pyspiel.load_game(game_name)
model = build_model(game)
print("Num variables:", model.num_trainable_variables)
model.print_trainable_variables()
model_current_best = copy_and_create_checkpoint(0)

Instructions for updating:
If using Keras pass *_constraint arguments to layers.
Num variables: 1908
torso_0_dense/kernel:0: (126, 10)
torso_0_dense/bias:0: (10,)
torso_1_dense/kernel:0: (10, 10)
torso_1_dense/bias:0: (10,)
torso_2_dense/kernel:0: (10, 10)
torso_2_dense/bias:0: (10,)
torso_3_dense/kernel:0: (10, 10)
torso_3_dense/bias:0: (10,)
policy_dense/kernel:0: (10, 10)
policy_dense/bias:0: (10,)
policy/kernel:0: (10, 7)
policy/bias:0: (7,)
value_dense/kernel:0: (10, 10)
value_dense/bias:0: (10,)
value/kernel:0: (10, 1)
value/bias:0: (1,)
INFO:tensorflow:Restoring parameters from ../model_saves/connect_four/checkpoint-0


In [22]:
# Setup rating systems for evaluation
elo_rating_system = EloRatingSystem(40)
true_skill_rating_system = TrueSkillRatingSystem()

player_id_current_best = 0
player_id_challenger = 1

In [16]:
# Setup Tensorboard
# TODO: generate unique model name
run_name = 'run-1'
tensorboard = TensorboardLogger(f"{tensorboard_log_dir}/{run_name}")

In [18]:
# Training loop
train_inputs = []
for iteration in range(n_iterations):
    print(f"Iteration {iteration}")
    
    # 1 Generate training data with current best model
    new_training_data = generate_training_data()
    train_inputs.extend(new_training_data)
    train_inputs = train_inputs[-n_most_recent_train_samples:]
    print(f'  - Generated {len(train_inputs)} additional training samples')
    
    # 2 Repeatedly sample from training set and update weights on current model
    losses = train_model(train_inputs)
    print(f'  - Training: {mean_total_loss(losses[:int(len(losses)/4)]):.2f} \
            -> {mean_total_loss(losses[int(len(losses)/4):int(2 * len(losses)/4)]):.2f} \
            -> {mean_total_loss(losses[int(2 * len(losses)/4):int(3 * len(losses)/4)]):.2f} \
            -> {mean_total_loss(losses[int(3 * len(losses)/4):]):.2f}')
    tensorboard.log_scalar("Loss", mean_total_loss(losses), iteration)
    
    # 3 Evaluate trained model against current best model
    challenger_win_rate, challenger_policies, match_outcomes = evaluate_challenger_model(model, model_current_best)
    
    true_skill_rating_system.update_ratings(match_outcomes)
    elo_rating_system.update_ratings(match_outcomes)
    print(f"  - Ratings current best: {true_skill_rating_system.get_rating(player_id_current_best)}, {elo_rating_system.get_rating(player_id_current_best):0.3f}")
    print(f"  - Ratings challenger: {true_skill_rating_system.get_rating(player_id_challenger)}, {elo_rating_system.get_rating(player_id_challenger):0.3f}")
    tensorboard.log_scalar("elo_rating/current_best", elo_rating_system.get_rating(player_id_current_best), iteration)
    tensorboard.log_scalar("elo_rating/challenger", elo_rating_system.get_rating(player_id_challenger), iteration)
    tensorboard.log_scalar("true_skill_rating/current_best", true_skill_rating_system.get_rating(player_id_current_best).mu, iteration)
    tensorboard.log_scalar("true_skill_rating/challenger", true_skill_rating_system.get_rating(player_id_challenger).mu, iteration)
    
    print(f'  - Challenger won {int(round(challenger_win_rate * n_evaluations))}/{n_evaluations} games ({challenger_win_rate:.2%} win rate)')
    if challenger_win_rate > 0.55:
        print(f"  - Model at iteration {iteration} supersedes previous model ({challenger_win_rate:.2%} win rate)")
        model_current_best = copy_and_create_checkpoint(iteration)
        player_id_current_best = player_id_challenger
        
    challenger_entropy = calculate_entropy(challenger_policies)
    print(f"  - Challenger entropy: {challenger_entropy:0.3f}")
    label_entropy = calculate_entropy([sample.policy for sample in new_training_data])
    print(f"  - Label entropy: {label_entropy:0.3f}")
    
    tensorboard.log_scalar("entropy/current_best", label_entropy, iteration)
    tensorboard.log_scalar("entropy/challenger", challenger_entropy, iteration)
    
    player_id_challenger += 1

Iteration 0
  - Generated 211 additional training samples
  - Training: 3.18             -> 2.94             -> 2.87             -> 2.84
  - Ratings current best: trueskill.Rating(mu=25.379, sigma=1.155), 29.978
  - Ratings challenger: trueskill.Rating(mu=24.621, sigma=1.155), -29.978
  - Challenger won 23.0/50 games (46.00% win rate)
  - Challenger entropy: 1.654
  - Label entropy: 1.683
Iteration 1
  - Generated 382 additional training samples
  - Training: 2.86             -> 2.85             -> 2.85             -> 2.84
  - Ratings current best: trueskill.Rating(mu=25.259, sigma=0.899), -8.035
  - Ratings challenger: trueskill.Rating(mu=26.520, sigma=1.106), 38.014
  - Challenger won 29.0/50 games (58.00% win rate)
  - Model at iteration 1 supersedes previous model (58.00% win rate)
INFO:tensorflow:Restoring parameters from ../model_saves/connect_four/checkpoint-1
  - Challenger entropy: 1.728
  - Label entropy: 1.647


ValueError: Tensor("entropy/current_best/write_summary/Identity:0", shape=(), dtype=float32, device=/device:CPU:0) must be from the same graph as Tensor("create_file_writer/SummaryWriter:0", shape=(), dtype=resource, device=/device:CPU:0).

In [38]:
from torch.utils.tensorboard import SummaryWriter

In [40]:
writer = SummaryWriter(tensorboard_log_dir)
writer.add_scalar("hi", 1)
writer.flush()

In [25]:
tensorboard_log_dir

'../tensorboard-logs/connect_four'

In [30]:
writer = tf.summary.create_file_writer(tensorboard_log_dir)


In [33]:
a = writer.flush()

# 4. Investigation of specific game scenarios

In [48]:
state = game.new_initial_state()
state.apply_action(3)
state.apply_action(3)
state.apply_action(2)
#state.apply_action(2)
#state.apply_action(3)
#state.apply_action(2)
print(state.observation_string())

.......
.......
.......
.......
...o...
..xx...



In [49]:
model.inference([state.observation_tensor()], [state.legal_actions_mask()])

[array([[0.40367833]], dtype=float32),
 array([[0.09463616, 0.13984184, 0.13587886, 0.15393914, 0.2371599 ,
         0.10649913, 0.13204496]], dtype=float32)]

In [50]:
model_current_best.inference([state.observation_tensor()], [state.legal_actions_mask()])

[array([[0.21628237]], dtype=float32),
 array([[0.09736948, 0.13812093, 0.13371477, 0.15287039, 0.23582101,
         0.10691894, 0.13518453]], dtype=float32)]

In [51]:
model_loaded = load_model(0)
model_loaded.inference([state.observation_tensor()], [state.legal_actions_mask()])

INFO:tensorflow:Restoring parameters from ../../model_saves/connect_four/checkpoint-0


[array([[0.00363491]], dtype=float32),
 array([[0.13665117, 0.14974193, 0.14714976, 0.13916534, 0.14012644,
         0.14000261, 0.14716277]], dtype=float32)]

In [None]:
_ = evaluate_challenger_model(model_current_best, model_loaded)