In [None]:
# Move the working directory one level up (presumably to the project root so
# that `alpha_one` and `env` resolve as importable packages — TODO confirm).
# NOTE(review): `%cd ..` is not idempotent; re-running this cell without a
# kernel restart moves up one more directory each time.
%cd ..
# Load (or reload) the autoreload extension and use mode 2, which re-imports
# all modules before each cell runs so edits to project code are picked up.
%reload_ext autoreload
%autoreload 2

In [None]:
# importing the libraries

import pyspiel
import tensorflow.compat.v1 as tf
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm

from math import sqrt

from open_spiel.python import policy
from open_spiel.python import rl_environment
from open_spiel.python.algorithms import exploitability
from open_spiel.python.algorithms import policy_gradient

from open_spiel.python.algorithms import mcts
from open_spiel.python.algorithms.alpha_zero import evaluator as evaluator_lib
from open_spiel.python.algorithms.alpha_zero import model as model_lib

from alpha_one.model.model_manager import OpenSpielModelManager, PolicyGradientCheckpointManager, PolicyGradientModelManager, PolicyGradientConfig
from alpha_one.utils.mcts import initialize_bot, MCTSConfig
from alpha_one.utils.logging import generate_run_name
from alpha_one.model.agent import PolicyGradientAgent, MCTSAgent
from env import MODEL_SAVES_DIR

# 1. Setup Game and Model Managers

In [None]:
# Load the game and open a fresh policy-gradient run for it.
game_name = "connect_four"
game = pyspiel.load_game(game_name)

pg_checkpoint_manager = PolicyGradientModelManager(game_name).new_run()

# Two-player RL environment; the agents below are sized from its specs.
num_players = 2
env_configs = dict(players=num_players)
env = rl_environment.Environment(game, **env_configs)

observation_spec = env.observation_spec()
action_spec = env.action_spec()
info_state_size = observation_spec["info_state"][0]
num_actions = action_spec["num_actions"]

# 2. Configure Policy Gradient

In [None]:
def _pg_config_for(seat):
    """Build the QPG policy-gradient configuration for the given player seat."""
    return PolicyGradientConfig(
        player_id=seat,
        info_state_size=info_state_size,
        num_actions=num_actions,
        loss_str="qpg",
        hidden_layers_sizes=[50, 50, 50, 50, 50],
        batch_size=32,
        entropy_cost=0.001,
        critic_learning_rate=0.001,
        pi_learning_rate=0.001,
        num_critic_before_pi=4,
    )


# One configuration per seat (player 0 and player 1), identical except for the id.
pg_configs = [_pg_config_for(0), _pg_config_for(1)]

In [None]:
# Build one model per config; the resulting array is indexed by player id during training.
agents = np.array(list(map(pg_checkpoint_manager.build_model, pg_configs)))

# 3. Train Policy Gradient Baseline

In [None]:
# How many self-play episodes the policy-gradient agents are trained for.
num_episodes = 100_000

In [None]:
# Self-play training: in every episode the agent whose turn it is chooses the
# next action until the environment reports the final time step.
for episode in tqdm(range(num_episodes)):
    time_step = env.reset()
    while not time_step.last():
        current_player = time_step.observations["current_player"]
        step_output = agents[current_player].step(time_step)
        # The environment expects a list of actions, here holding the single move.
        time_step = env.step([step_output.action])

    # Episode is over: step every agent once more with the final info state.
    for finished_agent in agents:
        finished_agent.step(time_step)

In [None]:
# Persist the trained policy-gradient agent for player 0 (config plus checkpoint 0).
first_player_config = pg_configs[0]
first_player_agent = agents[0]
pg_checkpoint_manager.store_config(first_player_config)
pg_checkpoint_manager.store_checkpoint(first_player_agent, 0)

# 4. Evaluate Policy Gradient vs. Trained AlphaZero

In [None]:
# Reload a stored policy-gradient run and wrap its model as a playable agent.
# NOTE(review): the run name is hard-coded, so this is not necessarily the run
# trained in this notebook — confirm 'PG-5' is the intended run.
pg_run_name = 'PG-5'
pg_checkpoint_manager = PolicyGradientCheckpointManager(game_name, pg_run_name)
pg_model = pg_checkpoint_manager.load_checkpoint(0)
pg_agent = PolicyGradientAgent(pg_model)

In [None]:
# Open the manager for our trained AlphaZero models and show the runs it knows about.
az_model_manager = OpenSpielModelManager(game_name, 'C4')
available_runs = az_model_manager.list_runs()
print(available_runs)

In [None]:
# Select one AlphaZero run and show the checkpoints stored for it.
run_name = 'C4-13'
az_checkpoint_manager = az_model_manager.get_checkpoint_manager(run_name)
available_checkpoints = az_checkpoint_manager.list_checkpoints()
print(available_checkpoints)

In [None]:
# Load the selected AlphaZero checkpoint and wrap it in an MCTS-based agent.
# The checkpoint must come from `az_checkpoint_manager` (created for `run_name`);
# no variable named `checkpoint_manager` exists in this notebook, so the old
# reference raised NameError on a fresh kernel.
checkpoint = -1  # presumably "latest checkpoint" — TODO confirm against the manager
model = az_checkpoint_manager.load_checkpoint(checkpoint)
# NOTE(review): MCTSConfig is filled positionally; the arguments look like
# (exploration constant, simulations per move, temperature) — confirm.
az_agent = MCTSAgent.from_config(game, model, MCTSConfig(sqrt(2), 100, 0))

In [None]:
# Head-to-head evaluation: AlphaZero (MCTS agent) against the trained policy-gradient agent.
#
# The evaluation is repeated `n_runs` times; each run plays `n_evaluations` games
# with AlphaZero in seat `az_player_id` and the policy-gradient agent in the other seat.
# Per run we record:
#   track_wins - percentage of games won by AlphaZero
#   track_lost - percentage of games won by the policy-gradient agent
# Games in which neither side receives a return of 1 (draws) count for neither
# series, so the two percentages need not sum to 100.
n_runs = 10
n_evaluations = 10
az_player_id = 1
pg_player_id = 1 - az_player_id

track_wins = []
track_lost = []
for _run in range(n_runs):
    wins = 0
    losses = 0
    for _game in range(n_evaluations):
        state = game.new_initial_state()

        while not state.is_terminal():
            # Ask whichever agent owns the current turn for its move.
            if state.current_player() == az_player_id:
                mover = az_agent
            else:
                mover = pg_agent
            # next_move returns an (action, policy) pair; only the action is needed.
            # The second element is bound to a throwaway name so that the imported
            # `policy` module is not overwritten.
            action, _move_policy = mover.next_move(state)
            state.apply_action(action)

        final_returns = state.returns()
        if final_returns[az_player_id] == 1:
            wins += 1
        elif final_returns[pg_player_id] == 1:
            losses += 1

    print(f"Win Rate of AlphaZero: {wins/n_evaluations * 100}%")
    track_wins.append(wins / n_evaluations * 100)
    track_lost.append(losses / n_evaluations * 100)

In [None]:
# Grouped bar chart of the per-run results: for every evaluation run, one bar for
# track_wins (labelled "AlphaZero") and one for track_lost (labelled "Policy Gradient").
w = 0.4  # bar width; the second series is shifted right by exactly one bar width

# Derive the number of groups from the recorded data instead of hard-coding it,
# so the chart stays valid if the number of evaluation runs changes.
bar1 = np.arange(1, len(track_wins) + 1)
bar2 = bar1 + w

plt.bar(bar1, track_wins, w, label="AlphaZero")
plt.bar(bar2, track_lost, w, label="Policy Gradient")
# Centre each tick between the two bars of its group and label it with the run number.
plt.xticks(bar1 + w / 2, bar1)
# Report the real number of games per run rather than a fixed figure.
plt.xlabel(f"Runs - Each run represents evaluation out of {n_evaluations} games")
plt.ylabel("Win Rate in %")
plt.title("Head to Head - AlphaZero vs Policy Gradient")
plt.legend()
plt.show()