In [1]:
from absl import logging
import numpy as np
import tensorflow.compat.v1 as tf

from open_spiel.python import rl_environment
from open_spiel.python.algorithms import dqn
from open_spiel.python.algorithms import random_agent
from open_spiel.python.algorithms import minimax
from open_spiel.python.algorithms import minimax_agent
from open_spiel.python.algorithms import mcts
from open_spiel.python.algorithms import mcts_agent



2023-12-04 10:34:15.539016: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2, in other operations, rebuild TensorFlow with the appropriate compiler flags.


Instructions for updating:
non-resource variables are not supported in the long term


In [2]:
# Training parameters
#Directory to save/load the agent models.
checkpoint_dir = "/Users/titus/Desktop/MLAgents/Checkers2"

#Episode frequency at which the DQN agent models are saved
save_every = int(1e4)

#Number of training episodes.
num_train_episodes = int(1e6)

#Episode frequency at which the DQN agents are evaluated.
eval_every = 100


# DQN model hyper-parameters

#Number of hidden units in the Q-Network MLP
hidden_layers_sizes = [64, 64]

#Size of the replay buffer
replay_buffer_capacity = int(1e5)

#Number of transitions to sample at each learning step.                     
batch_size = 32

In [3]:
def evaluate(state):
  kWhite = state.observation_string().count('o')
  kWhiteKing = state.observation_string().count('8') - 1 #one row is also labeld 8
  kBlack = state.observation_string().count('+') 
  kBlackKing = state.observation_string().count('*')
  return -1 * state.current_player() * (kWhite + 2*kWhiteKing - kBlack - 2*kBlackKing) #player 0 moves white pieces

In [5]:
game = "checkers"
num_players = 2
env = rl_environment.Environment(game, include_full_state=True)
info_state_size = env.observation_spec()["info_state"][0]
num_actions = env.action_spec()["num_actions"]

#random agents for evaluation
minimax_agents = [
    minimax_agent.MiniMaxAgent(env.game, env.get_state, player_id=idx, num_actions=num_actions, maximum_depth=8, value_function=evaluate)
    for idx in range(num_players)
]
mcts_bot = mcts.MCTSBot(env.game, 1.5, 100, mcts.RandomRolloutEvaluator())
mcts_agents = [
        mcts_agent.MCTSAgent(player_id=idx, num_actions=num_actions, mcts_bot=mcts_bot)
        for idx in range(num_players)
]
sess = tf.Session()

with sess.as_default():
    agents = [
          dqn.DQN(
              session=sess,
              player_id=idx,
              state_representation_size=info_state_size,
              num_actions=num_actions,
              hidden_layers_sizes=hidden_layers_sizes,
              replay_buffer_capacity=replay_buffer_capacity,
              batch_size=batch_size) for idx in range(num_players)
      ]
    for agent in agents:
        agent.restore(checkpoint_dir)

INFO:tensorflow:Restoring parameters from /Users/titus/Desktop/MLAgents/Checkers2/q_network_pid0


2023-12-04 10:35:38.885205: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:353] MLIR V1 optimization pass is not enabled
INFO:tensorflow:Restoring parameters from /Users/titus/Desktop/MLAgents/Checkers2/q_network_pid0


INFO:tensorflow:Restoring parameters from /Users/titus/Desktop/MLAgents/Checkers2/target_q_network_pid0


INFO:tensorflow:Restoring parameters from /Users/titus/Desktop/MLAgents/Checkers2/target_q_network_pid0


INFO:tensorflow:Restoring parameters from /Users/titus/Desktop/MLAgents/Checkers2/q_network_pid1


INFO:tensorflow:Restoring parameters from /Users/titus/Desktop/MLAgents/Checkers2/q_network_pid1


INFO:tensorflow:Restoring parameters from /Users/titus/Desktop/MLAgents/Checkers2/target_q_network_pid1


INFO:tensorflow:Restoring parameters from /Users/titus/Desktop/MLAgents/Checkers2/target_q_network_pid1


In [80]:
def eval_against_mcts(env, trained_agents, mcts_agents, num_episodes):
  """Evaluates `trained_agents` against `random_agents` for `num_episodes`."""
  num_players = len(trained_agents)
  sum_episode_rewards = np.zeros(num_players)
  results = np.zeros((num_players, 3), dtype=int)
  for player_pos in range(num_players):
    cur_agents = mcts_agents[:]    
    cur_agents[player_pos] = trained_agents[player_pos]
    for _ in range(num_episodes):
      if num_epsiodes % 100:
        print(num_episodes)
      time_step = env.reset()
      episode_rewards = 0
      while not time_step.last():
        player_id = time_step.observations["current_player"]
        agent_output = cur_agents[player_id].step(time_step, is_evaluation=True)
        action_list = [agent_output.action]
        time_step = env.step(action_list)
        episode_rewards += time_step.rewards[player_pos]
      results[player_pos, int(episode_rewards % 3)] += 1
      sum_episode_rewards[player_pos] += episode_rewards
  results_percentage = (results * 100).astype(float) / num_episodes
  results_percentage_str =  np.array(["{:.2f}%".format(x) for x in results_percentage.flatten()]).reshape(results_percentage.shape)
  for player_pos in range(num_players):
      print(f"Results for DQN Agent {player_pos}")
      print(f"   Win:  {results[player_pos, 1]}   ({results_percentage_str[player_pos, 1]})")
      print(f"   Draw: {results[player_pos, 0]}   ({results_percentage_str[player_pos, 0]})") 
      print(f"   Lost: {results[player_pos, 2]}   ({results_percentage_str[player_pos, 2]})")
  return sum_episode_rewards / num_episodes

In [None]:
eval_against_mcts(env, agents, mcts_agents, 10000)

In [5]:
def eval_against_minimax(env, trained_agents, minimax_agents, value_function, maximum_depth):
  """Evaluates `trained_agents` against `minimax` for."""
  num_players = len(trained_agents)
  sum_episode_rewards = np.zeros(num_players)
  for player_pos in range(num_players):
    cur_agents = minimax_agents[:]
    cur_agents[player_pos] = trained_agents[player_pos]
    time_step = env.reset()
    episode_rewards = 0
    while not time_step.last():
      player_id = time_step.observations["current_player"]
      if player_id == player_pos:
        agent_output = cur_agents[player_id].step(time_step, is_evaluation=True)
        action_list = [agent_output.action]
      else:
        _ , action = minimax.alpha_beta_search(env.game, env.get_state, value_function=value_function, maximum_depth = maximum_depth,maximizing_player_id=player_id)
        action_list = [action]
      time_step = env.step(action_list)
    sum_episode_rewards[player_pos] = time_step.rewards[player_pos]    
  return sum_episode_rewards

In [6]:
with sess.as_default():
    agents2 = [agents[1], agents[0]]
    # 0 for tie, 1 for win, -1 for lost
    for i in range(1,10):
        print("Minimax with Depth of: ", i , " Reward of DQN Agents: ", eval_against_minimax(env, agents, minimax_agents, evaluate, i))

Minimax with Depth of:  1  Reward of DQN Agents:  [1. 1.]
Minimax with Depth of:  2  Reward of DQN Agents:  [0. 1.]
Minimax with Depth of:  3  Reward of DQN Agents:  [0. 1.]
Minimax with Depth of:  4  Reward of DQN Agents:  [1. 1.]
Minimax with Depth of:  5  Reward of DQN Agents:  [1. 0.]
Minimax with Depth of:  6  Reward of DQN Agents:  [1. 1.]
Minimax with Depth of:  7  Reward of DQN Agents:  [1. 1.]
Minimax with Depth of:  8  Reward of DQN Agents:  [1. 1.]
Minimax with Depth of:  9  Reward of DQN Agents:  [1. 1.]
