In [9]:
from absl import logging
import numpy as np
import tensorflow.compat.v1 as tf

from open_spiel.python import rl_environment
from open_spiel.python import rl_agent
from open_spiel.python.algorithms import minimax
from open_spiel.python.algorithms import dqn
from open_spiel.python.algorithms import random_agent

In [10]:
class MiniMaxAgent(rl_agent.AbstractAgent):
  """MiniMax agent class."""

  def __init__(self, 
               game,
               state,
               player_id,
               num_actions,
               maximum_depth, 
               value_function=None,
               name="minimax_agent"):
    assert num_actions > 0
    self._game = game
    self._state = state
    self._player_id = player_id
    self._num_actions = num_actions
    self._maximum_depth = maximum_depth
    self._value_function = value_function
    

  def step(self, time_step, is_evaluation=False, state=None):
    # If it is the end of the episode, don't select an action.
    if time_step.last():
      return
    if state is None:
      state = self._game.new_initial_state()

    # Pick a random legal action.
    cur_legal_actions = time_step.observations["legal_actions"][self._player_id]
    _ , action = minimax.alpha_beta_search(self._game, 
                                           state, 
                                           value_function=self._value_function, 
                                           maximum_depth = self._maximum_depth,
                                           maximizing_player_id=self._player_id)
    probs = np.zeros(self._num_actions)
    probs[cur_legal_actions] = 1.0 / len(cur_legal_actions)

    return rl_agent.StepOutput(action=action, probs=probs)

In [11]:
# Training parameters
#Directory to save/load the agent models.
checkpoint_dir = ""

#Episode frequency at which the DQN agent models are saved
save_every = int(1e4)

#Number of training episodes.
num_train_episodes = int(1e6)

#Episode frequency at which the DQN agents are evaluated.
eval_every = 100


# DQN model hyper-parameters

#Number of hidden units in the Q-Network MLP
hidden_layers_sizes = [64, 64]

#Size of the replay buffer
replay_buffer_capacity = int(1e5)

#Number of transitions to sample at each learning step.                     
batch_size = 32

In [12]:
def evaluate(state):
  kWhite = state.observation_string().count('o')
  kWhiteKing = state.observation_string().count('8') - 1 #one row is also labeld 8
  kBlack = state.observation_string().count('+') 
  kBlackKing = state.observation_string().count('*')
  return -1 * state.current_player() * (kWhite + 2*kWhiteKing - kBlack - 2*kBlackKing) #player 0 moves white pieces

In [13]:
game = "checkers"
num_players = 2
env = rl_environment.Environment(game)
info_state_size = env.observation_spec()["info_state"][0]
num_actions = env.action_spec()["num_actions"]

#random agents for evaluation
minimax_agents = [
    MiniMaxAgent(env.game, env.get_state, player_id=idx, num_actions=num_actions, maximum_depth=8, value_function=evaluate)
    for idx in range(num_players)
]
sess = tf.Session()

with sess.as_default():
    agents = [
          dqn.DQN(
              session=sess,
              player_id=idx,
              state_representation_size=info_state_size,
              num_actions=num_actions,
              hidden_layers_sizes=hidden_layers_sizes,
              replay_buffer_capacity=replay_buffer_capacity,
              batch_size=batch_size) for idx in range(num_players)
      ]
    for agent in agents:
        agent.restore(checkpoint_dir)

INFO:tensorflow:Restoring parameters from /Users/titus/Desktop/MLAgents/Checkers2/q_network_pid0


2023-12-03 17:58:31.447294: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:353] MLIR V1 optimization pass is not enabled
INFO:tensorflow:Restoring parameters from /Users/titus/Desktop/MLAgents/Checkers2/q_network_pid0


INFO:tensorflow:Restoring parameters from /Users/titus/Desktop/MLAgents/Checkers2/target_q_network_pid0


INFO:tensorflow:Restoring parameters from /Users/titus/Desktop/MLAgents/Checkers2/target_q_network_pid0


INFO:tensorflow:Restoring parameters from /Users/titus/Desktop/MLAgents/Checkers2/q_network_pid1


INFO:tensorflow:Restoring parameters from /Users/titus/Desktop/MLAgents/Checkers2/q_network_pid1


INFO:tensorflow:Restoring parameters from /Users/titus/Desktop/MLAgents/Checkers2/target_q_network_pid1


INFO:tensorflow:Restoring parameters from /Users/titus/Desktop/MLAgents/Checkers2/target_q_network_pid1


In [16]:
def eval_against_minimax(env, trained_agents, minimax_agents, value_function, maximum_depth):
  """Evaluates `trained_agents` against `minimax` for."""
  num_players = len(trained_agents)
  sum_episode_rewards = np.zeros(num_players)
  for player_pos in range(num_players):
    cur_agents = minimax_agents[:]
    cur_agents[player_pos] = trained_agents[player_pos]
    time_step = env.reset()
    episode_rewards = 0
    while not time_step.last():
      player_id = time_step.observations["current_player"]
      if player_id == player_pos:
        agent_output = cur_agents[player_id].step(time_step, is_evaluation=True)
        action_list = [agent_output.action]
      else:
        _ , action = minimax.alpha_beta_search(env.game, env.get_state, value_function=value_function, maximum_depth = maximum_depth,maximizing_player_id=player_id)
        action_list = [action]
      time_step = env.step(action_list)
    sum_episode_rewards[player_pos] = time_step.rewards[player_pos] 
  return sum_episode_rewards

In [17]:
with sess.as_default():
    agents2 = [agents[1], agents[0]]
    # 0 for tie, 1 for win, -1 for lost
    for i in range(1,10):
        print("Minimax with Depth of: ", i , " Reward of DQN Agents: ", eval_against_minimax(env, agents, minimax_agents, evaluate, i))

Minimax with Depth of:  1  Reward of DQN Agents:  [1. 1.]
Minimax with Depth of:  2  Reward of DQN Agents:  [0. 1.]
Minimax with Depth of:  3  Reward of DQN Agents:  [0. 1.]
Minimax with Depth of:  4  Reward of DQN Agents:  [1. 1.]
Minimax with Depth of:  5  Reward of DQN Agents:  [1. 0.]
Minimax with Depth of:  6  Reward of DQN Agents:  [1. 1.]
Minimax with Depth of:  7  Reward of DQN Agents:  [1. 1.]
Minimax with Depth of:  8  Reward of DQN Agents:  [1. 1.]
Minimax with Depth of:  9  Reward of DQN Agents:  [1. 1.]
