In [1]:
from absl import logging
import numpy as np
import tensorflow.compat.v1 as tf

from open_spiel.python import rl_environment
from open_spiel.python.algorithms import dqn
from open_spiel.python.algorithms import random_agent
from open_spiel.python.algorithms import minimax
from open_spiel.python.algorithms import minimax_agent
from open_spiel.python.algorithms import mcts
from open_spiel.python.algorithms import mcts_agent



2023-12-08 20:53:43.275437: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2, in other operations, rebuild TensorFlow with the appropriate compiler flags.


Instructions for updating:
non-resource variables are not supported in the long term


In [2]:
# Training parameters
#Directory to save/load the agent models.
checkpoint_dir = "/Users/titus/Desktop/ml/trainingResults/tictactoe"

#Episode frequency at which the DQN agent models are saved
save_every = int(1e4)

#Number of training episodes.
num_train_episodes = int(1e6)

#Episode frequency at which the DQN agents are evaluated.
eval_every = 100


# DQN model hyper-parameters

#Number of hidden units in the Q-Network MLP
hidden_layers_sizes = [64, 64]

#Size of the replay buffer
replay_buffer_capacity = int(1e5)

#Number of transitions to sample at each learning step.                     
batch_size = 32

In [3]:
def evaluate(state):
  kWhite = state.observation_string().count('o')
  kWhiteKing = state.observation_string().count('8') - 1 #one row is also labeld 8
  kBlack = state.observation_string().count('+') 
  kBlackKing = state.observation_string().count('*')
  return -1 * state.current_player() * (kWhite + 2*kWhiteKing - kBlack - 2*kBlackKing) #player 0 moves white pieces

In [4]:
game = "tic_tac_toe"
num_players = 2
env = rl_environment.Environment(game, include_full_state=True)
info_state_size = env.observation_spec()["info_state"][0]
num_actions = env.action_spec()["num_actions"]

#random agents for evaluation
minimax_agents = [
    minimax_agent.MiniMaxAgent(env.game, env.get_state, player_id=idx, num_actions=num_actions, maximum_depth=8, value_function=evaluate)
    for idx in range(num_players)
]
mcts_bot = mcts.MCTSBot(env.game, 1.5, 100, mcts.RandomRolloutEvaluator())
mcts_agents = [
        mcts_agent.MCTSAgent(player_id=idx, num_actions=num_actions, mcts_bot=mcts_bot)
        for idx in range(num_players)
]


sess = tf.Session()

with sess.as_default():
    agents = [
          dqn.DQN(
              session=sess,
              player_id=idx,
              state_representation_size=info_state_size,
              num_actions=num_actions,
              hidden_layers_sizes=hidden_layers_sizes,
              replay_buffer_capacity=replay_buffer_capacity,
              batch_size=batch_size) for idx in range(num_players)
      ]
    for agent in agents:
        agent.restore(checkpoint_dir)

INFO:tensorflow:Restoring parameters from /Users/titus/Desktop/ml/trainingResults/tictactoe/q_network_pid0


2023-12-08 20:53:49.047350: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:353] MLIR V1 optimization pass is not enabled
INFO:tensorflow:Restoring parameters from /Users/titus/Desktop/ml/trainingResults/tictactoe/q_network_pid0


INFO:tensorflow:Restoring parameters from /Users/titus/Desktop/ml/trainingResults/tictactoe/target_q_network_pid0


INFO:tensorflow:Restoring parameters from /Users/titus/Desktop/ml/trainingResults/tictactoe/target_q_network_pid0


INFO:tensorflow:Restoring parameters from /Users/titus/Desktop/ml/trainingResults/tictactoe/q_network_pid1


INFO:tensorflow:Restoring parameters from /Users/titus/Desktop/ml/trainingResults/tictactoe/q_network_pid1


INFO:tensorflow:Restoring parameters from /Users/titus/Desktop/ml/trainingResults/tictactoe/target_q_network_pid1


INFO:tensorflow:Restoring parameters from /Users/titus/Desktop/ml/trainingResults/tictactoe/target_q_network_pid1


In [5]:
def eval_against_mcts(env, trained_agents, mcts_agents, num_episodes):
  """Evaluates `trained_agents` against `random_agents` for `num_episodes`."""
  num_players = len(trained_agents)
  sum_episode_rewards = np.zeros(num_players)
  results = np.zeros((num_players, 3), dtype=int)
  for player_pos in range(num_players):
    cur_agents = mcts_agents[:]    
    cur_agents[player_pos] = trained_agents[player_pos]
    for ep in range(num_episodes):
      time_step = env.reset()
      episode_rewards = 0
      while not time_step.last():
        player_id = time_step.observations["current_player"]
        agent_output = cur_agents[player_id].step(time_step, is_evaluation=True)
        action_list = [agent_output.action]
        time_step = env.step(action_list)
        episode_rewards += time_step.rewards[player_pos]
      sum_episode_rewards[player_pos] += episode_rewards
      results[player_pos, int(episode_rewards % 3)] += 1
      if (ep + 1) % 10 == 0:
          print(f"Episode {ep+1}")
          results_percentage = (results * 100).astype(float) / (ep + 1)
          results_percentage_str =  np.array(["{:.2f}%".format(x) for x in results_percentage.flatten()]).reshape(results_percentage.shape)
          for pid in range(num_players):
              print(f"Results for DQN Agent {pid}")
              print(f"   Win:  {results[pid, 1]}   ({results_percentage_str[pid, 1]})")
              print(f"   Draw: {results[pid, 0]}   ({results_percentage_str[pid, 0]})") 
              print(f"   Lost: {results[pid, 2]}   ({results_percentage_str[pid, 2]})")
  return sum_episode_rewards / num_episodes

In [9]:
eval_against_mcts(env, agents, mcts_agents, 10000)

Episode 10
Results for DQN Agent 0
   Win:  4   (40.00%)
   Draw: 6   (60.00%)
   Lost: 0   (0.00%)
Results for DQN Agent 1
   Win:  0   (0.00%)
   Draw: 0   (0.00%)
   Lost: 0   (0.00%)
Episode 20
Results for DQN Agent 0
   Win:  10   (50.00%)
   Draw: 10   (50.00%)
   Lost: 0   (0.00%)
Results for DQN Agent 1
   Win:  0   (0.00%)
   Draw: 0   (0.00%)
   Lost: 0   (0.00%)
Episode 30
Results for DQN Agent 0
   Win:  13   (43.33%)
   Draw: 17   (56.67%)
   Lost: 0   (0.00%)
Results for DQN Agent 1
   Win:  0   (0.00%)
   Draw: 0   (0.00%)
   Lost: 0   (0.00%)
Episode 40
Results for DQN Agent 0
   Win:  20   (50.00%)
   Draw: 20   (50.00%)
   Lost: 0   (0.00%)
Results for DQN Agent 1
   Win:  0   (0.00%)
   Draw: 0   (0.00%)
   Lost: 0   (0.00%)
Episode 50
Results for DQN Agent 0
   Win:  25   (50.00%)
   Draw: 25   (50.00%)
   Lost: 0   (0.00%)
Results for DQN Agent 1
   Win:  0   (0.00%)
   Draw: 0   (0.00%)
   Lost: 0   (0.00%)
Episode 60
Results for DQN Agent 0
   Win:  31   (51.67%)

array([5.5e-01, 4.0e-04])

In [8]:
def eval_against_minimax(env, trained_agents, minimax_agents):
  """Evaluates `trained_agents` against `minimax` for."""
  num_players = len(trained_agents)
  sum_episode_rewards = np.zeros(num_players)
  for player_pos in range(num_players):
    cur_agents = minimax_agents[:]
    cur_agents[player_pos] = trained_agents[player_pos]
    time_step = env.reset()
    episode_rewards = 0
    while not time_step.last():
      player_id = time_step.observations["current_player"]
      if player_id == player_pos:
        agent_output = cur_agents[player_id].step(time_step, is_evaluation=True)
        action_list = [agent_output.action]
      else:
        _ , action = minimax.alpha_beta_search(env.game, env.get_state, maximizing_player_id=player_id)
        action_list = [action]
      time_step = env.step(action_list)
    sum_episode_rewards[player_pos] = time_step.rewards[player_pos]    
  return sum_episode_rewards

In [16]:
with sess.as_default():
    agents2 = [agents[1], agents[0]]
    # 0 for tie, 1 for win, -1 for lost
    for i in range(1,10):
        print("Minimax with Depth of: ", i , " Reward of DQN Agents: ", eval_against_minimax(env, agents, minimax_agents))

Minimax with Depth of:  1  Reward of DQN Agents:  [0. 0.]
Minimax with Depth of:  2  Reward of DQN Agents:  [0. 0.]
Minimax with Depth of:  3  Reward of DQN Agents:  [0. 0.]
Minimax with Depth of:  4  Reward of DQN Agents:  [0. 0.]
Minimax with Depth of:  5  Reward of DQN Agents:  [0. 0.]
Minimax with Depth of:  6  Reward of DQN Agents:  [0. 0.]
Minimax with Depth of:  7  Reward of DQN Agents:  [0. 0.]
Minimax with Depth of:  8  Reward of DQN Agents:  [0. 0.]
Minimax with Depth of:  9  Reward of DQN Agents:  [0. 0.]


In [8]:
def eval_minimax_mcts(env, trained_agents, minimax_agents, value_function, maximum_depth, num_episodes):
  """Evaluates `trained_agents` against `minimax` for."""
  num_players = len(trained_agents)
  sum_episode_rewards = np.zeros(num_players)
  results = np.zeros((num_players, 3), dtype=int)
  for player_pos in range(num_players):
    cur_agents = minimax_agents[:]
    cur_agents[player_pos] = trained_agents[player_pos]
    for ep in range(num_episodes):
        time_step = env.reset()
        episode_rewards = 0
        while not time_step.last():
          player_id = time_step.observations["current_player"]
          if player_id == player_pos:
            agent_output = cur_agents[player_id].step(time_step, is_evaluation=True)
            action_list = [agent_output.action]
          else:
            _ , action = minimax.alpha_beta_search(env.game, env.get_state, value_function=value_function, maximum_depth = maximum_depth,maximizing_player_id=player_id)
            action_list = [action]
          time_step = env.step(action_list)
          episode_rewards += time_step.rewards[player_pos]
        sum_episode_rewards[player_pos] += episode_rewards
        results[player_pos, int(episode_rewards % 3)] += 1
        if (ep + 1) % 100 == 0:
          #print(f"Episode {ep+1}")
          results_percentage = (results * 100).astype(float) / (ep + 1)
          results_percentage_str =  np.array(["{:.2f}%".format(x) for x in results_percentage.flatten()]).reshape(results_percentage.shape)
          #for pid in range(num_players):
           #   print(f"Results for DQN Agent {pid}")
           #   print(f"   Win:  {results[pid, 1]}   ({results_percentage_str[pid, 1]})")
           #   print(f"   Draw: {results[pid, 0]}   ({results_percentage_str[pid, 0]})") 
           #   print(f"   Lost: {results[pid, 2]}   ({results_percentage_str[pid, 2]})")
  return sum_episode_rewards / num_episodes

In [9]:
with sess.as_default():
    agents2 = [agents[1], agents[0]]
    # 0 for tie, 1 for win, -1 for lost
    for i in range(1,9):
        print("Minimax with Depth of: ", i , " Reward of MCTS Agents: ", eval_minimax_mcts(env, mcts_agents, minimax_agents, evaluate, i, 100))

Minimax with Depth of:  1  Reward of MCTS Agents:  [1.   0.94]
Minimax with Depth of:  2  Reward of MCTS Agents:  [0.91 0.21]
Minimax with Depth of:  3  Reward of MCTS Agents:  [0.47 0.58]
Minimax with Depth of:  4  Reward of MCTS Agents:  [0.95 0.08]
Minimax with Depth of:  5  Reward of MCTS Agents:  [ 0.37 -0.52]
Minimax with Depth of:  6  Reward of MCTS Agents:  [ 0.38 -0.66]
Minimax with Depth of:  7  Reward of MCTS Agents:  [ 0.   -0.51]
Minimax with Depth of:  8  Reward of MCTS Agents:  [-0.02 -0.52]
