In [13]:
%load_ext autoreload
%autoreload 2

# Setup Environment
import sys
import os
# Extra import search path used by the `source.*` imports further down.
# NOTE(review): the path is relative, so it presumably depends on the
# notebook's working directory — confirm when moving the notebook.
ROOT = '../../'
# Guard the append so that re-running this cell does not keep adding
# duplicate entries to sys.path.
if ROOT not in sys.path:
    sys.path.append(ROOT)

from pettingzoo.classic import tictactoe_v3
import numpy as np
import copy
from gym.wrappers.monitoring.video_recorder import VideoRecorder
from IPython.display import HTML
from source.agents.dqn_agent import DQNAgent
from source.agents.advantage_actor_critic_agent import A2CAgent
from source.agents.ppo_agent import PPOAgent
from source.agents.random_agent import RandomAgent
from source.utils import utils
from tqdm import tqdm
import torch
import random
from typing import Dict, Optional, Tuple
from collections import defaultdict
from source.agents.agent import Agent
from pettingzoo.utils.env import AECEnv

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [20]:
# Policy Eval
# Builds the tic-tac-toe environment and the candidate agents, then trains one
# pair of agents against each other with utils.duel_training. The statistics
# returned by the training call are kept in `stats`.

# Params
env = tictactoe_v3.env()

# Set random seeds.
# Every random number generator this notebook imports (Python's `random`,
# NumPy's legacy global generator and torch) is seeded before any agent is
# constructed, so a run can be repeated from a fresh kernel.
random_seed = 101
rng = np.random.default_rng(random_seed)
# NOTE(review): assigning np_random on the object returned by
# tictactoe_v3.env() may not reach the underlying environment — confirm
# whether anything actually reads it.
env.np_random = rng
random.seed(random_seed)
np.random.seed(random_seed)
torch.manual_seed(random_seed)

# Create Agents
## Best performance (dqn, random): 0 loss in 1000, eps_decay=1e7
## Best performance (dqn, dqn): 1000 draw in 1000, eps_decay=1e8
dqn_agent = DQNAgent(
    state_space=env.observation_space('player_1')['observation'],
    action_space=env.action_space('player_1'),
    discount_rate=0.5,
    epsilon=0.9, # use epsilon_schedule
    learning_rate=1e-3,
    learning=True,
    batch_size=64,
    tau=0.005,
    eps_decay=1e7, # 1e7 for (dqn, random), 1e8 for (dqn, dqn)
    net_params=[128],
    update_freq=1
)

## Best performance (a2c, random): ~50 loss in 1000, discount=0.5, width=256
a2c_agent = A2CAgent(
    state_space=env.observation_space('player_1')['observation'],
    action_space=env.action_space('player_1'),
    discount_rate=0.5,
    epsilon=None, # use epsilon_schedule
    learning_rate=None,
    policy_lr=1e-3,
    value_lr=1e-3,
    net_params=[256], # 128 perform best with temp = 1
    # NOTE(review): the keyword spelling (sic) presumably matches A2CAgent's
    # own parameter name — confirm before renaming it here.
    tempreture=1
)

ppo_agent = PPOAgent(
    state_space=env.observation_space('player_1')['observation'],
    action_space=env.action_space('player_1'),
    discount_rate=0.5,
    epsilon=None, # use epsilon_schedule
    learning_rate=None,
    policy_lr=1e-3, # spinup default=3e-4
    value_lr=1e-3, # spinup default=1e-3
    net_params=[128, 16],
    gae_lambda=0.97, # spinup default=0.97
    clip_ratio=0.1, # spinup default=0.1
    num_updates=80, # spinup default=80
    batch_size=1000,
)

# Built from the 'player_2' spaces and with learning disabled; it is not part
# of the pairing passed to duel_training below.
random_agent = RandomAgent(
    state_space=env.observation_space('player_2')['observation'],
    action_space=env.action_space('player_2'),
    discount_rate=None,
    epsilon=None, # use epsilon_schedule
    learning_rate=None,
    learning=False
)

# Train: ppo_agent is registered as 'player_1' and dqn_agent as 'player_2',
# for 100 epochs of 1000 episodes each.
stats = utils.duel_training(
    env=env,
    agent_dict={'player_1': ppo_agent, 'player_2': dqn_agent},
    num_epoch=100,
    num_episode=1000,
    self_play=False,
    shuffle=True,
    verbal=True,
    debug=False
)

using device: cpu
using device: cpu
using device: cpu
agents: {'player_1': <source.agents.ppo_agent.PPOAgent object at 0x14fb24c40>, 'player_2': <source.agents.ppo_agent.PPOAgent object at 0x2a0bb50a0>}


100%|██████████| 1000/1000 [00:02<00:00, 357.84it/s]


epoch: 0, win: 427, lose: 458, draw: 115, reward: -0.03100, episode_len: 4.72800, value_loss: 0.13188, policy_loss: -0.05835, num_policy_udpate: 80.00000


100%|██████████| 1000/1000 [00:03<00:00, 300.29it/s]


epoch: 1, win: 441, lose: 475, draw: 84, reward: -0.03400, episode_len: 4.51900, value_loss: 0.11593, policy_loss: -0.06077, num_policy_udpate: 80.00000


100%|██████████| 1000/1000 [00:02<00:00, 377.77it/s]


epoch: 2, win: 491, lose: 449, draw: 60, reward: 0.04200, episode_len: 4.42300, value_loss: 0.11210, policy_loss: -0.00325, num_policy_udpate: 4.00000


100%|██████████| 1000/1000 [00:02<00:00, 477.08it/s]


epoch: 3, win: 493, lose: 450, draw: 57, reward: 0.04300, episode_len: 4.29900, value_loss: 0.09892, policy_loss: -0.04062, num_policy_udpate: 32.00000


100%|██████████| 1000/1000 [00:02<00:00, 463.25it/s]


epoch: 4, win: 530, lose: 414, draw: 56, reward: 0.11600, episode_len: 4.21100, value_loss: 0.10834, policy_loss: -0.01278, num_policy_udpate: 4.66667


100%|██████████| 1000/1000 [00:01<00:00, 560.88it/s]


epoch: 5, win: 448, lose: 488, draw: 64, reward: -0.04000, episode_len: 4.10200, value_loss: 0.10187, policy_loss: -0.01772, num_policy_udpate: 3.50000


100%|██████████| 1000/1000 [00:01<00:00, 565.36it/s]


epoch: 6, win: 478, lose: 441, draw: 81, reward: 0.03700, episode_len: 4.15400, value_loss: 0.09740, policy_loss: -0.02360, num_policy_udpate: 5.66667


100%|██████████| 1000/1000 [00:01<00:00, 569.31it/s]


epoch: 7, win: 424, lose: 471, draw: 105, reward: -0.04700, episode_len: 4.11600, value_loss: 0.07927, policy_loss: -0.01618, num_policy_udpate: 4.33333


100%|██████████| 1000/1000 [00:01<00:00, 521.47it/s]


epoch: 8, win: 461, lose: 366, draw: 173, reward: 0.09500, episode_len: 4.30900, value_loss: 0.08343, policy_loss: -0.01587, num_policy_udpate: 27.33333


100%|██████████| 1000/1000 [00:02<00:00, 431.29it/s]


epoch: 9, win: 368, lose: 373, draw: 259, reward: -0.00500, episode_len: 4.42500, value_loss: 0.06387, policy_loss: -0.03687, num_policy_udpate: 30.75000


100%|██████████| 1000/1000 [00:02<00:00, 439.50it/s]


epoch: 10, win: 341, lose: 323, draw: 336, reward: 0.01800, episode_len: 4.55500, value_loss: 0.05335, policy_loss: -0.03596, num_policy_udpate: 32.00000


100%|██████████| 1000/1000 [00:02<00:00, 429.32it/s]


epoch: 11, win: 311, lose: 259, draw: 430, reward: 0.05200, episode_len: 4.66300, value_loss: 0.04639, policy_loss: -0.01034, num_policy_udpate: 15.33333


 14%|█▍        | 142/1000 [00:00<00:02, 397.53it/s]


KeyboardInterrupt: 

In [19]:
# Play a single rendered episode in which the same `ppo_agent` object fills
# both seats; the value returned by the call is shown as the cell output.
# (An alternative that embeds a recorded video would go through
# utils.render_mp4 and IPython's HTML, but needs a `video_path` defined first.)
env = tictactoe_v3.env(render_mode='human')
utils.play_multiagent_episode(
    dict.fromkeys(('player_1', 'player_2'), ppo_agent),
    env,
    shuffle=False,
    debug=False,
)


     |     |     
  -  |  -  |  -  
_____|_____|_____
     |     |     
  -  |  X  |  -  
_____|_____|_____
     |     |     
  -  |  -  |  -  
     |     |     
     |     |     
  -  |  -  |  -  
_____|_____|_____
     |     |     
  -  |  X  |  -  
_____|_____|_____
     |     |     
  -  |  -  |  O  
     |     |     
     |     |     
  -  |  -  |  -  
_____|_____|_____
     |     |     
  -  |  X  |  -  
_____|_____|_____
     |     |     
  X  |  -  |  O  
     |     |     
     |     |     
  -  |  -  |  O  
_____|_____|_____
     |     |     
  -  |  X  |  -  
_____|_____|_____
     |     |     
  X  |  -  |  O  
     |     |     
     |     |     
  -  |  -  |  O  
_____|_____|_____
     |     |     
  -  |  X  |  X  
_____|_____|_____
     |     |     
  X  |  -  |  O  
     |     |     
     |     |     
  -  |  -  |  O  
_____|_____|_____
     |     |     
  O  |  X  |  X  
_____|_____|_____
     |     |     
  X  |  -  |  O  
     |     |     
     |     |     
  X  |  - 

defaultdict(<function source.utils.utils.play_multiagent_episode.<locals>.<lambda>()>,
            {'player_1': defaultdict(float,
                         {'reward': 0.0, 'episode_len': 6.0}),
             'player_2': defaultdict(float,
                         {'reward': 0.0, 'episode_len': 5.0})})