# Hanabi + dqn

see also tutorial https://pytorch.org/tutorials/intermediate/reinforcement_q_learning.html

In [None]:
!pip install git+https://github.com/google-deepmind/hanabi-learning-environment.git


Collecting git+https://github.com/google-deepmind/hanabi-learning-environment.git
  Cloning https://github.com/google-deepmind/hanabi-learning-environment.git to /tmp/pip-req-build-3d7hi4mo
  Running command git clone --filter=blob:none --quiet https://github.com/google-deepmind/hanabi-learning-environment.git /tmp/pip-req-build-3d7hi4mo
  Resolved https://github.com/google-deepmind/hanabi-learning-environment.git to commit 54e79594f4b6fb40ebb3004289c6db0e34a8b5fb
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone


In [None]:
import numpy as np
import random
import math
from hanabi_learning_environment import rl_env
from hanabi_learning_environment.agents.random_agent import RandomAgent
from hanabi_learning_environment.agents.simple_agent import SimpleAgent

import torch
import matplotlib.pyplot as plt
import logging
FORMAT = '%(asctime)s %(levelname)s %(message)s'
logging.basicConfig(format=FORMAT, level=logging.INFO, force=True)




## env with wrapper

In [None]:
def card2s(card):
  """Render one card dict as a short string such as ``R2``.

  A falsy colour is shown as ``?``; a rank that is ``None`` or negative is
  shown as ``_``.
  """
  color_part = card['color'] or "?"
  rank = card['rank']
  if rank is None or rank < 0:
    rank_part = "_"
  else:
    rank_part = int(rank)
  return f'{color_part}{rank_part}'
def hand2s(hand):
  """Render a hand (an iterable of card dicts) as comma-separated card strings."""
  rendered = [card2s(one_card) for one_card in hand]
  return ','.join(rendered)
def hanabi2s(fireworks):
  """Render the fireworks dict as ``<key><value>`` pairs, sorted by key and comma-separated."""
  return ','.join(f'{color}{fireworks[color]}' for color in sorted(fireworks.keys()))
def index_to_binaryvector(indices, length):
  """a tool to mask illegal moves

  Returns a 1-D float tensor of size ``length`` holding 1 at every position
  listed in ``indices`` and 0 everywhere else.
  """
  positions = torch.Tensor(indices).to(torch.int64)
  # scatter_ works in place and returns the same tensor
  return torch.zeros(length).scatter_(0, positions, 1)

In [None]:
class GameRecord:
  """Log of a single game: every field starts out as an empty list.

  ``len(record)`` is the number of recorded actions and
  ``episode_return()`` is the sum of the recorded rewards.
  """
  # names of the list-valued attributes created on every new record
  _FIELDS = (
    'player_obs',
    'actions',
    'action_names',
    'rewards',
    'cooperation_rate',
    'play_discard_rate',
    'hanabi_scores',
    'history_for_human',
  )

  def __init__(self):
    for field_name in self._FIELDS:
      setattr(self, field_name, [])

  def episode_return(self):
    """Total of all rewards recorded so far."""
    return sum(self.rewards)

  def __len__(self):
    """Number of actions recorded so far."""
    return len(self.actions)

In [None]:
class HanabiEnv:
  """a wrapper for HLE env"""
  def __init__(self, name='Hanabi-Full', num_players=2):
    self.env = rl_env.make(environment_name=name, num_players=num_players)
    # default make(environment_name='Hanabi-Full', num_players=2, pyhanabi_path=None)
    # env = rl_env.make(environment_name='Hanabi-Small')
    self.N = num_players
    self.reset()
  def reset(self):
    self.observations = self.env.reset()
    self.record = GameRecord()
    self.reward = 0
    self.intrinsic_reward=0
    self.done = False
    self.update()
  def step(self, action: int):
    if action not in self.legal_moves():
      raise RuntimeError(f'action {action} not in {self.legal_moves}')

    self.observations, self.reward, self.done, _ = self.env.step(action)
    self.update(action)
  def obs(self, agent_id: int | None = None):
    if agent_id is None:
      agent_id = self.to_play
    return self.local_obs[agent_id]
  def legal_moves(self, agent_id: int | None = None):
    return self.obs(agent_id)['legal_moves_as_int']
  def legal_moves_dict(self, agent_id: int | None = None):
    return self.obs(agent_id)['legal_moves']
  def vector_obs(self, agent_id: int | None = None):
    return self.obs()['vectorized']
  def update(self, action: int | None = None):
    if action is not None:
      self.record.actions.append(action)
      action_ord = self.legal_moves().index(action)
      name = list(self.obs()['legal_moves'][action_ord].values())
      self.record.action_names.append(str(name))
      self.record.rewards.append(self.reward)
      #append intrinsic reward
      #self.record.intrinsic_rewards.append(self.intrinsic_reward)
    self.to_play = self.observations['current_player']
    self.local_obs = [self.observations['player_observations'][_]
                      for _ in range(self.N)]
    self.record.player_obs.append(self.obs()['vectorized'])
    self.record.history_for_human.append(self.debug_info())
  def debug_info(self):
    obs = self.obs()
    s = ''
    s += f'Fireworks: {hanabi2s(obs["fireworks"])} life: {obs["life_tokens"]}' \
         + f' info: {obs["information_tokens"]}'
    for i in range(self.N):
      s += f'  Hand{i} ' + hand2s(self.obs((i+self.N-1)%self.N)["observed_hands"][1])  \
           + f'  Know{i} ' + hand2s(self.obs(i)["card_knowledge"][0])
    return s

    def calculate_score(self,agent_id):
        """Calculate the final score based on the fireworks state."""
        return sum(self.obs(agent_id)['fireworks'].values())


### env usage

In [None]:
index_to_binaryvector([3,8], 10)

tensor([0., 0., 0., 1., 0., 0., 0., 0., 1., 0.])

In [None]:
env0 = HanabiEnv()
env0.reset()
env0.debug_info()

'Fireworks: B0,G0,R0,W0,Y0 life: 3 info: 8  Hand0 Y0,G3,W0,R2,Y4  Know0 ?_,?_,?_,?_,?_  Hand1 R0,Y3,Y0,B1,W1  Know1 ?_,?_,?_,?_,?_'

In [None]:
print(env0.legal_moves(), env0.legal_moves_dict())

[5, 6, 7, 8, 9, 10, 11, 13, 14, 15, 16, 18] [{'action_type': 'PLAY', 'card_index': 0}, {'action_type': 'PLAY', 'card_index': 1}, {'action_type': 'PLAY', 'card_index': 2}, {'action_type': 'PLAY', 'card_index': 3}, {'action_type': 'PLAY', 'card_index': 4}, {'action_type': 'REVEAL_COLOR', 'target_offset': 1, 'color': 'R'}, {'action_type': 'REVEAL_COLOR', 'target_offset': 1, 'color': 'Y'}, {'action_type': 'REVEAL_COLOR', 'target_offset': 1, 'color': 'W'}, {'action_type': 'REVEAL_COLOR', 'target_offset': 1, 'color': 'B'}, {'action_type': 'REVEAL_RANK', 'target_offset': 1, 'rank': 0}, {'action_type': 'REVEAL_RANK', 'target_offset': 1, 'rank': 1}, {'action_type': 'REVEAL_RANK', 'target_offset': 1, 'rank': 3}]


In [None]:
def step_and_show(env, action):
  """Apply ``action`` to ``env`` and print the resulting state for inspection.

  Prints the debug summary, the legal moves (as ids and as dicts), either
  the end-of-game marker or the next player and last reward, and finally the
  running episode return.
  """
  env.step(action)
  print(env.debug_info())
  move_ids = env.legal_moves()
  move_dicts = env.legal_moves_dict()
  print(move_ids, move_dicts)
  # the `{expr=}` text is part of the printed output, so it is kept verbatim
  status_line = 'gameends' if env.done else f'{env.to_play=}, {env.reward=}'
  print(status_line)
  print(f'{env.record.episode_return()=}')

In [None]:
step_and_show(env0, 5)

Fireworks: B0,G0,R0,W0,Y1 life: 3 info: 8  Hand0 G3,W0,R2,Y4,Y0  Know0 ?_,?_,?_,?_,?_  Hand1 R0,Y3,Y0,B1,W1  Know1 ?_,?_,?_,?_,?_
[5, 6, 7, 8, 9, 10, 11, 12, 13, 15, 17, 18, 19] [{'action_type': 'PLAY', 'card_index': 0}, {'action_type': 'PLAY', 'card_index': 1}, {'action_type': 'PLAY', 'card_index': 2}, {'action_type': 'PLAY', 'card_index': 3}, {'action_type': 'PLAY', 'card_index': 4}, {'action_type': 'REVEAL_COLOR', 'target_offset': 1, 'color': 'R'}, {'action_type': 'REVEAL_COLOR', 'target_offset': 1, 'color': 'Y'}, {'action_type': 'REVEAL_COLOR', 'target_offset': 1, 'color': 'G'}, {'action_type': 'REVEAL_COLOR', 'target_offset': 1, 'color': 'W'}, {'action_type': 'REVEAL_RANK', 'target_offset': 1, 'rank': 0}, {'action_type': 'REVEAL_RANK', 'target_offset': 1, 'rank': 2}, {'action_type': 'REVEAL_RANK', 'target_offset': 1, 'rank': 3}, {'action_type': 'REVEAL_RANK', 'target_offset': 1, 'rank': 4}]
env.to_play=1, env.reward=1
env.record.episode_return()=1


## Agents and playing

In [None]:
class FCModel(torch.nn.Module):
  """fully connected neural networks serving as a building block of an agent

  Three linear layers with ReLU activations in between, mapping an
  observation vector of size ``obs_dim`` to one value per action
  (``action_dim`` outputs).
  """
  def __init__(self, obs_dim: int, action_dim: int, device, hidden_size=256):
    super().__init__()
    logging.info(f'model {obs_dim=}, {action_dim=}, {device=}')
    # device that inputs are moved to in forward(); the layers themselves are
    # not moved here, so the caller is responsible for model.to(device)
    self.device = device
    self.action_dim = action_dim
    self.fc = torch.nn.Sequential(
      torch.nn.Linear(obs_dim, hidden_size),
      torch.nn.ReLU(),
      torch.nn.Linear(hidden_size, hidden_size),
      torch.nn.ReLU(),
      torch.nn.Linear(hidden_size, action_dim),
    )

  def forward(self, obs: torch.Tensor) -> torch.Tensor:
    """return Q values of all actions given an observation as obs"""
    # use the device given at construction, not a module-level global
    return self.fc(obs.to(torch.float32).to(self.device))

### Intrinsic reward

#### Cooperative Agent

In [None]:
def calculate_cooperative_agent_reward(current_observation,next_observation, action_taken):
    """
    Calculate the intrinsic reward of a hint-favouring ("cooperative") agent.

    Playing or discarding costs 1. Giving a hint earns 1, plus 2 for every
    hinted card that is immediately playable, minus 0.5 for every card whose
    hinted colour/rank the target already knew. The change in information
    tokens between the two observations is then added.

    Parameters:
    current_observation (dict): The acting player's observation before the action was taken.
    next_observation (dict): An observation of the game after the action was taken
        (only 'information_tokens' is read).
    action_taken (int): The action id, looked up in
        current_observation['legal_moves_as_int'].

    Returns:
    float: The calculated intrinsic reward (0.0 if the action is not a legal move).
    """
    intrinsic_reward = 0.0
    set_reward = {'reveal_play': 2, 'repeat_reveal': -0.5}

    try:
        action_idx = current_observation['legal_moves_as_int'].index(action_taken)
    except ValueError:
        # Handle the case where action_id is not in legal_moves_as_int
        return intrinsic_reward
    action_dict = current_observation['legal_moves'][action_idx]
    action_type = action_dict['action_type']

    if action_type in ('PLAY', 'DISCARD'):
        intrinsic_reward -= 1
    elif action_type in ('REVEAL_COLOR', 'REVEAL_RANK'):
        # Reward for giving hints (cooperation). Keyed on the action type, not
        # on a fixed id range, so it holds for any player count / variant.
        intrinsic_reward += 1

        fireworks = current_observation['fireworks']
        hinted = 'color' if action_type == 'REVEAL_COLOR' else 'rank'
        hinted_value = action_dict[hinted]
        target_offset = action_dict['target_offset']
        target_hand = current_observation['observed_hands'][target_offset]
        target_knowledge = current_observation['card_knowledge'][target_offset]

        for card, known in zip(target_hand, target_knowledge):
            if known[hinted] == hinted_value:
                # the target already knew this about the card
                intrinsic_reward += set_reward['repeat_reveal']
            elif card[hinted] == hinted_value and card['rank'] == fireworks[card['color']]:
                # ranks are 0-based, so a card is playable when its rank
                # equals the number of cards already on its firework
                intrinsic_reward += set_reward['reveal_play']

    # Penalty or reward for information token usage
    info_token_difference = next_observation['information_tokens'] - \
                            current_observation['information_tokens']
    intrinsic_reward += info_token_difference

    return intrinsic_reward


#### Non Cooperative Agent

##### Dumb Agent

In [None]:
"""
Agents are rewarded for playing or discarding a card without checking if it is successful or not
"""
def calculate_dumb_reward(current_observation,next_observation, action_taken):
    """
    Calculate the intrinsic reward of the "dumb" agent.

    The agent earns 1 for any PLAY or DISCARD, regardless of whether the
    move was successful, and 0 for everything else.

    Parameters:
    current_observation (dict): The acting player's observation before the action was taken;
        only 'legal_moves_as_int' and 'legal_moves' are read.
    next_observation (dict): Unused; kept for a signature parallel to the other reward functions.
    action_taken (int): The action id, looked up in
        current_observation['legal_moves_as_int'].

    Returns:
    float: 1.0 for a PLAY or DISCARD, otherwise 0.0 (also 0.0 if the action is not legal).
    """
    try:
        action_idx = current_observation['legal_moves_as_int'].index(action_taken)
    except ValueError:
        # Handle the case where action_id is not in legal_moves_as_int
        return 0.0

    action_type = current_observation['legal_moves'][action_idx]['action_type']
    if action_type in ('PLAY', 'DISCARD'):
        return 1.0
    return 0.0


##### Intuitive Agent

In [None]:
"""
Agents are rewarded for using more intuition to play and discard cards
"""
def calculate_intuitive_reward(current_observation,next_observation, action_taken):
    """
    Calculate the intrinsic reward of the "intuitive" agent.

    Only PLAY and DISCARD are rewarded. The less the player knew about the
    chosen card, the larger the magnitude: 0.1, plus 0.45 if the colour was
    unknown, plus 0.45 if the rank was unknown. The sign is positive for a
    successful play or for discarding a card that was not needed next, and
    negative for a failed play or for discarding the card its firework
    needed next.

    Parameters:
    current_observation (dict): The acting player's observation before the action was taken.
    next_observation (dict): An observation of the game after the action was taken
        ('fireworks' is read for plays, 'discard_pile' for discards).
    action_taken (int): The action id, looked up in
        current_observation['legal_moves_as_int'].

    Returns:
    float: The calculated intrinsic reward (0.0 for hints and for illegal actions).
    """
    intrinsic_reward = 0.0
    set_reward = {'reward_successful_play': 1 ,'penalty_wrong_play': -1  ,'discard_useless': 1,'discard_useful': -1}

    try:
        action_idx = current_observation['legal_moves_as_int'].index(action_taken)
    except ValueError:
        # Handle the case where action_id is not in legal_moves_as_int
        return intrinsic_reward
    action_dict = current_observation['legal_moves'][action_idx]
    action_type = action_dict['action_type']

    if action_type not in ('PLAY', 'DISCARD'):
        # hint actions carry no 'card_index'; nothing to reward here
        return intrinsic_reward

    own_knowledge = current_observation['card_knowledge'][0]
    card_index = action_dict['card_index']
    if card_index >= len(own_knowledge):
        return intrinsic_reward  # Invalid card index

    # Reward for intuition: the less is known, the higher the score
    known = own_knowledge[card_index]
    intuition_score = 0.1
    if known['color'] is None:
        intuition_score += 0.45
    if known['rank'] is None:
        intuition_score += 0.45

    fireworks = current_observation['fireworks']
    if action_type == 'PLAY':
        # a successful play raises the total of the fireworks
        fireworks_progress = sum(next_observation['fireworks'].values()) - \
                             sum(fireworks.values())
        outcome = 'reward_successful_play' if fireworks_progress > 0 else 'penalty_wrong_play'
    else:
        # the player's own knowledge may be incomplete, so read the real card
        # from the top of the discard pile in the next observation
        discarded = next_observation['discard_pile'][-1]
        # ranks are 0-based: the card a firework needs next has
        # rank == number of cards already on that firework
        needed_next = discarded['rank'] == fireworks[discarded['color']]
        outcome = 'discard_useful' if needed_next else 'discard_useless'

    intrinsic_reward += intuition_score * set_reward[outcome]
    return intrinsic_reward


##### Certain Agent

In [None]:
"""
Agents are rewarded for playing or discarding fully known cards
"""
def calculate_certain_reward(current_observation,next_observation, action_taken):
    """
    Calculate the intrinsic reward of the "certain" agent.

    Only PLAY and DISCARD are rewarded. The more the player knew about the
    chosen card, the larger the magnitude: 0.1, plus 0.45 if the colour was
    known, plus 0.45 if the rank was known. The sign is positive for a
    successful play or for discarding a card that was not needed next, and
    negative for a failed play or for discarding the card its firework
    needed next.

    Parameters:
    current_observation (dict): The acting player's observation before the action was taken.
    next_observation (dict): An observation of the game after the action was taken
        ('fireworks' is read for plays, 'discard_pile' for discards).
    action_taken (int): The action id, looked up in
        current_observation['legal_moves_as_int'].

    Returns:
    float: The calculated intrinsic reward (0.0 for hints and for illegal actions).
    """
    intrinsic_reward = 0.0
    set_reward = {'reward_successful_play': 1 ,'penalty_wrong_play': -1  ,'discard_useless': 1,'discard_useful': -1}

    try:
        action_idx = current_observation['legal_moves_as_int'].index(action_taken)
    except ValueError:
        # Handle the case where action_id is not in legal_moves_as_int
        return intrinsic_reward
    action_dict = current_observation['legal_moves'][action_idx]
    action_type = action_dict['action_type']

    if action_type not in ('PLAY', 'DISCARD'):
        return intrinsic_reward

    own_knowledge = current_observation['card_knowledge'][0]
    card_index = action_dict['card_index']
    if card_index >= len(own_knowledge):
        return intrinsic_reward  # Invalid card index

    # Reward for card certainty: the more is known, the higher the score
    known = own_knowledge[card_index]
    certainty_score = 0.1
    if known['color'] is not None:
        certainty_score += 0.45
    if known['rank'] is not None:
        certainty_score += 0.45

    fireworks = current_observation['fireworks']
    if action_type == 'PLAY':
        # a successful play raises the total of the fireworks
        fireworks_progress = sum(next_observation['fireworks'].values()) - \
                             sum(fireworks.values())
        outcome = 'reward_successful_play' if fireworks_progress > 0 else 'penalty_wrong_play'
    else:
        # the player's own knowledge may be incomplete, so read the real card
        # from the top of the discard pile in the next observation
        discarded = next_observation['discard_pile'][-1]
        # ranks are 0-based: the card a firework needs next has
        # rank == number of cards already on that firework
        needed_next = discarded['rank'] == fireworks[discarded['color']]
        outcome = 'discard_useful' if needed_next else 'discard_useless'

    intrinsic_reward += certainty_score * set_reward[outcome]
    return intrinsic_reward


##### Certain Agent with strategic discard

In [None]:
def calculate_certain_strategic_reward(current_observation,next_observation, action_taken):
    """
    Calculate the intrinsic reward of the "certain" agent with strategic discards.

    PLAY: magnitude 0.1 (+0.45 if the colour was known, +0.45 if the rank
    was known); positive when the fireworks total grew, negative otherwise.

    DISCARD: magnitude 0.1 (+0.3 if the colour was known, +0.3 if the rank
    was known, +0.3 more if the fully known card was already played; set to
    1 if only the rank was known and every firework is already past it);
    negative if the discarded card was the one its firework needed next,
    positive otherwise.

    For every legal action the change in information tokens between the two
    observations is added.

    Parameters:
    current_observation (dict): The acting player's observation before the action was taken.
    next_observation (dict): An observation of the game after the action was taken.
    action_taken (int): The action id, looked up in
        current_observation['legal_moves_as_int'].

    Returns:
    float: The calculated intrinsic reward (0.0 if the action is not legal).
    """
    intrinsic_reward = 0.0
    set_reward = {'reward_successful_play': 1 ,'penalty_wrong_play': -1  ,'discard_useless': 1,'discard_useful': -1}

    try:
        action_idx = current_observation['legal_moves_as_int'].index(action_taken)
    except ValueError:
        # Handle the case where action_id is not in legal_moves_as_int
        return intrinsic_reward
    action_dict = current_observation['legal_moves'][action_idx]
    action_type = action_dict['action_type']

    fireworks = current_observation['fireworks']
    own_knowledge = current_observation['card_knowledge'][0]

    # Reward for certainty / progress in fireworks and successful play
    if action_type == 'PLAY':
        card_index = action_dict['card_index']
        if card_index >= len(own_knowledge):
            return intrinsic_reward  # Invalid card index

        known = own_knowledge[card_index]
        certainty_score = 0.1
        if known['color'] is not None:
            certainty_score += 0.45
        if known['rank'] is not None:
            certainty_score += 0.45

        # a successful play raises the total of the fireworks
        fireworks_progress = sum(next_observation['fireworks'].values()) - \
                             sum(fireworks.values())
        if fireworks_progress > 0:
            intrinsic_reward += certainty_score * set_reward['reward_successful_play']
        else:
            intrinsic_reward += certainty_score * set_reward['penalty_wrong_play']

    # Reward for strategic discard
    elif action_type == 'DISCARD':
        known = own_knowledge[action_dict['card_index']]
        color = known['color']
        rank = known['rank']

        certainty_score = 0.1
        if color is not None:
            certainty_score += 0.3
        if rank is not None:
            certainty_score += 0.3

        # Special cases where the card is known to be safe to discard.
        # Ranks are 0-based, so a firework showing n has consumed ranks 0..n-1.
        if rank is not None and color is not None and rank < fireworks[color]:
            # fully known card that has already been played
            certainty_score += 0.3
        elif rank is not None and color is None and rank < min(fireworks.values()):
            # only the rank is known, but every colour is already past it
            certainty_score = 1

        # the true identity of the card is on top of the discard pile
        discarded = next_observation['discard_pile'][-1]
        if discarded['rank'] == fireworks[discarded['color']]:
            # this was the card its firework needed next
            intrinsic_reward += certainty_score * set_reward['discard_useful']
        else:
            intrinsic_reward += certainty_score * set_reward['discard_useless']

    # Penalty or reward for information token usage
    info_token_difference = next_observation['information_tokens'] - \
                            current_observation['information_tokens']
    intrinsic_reward += info_token_difference

    return intrinsic_reward


#### Balanced Agent

In [None]:
def calculate_balanced_reward(current_observation,next_observation, action_taken):
    """
    Calculate the intrinsic reward of the "balanced" agent.

    Hints: +0.2 for hinting at all, +0.2 per hinted card that is immediately
    playable, -0.2 per card whose hinted colour/rank was already known.

    PLAY: +0.2 when the fireworks total grew, -0.2 otherwise.

    DISCARD: +/-0.2 scaled by a certainty score of 0.1 (+0.3 if the colour
    was known, +0.3 if the rank was known, +0.3 more if the fully known card
    was already played; set to 1 if only the rank was known and every
    firework is already past it). The sign is negative when the discarded
    card was the one its firework needed next.

    For every legal action the change in information tokens between the two
    observations is added.

    Parameters:
    current_observation (dict): The acting player's observation before the action was taken.
    next_observation (dict): An observation of the game after the action was taken.
    action_taken (int): The action id, looked up in
        current_observation['legal_moves_as_int'].

    Returns:
    float: The calculated intrinsic reward (0.0 if the action is not legal).
    """
    intrinsic_reward = 0.0
    set_reward = {'reward_successful_play': 0.2 ,'penalty_wrong_play': -0.2  ,'discard_useless': 0.2,'discard_useful': -0.2,
                  'reveal_play': 0.2,'repeat_reveal': -0.2}

    try:
        action_idx = current_observation['legal_moves_as_int'].index(action_taken)
    except ValueError:
        # Handle the case where action_id is not in legal_moves_as_int
        return intrinsic_reward
    action_dict = current_observation['legal_moves'][action_idx]
    action_type = action_dict['action_type']

    fireworks = current_observation['fireworks']
    card_knowledge = current_observation['card_knowledge']

    # Reward for giving hints (cooperation)
    if action_type in ('REVEAL_COLOR', 'REVEAL_RANK'):
        # Flat bonus for hinting. Keyed on the action type, not on a fixed id
        # range, so it holds for any player count / variant.
        intrinsic_reward += set_reward['reveal_play']

        hinted = 'color' if action_type == 'REVEAL_COLOR' else 'rank'
        hinted_value = action_dict[hinted]
        target_offset = action_dict['target_offset']
        target_hand = current_observation['observed_hands'][target_offset]
        target_knowledge = card_knowledge[target_offset]

        for card, known in zip(target_hand, target_knowledge):
            if known[hinted] == hinted_value:
                # the target already knew this about the card
                intrinsic_reward += set_reward['repeat_reveal']
            elif card[hinted] == hinted_value and card['rank'] == fireworks[card['color']]:
                # ranks are 0-based, so a card is playable when its rank
                # equals the number of cards already on its firework
                intrinsic_reward += set_reward['reveal_play']

    # Reward for progress in fireworks and successful play
    elif action_type == 'PLAY':
        if action_dict['card_index'] >= len(card_knowledge[0]):
            return intrinsic_reward  # Invalid card index

        # NOTE(review): play rewards are flat, i.e. not scaled by how much was
        # known about the card (unlike discards) -- confirm this is intended.
        fireworks_progress = sum(next_observation['fireworks'].values()) - \
                             sum(fireworks.values())
        if fireworks_progress > 0:
            intrinsic_reward += set_reward['reward_successful_play']
        else:
            intrinsic_reward += set_reward['penalty_wrong_play']

    # Reward for strategic discard
    elif action_type == 'DISCARD':
        known = card_knowledge[0][action_dict['card_index']]
        color = known['color']
        rank = known['rank']

        certainty_score = 0.1
        if color is not None:
            certainty_score += 0.3
        if rank is not None:
            certainty_score += 0.3

        # Special cases where the card is known to be safe to discard.
        # Ranks are 0-based, so a firework showing n has consumed ranks 0..n-1.
        if rank is not None and color is not None and rank < fireworks[color]:
            # fully known card that has already been played
            certainty_score += 0.3
        elif rank is not None and color is None and rank < min(fireworks.values()):
            # only the rank is known, but every colour is already past it
            certainty_score = 1

        # the true identity of the card is on top of the discard pile
        discarded = next_observation['discard_pile'][-1]
        if discarded['rank'] == fireworks[discarded['color']]:
            # this was the card its firework needed next
            intrinsic_reward += certainty_score * set_reward['discard_useful']
        else:
            intrinsic_reward += certainty_score * set_reward['discard_useless']

    # Penalty or reward for information token usage
    info_token_difference = next_observation['information_tokens'] - \
                            current_observation['information_tokens']
    intrinsic_reward += info_token_difference

    return intrinsic_reward


## Evaluation

In [None]:
def play_by_model(env, model: FCModel, agent_class, *, epsilon=0, games=100, record_out=None):
  """evaluate the playing performance of a given model with agent_class

  Plays ``games`` complete games. In each game every seat is filled by a
  fresh ``agent_class(seat, model)``; the agent whose turn it is picks the
  action with ``epsilon`` and the others are only shown the state.

  Args:
    env: the HanabiEnv to play in (it is reset at the start of every game).
    model: the model handed to every agent; switched to eval mode.
    agent_class: callable ``(agent_id, model)`` returning an object with ``act``.
    epsilon: exploration value passed to the acting agent.
    games: number of games to play.
    record_out: optional list; the GameRecord of every game is appended to it.

  Returns:
    A tuple of four lists with one entry per game: the summed intrinsic
    (balanced) reward, the final fireworks total, the fraction of steps that
    were hints, and the fraction of steps that were plays/discards.
  """
  N = env.N
  episode_returns = []
  # env.reset() swaps in a brand-new GameRecord each game, so per-game
  # statistics are collected here instead of being read back from env.record
  hanabi_scores = []
  cooperation_rates = []
  play_discard_rates = []

  model.eval()

  for _game in range(games):
    intrinsic_rewards = []
    hint_steps = 0
    play_discard_steps = 0
    steps = 0
    hanabi_score = 0
    env.reset()
    agents = [agent_class(seat, model) for seat in range(N)]
    while not env.done:
      steps += 1
      # Get the current observation
      current_observation = env.obs(env.to_play)
      # choose an action
      for agent_id, agent in enumerate(agents):
        if env.to_play == agent_id:
          action = agent.act(env, epsilon)
        else:
          agent.act(env)  # observation to be utilized future

      env.step(action)

      # keep track of action nature of the agent; step() succeeded, so the
      # action is guaranteed to be among the pre-step legal moves
      move_pos = current_observation['legal_moves_as_int'].index(action)
      move_type = current_observation['legal_moves'][move_pos]['action_type']
      if move_type in ('PLAY', 'DISCARD'):
        play_discard_steps += 1
      else:
        hint_steps += 1

      # save intrinsic reward
      next_observation = env.obs(env.to_play)
      env.intrinsic_reward = calculate_balanced_reward(current_observation, next_observation, action)
      hanabi_score = sum(next_observation['fireworks'].values())
      intrinsic_rewards.append(env.intrinsic_reward)

    episode_returns.append(sum(intrinsic_rewards))

    # save final score / cooperation rate and play/discard rate
    denominator = max(steps, 1)  # a game that ended without any step counts as rate 0
    cooperation = hint_steps / denominator
    play_discard = play_discard_steps / denominator
    hanabi_scores.append(hanabi_score)
    cooperation_rates.append(cooperation)
    play_discard_rates.append(play_discard)
    # also keep the figures on this game's own record
    env.record.hanabi_scores.append(hanabi_score)
    env.record.cooperation_rate.append(cooperation)
    env.record.play_discard_rate.append(play_discard)
    if record_out is not None:
      record_out.append(env.record)
  return episode_returns, hanabi_scores, cooperation_rates, play_discard_rates

## DQN and variants

see also pytorch tutorial https://pytorch.org/tutorials/intermediate/reinforcement_q_learning.html

In [None]:
# Torch device name for this notebook ('cpu' here). Presumably also the value
# passed as `device` when constructing FCModel -- verify at the call site.
device = 'cpu'

In [None]:
def rows2cols(rows):
  """Transpose a list of equally long rows into a list of columns.

  The number of columns is taken from the first row.
  """
  width = len(rows[0])
  columns = []
  for col in range(width):
    columns.append([row[col] for row in rows])
  return columns

In [None]:
class ReplayBuffer:
  """buffer to keep past experiences as a sequence of transitions (s,a,r,s',done)

  Holds at most ``limit`` transitions. Once full, new transitions overwrite
  stored ones in ring order, starting from index 0.
  """
  def __init__(self, limit: int=100_000):
    self.limit = limit
    self.cur = 0        # next slot to overwrite once the buffer is full
    self.buffer = []

  def add(self, transition):
    """Store ``transition``, overwriting an old entry when the buffer is full."""
    if len(self.buffer) < self.limit:
      self.buffer.append(transition)
      return
    slot = self.cur
    self.buffer[slot] = transition
    self.cur = (slot + 1) % self.limit

  def __getitem__(self, idx: int):
    return self.buffer[idx]

  def __len__(self):
    return len(self.buffer)

  def sample(self, k: int=1):
    """Draw ``k`` distinct transitions at random and return them column-wise."""
    picked = random.sample(self.buffer, k)
    return rows2cols(picked)

In [None]:
def train_minibatch(model, optimizer, criterion, target_model, samples, gamma):
  """Run one optimisation step of ``model`` on a minibatch of transitions.

  The target for each transition is its reward, plus ``gamma`` times the
  largest value ``target_model`` assigns to the successor state when the
  transition has a successor.

  Args:
    model: network being trained; called on the batch of states.
    optimizer: optimizer over ``model``'s parameters.
    criterion: loss taking (predicted, target), both of shape (batch, 1).
    target_model: network used to evaluate successor states (no gradients).
    samples: five parallel sequences
      (states, actions, rewards, succ_states, has_successors).
    gamma: discount factor.
  """
  states, actions, rewards, succ_states, has_successors = samples

  # Work on the device the model lives on instead of relying on a
  # module-level `device` global.
  first_param = next(model.parameters(), None)
  dev = first_param.device if first_param is not None else torch.device('cpu')

  optimizer.zero_grad()
  q_values = model(torch.Tensor(np.array(states)).to(dev))
  action_index = torch.Tensor(actions).to(torch.int64).unsqueeze(-1)
  # keep only the value of the action that was actually taken
  q_values = q_values.gather(1, action_index.to(dev))

  target_qs = torch.Tensor(rewards).to(dev)
  # the mask must live on the same device as the tensors it indexes
  successor_mask = torch.tensor(has_successors, dtype=torch.bool, device=dev)
  with torch.no_grad():
    target_q_all = target_model(torch.Tensor(np.array(succ_states)).to(dev))
    target_qs[successor_mask] += gamma * target_q_all[successor_mask].max(1).values

  loss = criterion(q_values, target_qs.unsqueeze(-1))
  loss.backward()
  torch.nn.utils.clip_grad_value_(model.parameters(), 10)
  optimizer.step()

In [None]:
def train_agents(env, buffer, model, optimizer, target_model, step_limit,
                 agent_class,
                 eps_threshold=0.5, batch_size=1, gamma=0.99):
  """Play ``step_limit`` environment steps while training ``model``.

  Every step stores one transition
  (state before, action, intrinsic reward, state after, not done) in
  ``buffer``; the stored reward is the balanced intrinsic reward only, not
  the environment reward. Every ``batch_size`` steps, once the buffer holds
  at least ``batch_size`` transitions, one minibatch is sampled and trained.

  The environment is reset on entry and after every finished game, and a
  fresh set of agents is created for each game.

  Returns:
    (episode_returns, hanabi_scores): for every game that finished within
    the step budget, the summed intrinsic reward and the final fireworks
    total. Both lists are empty if no game finished.
  """
  criterion = torch.nn.MSELoss()
  N = env.N
  episode_returns = []
  hanabi_scores=[]
  intrinsic_rewards=[]
  # NOTE(review): steps / cooperation_rate / play_discard_rate are updated and
  # reset below but never returned or logged (only a commented-out print uses them).
  steps=0
  cooperation_rate=0
  play_discard_rate=0
  agents = [agent_class(_, model) for _ in range(N)]
  env.reset()

  target_model.eval()           # no training for this model
  model.train()

  for t in range(step_limit):
    steps+=1
    # Get the current observation
    current_observation = env.obs(env.to_play)


    # choose an action
    for agent_id, agent in enumerate(agents):
      if env.to_play == agent_id:
        action = agent.act(env, eps_threshold)

      else:
        agent.act(env)  # observation to be utilized future
      #keep track of action nature of the agent
    # ids 0..9 are counted as play/discard, everything else as a hint
    # (presumably matches the 2-player Hanabi-Full action layout -- verify)
    if 0 <= action <=9:
        play_discard_rate+=1
    else:
        cooperation_rate+=1

    # state as seen by the acting player, captured before the step
    prev_local_state = agents[env.to_play].make_observation(env)

    env.step(action)
    # Get the new observation (env.to_play now refers to the player to move next)
    next_observation = env.obs(env.to_play)
     # Calculate intrinsic reward
    env.intrinsic_reward = calculate_balanced_reward(current_observation,next_observation, action)

    #save intrinsic rewards
    intrinsic_rewards.append(env.intrinsic_reward)
    # The bare string below is a disabled variant that would also add the
    # environment reward to the stored transition.
    """
    total_reward = env.reward + env.intrinsic_reward
    transition = (prev_local_state, action, total_reward,
                  agents[env.to_play].make_observation(env),
                  not env.done)
    """
    # the successor state is built by the agent of the player who moves next
    transition = (prev_local_state, action, env.intrinsic_reward,
                  agents[env.to_play].make_observation(env), not env.done)
    buffer.add(transition)

    if env.done:

      #score of the game at the end
      hanabi_score=sum(env.obs(env.to_play)['fireworks'].values())
      hanabi_scores.append(hanabi_score)

      #print(cooperation_rate/steps,play_discard_rate/steps)


    # prepare new game

      #episode_returns.append(env.record.episode_return())
      episode_returns.append(sum(intrinsic_rewards))
      intrinsic_rewards=[]
      env.reset()
      agents = [agent_class(_, model) for _ in range(N)]
      steps=0
      cooperation_rate=0
      play_discard_rate=0

    # learning: one minibatch every batch_size steps once enough data exists
    if len(buffer) >= batch_size and t % batch_size == 0:
      train_minibatch(model, optimizer, criterion, target_model,
                      buffer.sample(batch_size), gamma)

  return episode_returns,hanabi_scores

In [None]:
def _mean_or_nan(values):
  """Return the arithmetic mean of `values`, or NaN when `values` is empty."""
  count = len(values)
  return sum(values) / count if count else float('nan')


def _soft_update(model, target_model, tau):
  """Blend the online weights into the target: target <- tau*model + (1-tau)*target."""
  parameters = model.state_dict()
  target_parameters = target_model.state_dict()
  for key in parameters:
    target_parameters[key] = parameters[key]*tau + target_parameters[key]*(1-tau)
  target_model.load_state_dict(target_parameters)


def train_main(env, model, target_model, optimizer, agent_class,
               *, buffer=None, interval=8000, batch_size=64,
               eps_start=0.9, eps_end=0.1, eps_decay=10,
               repeat=200, eval_games=2, tau = 0.25):
  """Alternate training, target-network update and greedy evaluation.

  Each of the `repeat` iterations:
    1. computes an exponentially decayed epsilon from `eps_start` towards
       `eps_end` (time constant `eps_decay`, measured in iterations),
    2. calls `train_agents` with `interval` as its step limit,
    3. soft-updates `target_model` towards `model` with mixing factor `tau`,
    4. evaluates the model with `play_by_model` over `eval_games` games.

  A fresh `ReplayBuffer` is created when `buffer` is None.

  Returns a 6-tuple of lists with one entry per iteration:
    (training_returns, evaluation_returns, epsilons,
     hanabi_scores, cooperation_scores, play_discard_scores)
  where `hanabi_scores` holds the best evaluation score of each iteration.
  Averages and maxima over an empty result list (e.g. no episode finished
  within `interval` steps) are recorded as NaN instead of raising.
  """
  if buffer is None:
    buffer = ReplayBuffer()
  nan = float('nan')
  training_returns = []
  evaluation_returns = []
  epsilons = []
  hanabi_scores=[]
  cooperation_scores=[]
  play_discard_scores=[]

  for tt in range(repeat):
    # exponential decay of the exploration rate
    eps = eps_end + (eps_start - eps_end) * math.exp(-1. * tt / eps_decay)
    epsilons.append(eps)

    returns,train_scores= train_agents(env, buffer, model, optimizer, target_model, interval,
                           eps_threshold=eps, batch_size=batch_size,
                           agent_class=agent_class)
    # average training return; NaN when no episode completed in this interval
    mean = _mean_or_nan(returns)
    training_returns.append(mean)

    # best Hanabi score seen during training (only logged)
    train_best_score=max(train_scores, default=nan)

    # soft update of target_model
    _soft_update(model, target_model, tau)

    # evaluation
    ereturns,eval_scores,cooperation_rates,play_discard_rates= play_by_model(
        env, model, games=eval_games, agent_class=agent_class)
    print(cooperation_rates,play_discard_rates)
    # average evaluation return
    eval_mean = _mean_or_nan(ereturns)
    evaluation_returns.append(eval_mean)

    # best Hanabi score in evaluation
    eval_best_score=max(eval_scores, default=nan)
    hanabi_scores.append(eval_best_score)

    # cooperation / play-discard action ratios, averaged over the evaluation games
    cooperation_mean=_mean_or_nan(cooperation_rates)
    play_discard_mean=_mean_or_nan(play_discard_rates)
    cooperation_scores.append(cooperation_mean)
    play_discard_scores.append(play_discard_mean)

    logging.info(f'{tt=} {mean=:5.2f} {eps=:5.2f} {eval_mean=:5.2f} {train_best_score=:5.2f} {eval_best_score=:5.2f}')
  return training_returns, evaluation_returns, epsilons, hanabi_scores,cooperation_scores,play_discard_scores

In [None]:
    # evaluation
def eval(env,model,agent_class,eval_games=2):
  """Evaluate `model` over `eval_games` games played via `play_by_model`.

  Prints the per-game cooperation / play-discard ratios and returns, logs
  the summary statistics, and returns `(evaluation_returns, eval_scores)`:
  a one-element list holding the mean evaluation return, and the scores
  reported by `play_by_model` for this evaluation run.

  NOTE(review): the name shadows the builtin `eval`; it is kept because
  existing cells call it under this name.
  """
  ereturns, eval_scores, cooperation_rates, play_discard_rates = play_by_model(
      env, model, games=eval_games, agent_class=agent_class)
  print(cooperation_rates,play_discard_rates)
  print(ereturns)

  # average evaluation return
  eval_mean = sum(ereturns) / len(ereturns)
  evaluation_returns = [eval_mean]

  # best Hanabi score in evaluation
  eval_best_score=max(eval_scores)

  # cooperation / play-discard action ratios
  cooperation_mean=sum(cooperation_rates) / len(cooperation_rates)
  play_discard_mean=sum(play_discard_rates) / len(play_discard_rates)

  logging.info(f'{eval_mean=:5.2f} {eval_best_score=:5.2f} {cooperation_mean=:5.2f} {play_discard_mean=:5.2f}')
  # Return the scores of THIS run. The previous code returned `hanabi_scores`,
  # which is not defined in this function and resolved to a stale global
  # (or raised NameError when none existed).
  return evaluation_returns, eval_scores

### fair agent

In [None]:
class FairAgent:
  """Epsilon-greedy Q-learning agent that uses only the env's vector observation."""

  def __init__(self, agent_id: int, model):
    self.model = model
    self.agent_id = agent_id

  def make_observation(self, env):
    """Return `env.vector_obs()` as a NumPy array."""
    return np.array(env.vector_obs())

  def act(self, env, epsilon=0) -> int:
    """Choose an action for the current turn.

    Returns None when it is not this agent's turn. Otherwise, with
    probability `epsilon` a uniformly random legal move is returned;
    else the legal move with the highest Q-value predicted by the model.
    """
    if env.to_play != self.agent_id:
      return None
    legal_moves = env.legal_moves()
    if random.random() < epsilon:
      return random.choice(legal_moves)
    with torch.no_grad():
      obs = self.make_observation(env)
      batch = obs[np.newaxis, :]       # make a single-element batch
      qs = self.model(torch.from_numpy(batch)).detach().to('cpu')
      qs = qs[0]  # retrieve the first element in the batch
      # Mask illegal moves with -inf rather than shifting by the minimum and
      # zeroing: with the old scheme a legal move holding the minimum Q-value
      # tied with every illegal move at 0, so argmax could pick an illegal one.
      legal = torch.as_tensor(legal_moves, dtype=torch.int64)
      masked = torch.full_like(qs, float('-inf'))
      masked[legal] = qs[legal]
    return masked.argmax().item()  # returns the action as a plain int

#### experiments

In [None]:
# Choose the compute backend: CUDA when present, otherwise Apple MPS, otherwise CPU.
if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"
#device = 'cpu'
print(f"Using {device} device")

Using cuda device


In [None]:
# Fair-agent experiment on a default HanabiEnv: the online and target
# networks are built with identical dimensions.
env = HanabiEnv()
individual_obs_dim = len(env.vector_obs())
action_dim = 20  # ?
model = FCModel(individual_obs_dim, action_dim, device).to(device)
target_model = FCModel(individual_obs_dim, action_dim, device).to(device)
# optimizer = torch.optim.SGD(model.parameters(), lr=0.001, momentum=0.9)
optimizer = torch.optim.AdamW(model.parameters())

buffer = ReplayBuffer()
interval = 100000
(
    train_r, eval_r, eps,
    hanabi_scores, cooperation_scores, play_discard_scores,
) = train_main(
    env, model, target_model, optimizer,
    agent_class=FairAgent,
    buffer=buffer,
    interval=interval,
    repeat=300,
    eps_decay=50,
)
# Final evaluation of the trained model over 200 games.
eval_results, hanabi_scores = eval(env, model, agent_class=FairAgent, eval_games=200)


2024-01-04 06:09:23,980 INFO model obs_dim=658, action_dim=20, device='cuda'
2024-01-04 06:09:23,987 INFO model obs_dim=658, action_dim=20, device='cuda'
2024-01-04 06:10:50,758 INFO tt=0 mean=-0.86 eps= 0.90 eval_mean=-0.40 train_best_score= 9.00 eval_best_score= 0.00


[0.0] [1.0]


2024-01-04 06:12:16,511 INFO tt=1 mean=-0.71 eps= 0.88 eval_mean=-0.60 train_best_score=11.00 eval_best_score= 0.00


[0.0] [1.0]


2024-01-04 06:13:45,887 INFO tt=2 mean=-0.65 eps= 0.87 eval_mean=-0.60 train_best_score= 9.00 eval_best_score= 0.00


[0.0] [1.0]


2024-01-04 06:15:14,906 INFO tt=3 mean=-0.53 eps= 0.85 eval_mean=-0.60 train_best_score= 9.00 eval_best_score= 0.00


[0.0] [1.0]


2024-01-04 06:16:45,358 INFO tt=4 mean=-0.45 eps= 0.84 eval_mean=-0.50 train_best_score=10.00 eval_best_score= 1.00


[0.0] [1.0]


2024-01-04 06:18:16,484 INFO tt=5 mean=-0.40 eps= 0.82 eval_mean= 0.50 train_best_score= 9.00 eval_best_score= 0.00


[0.2] [0.8]


2024-01-04 06:19:47,524 INFO tt=6 mean=-0.30 eps= 0.81 eval_mean= 0.10 train_best_score=10.00 eval_best_score= 1.00


[0.0] [1.0]


2024-01-04 06:21:20,107 INFO tt=7 mean=-0.23 eps= 0.80 eval_mean= 0.02 train_best_score= 9.00 eval_best_score= 0.00


[0.2] [0.8]


2024-01-04 06:22:54,636 INFO tt=8 mean=-0.20 eps= 0.78 eval_mean= 9.78 train_best_score= 8.00 eval_best_score= 2.00


[0.47435897435897434] [0.5256410256410257]


2024-01-04 06:24:28,419 INFO tt=9 mean=-0.13 eps= 0.77 eval_mean=12.45 train_best_score= 9.00 eval_best_score= 5.00


[0.4533333333333333] [0.5466666666666666]


2024-01-04 06:26:02,709 INFO tt=10 mean=-0.05 eps= 0.75 eval_mean=11.95 train_best_score=11.00 eval_best_score= 0.00


[0.4875] [0.5125]


2024-01-04 06:27:38,590 INFO tt=11 mean=-0.01 eps= 0.74 eval_mean=13.27 train_best_score= 7.00 eval_best_score= 1.00


[0.4810126582278481] [0.5189873417721519]


2024-01-04 06:29:14,164 INFO tt=12 mean= 0.06 eps= 0.73 eval_mean=12.67 train_best_score= 9.00 eval_best_score= 5.00


[0.4533333333333333] [0.5466666666666666]


2024-01-04 06:30:51,564 INFO tt=13 mean= 0.08 eps= 0.72 eval_mean=10.94 train_best_score= 9.00 eval_best_score= 0.00


[0.4875] [0.5125]


2024-01-04 06:32:28,327 INFO tt=14 mean= 0.10 eps= 0.70 eval_mean=13.17 train_best_score= 8.00 eval_best_score= 1.00


[0.4810126582278481] [0.5189873417721519]


2024-01-04 06:34:06,820 INFO tt=15 mean= 0.22 eps= 0.69 eval_mean=11.83 train_best_score=11.00 eval_best_score= 2.00


[0.47435897435897434] [0.5256410256410257]


2024-01-04 06:35:45,210 INFO tt=16 mean= 0.24 eps= 0.68 eval_mean=10.32 train_best_score= 9.00 eval_best_score= 4.00


[0.4605263157894737] [0.5394736842105263]


2024-01-04 06:37:23,589 INFO tt=17 mean= 0.30 eps= 0.67 eval_mean=10.91 train_best_score=13.00 eval_best_score= 1.00


[0.4810126582278481] [0.5189873417721519]


2024-01-04 06:39:03,291 INFO tt=18 mean= 0.34 eps= 0.66 eval_mean=12.23 train_best_score= 8.00 eval_best_score= 0.00


[0.4875] [0.5125]


2024-01-04 06:40:48,005 INFO tt=19 mean= 0.39 eps= 0.65 eval_mean= 9.12 train_best_score= 9.00 eval_best_score= 3.00


[0.4675324675324675] [0.5324675324675324]


2024-01-04 06:42:30,103 INFO tt=20 mean= 0.44 eps= 0.64 eval_mean=12.15 train_best_score= 8.00 eval_best_score= 2.00


[0.47435897435897434] [0.5256410256410257]


2024-01-04 06:44:11,184 INFO tt=21 mean= 0.46 eps= 0.63 eval_mean=11.88 train_best_score= 8.00 eval_best_score= 1.00


[0.4810126582278481] [0.5189873417721519]


2024-01-04 06:45:53,387 INFO tt=22 mean= 0.50 eps= 0.62 eval_mean= 9.15 train_best_score= 9.00 eval_best_score= 2.00


[0.47435897435897434] [0.5256410256410257]


2024-01-04 06:47:36,282 INFO tt=23 mean= 0.53 eps= 0.61 eval_mean=10.99 train_best_score=11.00 eval_best_score= 1.00


[0.4810126582278481] [0.5189873417721519]


2024-01-04 06:49:18,995 INFO tt=24 mean= 0.57 eps= 0.60 eval_mean= 6.64 train_best_score= 9.00 eval_best_score= 1.00


[0.4810126582278481] [0.5189873417721519]


2024-01-04 06:51:02,603 INFO tt=25 mean= 0.61 eps= 0.59 eval_mean=10.85 train_best_score= 9.00 eval_best_score= 2.00


[0.4696969696969697] [0.5303030303030303]


2024-01-04 06:52:46,232 INFO tt=26 mean= 0.63 eps= 0.58 eval_mean=11.44 train_best_score= 9.00 eval_best_score= 1.00


[0.4810126582278481] [0.5189873417721519]


2024-01-04 06:54:30,843 INFO tt=27 mean= 0.67 eps= 0.57 eval_mean= 9.00 train_best_score=10.00 eval_best_score= 1.00


[0.4810126582278481] [0.5189873417721519]


2024-01-04 06:56:15,742 INFO tt=28 mean= 0.70 eps= 0.56 eval_mean=10.40 train_best_score=10.00 eval_best_score= 3.00


[0.4675324675324675] [0.5324675324675324]


2024-01-04 06:58:01,119 INFO tt=29 mean= 0.72 eps= 0.55 eval_mean=12.56 train_best_score= 9.00 eval_best_score= 1.00


[0.4810126582278481] [0.5189873417721519]


2024-01-04 06:59:47,959 INFO tt=30 mean= 0.77 eps= 0.54 eval_mean= 9.80 train_best_score= 9.00 eval_best_score= 1.00


[0.4810126582278481] [0.5189873417721519]


2024-01-04 07:01:33,465 INFO tt=31 mean= 0.75 eps= 0.53 eval_mean=10.99 train_best_score= 9.00 eval_best_score= 2.00


[0.47435897435897434] [0.5256410256410257]


2024-01-04 07:03:20,029 INFO tt=32 mean= 0.81 eps= 0.52 eval_mean=11.55 train_best_score= 9.00 eval_best_score= 3.00


[0.4675324675324675] [0.5324675324675324]


2024-01-04 07:05:08,370 INFO tt=33 mean= 0.84 eps= 0.51 eval_mean= 9.91 train_best_score=10.00 eval_best_score= 2.00


[0.47435897435897434] [0.5256410256410257]


2024-01-04 07:06:57,050 INFO tt=34 mean= 0.85 eps= 0.51 eval_mean=13.45 train_best_score=10.00 eval_best_score= 1.00


[0.4810126582278481] [0.5189873417721519]


2024-01-04 07:08:45,056 INFO tt=35 mean= 0.89 eps= 0.50 eval_mean= 9.17 train_best_score= 9.00 eval_best_score= 2.00


[0.4810126582278481] [0.5189873417721519]


2024-01-04 07:10:34,200 INFO tt=36 mean= 0.91 eps= 0.49 eval_mean= 8.56 train_best_score=11.00 eval_best_score= 1.00


[0.4810126582278481] [0.5189873417721519]


2024-01-04 07:12:23,128 INFO tt=37 mean= 0.94 eps= 0.48 eval_mean=11.23 train_best_score=10.00 eval_best_score= 3.00


[0.4675324675324675] [0.5324675324675324]


2024-01-04 07:14:13,562 INFO tt=38 mean= 0.94 eps= 0.47 eval_mean= 7.49 train_best_score= 9.00 eval_best_score= 1.00


[0.4875] [0.5125]


2024-01-04 07:16:03,242 INFO tt=39 mean= 0.99 eps= 0.47 eval_mean= 8.15 train_best_score= 9.00 eval_best_score= 2.00


[0.47435897435897434] [0.5256410256410257]


2024-01-04 07:17:53,026 INFO tt=40 mean= 0.96 eps= 0.46 eval_mean= 8.35 train_best_score= 9.00 eval_best_score= 2.00


[0.47435897435897434] [0.5256410256410257]


2024-01-04 07:19:39,443 INFO tt=41 mean= 1.03 eps= 0.45 eval_mean= 4.87 train_best_score= 8.00 eval_best_score= 1.00


[0.2857142857142857] [0.7142857142857143]


2024-01-04 07:21:26,665 INFO tt=42 mean= 1.04 eps= 0.45 eval_mean=10.01 train_best_score= 8.00 eval_best_score= 2.00


[0.47435897435897434] [0.5256410256410257]


2024-01-04 07:23:14,621 INFO tt=43 mean= 1.04 eps= 0.44 eval_mean=10.62 train_best_score= 9.00 eval_best_score= 1.00


[0.4810126582278481] [0.5189873417721519]


2024-01-04 07:25:02,377 INFO tt=44 mean= 1.02 eps= 0.43 eval_mean=10.47 train_best_score= 8.00 eval_best_score= 1.00


[0.4810126582278481] [0.5189873417721519]


2024-01-04 07:26:51,081 INFO tt=45 mean= 1.01 eps= 0.43 eval_mean= 8.29 train_best_score= 9.00 eval_best_score= 3.00


[0.4675324675324675] [0.5324675324675324]


2024-01-04 07:28:39,520 INFO tt=46 mean= 1.02 eps= 0.42 eval_mean= 8.13 train_best_score=10.00 eval_best_score= 2.00


[0.47435897435897434] [0.5256410256410257]


2024-01-04 07:30:28,505 INFO tt=47 mean= 1.05 eps= 0.41 eval_mean= 6.78 train_best_score= 8.00 eval_best_score= 2.00


[0.4810126582278481] [0.5189873417721519]


2024-01-04 07:32:17,747 INFO tt=48 mean= 1.05 eps= 0.41 eval_mean= 6.99 train_best_score= 9.00 eval_best_score= 1.00


[0.4810126582278481] [0.5189873417721519]


2024-01-04 07:34:07,032 INFO tt=49 mean= 1.05 eps= 0.40 eval_mean= 7.02 train_best_score= 8.00 eval_best_score= 1.00


[0.4810126582278481] [0.5189873417721519]


2024-01-04 07:35:57,786 INFO tt=50 mean= 1.06 eps= 0.39 eval_mean= 5.01 train_best_score=10.00 eval_best_score= 3.00


[0.4675324675324675] [0.5324675324675324]


2024-01-04 07:37:47,912 INFO tt=51 mean= 1.08 eps= 0.39 eval_mean= 5.17 train_best_score=11.00 eval_best_score= 1.00


[0.4810126582278481] [0.5189873417721519]


2024-01-04 07:39:37,611 INFO tt=52 mean= 1.15 eps= 0.38 eval_mean= 4.72 train_best_score= 8.00 eval_best_score= 2.00


[0.47435897435897434] [0.5256410256410257]


2024-01-04 07:41:31,477 INFO tt=53 mean= 1.13 eps= 0.38 eval_mean= 2.58 train_best_score=12.00 eval_best_score= 2.00


[0.47435897435897434] [0.5256410256410257]


2024-01-04 07:43:30,850 INFO tt=54 mean= 1.15 eps= 0.37 eval_mean= 7.09 train_best_score= 9.00 eval_best_score= 5.00


[0.4605263157894737] [0.5394736842105263]


2024-01-04 07:45:28,078 INFO tt=55 mean= 1.22 eps= 0.37 eval_mean= 5.94 train_best_score= 8.00 eval_best_score= 2.00


[0.47435897435897434] [0.5256410256410257]


2024-01-04 07:47:28,307 INFO tt=56 mean= 1.16 eps= 0.36 eval_mean= 4.17 train_best_score= 9.00 eval_best_score= 2.00


[0.47435897435897434] [0.5256410256410257]


2024-01-04 07:49:22,293 INFO tt=57 mean= 1.14 eps= 0.36 eval_mean=-0.69 train_best_score= 9.00 eval_best_score= 4.00


[0.4605263157894737] [0.5394736842105263]


2024-01-04 07:51:13,867 INFO tt=58 mean= 1.20 eps= 0.35 eval_mean= 3.89 train_best_score= 8.00 eval_best_score= 2.00


[0.47435897435897434] [0.5256410256410257]


2024-01-04 07:53:06,633 INFO tt=59 mean= 1.23 eps= 0.35 eval_mean= 5.63 train_best_score= 9.00 eval_best_score= 2.00


[0.47435897435897434] [0.5256410256410257]


2024-01-04 07:54:59,463 INFO tt=60 mean= 1.21 eps= 0.34 eval_mean= 2.59 train_best_score= 8.00 eval_best_score= 2.00


[0.47435897435897434] [0.5256410256410257]


2024-01-04 07:56:52,712 INFO tt=61 mean= 1.24 eps= 0.34 eval_mean= 5.65 train_best_score= 8.00 eval_best_score= 3.00


[0.47435897435897434] [0.5256410256410257]


2024-01-04 07:58:46,302 INFO tt=62 mean= 1.25 eps= 0.33 eval_mean= 7.40 train_best_score=10.00 eval_best_score= 3.00


[0.4675324675324675] [0.5324675324675324]


2024-01-04 08:00:38,872 INFO tt=63 mean= 1.28 eps= 0.33 eval_mean= 5.09 train_best_score= 8.00 eval_best_score= 2.00


[0.47435897435897434] [0.5256410256410257]


2024-01-04 08:02:33,147 INFO tt=64 mean= 1.26 eps= 0.32 eval_mean= 4.67 train_best_score= 9.00 eval_best_score= 1.00


[0.4875] [0.5125]


2024-01-04 08:04:26,821 INFO tt=65 mean= 1.33 eps= 0.32 eval_mean= 5.71 train_best_score= 9.00 eval_best_score= 1.00


[0.4810126582278481] [0.5189873417721519]


2024-01-04 08:06:21,196 INFO tt=66 mean= 1.30 eps= 0.31 eval_mean= 5.37 train_best_score= 9.00 eval_best_score= 1.00


[0.4810126582278481] [0.5189873417721519]


2024-01-04 08:08:16,247 INFO tt=67 mean= 1.33 eps= 0.31 eval_mean= 7.14 train_best_score=10.00 eval_best_score= 3.00


[0.4675324675324675] [0.5324675324675324]


2024-01-04 08:10:10,163 INFO tt=68 mean= 1.32 eps= 0.31 eval_mean= 5.40 train_best_score=10.00 eval_best_score= 2.00


[0.47435897435897434] [0.5256410256410257]


2024-01-04 08:12:05,077 INFO tt=69 mean= 1.32 eps= 0.30 eval_mean= 6.07 train_best_score=10.00 eval_best_score= 2.00


[0.47435897435897434] [0.5256410256410257]


2024-01-04 08:14:00,335 INFO tt=70 mean= 1.32 eps= 0.30 eval_mean= 4.94 train_best_score= 9.00 eval_best_score= 1.00


[0.4722222222222222] [0.5277777777777778]


2024-01-04 08:15:56,851 INFO tt=71 mean= 1.46 eps= 0.29 eval_mean= 3.80 train_best_score=11.00 eval_best_score= 3.00


[0.46835443037974683] [0.5316455696202531]


In [None]:
# Overlay the training return, greedy evaluation return and epsilon schedule.
for _series, _label in (
    (train_r, 'training'),
    (eval_r, 'evaluation (greedy)'),
    (eps, 'epsilon'),
):
  plt.plot(_series, label=_label)
plt.legend()
plt.xlabel(f'learning step (x{interval})')
plt.ylabel('episode return')

In [None]:
# Per-iteration action ratios measured during greedy evaluation.
#plt.plot(hanabi_scores, label='game score')
plt.plot(cooperation_scores, label='cooperation rate')
plt.plot(play_discard_scores, label='p/d rate')
plt.legend()
plt.xlabel(f'learning step (x{interval})')
# The curves are action ratios, not returns; the label used to read 'episode return'.
plt.ylabel('fraction of actions')

In [None]:
# Persist the fair model's weights (state dict only).
filename = 'fairmodeel.pth'
torch.save(obj=model.state_dict(), f=filename)

### cheating agent

In [None]:
class CheatingAgent:
  """Epsilon-greedy Q-learning agent that sees every player's vector observation."""

  def __init__(self, agent_id: int, model):
    self.model = model
    self.agent_id = agent_id

  def make_observation(self, env):
    """Concatenate all players' vector observations, starting from the player to move."""
    all_obs = [env.vector_obs((env.to_play + i) % env.N) for i in range(env.N)]
    return np.array(sum(all_obs, []))

  def act(self, env, epsilon=0) -> int:
    """Choose an action for the current turn.

    Returns None when it is not this agent's turn. Otherwise, with
    probability `epsilon` a uniformly random legal move is returned;
    else the legal move with the highest Q-value predicted by the model.
    """
    if env.to_play != self.agent_id:
      return None
    legal_moves = env.legal_moves()
    if random.random() < epsilon:
      return random.choice(legal_moves)
    with torch.no_grad():
      obs = self.make_observation(env)
      batch = obs[np.newaxis, :]       # make a single-element batch
      qs = self.model(torch.from_numpy(batch)).detach().to('cpu')
      qs = qs[0]  # retrieve the first element in the batch
      # Mask illegal moves with -inf rather than shifting by the minimum and
      # zeroing: with the old scheme a legal move holding the minimum Q-value
      # tied with every illegal move at 0, so argmax could pick an illegal one.
      legal = torch.as_tensor(legal_moves, dtype=torch.int64)
      masked = torch.full_like(qs, float('-inf'))
      masked[legal] = qs[legal]
    return masked.argmax().item()

In [None]:
# Cheating-agent experiment: the network input is the concatenation of both
# players' observations, hence twice the individual observation size.
env = HanabiEnv()
indivisual_obs_dim = len(env.vector_obs())
action_dim = 20  # ?
cheat_model = FCModel(indivisual_obs_dim*2, action_dim, device).to(device)
cheat_target_model = FCModel(indivisual_obs_dim*2, action_dim, device).to(device)
# optimizer = torch.optim.SGD(model.parameters(), lr=0.001, momentum=0.9)
optimizer = torch.optim.AdamW(cheat_model.parameters())

buffer = ReplayBuffer()
interval = 2000

# train_main returns six per-iteration lists; unpacking only three raised
# "ValueError: too many values to unpack".
(ctrain_r, ceval_r, ceps,
 chanabi_scores, ccooperation_scores, cplay_discard_scores) = train_main(
  env, cheat_model, cheat_target_model, optimizer,
  buffer=buffer, interval=interval, repeat=200, eps_decay=50,
  agent_class=CheatingAgent
)

In [None]:
# Overlay the cheating agent's training return, greedy evaluation return and epsilon.
for _series, _label in (
    (ctrain_r, 'training'),
    (ceval_r, 'evaluation (greedy)'),
    (ceps, 'epsilon'),
):
  plt.plot(_series, label=_label)
plt.legend()
plt.xlabel(f'learning step (x{interval})')
plt.ylabel('episode return')

In [None]:
# Persist the cheating model's weights (state dict only).
cfilename = 'cheatingmodeel.pth'
torch.save(obj=cheat_model.state_dict(), f=cfilename)

In [None]:
# Play 10 games with the cheating model; `record_store` is handed to
# play_by_model as its `record_out` argument.
record_store = []
play_by_model(
    env,
    cheat_model,
    games=10,
    agent_class=CheatingAgent,
    record_out=record_store,
)

### 3-player games

#### fair agents

In [None]:
# Fair-agent experiment with three players.
env3 = HanabiEnv(num_players=3)
indivisual_obs_dim3 = len(env3.vector_obs())
# Was 20 (copied from the 2-player cell); the 3-player cheating cell uses 30
# for the same environment. NOTE(review): presumably 10 play/discard moves
# plus 10 hint moves per other player -- confirm against the env's move count.
action_dim = 30
model3 = FCModel(indivisual_obs_dim3, action_dim, device).to(device)
target_model3 = FCModel(indivisual_obs_dim3, action_dim, device).to(device)
optimizer3 = torch.optim.AdamW(model3.parameters())

interval = 2000
buffer = ReplayBuffer()
# train_main returns six per-iteration lists; unpacking only three raised
# "ValueError: too many values to unpack".
(train_r3, eval_r3, eps3,
 hanabi_scores3, cooperation_scores3, play_discard_scores3) = train_main(
    env3, model3, target_model3, optimizer3,
    FairAgent,
    buffer=buffer,
    interval=interval, repeat=200, eps_decay=50)

In [None]:
# Overlay the 3-player fair agent's training return, greedy evaluation return and epsilon.
for _series, _label in (
    (train_r3, 'training'),
    (eval_r3, 'evaluation (greedy)'),
    (eps3, 'epsilon'),
):
  plt.plot(_series, label=_label)
plt.legend()
plt.xlabel(f'learning step (x{interval})')
plt.ylabel('episode return')

#### cheating agents

In [None]:
# Cheating-agent experiment with three players: the network input is the
# concatenation of all three players' observations.
env3 = HanabiEnv(num_players=3)
indivisual_obs_dim3 = len(env3.vector_obs())
action_dim = 30  # ?
cheat_model3 = FCModel(indivisual_obs_dim3*3, action_dim, device).to(device)
cheat_target_model3 = FCModel(indivisual_obs_dim3*3, action_dim, device).to(device)
optimizer3c = torch.optim.AdamW(cheat_model3.parameters())

buffer = ReplayBuffer()
interval = 2000

# train_main returns six per-iteration lists; unpacking only three raised
# "ValueError: too many values to unpack".
(ctrain_r3, ceval_r3, c3eps,
 c3hanabi_scores, c3cooperation_scores, c3play_discard_scores) = train_main(
  env3, cheat_model3, cheat_target_model3, optimizer3c,
  buffer=buffer, interval=interval, repeat=200, eps_decay=50,
  agent_class=CheatingAgent
)

In [None]:
# Overlay the 3-player cheating agent's training return, greedy evaluation return and epsilon.
for _series, _label in (
    (ctrain_r3, 'training'),
    (ceval_r3, 'evaluation (greedy)'),
    (c3eps, 'epsilon'),
):
  plt.plot(_series, label=_label)
plt.legend()
plt.xlabel(f'learning step (x{interval})')
plt.ylabel('episode return')