In [450]:
import pandas as pd
import numpy as np
import random
import copy
import math

In [451]:
actions_map = {
    0: 'take 1 coin',
    1: 'coup',
    2: 'take 2 coins',
    3: 'take 3 coins',
    4: 'steal 2 coins',
    5: 'assassinate',
    6: 'exchange',
    7: 'challenge',
    8: 'block foreign aid',
    9: 'block stealing',
    10: 'block assassination'
}

In [452]:
class Action:
    def __init__(self, name, challengeable, response_card, response_action,
                 p1_net_coins, p2_net_coins, p1_net_cards, p2_net_cards, vector):
        self.name = name
        self.challengeable = challengeable
        self.response_card = response_card
        self.response_action = response_action
        self.p1_net_coins = p1_net_coins
        self.p2_net_coins = p2_net_coins
        self.p1_net_cards = p1_net_cards
        self.p2_net_cards = p2_net_cards
#         self.base_utility = base_utility
#         self.p_bluff = p_bluff
        self.vector = vector

    def update_responses(self, response_card, response_action):
        self.response_card = response_card
        self.response_action = response_action


In [453]:
take_1 = Action(actions_map[0], False, None, None, 1, 0, 0, 0, [0])

coup = Action(actions_map[1], False, None, None, -7, 0, 0, -1, [1])

take_2 = Action(actions_map[2], True, 'Duke', actions_map[8], 2, 0, 0, 0, [2])

take_3 = Action(actions_map[3], True, None, actions_map[7], 3, 0, 0, 0, [3])

steal_2 = Action(actions_map[4], True, ['Captain', 'Ambassador'], actions_map[9], 2, -2, 0, 0,[4])

assassinate = Action(actions_map[5], True, 'Contessa', actions_map[10], -3, 0, 0, -1, [5])

exchange = Action(actions_map[6], True, None, actions_map[7], 0, 0, 0, 0,[6])

# challenge = Action(actions_map[7], False, None, None, 0, 0, -1, -1, 1, 0)

block_take_2 = Action(actions_map[8], True, None, actions_map[7], 0, -2, 0, 0, [7])

block_steal = Action(actions_map[9], True, None, actions_map[7], 2, -2, 0, 0, [8])

block_assassination = Action(actions_map[10], True, None, actions_map[7], 0, 0, 1, 0, [9])

# challenge =

actions = {
    0: take_1,
    1: coup,
    2: take_2,
    3: take_3,
    4: steal_2,
    5: assassinate,
    6: exchange,
    7: block_take_2,
    8: block_steal,
    9: block_assassination
}

take_2.response_action = actions[7]
steal_2.response_action = actions[8]
assassinate.response_action = actions[9]

influences = {
    'Duke': [take_3, block_take_2, take_1, coup],
    'Captain': [steal_2, block_steal, take_2, take_1, coup],
    'Assassin': [assassinate, take_2, take_1, coup],
    'Contessa': [take_2, block_assassination, take_1, coup],
    'Ambassador': [exchange, block_steal, take_2, take_1, coup]
    }

inf_map = {
    'Dead': 0,
    'Duke': 1,
    'Captain': 2,
    'Assassin': 3,
    'Contessa': 4,
    'Ambassador': 5,
    'Hidden': 6
}


influences_reverse = {
    take_1: ['Duke', 'Captain', 'Assassin', 'Contessa', 'Ambassador'],
    coup: ['Duke', 'Captain', 'Assassin', 'Contessa', 'Ambassador'],
    take_2: ['Captain', 'Assassin', 'Contessa', 'Ambassador'],
    take_3: ['Duke'],
    steal_2: ['Captain'],
    assassinate: ['Assassin'],
    exchange: ['Ambassador'],
    block_take_2: ['Duke'],
    block_steal: ['Captain','Ambassador'],
    block_assassination: ['Contessa']
}

In [454]:
import torch
import torch.nn as nn
import torch.optim as optim

class QNetwork(nn.Module):
    def __init__(self, state_size, action_size, hidden_size=128):
        super(QNetwork, self).__init__()
        self.fc1 = nn.Linear(state_size, hidden_size)
        self.relu = nn.ReLU()
        # self.dropout = nn.Dropout(0.2)
        self.rnn = nn.RNN(hidden_size, hidden_size, batch_first=True)
        self.fc3 = nn.Linear(hidden_size, 64)
        self.fc4 = nn.Linear(64, action_size)

    def forward(self, x):
        """
        x: shape (batch_size, seq_len, state_size)
        returns: shape (batch_size, action_size)
        """
        # 1) Pass each frame in the sequence through a linear + ReLU
        #    => shape (batch_size, seq_len, hidden_size)
        x = self.fc1(x)
        x = self.relu(x)
        # x = self.dropout(x)

        # 2) Pass entire sequence through RNN
        #    => out: shape (batch_size, seq_len, hidden_size)
        out, _ = self.rnn(x)

        # 3) We want the LAST hidden vector from the sequence (i.e. out[:, -1, :])
        last_out = out[:, -1, :]  # shape (batch_size, hidden_size)

        # 4) Pass to fully connected heads for Q-values
        x = self.fc3(last_out)
        x = self.relu(x)
        x = self.fc4(x)  # shape (batch_size, action_size)

        return x


embedding_cards = nn.Embedding(7, 1)
cards_tens = torch.tensor([0,1,2,3,4,5,6])
cards_emb = embedding_cards(cards_tens)

# actions = {
#     0: take_1,
#     1: coup,
#     2: take_2,
#     3: take_3,
#     4: steal_2,
#     5: assassinate,
#     6: exchange,
#     7: block_take_2,
#     8: block_steal,
#     9: block_assassination
# }

embedding_actions = nn.Embedding(8, 3)
actions_tens = torch.tensor([0,1,2,3,4,5,6,7])
actions_emb = embedding_actions(actions_tens)

embedding_coins = nn.Embedding(13, 2)
coins_tens = torch.tensor([0,1,2,3,4,5,6,7,8,9,10,11,12])
coins_emb = embedding_coins(coins_tens)

embedding_players = nn.Embedding(5, 3)
players_tens = torch.tensor([0,1,2,3,4])
players_emb = embedding_players(players_tens)

state_size_a = 12
state_size_b = 13
action_size = 16
block_size = 2
challenge_size = 2
card_size = 2

criterion = nn.HuberLoss(delta=1.0)
learning_rate = 0.001
gamma = 0.99
epsilon = 1.0
epsilon_decay = 0.005
min_epsilon = 0.01
# batch_size = 64
# replay_buffer_size = 10000

In [455]:
class StateSummarizer(nn.Module):
    def __init__(self, input_size, hidden_size, embedding_size):
        super(StateSummarizer, self).__init__()
        self.hidden_size = hidden_size
        self.lstm = nn.LSTM(input_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, embedding_size)

    def forward(self, next_states):
        # Initialize hidden state and cell state
        h0 = torch.zeros(1, next_states.shape[0], self.hidden_size).to(next_states.device)
        c0 = torch.zeros(1, next_states.shape[0], self.hidden_size).to(next_states.device)

        # Pass the sequence of next states through the LSTM
        out, _ = self.lstm(next_states, (h0, c0))

        # print(f'out.shape: {out.shape}')

        # Take the last hidden state as the summary
        summary = torch.mean(out, dim=1)  # Average across the sequence dimension (dim=1)

        # Project the summary to the desired embedding size
        embedding = self.fc(summary)
        return embedding

# summarizer = StateSummarizer(12, 64, 12)

In [456]:
class Bot:
    def __init__(self, cards, num_coins, hostility, name, action_q, block_q, challenge_q, card_q,
                 optimizer_action, optimizer_block, optimizer_challenge, optimizer_card,
                 summarizer):
        self.cards = cards
        self.num_coins = num_coins
        self.hostility = hostility
        self.name = name
        self.action_q = action_q
        self.block_q = block_q
        self.challenge_q = challenge_q
        self.card_q = card_q
        self.optimizer_action = optimizer_action
        self.optimizer_block = optimizer_block
        self.optimizer_challenge = optimizer_challenge
        self.optimizer_card = optimizer_card
        self.summarizer = summarizer

    def num_coins_adj(self, n):
        self.num_coins += n

    def cards_adj(self, card):
        self.cards.remove(card)


In [457]:
bag = ['Duke', 'Captain', 'Assassin', 'Contessa', 'Ambassador'] * 3
random.shuffle(bag)

bots = []
bluff_degree = 0

for i in range(4):
    cards = random.sample(bag, 2)
    for card in cards:
        bag.remove(card)
#     kb = []

    action_q = QNetwork(state_size_a, action_size)
    optimizer_action = optim.Adam(action_q.parameters(), lr=learning_rate)

    block_q = QNetwork(state_size_b, block_size)
    optimizer_block = optim.Adam(block_q.parameters(), lr=learning_rate)

    challenge_q = QNetwork(state_size_b, challenge_size)
    optimizer_challenge = optim.Adam(challenge_q.parameters(), lr=learning_rate)

    card_q = QNetwork(state_size_b, card_size)
    optimizer_card = optim.Adam(card_q.parameters(), lr=learning_rate)

    summarizer = StateSummarizer(12, 64, 12)

    bots.append(Bot(cards, 2, None, i, action_q, block_q, challenge_q, card_q,
                    optimizer_action, optimizer_block, optimizer_challenge, optimizer_card, summarizer))




for bot in bots:
    print(bot.cards)
print(bots[0].name)

['Contessa', 'Duke']
['Ambassador', 'Duke']
['Ambassador', 'Captain']
['Contessa', 'Assassin']
0


In [458]:
replay_buffer = []

In [459]:
# actions = {
#     0: take_1,
#     1: coup,
#     2: take_2,
#     3: take_3,
#     4: steal_2,
#     5: assassinate,
#     6: exchange,
#     7: block_take_2,
#     8: block_steal,
#     9: block_assassination
# }

def get_legal_actions(bot, bots):
  """Returns a list of legal action indices for the given bot."""
  legal_actions = []

  if bot.num_coins >= 10:
    return [1]

  # Always legal actions
  legal_actions.extend([0, 2, 3, 6])  # Income, Foreign Aid, Tax, Exchange are always legal

  # Conditional actions
  if bot.num_coins >= 7:
    legal_actions.append(1)  # Coup
  if bot.num_coins >= 3:
    legal_actions.append(5)  # Assassinate (if enough coins)

  # Actions that target other players
  for other_bot in bots:
    if other_bot != bot and other_bot.num_coins > 1 : #Can't steal from players with no coins
      legal_actions.append(4) #Steal
      break  # Only need to add steal once if there's a valid target

  return legal_actions

In [460]:
def action_selection(i, bots, actions_vector, actions, epsilon, state):

    legal_actions = get_legal_actions(bots[i], bots)  # Get list of legal actions

    if int(bots[i].name) == 0:
      if random.random() >= epsilon:
          q_values = bots[i].action_q(state)
          max_q_value = float('-inf')
          best_action = None

          # Iterate through Q-values and find the maximum for legal actions
          for action_idx, q_value in enumerate(q_values[0]):  # q_values[0] is assumed to be a 1D tensor
              if action_idx in legal_actions and q_value.item() > max_q_value:
                  max_q_value = q_value.item()
                  best_action = action_idx

          # If no best action was selected from Q-values, choose randomly from legal actions
          if best_action is None:
              best_action = random.choice(legal_actions)

          # Now determine the target bot: choose the bot (other than self) with the highest number of cards,
          # and if there's a tie, choose the one with the most coins.
          target = None
          max_cards = -1
          max_coins = -1
          for bot_idx, other_bot in enumerate(bots):
              if int(other_bot.name) != 0:  # Skip self (bot 0)
                  num_coins = other_bot.num_coins  # Assuming 'cards' is a list of the bot's cards
                  if num_coins > max_coins:
                      max_coins = num_coins  # Assuming num_coins is a numeric attribute
                      target = bots[bot_idx]

          # Return the chosen action and the target bot index
          return [best_action, target]
      else:
        # Random action selection:
        action = None
        if bots[i].num_coins >= 10:
            action = 1  # Coup
        else:
            # Choose a random action from the legal actions
            action = random.choice(legal_actions)

        target = None
        if (actions[action].p2_net_coins != 0 or actions[action].p2_net_cards != 0) and actions[action].response_action != 'challenge':
            targets = bots[:i] + bots[i+1:]
            valid_targets = [bot for bot in targets if bot.num_coins >= -actions[action].p2_net_coins]
            if valid_targets:
                target = random.choice(valid_targets)

        return [action, target]

# actions = {
#     0: take_1,
#     1: coup,
#     2: take_2,
#     3: take_3,
#     4: steal_2,
#     5: assassinate,
#     6: exchange,
#     7: block_take_2,
#     8: block_steal,
#     9: block_assassination
# }

    else:

      target = None
      action = None
      targets = bots[:i] + bots[i+1:]

      # Play truthfully:
      # bot = bots[i]
      if bots[i].num_coins >= 10:  # Coup if possible
          action = 1  # Coup action index
          target = random.choice(targets)  # Choose a random target
      else:
          # Prioritize actions based on cards and coins:
          if 'Duke' in bots[i].cards:  # Take 3 coins if Duke
              action = 3  # Take 3 coins action index
          elif 'Captain' in bots[i].cards and 4 in legal_actions:
              action = 4  # Steal action index
              valid_targets = [bot for bot in targets if bot.num_coins >= 2]
              if valid_targets:
                  target = random.choice(valid_targets)
              # target = bots.index(random.choice([other_bot for other_bot in bots if other_bot != bots[i] and other_bot.num_coins > 0]))
          elif 'Assassin' in bots[i].cards and 5 in legal_actions:
              action = 5  # Assassinate action index
              target = random.choice(targets)  # Choose a random target
          elif 'Ambassador' in bots[i].cards:  # Exchange if Ambassador
              action = 6  # Exchange action index
          else:  # Otherwise, take income
              action = 0  # Take 1 coin action index
          if bots[i].num_coins >= 7:
            action = 1 # Coup if previous action was not legal and can coup
            target = random.choice(targets)
          elif 'Duke' not in bots[i].cards:
            action = 2 # foreign aid if previous action was not legal and can foreign aid
          # else:
          #   action = 0 # income if previous action was not legal and can only income

      # If action requires a target and none is selected yet:
      # if (actions[action].p2_net_coins != 0 or actions[action].p2_net_cards != 0) and actions[action].response_action != 'challenge' and target is None:
      #     target = bots.index(random.choice(bots[:i] + bots[i+1:]))

      return [action, target]




def reaction_selection(i, bots, target, response_action, epsilon, state):

  if target is None:

    target = random.choice(bots[:i] + bots[i+1:])

  if random.random() >= epsilon and int(target.name) == 0:

    # state_tensor = torch.FloatTensor(state).unsqueeze(0)
    q_values = target.block_q(state)
    return torch.argmax(q_values).item(), target

  elif int(target.name) != 0:

    # Play truthfully - block only if has the card
    if response_action.name == actions_map[8]:  # Block foreign aid
        if 'Duke' in target.cards:
            return 1, target  # Block
        else:
            return 0, target  # Pass
    elif response_action.name == actions_map[9]:  # Block stealing
        if 'Captain' in target.cards or 'Ambassador' in target.cards:
            return 1, target  # Block
        else:
            return 0, target  # Pass
    elif response_action.name == actions_map[10]:  # Block assassination
        if 'Contessa' in target.cards:
            return 1, target  # Block
        else:
            return 0, target  # Pass
    else:  # Other actions (cannot be blocked truthfully)
        return 0, target  # Pass

  else:

    return random.choice([0, 1]), target


def challenge_selection(epsilon, state, bot):

  if random.random() >= epsilon and int(bot.name) == 0:

    return 0

    # state_tensor = torch.FloatTensor(state).unsqueeze(0)
    # q_values = bot.challenge_q(state_tensor)
    # return torch.argmax(q_values).item()

  else:

    return 0


def card_selection(bot, cards, epsilon, state, action):

  if random.random() >= epsilon and len(cards) > 1 and int(bot.name) == 0:

    # state_tensor = torch.FloatTensor(state).unsqueeze(0)
    q_values = bot.card_q(state)
    card_index = torch.argmax(q_values).item()  # Get index (0 or 1)
    return card_index  # Return the index directly

  else:

    c = random.choice(cards)

    c = cards.index(c)

    return c

In [461]:
def perform_action(bot, target, action, discard_pile, state, card_chosen, epsilon, bag):

    if target is not None:

        target.num_coins += action.p2_net_coins

        if action.p2_net_cards < 0 and len(target.cards) > 0:

            card = card_selection(target, target.cards, epsilon, state, action)
            # print(card)

            x = target.cards[card]

            discard_pile.append(inf_map[x])

            card_chosen = card

            target.cards.remove(x)

    bot.num_coins += action.p1_net_coins

    if action == exchange:

        card = card_selection(bot, bot.cards, epsilon, state, action)

        x = bot.cards[card]

        c = random.sample(bag, 2)

        # arr = [x] + c

        next_choice = card_selection(bot, c, epsilon, state, action)

        next_choice = c[next_choice]

        arr = [x] + [next_choice]

        final_choice = card_selection(bot, arr, epsilon, state, action)

        card_chosen = final_choice

        final_choice = arr[final_choice]

        arr.remove(final_choice)

        for i in arr:
            bag.insert(-1, i)

        random.shuffle(bag)
        bot.cards.insert(-1, final_choice)
        bot.cards.remove(x)

    return bot, target, discard_pile, card_chosen, bag




In [462]:
def reset_game(bots_copy):
  bag = ['Duke', 'Captain', 'Assassin', 'Contessa', 'Ambassador'] * 3
  random.shuffle(bag)
  new_bots = []  # Create a new list
  for i, bot in enumerate(bots_copy):
      cards = random.sample(bag, 2)
      for card in cards:
          bag.remove(card)
      new_bots.append(Bot(cards, 2, None, f'{i}', bot.action_q, bot.block_q, bot.challenge_q, bot.card_q,
                          bot.optimizer_action, bot.optimizer_block, bot.optimizer_challenge, bot.optimizer_card))
  return new_bots  # Return the new list

In [463]:
# Base Game Loop

def game_loop_random(bots, actions, influences_reverse, epsilon):
    bag = ['Duke', 'Captain', 'Assassin', 'Contessa', 'Ambassador'] * 3
    random.shuffle(bag)

    bots_copy = copy.deepcopy(bots)

    states = torch.empty((0, 12), dtype=torch.float32)
    states_block = torch.empty((0, 13), dtype=torch.float32)
    states_challenge = torch.empty((0, 13), dtype=torch.float32)
    states_card = torch.empty((0, 13), dtype=torch.float32)

    discard_piles = []
    discard_piles.append([])
    acting_players = []
    reacting_players = []
    current_players = [[1,1,1,1]]
    actions_game = [7]
    reactions_game = [0]
    challenges_game = [0]
    challenges_direction = []
    cards_game = []
    coins_game = []

    # state_tuples = []

    rewards = [0]

    action_history = [7]
    reaction_history = [0]
    challenge_history = [0]
    card_history = [0]

    cards_turn = [[inf_map[c] for c in bots[0].cards],
                  [6,6],
                  [6,6],
                  [6,6]]
    # for i in range(3):

    #     cards_ind = [inf_map[c] for c in bot.cards]
    #     cards_turn.append(cards_ind)

    cards_game.append(cards_turn)

    coins_turn = [2,2,2,2]
    coins_game.append(coins_turn)
    done = []
    cards_chosen = [0]

    t = 0


#     print(cards_game[-1])
#     print(coins_game[-1])

    while len(bots) > 1:

        if t > 100 and bots[0].name == 0:
          done.append(1)
          rewards.append(1.0)
          break


#         for bot in bots:
#             print(f'{bot.name}')
        i = 0
        while i < len(bots):

            # print(i)

            # for bot in bots:
            #   print(bot.cards)

            card_chosen = 0
#             print(i)

            if len(bots) == 1:
                # done.append(1)
                break


            done.append(0)
            rewards.append(0)

            challenge_dir = 2

            discard_pile = copy.deepcopy(discard_piles[-1])

            curr = None
            try:
                curr = bots[i]
            except:
                i = 0
                curr = bots[i]

            acting_players.append(int(curr.name))

            # cards_state = cards_game[-1]
            # coins_state = coins_game[-1]
            # current_players_state = current_players[-1]

            # cards_game: N x 4 x 2 -> 8 tensors of size N
            cards_game_tensors = [
                torch.tensor([cards_game[i][j][k] for i in range(len(cards_game))] )
                for j in range(4) for k in range(2)
            ]

            # current_players: N x 4 -> 1 tensor of size N
            current_players_tensors = torch.tensor([sum(row) for row in current_players])

            coins_game_tensors = [
                      torch.tensor([coins_game[i][j] for i in range(len(coins_game))] )
                      for j in range(4)
            ]

            # discard_piles: N x y (max y = 7) -> 7 tensors of size N
            max_discard_len = 7  # Maximum possible length of discard_piles
            discard_piles_tensors = [
                torch.tensor([discard_piles[i][j] if j < len(discard_piles[i]) else 0
                              for i in range(len(discard_piles))] )
                for j in range(max_discard_len)
            ]

            # Concatenate tensors for states_action
            # states_action = torch.cat(([
            #     torch.tensor(acting_players[1:]).unsqueeze(1), # unsqueeze to add a dimension
            #     torch.tensor(reacting_players).unsqueeze(1),
            #     torch.tensor(reactions_game).unsqueeze(1),
            #     torch.tensor(challenges_game).unsqueeze(1),
            #     torch.tensor(current_players_tensors).unsqueeze(1),
            #     *[t.unsqueeze(1) for t in cards_game_tensors],
            #     *[t.unsqueeze(1) for t in discard_piles_tensors],
            #     *[t.unsqueeze(1) for t in coins_game_tensors],
            #     torch.tensor(done).unsqueeze(1)
            # ]), 1)  # changed dim to 1

            state = None

            if int(bots[0].name) == 0:

              # 1. Cards in play (embedded):
              cards_in_play_embedded = []
              for card_name in influences.keys():
                  num_in_discard = discard_piles[-1].count(inf_map[card_name])
                  num_in_play = 3 - num_in_discard  # Assuming 3 of each card initially
                  cards_in_play_embedded.append(torch.tensor(num_in_play))  # Keep as tensor

              # Stack the embeddings to create a 2D tensor
              cards_in_play_embedded = torch.stack(cards_in_play_embedded).squeeze()

              # 4. Bot 0's normalized coins:
              bot0_coins_normalized = bots[0].num_coins / 12 # Normalize to 0-1 range (assuming max coins is 12)

              # 5. Average cards of other players (normalized and embedded):
              other_bots_cards = [len(bot.cards) for bot in bots if bot != bots[0]]
              avg_other_cards = sum(other_bots_cards) / len(other_bots_cards) if other_bots_cards else 0  # Avoid division by zero
              avg_other_cards_normalized = avg_other_cards / 2  # Normalize to 0-1 range (assuming max cards per bot is 2)
              # avg_other_cards_embedded = embedding_cards(torch.tensor(int(avg_other_cards_normalized))).tolist()  # Assuming embedding_cards is your embedding layer

              # 6. Bot 0's current cards (embedded):
              bot0_cards_embedded = []
              for card in bots[0].cards:
                  bot0_cards_embedded.append(embedding_cards(torch.tensor(inf_map[card])))  # Keep as tensor

              # If the bot has cards, concatenate the embeddings. Otherwise, create a zero tensor
              if bot0_cards_embedded:
                  bot0_cards_embedded = torch.cat(bot0_cards_embedded)
              else:
                  # Create a zero tensor with the expected shape if bot has no cards
                  bot0_cards_embedded = torch.cat((embedding_cards(torch.tensor(0)), embedding_cards(torch.tensor(0))))

              if len(bot0_cards_embedded) == 1:
                  bot0_cards_embedded = torch.cat((bot0_cards_embedded, embedding_cards(torch.tensor(0))))

              # 7. The last action taken (embedded):
              last_action = actions_game[-1]
              last_action_embedded = embedding_actions(torch.tensor(last_action))  # Keep as a tensor

              state = torch.cat(([cards_in_play_embedded,
                                  torch.tensor([bot0_coins_normalized]),
                                  torch.tensor([avg_other_cards_normalized]),
                                  bot0_cards_embedded,
                                  last_action_embedded])
                                ).type(torch.float32)

              state_block = torch.cat([state, torch.tensor([reactions_game[-1]])], 0)
              state_challenge = torch.cat([state, torch.tensor([challenges_game[-1]])], 0)
              state_card = torch.cat([state, torch.tensor([cards_chosen[-1]])], 0)

              # states_temp = torch.cat((states_temp, state.unsqueeze(0)), 0)
              # states_summarized = bot.summarizer(states_temp)
              states = torch.cat([states, state.unsqueeze(0)], 0)
              states_block = torch.cat([states_block, state_block.unsqueeze(0)], 0)
              states_challenge = torch.cat([states_challenge, state_challenge.unsqueeze(0)], 0)
              states_card = torch.cat([states_card, state_card.unsqueeze(0)], 0)


            action_stack = []

            action_vector = [0,1,2,3,4,5,6]
            for j in action_vector:
                if actions[j].p1_net_coins * (-1) > bots[i].num_coins:
                    action_vector.remove(j)

            # state = None
            # state_tensor = None
            # if bots[0].name == 0:
            #   state = get_state(bots, discard_pile, action_history, reaction_history, challenge_history, card_history, bots[i], network_type="action")
            #   state_tensor = torch.FloatTensor(state).unsqueeze(0)
            #   print(len(state_tensor[0]))

            action_selection_output = action_selection(i, bots, action_vector, actions, epsilon, states.unsqueeze(0))

            action = action_selection_output[0]

            for x in [states, states_block, states_challenge, states_card]:
              x[-1][9:12] = torch.tensor(embedding_actions(torch.tensor(action)))

            try:
              last_state = states[-2]
            except:
              last_state = None

            # state_tuples.append((states[-1], actions[action], rewards[-1], last_state, done[-1]))

            if actions_game[-1] == 7:
              actions_game[-1] = action

            else:
              actions_game.append(action)

            action_e = actions_emb[action]

            action = actions[action]
    #         print(action_selection_output[1])

            # print(f'bot {bots[i].name} is performing action {action.name}')
            # print(f'target is {action_selection_output[1]}')

            target = None
            reacting_player = 4
            challenge = 0
            reaction = 0
            try:
              target = action_selection_output[1]
              reacting_player = int(target.name)
            except:
              target = None
              # reacting_player = 4
                # print("no target")
            # if target is not None:
            #     print(f'target is {target.name}')

            action_stack.append(action)

            if action.response_action is not None and target is None:
              target = random.choice(bots)
              reacting_player = int(target.name)
            reacting_players.append(reacting_player)

            # state_reaction = copy.deepcopy(state_action)
            # state_reaction.append(action)

            # state_challenge = copy.deepcopy(state_action)
            # state_challenge.append(action)

            # state_card = copy.deepcopy(state_action)
            # state_card.append(action)

            if action.response_action is not None and action.response_action != 'challenge':  # is blockable?

                response, target = reaction_selection(i, bots, target, action.response_action, epsilon, states_block.unsqueeze(0))

                # reacting_player = int(target.name)

#                 try:
#                     print(f'bot {target.name} is considering blocking')
#                 except:
#                     print("no target, check reaction selection")

                if response == 1:

                    reaction = 1

                    reactions_game.append(1)

#                     reacting_players.append(int(target.name))

                    action_stack.append(action.response_action)
                    # state_challenge.append(reaction)
                    # state_card.append(reaction)

                    # print(f'bot {target.name} is performing action {action.response_action.name} against bot {bots[i].name}')

                else:

                    reactions_game.append(0)

            else:

                reactions_game.append(0)

#                     print(f'target will not block')

            if action_stack[-1].response_action == 'challenge':  # is challengeable?

                response = challenge_selection(epsilon, states_challenge.unsqueeze(0), target if len(action_stack) == 3 else bots[i])

                if response == 1:

                    challenge = 1

                    challenges_game.append(1)

                    action_stack.append('challenge')

                    if len(action_stack) == 3:
                        challenge_dir = 1

                    else:
                        challenge_dir = 0
                    challenges_direction.append(challenge_dir)

                else:

                    challenges_game.append(0)
                    challenges_direction.append(challenge_dir)

#                     print('no challenge')

            else:

                challenges_game.append(0)
                challenges_direction.append(2)

            # challenges_game.append(0)
            # challenges_direction.append(2)

            while len(action_stack) != 0:

                # state_card.append(challenge)
                # state_card.append(challenge_dir)

                a = action_stack.pop()
                # if a != 'challenge':
                #   print(a.name)

                if int(bots[0].name) != 0:
                  rewards[-1] = -1.0
                  done[-1] = 1

                if a == 'challenge':

                    print('error!')

                    if len(action_stack) > 1:

                        if influences_reverse[action_stack[-1]] in target.cards:

                            # print(f'bot {bots[i].name} has lost the challenge')

                            card = 0
                            if len(bots[i].cards) > 1:

                              card = card_selection(bots[i], bots[i].cards, epsilon, states_card.unsqueeze(0), a)

                            # print(card)
                            x = bots[i].cards[card]

                            discard_pile.append(inf_map[x])

                            card_chosen = card
                            # card = inf_map.get(card)

                            bots[i].cards.remove(x)

                            if len(bots[i].cards) == 0:

                                print (f'bot {bots[i].name} is out!')

                                bots.remove(bots[i])

                                i -= 1

                            target.cards.remove(influences_reverse[action_stack[-1]])
                            bag.insert(influences_reverse[action_stack[-1]])
                            random.shuffle(bag)
                            c = random.sample(bag, 1)
                            bag.remove(c)
                            target.cards.insert(c)

                            action_stack.clear()

                        else:

                            # print(f'bot {target.name} has lost the challenge')

                            card = 0
                            if len(target.cards) > 1:
                              card = card_selection(target, target.cards, epsilon, states_card.unsqueeze(0), a)

                            # print(card)
                            x = target.cards[card]

                            discard_pile.append(inf_map[x])

                            card_chosen = card

                            # card = inf_map.get(card)

                            target.cards.remove(x)

                            if len(target.cards) == 0:

                                bots.remove(target)

                                # print (f'bot {target.name} is out!')

                            action_stack.pop()

                    else:

                        if influences_reverse[action_stack[-1]] in bots[i].cards:

                            # print(f'bot {target.name} has lost the challenge')

                            card = 0
                            if len(target.cards) > 1:
                              card = card_selection(target, target.cards, epsilon, states_card.unsqueeze(0), a)

                            # print(card)
                            x = target.cards[card]

                            discard_pile.append(inf_map[x])

                            card_chosen = card
                            # card = inf_map.get(card)

                            target.cards.remove(x)

                            if len(target.cards) == 0:

                                # print (f'bot {target.name} is out!')

                                bots.remove(target)

                            bots[i].cards.remove(influences_reverse[action_stack[-1]])
                            bag.insert(influences_reverse[action_stack[-1]])
                            random.shuffle(bag)
                            c = random.sample(bag, 1)
                            bag.remove(c)
                            bots[i].cards.insert(c)

                        else:

                            # print(f'bot {bots[i].name} has lost the challenge')

                            card = 0
                            if len(bots[i].cards) > 1:
                              card = card_selection(bots[i], bots[i].cards, epsilon, states_card.unsqueeze(0), a)
                            # print(card)
                            x = bots[i].cards[card]

                            discard_pile.append(inf_map[x])

                            card_chosen = card
                            # card = inf_map.get(card)

                            bots[i].cards.remove(x)

                            if len(bots[i].cards) == 0:

                                # print (f'bot {bots[i].name} is out!')

                                bots.remove(bots[i])

                                i -= 1

                            action_stack.pop()

                else:

                    # print(f'current action: {a.name}')

                    if len(action_stack) == 1:

                        target, curr, discard_pile, card_chosen, bag = perform_action(target, curr, a, discard_pile, states_card.unsqueeze(0), card_chosen, epsilon, bag)
                        if curr in bots:
                            if len(curr.cards) == 0:
                                if curr in bots:
                                    bots.remove(curr)
                                    # print(f'{curr.name} is out!')
                                    i -= 1

                    else:

                        curr, target, discard_pile, card_chosen, bag = perform_action(curr, target, a, discard_pile, states_card.unsqueeze(0), card_chosen, epsilon, bag)
                        if target in bots:
                            if len(target.cards) == 0:
                                if target in bots:
                                    bots.remove(target)
                                    # print(f'{target.name} is out!')




            # print(f'bot {curr.name} has {curr.num_coins} coins.')
            # if target is not None:
            #     print(f'bot {target.name} has {target.num_coins} coins.')

            # print(f'bot {curr.name} has {len(curr.cards)} cards.')
            # if target is not None:
            #     print(f'bot {target.name} has {len(target.cards)} cards.')


            curr_players = [0,0,0,0]
            for bot in bots:
#                 print(int(bot.name))
                curr_players[int(bot.name)] = 1
            # print(curr_players)
            current_players.append(curr_players)

            cards_turn = [[0,0],
                          [0,0],
                          [0,0],
                          [0,0]]
            coins_turn = [0,0,0,0]

            # if len(bots[0].cards) == 0:
            #     cards_turn[0] = [0,0]
            # if len(bots[0].cards) == 1:
            #     cards_turn[0].append(0)

            for bot in bots:

                bot_index = int(bot.name)

                cards_ind = []

                cards_turn[bot_index] = [inf_map[c] for c in bot.cards]

                if len(bot.cards) == 0:
                    cards_turn[bot_index] = [0, 0]
                if len(bot.cards) == 1:
                    cards_turn[bot_index].append(0)

                coins_turn[bot_index] = bot.num_coins

            cards_game.append(cards_turn)
            coins_game.append(coins_turn)

            discard_piles.append(discard_pile)

            # reacting_players.append(reacting_player)
            # reactions_game.append(reaction)
            # challenges_game.append(challenge)
            # challenges_direction.append(challenge_dir)
            cards_chosen.append(card_chosen)

            action_history.append(action)
            reaction_history.append(reaction)
            challenge_history.append(challenge)
            card_history.append(card_chosen)

            # print(actions_game[-1])

            i += 1
            t += 1
#             print(cards_turn)
#             print(coins_turn)
# #             print(curr_players)
#             print(discard_pile)


    # print(f'bot {bots[0].name} wins!')
    acting_players.append(4)
    reacting_players.append(4)
    actions_game.append(7)
    # reactions_game.append(0)
    # challenges_game.append(0)
    challenges_direction.append(2)
    done.append(1)
    # rewards = copy.deepcopy(done)
    if int(bots[0].name) != 0:
      rewards[-1] = -1.0
    else:
      rewards[-1] = 1.0


    # Reset Game

    bag = ['Duke', 'Captain', 'Assassin', 'Contessa', 'Ambassador'] * 3
    random.shuffle(bag)
    # new_bots = []  # Create a new list

    for bot in bots_copy:
        bot.cards = random.sample(bag, 2)
        for card in bot.cards:
            bag.remove(card)
        bot.num_coins = 2

    bots = bots_copy


    return discard_piles, acting_players, reacting_players, current_players, actions_game, reactions_game, challenges_game, cards_game, coins_game, challenges_direction, done, rewards, cards_chosen, bots_copy


In [464]:
### Test ###

win_rate = 0.0

# for i in range(50):

discard_piles, acting_players, reacting_players, current_players, actions_game, reactions_game, challenges_game, cards_game, coins_game, challenges_direction, done, rewards, cards_chosen, bots_copy = game_loop_random(bots, actions, influences_reverse, 0.0)

bots = bots_copy

  # if rewards[-1] == 1.0:
  #   win_rate += 1.0

# print(win_rate/50)
print(len(acting_players))
print(len(reacting_players))
print(len(actions_game))
print(len(reactions_game))
print(len(challenges_game))
print(len(cards_game))
print(len(coins_game))
print(len(done))


data = {
    'acting_players': acting_players,
    'reacting_players': reacting_players,
    'actions_game': actions_game,
    'reactions_game': reactions_game,
    'challenges_game': challenges_game,
    'challenges_direction': challenges_direction,
    'cards_game': cards_game,
    'card_chosen': cards_chosen,
    'coins_game': coins_game,
    'done': done,
    'rewards': rewards
}



df = pd.DataFrame(data = data)

print(df.head(3))
print()
print(df.tail(3))
print()

print(df[10:15])

#

35
35
35
35
35
35
35
35
   acting_players  reacting_players  actions_game  reactions_game  \
0               0                 1             0               0   
1               1                 1             3               0   
2               2                 3             2               0   

   challenges_game  challenges_direction                        cards_game  \
0                0                     2  [[4, 1], [6, 6], [6, 6], [6, 6]]   
1                0                     2  [[4, 1], [5, 1], [5, 2], [4, 3]]   
2                0                     2  [[4, 1], [5, 1], [5, 2], [4, 3]]   

   card_chosen    coins_game  done  rewards  
0            0  [2, 2, 2, 2]     0      0.0  
1            0  [3, 2, 2, 2]     0      0.0  
2            0  [3, 5, 2, 2]     0      0.0  

    acting_players  reacting_players  actions_game  reactions_game  \
32               3                 1             2               0   
33               1                 3             1           

  x[-1][9:12] = torch.tensor(embedding_actions(torch.tensor(action)))


In [465]:
from os import stat
from collections import defaultdict
from collections import Counter
num_episodes = 1000
max_steps_per_episode = 200
epsilon = 1.0
list_division = 4
gamma = 0.99

bot = copy.deepcopy(bots[0])

avg_losses_action = []
avg_losses_block = []
avg_losses_challenge = []
avg_losses_card = []

# bots.remove(bots[-1])
# bots.remove(bots[-1])

win_rates = []
avg_game_lengths = []

data_fraction = 1/5

batch_size = 64

for episode in range(num_episodes):

  replay_buffer_actions = []
  replay_buffer_blocks = []
  replay_buffer_challenges = []
  replay_buffer_cards = []

  print(f'episode {episode} of 1000')
  print(f'epsilon: {epsilon}')
  print(f'gamma: {gamma}')

  # discard_piles, acting_players, reacting_players, current_players, actions_game, reactions_game, challenges_game, cards_game, coins_game, challenges_direction, done, rewards, cards_chosen, bots_copy = game_loop_random(bots, actions, influences_reverse, epsilon)
  # bots = bots_copy
  state = torch.empty((0, 12), dtype=torch.float32)  # Assume state_size = 25 for action network
  state_block = torch.empty((0, 13), dtype=torch.float32)
  state_challenge = torch.empty((0, 13), dtype=torch.float32)
  state_card = torch.empty((0, 13), dtype=torch.float32)
  # states_action = torch.empty((0, 24), dtype=torch.float32)  # Assume state_size = 25 for action network
  # next_states_action = torch.empty((0, 24), dtype=torch.float32)
  actions_main = torch.empty((0,), dtype=torch.int64)
  # states_block = torch.empty((0, 23), dtype=torch.float32)  # Assume state_size = 24 for block network
  # next_states_block = torch.empty((0, 23), dtype=torch.float32)
  actions_block = torch.empty((0,), dtype=torch.int64)
  # states_challenge = torch.empty((0, 24), dtype=torch.float32)  # Assume state_size = 25 for challenge network
  # next_states_challenge = torch.empty((0, 24), dtype=torch.float32)
  actions_challenge = torch.empty((0,), dtype=torch.int64)
  # states_card = torch.empty((0, 19), dtype=torch.float32)  # Assume state_size = 20 for card network
  # next_states_card = torch.empty((0, 19), dtype=torch.float32)
  actions_card = torch.empty((0,), dtype=torch.int64)
  rewards = torch.empty((0,), dtype=torch.float32)
  done = torch.empty((0,), dtype=torch.float32)
  game_length_sum = 0
  all_discard_piles = []
  acting_players = []
  reacting_players = []
  reactions_game = []

  num_games = 0


  while len(state) <= 25 * batch_size:
    # print(f'game {num_games}')

    discard_pile, acting_player, reacting_player, current_player, action_game, reaction_game, challenge_game, card_game, coin_game, challenge_direction, done_0, reward, card_chosen, bots_copy = game_loop_random(bots, actions, influences_reverse, epsilon)

    num_games += 1
    # print(f'Game Number {num_games}')

    # if random.random():
    #   print(reacting_player)

    # start_index = int(3 * len(acting_player) / list_division)

    bots = bots_copy

    game_length_sum += len(acting_player)

    split_point = int((1 - data_fraction) * len(acting_player))
    acting_players += acting_player
    reacting_players += reacting_player
    current_players = current_player
    actions_game = action_game
    reactions_game += reaction_game
    challenges_game = challenge_game
    cards_game = card_game
    coins_game = coin_game
    challenges_direction = challenge_direction
    cards_chosen = card_chosen
    discard_piles = discard_pile
    all_discard_piles += discard_pile

    avg_game_lengths.append(game_length_sum / 100)

    # 1. Cards in play (embedded):
    all_cards_in_play_embedded = []

    for current_discard_pile in discard_pile:
        cards_in_play_embedded = []
        for card_name in influences.keys():
            num_in_discard = current_discard_pile.count(inf_map[card_name])
            num_in_play = 3 - num_in_discard
            cards_in_play_embedded.append(torch.tensor(num_in_play)) # Remove .tolist() here

        all_cards_in_play_embedded.append(torch.stack(cards_in_play_embedded)) # Stack the embedded tensors here

    # Convert to a single tensor outside the loop
    all_cards_in_play_embedded = torch.stack(all_cards_in_play_embedded)

    # 4. Bot 0's normalized coins:
    bot0_coins_normalized = torch.tensor(coin_game)[:, 0] / 12  # Get Bot 0's coins and normalize

    # 5. Average cards of other players (normalized and embedded):
    avg_other_cards_normalized = []
    for step_cards in cards_game:
        other_bots_cards = [len([card for card in bot_cards if card != 0])
                          for bot_cards in step_cards[1:]]  # Exclude Bot 0
        avg_other_cards = sum(other_bots_cards) / len(other_bots_cards) if other_bots_cards else 0
        avg_other_cards_normalized.append(avg_other_cards / 2)

    avg_other_cards_normalized = torch.tensor(avg_other_cards_normalized)
    # avg_other_cards_embedded = embedding_cards(torch.tensor(int(avg_other_cards_normalized))).tolist()  # Assuming embedding_cards is your embedding layer

    # 6. Bot 0's current cards (embedded):
    all_bot0_cards_embedded = []  # Store embedded cards for all steps

    for step_cards in cards_game:
        bot0_cards_embedded = []
        for card in step_cards[0]:  # Get Bot 0's cards for this step
            if card != 0:  # Assuming 0 represents the absence of a card
                bot0_cards_embedded.extend(embedding_cards(torch.tensor(card)).tolist())

        # If Bot 0 has no cards, add zero embeddings for consistency
        while len(bot0_cards_embedded) < embedding_cards.embedding_dim * 2:  # Assuming 2 cards max
            bot0_cards_embedded.extend([0] * embedding_cards.embedding_dim)

        all_bot0_cards_embedded.append(torch.tensor(bot0_cards_embedded))  # Convert to tensor and store

    all_bot0_cards_embedded = torch.stack(all_bot0_cards_embedded) # Stack to create a 2D tensor

    # 7. The last action taken (embedded):
    all_last_action_embedded = []  # Store embedded last actions for all steps

    actions_game.insert(0, 7)  # Add a dummy action at the beginning
    reaction_game.insert(0, 0)  # Add a dummy reaction at the beginning
    challenges_game.insert(0, 0)  # Add a dummy challenge at the beginning

    for i in range(len(actions_game[:-1])):
        last_action = actions_game[i]  # Get the action for the current step
        last_action_embedded = embedding_actions(torch.tensor(last_action)).tolist()
        all_last_action_embedded.append(last_action_embedded)

    all_last_action_embedded = torch.tensor(all_last_action_embedded)  # Convert to a tensor

    new_state = torch.cat(([all_cards_in_play_embedded.unsqueeze(-1),
                        bot0_coins_normalized.unsqueeze(-1).unsqueeze(-1),
                        avg_other_cards_normalized.unsqueeze(-1).unsqueeze(-1),
                        all_bot0_cards_embedded.unsqueeze(-1),
                        all_last_action_embedded.unsqueeze(-1)]),
                      1).squeeze(2)
    state = torch.cat([state, new_state], 0)

    # print(new_state.shape)

    # print(torch.tensor(reactions_game[:-1]).shape)

    new_state_block = torch.cat([new_state, torch.tensor(reaction_game[:-1]).unsqueeze(1)], 1)
    state_block = torch.cat([state_block, new_state_block], 0)

    new_state_challenge = torch.cat([new_state, torch.tensor(challenges_game[1:]).unsqueeze(1)], 1)
    state_challenge = torch.cat([state_challenge, new_state_challenge], 0)

    new_state_card = torch.cat([new_state, torch.tensor(cards_chosen).unsqueeze(1)], 1)
    state_card = torch.cat([state_card, new_state_card], 0)

    new_actions_main = torch.tensor(actions_game[1:]).type(torch.int64)
    actions_main = torch.cat([actions_main, new_actions_main], 0)


    new_actions_block = torch.tensor(reaction_game).type(torch.int64)
    actions_block = torch.cat([actions_block, new_actions_block], 0)


    new_actions_challenge = torch.tensor(challenges_game).type(torch.int64)
    actions_challenge = torch.cat([actions_challenge, new_actions_challenge], 0)


    new_actions_card = torch.tensor(cards_chosen).type(torch.int64)
    actions_card = torch.cat([actions_card, new_actions_card], 0)

    new_rewards = torch.tensor(reward).type(torch.float32)
    rewards = torch.cat([rewards, new_rewards], 0)

    new_done = torch.tensor(done_0).type(torch.float32)
    done = torch.cat([done, new_done], 0)



  print(f'Number of games in episode {episode}: {num_games}')

  print(state.shape)
  print(state_block.shape)
  print(state_challenge.shape)
  print(state_card.shape)

  states_action = torch.empty((0, 12), dtype=torch.float32)
  states_block = torch.empty((0, 13), dtype=torch.float32)
  states_challenge = torch.empty((0, 13), dtype=torch.float32)
  states_card = torch.empty((0, 13), dtype=torch.float32)

  # Assuming you have a list called 'all_states' that contains all the states
  # generated using the 'new_state' calculation you provided
  # and 'acting_players' list that has acting players per state,
  # and 'reacting_players' list for reacting players

  # all_states = []  # Initialize with your existing state generation logic

  # print(state.shape)
  # print(all_bot0_cards_embedded.shape)

  # Create a dictionary to store current states and their corresponding next states
  state_transitions = defaultdict(list)

  for i in range(len(state) - 1):  # Iterate through all states (except the last one)
      # print(next_state[7:9])
      current_state = state[i]
      next_state = state[i + 1]

      # Add the next state to the list of next states for the current state
      state_transitions[tuple(current_state.tolist())].append(next_state)  # Convert to tuple for dictionary key

  state_indices = {}

  for i, state_tensor in enumerate(state):
      state_indices[tuple(state_tensor.tolist())] = i

  for i in range(len(state) - 1):  # Iterate through all states (except the last one)

      current_state_action = state[i]
      current_state_block = state_block[i]
      current_state_challenge = state_challenge[i]
      current_state_card = state_card[i]
      # next_state = state[i + 1]

      # --- Action Network ---
      if acting_players[i] == 0:  # Check acting player for current state
          states_action = torch.cat([states_action, current_state_action.unsqueeze(0)], 0)
      # else:
      #     next_states_action = torch.cat([next_states_action, next_state.unsqueeze(0)], 0)

      # --- Reaction Network & Challenge Network ---
      if reacting_players[i] == 0:  # Bot 0 is the reacting player
          states_block = torch.cat([states_block, current_state_block.unsqueeze(0)], 0)
      # else:
      #     next_states_block = torch.cat([next_states_block, next_state.unsqueeze(0)], 0)

      # --- Challenge Network ---
      if reacting_players[i] == 0:
          states_challenge = torch.cat([states_challenge, current_state_challenge.unsqueeze(0)], 0)
      elif acting_players[i] == 0 and reactions_game[i+1] == 1: # Check acting player for current state
          states_challenge = torch.cat([states_challenge, current_state_challenge.unsqueeze(0)], 0)
      # else:
      #     next_states_challenge = torch.cat([next_states_challenge, next_state.unsqueeze(0)], 0)

      # --- Card Network ---
      # Assuming 'all_discard_piles' contains discard piles for each state
      # and 'all_bot0_cards_embedded' contains Bot 0's cards for each state

      # current_discard_pile_size = len(all_discard_piles[i])
      # next_discard_pile_size = len(all_discard_piles[i + 1])

      # Check if Bot 0 lost a card in the transition
      bot0_current_cards = state_card[i][7:9]
      bot0_next_cards    = state_card[i+1][7:9]
      if not torch.equal(bot0_current_cards, bot0_next_cards):

          states_card = torch.cat([states_card, current_state_card.unsqueeze(0)], 0)
      # else:
      #     next_states_card = torch.cat([next_states_card, next_state.unsqueeze(0)], 0)


  # print(state.shape)

  # print(states_action.shape)
  # print(next_states_action.shape)
  # print(len(states_action) + len(next_states_action))

  # print(states_block.shape)
  # print(next_states_block.shape)
  # print(len(states_block) + len(next_states_block))

  # print(states_challenge.shape)
  # print(next_states_challenge.shape)
  # print(len(states_challenge) + len(next_states_challenge))

  # print(states_card.shape)
  # print(next_states_card.shape)
  # print(len(states_card) + len(next_states_card))


  losses_action = []
  losses_block = []
  losses_challenge = []
  losses_card = []

  ############################################
# Example: On-the-fly building raw sequences
#          and passing them into the RNN Q
############################################
  num_batches_action = len(states_action) // batch_size

  # for i in range(num_batches_action):
  # -------------------------------------------------
  # 1) Sample a batch of indices
  # -------------------------------------------------
  batch_indices_action = random.sample(
      range(len(states_action)),
      min(batch_size, len(states_action))
  )

  batch_states_action = torch.stack([states_action[j] for j in batch_indices_action])
  batch_actions_main  = actions_main[batch_indices_action]  # shape (batch_size,)
  # (Optional) Make sure batch_actions_main is torch.LongTensor
  batch_actions_main  = torch.tensor(batch_actions_main, dtype=torch.long)

  # Lists to store Q-values we compute for each item in this mini-batch
  q_values_current_list = []
  q_values_nextmax_list = []
  rewards_list          = []
  done_list             = []

  # -------------------------------------------------
  # 2) Process each single-frame "state" in the batch
  #    to build the raw sequences for current & next
  # -------------------------------------------------
  for idx, single_frame_state in enumerate(batch_states_action):
      # ------------------
      # Find global index
      # ------------------
      indices = torch.where((state == single_frame_state).all(dim=1))[0]
      if len(indices) == 0:
          # We did not find it => treat as terminal
          # => Q(s) is “whatever”, we can do 0
          # => Or we skip, but let's just do zero and done=1
          q_values_current_list.append(torch.tensor(0.0))  # single float
          q_values_nextmax_list.append(torch.tensor(0.0))
          rewards_list.append(torch.tensor(0.0))
          done_list.append(torch.tensor(1.0))
          continue

      state_index = indices[0].item()

      # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
      # (a) Build the "current" sequence: from last done to now
      # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
      cur_seq_frames = []
      back_index = state_index
      while back_index >= 0 and done[back_index] != 1:
          cur_seq_frames.insert(0, state[back_index])  # front => chronological
          back_index -= 1
      # shape => (seq_len, 12)
      # try:
      cur_seq_tensor = torch.stack(cur_seq_frames, dim=0)
      # except:
      #   cur_seq_tensor = torch.stack(single_frame_state, dim=0)
      # shape => (1, seq_len, 12)
      cur_seq_tensor = cur_seq_tensor.unsqueeze(0)

      # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
      # (b) Build the "next" sequence
      # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
      next_seq_frames = []
      rewards_for_seq = []
      current_index = state_index + 1
      while (current_index < len(state)):
          next_seq_frames.append(state[current_index])
          rewards_for_seq.append(rewards[current_index])
          if done[current_index] == 1:
              break
          current_index += 1

      # We will decide how to handle "no next frames"
      if len(next_seq_frames) == 0:
          # Means terminal or no next chunk
          next_seq_tensor = None  # we'll handle it below
      else:
          next_seq_tensor = torch.stack(next_seq_frames, dim=0)
          next_seq_tensor = next_seq_tensor.unsqueeze(0)  # shape (1, seq_len2, 12)

      # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
      # (c) Forward pass for the current sequence
      # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
      # shape => (1, action_dim) if batch_size=1
      out_current = bot.action_q(cur_seq_tensor)
      # We gather the chosen action’s Q
      chosen_action = batch_actions_main[idx]  # a single int
      q_val_current = out_current[0, chosen_action]  # shape => scalar

      # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
      # (d) Forward pass for the next sequence (if it exists)
      # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
      if next_seq_tensor is None:
          # Terminal. Let's define next_max_Q = 0
          q_val_nextmax = torch.tensor(0.0)
          # Reward from the first step after current_index if in bounds
          if current_index < len(rewards):
              r = rewards[current_index]
          else:
              r = 0.0
          done_flag = 1.0
      else:
          out_next = bot.action_q(next_seq_tensor)        # shape => (1, action_dim)
          q_val_nextmax = out_next.max(dim=1)[0].squeeze() # scalar
          # Reward is average of rewards_for_seq
          if len(rewards_for_seq) > 0:
              r = sum(rewards_for_seq) / len(rewards_for_seq)
          else:
              r = 0.0
          # Are we done? If in range, check done[current_index]
          if current_index < len(done):
              done_flag = float(done[current_index])
          else:
              done_flag = 1.0

      # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
      # (e) Collect everything in lists
      # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
      q_values_current_list.append(q_val_current)
      q_values_nextmax_list.append(q_val_nextmax)
      rewards_list.append(torch.tensor(r, dtype=torch.float32))
      done_list.append(torch.tensor(done_flag, dtype=torch.float32))

  # -------------------------------------------------
  # 3) Convert results to Tensors
  #    so we can do a standard DQN loss
  # -------------------------------------------------
  # shape => (batch_size,)
  q_current_t     = torch.stack(q_values_current_list)  # current Q chosen action
  q_nextmax_t     = torch.stack(q_values_nextmax_list)
  rewards_t       = torch.stack(rewards_list)
  done_t          = torch.stack(done_list)

  # DQN target
  target_q = rewards_t + gamma * q_nextmax_t * (1.0 - done_t)

  # -------------------------------------------------
  # 4) Compute loss & backprop
  # -------------------------------------------------
  loss_action = criterion(q_current_t, target_q)
  bot.optimizer_action.zero_grad()
  loss_action.backward()
  bot.optimizer_action.step()

  # -------------------------------------------------
  # 5) Remove used transitions from replay
  #    (Optional, as in your original code)
  # -------------------------------------------------
  # states_action_np = states_action.cpu().numpy()
  # actions_main_np  = actions_main.cpu().numpy()

  # states_action_np = np.delete(states_action_np, batch_indices_action, axis=0)
  # actions_main_np  = np.delete(actions_main_np,  batch_indices_action, axis=0)

  # states_action = torch.tensor(states_action_np, dtype=torch.float32)
  # actions_main  = torch.tensor(actions_main_np,  dtype=torch.int64)

  losses_action.append(loss_action.item())


    # print('action success')




  ###############################################################################
# RNN-based DQN training loop for your "block" decision, without a summarizer
###############################################################################
  num_batches_block = len(states_block) // batch_size

  # for i in range(num_batches_block):
  # 1) Sample a random batch of indices
  batch_indices_block = random.sample(
      range(len(states_block)),
      min(batch_size, len(states_block))
  )

  # 2) Gather Tensors for this mini-batch
  batch_states_block  = torch.stack([states_block[j] for j in batch_indices_block])
  batch_actions_block = actions_block[batch_indices_block]  # shape (batch_size,)
  # Make sure actions are long-int for gather
  batch_actions_block = torch.tensor(batch_actions_block, dtype=torch.long)

  # Lists to store results for each item in the batch
  q_values_current_list = []
  q_values_nextmax_list = []
  rewards_list          = []
  done_list             = []

  # ------------------------------------------------------------------
  # 3) For each single-frame in batch_states_block:
  #    (a) Build "current" sequence (backwards)
  #    (b) Build "next" sequence (forwards)
  #    (c) Forward pass each through RNN
  #    (d) Collect chosen-action Q, next-max Q, reward, and done
  # ------------------------------------------------------------------
  for idx, single_frame_state in enumerate(batch_states_block):
      # 3.1) Find the global index in your big `state` array
      indices = torch.where((state_block == single_frame_state).all(dim=1))[0]

      if len(indices) == 0:
          # If we didn't find it, treat as terminal
          q_values_current_list.append(torch.tensor(0.0))
          q_values_nextmax_list.append(torch.tensor(0.0))
          rewards_list.append(torch.tensor(0.0))
          done_list.append(torch.tensor(1.0))
          continue

      state_index = indices[0].item()

      # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
      # (a) Build "current" sequence by going backward until done
      # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
      cur_seq_frames = []
      back_index = state_index
      while back_index >= 0 and done[back_index] != 1:
          cur_seq_frames.insert(0, state_block[back_index])  # front => chronological
          back_index -= 1
      # If we ended up with an empty chunk, fallback to single frame
      if len(cur_seq_frames) == 0:
          cur_seq_frames.append(state_block[state_index])

      # shape: (seq_len, 12)
      cur_seq_tensor = torch.stack(cur_seq_frames, dim=0)
      # shape: (1, seq_len, 12) for the RNN
      cur_seq_tensor = cur_seq_tensor.unsqueeze(0)

      # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
      # (b) Build "next" sequence by going forward until done or not reacting
      # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
      next_seq_frames = []
      rewards_for_seq = []
      current_idx = state_index + 1

      while (current_idx < len(state)):
          next_seq_frames.append(state_block[current_idx])
          rewards_for_seq.append(rewards[current_idx])
          if done[current_idx] == 1:
              break
          current_idx += 1

      # We will define "no next frames" => terminal
      if len(next_seq_frames) == 0:
          next_seq_tensor = None
      else:
          next_seq_tensor = torch.stack(next_seq_frames, dim=0)
          next_seq_tensor = next_seq_tensor.unsqueeze(0)  # (1, seq_len2, 12)

      # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
      # (c) Forward pass: current sequence
      # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
      out_current = bot.block_q(cur_seq_tensor)  # shape => (1, num_actions)
      chosen_action = batch_actions_block[idx]
      q_val_current = out_current[0, chosen_action]  # => scalar

      # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
      # (d) Forward pass: next sequence (if exists)
      # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
      if next_seq_tensor is None:
          # Terminal
          q_val_nextmax = torch.tensor(0.0)
          # Reward
          if current_idx < len(rewards):
              r = rewards[current_idx]
          else:
              r = 0.0
          done_flag = 1.0
      else:
          out_next = bot.block_q(next_seq_tensor)         # => (1, num_actions)
          q_val_nextmax = out_next.max(dim=1)[0].squeeze()  # => scalar

          # Reward = average (or sum) across the forward chunk
          if len(rewards_for_seq) > 0:
              r = sum(rewards_for_seq) / len(rewards_for_seq)
          else:
              r = 0.0

          # Done or not
          if current_idx < len(done):
              done_flag = float(done[current_idx])
          else:
              done_flag = 1.0

      # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
      # (e) Store in lists
      # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
      q_values_current_list.append(q_val_current)
      q_values_nextmax_list.append(q_val_nextmax)
      rewards_list.append(torch.tensor(r, dtype=torch.float32))
      done_list.append(torch.tensor(done_flag, dtype=torch.float32))

  # ------------------------------------------------------------------
  # 4) Convert the results to Tensors for a standard DQN update
  # ------------------------------------------------------------------
  q_current_t = torch.stack(q_values_current_list)   # shape (batch_size,)
  q_nextmax_t = torch.stack(q_values_nextmax_list)   # shape (batch_size,)
  rewards_t   = torch.stack(rewards_list)            # shape (batch_size,)
  done_t      = torch.stack(done_list)               # shape (batch_size,)

  target_q = rewards_t + gamma * q_nextmax_t * (1.0 - done_t)

  # ------------------------------------------------------------------
  # 5) Compute the loss & backprop
  # ------------------------------------------------------------------
  loss_block = criterion(q_current_t, target_q)
  bot.optimizer_block.zero_grad()
  loss_block.backward()
  bot.optimizer_block.step()

  # ------------------------------------------------------------------
  # 6) Remove these samples from your replay buffer (optional)
  # ------------------------------------------------------------------
  # Convert Tensors to NumPy for deletion
  # states_block_np  = states_block.cpu().numpy()
  # actions_block_np = actions_block.cpu().numpy()

  # states_block_np  = np.delete(states_block_np,  batch_indices_block, axis=0)
  # actions_block_np = np.delete(actions_block_np, batch_indices_block, axis=0)

  # states_block  = torch.tensor(states_block_np,  dtype=torch.float32)
  # actions_block = torch.tensor(actions_block_np, dtype=torch.int64)

  # If you have next_states_block or others, remove them similarly...
  # next_states_block = np.delete(...)

  losses_block.append(loss_block.item())


    # print('block success')




  ##############################################################################
# RNN-based DQN loop for your "challenge" decision, without a separate summarizer
##############################################################################

  num_batches_challenge = len(states_challenge) // batch_size

  # for i in range(num_batches_challenge):
  # 1) Sample a batch of indices
  batch_indices_challenge = random.sample(
      range(len(states_challenge)),
      min(batch_size, len(states_challenge))
  )

  # 2) Gather the states & actions for this mini-batch
  batch_states_challenge  = torch.stack([states_challenge[j] for j in batch_indices_challenge])
  batch_actions_challenge = actions_challenge[batch_indices_challenge]
  batch_actions_challenge = torch.tensor(batch_actions_challenge, dtype=torch.long)

  # Lists to store results for the DQN update
  q_values_current_list = []
  q_values_nextmax_list = []
  rewards_list          = []
  done_list             = []

  # ----------------------------------------------------------------
  # 3) For each single-frame in batch_states_challenge:
  #    - Build backward-chunk (current)
  #    - Build forward-chunk (next) per your challenge logic
  #    - RNN forward pass -> Q-values
  # ----------------------------------------------------------------
  for idx, single_frame_state in enumerate(batch_states_challenge):
      # Try to locate this state in the global `state` array
      indices = torch.where((state_challenge == single_frame_state).all(dim=1))[0]

      if len(indices) == 0:
          # If not found, treat as terminal
          # Q(current) = 0, Q(next_max) = 0, reward=0, done=1
          q_values_current_list.append(torch.tensor(0.0))
          q_values_nextmax_list.append(torch.tensor(0.0))
          rewards_list.append(torch.tensor(0.0))
          done_list.append(torch.tensor(1.0))
          continue

      state_index = indices[0].item()

      # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
      # 3A) Build the backward-chunk for the current state
      # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
      cur_seq_frames = []
      back_index = state_index
      # Move backward until we hit done=1 or index < 0
      # (You could also incorporate more challenge-specific conditions
      # if you want symmetrical logic with forward-chunk.)
      while back_index >= 0 and done[back_index] != 1:
          cur_seq_frames.insert(0, state_challenge[back_index])  # front => chronological order
          back_index -= 1

      if len(cur_seq_frames) == 0:
          # fallback to single frame if we got nothing
          cur_seq_frames.append(state_challenge[state_index])

      # Shape: (seq_len, 12)
      cur_seq_tensor = torch.stack(cur_seq_frames, dim=0)
      # Shape: (1, seq_len, 12) for the RNN
      cur_seq_tensor = cur_seq_tensor.unsqueeze(0)

      # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
      # 3B) Build the forward-chunk for the next state
      #     Use your "challenge" condition:
      #         reacting_players[idx] != 0
      #         AND (acting_players[idx] != 0 OR reactions_game[idx] != 1)
      #         AND done[idx] != 1
      # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
      next_seq_frames = []
      rewards_for_seq = []
      current_idx     = state_index + 1

      while (
          current_idx < len(state)):
          next_seq_frames.append(state_challenge[current_idx])
          rewards_for_seq.append(rewards[current_idx])
          if done[current_idx] == 1:
              break
          current_idx += 1

      # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
      # RNN Forward pass for the "current" sequence
      # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
      out_current = bot.challenge_q(cur_seq_tensor)  # shape (1, num_actions)
      chosen_action = batch_actions_challenge[idx]
      q_val_current = out_current[0, chosen_action]   # => scalar

      # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
      # RNN Forward pass for the "next" sequence
      # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
      q_val_nextmax = torch.tensor(0.0)
      r = 0.0
      done_flag = 1.0
      if len(next_seq_frames) == 0:
          # Terminal
          q_val_nextmax = torch.tensor(0.0)
          # Reward:
          if current_idx < len(rewards):
              r = rewards[current_idx]
          else:
              r = 0.0
          done_flag = 1.0
      else:
          # Non-terminal
          next_seq_tensor = torch.stack(next_seq_frames, dim=0).unsqueeze(0)
          out_next        = bot.challenge_q(next_seq_tensor)   # shape (1, num_actions)
          q_val_nextmax   = out_next.max(dim=1)[0].squeeze()   # => scalar

          # Reward as average (or adjust logic as you see fit)
          if len(rewards_for_seq) > 0:
              r = sum(rewards_for_seq) / len(rewards_for_seq)
          else:
              r = 0.0

          if current_idx < len(done):
              done_flag = float(done[current_idx])
          else:
              done_flag = 1.0  # out of bounds => terminal

      # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
      # Collect everything for the DQN update
      # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
      q_values_current_list.append(q_val_current)
      q_values_nextmax_list.append(q_val_nextmax)
      rewards_list.append(torch.tensor(r, dtype=torch.float32))
      done_list.append(torch.tensor(done_flag, dtype=torch.float32))

  # ----------------------------------------------------------------
  # 4) Convert to Tensors & compute DQN loss
  # ----------------------------------------------------------------
  q_current_t = torch.stack(q_values_current_list)  # shape (batch_size,)
  q_nextmax_t = torch.stack(q_values_nextmax_list)  # shape (batch_size,)
  rewards_t   = torch.stack(rewards_list)           # shape (batch_size,)
  done_t      = torch.stack(done_list)              # shape (batch_size,)

  target_q = rewards_t + gamma * q_nextmax_t * (1.0 - done_t)

  loss_challenge = criterion(q_current_t, target_q)

  # ----------------------------------------------------------------
  # 5) Backprop & optimize
  # ----------------------------------------------------------------
  bot.optimizer_challenge.zero_grad()
  loss_challenge.backward()
  bot.optimizer_challenge.step()

  # ----------------------------------------------------------------
  # 6) Remove these samples from your replay buffer
  # ----------------------------------------------------------------
  # Convert Tensors -> NumPy for np.delete
  # states_challenge_np  = states_challenge.cpu().numpy()
  # actions_challenge_np = actions_challenge.cpu().numpy()
  # # If you have "next_states_challenge" or others, similarly convert them

  # states_challenge_np  = np.delete(states_challenge_np,  batch_indices_challenge, axis=0)
  # actions_challenge_np = np.delete(actions_challenge_np, batch_indices_challenge, axis=0)

  # # Rebuild your PyTorch Tensors
  # states_challenge  = torch.tensor(states_challenge_np,  dtype=torch.float32)
  # actions_challenge = torch.tensor(actions_challenge_np, dtype=torch.int64)

  # If you store next_states_challenge, remove them as well:
  # next_states_challenge_np = next_states_challenge.cpu().numpy()
  # next_states_challenge_np = np.delete(next_states_challenge_np, batch_indices_challenge, axis=0)
  # next_states_challenge = torch.tensor(next_states_challenge_np, dtype=torch.float32)

  losses_challenge.append(loss_challenge.item())


    # print('challenge success')




  ##############################################################################
# RNN-based training loop for your "card" decision,
# building raw backward and forward sequences on-the-fly
##############################################################################
  num_batches_card = len(states_card) // batch_size

  if num_batches_card == 0:
    raise Exception("Something went wrong")

  # for i in range(num_batches_card):

  # 1) Sample batch indices
  batch_indices_card = random.sample(
      range(len(states_card)),
      min(batch_size, len(states_card))
  )

  # 2) Gather states & actions for this batch
  batch_states_card  = torch.stack([states_card[j] for j in batch_indices_card])  # (batch_size, 12)
  batch_actions_card = actions_card[batch_indices_card]                           # (batch_size,)
  batch_actions_card = torch.tensor(batch_actions_card, dtype=torch.long)

  # We'll collect Q-values and targets for the entire batch
  q_current_list = []
  q_nextmax_list = []
  reward_list    = []
  done_list      = []

  # ------------------------------------------------------------------------
  # 3) For each single-frame in batch_states_card, build backward & forward
  # ------------------------------------------------------------------------
  for idx, single_frame_state in enumerate(batch_states_card):
      # Locate this state in the global 'state' buffer
      indices = torch.where((state_card == single_frame_state).all(dim=1))[0]
      if len(indices) == 0:
          # Not found => treat as terminal
          # Q(current) = 0, Q(next) = 0, reward=0, done=1
          q_current_list.append(torch.tensor(0.0))
          q_nextmax_list.append(torch.tensor(0.0))
          reward_list.append(torch.tensor(0.0))
          done_list.append(torch.tensor(1.0))
          continue

      state_index = indices[0].item()

      # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
      # 3A) BACKWARD chunk for "current" state
      # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
      cur_seq_frames = []
      back_idx = state_index

      # Move backward until done=1 or out of array
      while back_idx >= 0 and done[back_idx] != 1:
          cur_seq_frames.insert(0, state_card[back_idx])  # insert at front => chronological order
          back_idx -= 1

      if len(cur_seq_frames) == 0:
          # fallback to just the single frame
          cur_seq_frames.append(state_card[state_index])

      # Turn into shape (1, seq_len, 12) for the RNN
      cur_seq_tensor = torch.stack(cur_seq_frames, dim=0).unsqueeze(0)

      # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
      # 3B) FORWARD chunk for "next" state, until:
      #     - done=1
      #     - Bot 0's cards change (state[..., 7:9] differs from next step)
      # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
      fwd_seq_frames = []
      rewards_for_seq = []
      current_idx = state_index + 1

      while current_idx < len(state):

          # Append the current frame
          fwd_seq_frames.append(state_card[current_idx])
          rewards_for_seq.append(rewards[current_idx])

          # Check if the *next* step changes Bot 0's cards
          # so we break *after* including the current frame if it
          # leads to a change in [7:9].
          # if current_idx + 1 < len(state):
          #     # Compare the slice [7:9] of the current vs. the next
          #     bot0_current_cards = state_card[current_idx][7:9]
          #     bot0_next_cards    = state_card[current_idx+1][7:9]
          #     if not torch.equal(bot0_current_cards, bot0_next_cards):
          #         # Bot 0's cards changed => break
          #         current_idx += 1  # increment so we include the reward
          #         break

          # Stop if done=1
          if done[current_idx] == 1:
              break

          # If no change, keep going
          current_idx += 1

          # Also break if we’ve run out of array (the while condition checks that anyway)
          # but we’ll rely on the loop condition.

      # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
      # Forward pass in the RNN for the "current" sequence
      # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
      out_current = bot.card_q(cur_seq_tensor)     # shape: (1, num_actions)
      # pick the Q-value for the chosen action
      chosen_action = batch_actions_card[idx]
      q_val_current = out_current[0, chosen_action] # => scalar

      # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
      # Forward pass for the "next" sequence
      # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
      if len(fwd_seq_frames) == 0:
          # Terminal if we got no next frames
          q_val_nextmax = torch.tensor(0.0)
          # Reward
          if current_idx < len(state):
              r = rewards[current_idx]
          else:
              r = 0.0
          done_flag = 1.0
      else:
          # Non-terminal
          fwd_seq_tensor = torch.stack(fwd_seq_frames, dim=0).unsqueeze(0)
          out_next       = bot.card_q(fwd_seq_tensor)  # shape (1, num_actions)
          q_val_nextmax  = out_next.max(dim=1)[0].squeeze()      # => scalar

          # Could do average or sum of rewards
          if len(rewards_for_seq) > 0:
              r = sum(rewards_for_seq) / len(rewards_for_seq)
          else:
              r = 0.0

          # done flag
          if current_idx < len(done):
              done_flag = float(done[current_idx])
          else:
              done_flag = 1.0

      # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
      # Collect results for the DQN update
      # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
      q_current_list.append(q_val_current)
      q_nextmax_list.append(q_val_nextmax)
      reward_list.append(torch.tensor(r, dtype=torch.float32))
      done_list.append(torch.tensor(done_flag, dtype=torch.float32))

  # ------------------------------------------------------------------
  # 4) Convert to Tensors & compute the DQN target
  # ------------------------------------------------------------------
  q_current_t = torch.stack(q_current_list)   # (batch_size,)
  q_nextmax_t = torch.stack(q_nextmax_list)   # (batch_size,)
  reward_t    = torch.stack(reward_list)      # (batch_size,)
  done_t      = torch.stack(done_list)        # (batch_size,)

  # DQN target: r + gamma * max(Q(next)) * (1 - done)
  target_q = reward_t + gamma * q_nextmax_t * (1.0 - done_t)

  # ------------------------------------------------------------------
  # 5) Compute loss & optimize
  # ------------------------------------------------------------------
  loss_card = criterion(q_current_t, target_q)

  bot.optimizer_card.zero_grad()
  loss_card.backward()
  bot.optimizer_card.step()

  # ------------------------------------------------------------------
  # 6) Remove these samples from the replay buffer
  # ------------------------------------------------------------------
  # states_card_np   = states_card.cpu().numpy()
  # actions_card_np  = actions_card.cpu().numpy()
  # # If you have next_states_card, similarly convert & remove

  # states_card_np   = np.delete(states_card_np,  batch_indices_card, axis=0)
  # actions_card_np  = np.delete(actions_card_np, batch_indices_card, axis=0)

  # states_card  = torch.tensor(states_card_np, dtype=torch.float32)
  # actions_card = torch.tensor(actions_card_np, dtype=torch.int64)

  losses_card.append(loss_card.item())



  # bot.cards = bots[0].cards
  # bot.num_coins = bots[0].num_coins






    # i += 1
  epsilon *= 0.995

  # if (episode + 1) % 100 == 0:
  #   data_fraction = min(data_fraction + 1/5, 1)
  #   epsilon = 1.0

  # gamma = min(0.99, gamma + 0.001)

  avg_losses_action.append(sum(losses_action) / len(losses_action))
  avg_losses_block.append(sum(losses_block) / len(losses_block))
  avg_losses_challenge.append(sum(losses_challenge) / len(losses_challenge))
  avg_losses_card.append(sum(losses_card) / len(losses_card))

  print(f'Avg Action Loss, {num_batches_action} batches: {avg_losses_action[-1]}')
  print(f'Avg Block Loss, {num_batches_block} batches: {avg_losses_block[-1]}')
  print(f'Avg Challenge Loss, {num_batches_challenge} batches: {avg_losses_challenge[-1]}')
  print(f'Avg Card Loss, {num_batches_card} batches: {avg_losses_card[-1]}')

  # Copy parameters of action_q network
  bots[0].action_q.load_state_dict(bot.action_q.state_dict())

  # Copy parameters of block_q network
  bots[0].block_q.load_state_dict(bot.block_q.state_dict())

  # Copy parameters of challenge_q network
  bots[0].challenge_q.load_state_dict(bot.challenge_q.state_dict())

  # Copy parameters of card_q network
  bots[0].card_q.load_state_dict(bot.card_q.state_dict())

  def verify_allclose(net_a, net_b, eps=1e-6):
    for p_a, p_b in zip(net_a.parameters(), net_b.parameters()):
        if not torch.allclose(p_a, p_b, atol=eps, rtol=1e-5):
            return False
    return True

  print("Action Q matches?",
        verify_allclose(bots[0].action_q, bot.action_q))
  print("Block Q matches?",
        verify_allclose(bots[0].block_q, bot.block_q))
  print("Challenge Q matches?",
        verify_allclose(bots[0].challenge_q, bot.challenge_q))
  print("Card Q matches?",
        verify_allclose(bots[0].card_q, bot.card_q))


  print(bots[0].name)

  if episode % 10 == 0:

    win_rate = 0

    bot0_actions = []

    game_lengths = 0

    for i in range(100):

      discard_pile, acting_player, reacting_player, current_player, action_game, reaction_game, challenge_game, card_game, coin_game, challenge_direction, done_0, reward, card_chosen, bots_copy = game_loop_random(bots, actions, influences_reverse, 0.0)
      bots = bots_copy

      game_lengths += len(action_game)

      for actor, action in zip(acting_player, action_game):
            if actor == 0:
                bot0_actions.append(action)

      if reward[-1] == 1.0:
        win_rate += 1

    # win_rate = win_rate / 50
    # print(f'Bot 0 Win Rate, Random Actions: {win_rate / 100}')

    if bot0_actions:  # ensure the list is not empty
        counter = Counter(bot0_actions)
        most_common_action, count = counter.most_common(1)[0]
        print(f'Most common action for Bot 0: {most_common_action} taken {count} times.')
        print(f'total game lengths: {game_lengths}')

    win_rate = win_rate / 100
    win_rates.append(win_rate)

    print(f'win rate: {win_rate}')


  # df = pd.DataFrame(data = data)
  # print(df.head())

episode 0 of 1000
epsilon: 1.0
gamma: 0.99


  x[-1][9:12] = torch.tensor(embedding_actions(torch.tensor(action)))


Number of games in episode 0: 42
torch.Size([1648, 12])
torch.Size([1648, 13])
torch.Size([1648, 13])
torch.Size([1648, 13])


  batch_actions_main  = torch.tensor(batch_actions_main, dtype=torch.long)
  rewards_list.append(torch.tensor(r, dtype=torch.float32))
  batch_actions_block = torch.tensor(batch_actions_block, dtype=torch.long)
  rewards_list.append(torch.tensor(r, dtype=torch.float32))
  batch_actions_challenge = torch.tensor(batch_actions_challenge, dtype=torch.long)
  rewards_list.append(torch.tensor(r, dtype=torch.float32))
  batch_actions_card = torch.tensor(batch_actions_card, dtype=torch.long)
  reward_list.append(torch.tensor(r, dtype=torch.float32))


Avg Action Loss, 7 batches: 0.015562765300273895
Avg Block Loss, 7 batches: 0.013180936686694622
Avg Challenge Loss, 8 batches: 0.02278696931898594
Avg Card Loss, 2 batches: 0.046754177659749985
Action Q matches? True
Block Q matches? True
Challenge Q matches? True
Card Q matches? True
0
Most common action for Bot 0: 2 taken 634 times.
total game lengths: 3791
win rate: 0.24
episode 1 of 1000
epsilon: 0.995
gamma: 0.99
Number of games in episode 1: 42
torch.Size([1606, 12])
torch.Size([1606, 13])
torch.Size([1606, 13])
torch.Size([1606, 13])
Avg Action Loss, 6 batches: 0.0151097122579813
Avg Block Loss, 5 batches: 0.017018742859363556
Avg Challenge Loss, 6 batches: 0.04598613828420639
Avg Card Loss, 2 batches: 0.0480349138379097
Action Q matches? True
Block Q matches? True
Challenge Q matches? True
Card Q matches? True
0
episode 2 of 1000
epsilon: 0.990025
gamma: 0.99
Number of games in episode 2: 42
torch.Size([1638, 12])
torch.Size([1638, 13])
torch.Size([1638, 13])
torch.Size([1638,

KeyboardInterrupt: 

In [None]:
import matplotlib.pyplot as plt

plt.plot(avg_losses_action, label='Avg Action Loss')
plt.plot(avg_losses_block, label='Avg Block Loss')
plt.plot(avg_losses_challenge, label='Avg Challenge Loss')
plt.plot(avg_losses_card, label='Avg Card Loss')

plt.xlabel('Episode/Iteration')
plt.ylabel('Loss')
plt.title('Training Losses')
plt.legend()
plt.show()

In [None]:
plt.plot(win_rates, label='Win Rate')

plt.xlabel('Episode/Iteration')
plt.ylabel('Win Rate')
plt.title('Win Rate over Time')
plt.legend()
plt.show()

In [None]:
plt.plot(avg_game_lengths, label='Avg Game Lengths')

plt.xlabel('Episode/Iteration')
plt.ylabel('Avg Game Length (timesteps)')
plt.title('Avg Game Length over Time')
plt.legend()
plt.show()

In [None]:
torch.save({
        'action_q_state_dict': bot.action_q.state_dict(),
        'block_q_state_dict': bot.block_q.state_dict(),
        'challenge_q_state_dict': bot.challenge_q.state_dict(),
        'card_q_state_dict': bot.card_q.state_dict(),
        'optimizer_action_state_dict': bot.optimizer_action.state_dict(),
        'optimizer_block_state_dict': bot.optimizer_block.state_dict(),
        'optimizer_challenge_state_dict': bot.optimizer_challenge.state_dict(),
        'optimizer_card_state_dict': bot.optimizer_card.state_dict()
    }, 'bot_parameters.pth')

In [None]:
checkpoint = torch.load('bot_parameters.pth')
bot.action_q.load_state_dict(checkpoint['action_q_state_dict'])
bot.block_q.load_state_dict(checkpoint['block_q_state_dict'])
bot.challenge_q.load_state_dict(checkpoint['challenge_q_state_dict'])
bot.card_q.load_state_dict(checkpoint['card_q_state_dict'])
bot.optimizer_action.load_state_dict(checkpoint['optimizer_action_state_dict'])
bot.optimizer_block.load_state_dict(checkpoint['optimizer_block_state_dict'])
bot.optimizer_challenge.load_state_dict(checkpoint['optimizer_challenge_state_dict'])
bot.optimizer_card.load_state_dict(checkpoint['optimizer_card_state_dict'])

In [None]:
bots[0].action_q.load_state_dict(bot.action_q.state_dict())

# Copy parameters of block_q network
bots[0].block_q.load_state_dict(bot.block_q.state_dict())

# Copy parameters of challenge_q network
bots[0].challenge_q.load_state_dict(bot.challenge_q.state_dict())

# Copy parameters of card_q network
bots[0].card_q.load_state_dict(bot.card_q.state_dict())


win_rate = 0

for i in range(100):

  # print(i)
  discard_pile, acting_player, reacting_player, current_player, action_game, reaction_game, challenge_game, card_game, coin_game, challenge_direction, done_0, reward, card_chosen, bots_copy = game_loop_random(bots, actions, influences_reverse, 0.0)
  bots = bots_copy
  if reward[-1] == 1:
    win_rate += 1

# win_rate = win_rate / 50
print(f'Bot 0 Win Rate, Random Actions: {win_rate / 100}')
