In [16]:
import pandas as pd
import numpy as np
import random
import copy
import math

In [17]:
actions_map = {
    0: 'take 1 coin',
    1: 'coup',
    2: 'take 2 coins',
    3: 'take 3 coins',
    4: 'steal 2 coins',
    5: 'assassinate',
    6: 'exchange',
    7: 'challenge',
    8: 'block foreign aid',
    9: 'block stealing',
    10: 'block assassination'
}

In [18]:
class Action:
    def __init__(self, name, challengeable, response_card, response_action,
                 p1_net_coins, p2_net_coins, p1_net_cards, p2_net_cards, vector):
        self.name = name
        self.challengeable = challengeable
        self.response_card = response_card
        self.response_action = response_action
        self.p1_net_coins = p1_net_coins
        self.p2_net_coins = p2_net_coins
        self.p1_net_cards = p1_net_cards
        self.p2_net_cards = p2_net_cards
#         self.base_utility = base_utility
#         self.p_bluff = p_bluff
        self.vector = vector

    def update_responses(self, response_card, response_action):
        self.response_card = response_card
        self.response_action = response_action


In [19]:
take_1 = Action(actions_map[0], False, None, None, 1, 0, 0, 0, [0])

coup = Action(actions_map[1], False, None, None, -7, 0, 0, -1, [1])

take_2 = Action(actions_map[2], True, 'Duke', actions_map[8], 2, 0, 0, 0, [2])

take_3 = Action(actions_map[3], True, None, actions_map[7], 3, 0, 0, 0, [3])

steal_2 = Action(actions_map[4], True, ['Captain', 'Ambassador'], actions_map[9], 2, -2, 0, 0,[4])

assassinate = Action(actions_map[5], True, 'Contessa', actions_map[10], -3, 0, 0, -1, [5])

exchange = Action(actions_map[6], True, None, actions_map[7], 0, 0, 0, 0,[6])

# challenge = Action(actions_map[7], False, None, None, 0, 0, -1, -1, 1, 0)

block_take_2 = Action(actions_map[8], True, None, actions_map[7], 0, -2, 0, 0, [7])

block_steal = Action(actions_map[9], True, None, actions_map[7], 2, -2, 0, 0, [8])

block_assassination = Action(actions_map[10], True, None, actions_map[7], 0, 0, 1, 0, [9])

# challenge =

actions = {
    0: take_1,
    1: coup,
    2: take_2,
    3: take_3,
    4: steal_2,
    5: assassinate,
    6: exchange,
    7: block_take_2,
    8: block_steal,
    9: block_assassination
}

take_2.response_action = actions[7]
steal_2.response_action = actions[8]
assassinate.response_action = actions[9]

influences = {
    'Duke': [take_3, block_take_2, take_1, coup],
    'Captain': [steal_2, block_steal, take_2, take_1, coup],
    'Assassin': [assassinate, take_2, take_1, coup],
    'Contessa': [take_2, block_assassination, take_1, coup],
    'Ambassador': [exchange, block_steal, take_2, take_1, coup]
    }

inf_map = {
    'Dead': 0,
    'Duke': 1,
    'Captain': 2,
    'Assassin': 3,
    'Contessa': 4,
    'Ambassador': 5,
    'Hidden': 6
}


influences_reverse = {
    take_1: ['Duke', 'Captain', 'Assassin', 'Contessa', 'Ambassador'],
    coup: ['Duke', 'Captain', 'Assassin', 'Contessa', 'Ambassador'],
    take_2: ['Captain', 'Assassin', 'Contessa', 'Ambassador'],
    take_3: ['Duke'],
    steal_2: ['Captain'],
    assassinate: ['Assassin'],
    exchange: ['Ambassador'],
    block_take_2: ['Duke'],
    block_steal: ['Captain','Ambassador'],
    block_assassination: ['Contessa']
}

In [20]:
import torch
import torch.nn as nn
import torch.optim as optim

class QNetwork(nn.Module):
    def __init__(self, state_size, action_size, hidden_size=128):
        super(QNetwork, self).__init__()
        self.fc1 = nn.Linear(state_size, hidden_size)
        self.relu = nn.ReLU()
        self.rnn = nn.RNN(hidden_size, hidden_size, batch_first = True)
        self.fc3 = nn.Linear(hidden_size, 64)
        self.fc4 = nn.Linear(64, action_size)
        self.dropout = nn.Dropout(0.2)

    def forward(self, state):
        x = self.fc1(state)
        x = self.relu(x)
        x = self.dropout(x)
        x = x.unsqueeze(1)
        out, _ = self.rnn(x)
        x = out.squeeze(1)
        x = out[:, -1, :]
        x = self.fc3(x)
        x = self.relu(x)
        return self.fc4(x)

embedding_cards = nn.Embedding(6, 1)
cards_tens = torch.tensor([0,1,2,3,4,5])
cards_emb = embedding_cards(cards_tens)

embedding_actions = nn.Embedding(16, 3)
actions_tens = torch.tensor([ [0,0,0,0], [1,1,1,1], [2,2,2,2], [3,3,3,3], [4,4,4,4], [5,5,5,5], [6,6,6,6], [7,7,7,7] ])
actions_emb = embedding_actions(actions_tens)

embedding_coins = nn.Embedding(13, 2)
coins_tens = torch.tensor([0,1,2,3,4,5,6,7,8,9,10,11,12])
coins_emb = embedding_coins(coins_tens)

embedding_players = nn.Embedding(5, 3)
players_tens = torch.tensor([0,1,2,3,4])
players_emb = embedding_players(players_tens)

state_size = 12
state_size_card = 16
action_size = 16
block_size = 2
challenge_size = 2
card_size = 2

criterion = nn.HuberLoss(delta=1.0)
learning_rate = 0.001
gamma = 0.99
epsilon = 1.0
epsilon_decay = 0.005
min_epsilon = 0.01
# batch_size = 64
# replay_buffer_size = 10000

In [21]:
class StateSummarizer(nn.Module):
    def __init__(self, input_size, hidden_size, embedding_size):
        super(StateSummarizer, self).__init__()
        self.hidden_size = hidden_size
        self.lstm = nn.LSTM(input_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, embedding_size)

    def forward(self, next_states):
        # Initialize hidden state and cell state
        h0 = torch.zeros(1, next_states.shape[0], self.hidden_size).to(next_states.device)
        c0 = torch.zeros(1, next_states.shape[0], self.hidden_size).to(next_states.device)

        # Pass the sequence of next states through the LSTM
        out, _ = self.lstm(next_states, (h0, c0))

        # print(f'out.shape: {out.shape}')

        # Take the last hidden state as the summary
        summary = torch.mean(out, dim=1)  # Average across the sequence dimension (dim=1)

        # Project the summary to the desired embedding size
        embedding = self.fc(summary)
        return embedding

summarizer = StateSummarizer(12, 64, 12)

In [22]:
class Bot:
    def __init__(self, cards, num_coins, hostility, name, action_q, block_q, challenge_q, card_q,
                 optimizer_action, optimizer_block, optimizer_challenge, optimizer_card):
        self.cards = cards
        self.num_coins = num_coins
        self.hostility = hostility
        self.name = name
        self.action_q = action_q
        self.block_q = block_q
        self.challenge_q = challenge_q
        self.card_q = card_q
        self.optimizer_action = optimizer_action
        self.optimizer_block = optimizer_block
        self.optimizer_challenge = optimizer_challenge
        self.optimizer_card = optimizer_card

    def num_coins_adj(self, n):
        self.num_coins += n

    def cards_adj(self, card):
        self.cards.remove(card)


In [23]:
bag = ['Duke', 'Captain', 'Assassin', 'Contessa', 'Ambassador'] * 3
random.shuffle(bag)

bots = []
bluff_degree = 0

for i in range(4):
    cards = random.sample(bag, 2)
    for card in cards:
        bag.remove(card)
#     kb = []

    action_q = QNetwork(state_size, action_size)
    optimizer_action = optim.Adam(action_q.parameters(), lr=learning_rate)

    block_q = QNetwork(state_size, block_size)
    optimizer_block = optim.Adam(block_q.parameters(), lr=learning_rate)

    challenge_q = QNetwork(state_size, challenge_size)
    optimizer_challenge = optim.Adam(challenge_q.parameters(), lr=learning_rate)

    card_q = QNetwork(state_size, card_size)
    optimizer_card = optim.Adam(card_q.parameters(), lr=learning_rate)

    bots.append(Bot(cards, 2, None, f'{i}', action_q, block_q, challenge_q, card_q,
                    optimizer_action, optimizer_block, optimizer_challenge, optimizer_card))




for bot in bots:
    print(bot.cards)
print(bots[0].name)

['Captain', 'Assassin']
['Ambassador', 'Assassin']
['Duke', 'Contessa']
['Captain', 'Contessa']
0


In [24]:
replay_buffer = []

In [25]:
def get_legal_actions(bot, bots):
  """Returns a list of legal action indices for the given bot."""
  legal_actions = []

  if bot.num_coins >= 10:
    return [1]

  # Always legal actions
  legal_actions.extend([0, 1, 6])  # Income, Coup, Exchange are always legal

  # Conditional actions
  if bot.num_coins >= 3:
    legal_actions.append(3)  # Foreign Aid
  if bot.num_coins >= 7:
    legal_actions.append(1)  # Coup
  if bot.num_coins >= 3:
    legal_actions.append(5)  # Assassinate (if enough coins)

  # Actions that target other players
  for other_bot in bots:
    if other_bot != bot and other_bot.num_coins > 0 : #Can't steal from players with no coins
      legal_actions.append(4) #Steal
      break  # Only need to add steal once if there's a valid target

  return legal_actions

In [26]:
def action_selection(i, bots, actions_vector, actions, epsilon, state):

    legal_actions = get_legal_actions(bots[i], bots)  # Get list of legal actions

    if bots[i].name == 0:
      if random.random() >= epsilon:
        q_values = bots[i].action_q(state)
        max_q_value = float('-inf')
        best_action = None
        col_index = None  # Initialize col_index

        # Iterate through Q-values and find the maximum for legal actions
        for action_idx, q_value in enumerate(q_values[0]):  # Iterate with index
            if action_idx in legal_actions and q_value.item() > max_q_value:
                max_q_value = q_value.item()
                best_action = action_idx
                # col_index is now the index of best_action in legal_actions
                col_index = legal_actions.index(best_action)

        if best_action is None:
            # Handle the case where no legal actions have Q-values
            # This could happen if all legal actions have -inf Q-values
            # You might want to choose a random legal action here
            best_action = random.choice(legal_actions)
            col_index = legal_actions.index(best_action)

        return [best_action, col_index]

      else:
        # Random action selection:
        action = None
        if bots[i].num_coins >= 10:
            action = 1  # Coup
        else:
            # Choose a random action from the legal actions
            action = random.choice(legal_actions)

        target = None
        if (actions[action].p2_net_coins != 0 or actions[action].p2_net_cards != 0) and actions[action].response_action != 'challenge':
            targets = bots[:i] + bots[i+1:]
            valid_targets = [bots.index(bot) for bot in targets if bot.num_coins >= -actions[action].p2_net_coins]
            if valid_targets:
                target = random.choice(valid_targets)

        return [action, target]

    else:

      target = None
      action = None
      targets = bots[:i] + bots[i+1:]

      # Play truthfully:
      # bot = bots[i]
      if bots[i].num_coins >= 10:  # Coup if possible
          action = 1  # Coup action index
          target = random.choice(bots[:i] + bots[i+1:])  # Choose a random target
      else:
          # Prioritize actions based on cards and coins:
          if 'Duke' in bots[i].cards and bots[i].num_coins < 10:  # Take 3 coins if Duke
              action = 3  # Take 3 coins action index
          elif 'Captain' in bots[i].cards and 4 in legal_actions:
              action = 4  # Steal action index
              valid_targets = [bots.index(bot) for bot in targets if bot.num_coins >= -actions[action].p2_net_coins]
              if valid_targets:
                  target = random.choice(valid_targets)
              # target = bots.index(random.choice([other_bot for other_bot in bots if other_bot != bots[i] and other_bot.num_coins > 0]))
          elif 'Assassin' in bot.cards and 5 in legal_actions:
              action = 5  # Assassinate action index
              target = random.choice(bots[:i] + bots[i+1:])  # Choose a random target
          elif 'Ambassador' in bots[i].cards:  # Exchange if Ambassador
              action = 6  # Exchange action index
          else:  # Otherwise, take income
              action = 0  # Take 1 coin action index
          if bots[i].num_coins >= 7:
            action = 1 # Coup if previous action was not legal and can coup
          elif bots[i].num_coins >= 3:
            action = 3 # foreign aid if previous action was not legal and can foreign aid
          # else:
          #   action = 0 # income if previous action was not legal and can only income

      # If action requires a target and none is selected yet:
      if (actions[action].p2_net_coins != 0 or actions[action].p2_net_cards != 0) and actions[action].response_action != 'challenge' and target is None:
          target = bots.index(random.choice(bots[:i] + bots[i+1:]))

      return [action, target]




def reaction_selection(i, bots, target, response_action, epsilon, state):

  if target is None:

    target = random.choice(bots[:i] + bots[i+1:])

  if random.random() >= epsilon and target.name == 0:

    state_tensor = torch.FloatTensor(state).unsqueeze(0)
    q_values = target.block_q(state_tensor)
    return torch.argmax(q_values).item(), target

  elif target.name != 0:

    # Play truthfully - block only if has the card
    if response_action.name == actions_map[8]:  # Block foreign aid
        if 'Duke' in target.cards:
            return 1, target  # Block
        else:
            return 0, target  # Pass
    elif response_action.name == actions_map[9]:  # Block stealing
        if 'Captain' in target.cards or 'Ambassador' in target.cards:
            return 1, target  # Block
        else:
            return 0, target  # Pass
    elif response_action.name == actions_map[10]:  # Block assassination
        if 'Contessa' in target.cards:
            return 1, target  # Block
        else:
            return 0, target  # Pass
    else:  # Other actions (cannot be blocked truthfully)
        return 0, target  # Pass

  else:

    return random.choice([0, 1]), target


def challenge_selection(epsilon, state, bot):

  if random.random() >= epsilon and bot.name == 0:

    return 0

    # state_tensor = torch.FloatTensor(state).unsqueeze(0)
    # q_values = bot.challenge_q(state_tensor)
    # return torch.argmax(q_values).item()

  else:

    return random.choice([0, 1])


def card_selection(bot, cards, epsilon, state):

  if random.random() >= epsilon and len(cards) > 1 and bot.name == 0:

    state_tensor = torch.FloatTensor(state).unsqueeze(0)
    q_values = bot.card_q(state_tensor)
    card_index = torch.argmax(q_values).item()  # Get index (0 or 1)
    return card_index  # Return the index directly

  else:

    c = random.choice(cards)

    c = cards.index(c)

    return c

In [27]:
def perform_action(bot, target, action, discard_pile, state, card_chosen, epsilon, bag):

    if target is not None:

        target.num_coins += action.p2_net_coins

        if action.p2_net_cards < 0 and len(target.cards) > 0:

            card = card_selection(target, target.cards, epsilon, state)
            # print(card)

            x = target.cards[card]

            discard_pile.append(inf_map[x])

            card_chosen = card

            target.cards.remove(x)

    bot.num_coins += action.p1_net_coins

    if action == exchange:

        card = card_selection(bot, bot.cards, epsilon, state)

        x = bot.cards[card]

        c = random.sample(bag, 2)

        # arr = [x] + c

        next_choice = card_selection(bot, c, epsilon, state)

        next_choice = c[next_choice]

        arr = [x] + [next_choice]

        final_choice = card_selection(bot, arr, epsilon, state)

        card_chosen = final_choice

        final_choice = arr[final_choice]

        arr.remove(final_choice)

        for i in arr:
            bag.insert(-1, i)

        random.shuffle(bag)
        bot.cards.insert(-1, final_choice)
        bot.cards.remove(x)

    return bot, target, discard_pile, card_chosen, bag




In [28]:
def reset_game(bots_copy):
  bag = ['Duke', 'Captain', 'Assassin', 'Contessa', 'Ambassador'] * 3
  random.shuffle(bag)
  new_bots = []  # Create a new list
  for i, bot in enumerate(bots_copy):
      cards = random.sample(bag, 2)
      for card in cards:
          bag.remove(card)
      new_bots.append(Bot(cards, 2, None, f'{i}', bot.action_q, bot.block_q, bot.challenge_q, bot.card_q,
                          bot.optimizer_action, bot.optimizer_block, bot.optimizer_challenge, bot.optimizer_card))
  return new_bots  # Return the new list

In [29]:
# Base Game Loop

def game_loop_random(bots, actions, influences_reverse, epsilon):
    bag = ['Duke', 'Captain', 'Assassin', 'Contessa', 'Ambassador'] * 3
    random.shuffle(bag)

    bots_copy = copy.deepcopy(bots)

    states = torch.empty((0, 26), dtype=torch.float32)

    discard_piles = []
    discard_piles.append([])
    acting_players = []
    reacting_players = []
    current_players = [[1,1,1,1]]
    actions_game = [7]
    reactions_game = []
    challenges_game = []
    challenges_direction = []
    cards_game = []
    coins_game = []

    rewards = [0]

    action_history = [7]
    reaction_history = [0]
    challenge_history = [0]
    card_history = [0]

    cards_turn = [[inf_map[c] for c in bots[0].cards],
                  [6,6],
                  [6,6],
                  [6,6]]
    # for i in range(3):

    #     cards_ind = [inf_map[c] for c in bot.cards]
    #     cards_turn.append(cards_ind)

    cards_game.append(cards_turn)

    coins_turn = [2,2,2,2]
    coins_game.append(coins_turn)
    done = []
    cards_chosen = [0]

#     print(cards_game[-1])
#     print(coins_game[-1])

    while len(bots) > 1:


#         for bot in bots:
#             print(f'{bot.name}')
        i = 0
        while i < len(bots):

            # for bot in bots:
            #   print(bot.cards)

            card_chosen = 0
#             print(i)

            if len(bots) == 1:
                # done.append(1)
                break


            done.append(0)
            rewards.append(0)

            challenge_dir = 2

            discard_pile = copy.deepcopy(discard_piles[-1])

            curr = None
            try:
                curr = bots[i]
            except:
                i = 0
                curr = bots[i]

            acting_players.append(int(curr.name))

            # cards_state = cards_game[-1]
            # coins_state = coins_game[-1]
            # current_players_state = current_players[-1]

            # cards_game: N x 4 x 2 -> 8 tensors of size N
            cards_game_tensors = [
                torch.tensor([cards_game[i][j][k] for i in range(len(cards_game))] )
                for j in range(4) for k in range(2)
            ]

            # current_players: N x 4 -> 1 tensor of size N
            current_players_tensors = torch.tensor([sum(row) for row in current_players])

            coins_game_tensors = [
                      torch.tensor([coins_game[i][j] for i in range(len(coins_game))] )
                      for j in range(4)
            ]

            # discard_piles: N x y (max y = 7) -> 7 tensors of size N
            max_discard_len = 7  # Maximum possible length of discard_piles
            discard_piles_tensors = [
                torch.tensor([discard_piles[i][j] if j < len(discard_piles[i]) else 0
                              for i in range(len(discard_piles))] )
                for j in range(max_discard_len)
            ]

            # Concatenate tensors for states_action
            # states_action = torch.cat(([
            #     torch.tensor(acting_players[1:]).unsqueeze(1), # unsqueeze to add a dimension
            #     torch.tensor(reacting_players).unsqueeze(1),
            #     torch.tensor(reactions_game).unsqueeze(1),
            #     torch.tensor(challenges_game).unsqueeze(1),
            #     torch.tensor(current_players_tensors).unsqueeze(1),
            #     *[t.unsqueeze(1) for t in cards_game_tensors],
            #     *[t.unsqueeze(1) for t in discard_piles_tensors],
            #     *[t.unsqueeze(1) for t in coins_game_tensors],
            #     torch.tensor(done).unsqueeze(1)
            # ]), 1)  # changed dim to 1

            state = None

            if bots[0].name == 0:

              # 1. Cards in play (embedded):
              cards_in_play_embedded = []
              for card_name in influences.keys():
                  num_in_discard = discard_piles[-1].count(inf_map[card_name])
                  num_in_play = 3 - num_in_discard  # Assuming 3 of each card initially
                  cards_in_play_embedded.append(embedding_cards(torch.tensor(num_in_play)))  # Keep as tensor

              # Stack the embeddings to create a 2D tensor
              cards_in_play_embedded = torch.stack(cards_in_play_embedded)

              # 4. Bot 0's normalized coins:
              bot0_coins_normalized = bots[0].num_coins / 12 # Normalize to 0-1 range (assuming max coins is 12)

              # 5. Average cards of other players (normalized and embedded):
              other_bots_cards = [len(bot.cards) for bot in bots if bot != bots[0]]
              avg_other_cards = sum(other_bots_cards) / len(other_bots_cards) if other_bots_cards else 0  # Avoid division by zero
              avg_other_cards_normalized = avg_other_cards / 2  # Normalize to 0-1 range (assuming max cards per bot is 2)
              # avg_other_cards_embedded = embedding_cards(torch.tensor(int(avg_other_cards_normalized))).tolist()  # Assuming embedding_cards is your embedding layer

              # 6. Bot 0's current cards (embedded):
              bot0_cards_embedded = []
              for card in bots[0].cards:
                  bot0_cards_embedded.append(embedding_cards(torch.tensor(inf_map[card])))  # Keep as tensor

              # If the bot has cards, concatenate the embeddings. Otherwise, create a zero tensor
              if bot0_cards_embedded:
                  bot0_cards_embedded = torch.cat(bot0_cards_embedded)
              else:
                  # Create a zero tensor with the expected shape if bot has no cards
                  bot0_cards_embedded = torch.zeros(embedding_cards.embedding_dim * 2)  # Assuming 2 cards max

              # 7. The last action taken (embedded):
              last_action = actions_game[-1]
              last_action_embedded = embedding_actions(torch.tensor(last_action))  # Keep as a tensor

              state = torch.cat(([cards_in_play_embedded.unsqueeze(-1),
                                  torch.tensor([bot0_coins_normalized]).unsqueeze(-1).unsqueeze(-1),
                                  torch.tensor([avg_other_cards_normalized]).unsqueeze(-1).unsqueeze(-1),
                                  bot0_cards_embedded.unsqueeze(-1),
                                  last_action_embedded.unsqueeze(-1)]),
                                1).type(torch.float32)

              states = torch.cat((states, state.unsqueeze(0)), 0)








            action_stack = []

            action_vector = [0,1,2,3,4,5,6]
            for j in action_vector:
                if actions[j].p1_net_coins * (-1) > bots[i].num_coins:
                    action_vector.remove(j)

            # state = None
            # state_tensor = None
            # if bots[0].name == 0:
            #   state = get_state(bots, discard_pile, action_history, reaction_history, challenge_history, card_history, bots[i], network_type="action")
            #   state_tensor = torch.FloatTensor(state).unsqueeze(0)
            #   print(len(state_tensor[0]))

            action_selection_output = action_selection(i, bots, action_vector, actions, epsilon, state)

            action = action_selection_output[0]

            actions_game.append(action)

            action_e = actions_emb[action]

            action = actions[action]
    #         print(action_selection_output[1])

            # print(f'bot {bots[i].name} is performing action {action.name}')
            # print(f'target is {action_selection_output[1]}')

            target = None
            reacting_player = 4
            challenge = 0
            reaction = 0
            try:
                target = next((bot for bot in bots if bot.name == action_selection_output[1]), None)
                reacting_player = int(target.name)
            except:
                target = None
#                 reacting_players.append(4)
                # print("no target")
            # if target is not None:
            #     print(f'target is {target.name}')

            action_stack.append(action)

            if action.response_action is not None and target is None:
              target = random.choice(bots)
              reacting_player = int(target.name)
            reacting_players.append(reacting_player)

            # state_reaction = copy.deepcopy(state_action)
            # state_reaction.append(action)

            # state_challenge = copy.deepcopy(state_action)
            # state_challenge.append(action)

            # state_card = copy.deepcopy(state_action)
            # state_card.append(action)

            if action.response_action is not None and action.response_action != 'challenge':  # is blockable?

                response, target = reaction_selection(i, bots, target, action.response_action, epsilon, state)

                # reacting_player = int(target.name)

#                 try:
#                     print(f'bot {target.name} is considering blocking')
#                 except:
#                     print("no target, check reaction selection")

                if response == 1:

                    reaction = 1

                    reactions_game.append(1)

#                     reacting_players.append(int(target.name))

                    action_stack.append(action.response_action)
                    # state_challenge.append(reaction)
                    # state_card.append(reaction)

                    # print(f'bot {target.name} is performing action {action.response_action.name} against bot {bots[i].name}')

                else:

                    reactions_game.append(0)

            else:

                reactions_game.append(0)

#                     print(f'target will not block')

            if action_stack[-1].response_action == 'challenge':  # is challengeable?

                response = challenge_selection(epsilon, state, target if len(action_stack) == 3 else bots[i])

                if response == 1:

                    challenge = 1

                    challenges_game.append(1)

                    action_stack.append('challenge')

                    if len(action_stack) == 3:
                        challenge_dir = 1

                    else:
                        challenge_dir = 0
                    challenges_direction.append(challenge_dir)

                else:

                    challenges_game.append(0)
                    challenges_direction.append(challenge_dir)

#                     print('no challenge')

            else:

                challenges_game.append(0)
                challenges_direction.append(2)

            # challenges_game.append(0)
            # challenges_direction.append(2)

            while len(action_stack) != 0:

                # state_card.append(challenge)
                # state_card.append(challenge_dir)

                a = action_stack.pop()
                # if a != 'challenge':
                #   print(a.name)

                if bots[0].name != 0:
                  rewards[-1] -= 1.0
                # else:
                #   if acting_player == 0:
                #     if action.name in ['take 1 coin', 'take 2 coins', 'steal 2 coins', 'take 3 coins']:
                #         if challenge == 0 and (action.name != 'steal 2 coins' or reaction == 0):  # Action successful
                #             rewards[-1] += 0.1 * action.num_coins # Small reward for increasing coins
                #     elif action.name == 'assassinate':
                #         if challenge == 0 and reaction == 0:  # Action successful
                #             rewards[-1] += 0.6  # Larger reward for eliminating an opponent
                #     if challenge == 1:
                #       if a == 'challenge':
                #           if len(action_stack) > 1:
                #               if influences_reverse[action_stack[-1]] not in target.cards:
                #                   rewards[-1] += 0.5  # Penalty for losing a challenge

                #               else:
                #                   rewards[-1] -= 0.5  # Penalty for losing a challenge

                #           else:
                #               if influences_reverse[action_stack[-1]] not in bots[i].cards:
                #                   rewards[-1] -= 0.5  # Penalty for losing a challenge

                #               else:
                #                   rewards[-1] += 0.25  # Penalty for losing a challenge
                #   elif reacting_player == 0:
                #     if action.name in ['take 1 coin', 'take 2 coins', 'steal 2 coins', 'take 3 coins']:
                #         if challenge == 0 and (action.name != 'steal 2 coins' or reaction == 0):  # Action successful
                #             rewards[-1] -= 0.1 * action.num_coins # Small reward for increasing coins
                #     elif action.name == 'assassinate':
                #         if challenge == 0 and reaction == 0:  # Action successful
                #             rewards[-1] -= 0.5  # Larger reward for eliminating an opponent
                #     if challenge == 1:
                #       if a == 'challenge':
                #           if len(action_stack) > 1:
                #               if influences_reverse[action_stack[-1]] not in target.cards:
                #                   rewards[-1] -= 0.5  # Penalty for losing a challenge

                #               else:
                #                   rewards[-1] += 0.5  # Penalty for losing a challenge

                #           else:
                #               if influences_reverse[action_stack[-1]] not in bots[i].cards:
                #                   rewards[-1] += 0.5  # Penalty for losing a challenge

                #               else:
                #                   rewards[-1] += 0.5  # Penalty for losing a challenge

                if a == 'challenge':

                    if len(action_stack) > 1:

                        if influences_reverse[action_stack[-1]] in target.cards:

                            # print(f'bot {bots[i].name} has lost the challenge')

                            card = 0
                            if len(bots[i].cards) > 1:

                              card = card_selection(bots[i], bots[i].cards, epsilon, state)

                            # print(card)
                            x = bots[i].cards[card]

                            discard_pile.append(inf_map[x])

                            card_chosen = card
                            # card = inf_map.get(card)

                            bots[i].cards.remove(x)

                            if len(bots[i].cards) == 0:

                                print (f'bot {bots[i].name} is out!')

                                bots.remove(bots[i])

                                i -= 1

                            target.cards.remove(influences_reverse[action_stack[-1]])
                            bag.insert(influences_reverse[action_stack[-1]])
                            random.shuffle(bag)
                            c = random.sample(bag, 1)
                            bag.remove(c)
                            target.cards.insert(c)

                            action_stack.clear()

                        else:

                            # print(f'bot {target.name} has lost the challenge')

                            card = 0
                            if len(target.cards) > 1:
                              card = card_selection(target, target.cards, epsilon, state)

                            # print(card)
                            x = target.cards[card]

                            discard_pile.append(inf_map[x])

                            card_chosen = card

                            # card = inf_map.get(card)

                            target.cards.remove(x)

                            if len(target.cards) == 0:

                                bots.remove(target)

                                # print (f'bot {target.name} is out!')

                            action_stack.pop()

                    else:

                        if influences_reverse[action_stack[-1]] in bots[i].cards:

                            # print(f'bot {target.name} has lost the challenge')

                            card = 0
                            if len(target.cards) > 1:
                              card = card_selection(target, target.cards, epsilon, state)

                            # print(card)
                            x = target.cards[card]

                            discard_pile.append(inf_map[x])

                            card_chosen = card
                            # card = inf_map.get(card)

                            target.cards.remove(x)

                            if len(target.cards) == 0:

                                # print (f'bot {target.name} is out!')

                                bots.remove(target)

                            bots[i].cards.remove(influences_reverse[action_stack[-1]])
                            bag.insert(influences_reverse[action_stack[-1]])
                            random.shuffle(bag)
                            c = random.sample(bag, 1)
                            bag.remove(c)
                            bots[i].cards.insert(c)

                        else:

                            # print(f'bot {bots[i].name} has lost the challenge')

                            card = 0
                            if len(bots[i].cards) > 1:
                              card = card_selection(bots[i], bots[i].cards, epsilon, state)
                            # print(card)
                            x = bots[i].cards[card]

                            discard_pile.append(inf_map[x])

                            card_chosen = card
                            # card = inf_map.get(card)

                            bots[i].cards.remove(x)

                            if len(bots[i].cards) == 0:

                                # print (f'bot {bots[i].name} is out!')

                                bots.remove(bots[i])

                                i -= 1

                            action_stack.pop()

                else:

                    # print(f'current action: {a.name}')

                    if len(action_stack) == 1:

                        target, curr, discard_pile, card_chosen, bag = perform_action(target, curr, a, discard_pile, state, card_chosen, epsilon, bag)
                        if curr is not None:
                            if len(curr.cards) == 0:
                                if curr in bots:
                                    bots.remove(curr)
                                    # print(f'{curr.name} is out!')
                                    i -= 1

                    else:

                        curr, target, discard_pile, card_chosen, bag = perform_action(curr, target, a, discard_pile, state, card_chosen, epsilon, bag)
                        if target is not None:
                            if len(target.cards) == 0:
                                if target in bots:
                                    bots.remove(target)
                                    # print(f'{target.name} is out!')




#             print(f'bot {curr.name} has {curr.num_coins} coins.')
#             if target is not None:
#                 print(f'bot {target.name} has {target.num_coins} coins.')

#             print(f'bot {curr.name} has {len(curr.cards)} cards.')
#             if target is not None:
#                 print(f'bot {target.name} has {len(target.cards)} cards.')


            curr_players = [0,0,0,0]
            for bot in bots:
#                 print(int(bot.name))
                curr_players[int(bot.name)] = 1
#             print(curr_players)
            current_players.append(curr_players)

            cards_turn = [[0,0],
                          [0,0],
                          [0,0],
                          [0,0]]
            coins_turn = [0,0,0,0]

            # if len(bots[0].cards) == 0:
            #     cards_turn[0] = [0,0]
            # if len(bots[0].cards) == 1:
            #     cards_turn[0].append(0)

            for bot in bots:

                bot_index = int(bot.name)

                cards_ind = []

                cards_turn[bot_index] = [inf_map[c] for c in bot.cards]

                if len(bot.cards) == 0:
                    cards_turn[bot_index] = [0, 0]
                if len(bot.cards) == 1:
                    cards_turn[bot_index].append(0)

                coins_turn[bot_index] = bot.num_coins

            cards_game.append(cards_turn)
            coins_game.append(coins_turn)

            discard_piles.append(discard_pile)

            # reacting_players.append(reacting_player)
            # reactions_game.append(reaction)
            # challenges_game.append(challenge)
            # challenges_direction.append(challenge_dir)
            cards_chosen.append(card_chosen)

            action_history.append(action)
            reaction_history.append(reaction)
            challenge_history.append(challenge)
            card_history.append(card_chosen)

            i += 1
#             print(cards_turn)
#             print(coins_turn)
# #             print(curr_players)
#             print(discard_pile)



    # print(f'bot {bots[0].name} wins!')
    acting_players.append(4)
    reacting_players.append(4)
    reactions_game.append(0)
    challenges_game.append(0)
    challenges_direction.append(2)
    done.append(1)
    # rewards = copy.deepcopy(done)
    if bots[0].name != '0':
      rewards[-1] = -1.0
    else:
      rewards[-1] = 1.0
    # print(rewards[-1])

    # rewards = [[0,0,0,0]]
    # i = 1
    # while i < len(acting_players):
    #   rewards_new = copy.deepcopy(rewards[-1])

    #   punishment_curr_player = (current_players[i][acting_players[i]] - current_players[i-1][acting_players[i]])

    #   punishment_curr_cards = len(cards_game[i][acting_players[i]]) - len(cards_game[i-1][acting_players[i]])

    #   reward_curr_player = 0
    #   punishment_reacting_player = 0
    #   reward_curr_cards = 0
    #   if reacting_players[i] != 4:
    #     reward_curr_cards = len(cards_game[i][reacting_players[i]]) - len(cards_game[i-1][reacting_players[i]])
    #     reward_curr_player = current_players[i-1][reacting_players[i]] - current_players[i][reacting_players[i]]
    #     punishment_reacting_player = -1 * (reward_curr_player + reward_curr_cards)


    #   coins_reward_curr = (coins_game[i][acting_players[i]] - coins_game[i-1][acting_players[i]])
    #   coins_reward_react = 0
    #   if reacting_players[i] != 4:
    #     coins_reward_react = -1 * coins_reward_curr

    #   victory_reward = 50 * done[i]

    #   new_reward_curr = punishment_curr_player + punishment_curr_cards + reward_curr_player + coins_reward_curr
    #   new_reward_react = punishment_reacting_player + coins_reward_react

    #   if victory_reward > 0:
    #     if punishment_curr_player > 0:
    #       new_reward_curr += victory_reward
    #     else:
    #       new_reward_react += victory_reward

    #   rewards_new[acting_players[i]] += new_reward_curr
    #   if reacting_players[i] != 4:
    #     rewards_new[reacting_players[i]] += new_reward_react

    #   rewards.append(rewards_new)

    #   i += 1


    # Reset Game

    bag = ['Duke', 'Captain', 'Assassin', 'Contessa', 'Ambassador'] * 3
    random.shuffle(bag)
    # new_bots = []  # Create a new list

    for bot in bots_copy:
        bot.cards = random.sample(bag, 2)
        for card in bot.cards:
            bag.remove(card)
        bot.num_coins = 2

    bots = bots_copy


    return discard_piles, acting_players, reacting_players, current_players, actions_game, reactions_game, challenges_game, cards_game, coins_game, challenges_direction, done, rewards, cards_chosen, bots_copy


In [30]:
from os import stat
from collections import defaultdict
num_episodes = 1000
max_steps_per_episode = 200
epsilon = 1.0
list_division = 4
gamma = 0.99

bot = copy.deepcopy(bots[0])

avg_losses_action = []
avg_losses_block = []
avg_losses_challenge = []
avg_losses_card = []

# bots.remove(bots[-1])
# bots.remove(bots[-1])

win_rates = []
avg_game_lengths = []

data_fraction = 1/5

batch_size = 32

for episode in range(num_episodes):

  replay_buffer_actions = []
  replay_buffer_blocks = []
  replay_buffer_challenges = []
  replay_buffer_cards = []

  print(f'episode {episode} of 1000')
  print(f'epsilon: {epsilon}')
  print(f'gamma: {gamma}')

  # discard_piles, acting_players, reacting_players, current_players, actions_game, reactions_game, challenges_game, cards_game, coins_game, challenges_direction, done, rewards, cards_chosen, bots_copy = game_loop_random(bots, actions, influences_reverse, epsilon)
  # bots = bots_copy
  state = torch.empty((0, 12), dtype=torch.float32)  # Assume state_size = 25 for action network
  # states_action = torch.empty((0, 24), dtype=torch.float32)  # Assume state_size = 25 for action network
  # next_states_action = torch.empty((0, 24), dtype=torch.float32)
  actions_main = torch.empty((0,), dtype=torch.int64)
  # states_block = torch.empty((0, 23), dtype=torch.float32)  # Assume state_size = 24 for block network
  # next_states_block = torch.empty((0, 23), dtype=torch.float32)
  actions_block = torch.empty((0,), dtype=torch.int64)
  # states_challenge = torch.empty((0, 24), dtype=torch.float32)  # Assume state_size = 25 for challenge network
  # next_states_challenge = torch.empty((0, 24), dtype=torch.float32)
  actions_challenge = torch.empty((0,), dtype=torch.int64)
  # states_card = torch.empty((0, 19), dtype=torch.float32)  # Assume state_size = 20 for card network
  # next_states_card = torch.empty((0, 19), dtype=torch.float32)
  actions_card = torch.empty((0,), dtype=torch.int64)
  rewards = torch.empty((0,), dtype=torch.float32)
  done = torch.empty((0,), dtype=torch.float32)
  game_length_sum = 0
  all_discard_piles = []
  acting_players = []
  reacting_players = []
  reactions_game = []

  num_games = 0


  while len(state) <= 50 * batch_size:

    discard_pile, acting_player, reacting_player, current_player, action_game, reaction_game, challenge_game, card_game, coin_game, challenge_direction, done_0, reward, card_chosen, bots_copy = game_loop_random(bots, actions, influences_reverse, epsilon)

    num_games += 1
    # print(f'Game Number {num_games}')

    # if random.random():
    #   print(reacting_player)

    # start_index = int(3 * len(acting_player) / list_division)

    bots = bots_copy

    game_length_sum += len(acting_player)

    # split_point = int((1 - data_fraction) * len(acting_player))
    acting_players += acting_player
    reacting_players += reacting_player
    current_players = current_player
    actions_game = action_game
    reactions_game += reaction_game
    challenges_game = challenge_game
    cards_game = card_game
    coins_game = coin_game
    challenges_direction = challenge_direction
    cards_chosen = card_chosen
    discard_piles = discard_pile
    all_discard_piles += discard_pile

    avg_game_lengths.append(game_length_sum / 100)

    # cards_game: N x 4 x 2 -> 8 tensors of size N
    cards_game_tensors = [
        torch.tensor([cards_game[i][j][k] for i in range(len(cards_game))] )
        for j in range(4) for k in range(2)
    ]

    # current_players: N x 4 -> 4 tensors of size N
    current_players_tensors = torch.tensor([sum(row) for row in current_players])

    coins_game_tensors = [
              torch.tensor([coins_game[i][j] for i in range(len(coins_game))] )
              for j in range(4)
    ]

    # discard_piles: N x y (max y = 7) -> 7 tensors of size N
    max_discard_len = 7  # Maximum possible length of discard_piles
    discard_piles_tensors = [
        torch.tensor([discard_piles[i][j] if j < len(discard_piles[i]) else 0
                      for i in range(len(discard_piles))] )
        for j in range(max_discard_len)
    ]


    # Concatenate tensors for states_action
    # new_states_action = torch.cat(([
    #     torch.tensor(acting_players[1:]).unsqueeze(1), # unsqueeze to add a dimension
    #     torch.tensor(reacting_players[:-1]).unsqueeze(1),
    #     torch.tensor(reactions_game[:-1]).unsqueeze(1),
    #     torch.tensor(challenges_game[:-1]).unsqueeze(1),
    #     torch.tensor(current_players_tensors[:-1]).unsqueeze(1),
    #     *[t[:-1].unsqueeze(1) for t in cards_game_tensors],
    #     *[t[:-1].unsqueeze(1) for t in discard_piles_tensors],
    #     *[t[:-1].unsqueeze(1) for t in coins_game_tensors]
    # ]), 1).type(torch.float32)  # changed dim to 1
    # states_action = torch.cat([states_action, new_states_action], 0)


    # 1. Cards in play (embedded):
    all_cards_in_play_embedded = []

    for current_discard_pile in discard_pile:
        cards_in_play_embedded = []
        for card_name in influences.keys():
            num_in_discard = current_discard_pile.count(inf_map[card_name])
            num_in_play = 3 - num_in_discard
            cards_in_play_embedded.append(torch.tensor(num_in_play)) # Remove .tolist() here

        all_cards_in_play_embedded.append(torch.stack(cards_in_play_embedded)) # Stack the embedded tensors here

    # Convert to a single tensor outside the loop
    all_cards_in_play_embedded = torch.stack(all_cards_in_play_embedded)

    # 4. Bot 0's normalized coins:
    bot0_coins_normalized = torch.tensor(coin_game)[:, 0] / 12  # Get Bot 0's coins and normalize

    # 5. Average cards of other players (normalized and embedded):
    avg_other_cards_normalized = []
    for step_cards in cards_game:
        other_bots_cards = [len([card for card in bot_cards if card != 0])
                          for bot_cards in step_cards[1:]]  # Exclude Bot 0
        avg_other_cards = sum(other_bots_cards) / len(other_bots_cards) if other_bots_cards else 0
        avg_other_cards_normalized.append(avg_other_cards / 2)

    avg_other_cards_normalized = torch.tensor(avg_other_cards_normalized)
    # avg_other_cards_embedded = embedding_cards(torch.tensor(int(avg_other_cards_normalized))).tolist()  # Assuming embedding_cards is your embedding layer

    # 6. Bot 0's current cards (embedded):
    all_bot0_cards_embedded = []  # Store embedded cards for all steps

    for step_cards in cards_game:
        bot0_cards_embedded = []
        for card in step_cards[0]:  # Get Bot 0's cards for this step
            if card != 0:  # Assuming 0 represents the absence of a card
                bot0_cards_embedded.extend(embedding_cards(torch.tensor(card)).tolist())

        # If Bot 0 has no cards, add zero embeddings for consistency
        while len(bot0_cards_embedded) < embedding_cards.embedding_dim * 2:  # Assuming 2 cards max
            bot0_cards_embedded.extend([0] * embedding_cards.embedding_dim)

        all_bot0_cards_embedded.append(torch.tensor(bot0_cards_embedded))  # Convert to tensor and store

    all_bot0_cards_embedded = torch.stack(all_bot0_cards_embedded) # Stack to create a 2D tensor

    # 7. The last action taken (embedded):
    all_last_action_embedded = []  # Store embedded last actions for all steps

    for i in range(len(actions_game)):
        last_action = actions_game[i]  # Get the action for the current step
        last_action_embedded = embedding_actions(torch.tensor(last_action)).tolist()
        all_last_action_embedded.append(last_action_embedded)

    all_last_action_embedded = torch.tensor(all_last_action_embedded)  # Convert to a tensor

    new_state = torch.cat(([all_cards_in_play_embedded.unsqueeze(-1),
                        bot0_coins_normalized.unsqueeze(-1).unsqueeze(-1),
                        avg_other_cards_normalized.unsqueeze(-1).unsqueeze(-1),
                        all_bot0_cards_embedded.unsqueeze(-1),
                        all_last_action_embedded.unsqueeze(-1)]),
                      1).squeeze(2)
    state = torch.cat([state, new_state], 0)

    # print(states_action)

    # states_action = torch.tensor([acting_players[:-1], reacting_players[:-1], reactions_game[:-1], challenges_game[:-1], current_players[:-1], cards_game[:-1], coins_game[:-1], discard_piles[:-1], done[:-1]])
    # new_next_states_action = torch.cat(([
    #     torch.tensor(acting_players[1:]).unsqueeze(1), # unsqueeze to add a dimension
    #     torch.tensor(reacting_players[1:]).unsqueeze(1),
    #     torch.tensor(reactions_game[1:]).unsqueeze(1),
    #     torch.tensor(challenges_game[1:]).unsqueeze(1),
    #     torch.tensor(current_players_tensors[1:]).unsqueeze(1),
    #     *[t[1:].unsqueeze(1) for t in cards_game_tensors],
    #     *[t[1:].unsqueeze(1) for t in discard_piles_tensors],
    #     *[t[1:].unsqueeze(1) for t in coins_game_tensors]
    # ]), 1).type(torch.float32)  # changed dim to 1
    # next_states_action = torch.cat([next_states_action, new_next_states_action], 0)

    new_actions_main = torch.tensor(actions_game[1:]).type(torch.int64)
    actions_main = torch.cat([actions_main, new_actions_main], 0)


    # new_states_block = torch.cat(([
    #     torch.tensor(acting_players[1:]).unsqueeze(1), # unsqueeze to add a dimension
    #     torch.tensor(reacting_players[1:]).unsqueeze(1),
    #     torch.tensor(actions_game[1:]).unsqueeze(1),
    #     torch.tensor(current_players_tensors[:-1]).unsqueeze(1),
    #     *[t[:-1].unsqueeze(1) for t in cards_game_tensors],
    #     *[t[:-1].unsqueeze(1) for t in discard_piles_tensors],
    #     *[t[:-1].unsqueeze(1) for t in coins_game_tensors]
    # ]), 1).type(torch.float32)  # changed dim to 1
    # states_block = torch.cat([states_block, new_states_block], 0)

    # new_next_states_block = torch.cat(([
    #     torch.tensor(acting_players[1:]).unsqueeze(1), # unsqueeze to add a dimension
    #     torch.tensor(reacting_players[1:]).unsqueeze(1),
    #     torch.tensor(actions_game[1:]).unsqueeze(1),
    #     torch.tensor(current_players_tensors[1:]).unsqueeze(1),
    #     *[t[1:].unsqueeze(1) for t in cards_game_tensors],
    #     *[t[1:].unsqueeze(1) for t in discard_piles_tensors],
    #     *[t[1:].unsqueeze(1) for t in coins_game_tensors]
    # ]), 1).type(torch.float32)  # changed dim to 1
    # next_states_block = torch.cat([next_states_block, new_next_states_block], 0)

    new_actions_block = torch.tensor(reactions_game[1:]).type(torch.int64)
    actions_block = torch.cat([actions_block, new_actions_block], 0)


    # new_states_challenge = torch.cat(([
    #     torch.tensor(acting_players[1:]).unsqueeze(1), # unsqueeze to add a dimension
    #     torch.tensor(reacting_players[1:]).unsqueeze(1),
    #     torch.tensor(actions_game[1:]).unsqueeze(1),
    #     torch.tensor(reactions_game[1:]).unsqueeze(1),
    #     torch.tensor(current_players_tensors[:-1]).unsqueeze(1),
    #     *[t[:-1].unsqueeze(1) for t in cards_game_tensors],
    #     *[t[:-1].unsqueeze(1) for t in discard_piles_tensors],
    #     *[t[:-1].unsqueeze(1) for t in coins_game_tensors]
    # ]), 1).type(torch.float32) # changed dim to 1
    # states_challenge = torch.cat([states_challenge, new_states_challenge], 0)

    # new_next_states_challenge = torch.cat(([
    #     torch.tensor(acting_players[1:]).unsqueeze(1), # unsqueeze to add a dimension
    #     torch.tensor(reacting_players[1:]).unsqueeze(1),
    #     torch.tensor(actions_game[1:]).unsqueeze(1),
    #     torch.tensor(reactions_game[1:]).unsqueeze(1),
    #     torch.tensor(current_players_tensors[1:]).unsqueeze(1),
    #     *[t[1:].unsqueeze(1) for t in cards_game_tensors],
    #     *[t[1:].unsqueeze(1) for t in discard_piles_tensors],
    #     *[t[1:].unsqueeze(1) for t in coins_game_tensors]
    # ]), 1).type(torch.float32)  # changed dim to 1
    # next_states_challenge = torch.cat([next_states_challenge, new_next_states_challenge], 0)

    new_actions_challenge = torch.tensor(challenges_game[1:]).type(torch.int64)
    actions_challenge = torch.cat([actions_challenge, new_actions_challenge], 0)


    # new_states_card = torch.cat(([
    #     *[t[:-1].unsqueeze(1) for t in cards_game_tensors],
    #     *[t[:-1].unsqueeze(1) for t in discard_piles_tensors],
    #     *[t[:-1].unsqueeze(1) for t in coins_game_tensors]
    # ]), 1).type(torch.float32)  # changed dim to 1
    # states_card = torch.cat([states_card, new_states_card], 0)

    # new_next_states_card = torch.cat(([
    #     *[t[1:].unsqueeze(1) for t in cards_game_tensors],
    #     *[t[1:].unsqueeze(1) for t in discard_piles_tensors],
    #     *[t[1:].unsqueeze(1) for t in coins_game_tensors]
    # ]), 1).type(torch.float32)  # changed dim to 1
    # next_states_card = torch.cat([next_states_card, new_next_states_card], 0)

    # actions_card = torch.tensor([acting_players[1:], reacting_players[1:], challenges_game[1:], challenges_direction[1:], cards_game[1:]])
    new_actions_card = torch.tensor(cards_chosen[1:]).type(torch.int64)
    actions_card = torch.cat([actions_card, new_actions_card], 0)

    new_rewards = torch.tensor(reward).type(torch.float32)
    rewards = torch.cat([rewards, new_rewards], 0)

    new_done = torch.tensor(done_0).type(torch.float32)
    done = torch.cat([done, new_done], 0)

  print(f'Number of games in episode {episode}: {num_games}')

  states_action = torch.empty((0, 12), dtype=torch.float32)
  next_states_action = torch.empty((0, 12), dtype=torch.float32)
  states_block = torch.empty((0, 12), dtype=torch.float32)
  next_states_block = torch.empty((0, 12), dtype=torch.float32)
  states_challenge = torch.empty((0, 12), dtype=torch.float32)
  next_states_challenge = torch.empty((0, 12), dtype=torch.float32)
  states_card = torch.empty((0, 12), dtype=torch.float32)
  next_states_card = torch.empty((0, 12), dtype=torch.float32)

  # Assuming you have a list called 'all_states' that contains all the states
  # generated using the 'new_state' calculation you provided
  # and 'acting_players' list that has acting players per state,
  # and 'reacting_players' list for reacting players

  # all_states = []  # Initialize with your existing state generation logic

  # print(state.shape)
  # print(all_bot0_cards_embedded.shape)

  # Create a dictionary to store current states and their corresponding next states
  state_transitions = defaultdict(list)

  for i in range(len(state) - 1):  # Iterate through all states (except the last one)
      # print(next_state[7:9])
      current_state = state[i]
      next_state = state[i + 1]

      # Add the next state to the list of next states for the current state
      state_transitions[tuple(current_state.tolist())].append(next_state)  # Convert to tuple for dictionary key

  state_indices = {}

  for i, state_tensor in enumerate(state):
      state_indices[tuple(state_tensor.tolist())] = i

  for i in range(len(state) - 1):  # Iterate through all states (except the last one)

      current_state = state[i]
      next_state = state[i + 1]

      # --- Action Network ---
      if acting_players[i] == 0:  # Check acting player for current state
          states_action = torch.cat([states_action, current_state.unsqueeze(0)], 0)
      else:
          next_states_action = torch.cat([next_states_action, next_state.unsqueeze(0)], 0)

      # --- Reaction Network & Challenge Network ---
      if reacting_players[i] == 0:  # Bot 0 is the reacting player
          states_block = torch.cat([states_block, current_state.unsqueeze(0)], 0)
      else:
          next_states_block = torch.cat([next_states_block, next_state.unsqueeze(0)], 0)

      # --- Challenge Network ---
      if reacting_players[i] == 0:
          states_challenge = torch.cat([states_challenge, current_state.unsqueeze(0)], 0)
      elif acting_players[i] == 0 and reactions_game[i] == 1: # Check acting player for current state
          states_challenge = torch.cat([states_challenge, current_state.unsqueeze(0)], 0)
      else:
          next_states_challenge = torch.cat([next_states_challenge, next_state.unsqueeze(0)], 0)

      # --- Card Network ---
      # Assuming 'all_discard_piles' contains discard piles for each state
      # and 'all_bot0_cards_embedded' contains Bot 0's cards for each state

      current_discard_pile_size = len(all_discard_piles[i])
      next_discard_pile_size = len(all_discard_piles[i + 1])

      # Check if Bot 0 lost a card in the transition
      bot0_lost_card = (next_discard_pile_size > current_discard_pile_size) and \
                        any(torch.equal(card, torch.tensor(0)) for card in next_state[7:9])

      if bot0_lost_card:
          states_card = torch.cat([states_card, current_state.unsqueeze(0)], 0)
      else:
          next_states_card = torch.cat([next_states_card, next_state.unsqueeze(0)], 0)


  # print(state.shape)

  # print(states_action.shape)
  # print(next_states_action.shape)
  # print(len(states_action) + len(next_states_action))

  # print(states_block.shape)
  # print(next_states_block.shape)
  # print(len(states_block) + len(next_states_block))

  # print(states_challenge.shape)
  # print(next_states_challenge.shape)
  # print(len(states_challenge) + len(next_states_challenge))

  # print(states_card.shape)
  # print(next_states_card.shape)
  # print(len(states_card) + len(next_states_card))

  # acting_players_      = actions_card[0]  # shape [N]
  # reacting_players_    = actions_card[1]  # shape [N]
  # challenges_direction = actions_card[3]  # shape [N]
  # cards_game_          = actions_card[4]  # shape [N, M]
  # chosen_player_idx = torch.where(challenges_direction == 0,
  #                                 reacting_players_.long(),
  #                                 cards_game_.long())
  # row_indices = torch.arange(cards_game.shape[0])
  # actions_card = cards_game_[row_indices, chosen_player_idx]
  # actions_card = actions_card.unsqueeze(0)

  # Convert states_action and next_states_action to float
  # states_action = states_action.type(torch.float32)
  # next_states_action = next_states_action.type(torch.float32)

  # # Convert states_block and next_states_block to float
  # states_block = states_block.type(torch.float32)
  # next_states_block = next_states_block.type(torch.float32)

  # # Convert states_challenge and next_states_challenge to float
  # states_challenge = states_challenge.type(torch.float32)
  # next_states_challenge = next_states_challenge.type(torch.float32)

  # # Convert states_card and next_states_card to float
  # states_card = states_card.type(torch.float32)
  # next_states_card = next_states_card.type(torch.float32)

  # # Convert actions_card to float (if necessary)
  # actions_card = actions_card.type(torch.int64)

  # print(len(next_states_action))

  # bot = copy.deepcopy(bots[0])

  # Recalculate the actual state size
  # state_size_card = states_card.shape[1]

  # Update the input layer of bot.card_q to match the actual state size
  # bot.card_q.fc1 = nn.Linear(state_size_card, 64)



  # print(len(states_action[0]))

  # batch_size = 128
  # i = 0


  losses_action = []
  losses_block = []
  losses_challenge = []
  losses_card = []


  num_batches_action = len(states_action) // batch_size  # Calculate number of batches
  for i in range(num_batches_action):  # Loop through desired number of batches
    # Randomly sample batch indices
    batch_indices_action = random.sample(range(len(states_action)), min(batch_size, len(states_action)))

    batch_indices_challenge = random.sample(range(len(states_challenge)), min(batch_size, len(states_challenge)))
    batch_indices_card = random.sample(range(len(states_card)), min(batch_size, len(states_card)))



    # Create batch for each replay buffer using sampled indices
    batch_states_action = torch.stack([states_action[j] for j in batch_indices_action])
    batch_actions_main = actions_main[batch_indices_action]  # Index to get correct shape

    batch_next_states_action = torch.empty((0, 12), dtype=torch.float32)

    batch_rewards_action = []
    batch_done_action = []

    for state_tensor in batch_states_action:

        indices = torch.where((state == state_tensor).all(dim=1))[0]

        # Handle cases where the state is not found (e.g., terminal state)
        if len(indices) == 0:
            batch_next_states_action = torch.cat([batch_next_states_action, state_tensor.unsqueeze(0)], 0)
            batch_rewards_action.append(torch.tensor(0.0))  # Default reward for terminal state
            batch_done_action.append(torch.tensor(1))
            continue

        state_index = indices[0].item()  # Get the index as an integer


        # Extract next states until Bot 0's next turn
        next_states = torch.empty((0, 12), dtype=torch.float32)
        current_index = state_index + 1
        rewards_for_state = []
        while current_index < len(state) and acting_players[current_index] != 0 and done[current_index] != 0:  # Assuming acting_player is a list of acting player IDs
            next_states = torch.cat([next_states, state[current_index].unsqueeze(0)], 0)
            rewards_for_state.append(rewards[current_index])
            current_index += 1

        if not next_states.numel():
            batch_next_states_action = torch.cat([batch_next_states_action, state_tensor.unsqueeze(0)], 0)
            batch_rewards_action.append(rewards[current_index])
            batch_done_action.append(torch.tensor(1))
            continue

        # print(next_states.shape)

        # Summarize the next states using the summarizer
        next_states = next_states.view(1, next_states.shape[0], 12)
        next_state_tensor = summarizer(next_states)
        # print(next_state_tensor.shape)
        batch_next_states_action = torch.cat([batch_next_states_action, next_state_tensor], 0)
        batch_rewards_action.append(torch.tensor(sum(rewards_for_state) / len(rewards_for_state) if rewards_for_state else rewards[current_index-1])) # Average reward, handle empty list
        batch_done_action.append(torch.tensor(done[current_index - 1] if current_index >= len(state) else torch.tensor(done[current_index])))


    # Stack the next states into a single tensor
    # batch_next_states_action = torch.stack(batch_next_states_action)

    # print(batch_states_action.shape)
    # print(batch_next_states_action.shape)
    # print(batch_actions_main.shape)

    # batch_rewards_action = rewards[batch_indices_action]
    # batch_done_action = done[batch_indices_action]

    action_q_next = bot.action_q(batch_next_states_action).max(1)[0]
    action_target_q = torch.tensor(batch_rewards_action) + (gamma * action_q_next * (1 - torch.tensor(batch_done_action)))
    q_values_action = bot.action_q(batch_states_action).gather(1, batch_actions_main.unsqueeze(1)).squeeze(1)

    # Compute the loss
    loss_action = criterion(q_values_action, action_target_q)

    # Optimize the Q-network
    bot.optimizer_action.zero_grad()
    loss_action.backward()
    bot.optimizer_action.step()

    states_action = np.delete(states_action.cpu().numpy(), batch_indices_action, axis=0)
    actions_main = np.delete(actions_main.cpu().numpy(), batch_indices_action, axis=0)  # Adjust axis for actions_main
    next_states_action = np.delete(next_states_action.cpu().numpy(), batch_indices_action, axis=0)

    states_action = torch.tensor(states_action, dtype=torch.float32)
    actions_main = torch.tensor(actions_main, dtype=torch.int64)
    next_states_action = torch.tensor(next_states_action, dtype=torch.float32)

    losses_action.append(loss_action.item())

    # print('action success')




  num_batches_block = len(states_block) // batch_size  # Calculate number of batches
  for i in range(num_batches_block):
    batch_indices_block = random.sample(range(len(states_block)), min(batch_size, len(states_block)))

    batch_states_block = torch.stack([states_block[j] for j in batch_indices_block])
    batch_actions_block = actions_block[batch_indices_block]

    batch_next_states_block = torch.empty((0, 12), dtype=torch.float32)

    batch_rewards_block = []
    batch_done_block = []

    for state_tensor in batch_states_block:

        indices = torch.where((state == state_tensor).all(dim=1))[0]

        # Handle cases where the state is not found (e.g., terminal state)
        if len(indices) == 0:
            batch_next_states_block = torch.cat([batch_next_states_block, state_tensor.unsqueeze(0)], 0)
            batch_rewards_block.append(torch.tensor(0.0))  # Default reward for terminal state
            batch_done_block.append(torch.tensor(1))
            continue

        state_index = indices[0].item()  # Get the index as an integer


        # Extract next states until Bot 0's next turn
        next_states = torch.empty((0, 12), dtype=torch.float32)
        current_index = state_index + 1
        rewards_for_state = []
        while current_index < len(state) and reacting_players[current_index] != 0 and done[current_index] != 1:  # Assuming acting_player is a list of acting player IDs
            next_states = torch.cat([next_states, state[current_index].unsqueeze(0)], 0)
            rewards_for_state.append(rewards[current_index])
            current_index += 1

        if not next_states.numel():
            batch_next_states_block = torch.cat([batch_next_states_block, state_tensor.unsqueeze(0)], 0)
            batch_rewards_block.append(rewards[current_index])
            batch_done_block.append(torch.tensor(1))
            continue

        # print(next_states.shape)

        # Summarize the next states using the summarizer
        next_states = next_states.view(1, next_states.shape[0], 12)
        next_state_tensor = summarizer(next_states)
        # print(next_state_tensor.shape)
        batch_next_states_block = torch.cat([batch_next_states_block, next_state_tensor], 0)
        batch_rewards_block.append(torch.tensor(sum(rewards_for_state) / len(rewards_for_state) if rewards_for_state else rewards[current_index-1])) # Average reward, handle empty list
        batch_done_block.append(torch.tensor(done[current_index-1]) if current_index >= len(state) else torch.tensor(done[current_index]))

    # batch_rewards_block = rewards[batch_indices_block]
    # batch_done_block = done[batch_indices_block]

    block_q_next = bot.block_q(batch_next_states_block).max(1)[0]
    block_target_q = torch.tensor(batch_rewards_block) + (gamma * block_q_next * (1 - torch.tensor(batch_done_block)))
    q_values_block = bot.block_q(batch_states_block).gather(1, batch_actions_block.unsqueeze(1)).squeeze(1)

    # Compute the loss
    loss_block = criterion(q_values_block, block_target_q)

    # Optimize the Q-network
    bot.optimizer_block.zero_grad()
    loss_block.backward()
    bot.optimizer_block.step()

    states_block = np.delete(states_block.cpu().numpy(), batch_indices_block, axis=0)
    actions_block = np.delete(actions_block.cpu().numpy(), batch_indices_block, axis=0)
    next_states_block = np.delete(next_states_block.cpu().numpy(), batch_indices_block, axis=0)

    states_block = torch.tensor(states_block, dtype=torch.float32)
    actions_block = torch.tensor(actions_block, dtype=torch.int64)
    next_states_block = torch.tensor(next_states_block, dtype=torch.float32)

    losses_block.append(loss_block.item())

    # print('block success')




  num_batches_challenge = len(states_challenge) // batch_size  # Calculate number of batches
  for i in range(num_batches_challenge):
    batch_indices_challenge = random.sample(range(len(states_challenge)), min(batch_size, len(states_challenge)))

    batch_states_challenge = torch.stack([states_challenge[j] for j in batch_indices_challenge])
    batch_actions_challenge = actions_challenge[batch_indices_challenge]
    batch_next_states_challenge = torch.empty((0, 12), dtype=torch.float32)

    batch_rewards_challenge = []
    batch_done_challenge = []

    for state_tensor in batch_states_challenge:

        indices = torch.where((state == state_tensor).all(dim=1))[0]

        # Handle cases where the state is not found (e.g., terminal state)
        if len(indices) == 0:
            batch_next_states_challenge = torch.cat([batch_next_states_challenge, state_tensor.unsqueeze(0)], 0)
            batch_rewards_challenge.append(torch.tensor(0.0))  # Default reward for terminal state
            batch_done_challenge.append(torch.tensor(1))  # Mark terminal state as done
            continue

        state_index = indices[0].item()  # Get the index as an integer


        # Extract next states until Bot 0's next turn
        next_states = torch.empty((0, 12), dtype=torch.float32)
        current_index = state_index + 1
        rewards_for_state = []
        while current_index < len(state) and ((reacting_players[current_index] != 0) and (acting_players[current_index] != 0 or reactions_game[current_index] != 1)) and done[current_index] != 1:  # Assuming acting_player is a list of acting player IDs
            next_states = torch.cat([next_states, state[current_index].unsqueeze(0)], 0)
            rewards_for_state.append(rewards[current_index])
            current_index += 1

        if not next_states.numel():
            batch_next_states_challenge = torch.cat([batch_next_states_challenge, state_tensor.unsqueeze(0)], 0)
            batch_rewards_challenge.append(rewards[current_index])
            batch_done_challenge.append(torch.tensor(1))
            continue

        # print(next_states.shape)

        # Summarize the next states using the summarizer
        next_states = next_states.view(1, next_states.shape[0], 12)
        next_state_tensor = summarizer(next_states)
        # print(next_state_tensor.shape)
        batch_next_states_challenge = torch.cat([batch_next_states_challenge, next_state_tensor], 0)

        batch_rewards_challenge.append(torch.tensor(sum(rewards_for_state) / len(rewards_for_state) if rewards_for_state else rewards[current_index-1])) # Average reward, handle empty list
        batch_done_challenge.append(torch.tensor(done[current_index-1]) if current_index >= len(state) else torch.tensor(done[current_index]))

    # batch_rewards_challenge = rewards[batch_indices_challenge]
    # batch_done_challenge = done[batch_indices_challenge]

    challenge_q_next = bot.challenge_q(batch_next_states_challenge).max(1)[0]
    challenge_target_q = torch.tensor(batch_rewards_challenge) + (gamma * challenge_q_next * (1 - torch.tensor(batch_done_challenge)))
    q_values_challenge = bot.challenge_q(batch_states_challenge).gather(1, batch_actions_challenge.unsqueeze(1)).squeeze(1)

    # Compute the loss
    loss_challenge = criterion(q_values_challenge, challenge_target_q)

    # Optimize the Q-network
    bot.optimizer_challenge.zero_grad()
    loss_challenge.backward()
    bot.optimizer_challenge.step()

    states_challenge = np.delete(states_challenge.cpu().numpy(), batch_indices_challenge, axis=0)
    actions_challenge = np.delete(actions_challenge.cpu().numpy(), batch_indices_challenge, axis=0)
    next_states_challenge = np.delete(next_states_challenge.cpu().numpy(), batch_indices_challenge, axis=0)

    states_challenge = torch.tensor(states_challenge, dtype=torch.float32)
    actions_challenge = torch.tensor(actions_challenge, dtype=torch.int64)
    next_states_challenge = torch.tensor(next_states_challenge, dtype=torch.float32)

    losses_challenge.append(loss_challenge.item())

    # print('challenge success')




  num_batches_card = len(states_card) // batch_size  # Calculate number of batches
  for i in range(num_batches_card):
    batch_indices_card = random.sample(range(len(states_card)), min(batch_size, len(states_card)))

    batch_states_card = torch.stack([states_card[j] for j in batch_indices_card])
    batch_actions_card = actions_card[batch_indices_card]

    batch_next_states_card = torch.empty((0, 12), dtype=torch.float32)
    batch_rewards_card = []
    batch_done_card = []

    for state_tensor in batch_states_card:

        indices = torch.where((state == state_tensor).all(dim=1))[0]

        # Handle cases where the state is not found (e.g., terminal state)
        if len(indices) == 0:
            batch_next_states_card = torch.cat([batch_next_states_card, state_tensor.unsqueeze(0)], 0)
            batch_rewards_card.append(torch.tensor(0.0))  # Default reward for terminal state
            batch_done_card.append(torch.tensor(1))  # Mark terminal state as done
            continue

        state_index = indices[0].item()  # Get the index as an integer


        # Extract next states until Bot 0's next turn
        next_states = torch.empty((0, 12), dtype=torch.float32)
        current_index = state_index + 1
        rewards_for_state = []
        if current_index == len(state):
            batch_next_states_card = torch.cat([batch_next_states_card, state_tensor.unsqueeze(0)], 0)
            rewards_for_state.append(rewards[current_index])  # Collect reward for each next state
            continue

        while current_index < len(state) - 1 and done[current_index] != 1:  # Assuming acting_player is a list of acting player IDs
            current_discard_pile_size = len(all_discard_piles[current_index])
            next_discard_pile_size = len(all_discard_piles[current_index + 1])

            current_bot0_card_length = torch.sum(state[current_index][7:9] == 0).item()
            next_bot0_card_length = torch.sum(state[current_index + 1][7:9] == 0).item()

            # Check if Bot 0 doesn't lose a card in the transition
            bot0_not_lost_card = (next_discard_pile_size == current_discard_pile_size) or \
             (current_bot0_card_length == next_bot0_card_length)

            if bot0_not_lost_card:
                next_states = torch.cat([next_states, state_tensor.unsqueeze(0)], 0)

            rewards_for_state.append(rewards[current_index])  # Collect reward for each next state
            current_index += 1

        if not next_states.numel():
            batch_next_states_card = torch.cat([batch_next_states_card, state_tensor.unsqueeze(0)], 0)
            batch_rewards_card.append(rewards[current_index])
            batch_done_card.append(torch.tensor(1))  # Mark terminal state as done
            continue

        # print(next_states.shape)

        # Summarize the next states using the summarizer
        next_states = next_states.view(1, next_states.shape[0], 12)
        next_state_tensor = summarizer(next_states)
        # print(next_state_tensor.shape)
        batch_next_states_card = torch.cat([batch_next_states_card, next_state_tensor], 0)

        batch_rewards_card.append(torch.tensor(sum(rewards_for_state) / len(rewards_for_state) if rewards_for_state else rewards[current_index-1])) # Average reward, handle empty list
        batch_done_card.append(torch.tensor(done[current_index-1]) if current_index >= len(state) else torch.tensor(done[current_index]))

    # print(batch_states_card.shape)
    # print(batch_next_states_card.shape)
    # print(batch_actions_card.shape)

    # batch_rewards_card = rewards[batch_indices_card]
    # batch_done_card = done[batch_indices_card]

    # print(batch_rewards_card.shape)
    # print(batch_done_card.shape)

    card_q_next = bot.card_q(batch_next_states_card).max(1)[0]
    card_target_q = torch.tensor(batch_rewards_card) + (gamma * card_q_next * (1 - torch.tensor(batch_done_card)))
    # print(f"batch_actions_card: {batch_actions_card}")
    q_values_card = bot.card_q(batch_states_card).gather(1, batch_actions_card.unsqueeze(1)).squeeze(1)

    # Compute the loss
    loss_card = criterion(q_values_card, card_target_q)

    # Optimize the Q-network
    bot.optimizer_card.zero_grad()
    loss_card.backward()
    bot.optimizer_card.step()

    states_card = np.delete(states_card.cpu().numpy(), batch_indices_card, axis=0)
    actions_card = np.delete(actions_card.cpu().numpy(), batch_indices_card, axis=0)
    next_states_card = np.delete(next_states_card.cpu().numpy(), batch_indices_card, axis=0)

    states_card = torch.tensor(states_card, dtype=torch.float32)
    actions_card = torch.tensor(actions_card, dtype=torch.int64)
    next_states_card = torch.tensor(next_states_card, dtype=torch.float32)

    losses_card.append(loss_card.item())

    # print('card success')

    # Delete the randomly selected rows from the tensors

    # rewards = np.delete(rewards.cpu().numpy(), batch_indices_action+batch_indices_block+batch_indices_challenge+batch_indices_card, axis=0)
    # done = np.delete(done.cpu().numpy(), batch_indices_action+batch_indices_block+batch_indices_challenge+batch_indices_card, axis=0)





    # Convert back to tensors for the next batch

    # rewards = torch.tensor(rewards, dtype=torch.float32)
    # done = torch.tensor(done, dtype=torch.float32)






    # print(f'batch {i}:')
    # print(f'loss for action: {loss_action}')
    # print(f'loss for block: {loss_block}')
    # print(f'loss for challenge: {loss_challenge}')
    # print(f'loss for card: {loss_card}')

    bot.cards = bots[0].cards
    bot.num_coins = bots[0].num_coins






    # i += 1
  epsilon *= 0.995

  # if (episode + 1) % 100 == 0:
  #   data_fraction = min(data_fraction + 1/5, 1)
  #   epsilon = 1.0

  # gamma = min(0.99, gamma + 0.001)

  avg_losses_action.append(sum(losses_action) / len(losses_action))
  avg_losses_block.append(sum(losses_block) / len(losses_block))
  avg_losses_challenge.append(sum(losses_challenge) / len(losses_challenge))
  avg_losses_card.append(sum(losses_card) / len(losses_card))

  print(f'Avg Action Loss, {num_batches_action} batches: {avg_losses_action[-1]}')
  print(f'Avg Block Loss, {num_batches_block} batches: {avg_losses_block[-1]}')
  print(f'Avg Challenge Loss, {num_batches_challenge} batches: {avg_losses_challenge[-1]}')
  print(f'Avg Card Loss, {num_batches_card} batches: {avg_losses_card[-1]}')

  # Copy parameters of action_q network
  bots[0].action_q.load_state_dict(bot.action_q.state_dict())

  # Copy parameters of block_q network
  bots[0].block_q.load_state_dict(bot.block_q.state_dict())

  # Copy parameters of challenge_q network
  bots[0].challenge_q.load_state_dict(bot.challenge_q.state_dict())

  # Copy parameters of card_q network
  bots[0].card_q.load_state_dict(bot.card_q.state_dict())

  print(bots[0].name)

  if episode % 10 == 0:

    win_rate = 0

    for i in range(100):

      # print(i)
      discard_pile, acting_player, reacting_player, current_player, action_game, reaction_game, challenge_game, card_game, coin_game, challenge_direction, done_0, reward, card_chosen, bots_copy = game_loop_random(bots, actions, influences_reverse, 0.0)
      bots = bots_copy
      if reward[-1] == 1:
        win_rate += 1

    # win_rate = win_rate / 50
    # print(f'Bot 0 Win Rate, Random Actions: {win_rate / 100}')

    win_rate = win_rate / 100
    win_rates.append(win_rate)

    print(f'win rate: {win_rate}')


  # df = pd.DataFrame(data = data)
  # print(df.head())

episode 0 of 1000
epsilon: 1.0
gamma: 0.99
Number of games in episode 0: 77


  batch_rewards_action.append(torch.tensor(sum(rewards_for_state) / len(rewards_for_state) if rewards_for_state else rewards[current_index-1])) # Average reward, handle empty list
  batch_done_action.append(torch.tensor(done[current_index - 1] if current_index >= len(state) else torch.tensor(done[current_index])))
  batch_rewards_block.append(torch.tensor(sum(rewards_for_state) / len(rewards_for_state) if rewards_for_state else rewards[current_index-1])) # Average reward, handle empty list
  batch_done_block.append(torch.tensor(done[current_index-1]) if current_index >= len(state) else torch.tensor(done[current_index]))
  batch_rewards_challenge.append(torch.tensor(sum(rewards_for_state) / len(rewards_for_state) if rewards_for_state else rewards[current_index-1])) # Average reward, handle empty list
  batch_done_challenge.append(torch.tensor(done[current_index-1]) if current_index >= len(state) else torch.tensor(done[current_index]))
  batch_rewards_card.append(torch.tensor(sum(rewards

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
0
episode 123 of 1000
epsilon: 0.5398075216808175
gamma: 0.99
Number of games in episode 123: 77
Avg Action Loss, 13 batches: 0.03919730206521658
Avg Block Loss, 8 batches: 0.04139342618873343
Avg Challenge Loss, 9 batches: 0.054010369504491486
Avg Card Loss, 12 batches: 0.06127004612547656
0
episode 124 of 1000
epsilon: 0.5371084840724134
gamma: 0.99
Number of games in episode 124: 72
Avg Action Loss, 13 batches: 0.03374566524647749
Avg Block Loss, 9 batches: 0.07734859445028835
Avg Challenge Loss, 9 batches: 0.07610877147979206
Avg Card Loss, 10 batches: 0.06598290763795375
0
episode 125 of 1000
epsilon: 0.5344229416520513
gamma: 0.99
Number of games in episode 125: 75
Avg Action Loss, 11 batches: 0.040466784753582695
Avg Block Loss, 7 batches: 0.07823382903422628
Avg Challenge Loss, 8 batches: 0.08731952053494751
Avg Card Loss, 11 batches: 0.06130341351540251
0
episode 126 of 1000
epsilon: 0.531750826943791
gamma: 0.99

KeyboardInterrupt: 

In [None]:
import matplotlib.pyplot as plt

plt.plot(avg_losses_action, label='Avg Action Loss')
plt.plot(avg_losses_block, label='Avg Block Loss')
plt.plot(avg_losses_challenge, label='Avg Challenge Loss')
plt.plot(avg_losses_card, label='Avg Card Loss')

plt.xlabel('Episode/Iteration')
plt.ylabel('Loss')
plt.title('Training Losses')
plt.legend()
plt.show()

In [None]:
plt.plot(win_rates, label='Win Rate')

plt.xlabel('Episode/Iteration')
plt.ylabel('Win Rate')
plt.title('Win Rate over Time')
plt.legend()
plt.show()

In [None]:
plt.plot(avg_game_lengths, label='Avg Game Lengths')

plt.xlabel('Episode/Iteration')
plt.ylabel('Avg Game Length (timesteps)')
plt.title('Avg Game Length over Time')
plt.legend()
plt.show()

In [None]:
torch.save({
        'action_q_state_dict': bot.action_q.state_dict(),
        'block_q_state_dict': bot.block_q.state_dict(),
        'challenge_q_state_dict': bot.challenge_q.state_dict(),
        'card_q_state_dict': bot.card_q.state_dict(),
        'optimizer_action_state_dict': bot.optimizer_action.state_dict(),
        'optimizer_block_state_dict': bot.optimizer_block.state_dict(),
        'optimizer_challenge_state_dict': bot.optimizer_challenge.state_dict(),
        'optimizer_card_state_dict': bot.optimizer_card.state_dict()
    }, 'bot_parameters.pth')

In [None]:
checkpoint = torch.load('bot_parameters.pth')
bot.action_q.load_state_dict(checkpoint['action_q_state_dict'])
bot.block_q.load_state_dict(checkpoint['block_q_state_dict'])
bot.challenge_q.load_state_dict(checkpoint['challenge_q_state_dict'])
bot.card_q.load_state_dict(checkpoint['card_q_state_dict'])
bot.optimizer_action.load_state_dict(checkpoint['optimizer_action_state_dict'])
bot.optimizer_block.load_state_dict(checkpoint['optimizer_block_state_dict'])
bot.optimizer_challenge.load_state_dict(checkpoint['optimizer_challenge_state_dict'])
bot.optimizer_card.load_state_dict(checkpoint['optimizer_card_state_dict'])

In [None]:
bots[0].action_q.load_state_dict(bot.action_q.state_dict())

# Copy parameters of block_q network
bots[0].block_q.load_state_dict(bot.block_q.state_dict())

# Copy parameters of challenge_q network
bots[0].challenge_q.load_state_dict(bot.challenge_q.state_dict())

# Copy parameters of card_q network
bots[0].card_q.load_state_dict(bot.card_q.state_dict())


win_rate = 0

for i in range(100):

  # print(i)
  discard_pile, acting_player, reacting_player, current_player, action_game, reaction_game, challenge_game, card_game, coin_game, challenge_direction, done_0, reward, card_chosen, bots_copy = game_loop_random(bots, actions, influences_reverse, 0.0)
  bots = bots_copy
  if reward[-1] == 1:
    win_rate += 1

# win_rate = win_rate / 50
print(f'Bot 0 Win Rate, Random Actions: {win_rate / 100}')
