In [104]:
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt
import random
import matplotlib.cm as cm
import matplotlib.colors as colors

import nashpy as nash
import itertools

In [220]:
class Card:
    def __init__(self, kind, show):
        self.kind = kind
        self.show = show

    def toString(self):
        return '[' + self.show + "," + self.kind + ']'


class MultiAgentGame:
    def __init__(self, N, C):
        self.N = N  # Number of players
        self.C = C  # Number of card sets
        self.deck = self.createDeck() * self.C  # Multiple sets of cards
        random.shuffle(self.deck)
        self.PH = [[] for _ in range(self.N)]  # Player hands (list of lists)
        self.DH = []  # Dealer hand
        self.usefulAce = [False] * self.N  # Track useful aces for each player

    def createDeck(self):
        kinds = ['Diamond', 'Club', 'Spade', 'Heart']
        shows = ['A', '2', '3', '4', '5', '6', '7', '8', '9', 'T', 'J', 'Q', 'K']
        return [Card(kind, show) for kind in kinds for show in shows]

    def newHand(self, target, player_id=None):
        card = self.deck.pop()
        if target == 'Player' and player_id is not None:
            self.PH[player_id].append(card)
        elif target == 'Dealer':
            self.DH.append(card)

    def sum(self, target, player_id=None):
        hand = self.PH[player_id] if target == "Player" else self.DH
        usefulAce = False
        total = 0

        for card in hand:
            if card.show in ('T', 'J', 'Q', 'K'):
                total += 10
            elif card.show == 'A':
                usefulAce = True
                total += 11
            else:
                total += int(card.show)

        while total > 21 and usefulAce:
            total -= 10
            usefulAce = False

        if target == "Player":
            self.usefulAce[player_id] = usefulAce

        return total

    def displayHand(self, target, player_id=None, showAll=False):
        if target == 'Player' and player_id is not None:
            hand = self.PH[player_id]
            return "Player {}: {}".format(player_id, "/".join(card.toString() for card in hand))
        elif target == 'Dealer':
            if showAll:
                return "Dealer: {}".format("/".join(card.toString() for card in self.DH))
            else:
                return "Dealer: {}/???".format(self.DH[0].toString())

    def Bust(self, target, player_id=None):
        return self.sum(target, player_id) > 21

    def Compare(self, PS, DS):
        if PS > DS:
            return "Player"
        elif PS == DS:
            return "Tie"
        else:
            return "Dealer"


class MultiAgentTrial:
    def __init__(self, N, C):
        self.game = MultiAgentGame(N, C)
        self.N = N
        self.players = range(N)

        # Deal initial cards to all players and the dealer
        # At least 12 for PS
        for player_id in self.players:
            self.game.newHand("Player", player_id)
            self.game.newHand("Player", player_id)
            while self.game.sum("Player", player_id) < 12:
                self.game.newHand("Player", player_id)
        self.game.newHand("Dealer")
        self.game.newHand("Dealer")

        # Initial state for each player
        self.s0 = tuple([
            (
                self.game.usefulAce[player_id],
                self.game.DH[0].show,
                self.game.sum("Player", player_id),
            )
            for player_id in self.players
        ])

        #display init cards
        # print("players cards:")
        # for player_id in self.players:
        #     print(f"{player_id}: {self.game.displayHand('Player', player_id)}")
        # print(f"dealer's face up card: {self.game.displayHand('Dealer')}")

    def next(self, joint_s0, a0, player_id):
        joint_s1 = joint_s0
        s0 = joint_s0[player_id]
        if a0 == 0: #hit
            self.game.newHand("Player", player_id)
            PS = self.game.sum("Player", player_id)
            DS = self.game.DH[0].show
            UA = self.game.usefulAce[player_id]
            playerBust = self.game.Bust("Player", player_id)
            if playerBust:
                s1 = 'illegal'
                r = -1
                done = True
            else:
                s1 = (UA, DS, PS)
                r = 0
                done = False
        elif a0 == 1: #stay
            s1 = s0
            r = -999
            done = True
        else:
            print("illegal action")
            raise KeyError

        temp = list(joint_s0)
        temp[player_id] = s1
        joint_s1 = tuple(temp)
        # print(f"in next() method, check for the s0 s1 change: {joint_s0} ---> {joint_s1}")
        return joint_s1, r, done

    def dealerTurn(self, joint_state, reward_list):
        # print(len(reward_list))
        while True:
            curSum = self.game.sum("Dealer")
            if curSum >= 17:
                action = 's'
                for player in self.players:
                    if reward_list[player][-1] == -999:
                        PS = joint_state[player][2]
                        winner = self.game.Compare(PS, curSum)
                        if winner == "Dealer":
                            r = -1
                        elif winner == "Player":
                            r = 1
                        elif winner == "Tie":
                            r = 0
                        reward_list[player][-1] = r
                break
            else:
                action = 'h'
                self.game.newHand("Dealer")
                if self.game.Bust("Dealer"):
                    # print("dealer busted")
                    for player in self.players: reward_list[player][-1] = 1 #every player wins, reward=1
                    break
        assert len(reward_list) == self.N
        return reward_list

    def action_space(self):
        return ['hit', 'stay']

In [221]:
trial = MultiAgentTrial(2,2)
joint_state_list = []
r_list = [[], []]
action_list = [[], []]

s0_joint = trial.s0
joint_state_list.append(s0_joint)
#1 step, player 0
s1_joint, r, done = trial.next(s0_joint, 1, 0)
joint_state_list.append(s1_joint)
r_list[0].append(r)
action_list[0].append('s')
s0_joint = s1_joint

#1 step, player 1
s1_joint, r, done = trial.next(s0_joint, 0, 1)
joint_state_list.append(s1_joint)
r_list[1].append(r)
action_list[1].append('h')
s0_joint = s1_joint

if done is False:
    s1_joint, r, done = trial.next(s0_joint, 1, 1)
    joint_state_list.append(s1_joint)
    r_list[1].append(r)
    action_list[1].append('s')
    s0_joint = s1_joint

print(f"game traj: {joint_state_list}")
trial.dealerTurn(s1_joint,r_list)

game traj: [((False, '4', 12), (False, '4', 20)), ((False, '4', 12), (False, '4', 20)), ((False, '4', 12), 'illegal')]


[[1], [1]]

In [260]:
class single_table:
    def __init__(self):
        self.n_usefulAce = 2
        self.usefulAce_index = [True, False]
        self.n_DF = 13
        self.DF_index = ['A', '2', '3', '4', '5', '6', '7', '8', '9', 'T', 'J', 'Q', 'K']
        self.n_PS = 10
        self.PS_index = [12,13, 14, 15, 16, 17, 18, 19, 20, 21]
        self.n_action = 2
        self.states_dict = {}

    def get_possible_states(self):
        possible_states = list(itertools.product(self.usefulAce_index, self.DF_index, self.PS_index))
        possible_states.append(('illegal'))
        print(possible_states)
        return possible_states

    def create_table(self):
        states = self.get_possible_states()
        for _, value in enumerate(states):
            self.states_dict[value] = _
        self.grid = np.zeros((len(states), self.n_action))

    def get_index(self, state):
        try:
            index = self.states_dict[state]
            return index
        except:
            return None
        
class joint_table:
    #a state is like: (usefulAce, DF, PS)
    def __init__(self):
        self.n_usefulAce = 2
        self.usefulAce_index = [True, False]
        self.n_DF = 13
        self.DF_index = ['A', '2', '3', '4', '5', '6', '7', '8', '9', 'T', 'J', 'Q', 'K']
        self.n_PS = 10
        self.PS_index = [12,13, 14, 15, 16, 17, 18, 19, 20, 21]
        self.n_action = 2
        self.action_index = ['h', 's']
        self.joints_dict = {}
        self.grid = None
    
    def get_possible_joints(self):
        possible_states = list(itertools.product(self.usefulAce_index, self.DF_index, self.PS_index))
        possible_states.append(('illegal'))
        possible_joints = [(s1, s2) for s1, s2 in itertools.product(possible_states, possible_states) if s1[1]==s2[1] or (s1=='illegal' or s2=='illegal')]
        return possible_joints
        
    def create_table(self):
        joints = self.get_possible_joints()
        for _, value in enumerate(joints):
            self.joints_dict[value] = _
        self.n_joints = len(joints)
        self.grid = np.zeros((self.n_joints, self.n_action, self.n_action))
        print(self.grid.shape)
        print(len(self.joints_dict.keys()))

    def get_index(self, state):
        try: 
            index = self.joints_dict[state]
            return index
        except:
            return None


In [410]:
#Q table
QT_single =  single_table()
QT_single.create_table()
Q0 = joint_table()
Q0.create_table()
Q1 = joint_table()
Q1.create_table()
# print(Q0.get_index(((True, 'T', 17), 'illegal')))
# print(Q1.get_index(((True, 'T', 17), 'illegal'))) #不可能出现不一样的face up的情况，因此，可以缩小我们的joint state空间，大概是60000/10 = 6000左右



#all agents are cooperating scenario, JAL with nash equilibrium
epoches = 1000000
gamma = 0.99
epsi = 0.5
num_agents = 2
card_decks = 2
lr = 1/(epoches/5721)

for ep in range(epoches):
    if ep % 10000 == 0: print(ep)
    trial = MultiAgentTrial(num_agents, card_decks)
    s0_joint = trial.s0 #s0 is a joint state like ([xxx, xxx, xxx], [xxx, xxx, xxx])

    action_list = [[], []] 
    reward_list = [[], []]
    joint_state_list = [[], []]
    joint_state_list[0].append(s0_joint)
    wait_for_dealer_result = [False, False]
    if np.random.rand() < 1:
        for player in trial.players:
            # print(f"palyer {player}-----------------------------------------------------------------------------------")
            TD_0 = True
            if player == trial.players[-1]: TD_0 = False #if first player, then uses TD(0) to learn at each step
            while True:
                ai = trial.action_space().index(np.random.choice(trial.action_space()))
                s1_joint, ri, done = trial.next(s0_joint, ai, player)
                # print(trial.action_space()[ai], s0_joint, s1_joint, ri)
                action_list[player].append(ai)
                reward_list[player].append(ri)
                joint_state_list[player].append(s1_joint)
                #learn with TD(0) if is player 0
                if TD_0:
                    if ri != -999:
                        s0_index = QT_single.get_index(s0_joint[player])
                        s1_index = QT_single.get_index(s1_joint[player])
                        Q_old = QT_single.grid[s0_index][ai]
                        Q_new = np.max(QT_single.grid[s1_index][:])
                        Q_old += lr*(ri + (gamma*Q_new - Q_old)*(1-done))
                        QT_single.grid[s0_index][ai] = Q_old

                        #copy this value to Q0, for every combo with s0_joint[player], use this updated Q value
                        for key in Q0.joints_dict.keys():
                            if key[0] ==  s0_joint[player]:
                                index = Q0.get_index(key)
                                for aj in range(2):
                                    Q0.grid[index][ai][aj] += lr*(ri + (gamma*Q_new - Q0.grid[index][ai][aj])*(1-done))
                    else:
                        wait_for_dealer_result[player] = True
                #learn with nash TD if is player 1
                else:
                    if ri != -999:
                        aj = action_list[0][-1]
                        #求解nash eq policies for player 1
                        s0_index = Q0.get_index(s0_joint)
                        s1_index = Q0.get_index(s1_joint)
                        game = nash.Game(Q0.grid[s1_index], Q1.grid[s1_index]) #现在只支持两个玩家的nash eq求解
                        equilibriums = list(game.support_enumeration())
                        best_payoff = -np.Inf
                        eq_value = None
                        for equilibrium in equilibriums:
                            payoff = game[equilibrium][0] + game[equilibrium][1]
                            if payoff > best_payoff:
                                eq_value = game[equilibrium]
                                best_payoff = payoff
                        # print(eq_value)
                        Q0.grid[s0_index][aj][ai] += lr*(ri + (gamma*eq_value[0] - Q0.grid[s0_index][aj][ai])*(1-done))
                        Q1.grid[s0_index][aj][ai] += lr*(ri + (gamma*eq_value[1] - Q1.grid[s0_index][aj][ai])*(1-done))
                    else:
                        wait_for_dealer_result[player] = True

                #update for next game step
                s0_joint = s1_joint
                if done:
                    if player == 0:
                        joint_state_list[1].append(s1_joint)
                    break
        #dealer's turn, no matter two palyers were busted or not
        reward_list = trial.dealerTurn(s1_joint, reward_list)
        #更新需要等待跟dealer比大小的情况
        #player 0
        # print(f"state history for both players: \n {joint_state_list[0]} \n {joint_state_list[1]}")
        if wait_for_dealer_result[0]:
            player = 0
            s0_joint = joint_state_list[0][-2]
            ai = action_list[0][-1]
            ri = reward_list[0][-1]
            s0_index = QT_single.get_index(s0_joint[player])
            Q_old = QT_single.grid[s0_index][ai]
            Q_old += lr*ri
            QT_single.grid[s0_index][ai] = Q_old
            for key in Q0.joints_dict.keys():
                if key[0] ==  s0_joint[player]:
                    index = Q0.get_index(key)
                    for aj in range(2):
                        Q0.grid[index][ai][aj] += lr*ri

        #player 1
        if wait_for_dealer_result[1]:
            s0_joint = joint_state_list[1][-2]
            ai = action_list[1][-1] #ai是player 1, aj才是palyer2
            ri = reward_list[1][-1]
            rj = reward_list[0][-1]
            aj = action_list[0][-1]
            s0_index = Q1.get_index(s0_joint)
            # game = nash.Game(Q0.grid[s1_index], Q1.grid[s1_index]) #现在只支持两个玩家的nash eq求解
            # equilibriums = list(game.support_enumeration())
            # best_payoff = -np.Inf
            # eq_value = None
            # for equilibrium in equilibriums:
            #     payoff = game[equilibrium][0] + game[equilibrium][1]
            #     if payoff > best_payoff:
            #         eq_value = game[equilibrium]
            #         best_payoff = payoff
            # equilibrium = equilibriums[0] #frist nash eq point, like ([0, 1], [1, 0])
            # print(eq_value)
            Q0.grid[s0_index][aj][ai] += lr*rj 
            # print(Q1.grid[s0_index][aj][ai])
            Q1.grid[s0_index][aj][ai] += lr*ri
            # print(Q1.grid[s0_index][aj][ai])

        # print(joint_state_list, action_list, reward_list)
    # else:
    #     for player in trial.players:
    #         #求解nash eq policies
    #         matrices = [QT[i][s0] for i in trial.players]
    #         game = nash.Game(matrices[0], matrices[1]) #现在只支持两个玩家的nash eq求解
    #         equilibriums = list(game.support_enumeration())
    #         equilibrium = equilibriums[0] #frist nash eq point, like ([0, 1], [1, 0])
    #         action = np.where(equilibrium[player]==1)
    #         action_list.append(action)
    
    #         #game step forward
    #         trial.next(s0_joint, action, player_id=player)
        
                
                

        





[(True, 'A', 12), (True, 'A', 13), (True, 'A', 14), (True, 'A', 15), (True, 'A', 16), (True, 'A', 17), (True, 'A', 18), (True, 'A', 19), (True, 'A', 20), (True, 'A', 21), (True, '2', 12), (True, '2', 13), (True, '2', 14), (True, '2', 15), (True, '2', 16), (True, '2', 17), (True, '2', 18), (True, '2', 19), (True, '2', 20), (True, '2', 21), (True, '3', 12), (True, '3', 13), (True, '3', 14), (True, '3', 15), (True, '3', 16), (True, '3', 17), (True, '3', 18), (True, '3', 19), (True, '3', 20), (True, '3', 21), (True, '4', 12), (True, '4', 13), (True, '4', 14), (True, '4', 15), (True, '4', 16), (True, '4', 17), (True, '4', 18), (True, '4', 19), (True, '4', 20), (True, '4', 21), (True, '5', 12), (True, '5', 13), (True, '5', 14), (True, '5', 15), (True, '5', 16), (True, '5', 17), (True, '5', 18), (True, '5', 19), (True, '5', 20), (True, '5', 21), (True, '6', 12), (True, '6', 13), (True, '6', 14), (True, '6', 15), (True, '6', 16), (True, '6', 17), (True, '6', 18), (True, '6', 19), (True, '6', 2

An even number of (4) equilibria was returned. This
indicates that the game is degenerate. Consider using another algorithm
to investigate.
                  
An even number of (2) equilibria was returned. This
indicates that the game is degenerate. Consider using another algorithm
to investigate.
                  


10000
20000
30000
40000
50000
60000
70000
80000
90000
100000
110000
120000
130000
140000
150000
160000
170000
180000
190000
200000
210000
220000
230000
240000
250000
260000
270000
280000
290000
300000
310000
320000
330000
340000
350000
360000
370000
380000
390000
400000
410000
420000
430000
440000
450000
460000
470000
480000
490000
500000
510000
520000
530000
540000
550000
560000
570000
580000
590000
600000
610000
620000
630000
640000
650000
660000
670000
680000
690000
700000
710000
720000
730000
740000
750000
760000
770000
780000
790000
800000
810000
820000
830000
840000
850000
860000
870000
880000
890000
900000
910000
920000
930000
940000
950000
960000
970000
980000
990000


In [421]:
#check the learning
# target0 = QT_single.get_index((False, '7', 20))
# targetQ0 = Q0.get_index(((False, 'K', 18), (False, 'K', 20)))
nonbust_A = Q1.get_index(((False, '8', 20), (False, '8', 18)))
bust_A = Q1.get_index(('illegal', (False, '8', 18)))
# print(QT_single.grid[target0])  
# print(Q0.grid[targetQ0]) 
print(Q1.grid[nonbust_A]) 
print(Q1.grid[bust_A])

[[ 0.          0.        ]
 [-0.89858497  0.125862  ]]
[[-2.45880541  1.02978   ]
 [ 0.          0.        ]]


5720

In [86]:
states = {((0,1), (0, 1)): 10, ((1,0), (0, 1)): 100}
states[((0,1), (0, 1))]


10

In [124]:
dic = {'A': 0}
dic['B']

KeyError: 'B'

In [96]:
a = [0,1,2]
a[0] += 10
a

[10, 1, 2]

In [107]:
a =  ['h', 's']
b = [2,3,4,5,6]
list(itertools.product(a,b))

[('h', 2),
 ('h', 3),
 ('h', 4),
 ('h', 5),
 ('h', 6),
 ('s', 2),
 ('s', 3),
 ('s', 4),
 ('s', 5),
 ('s', 6)]

In [203]:
t1 = ('a', 'b', 'c')
t2 = ('a', 'c', 'd')
t1 = t2
t1

('a', 'c', 'd')