# Blackjack Environment


In [49]:
import gym
from gym import spaces
from gym.utils import seeding

import numpy as np
import random
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns; sns.set()

In [58]:
def usable_ace(hand):
    return np.apply_along_axis(lambda e: 1 in e and sum(e)+10<=21,1,hand)

In [59]:
def sum_hand(hand):  # Return current hand total in numpy array (nhands,)
    return np.apply_along_axis(lambda e: sum(e)+10 if usable_ace(e[np.newaxis,:])[0] else sum(e),1,hand)

In [79]:
def pairs(hand):
    return np.apply_along_axis(lambda e: len(e)==2 and (e[0]==e[1]), 1,hand)

In [6]:
def is_bust(hand):  # Is this hand a bust?
    if isinstance(hand,list):
        hand=np.array(hand)[np.newaxis,:]
        return sum_hand(hand)[0]>21
    return np.apply_along_axis(lambda e:  sum_hand(e[np.newaxis,:])[0] > 21,1,hand)

In [107]:
def score(hand):  # What is the score of this hand (0 if bust)
    #dissociate for dealer and for player
    if isinstance(hand,list): 
        if is_bust(hand):
            return 0
        else:
            hand=np.array(hand)[np.newaxis,:]
            return sum_hand(hand)
    return np.apply_along_axis(lambda e: 0 if is_bust(e[np.newaxis,:])[0] else sum_hand(e[np.newaxis,:])[0]+int(is_natural(e[np.newaxis,:])[0]),1,hand)

In [73]:
def is_natural(hand):  # Is this hand a natural blackjack?
    return np.apply_along_axis(np.sort(e[np.newaxis,:]) == np.array([1, 10]),1,hand)

In [113]:
class BlackjackEnv(gym.Env):
    """Simple blackjack environment
    Blackjack is a card game where the goal is to obtain cards that sum to as
    near as possible to 21 without going over.  They're playing against a fixed
    dealer.
    Face cards (Jack, Queen, King) have point value 10.
    Aces can either count as 11 or 1, and it's called 'usable' at 11.
    This game is placed with an infinite deck (or with replacement).
    The game starts with each (player and dealer) having one face up and one
    face down card.
    The player can request additional cards (hit=1) until they decide to stop
    (stick=0) or exceed 21 (bust).
    After the player sticks, the dealer reveals their facedown card, and draws
    until their sum is 17 or greater.  If the dealer goes bust the player wins.
    If neither player nor dealer busts, the outcome (win, lose, draw) is
    decided by whose sum is closer to 21.  The reward for winning is +1,
    drawing is 0, and losing is -1.
    The observation of a 3-tuple of: the players current sum,
    the dealer's one showing card (1-10 where 1 is ace),
    and whether or not the player holds a usable ace (0 or 1).
    This environment corresponds to the version of the blackjack problem
    described in Example 5.1 in Reinforcement Learning: An Introduction
    by Sutton and Barto.
    http://incompleteideas.net/book/the-book-2nd.html
    """
    def __init__(self, natural=False):
        self.action_space = spaces.Discrete(4)
        self.observation_space = spaces.Tuple((
            spaces.Discrete(32),
            spaces.Discrete(11),
            spaces.Discrete(2)))
        self.seed()
        # Flag to payout 1.5 on a "natural" blackjack win, like casino rules
        # Ref: http://www.bicyclecards.com/how-to-play/blackjack/
        self.natural = natural
        # Start the first game
        self.reset()

    def seed(self, seed=None):
        self.np_random, seed = seeding.np_random(seed)
        return [seed]

    def step(self, action):
        # hit stand double split = 0 , 1 , 2 , 3
        assert self.action_space.contains(action)
        if action==2:
            self.doubled[self.poshand]=True
        if action==0 or action==2:  # hit: add a card to players hand and return
            
            bef,aft=self.poshand-1,self.nhand-self.poshand
            card=self.draw_card(self.np_random)
            new_col=np.pad(np.array([card])[np.newaxis,:],((bef,aft),(0,0)))
            self.player=np.c_[self.player,new_col]
                           
            if is_bust(self.player) :
                if self.poshand<self.nhand:
                    reward=0
                    self.poshand+=1
                    done=False
                else:
                    reward = float(-1*int(self.doubled[self.poshand]))
                    done=True
            else:
                if action==1:
                    reward=0
                    done=False
                else:
                    if self.poshand<self.nhand:
                        self.poshand+=1
                        done=False
                        reward=0
                    else:
                        done=True
                        self.play_dealer()
                        reward = self.cmp(score(hand), score(self.dealer))
        elif action==1:
            if self.poshand<self.nhand:
                    reward=0
                    self.poshand+=1
                    done=False
            else:
                done=True
                reward = float(-1*int(self.doubled[self.poshand]))
                self.play_dealer()
                reward = self.cmp(score(hand), score(self.dealer))
                
        elif action==3:
            self.nhands+=1
            card_split=self.player[self.poshand,0]
            self.player=np.r_[self.player
            
            
            
                
                
        else:  # stick: play out the dealers hand, and score
            done = True
            while sum_hand(self.dealer) < 17:
                self.dealer.append(draw_card(self.np_random))
            reward = cmp(score(self.player), score(self.dealer))
            if self.natural and is_natural(self.player) and reward == 1:
                reward = 1.5
        return self._get_obs(), reward, done, {}
                           
    def cmp(self,sp,sd,hand):
        score_dealer=sd[0]
        result=np.sum(np.apply_along_axis(lambda e: float(e > score_dealer) - float(e < score_dealer)))
        tweak=self.doubled.reshape(self.nhands,-1)+is_natural(hand).reshape(nhands,-1)
        reward=np.sum(np.multiply(result,tweak))
        return reward
                                           
    def play_dealer(self):
        while sum_hand(self.dealer) < 17:
            self.dealer+=self.draw_card(np.random)
    
    def draw_card(np_random,self):
        i=np.random.randint(0,len(self.decks))
        card= self.decks[i]
        self.decks.pop(i)
        return [card]
    
    def draw_hand(np_random,self):
        return self.draw_card(np_random)+ self.draw_card(np_random)
    
    def _get_obs(self):
        
        return sumhand(self.player)
        sumhand=np.sum(self.player,axis=1) 
        self.poshand, sumhand(self.player), self.doubled, pairs(self.player),

    def reset(self):
        self.reward=0
        self.poshand=1
        self.nhand=1 # numbers of hands in case of split
        self.decks=6*[1,2,3,4,5,6,7,8,9,10,10,10,10]
        self.doubled=np.array([False])
        self.dealer = self.draw_card(self.np_random)
        self.player = np.array([self.draw_hand(self.np_random)]).reshape(-1,2)
        return self._get_obs()
    
    

SyntaxError: invalid syntax (<ipython-input-113-7cc06f15ea18>, line 53)

In [99]:
np.c_[np.array([[1,2],[1,2]]), np.array([8,8])]

array([[1, 2, 8],
       [1, 2, 8]])

In [98]:
np.array([[1,2],[1,2,3]]).shape

(2,)

In [3]:
L_d=range(1,12)
L_p=range(21,4,-1)+[i+'-'+i for i in L_d]+['A-'+i for i in L_d if i!='A']
print(len(L_d),len(L_p), L_d,L_p)
L_d_reversed=L_d
L_d_reversed.reverse()

10 36 ['A', '10', '9', '8', '7', '6', '5', '4', '3', '2'] ['21', '20', '19', '18', '17', '16', '15', '14', '13', '12', '11', '10', '9', '8', '7', '6', '5', 'A-A', '10-10', '9-9', '8-8', '7-7', '6-6', '5-5', '4-4', '3-3', '2-2', 'A-10', 'A-9', 'A-8', 'A-7', 'A-6', 'A-5', 'A-4', 'A-3', 'A-2']


In [4]:
cmax=10
cmin=-10
ndecks_left=5
npos_P=36 # De 21 à 5 puis de A-A,10-10 à 2-2 puis de A-10 à A-2 = 36 possibilités de first cards pour player
npos_D=10 # De 2 à 10 + l'AS = 10 possibilités de first card pour le dealer
npos_TC=cmax-cmin+1 # Simulation pour Truecount  de -13 à +13
npos_AC=4 # HIT, DOUBLE, SPLIT, STAND
n_rows=npos_P*npos_D*npos_TC
n_cols=npos_AC
print('Taille des données: (Cartes Player * Cartes Dealer * Truecount [-13,13]  ; Actions ) =',(npos_P*npos_D*npos_TC,npos_AC))

Taille des données: (Cartes Player * Cartes Dealer * Truecount [-13,13]  ; Actions ) = (7560, 4)


In [5]:
n_rows,n_cols

(7560, 4)

In [6]:
Deck=np.concatenate([np.array([L_d for i in range(6*4)]).ravel(),np.array(['10' for i in range(3*4*6)])])
np.random.shuffle(Deck)
Deck.shape

(312,)

In [7]:
def get_Q_matrix():
    Q = np.full((len(L_p),n_cols), -np.inf)

    for i in range(36):
        if i<17 or i>26:
            Q[i,0] = 0.0
            Q[i,1] = 0.0
            Q[i,2] = 0.0
            Q[i,3] =  -np.inf
            
        elif i>=17 and i<=26:
            Q[i,0] = 0.0
            Q[i,1] = 0.0
            Q[i,2] = 0.0
            Q[i,3]=0.0
        
        
    return np.concatenate([Q for i in range(7*10)])

In [8]:
# Ordre : H S D P (A former)
def get_matrix_tr():
    rules = np.zeros((len(L_p),4))

    for i in range(36):
        if i<17 or i>26:
            rules[i,0] = 1/3
            rules[i,1] = 1/3
            rules[i,2] = 1/3
        elif i>=17 and i<=26:
            rules[i,0] = 1/4
            rules[i,1] = 1/4
            rules[i,2] = 1/4
            rules[i,3]=1/4
        
    return np.concatenate([rules for i in range(21*10)])

mat=get_matrix_tr()
mat.shape
        
        

(7560, 4)

## Deck Generator

In [9]:
def getdecks(ndecks,TC):
    
    lows=[str(i) for i in range(2,7)]*24
    nlows=len(lows)
    mids=[str(i) for i in range(7,10)]*24
    nmids=len(mids)
    highs=['10' for i in range(4*4*6)]+['A' for i in range(24)]
    nhighs=len(highs)
    
    C=np.floor(TC*ndecks)
    remove=(6-ndecks)*52
    

    inf=int(max(0,np.floor(0.5*(remove-72-C)),-C))
    sup=int(min( 120,120-C,np.floor(0.5*(remove-C))))
    sols=[[i,i+C,remove-(2*i +C)] for i in range(inf,sup)]
    low,high,mid=random.choice(sols)

    for i in range(int(low)):
        pos=np.random.randint(nlows)
        lows.pop(pos)
        nlows-=1
    for i in range(int(high)):
        pos=np.random.randint(nhighs)
        highs.pop(pos)
        nhighs-=1
    for i in range(int(mid)):
        pos=np.random.randint(nmids)
        mids.pop(pos)
        nmids-=1
        
    newdeck=np.r_[lows,mids,highs]
    np.random.shuffle(newdeck)
    return newdeck,int(high)-int(low)

In [10]:
def sig(p):
    s=np.sign(p)
    v=min(10,int(abs(p)))
    if v<3:
        return '0'
    elif v>=3 and v<7:
        return str(5*s)
    return str(8*s)

    

In [11]:
def color_negative_red(val):
    if val=='S':
        color = 'red' 
    elif val =='H':
        color='green'
    elif val=='D' :
        color='blue'
    else :
        color='purple'
        
    return 'color: %s' % color

## Create all_states matrix  and  transition generator

In [12]:
X=L_p 
Y=L_d 
Z=[ '-8', '-5', '-3', '0', '3' , '5', '8']


B,C,A=np.meshgrid(Y,Z,X)
all_states=np.c_[A.ravel(),B.ravel(),C.ravel()]

In [13]:
all_states=np.array([[i,k,l]  for l in Z for k in L_d for i in L_p ])
all_states.shape

(2520, 3)

In [14]:
all_states.shape,all_states

((2520, 3), array([['21', '2', '-8'],
        ['20', '2', '-8'],
        ['19', '2', '-8'],
        ...,
        ['A-4', 'A', '8'],
        ['A-3', 'A', '8'],
        ['A-2', 'A', '8']], dtype='<U5'))

In [15]:
dic={}
for i in range(len(all_states)):
    dic["/".join(list(all_states[i]))]=i
    dic["/".join(list(all_states[i]))+".0"]=i

In [16]:
def init_tour():
    
        
    step=npos_P*npos_D
    ix=np.random.randint(0,step*7-1) # partir d'un état initial [Cartes P, Cartes D, Count] 
    state=all_states[ix,:]
    pos=np.random.randint(0,7)
    p=['-8','-5','-3','0','3','5','8'][pos]
    deck,cnt=getdecks(5,TC=int(p))
    state[2]=p
    return ix,state,deck,cnt

In [97]:
def init_reinforce():
    ix=np.random.choice(xplore)
    state=all_states[ix,:]
    deck,cnt=getdecks(5,TC=int(state[2]))
    return ix,state,deck,cnt

## Maybe useful?

In [17]:
def bust(state,card,dec):
    if dec in ['P','S']:
        return False
    elif state[0]  in ['A-'+i for i in L_d if i!='A']:
        return False
    
    elif '-' in state[0]:
        if int(state[0][0])<=4:
            return False
        return 2*int(state[0][0])+card
        
        
    

## Creates tour generator from scratch

In [18]:
if bust==True:
    newIX=None

def transition(state,d,deck,count,aftersplit=False):
    
    if aftersplit:
        t1=None
    else:
        t1=dic["/".join(state)]
    ncards=len(deck)
    main=state[0]
    bust=False
    newIX=None
    if d=='H' or d=='D':
        p=np.random.randint(ncards)
        card=deck[p]
        ###print('Carte tirée Joueur (transitions):',card)
        
        deck=np.delete(deck,p)
        sign= int(card in [str(i) for i in range(2,7)]) - int(card in ['10','A']) #1 ou -1
        count+=sign
        if 'A' in main:
                if 'A' in main[1:]:
                    rest=1
                else:
                    if len(main)>2:
                        rest=int(main[2:])
        elif '-' in main:
            if '10' in main:
                val=20
            else:
                val=2*int(main[0])
        else: val=int(main)
            
            
        if 'A' in main and aftersplit==False: 
            if card=='A':
                if rest+1<=10:
                    snew= ['A-'+str(rest+1),state[1],sig(count/5)]
                    if bust==False:
                        newIX=dic["/".join(snew)]
                    else:
                        newIX=["B",state[1],count]
                    return snew,bust,deck,(t1,newIX),count
                snew= [str(rest+2),state[1],sig(count/5)]
                if bust==False:
                        newIX=dic["/".join(snew)]
                else:
                    newIX=["B",state[1],count]
                return snew,bust,deck,(t1,newIX),count
            else:
                if rest+int(card)<=10:
                    snew= ['A-'+str(rest+int(card)),state[1],sig(count/5)]
                    if bust==False:
                        newIX=dic["/".join(snew)]
                    else:
                        newIX=["B",state[1],count]
                    return snew,bust,deck,(t1,newIX),count
                snew= [str(rest+1+int(card)),state[1],sig(count/5)]
                if bust==False:
                        newIX=dic["/".join(snew)]
                else:
                    newIX=["B",state[1],count]
                return snew,bust,deck,(t1,newIX),count
        
        elif '-' in main and aftersplit==False:
            if card!='A':
                if val+int(card)>21:
                    bust=True
                    
                snew= [str(val+int(card)),state[1],sig(count/5)]
                if bust==False:
                        newIX=dic["/".join(snew)]
                else:
                    newIX=["B",state[1],count]
                return snew,bust,deck,(t1,newIX),count
            else:
                if 1+val>21:
                    bust=True
                if val<=10:
                    snew= ['A-'+str(val),state[1],sig(count/5)]
                    if bust==False:
                        newIX=dic["/".join(snew)]
                    else:
                        newIX=["B",state[1],count]
                    
                    return snew,bust,deck,(t1,newIX),count
                
                snew= [str(val+1),state[1],sig(count/5)]
                if bust==False:
                        newIX=dic["/".join(snew)]
                else:
                    newIX=["B",state[1],count]
                return snew,bust,deck,(t1,newIX),count
            
        elif aftersplit==False:   
            if card!='A':
                if val+int(card)>21:
                    bust=True
                snew=[str(val+int(card)),state[1],sig(count/5)]
                if bust==False:
                        newIX=dic["/".join(snew)]
                else:
                    newIX=["B",state[1],count]
                return snew,bust,deck,(t1,newIX),count
            else:
                if 1+val>21:
                    bust=True
                if val<=10:
                    snew=['A-'+str(val),state[1],sig(count/5)]
                    if bust==False:
                        newIX=dic["/".join(snew)]
                    else:
                        newIX=["B",state[1],count]
                    return snew,bust,deck,(t1,newIX),count
                snew=[str(val+1),state[1],sig(count/5)]
                if bust==False:
                        newIX=dic["/".join(snew)]
                else:
                    newIX=["B",state[1],count]
                return snew,bust,deck,(t1,newIX),count
        else:
            if card==main:
                snew=[card+'-'+card,state[1],sig(count/5)]
                if bust==False:
                        newIX=dic["/".join(snew)]
                else:
                    newIX=["B",state[1],count]
                return snew,bust,deck,(t1,newIX),count
            else:
                if main=='A':
                    snew=['A-'+card,state[1],sig(count/5)]
                    if bust==False:
                        newIX=dic["/".join(snew)]
                    else:
                        newIX=["B",state[1],count]
                    return snew,bust,deck,(t1,newIX),count
                elif card=='A':
                    snew=['A-'+main,state[1],sig(count/5)]
                    if bust==False:
                        newIX=dic["/".join(snew)]
                    else:
                        newIX=["B",state[1],count]
                    return snew,bust,deck,(t1,newIX),count
                else:
                    snew=[str(int(card)+int(main)),state[1],sig(count/5)]
                    if bust==False:
                        newIX=dic["/".join(snew)]
                    else:
                        newIX=["B",state[1],count]
                    
                    return snew,bust,deck,(t1,newIX),count
    

    if d=='P':
        main=state[0][0]
        if '10' in state[0]:
            main='10'
        state1,bust1,deck1,(tprev,tnext),count=transition([main,state[1],state[2]],'H',deck,count,aftersplit=True)
        state2,bust2,deck2,(tprev,tnext),count=transition([main,state[1],str(count)],'H',deck1,count,aftersplit=True)
        return [state1,state2],deck2,count
        


In [19]:
def play_dealer():
    global deck
    global card_dealer
    ##print("Dealer starts with=", card_dealer)
    one_as=False
    bust=False
    val1=0
    val2=0
    count=1
    if card_dealer== 'A' :
        one_as=True
        val1,val2=1,11
    else:
        val1+=int(card_dealer)
        val2+=int(card_dealer)
    while True:
        count+=1
        ncards=len(deck)
        p=np.random.randint(ncards)
        card=deck[p]
        ##print('Dealer hits:',card)
        deck=np.delete(deck,p)
        if card=='A': 
            if one_as:
                val1+=1
                val2+=1
            else:
                val1+=1
                val2+=11
        else:
            val1+=int(card)
            val2+=int(card)
        
        if val2>=17 and val2<=21:
            ##print('valD,bustD,blD=',[val2,bust,(val2==21 and count==2)])
            return [val2,bust,(val2==21 and count==2)]
        elif val1>=17 :
            if val1>21:
                bust=True
                ###print('valD,bustD,blD=',[val1,bust,(val1==21 and count==2)])
                return [val1,bust,(val1==21 and count==2)]
            ##print('valD,bustD,blD=',[val1,bust,(val1==21 and count==2)])
            return [val1,bust,(val1==21 and count==2)]
        
        
    

In [20]:
def get_rewards_transitions(resP,valD,bustD,blD):
    decs=['H','S','D','P']
    split=False
    if len(resP)>1:
        split=True
    rw=0
    transitions=[]
    splits=[]
    for i in range (len(resP)):
        liste=resP[i]
        
        trans=[]
        blackjack=False
        bust,doubled,stand=liste[0],int(liste[1]),liste[2]
        if bust:
            rw-=1*(1+doubled)
            ##print("main_joueur={}, Blackjack={}, Doubled={}".format('bust',blackjack,doubled))
        else: 
            fstate=all_states[liste[4][-1][2],0]
            if fstate=='A-10' and split==False and len(liste[4])==1:
                blackjack=True
                valP=21
                
            elif 'A' in fstate:
                if fstate=='A-A':
                    valP=12
                else:
                    if '10' in fstate:
                           valP=21
                    else:
                        valP=11+int(fstate[2:])
                        
            elif '-' in fstate: #doubled
                if '10' in fstate:
                    valP=20
                else:
                    valP=2*int(fstate[2:])
            else:
                valP=int(fstate)
            ##print("main_joueur={}, Blackjack={}, Doubled={}".format(valP,blackjack,doubled))
            if bustD:
                sign=1
            else :
                sign=(valP>=valD)
                
            if not blackjack:
                if blD:
                    ##print('reward for hand1:' , liste,' is',-(1+doubled))
                    rw-=(1+doubled)
                else:
                    ##print('reward for hand1:', liste,' is',sign*(1+doubled) - (1-sign)*(1+doubled))
                    rw+=sign*(1+doubled) - (1-sign)*(1+doubled)
            else:
                if not blD:
                    ##print('reward for hand1:', liste,' is',1.5)
                    rw+=1.5
                else:
                    ##print('reward for hand1:', liste,' is',0)
                    rw+=0
                
        if len(liste[3])>0:
            splits.append( (liste[3][0],3))
        
        if doubled==True and bust==False:
            cap=len(liste[4])-1 #ne pas prendre en compte le stand comme une transition possible
        else:
            cap=len(liste[4])
        for i in range (cap):
            tr=liste[4][i]
            trans.append([tr[0],decs.index(tr[1])])
        transitions.append(trans)
    transitions.append(splits)
    ##print('Get rewards : (split,rw,transitions):', split,rw,transitions)
    return split,rw,transitions
        
    
                
            
        

### 1) Montecarlo Cross entropy tries (Failure)

In [345]:
n_steppp=2
epochsss=100
ccc=0
discount_rateee=0.1
display_incrrr=20000

for i in range(n_stepppp):
    Mtemp=np.zeros_like(Minit)
    rewards=[]
    trans=[]
    for j in range(epochssss):
        #if c%display_incr==0:
            # print("{} % of the {} th epoch" .format(np.round((100*c/epochs),2),i+1))
        c+=1
        ix_,state,deck,count=init_tour()
        card_dealer=state[1]
        resP=generate_tours(state)
        ##print('tour=',resP)
        valD,bustD,blD=play_dealer()
        ##print("main_dealer={}, Blackjack={}".format(valD,blD))
        split,rw,transitions=get_rewards_transitions(resP,valD,bustD,blD)
        ##print("transitions=",transitions)
        if split:
            trans.append(transitions[-1])
        else:
            trans.append(transitions[0])
        rewards.append(rw)
   
    q = np.percentile(rewards,80)
    index=[i for i in range(len(rewards)) if rewards[i]>=q]
    
    for ix in index:
        tr=trans[ix]
        sz=len(tr)
        for v in range(sz):
            #print(i,j)
            tpl=tr[v]
            Mtemp[tpl[0],tpl[1]]+=1/((1+discount_rate)**(sz-1-v))
            #print('Mij afterwards:',Mtemp[i,j],Mtemp.sum()/(Mtemp.shape[0]+Mtemp.shape[1]))
    
    Minit=9*Mtemp+100*Minit
    row_sums = Minit.sum(axis=1)
    Minit= Minit / row_sums[:, np.newaxis]
    show=pd.DataFrame(Minit,index=["/".join(i) for i in all_states],columns=['HIT','STAND','DOUBLE','SPLIT'])

    ax = sns.heatmap(show.iloc[0:35,:], cmap="YlGnBu")
    plt.show()
    question=str(input("Continue training Y/N?"))
    if question=="N":
        break
        

Player cuckstarts with ['19' '2' '8']


ValueError: probabilities are not non-negative

 ### 2) Q Learning 

In [21]:
def prob(p):
    beta=1.85
    return np.array([np.exp(beta*k)/(np.sum(np.exp(beta*p))) for k in p])

In [22]:
prob(np.array([0.2,0.9,0.0,-np.inf]))


array([0.18720526, 0.68348564, 0.1293091 , 0.        ])

In [23]:
def epsilon_greedy_action(q_values):
    global c
    step=c
    epsilon = max(eps_min, eps_max - (eps_max-eps_min) * step/eps_decay_steps) 
    if random.random() < epsilon:
        possible=np.array([i for i in range(len(q_values)) if q_values[i]!=-np.inf])
        i_d=np.random.choice(possible,1, p=np.ones(len(possible))/len(possible))
        return ['H','S','D','P'][int(i_d)] # random action else:
    return ['H','S','D','P'][int(np.argmax(q_values))] # optimal action

In [24]:
def epsilon_greedy_action2(q_values,s):
    global c
    epsilon=eps[s]
    if random.random() < epsilon:
        possible=np.array([i for i in range(len(q_values)) if q_values[i]!=-np.inf])
        i_d=np.random.choice(possible,1, p=np.ones(len(possible))/len(possible))
        return ['H','S','D','P'][int(i_d)] # random action else:
    pos=int(np.argmax(q_values))
    if random.random() < 0.2:
        pos=int(np.argmax(q_values))
        q_values[pos]=-np.inf
        return ['H','S','D','P'][int(np.argmax(q_values))] # optimal action
    return ['H','S','D','P'][int(np.argmax(q_values))]
    

In [25]:
def generate_Q_tours(state,optsplit=True):
    
    dec=''
    bust=False
    split=False
    ab=False
    stand=False
    doubled=False
    rewards=0
    transitions=[]
    global deck
    global count
    global Q
    global iteration 
    Qtmp=Q.copy()
    while True:
        ##print('Player starts with',state)
        ix=dic["/".join(state)]
        if optsplit==False:
            Qtmp[:,3]=-np.inf
        

        #Epsilon greedy
        #d=np.random.choice(np.arange(n_cols),1, p=prob(Q[ix,:])) 
        #dec=['H','S','D','P'][int(d)]
        dec=epsilon_greedy_action2(Qtmp[ix,:],ix)
        ##print('Player choice:',dec)
        
        if dec in ['H','D']: #fait la transition la main
            ##print('stateprev=',state)
            state,bust,deck,(t1,t2),count=transition(state,dec,deck,count) 
            ##print('Player next state is',state)
            transitions.append((t1,dec,t2))
            if dec=='D': 
                doubled=True
                Qtmp[:,0]=-np.inf
            Qtmp[:,2]=-np.inf
            
        elif dec=='P':
            
            split=True
            (state1,state2),deck,count=transition(state,dec,deck,count) 
            ##print('statenext=',state1,state2)
            state1=[state1[0],state1[1],sig(count/5)] #mise a jour du count
            r1,count=generate_Q_tours(state1,False)
            ##print(r1)
            #if isinstance(r1[0][4][-1][2],list):
                #print(r1[0][4][-1][2][2])
                #count=int(r1[0][4][-1][2][2])
            #else:
                #print(r1[0][4][-1][2])
                #count=int(all_states[r1[0][4][-1][2]] [2]  )
                
            state2=[state2[0],state2[1],sig(count/5)] #maj du count
           
            r1[0][3]=[ix,'P']
            r2,count=generate_Q_tours(state2,False)
            ##print('Generate tours (bust,doubled,stand,splitfromstate,transitions): ', r1+r2)
            return r1+r2


        else:#end of tour + get rewards
            stand=True
            t1=dic["/".join(state)]
            transitions.append((t1,dec,t1))
            
            r1=[[bust,doubled,stand,[]]+[transitions]]
            ##print('Generate tours  (bust,doubled,stand,splitfromstate,transitions) : ' ,r1)
            if optsplit==False:
                return r1,count
            else:
                return r1

        if bust==True:
            r1=[[bust,doubled,stand,[]]+[transitions]]
            ##print('Generate tours (bust,doubled,stand,splitfromstate,transitions)  : ', r1)
            if optsplit==False:
                return r1,count
            return r1
        
        
        #transition doit utiliser les cartes du sabot, doit calculer le nouveau count
        #pour retourner nouvelle ligne et si bust ou non
         #tirer jusqu'a 16 avce transition définie
          # faire jouer le dealer

        

In [74]:
n_step=2
epochs=3000000
display_incr=5000
c=0
discount_rate=1
eps_min = 0.05
eps_max = 1.0
eps_decay_steps = 50000*2.5/(0.11*2.2)
learning_rate=0.985
beta=0.002

for iteration in range(n_step):
    
    for j in range(epochs):
        
        if c%display_incr==0:
            print("{} % of the {} th epoch" .format(np.round((100*c/epochs),2),iteration+1))
        c+=1
        
        ix_,state,deck,count=init_tour()
        card_dealer=state[1]
        resP=generate_Q_tours(state)
        ##print('tour=',resP)
        valD,bustD,blD=play_dealer()
        ##print("main_dealer={}, Blackjack={}".format(valD,blD))
        split,rw,transitions=get_rewards_transitions(resP,valD,bustD,blD)
        
        []
        
        ##print("transitions=",transitions)
        #[[[1255, 0], [1228, 0], [1224, 0]], [[1241, 1]], [(1241, 3)]]
        #[(1241, 3 'state + split')]+[(1255,1228,'newstates')]
        #[(1241, 3), (1255, 1228)]
        if split:
            snew1,snew2=transitions[0][0][0],transitions[1][0][0]
            trans.append(transitions[-1]+[(snew1,snew2)])
        else:
            trans.append(transitions[0])
    
   
    
    index=range(len(rewards))  ##modif
    
    
    for ix in index:
        split_=False   #reward_, tr
        tr=trans[ix] 
        reward_=rewards[ix]
        for v in range(len(tr)):
            #print(i,j)
            if isinstance(tr[0],tuple):
                #print('Q learning detect split')
                split_=True
                
            if split_:
                if v==0:
                    (s,a)=tr[0]
                    (sn1,sn2)=tr[-1]
                    #print('QLearning starts with state,action={} and newstates={}'.format((all_states[s],a),(all_states[sn1],all_states[sn2])))
                    Q[s,a]=Q[s, a] + (1 - learning_rate) * ( discount_rate * (np.max(Q[sn1]+np.max(Q[sn2])))-Q[s,a])

            else:
                
                tpl=tr[v]
                s,a=tpl[0],tpl[1]
                if v<=len(tr)-2:
                    tpp=tr[v+1]
                    #print('QLearning detects states {} with choice {}  as inside hand and bypassit, nextstate is {} '.format(all_states[tr[v][0]],['HIT','STAND','DOUBLE','SPLIT'][tr[v][1]],all_states[tr[v+1][0]]))
                    sp=tpp[0]
                    Q[s,a]=Q[s, a] + (1 - learning_rate) * (discount_rate * (np.max(Q[sp]))-Q[s,a])

                else:
                    #print('QLearning detects this as done hand and affect it reward {} starting from{} and choosing to {}'.format(reward_,all_states[s],['HIT','STAND','DOUBLE','SPLIT'][a]))
                    Q[s,a]=Q[s, a]+ (1 - learning_rate) * (reward_- Q[s,a])
            
            #print('Mij afterwards:',Mtemp[i,j],Mtemp.sum()/(Mtemp.shape[0]+Mtemp.shape[1]))
    
    
    
   
    show=pd.DataFrame(Q,index=["/".join(i) for i in all_states],columns=['HIT','STAND','DOUBLE','SPLIT'])

    ax = sns.heatmap(show.iloc[0:35,:], cmap="YlGnBu")
    plt.show()

        

0.0 % of the 1 th epoch


NameError: name 'trans' is not defined

In [92]:
xplore=explore_reinforce()

In [26]:
c=0
Q=get_Q_matrix()

In [76]:
c=21000000

In [98]:
n_step=1
epochs=3000000
display_incr=5000
discount_rate=0.99
eps=np.ones(len(all_states)) #exploratory and not greedy
learning_rate=0.985
beta=0.002
sigma=0.02
delt=0.25

for iteration in range(n_step):
    
    for j in range(epochs):
        
        if c%display_incr==0:
            mn,mx=np.mean(eps),np.max(eps)
            print("{} % of the {} th epoch and average,max eps={}" .format(np.round((100*j/epochs),2),iteration+1,(np.round(mn,2),np.round(mx,2))))
            print("1 with", all_states[np.argmax(eps)])
        c+=1
        
        ix_,state,deck,count=init_reinforce()
        card_dealer=state[1]
        resP=generate_Q_tours(state)
        ##print('tour=',resP)
        valD,bustD,blD=play_dealer()
        ##print("main_dealer={}, Blackjack={}".format(valD,blD))
        split,rw,transitions=get_rewards_transitions(resP,valD,bustD,blD)
        
        []
        
        ##print("transitions=",transitions)
        #[[[1255, 0], [1228, 0], [1224, 0]], [[1241, 1]], [(1241, 3)]]
        #[(1241, 3 'state + split')]+[(1255,1228,'newstates')]
        #[(1241, 3), (1255, 1228)]
        if split:
            snew1,snew2=transitions[0][0][0],transitions[1][0][0]
            tr=transitions[-1]+[(snew1,snew2)]
        else:
            tr=transitions[0]
        
   
        split_=False   #reward_, tr
        for v in range(len(tr)):
            #print(i,j)
            if isinstance(tr[0],tuple):
                #print('Q learning detect split')
                split_=True

            if split_:
                if v==0:
                    (s,a)=tr[0]
                    (sn1,sn2)=tr[-1]
                    #print('QLearning starts with state,action={} and newstates={}'.format((all_states[s],a),(all_states[sn1],all_states[sn2])))
                    oldQ=Q[s,a]
                    newQ=Q[s, a] + (1 - learning_rate) * ( discount_rate * (np.max(Q[sn1]+np.max(Q[sn2])))-Q[s,a])
                    Q[s,a]=newQ
                    diff=abs(newQ-oldQ)
                    f=(1-np.exp(-diff/sigma))/(1+np.exp(-diff/sigma))
                    eps[s]=delt*f + (1-delt)*eps[s]

            else:

                tpl=tr[v]
                s,a=tpl[0],tpl[1]
                if v<=len(tr)-2:
                    tpp=tr[v+1]
                    #print('QLearning detects states {} with choice {}  as inside hand and bypassit, nextstate is {} '.format(all_states[tr[v][0]],['HIT','STAND','DOUBLE','SPLIT'][tr[v][1]],all_states[tr[v+1][0]]))
                    sp=tpp[0]
                    
                    oldQ=Q[s, a]
                    newQ=Q[s, a] + (1 - learning_rate) * (discount_rate * (np.max(Q[sp]))-Q[s,a])
                    Q[s,a]=newQ
                    diff=abs(newQ-oldQ)
                    f=(1-np.exp(-diff/sigma))/(1+np.exp(-diff/sigma))
                    eps[s]=delt*f + (1-delt)*eps[s]
                   

                else:
                    #print('QLearning detects this as done hand and affect it reward {} starting from{} and choosing to {}'.format(reward_,all_states[s],['HIT','STAND','DOUBLE','SPLIT'][a]))
                    oldQ=Q[s, a]
                    newQ=Q[s,a]=Q[s, a]+ (1 - learning_rate) * (rw- Q[s,a])
                    Q[s,a]=newQ
                    diff=abs(newQ-oldQ)
                    f=(1-np.exp(-diff/sigma))/(1+np.exp(-diff/sigma))
                    eps[s]=delt*f + (1-delt)*eps[s]
            
                    
                    
                #print('Mij afterwards:',Mtemp[i,j],Mtemp.sum()/(Mtemp.shape[0]+Mtemp.shape[1]))
    
    
    
   
    show=pd.DataFrame(Q,index=["/".join(i) for i in all_states],columns=['HIT','STAND','DOUBLE','SPLIT'])

    ax = sns.heatmap(show.iloc[0:35,:], cmap="YlGnBu")
    plt.show()

        

0.0 % of the 1 th epoch and average,max eps=(1.0, 1.0)
1 with ['21' '2' '-5']
0.17 % of the 1 th epoch and average,max eps=(0.82, 1.0)
1 with ['20' '2' '3']
0.33 % of the 1 th epoch and average,max eps=(0.76, 1.0)
1 with ['17' '2' '-8']
0.5 % of the 1 th epoch and average,max eps=(0.73, 1.0)
1 with ['9' '2' '8']
0.67 % of the 1 th epoch and average,max eps=(0.71, 1.0)
1 with ['9' '2' '8']
0.83 % of the 1 th epoch and average,max eps=(0.7, 1.0)
1 with ['9' '2' '8']
1.0 % of the 1 th epoch and average,max eps=(0.69, 1.0)
1 with ['9' '2' '8']
1.17 % of the 1 th epoch and average,max eps=(0.68, 1.0)
1 with ['9' '2' '8']
1.33 % of the 1 th epoch and average,max eps=(0.67, 1.0)
1 with ['9' '2' '8']
1.5 % of the 1 th epoch and average,max eps=(0.67, 1.0)
1 with ['9' '2' '8']
1.67 % of the 1 th epoch and average,max eps=(0.67, 1.0)
1 with ['9' '2' '8']
1.83 % of the 1 th epoch and average,max eps=(0.66, 1.0)
1 with ['9' '2' '8']
2.0 % of the 1 th epoch and average,max eps=(0.66, 1.0)
1 with ['

ImportError: Cannot load backend 'Qt4Agg' which requires the 'qt4' interactive framework, as 'qt5' is currently running

 ### Strat de base (moyenne sur tous les truecounts)

In [78]:
c

24000000

In [79]:
mul=np.zeros((360,4))
Minit=Q
for j in range(360):
    for p in range(4):
 
        mul[j,p]=np.mean( [Minit[360*tc:360*(tc+1)][j,p] for tc in range(7)])
lines=[]
for j in range(36): #pour chaque main looper sur toutes les cartes de dealer possibles
    add=[ ['H','S','D','P'][np.argmax(mul[i,:])]  for i in range(j,360,36)]
    add
    lines+=[add]

basic_strat=np.r_[lines]
basic_strat.shape ,basic_strat                    
                   
strat_basic=pd.DataFrame(basic_strat,index=L_p,columns=L_d_reversed)

dd=strat_basic.style.applymap(color_negative_red)
dd

Unnamed: 0,2,3,4,5,6,7,8,9,10,A
21,S,S,S,S,S,S,S,S,S,S
20,S,S,S,S,S,S,S,S,S,S
19,S,S,S,S,S,S,S,S,S,S
18,S,S,S,S,S,S,S,S,S,S
17,S,S,S,S,S,S,S,S,S,S
16,S,S,S,S,S,H,H,H,H,H
15,S,S,S,S,S,H,H,H,H,H
14,H,S,S,S,S,H,H,H,H,H
13,S,S,S,S,S,H,H,H,H,H
12,H,H,H,H,H,H,H,H,H,H


In [91]:
def explore_reinforce():
    explore = []
    
    

    for i in range(1,37):
        for j in range(1,11):
            for tc in range(1,8):
            
                ix=(i*j*tc)-1
                if i>=8 and i<=13 and j<=6:
                    explore.append(ix)
                elif i>=19 and i<=20:
                    explore.append(ix)
                elif i==21 and j>6:
                    explore.append(ix)
                elif i in [22,23,26,27] and j>=5 and j>=8:
                    explore.append(ix)
                elif i in [24,25] and j<=8:
                    explore.append(ix)
                elif i==31 and j<=8:
                    explore.append(ix)
                elif i>=32 and j<=6:
                    explore.append(ix)
            
        
        
        
    return explore

### Matrice d'état Q moyenné sur les truecounts

In [73]:
mul=np.zeros((360,4))
Minit=Q
for j in range(360):
    for p in range(4):
 
        mul[j,p]=np.mean( [Minit[360*tc:360*(tc+1)][j,p] for tc in range(7)])
lines=[]
for j in range(36): #pour chaque main looper sur toutes les cartes de dealer possibles
    add=[ "/".join([str(np.round(v,2)) for v in mul[i,:]])  for i in range(j,360,36)]

    lines+=[add]

basic_strat=np.r_[lines]
basic_strat.shape ,basic_strat                    
                   
strat_basic=pd.DataFrame(basic_strat,index=L_p,columns=L_d_reversed)

dd=strat_basic.style.applymap(color_negative_red)
dd

Unnamed: 0,2,3,4,5,6,7,8,9,10,A
21,-1.14/1.0/-1.57/-inf,-1.14/1.0/-1.59/-inf,-1.16/1.0/-1.58/-inf,-1.15/1.0/-1.58/-inf,-1.13/1.0/-1.61/-inf,-1.12/1.0/-1.5/-inf,-1.09/1.0/-1.52/-inf,-1.09/1.0/-1.51/-inf,-1.07/0.83/-1.97/-inf,-1.03/0.41/-2.0/-inf
20,-0.96/0.76/-1.73/-inf,-0.97/0.78/-1.68/-inf,-0.97/0.79/-1.72/-inf,-1.0/0.73/-1.68/-inf,-0.98/0.8/-1.66/-inf,-0.95/0.86/-1.74/-inf,-0.9/0.88/-1.68/-inf,-0.92/0.89/-1.68/-inf,-0.91/0.78/-1.66/-inf,-0.94/0.23/-1.79/-inf
19,-0.79/0.55/-1.41/-inf,-0.79/0.51/-1.47/-inf,-0.78/0.53/-1.47/-inf,-0.79/0.56/-1.45/-inf,-0.78/0.61/-1.41/-inf,-0.76/0.67/-1.45/-inf,-0.75/0.73/-1.44/-inf,-0.76/0.6/-1.43/-inf,-0.75/0.01/-1.44/-inf,-0.82/0.02/-1.58/-inf
18,-0.62/0.24/-1.23/-inf,-0.56/0.29/-1.24/-inf,-0.68/0.33/-1.19/-inf,-0.65/0.32/-1.18/-inf,-0.6/0.32/-1.18/-inf,-0.62/0.56/-1.09/-inf,-0.65/0.47/-1.13/-inf,-0.59/-0.08/-1.27/-inf,-0.64/-0.14/-1.34/-inf,-0.69/-0.25/-1.38/-inf
17,-0.57/0.01/-1.04/-inf,-0.49/-0.05/-0.94/-inf,-0.49/0.11/-1.06/-inf,-0.5/0.11/-1.02/-inf,-0.56/0.18/-0.88/-inf,-0.43/0.28/-0.82/-inf,-0.46/-0.3/-0.98/-inf,-0.51/-0.35/-1.05/-inf,-0.62/-0.35/-1.28/-inf,-0.7/-0.52/-1.3/-inf
16,-0.43/-0.34/-0.8/-inf,-0.43/-0.24/-0.84/-inf,-0.47/-0.25/-0.85/-inf,-0.44/-0.18/-0.83/-inf,-0.39/-0.23/-0.84/-inf,-0.33/-0.51/-0.81/-inf,-0.43/-0.53/-0.89/-inf,-0.49/-0.52/-0.95/-inf,-0.5/-0.59/-1.08/-inf,-0.61/-0.73/-1.33/-inf
15,-0.4/-0.25/-0.82/-inf,-0.37/-0.29/-0.69/-inf,-0.38/-0.29/-0.69/-inf,-0.41/-0.21/-0.67/-inf,-0.32/-0.16/-0.59/-inf,-0.29/-0.53/-0.72/-inf,-0.38/-0.57/-0.76/-inf,-0.41/-0.53/-0.87/-inf,-0.47/-0.6/-0.89/-inf,-0.58/-0.76/-1.21/-inf
14,-0.35/-0.34/-0.71/-inf,-0.3/-0.31/-0.66/-inf,-0.3/-0.2/-0.68/-inf,-0.33/-0.2/-0.54/-inf,-0.32/-0.17/-0.57/-inf,-0.27/-0.52/-0.69/-inf,-0.32/-0.54/-0.72/-inf,-0.4/-0.62/-0.77/-inf,-0.48/-0.62/-1.09/-inf,-0.58/-0.75/-1.09/-inf
13,-0.33/-0.32/-0.61/-inf,-0.28/-0.26/-0.56/-inf,-0.25/-0.22/-0.55/-inf,-0.21/-0.19/-0.47/-inf,-0.23/-0.16/-0.46/-inf,-0.22/-0.5/-0.64/-inf,-0.3/-0.49/-0.66/-inf,-0.3/-0.55/-0.79/-inf,-0.4/-0.59/-0.88/-inf,-0.56/-0.77/-1.11/-inf
12,-0.27/-0.35/-0.51/-inf,-0.22/-0.29/-0.45/-inf,-0.16/-0.26/-0.39/-inf,-0.19/-0.17/-0.4/-inf,-0.19/-0.23/-0.39/-inf,-0.13/-0.49/-0.51/-inf,-0.24/-0.55/-0.6/-inf,-0.27/-0.57/-0.64/-inf,-0.35/-0.57/-0.87/-inf,-0.53/-0.76/-1.19/-inf


 ### Strategie par truecounts (voir l'ordre dans all_states)

In [43]:
lines=[]
for tc in range(7):
    for j in range(36): #pour chaque main looper sur toutes les cartes de dealer possibles
        add=[ ['H','S','D','P'][np.argmax(Minit[360*tc:360*(tc+1)][i])]  for i in range(j,360,36)]
        add.reverse()
        lines+=[add]
new_strat=np.r_[lines]
new_strat.shape ,new_strat

((252, 10), array([['S', 'S', 'S', ..., 'S', 'S', 'S'],
        ['S', 'S', 'S', ..., 'S', 'S', 'S'],
        ['S', 'S', 'S', ..., 'S', 'S', 'H'],
        ...,
        ['H', 'H', 'D', ..., 'D', 'D', 'D'],
        ['H', 'H', 'H', ..., 'D', 'H', 'D'],
        ['H', 'D', 'H', ..., 'D', 'H', 'H']], dtype='<U1'))

L_d_reversed=L_d
L_d_reversed.reverse()

In [45]:
strat_finale=pd.DataFrame(new_strat,index=L_p*7,columns=L_d_reversed)

In [133]:
strat_finale


Unnamed: 0,2,3,4,5,6,7,8,9,10,A
21,S,S,S,S,S,S,S,S,S,S
20,S,S,S,S,S,S,S,H,S,S
19,S,S,H,S,S,S,S,S,S,H
18,D,S,S,S,S,S,S,S,S,H
17,D,S,S,S,S,S,H,H,S,S
16,S,H,D,H,H,D,S,S,S,S
15,S,D,H,S,H,S,D,H,H,S
14,S,H,D,H,D,D,S,D,S,H
13,S,H,S,D,H,D,D,D,H,S
12,H,D,H,H,D,H,H,S,D,D
