<a href="https://colab.research.google.com/github/tarod13/CardGames/blob/main/BlackJack.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [47]:
import numpy as np
import random

In [92]:
def number2card(number):
  invalid_flag = False
  
  rank = (number % 13) + 1
  if rank < 1 or rank > 13:
    invalid_flag = True
  elif rank == 1:
    rank = 'A'
  elif rank == 11:
    rank = 'J'
  elif rank == 12:
    rank = 'Q'
  elif rank == 13:
    rank = 'K'
  else:
    rank = str(int(rank))
  
  symbol = number // 13
  if symbol == 0:
    symbol = 'clubs'
  elif symbol == 1:
    symbol = 'diamonds'
  elif symbol == 2:
    symbol = 'hearts'
  elif symbol == 3:
    symbol = 'spades'
  else:
    invalid_flag = True

  assert not invalid_flag, 'Invalid number'

  return (rank, symbol)


def card2value(card):
  rank = card[0]
  
  if rank == 'A':
    value = 1    
  elif rank in ['J', 'Q', 'K']:
    value = 10
  elif int(rank) > 1 and int(rank) <= 10:
    value = int(rank)
  else:
    print(rank)
    raise NameError('Invalid card')
  
  return value

In [193]:
def state2code(state):
  n_aces = 0
  sum_ = 0
  
  player_hand, dealer_card = state
  
  for card in player_hand:  
    rank = card[0]
    if rank == 'A':
      n_aces += 1          
    sum_ += card2value(card)
  
  visible_score_dealer = card2value(dealer_card)
  aces = n_aces > 0
  return (sum_, visible_score_dealer, aces)


def sample_epsilon_policy(hand_sum, dealer_score, aces, q_function, eps):
  q_values = q_function[int(hand_sum-12), int(dealer_score-1), int(aces), :]
  if np.random.rand() > eps:
    possible_actions = q_values.argmax()
    if isinstance(possible_actions, np.int64):
      action_id = possible_actions
    else:
      action_id = random.choice(possible_actions)
  else:
    action_id = np.random.randint(0, high=N_acciones)
  if action_id == 0:
    return 'stand'
  else:
    return 'hit'

In [147]:
class BlackJackEnv():
  def __init__(self, seed=0):    
    self.seed(seed)
    
  def step(self, action):
    state = [list(self.player_hand).copy(), self.visible_dealer_card]

    if action == 0 or action == '0' or action == 'stand':
      sum_dealer, n_aces_dealer = self.calculate_sum(who='dealer')
      less_than_17 = (sum_dealer + int(n_aces_dealer>0) * 10) < 17
      more_than_21 = False
      while less_than_17 and not more_than_21:
        new_dealer_card = set(random.sample(self.deck, 1))        
        self.deck = self.deck - new_dealer_card
        self.dealer_hand = self.dealer_hand.union(new_dealer_card)

        new_dealer_card = new_dealer_card.pop()
        sum_dealer += card2value(new_dealer_card)
        rank = new_dealer_card[0]
        if rank == 'A':
          n_aces_dealer += 1
        less_than_17 = (sum_dealer + int(n_aces_dealer>0) * 10) < 17   
        more_than_21 = sum_dealer > 21
      done = True
      score_dealer = self.calculate_score(sum_dealer, n_aces_dealer)
      if score_dealer > 21:
        reward = 1.0
      else:
        sum_player, n_aces_player = self.calculate_sum()
        score_player = self.calculate_score(sum_player, n_aces_player)
        if score_player > score_dealer:
          reward = 1.0
        elif score_player == score_dealer:
          reward = 0.0
        else:
          reward = -1.0

    elif action == 1 or action == '1' or action == 'hit':
      new_player_card = set(random.sample(self.deck, 1))
      self.deck = self.deck - new_player_card
      self.player_hand = self.player_hand.union(new_player_card)
      sum_player, n_aces_player = self.calculate_sum()
      went_bust = sum_player > 21      
      if went_bust:
        reward = -1
        done = True        
      else:
        reward = 0
        done = False        

    else:
      raise NameError('Invalid action')
    
    if done:    
      next_state = [list(self.player_hand).copy(), list(self.dealer_hand).copy()]
    else:
      next_state = [list(self.player_hand).copy(), self.visible_dealer_card]
    return state, next_state, reward, done 

  def seed(self, seed=None):
    random.seed(seed)

  
  def reset(self):
    self.deck = set(range(0,52))
    self.deck = set([number2card(x) for x in self.deck])

    self.player_hand = set(random.sample(self.deck, 2))
    self.deck = self.deck - self.player_hand
    self.dealer_hand = set(random.sample(self.deck, 2))
    self.deck = self.deck - self.dealer_hand

    self.visible_dealer_card = list(self.dealer_hand)[0]
    state = [list(self.player_hand).copy(), self.visible_dealer_card]
    return state


  @staticmethod
  def calculate_score(sum_no_aces, n_aces):
    score = sum_no_aces
    soft_hand = (21 - sum_no_aces) >= 10
    at_least_one_ace = n_aces > 0
    if soft_hand and at_least_one_ace:
      score += 10
    return score

  
  def calculate_sum(self, who='player'):
    n_aces = 0
    sum_ = 0
    
    if who == 'player':
      hand = self.player_hand
    else:
      hand = self.dealer_hand
    
    for card in hand:  
      rank = card[0]
      if rank == 'A':
        n_aces += 1          
      sum_ += card2value(card)
    
    return sum_, n_aces

In [149]:
semilla = 0
juego = BlackJackEnv(seed=semilla)
mano_inicial = juego.reset()
print('Mano inicial: ', mano_inicial)
m, nm, r, d = juego.step('hit')
print('Nueva mano: ', nm)
print('Recompensa: ', r)
print('Juego terminado: ', d)
m, nm, r, d = juego.step('stand')
print('Nueva mano: ', nm)
print('Recompensa: ', r)
print('Juego terminado: ', d)

Mano inicial:  [[('A', 'spades'), ('5', 'hearts')], ('4', 'hearts')]
Nueva mano:  [[('A', 'spades'), ('5', 'hearts'), ('5', 'diamonds')], ('4', 'hearts')]
Recompensa:  0
Juego terminado:  False
Nueva mano:  [[('A', 'spades'), ('5', 'hearts'), ('5', 'diamonds')], [('4', 'hearts'), ('7', 'hearts'), ('5', 'spades'), ('10', 'diamonds')]]
Recompensa:  1.0
Juego terminado:  True


In [209]:
def entrenar_agente(funcion_q, n_visitas, juego, n_episodios, imprimir=False):
  for episodio in range(0, n_episodios):
    terminado = False
    retorno = 0
    trayectoria = []
    estado = juego.reset()

    while not terminado:
      suma_mano, puntaje_dealer, aces = state2code(estado)
      if suma_mano < 12:
        accion = 'hit'
      else:
        accion = sample_epsilon_policy(suma_mano, puntaje_dealer, aces, funcion_q, eps=epsilon)
      _, siguiente_estado, recompensa, terminado = juego.step(accion)

      estado = siguiente_estado.copy()  
      trayectoria.append([suma_mano, puntaje_dealer, aces, accion, recompensa, terminado])

    for i in range(0,len(trayectoria)):
      suma_mano, puntaje_dealer, aces, accion, recompensa, terminado = trayectoria.pop()
      retorno = recompensa + beta * retorno

      if suma_mano >= 12:
        # Indices del estado
        s_0 = int(suma_mano-12)
        s_1 = int(puntaje_dealer-1)
        s_2 = int(aces)

        # Indice de la accion
        if accion == 'hit':
          a = 1
        else:
          a = 0
        
        # Numero de veces que ha sido visitada la pareja estado-acccion ((s0,s1,s2), a) 
        n = n_visitas[s_0, s_1, s_2, a] + 1

        # Se actualiza el número de visitas y la funcion de valor
        n_visitas[s_0, s_1, s_2, a] = n
        funcion_q[s_0, s_1, s_2, a] += (retorno - funcion_q[s_0, s_1, s_2, a]) / n 
    
    if imprimir and ((episodio+1) % imprimir_cada) == 0:
      print('Episodio: ' + str(episodio+1))

In [210]:
def evaluar_agente(funcion_q, juego, n_episodios, imprimir=False, ciclo=0):
  retornos = []
  for episodio in range(0, n_episodios):
    terminado = False
    retorno = 0
    estado = juego.reset()

    paso = 0
    while not terminado:
      suma_mano, puntaje_dealer, aces = state2code(estado)
      if suma_mano < 12:
        accion = 'hit'
      else:
        accion = sample_epsilon_policy(suma_mano, puntaje_dealer, aces, funcion_q, eps=epsilon)
      _, siguiente_estado, recompensa, terminado = juego.step(accion)
      retorno = retorno + (beta**paso) * recompensa
      estado = siguiente_estado.copy()
      paso += 1  
      
    retornos.append(retorno)
  retornos = np.array(retornos)
  retorno_promedio = retornos.mean()
  desviacion_retorno = retornos.std()
  if imprimir:
    print(('Ciclo: ' + str(ciclo) + ', Retorno esperado: {:.3f}'.format(
        retorno_promedio) + u"\u00B1" + '{:.3f}'.format(desviacion_retorno)))
  
  return retorno_promedio, desviacion_retorno

In [212]:
N_episodios_train = 10000
N_episodios_eval = 10000
N_ciclos = 100
N_acciones = 2
epsilon = 0.1
beta = 1.0
imprimir_cada = 5001

juego = BlackJackEnv(seed=semilla)
funcion_q = np.zeros((10,10,2,2))
n_visitas = np.zeros((10,10,2,2))
np.random.seed(semilla)

In [213]:
Gs = []
sigmaGs = []
for ciclo in range(0, N_ciclos):
  entrenar_agente(funcion_q, n_visitas, juego, N_episodios_train)
  G, sigmaG = evaluar_agente(funcion_q, juego, N_episodios_eval, imprimir=True, ciclo=ciclo)
  Gs.append(G)
  sigmaGs.append(sigmaG)
  epsilon = epsilon * 0.98
Gs = np.array(Gs)
sigmaGs = np.array(sigmaGs)

Ciclo: 0, Retorno esperado: -0.115±0.952
Ciclo: 1, Retorno esperado: -0.103±0.953
Ciclo: 2, Retorno esperado: -0.105±0.954
Ciclo: 3, Retorno esperado: -0.114±0.951
Ciclo: 4, Retorno esperado: -0.104±0.953
Ciclo: 5, Retorno esperado: -0.097±0.950
Ciclo: 6, Retorno esperado: -0.091±0.952
Ciclo: 7, Retorno esperado: -0.096±0.953
Ciclo: 8, Retorno esperado: -0.099±0.951
Ciclo: 9, Retorno esperado: -0.099±0.951
Ciclo: 10, Retorno esperado: -0.085±0.953
Ciclo: 11, Retorno esperado: -0.082±0.954
Ciclo: 12, Retorno esperado: -0.075±0.953
Ciclo: 13, Retorno esperado: -0.100±0.952
Ciclo: 14, Retorno esperado: -0.090±0.951
Ciclo: 15, Retorno esperado: -0.080±0.956
Ciclo: 16, Retorno esperado: -0.098±0.950
Ciclo: 17, Retorno esperado: -0.068±0.953
Ciclo: 18, Retorno esperado: -0.092±0.951
Ciclo: 19, Retorno esperado: -0.070±0.951
Ciclo: 20, Retorno esperado: -0.087±0.950
Ciclo: 21, Retorno esperado: -0.068±0.952
Ciclo: 22, Retorno esperado: -0.084±0.953
Ciclo: 23, Retorno esperado: -0.091±0.952
Ci

In [186]:
funcion_q[:,:,0,0]

array([[-0.96491228, -0.34478022, -0.36556604, -0.27836257, -0.21448999,
        -0.1778442 , -0.49069767, -0.58333333, -0.68550369, -0.65193746],
       [-0.87242798, -0.30305954, -0.2769208 , -0.24266   , -0.15978813,
        -0.16681876, -0.55066922, -0.58144796, -0.62457338, -0.65343415],
       [-0.77659574, -0.25732676, -0.22244692, -0.2140271 , -0.13246811,
        -0.1412413 , -0.48604651, -0.55268022, -0.68      , -0.6885906 ],
       [-0.64477612, -0.22276811, -0.15272969, -0.14213519, -0.09532612,
        -0.12878394, -0.40042373, -0.59205021, -0.676     , -0.62181581],
       [-0.65486726, -0.21508756, -0.12999294, -0.08425633, -0.05747675,
        -0.10660458, -0.46601942, -0.48083067, -0.6614786 , -0.61770966],
       [-0.50465988, -0.06441718, -0.01004954,  0.03540622,  0.12428107,
         0.05151844, -0.05952381, -0.38269987, -0.52576647, -0.50757042],
       [-0.29900731,  0.18349692,  0.22487813,  0.23168103,  0.36578877,
         0.31170848,  0.46799717,  0.14005236

In [187]:
n_visitas[:,:,0,0]

array([[  456.,   728.,   424.,   855.,  1049., 10706.,   430.,   468.,
          407.,  1471.],
       [  486.,  7256.,  6742., 10797.,  4531., 10958.,   523.,   442.,
          293.,  1587.],
       [  470.,  7302.,  7912.,  9667.,  4469., 10344.,   430.,   541.,
          250.,  1490.],
       [  670.,  7124.,  7510., 11081.,  3787.,  9613.,   472.,   478.,
          250.,  1531.],
       [ 1808.,  6681.,  7085., 10112.,  4193.,  8161.,   412.,   626.,
          257.,  2349.],
       [ 9335.,  6520.,  7065.,  9970.,  3999.,  9220.,  7560.,  6867.,
         1533., 29787.],
       [ 9167.,  6011.,  6359.,  9280.,  3759.,  8370.,  8468.,  7640.,
         4809., 30077.],
       [ 9018.,  5672.,  6037.,  8730.,  3455.,  7851.,  7970.,  8469.,
         4326., 28992.],
       [12397.,  8643.,  9272., 13246.,  5375., 12385., 11091., 11525.,
         6584., 38123.],
       [ 4770.,  1957.,  2217.,  3003.,  1198.,  2393.,  4158.,  4484.,
         2667., 15780.]])