We first initiate the required libraries for the project

In [7]:
import random
import gym
from gym import spaces
import numpy as np
from stable_baselines3 import PPO
import optuna
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3 import DQN
from gym.wrappers import monitoring
from stable_baselines3.common.callbacks import CheckpointCallback, EvalCallback
import matplotlib.pyplot as plt
from stable_baselines3.common.monitor import Monitor
import pandas as pd

Now we create the enviroment in which the agent will be trained. Note that the agent will receive a positive reward only in case of win (+1), in case of draw the reward will be 0 and in case of lose the reward will be negative.

In [24]:
#Environment


def cmp(a, b):
    if a > b:
        return 1
    elif a < b:
        return -1
    else:
        return 1

class SimpleBlackjackEnv(gym.Env):
    metadata = {'render.modes': ['human']}
    
    def __init__(self):
        self.reward_range = (-np.inf, np.inf)
        super(SimpleBlackjackEnv, self).__init__()
        self.deck = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 10, 10, 10] * 4  # a full deck
        random.shuffle(self.deck)  # shuffle the deck
        self.action_space = spaces.Discrete(2)
        self.observation_space = spaces.Box(low=0, high=11, shape=(23,), dtype=int)
        
    def draw_card(self):
        return self.deck.pop()
        
    def draw_hand(self):
        return [self.draw_card(), self.draw_card()]

    def usable_ace(self, hand):
        return 1 in hand and sum(hand) + 10 <= 21

    def sum_hand(self, hand):
        if self.usable_ace(hand):
            return sum(hand) + 10
        return sum(hand)

    def is_bust(self, hand):
        return self.sum_hand(hand) > 21

    def score(self, hand):
        return 0 if self.is_bust(hand) else self.sum_hand(hand)

    def reset(self):
        if len(self.deck) < 15:
            self.deck = [1, 2, 3, 4, 5, 6, 7, 8, 9,
                         10, 10, 10, 10] * 4
            random.shuffle(self.deck)
        self.dealer = self.draw_hand()
        self.player = self.draw_hand()
        return self._get_observation()

    def step(self, action):
        assert self.action_space.contains(action)
        if action == 1:  # hit
            self.player.append(self.draw_card())
            if self.is_bust(self.player):
                done = True
                reward = -1.0
            else:
                done = False
                reward = 0.0
        else:  # stick
            done = True
            while self.sum_hand(self.dealer) < 17:
                self.dealer.append(self.draw_card())
            reward = cmp(self.score(self.player), self.score(self.dealer))
        return self._get_observation(), reward, done, {}

    def _get_observation(self):
        player_obs = self.player + [0] * (11 - len(self.player))
        dealer_obs = self.dealer + [0] * (11 - len(self.dealer))
        usable_ace_obs = [1] if self.usable_ace(self.player) else [0]
        return np.array(player_obs + dealer_obs + usable_ace_obs)

    def render(self, mode='human'):
        if mode != 'human':
            raise NotImplementedError()
        return f"Player hand: {self.player}, Dealer hand: {self.dealer}"

    def close(self):
        pass
    def seed(self, seed=None):
        self.np_random, seed = gym.utils.seeding.np_random(seed)
        return [seed]

# Testing the environment to ensure it initializes and steps correctly
env = SimpleBlackjackEnv()
obs = env.reset()
print(env.render())
obs, reward, done, _ = env.step(1)
print(env.render())
obs, reward, done, _ = env.step(0)
print(env.render())

Player hand: [5, 10], Dealer hand: [10, 1]
Player hand: [5, 10, 3], Dealer hand: [10, 1]
Player hand: [5, 10, 3], Dealer hand: [10, 1]


For the first model we will be using a stable_baselines3 algorithm, which will only need to be called and trained. Since the performance with the standard hyperparameters are poor we also will use optuna to find the best hyperparameters in order to increase the win rate of the agent. Note that the trials have been set to 5 because of compoutational and running time purposes, with an higher number of trials optuna will surely find more accurate hyperparameters.

In [15]:
#model training



def evaluate_agent(model, env, num_games=1000):
    wins = 0
    win_rates = []
    
    for i in range(num_games):
        obs = env.reset()
        done = False
        while not done:
            action, _ = model.predict(obs, deterministic=True)
            obs, reward, done, _ = env.step(action)
            if done and reward == 1:
                wins += 1
        if (i+1) % 10 == 0:  # Log win rate every 100 games
            win_rates.append(wins / (i+1))
    
    win_rate_df = pd.DataFrame(win_rates, columns=['WinRate', 'NumGames'])
    win_rate_df.to_csv('win_rate_over_time.csv', index=False)
    return wins / num_games


# Create the environment
env = DummyVecEnv([lambda: SimpleBlackjackEnv()])

# Set hyperparameters
params = {
    'learning_rate': 2.5e-4,
    'n_steps': 256,
    'batch_size': 64,
    'n_epochs': 10,
    'gamma': 0.99,
    'gae_lambda': 0.95,
    'clip_range': 0.2,
    'ent_coef': 1e-4
}

# Instantiate the model
model = PPO('MlpPolicy', env, verbose=1, tensorboard_log="./blackjack_tensorboard/", **params)

# Train the model
model.learn(total_timesteps=100000)

# Evaluate the model
win_rate = evaluate_agent(model, env)
print(f"Win rate: {win_rate:.2f}")


Using cpu device
Logging to ./blackjack_tensorboard/PPO_7
-----------------------------
| time/              |      |
|    fps             | 4847 |
|    iterations      | 1    |
|    time_elapsed    | 0    |
|    total_timesteps | 256  |
-----------------------------
-----------------------------------------
| time/                   |             |
|    fps                  | 3099        |
|    iterations           | 2           |
|    time_elapsed         | 0           |
|    total_timesteps      | 512         |
| train/                  |             |
|    approx_kl            | 0.018533219 |
|    clip_fraction        | 0.266       |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.681      |
|    explained_variance   | -0.103      |
|    learning_rate        | 0.00025     |
|    loss                 | 0.451       |
|    n_updates            | 10          |
|    policy_gradient_loss | -0.0315     |
|    value_loss           | 0.924       |
------------------



-----------------------------------------
| time/                   |             |
|    fps                  | 3254        |
|    iterations           | 3           |
|    time_elapsed         | 0           |
|    total_timesteps      | 768         |
| train/                  |             |
|    approx_kl            | 0.030365176 |
|    clip_fraction        | 0.219       |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.645      |
|    explained_variance   | -0.00614    |
|    learning_rate        | 0.00025     |
|    loss                 | 0.406       |
|    n_updates            | 20          |
|    policy_gradient_loss | -0.0289     |
|    value_loss           | 0.897       |
-----------------------------------------
----------------------------------------
| time/                   |            |
|    fps                  | 3341       |
|    iterations           | 4          |
|    time_elapsed         | 0          |
|    total_timesteps      | 1024       

ValueError: Shape of passed values is (100, 1), indices imply (100, 2)

In [26]:
def simulate_blackjack_games(env, model, num_games=10000):
    action_frequencies = {}
    rewards = []
    results = []

    for game in range(num_games):
        obs = env.reset()
        done = False
        total_reward = 0
        player_actions = []
        player_hand_sums = []

        while not done:
            action, _ = model.predict(obs)
            player_actions.append('Hit' if action == 1 else 'Stick')

            # Define state key
            player_hand = obs[:11][obs[:11] != 0]
            dealer_visible_card = env.dealer[0]
            state_key = (tuple(player_hand), dealer_visible_card)

            # Record action frequencies
            if state_key not in action_frequencies:
                action_frequencies[state_key] = {'Hit': 0, 'Stick': 0}
            action_frequencies[state_key]['Hit' if action == 1 else 'Stick'] += 1

            obs, reward, done, _ = env.step(action)
            total_reward += reward
            player_hand_sums.append(env.sum_hand(player_hand))

        rewards.append(total_reward)

        player_final_hand = obs[:11][obs[:11] != 0]
        dealer_final_hand = obs[11:22][obs[11:22] != 0]

        game_results = {
            'Game': game + 1,
            'PlayerFinalHandSum': env.sum_hand(player_final_hand),
            'DealerFinalHandSum': env.sum_hand(dealer_final_hand),
            'PlayerNumCards': len(player_final_hand),
            'DealerNumCards': len(dealer_final_hand),
            'DealerVisibleCard': dealer_visible_card,
            'PlayerActions': ' '.join(player_actions),
            'PlayerHandProgression': ' '.join(map(str, player_hand_sums)),
            'Outcome': 'Win' if reward > 0 else 'Loss' if reward < 0 else 'Draw'
        }
        results.append(game_results)

    # Export action frequencies and rewards
    action_freq_data = []
    for state, actions in action_frequencies.items():
        player_hand, dealer_card = state
        action_freq_data.append({'PlayerHand': ' '.join(map(str, player_hand)), 
                                 'DealerVisibleCard': dealer_card,
                                 'Hit': actions['Hit'], 
                                 'Stick': actions['Stick']})
    
    action_freq_df = pd.DataFrame(action_freq_data)
    action_freq_df.to_csv('action_frequencies.csv', index=False)
    
    rewards_df = pd.DataFrame(rewards, columns=['Reward'])
    rewards_df.to_csv('rewards_distribution.csv', index=False)

    results_df = pd.DataFrame(results)
    results_df.to_csv('blackjack_results.csv', index=False)

    win_rate = results_df[results_df['Outcome'] == 'Win'].shape[0] / num_games
    print(f"\nAgent won {results_df[results_df['Outcome'] == 'Win'].shape[0]} out of {num_games} games. Win rate: {win_rate * 100:.2f}%")
    return win_rate


# Example usage
env = SimpleBlackjackEnv()
simulate_blackjack_games(env, model)



Agent won 5306 out of 10000 games. Win rate: 53.06%


0.5306

we can see that the agen won at most 45% of the games, which is way less then the expected 49% which is the world's average winning rate. But for this enviroment we must keep in mind that we do not have the possibility to split, even if it would not change things too much, because if we split we will be simply playing 2 hands at the same time, meaning that it will not change the winning rate in most cases.  

CAN'T RUN THE CODE BELOW DUE TO HIGH COMPUTATIONAL POWER REQUIRED


In [27]:
from stable_baselines3 import DQN
from stable_baselines3.dqn.policies import MlpPolicy
from stable_baselines3.common.vec_env import DummyVecEnv
# Create environment
env = DummyVecEnv([lambda: SimpleBlackjackEnv()])

dqn_params = {
    'policy': MlpPolicy,
    'env': env,
    'learning_rate': 1e-3,
    'buffer_size': 50000,
    'learning_starts': 1000,
    'batch_size': 32,
    'tau': 1.0,
    'gamma': 0.99,
    'train_freq': 1,
    'gradient_steps': 1,
    'optimize_memory_usage': False,
    'target_update_interval': 1000,
    'exploration_fraction': 0.1,
    'exploration_initial_eps': 1.0,
    'exploration_final_eps': 0.01,
    'max_grad_norm': 10,
    'verbose': 1,
}

# Create the DQN model
model = DQN(**dqn_params)

# Train the DQN model
model.learn(total_timesteps=50000)

Using cpu device
----------------------------------
| rollout/            |          |
|    exploration_rate | 0.999    |
| time/               |          |
|    episodes         | 4        |
|    fps              | 969      |
|    time_elapsed     | 0        |
|    total_timesteps  | 5        |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration_rate | 0.998    |
| time/               |          |
|    episodes         | 8        |
|    fps              | 1448     |
|    time_elapsed     | 0        |
|    total_timesteps  | 10       |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration_rate | 0.997    |
| time/               |          |
|    episodes         | 12       |
|    fps              | 1880     |
|    time_elapsed     | 0        |
|    total_timesteps  | 15       |
----------------------------------
----------------------------------
| r



----------------------------------
| rollout/            |          |
|    exploration_rate | 0.793    |
| time/               |          |
|    episodes         | 744      |
|    fps              | 5333     |
|    time_elapsed     | 0        |
|    total_timesteps  | 1043     |
| train/              |          |
|    learning_rate    | 0.001    |
|    loss             | 0.279    |
|    n_updates        | 42       |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration_rate | 0.793    |
| time/               |          |
|    episodes         | 748      |
|    fps              | 5272     |
|    time_elapsed     | 0        |
|    total_timesteps  | 1047     |
| train/              |          |
|    learning_rate    | 0.001    |
|    loss             | 0.375    |
|    n_updates        | 46       |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration_rat

<stable_baselines3.dqn.dqn.DQN at 0x179432c50>

In [28]:
def simulate_blackjack_games(env, model, num_games=10000):
    action_frequencies = {}
    rewards = []
    results = []

    for game in range(num_games):
        obs = env.reset()
        done = False
        total_reward = 0
        player_actions = []
        player_hand_sums = []

        while not done:
            action, _ = model.predict(obs)
            player_actions.append('Hit' if action == 1 else 'Stick')

            # Define state key
            player_hand = obs[:11][obs[:11] != 0]
            dealer_visible_card = env.dealer[0]
            state_key = (tuple(player_hand), dealer_visible_card)

            # Record action frequencies
            if state_key not in action_frequencies:
                action_frequencies[state_key] = {'Hit': 0, 'Stick': 0}
            action_frequencies[state_key]['Hit' if action == 1 else 'Stick'] += 1

            obs, reward, done, _ = env.step(action)
            total_reward += reward
            player_hand_sums.append(env.sum_hand(player_hand))

        rewards.append(total_reward)

        player_final_hand = obs[:11][obs[:11] != 0]
        dealer_final_hand = obs[11:22][obs[11:22] != 0]

        game_results = {
            'Game': game + 1,
            'PlayerFinalHandSum': env.sum_hand(player_final_hand),
            'DealerFinalHandSum': env.sum_hand(dealer_final_hand),
            'PlayerNumCards': len(player_final_hand),
            'DealerNumCards': len(dealer_final_hand),
            'DealerVisibleCard': dealer_visible_card,
            'PlayerActions': ' '.join(player_actions),
            'PlayerHandProgression': ' '.join(map(str, player_hand_sums)),
            'Outcome': 'Win' if reward > 0 else 'Loss' if reward < 0 else 'Draw'
        }
        results.append(game_results)

    # Export action frequencies and rewards
    action_freq_data = []
    for state, actions in action_frequencies.items():
        player_hand, dealer_card = state
        action_freq_data.append({'PlayerHand': ' '.join(map(str, player_hand)), 
                                 'DealerVisibleCard': dealer_card,
                                 'Hit': actions['Hit'], 
                                 'Stick': actions['Stick']})
    
    action_freq_df = pd.DataFrame(action_freq_data)
    action_freq_df.to_csv('action_frequencies.csv', index=False)
    
    rewards_df = pd.DataFrame(rewards, columns=['Reward'])
    rewards_df.to_csv('rewards_distribution.csv', index=False)

    results_df = pd.DataFrame(results)
    results_df.to_csv('blackjack_results.csv', index=False)

    win_rate = results_df[results_df['Outcome'] == 'Win'].shape[0] / num_games
    print(f"\nAgent won {results_df[results_df['Outcome'] == 'Win'].shape[0]} out of {num_games} games. Win rate: {win_rate * 100:.2f}%")
    return win_rate


# Example usage
env = SimpleBlackjackEnv()
simulate_blackjack_games(env, model)


Agent won 5458 out of 10000 games. Win rate: 54.58%


0.5458

In [None]:
from stable_baselines3 import DQN
from gym.wrappers import monitoring
from stable_baselines3.common.callbacks import CheckpointCallback, EvalCallback
from stable_baselines3.common.vec_env import DummyVecEnv
import matplotlib.pyplot as plt
env = DummyVecEnv([lambda: SimpleBlackjackEnv()])

model = DQN.load("./RisultatiAlessio/dqn_blackjack")
def test_agent_dqn(model, num_games=10000):
    action_mapping = {0: "Stick", 1: "Hit"}
    won_games = 0
    for _ in range(num_games):
        obs = env.reset()
        done = False
        print("\nStarting a new game...")
        
        while not done:
            player_hand_length = np.sum(obs[0][:11].astype(int) != 0)
            dealer_hand_length = np.sum(obs[0][11:22].astype(int) != 0)
            
            player_hand = obs[0][:11][:player_hand_length]
            dealer_hand = obs[0][11:22][:dealer_hand_length]
            
            print(f"Player's hand: {player_hand}")
            print(f"Dealer's visible card: {dealer_hand[0]}")
            
            action, _ = model.predict(obs, deterministic=True)
            print(f"Agent's action: {action_mapping[int(action)]}")  
            
            obs, reward, done, _ = env.step(action)
        
        print(f"Player's final hand: {player_hand}")
        print(f"Dealer's final hand: {dealer_hand}")
        
        if reward > 0:
            print("Result: Won!")
            won_games += 1
        elif reward < 0:
            print("Result: Lost!")
        else:
            print("Result: Draw!")
        print('-'*40)
    print(f"Agent won {won_games} out of {num_games} games.")
    return won_games  # Return the number of won games

# Test the DQN model
test_agent_dqn(model)

In [None]:
model = PPO.load("./RisultatiAlessio/ppo_blackjack")
def test_agent_verbose(model, num_games=10000):
    action_mapping = {0: "Stick", 1: "Hit"}
    won_games = 0
    for _ in range(num_games):
        obs = env.reset()
        done = False
        print("\nStarting a new game...")
        
        while not done:
            player_hand_length = np.sum(obs[0][:11].astype(int) != 0)
            dealer_hand_length = np.sum(obs[0][11:22].astype(int) != 0)
            
            player_hand = obs[0][:11][:player_hand_length]
            dealer_hand = obs[0][11:22][:dealer_hand_length]
            
            print(f"Player's hand: {player_hand}")
            print(f"Dealer's visible card: {dealer_hand[0]}")
            
            action, _ = model.predict(obs)
            print(f"Agent's action: {action_mapping[int(action)]}")  
            
            obs, reward, done, _ = env.step(action)
        
        print(f"Player's final hand: {player_hand}")
        print(f"Dealer's final hand: {dealer_hand}")
        
        if reward > 0:
            print("Result: Won!")
            won_games += 1
        elif reward < 0:
            print("Result: Lost!")
        else:
            print("Result: Draw!")
        print('-'*40)
    print(f"Agent won {won_games} out of {num_games} games.")
    return won_games  # Return the number of won games

def simulate_games(model, num_simulations=10, games_per_simulation=[100, 500, 1000, 5000, 10000]):
    win_rates = []
    for num_games in games_per_simulation:
        total_wins = 0
        for _ in range(num_simulations):
            won_games = test_agent_verbose(model, num_games)  # Assumes test_agent returns the number of won games
            if won_games is not None:  # Check if won_games is not None before adding to total_wins
                total_wins += won_games
        average_win_rate = total_wins / (num_games * num_simulations)
        win_rates.append(average_win_rate)
        print(f"Average winning rate for {num_games} games: {average_win_rate * 100}%")
    return win_rates
simulate_games(model)