We first initiate the required libraries for the project

In [1]:
import random
import gym
from gym import spaces
from gym.wrappers import monitoring
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3 import DQN
from stable_baselines3.common.callbacks import CheckpointCallback, EvalCallback
from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.callbacks import EvalCallback
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.express as px

Now we create the enviroment in which the agent will be trained. Note that the agent will receive a positive reward only in case of win (+1), in case of draw the reward will be 0 and in case of lose the reward will be negative.

In [23]:
#Env_1
def cmp(a, b):
    if a > b:
        return 1
    elif a < b:
        return -1
    else:
        return 0

class SimpleBlackjackEnv(gym.Env):
    metadata = {'render.modes': ['human']}
    
    def __init__(self):
        self.reward_range = (-np.inf, np.inf)
        super(SimpleBlackjackEnv, self).__init__()
        self.deck = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 10, 10, 10] * 4  # a full deck
        random.shuffle(self.deck)  # shuffle the deck
        self.action_space = spaces.Discrete(2)
        self.observation_space = spaces.Box(low=0, high=11, shape=(23,), dtype=int)
        
    def draw_card(self):
        return self.deck.pop()
        
    def draw_hand(self):
        return [self.draw_card(), self.draw_card()]

    def usable_ace(self, hand):
        return 1 in hand and sum(hand) + 10 <= 21

    def sum_hand(self, hand):
        if self.usable_ace(hand):
            return sum(hand) + 10
        return sum(hand)

    def is_bust(self, hand):
        return self.sum_hand(hand) > 21

    def score(self, hand):
        return 0 if self.is_bust(hand) else self.sum_hand(hand)

    def reset(self):
        if len(self.deck) < 15:
            self.deck = [1, 2, 3, 4, 5, 6, 7, 8, 9,
                         10, 10, 10, 10] * 4
            random.shuffle(self.deck)
        self.dealer = self.draw_hand()
        self.player = self.draw_hand()
        return self._get_observation()

    def step(self, action):
        assert self.action_space.contains(action)
        if action == 1:  # hit
            self.player.append(self.draw_card())
            if self.is_bust(self.player):
                done = True
                reward = -1.0
            else:
                done = False
                reward = 0.0
        else:  # stick
            done = True
            while self.sum_hand(self.dealer) < 17:
                self.dealer.append(self.draw_card())
            reward = cmp(self.score(self.player), self.score(self.dealer))
        return self._get_observation(), reward, done, {}

    def _get_observation(self):
        player_obs = self.player + [0] * (11 - len(self.player))
        dealer_obs = self.dealer + [0] * (11 - len(self.dealer))
        usable_ace_obs = [1] if self.usable_ace(self.player) else [0]
        return np.array(player_obs + dealer_obs + usable_ace_obs)

    def render(self, mode='human'):
        if mode != 'human':
            raise NotImplementedError()
        return f"Player hand: {self.player}, Dealer hand: {self.dealer}"

    def close(self):
        pass
    def seed(self, seed=None):
        self.np_random, seed = gym.utils.seeding.np_random(seed)
        return [seed]

# Testing the environment to ensure it initializes and steps correctly
env = SimpleBlackjackEnv()
obs = env.reset()
print(env.render())
obs, reward, done, _ = env.step(1)
print(env.render())
obs, reward, done, _ = env.step(0)
print(env.render())

Player hand: [10, 10], Dealer hand: [8, 1]
Player hand: [10, 10, 4], Dealer hand: [8, 1]
Player hand: [10, 10, 4], Dealer hand: [8, 1]


We now will train 3 different PPO models, with the same hyperparameters but different total_timesteps.

In [None]:
#500k model
def evaluate_agent(model, env, num_games=1000):
    wins = 0
    win_rates = []
    num_games_list = []  # List to store the number of games after each logging interval

    for i in range(num_games):
        obs = env.reset()
        done = False
        while not done:
            action, _ = model.predict(obs, deterministic=True)
            obs, reward, done, _ = env.step(action)
            if done and reward == 1:
                wins += 1
        if (i + 1) % 100 == 0:  # Log win rate every 100 games
            win_rates.append(wins / (i + 1))
            num_games_list.append(i + 1)  # Append the number of games played so far

    # Create a DataFrame with both win rates and number of games
    win_rate_df = pd.DataFrame({'WinRate': win_rates, 'NumGames': num_games_list})
    win_rate_df.to_csv('PPO500k_win_rate_over_time.csv', index=False)
    
    return wins / num_games


# Create the environment
env = DummyVecEnv([lambda: SimpleBlackjackEnv()])

# Set hyperparameters
params = {
    'learning_rate': 2.5e-4,
    'n_steps': 256,
    'batch_size': 64,
    'n_epochs': 10,
    'gamma': 0.99,
    'gae_lambda': 0.95,
    'clip_range': 0.2,
    'ent_coef': 1e-4
}

# Instantiate the model
model = PPO('MlpPolicy', env, verbose=1, tensorboard_log="./blackjack_tensorboard/", **params)

# Train the model
model.learn(total_timesteps=500000)

# Evaluate the model
win_rate = evaluate_agent(model, env)

def simulate_blackjack_games(env, model, num_games=10000):
    action_frequencies = {}
    rewards = []
    results = []

    for game in range(num_games):
        obs = env.reset()
        done = False
        total_reward = 0
        player_actions = []
        player_hand_sums = []

        while not done:
            action, _ = model.predict(obs)
            player_actions.append('Hit' if action == 1 else 'Stick')

            # Define state key
            player_hand = obs[:11][obs[:11] != 0]
            dealer_visible_card = env.dealer[0]
            state_key = (tuple(player_hand), dealer_visible_card)

            # Record action frequencies
            if state_key not in action_frequencies:
                action_frequencies[state_key] = {'Hit': 0, 'Stick': 0}
            action_frequencies[state_key]['Hit' if action == 1 else 'Stick'] += 1

            obs, reward, done, _ = env.step(action)
            total_reward += reward
            player_hand_sums.append(env.sum_hand(player_hand))

        rewards.append(total_reward)

        player_final_hand = obs[:11][obs[:11] != 0]
        dealer_final_hand = obs[11:22][obs[11:22] != 0]

        game_results = {
            'Game': game + 1,
            'PlayerFinalHandSum': env.sum_hand(player_final_hand),
            'DealerFinalHandSum': env.sum_hand(dealer_final_hand),
            'PlayerNumCards': len(player_final_hand),
            'DealerNumCards': len(dealer_final_hand),
            'DealerVisibleCard': dealer_visible_card,
            'PlayerActions': ' '.join(player_actions),
            'PlayerHandProgression': ' '.join(map(str, player_hand_sums)),
            'Outcome': 'Win' if reward > 0 else 'Loss' if reward < 0 else 'Draw'
        }
        results.append(game_results)

    # Export action frequencies and rewards
    action_freq_data = []
    for state, actions in action_frequencies.items():
        player_hand, dealer_card = state
        action_freq_data.append({'PlayerHand': ' '.join(map(str, player_hand)), 
                                 'DealerVisibleCard': dealer_card,
                                 'Hit': actions['Hit'], 
                                 'Stick': actions['Stick']})
    
    action_freq_df = pd.DataFrame(action_freq_data)
    action_freq_df.to_csv('PPO500k_action_frequencies.csv', index=False)
    
    rewards_df = pd.DataFrame(rewards, columns=['Reward'])
    rewards_df.to_csv('PPO500k_rewards_distribution.csv', index=False)

    results_df = pd.DataFrame(results)
    results_df.to_csv('PPO500k_blackjack_results.csv', index=False)

    win_rate = results_df[results_df['Outcome'] == 'Win'].shape[0] / num_games
    print(f"\nAgent won {results_df[results_df['Outcome'] == 'Win'].shape[0]} out of {num_games} games. Win rate: {win_rate * 100:.2f}%")
    return win_rate

model.save("PPO500k")
env = SimpleBlackjackEnv()
simulate_blackjack_games(env, model)
print(f"Win rate: {win_rate:.2f}")

Training the agent for 1M timesteps


In [None]:
#1M model
def evaluate_agent(model, env, num_games=1000):
    wins = 0
    win_rates = []
    num_games_list = []  # List to store the number of games after each logging interval

    for i in range(num_games):
        obs = env.reset()
        done = False
        while not done:
            action, _ = model.predict(obs, deterministic=True)
            obs, reward, done, _ = env.step(action)
            if done and reward == 1:
                wins += 1
        if (i + 1) % 100 == 0:  # Log win rate every 100 games
            win_rates.append(wins / (i + 1))
            num_games_list.append(i + 1)  # Append the number of games played so far

    # Create a DataFrame with both win rates and number of games
    win_rate_df = pd.DataFrame({'WinRate': win_rates, 'NumGames': num_games_list})
    win_rate_df.to_csv('PPO1M_win_rate_over_time.csv', index=False)
    
    return wins / num_games


# Create the environment
env = DummyVecEnv([lambda: SimpleBlackjackEnv()])

# Set hyperparameters
params = {
    'learning_rate': 2.5e-4,
    'n_steps': 256,
    'batch_size': 64,
    'n_epochs': 10,
    'gamma': 0.99,
    'gae_lambda': 0.95,
    'clip_range': 0.2,
    'ent_coef': 1e-4
}

# Instantiate the model
model = PPO('MlpPolicy', env, verbose=1, tensorboard_log="./blackjack_tensorboard/", **params)

# Train the model
model.learn(total_timesteps=100)

# Evaluate the model
win_rate = evaluate_agent(model, env)

def simulate_blackjack_games(env, model, num_games=10000):
    action_frequencies = {}
    rewards = []
    results = []

    for game in range(num_games):
        obs = env.reset()
        done = False
        total_reward = 0
        player_actions = []
        player_hand_sums = []

        while not done:
            action, _ = model.predict(obs)
            player_actions.append('Hit' if action == 1 else 'Stick')

            # Define state key
            player_hand = obs[:11][obs[:11] != 0]
            dealer_visible_card = env.dealer[0]
            state_key = (tuple(player_hand), dealer_visible_card)

            # Record action frequencies
            if state_key not in action_frequencies:
                action_frequencies[state_key] = {'Hit': 0, 'Stick': 0}
            action_frequencies[state_key]['Hit' if action == 1 else 'Stick'] += 1

            obs, reward, done, _ = env.step(action)
            total_reward += reward
            player_hand_sums.append(env.sum_hand(player_hand))

        rewards.append(total_reward)

        player_final_hand = obs[:11][obs[:11] != 0]
        dealer_final_hand = obs[11:22][obs[11:22] != 0]

        game_results = {
            'Game': game + 1,
            'PlayerFinalHandSum': env.sum_hand(player_final_hand),
            'DealerFinalHandSum': env.sum_hand(dealer_final_hand),
            'PlayerNumCards': len(player_final_hand),
            'DealerNumCards': len(dealer_final_hand),
            'DealerVisibleCard': dealer_visible_card,
            'PlayerActions': ' '.join(player_actions),
            'PlayerHandProgression': ' '.join(map(str, player_hand_sums)),
            'Outcome': 'Win' if reward > 0 else 'Loss' if reward < 0 else 'Draw',
            'TotalReward': total_reward  # Add total reward for each game
        }
        results.append(game_results)

    # Export action frequencies and rewards
    action_freq_data = []
    for state, actions in action_frequencies.items():
        player_hand, dealer_card = state
        action_freq_data.append({'PlayerHand': ' '.join(map(str, player_hand)), 
                                 'DealerVisibleCard': dealer_card,
                                 'Hit': actions['Hit'], 
                                 'Stick': actions['Stick'],
                                 'TotalActions': actions['Hit'] + actions['Stick']})  # Add total actions for each state

    action_freq_df = pd.DataFrame(action_freq_data)
    action_freq_df.to_csv('PPO1M_action_frequencies.csv', index=False)
    
    rewards_df = pd.DataFrame(rewards, columns=['Reward'])
    rewards_df.to_csv('PPO1M_rewards_distribution.csv', index=False)

    results_df = pd.DataFrame(results)
    results_df.to_csv('PPO1M_blackjack_results.csv', index=False)

    win_rate = results_df[results_df['Outcome'] == 'Win'].shape[0] / num_games
    print(f"\nAgent won {results_df[results_df['Outcome'] == 'Win'].shape[0]} out of {num_games} games. Win rate: {win_rate * 100:.2f}%")
    return win_rate

model.save("PPO1M")
env = SimpleBlackjackEnv()
simulate_blackjack_games(env, model)
print(f"Win rate: {win_rate:.2f}")

Now we want to create a new environment that will reward the agent a bit for asking a new card and not busting, but will penalize him a lot for asking and busting, correspondigly: +0.5, -1.5. Then we will want to plot the new results to see how the agen treact to such a change in the playstyle.

In [19]:
#Env_2
def cmp(a, b):
    if a > b:
        return 1
    elif a < b:
        return -1
    else:
        return 0

class SimpleBlackjackEnv(gym.Env):
    metadata = {'render.modes': ['human']}
    
    def __init__(self):
        self.reward_range = (-np.inf, np.inf)
        super(SimpleBlackjackEnv, self).__init__()
        self.deck = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 10, 10, 10] * 4  # a full deck
        random.shuffle(self.deck)  # shuffle the deck
        self.action_space = spaces.Discrete(2)
        self.observation_space = spaces.Box(low=0, high=11, shape=(23,), dtype=int)
        
    def draw_card(self):
        return self.deck.pop()
        
    def draw_hand(self):
        return [self.draw_card(), self.draw_card()]

    def usable_ace(self, hand):
        return 1 in hand and sum(hand) + 10 <= 21

    def sum_hand(self, hand):
        if self.usable_ace(hand):
            return sum(hand) + 10
        return sum(hand)

    def is_bust(self, hand):
        return self.sum_hand(hand) > 21

    def score(self, hand):
        return 0 if self.is_bust(hand) else self.sum_hand(hand)

    def reset(self):
        if len(self.deck) < 15:
            self.deck = [1, 2, 3, 4, 5, 6, 7, 8, 9,
                         10, 10, 10, 10] * 4
            random.shuffle(self.deck)
        self.dealer = self.draw_hand()
        self.player = self.draw_hand()
        return self._get_observation()

    def step(self, action):
        assert self.action_space.contains(action)
        if action == 1:  # hit
            self.player.append(self.draw_card())
            if self.is_bust(self.player):
                done = True
                reward = -2.5
            else:
                done = False
                reward = 1
        else:  # stick
            done = True
            while self.sum_hand(self.dealer) < 17:
                self.dealer.append(self.draw_card())
            reward = cmp(self.score(self.player), self.score(self.dealer))
        return self._get_observation(), reward, done, {}

    def _get_observation(self):
        player_obs = self.player + [0] * (11 - len(self.player))
        dealer_obs = self.dealer + [0] * (11 - len(self.dealer))
        usable_ace_obs = [1] if self.usable_ace(self.player) else [0]
        return np.array(player_obs + dealer_obs + usable_ace_obs)

    def render(self, mode='human'):
        if mode != 'human':
            raise NotImplementedError()
        return f"Player hand: {self.player}, Dealer hand: {self.dealer}"

    def close(self):
        pass
    def seed(self, seed=None):
        self.np_random, seed = gym.utils.seeding.np_random(seed)
        return [seed]

# Testing the environment to ensure it initializes and steps correctly
env = SimpleBlackjackEnv()
obs = env.reset()
print(env.render())
obs, reward, done, _ = env.step(1)
print(env.render())
obs, reward, done, _ = env.step(0)
print(env.render())

Player hand: [7, 10], Dealer hand: [4, 10]
Player hand: [7, 10, 10], Dealer hand: [4, 10]
Player hand: [7, 10, 10], Dealer hand: [4, 10, 2, 8]


In [20]:
#500k(draw) model
def evaluate_agent(model, env, num_games=1000):
    wins = 0
    win_rates = []
    num_games_list = []  # List to store the number of games after each logging interval

    for i in range(num_games):
        obs = env.reset()
        done = False
        while not done:
            action, _ = model.predict(obs, deterministic=True)
            obs, reward, done, _ = env.step(action)
            if done and reward == 1:
                wins += 1
        if (i + 1) % 100 == 0:  # Log win rate every 100 games
            win_rates.append(wins / (i + 1))
            num_games_list.append(i + 1)  # Append the number of games played so far

    # Create a DataFrame with both win rates and number of games
    win_rate_df = pd.DataFrame({'WinRate': win_rates, 'NumGames': num_games_list})
    win_rate_df.to_csv('PPO500k(draw)_win_rate_over_time.csv', index=False)
    
    return wins / num_games


# Create the environment
env = DummyVecEnv([lambda: SimpleBlackjackEnv()])

# Set hyperparameters
params = {
    'learning_rate': 2.5e-4,
    'n_steps': 256,
    'batch_size': 64,
    'n_epochs': 10,
    'gamma': 0.99,
    'gae_lambda': 0.95,
    'clip_range': 0.2,
    'ent_coef': 1e-4
}

# Instantiate the model
model = PPO('MlpPolicy', env, verbose=1, tensorboard_log="./blackjack_tensorboard/", **params)

# Train the model
model.learn(total_timesteps=500000)

# Evaluate the model
win_rate = evaluate_agent(model, env)

def simulate_blackjack_games(env, model, num_games=10000):
    action_frequencies = {}
    rewards = []
    results = []

    for game in range(num_games):
        obs = env.reset()
        done = False
        total_reward = 0
        player_actions = []
        player_hand_sums = []

        while not done:
            action, _ = model.predict(obs)
            player_actions.append('Hit' if action == 1 else 'Stick')

            # Define state key
            player_hand = obs[:11][obs[:11] != 0]
            dealer_visible_card = env.dealer[0]
            state_key = (tuple(player_hand), dealer_visible_card)

            # Record action frequencies
            if state_key not in action_frequencies:
                action_frequencies[state_key] = {'Hit': 0, 'Stick': 0}
            action_frequencies[state_key]['Hit' if action == 1 else 'Stick'] += 1

            obs, reward, done, _ = env.step(action)
            total_reward += reward
            player_hand_sums.append(env.sum_hand(player_hand))

        rewards.append(total_reward)

        player_final_hand = obs[:11][obs[:11] != 0]
        dealer_final_hand = obs[11:22][obs[11:22] != 0]

        game_results = {
            'Game': game + 1,
            'PlayerFinalHandSum': env.sum_hand(player_final_hand),
            'DealerFinalHandSum': env.sum_hand(dealer_final_hand),
            'PlayerNumCards': len(player_final_hand),
            'DealerNumCards': len(dealer_final_hand),
            'DealerVisibleCard': dealer_visible_card,
            'PlayerActions': ' '.join(player_actions),
            'PlayerHandProgression': ' '.join(map(str, player_hand_sums)),
            'Outcome': 'Win' if reward > 0 else 'Loss' if reward < 0 else 'Draw'
        }
        results.append(game_results)

    # Export action frequencies and rewards
    action_freq_data = []
    for state, actions in action_frequencies.items():
        player_hand, dealer_card = state
        action_freq_data.append({'PlayerHand': ' '.join(map(str, player_hand)), 
                                 'DealerVisibleCard': dealer_card,
                                 'Hit': actions['Hit'], 
                                 'Stick': actions['Stick']})
    
    action_freq_df = pd.DataFrame(action_freq_data)
    action_freq_df.to_csv('PPO500k(draw)_action_frequencies.csv', index=False)
    
    rewards_df = pd.DataFrame(rewards, columns=['Reward'])
    rewards_df.to_csv('PPO500k(draw)_rewards_distribution.csv', index=False)

    results_df = pd.DataFrame(results)
    results_df.to_csv('PPO500k(draw)_blackjack_results.csv', index=False)

    win_rate = results_df[results_df['Outcome'] == 'Win'].shape[0] / num_games
    print(f"\nAgent won {results_df[results_df['Outcome'] == 'Win'].shape[0]} out of {num_games} games. Win rate: {win_rate * 100:.2f}%")
    return win_rate

model.save("PPO500k(draw)")
env = SimpleBlackjackEnv()
simulate_blackjack_games(env, model)
print(f"Win rate: {win_rate:.2f}")

Using cpu device
Logging to ./blackjack_tensorboard/PPO_5
-----------------------------
| time/              |      |
|    fps             | 2512 |
|    iterations      | 1    |
|    time_elapsed    | 0    |
|    total_timesteps | 256  |
-----------------------------



You provided an OpenAI Gym environment. We strongly recommend transitioning to Gymnasium environments. Stable-Baselines3 is automatically wrapping your environments in a compatibility layer, which could potentially cause issues.



-----------------------------------------
| time/                   |             |
|    fps                  | 1789        |
|    iterations           | 2           |
|    time_elapsed         | 0           |
|    total_timesteps      | 512         |
| train/                  |             |
|    approx_kl            | 0.015161392 |
|    clip_fraction        | 0.196       |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.682      |
|    explained_variance   | -0.0253     |
|    learning_rate        | 0.00025     |
|    loss                 | 0.754       |
|    n_updates            | 10          |
|    policy_gradient_loss | -0.031      |
|    value_loss           | 2.03        |
-----------------------------------------
-----------------------------------------
| time/                   |             |
|    fps                  | 1918        |
|    iterations           | 3           |
|    time_elapsed         | 0           |
|    total_timesteps      | 768   

In [None]:
#1M model(draw)
def evaluate_agent(model, env, num_games=1000):
    wins = 0
    win_rates = []
    num_games_list = []  # List to store the number of games after each logging interval

    for i in range(num_games):
        obs = env.reset()
        done = False
        while not done:
            action, _ = model.predict(obs, deterministic=True)
            obs, reward, done, _ = env.step(action)
            if done and reward == 1:
                wins += 1
        if (i + 1) % 100 == 0:  # Log win rate every 100 games
            win_rates.append(wins / (i + 1))
            num_games_list.append(i + 1)  # Append the number of games played so far

    # Create a DataFrame with both win rates and number of games
    win_rate_df = pd.DataFrame({'WinRate': win_rates, 'NumGames': num_games_list})
    win_rate_df.to_csv('PPO1M(draw)_win_rate_over_time.csv', index=False)
    
    return wins / num_games


# Create the environment
env = DummyVecEnv([lambda: SimpleBlackjackEnv()])

# Set hyperparameters
params = {
    'learning_rate': 2.5e-4,
    'n_steps': 256,
    'batch_size': 64,
    'n_epochs': 10,
    'gamma': 0.99,
    'gae_lambda': 0.95,
    'clip_range': 0.2,
    'ent_coef': 1e-4
}

# Instantiate the model
model = PPO('MlpPolicy', env, verbose=1, tensorboard_log="./blackjack_tensorboard/", **params)

# Train the model
model.learn(total_timesteps=500000)

# Evaluate the model
win_rate = evaluate_agent(model, env)

def simulate_blackjack_games(env, model, num_games=10000):
    action_frequencies = {}
    rewards = []
    results = []

    for game in range(num_games):
        obs = env.reset()
        done = False
        total_reward = 0
        player_actions = []
        player_hand_sums = []

        while not done:
            action, _ = model.predict(obs)
            player_actions.append('Hit' if action == 1 else 'Stick')

            # Define state key
            player_hand = obs[:11][obs[:11] != 0]
            dealer_visible_card = env.dealer[0]
            state_key = (tuple(player_hand), dealer_visible_card)

            # Record action frequencies
            if state_key not in action_frequencies:
                action_frequencies[state_key] = {'Hit': 0, 'Stick': 0}
            action_frequencies[state_key]['Hit' if action == 1 else 'Stick'] += 1

            obs, reward, done, _ = env.step(action)
            total_reward += reward
            player_hand_sums.append(env.sum_hand(player_hand))

        rewards.append(total_reward)

        player_final_hand = obs[:11][obs[:11] != 0]
        dealer_final_hand = obs[11:22][obs[11:22] != 0]

        game_results = {
            'Game': game + 1,
            'PlayerFinalHandSum': env.sum_hand(player_final_hand),
            'DealerFinalHandSum': env.sum_hand(dealer_final_hand),
            'PlayerNumCards': len(player_final_hand),
            'DealerNumCards': len(dealer_final_hand),
            'DealerVisibleCard': dealer_visible_card,
            'PlayerActions': ' '.join(player_actions),
            'PlayerHandProgression': ' '.join(map(str, player_hand_sums)),
            'Outcome': 'Win' if reward > 0 else 'Loss' if reward < 0 else 'Draw'
        }
        results.append(game_results)

    # Export action frequencies and rewards
    action_freq_data = []
    for state, actions in action_frequencies.items():
        player_hand, dealer_card = state
        action_freq_data.append({'PlayerHand': ' '.join(map(str, player_hand)), 
                                 'DealerVisibleCard': dealer_card,
                                 'Hit': actions['Hit'], 
                                 'Stick': actions['Stick']})
    
    action_freq_df = pd.DataFrame(action_freq_data)
    action_freq_df.to_csv('PPO1M(draw)_action_frequencies.csv', index=False)
    
    rewards_df = pd.DataFrame(rewards, columns=['Reward'])
    rewards_df.to_csv('PPO1M(draw)_rewards_distribution.csv', index=False)

    results_df = pd.DataFrame(results)
    results_df.to_csv('PPO1M(draw)_blackjack_results.csv', index=False)

    win_rate = results_df[results_df['Outcome'] == 'Win'].shape[0] / num_games
    print(f"\nAgent won {results_df[results_df['Outcome'] == 'Win'].shape[0]} out of {num_games} games. Win rate: {win_rate * 100:.2f}%")
    return win_rate

model.save("PPO500k(draw)")
env = SimpleBlackjackEnv()
simulate_blackjack_games(env, model)
print(f"Win rate: {win_rate:.2f}")

In [8]:
#500k DQN model
def evaluate_agent(model, env, num_games=1000):
    wins = 0
    win_rates = []
    num_games_list = []  # List to store the number of games after each logging interval

    for i in range(num_games):
        obs = env.reset()
        done = False
        while not done:
            action, _ = model.predict(obs, deterministic=True)
            obs, reward, done, _ = env.step(action)
            if done and reward == 1:
                wins += 1
        if (i + 1) % 100 == 0:  # Log win rate every 100 games
            win_rates.append(wins / (i + 1))
            num_games_list.append(i + 1)  # Append the number of games played so far

    # Create a DataFrame with both win rates and number of games
    win_rate_df = pd.DataFrame({'WinRate': win_rates, 'NumGames': num_games_list})
    win_rate_df.to_csv('DQN500k_win_rate_over_time.csv', index=False)
    
    return wins / num_games


# Create the environment
env = DummyVecEnv([lambda: SimpleBlackjackEnv()])



model = DQN(
    "MlpPolicy",
    env,
    learning_rate=1e-3,
    buffer_size=10000,
    learning_starts=1000,
    batch_size=32,
    tau=1.0,
    gamma=0.99,
    train_freq=4,
    gradient_steps=1,
    target_update_interval=1000,
    exploration_fraction=0.1,
    exploration_final_eps=0.02,
    max_grad_norm=10,
    tensorboard_log="./blackjack_dqn_tensorboard/",
    verbose=1
)

model.learn(total_timesteps = 50000)

# Evaluate the model
win_rate = evaluate_agent(model, env)

def simulate_blackjack_games(env, model, num_games=10000):
    action_frequencies = {}
    rewards = []
    results = []

    for game in range(num_games):
        obs = env.reset()
        done = False
        total_reward = 0
        player_actions = []
        player_hand_sums = []

        while not done:
            action, _ = model.predict(obs)
            player_actions.append('Hit' if action == 1 else 'Stick')

            # Define state key
            player_hand = obs[:11][obs[:11] != 0]
            dealer_visible_card = env.dealer[0]
            state_key = (tuple(player_hand), dealer_visible_card)

            # Record action frequencies
            if state_key not in action_frequencies:
                action_frequencies[state_key] = {'Hit': 0, 'Stick': 0}
            action_frequencies[state_key]['Hit' if action == 1 else 'Stick'] += 1

            obs, reward, done, _ = env.step(action)
            total_reward += reward
            player_hand_sums.append(env.sum_hand(player_hand))

        rewards.append(total_reward)

        player_final_hand = obs[:11][obs[:11] != 0]
        dealer_final_hand = obs[11:22][obs[11:22] != 0]

        game_results = {
            'Game': game + 1,
            'PlayerFinalHandSum': env.sum_hand(player_final_hand),
            'DealerFinalHandSum': env.sum_hand(dealer_final_hand),
            'PlayerNumCards': len(player_final_hand),
            'DealerNumCards': len(dealer_final_hand),
            'DealerVisibleCard': dealer_visible_card,
            'PlayerActions': ' '.join(player_actions),
            'PlayerHandProgression': ' '.join(map(str, player_hand_sums)),
            'Outcome': 'Win' if reward > 0 else 'Loss' if reward < 0 else 'Draw'
        }
        results.append(game_results)

    # Export action frequencies and rewards
    action_freq_data = []
    for state, actions in action_frequencies.items():
        player_hand, dealer_card = state
        action_freq_data.append({'PlayerHand': ' '.join(map(str, player_hand)), 
                                 'DealerVisibleCard': dealer_card,
                                 'Hit': actions['Hit'], 
                                 'Stick': actions['Stick']})
    
    action_freq_df = pd.DataFrame(action_freq_data)
    action_freq_df.to_csv('DQN500k_action_frequencies.csv', index=False)
    
    rewards_df = pd.DataFrame(rewards, columns=['Reward'])
    rewards_df.to_csv('DQN500k_rewards_distribution.csv', index=False)

    results_df = pd.DataFrame(results)
    results_df.to_csv('DQN500k_blackjack_results.csv', index=False)

    win_rate = results_df[results_df['Outcome'] == 'Win'].shape[0] / num_games
    print(f"\nAgent won {results_df[results_df['Outcome'] == 'Win'].shape[0]} out of {num_games} games. Win rate: {win_rate * 100:.2f}%")
    return win_rate

model.save("DQN500k")
env = SimpleBlackjackEnv()
simulate_blackjack_games(env, model)
print(f"Win rate: {win_rate:.2f}")



Using cpu device
Logging to ./blackjack_dqn_tensorboard/DQN_2
----------------------------------
| rollout/            |          |
|    exploration_rate | 0.999    |
| time/               |          |
|    episodes         | 4        |
|    fps              | 427      |
|    time_elapsed     | 0        |
|    total_timesteps  | 5        |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration_rate | 0.998    |
| time/               |          |
|    episodes         | 8        |
|    fps              | 701      |
|    time_elapsed     | 0        |
|    total_timesteps  | 9        |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration_rate | 0.997    |
| time/               |          |
|    episodes         | 12       |
|    fps              | 1038     |
|    time_elapsed     | 0        |
|    total_timesteps  | 14       |
----------------------------

In [3]:

df1 = pd.read_csv("/Users/tommasoagudio/Documents/GitHub/GamblingRL/BlackJack_noBudget/PPO500k/PPO500k_blackjack_results.csv")
df2 = pd.read_csv("/Users/tommasoagudio/Documents/GitHub/GamblingRL/BlackJack_noBudget/PPO1M/PPO1M_blackjack_results.csv")
df3 = pd.read_csv("/Users/tommasoagudio/Documents/GitHub/GamblingRL/BlackJack_noBudget/PPO500k(draw)/PPO500k(draw)_blackjack_results.csv")

outcome_counts_df1 = df1['Outcome'].value_counts()
outcome_counts_df2 = df2['Outcome'].value_counts()
outcome_counts_df3 = df3['Outcome'].value_counts()

comparison_df = pd.DataFrame({
    'Outcome': outcome_counts_df1.index.tolist() + outcome_counts_df2.index.tolist() + outcome_counts_df3.index.tolist(),
    'Count': outcome_counts_df1.tolist() + outcome_counts_df2.tolist() + outcome_counts_df3.tolist(),
    'Dataset': ['Dataset 1'] * len(outcome_counts_df1) + ['Dataset 2'] * len(outcome_counts_df2) + ['Dataset 3'] * len(outcome_counts_df3)
})

comparison_df['Dataset'] = comparison_df['Dataset'].replace({'Dataset 1': 'PPO500k', 'Dataset 2': 'PPO1M', 'Dataset 3': 'PPO500k(draw)'})



fig = px.bar(comparison_df, x='Outcome', y='Count', color='Dataset', 
             barmode='group',  # This will group the bars similar to Seaborn's hue
             title='Comparison of Outcomes between Three Datasets')

# Customize the layout
fig.update_layout(
    xaxis_title='Outcome',
    yaxis_title='Count',
    legend_title='Dataset',
    plot_bgcolor='white'
)

# Show the plot
fig.show()


In [4]:


# Add a 'Dataset' column to each DataFrame
df1['Dataset'] = 'Dataset 1'
df2['Dataset'] = 'Dataset 2'
df3['Dataset'] = 'Dataset 3'

# Combine the DataFrames
combined_df = pd.concat([df1, df2, df3])

# Creating subplots
fig = make_subplots(rows=2, cols=1, subplot_titles=('Player Final Hand Sums', 'Dealer Final Hand Sums'))

# Histogram for Player Final Hand Sum
fig.add_trace(go.Histogram(x=combined_df[combined_df['Dataset'] == 'Dataset 1']['PlayerFinalHandSum'], nbinsx=30, name='Dataset 1 - Player'),
              row=1, col=1)
fig.add_trace(go.Histogram(x=combined_df[combined_df['Dataset'] == 'Dataset 2']['PlayerFinalHandSum'], nbinsx=30, name='Dataset 2 - Player'),
              row=1, col=1)
fig.add_trace(go.Histogram(x=combined_df[combined_df['Dataset'] == 'Dataset 3']['PlayerFinalHandSum'], nbinsx=30, name='Dataset 3 - Player'),
              row=1, col=1)

# Histogram for Dealer Final Hand Sum
fig.add_trace(go.Histogram(x=combined_df[combined_df['Dataset'] == 'Dataset 1']['DealerFinalHandSum'], nbinsx=30, name='Dataset 1 - Dealer'),
              row=2, col=1)
fig.add_trace(go.Histogram(x=combined_df[combined_df['Dataset'] == 'Dataset 2']['DealerFinalHandSum'], nbinsx=30, name='Dataset 2 - Dealer'),
              row=2, col=1)
fig.add_trace(go.Histogram(x=combined_df[combined_df['Dataset'] == 'Dataset 3']['DealerFinalHandSum'], nbinsx=30, name='Dataset 3 - Dealer'),
              row=2, col=1)

# Update layout
fig.update_layout(height=700, showlegend=True, title_text="Histograms of Final Hand Sums for Multiple Datasets")
fig.update_xaxes(title_text="Final Hand Sum", row=1, col=1)
fig.update_xaxes(title_text="Final Hand Sum", row=2, col=1)
fig.update_yaxes(title_text="Count", row=1, col=1)
fig.update_yaxes(title_text="Count", row=2, col=1)

fig.show()


In [18]:
import pandas as pd
import plotly.graph_objects as go

# Load your datasets
ppo500k_df= pd.read_csv("/Users/tommasoagudio/Documents/GitHub/GamblingRL/BlackJack_noBudget/PPO500k/PPO500k_rewards_distribution.csv")
ppo1m_df = pd.read_csv("/Users/tommasoagudio/Documents/GitHub/GamblingRL/BlackJack_noBudget/PPO1M/PPO1M_rewards_distribution.csv")
ppo500k_draw_df = pd.read_csv("/Users/tommasoagudio/Documents/GitHub/GamblingRL/BlackJack_noBudget/PPO500k(draw)/PPO500k(draw)_rewards_distribution.csv")

fig = go.Figure()
fig.add_trace(go.Violin(y=ppo1m_df['Reward'], name='PPO1M', box_visible=True, line_color='blue', fillcolor='blue', opacity=0.6))
fig.add_trace(go.Violin(y=ppo500k_draw_df['Reward'], name='PPO500k (draw)', box_visible=True, line_color='red', fillcolor='red', opacity=0.6))
fig.add_trace(go.Violin(y=ppo500k_df['Reward'], name='PPO500k', box_visible=True, line_color='green', fillcolor='green', opacity=0.6))

# Update the layout
fig.update_layout(
    title="Reward Distributions Comparison - Violin Plots",
    yaxis_title="Reward",
    violingap=0.05,  # Gap between the violins
    violingroupgap=0.2  # Gap between groups of violins
)

# Show the figure
fig.show()


In [20]:

import pandas as pd
import plotly.graph_objects as go

# Load your datasets
ppo500k_df = pd.read_csv("/Users/tommasoagudio/Documents/GitHub/GamblingRL/BlackJack_noBudget/PPO500k/PPO500k_win_rate_over_time.csv")
ppo1m_df = pd.read_csv("/Users/tommasoagudio/Documents/GitHub/GamblingRL/BlackJack_noBudget/PPO1M/PPO1M_win_rate_over_time.csv")
ppo500k_draw_df = pd.read_csv("/Users/tommasoagudio/Documents/GitHub/GamblingRL/BlackJack_noBudget/PPO500k(draw)/PPO500k(draw)_win_rate_over_time.csv")

# Creating a line plot for each dataset
fig = go.Figure()
fig.add_trace(go.Scatter(x=ppo500k_df['NumGames'], y=ppo500k_df['WinRate'],
                         mode='lines', name='PPO500k'))
fig.add_trace(go.Scatter(x=ppo1m_df['NumGames'], y=ppo1m_df['WinRate'],
                         mode='lines', name='PPO1M'))
fig.add_trace(go.Scatter(x=ppo500k_draw_df['NumGames'], y=ppo500k_draw_df['WinRate'],
                         mode='lines', name='PPO500k (draw)'))

# Update the layout
fig.update_layout(
    title="Win Rate Over Number of Games - Comparison",
    xaxis_title="Number of Games",
    yaxis_title="Win Rate",
    hovermode="x"
)

# Show the figure
fig.show()


In [27]:
env = DummyVecEnv([lambda: SimpleBlackjackEnv()])


def simulate_blackjack_games(env, model, num_games=5000):
    win_count = 0

    for _ in range(num_games):
        obs = env.reset()
        done = False
        total_reward = 0

        while not done:
            action, _ = model.predict(obs)
            obs, reward, done, _ = env.step(action)
            total_reward += reward

        if total_reward > 0:  # Increment win count if the reward for the game is positive
            win_count += 1

    win_rate = win_count / num_games
    print(f"\nAgent won {win_count} out of {num_games} games. Win rate: {win_rate * 100:.2f}%")
    return win_rate

model = PPO.load("/Users/tommasoagudio/Documents/GitHub/GamblingRL/BlackJack_noBudget/PPO1M/PPO1M.zip")
rates = []
for i in range (1000):
    rates.append(simulate_blackjack_games(env, model))
    print(i)
print(rates)


You provided an OpenAI Gym environment. We strongly recommend transitioning to Gymnasium environments. Stable-Baselines3 is automatically wrapping your environments in a compatibility layer, which could potentially cause issues.


Could not deserialize object clip_range. Consider using `custom_objects` argument to replace this object.
Exception: code() argument 13 must be str, not int


Could not deserialize object lr_schedule. Consider using `custom_objects` argument to replace this object.
Exception: code() argument 13 must be str, not int




Agent won 2291 out of 5000 games. Win rate: 45.82%
0

Agent won 2228 out of 5000 games. Win rate: 44.56%
1

Agent won 2269 out of 5000 games. Win rate: 45.38%
2

Agent won 2315 out of 5000 games. Win rate: 46.30%
3

Agent won 2202 out of 5000 games. Win rate: 44.04%
4

Agent won 2224 out of 5000 games. Win rate: 44.48%
5

Agent won 2276 out of 5000 games. Win rate: 45.52%
6

Agent won 2216 out of 5000 games. Win rate: 44.32%
7

Agent won 2243 out of 5000 games. Win rate: 44.86%
8

Agent won 2180 out of 5000 games. Win rate: 43.60%
9

Agent won 2234 out of 5000 games. Win rate: 44.68%
10

Agent won 2285 out of 5000 games. Win rate: 45.70%
11

Agent won 2215 out of 5000 games. Win rate: 44.30%
12

Agent won 2193 out of 5000 games. Win rate: 43.86%
13

Agent won 2282 out of 5000 games. Win rate: 45.64%
14

Agent won 2228 out of 5000 games. Win rate: 44.56%
15

Agent won 2265 out of 5000 games. Win rate: 45.30%
16

Agent won 2274 out of 5000 games. Win rate: 45.48%
17

Agent won 2299 out 

KeyboardInterrupt: 