In [39]:
import random
import gymnasium as gym
from gymnasium.envs import *
from gymnasium import spaces
import numpy as np
from stable_baselines3 import PPO
import optuna
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3 import DQN
from gym.wrappers import monitoring
from stable_baselines3.common.callbacks import CheckpointCallback, EvalCallback
import matplotlib.pyplot as plt
from stable_baselines3.common.monitor import Monitor

In [59]:
import numpy as np
import random

# Define the comparison function for blackjack results
def cmp(a, b):
    if a > b:
        return 1
    elif a < b:
        return -1
    else:
        return 0

# Define the simple blackjack environment
class SimpleBlackjackEnv(gym.Env):
    def __init__(self, initial_budget=100, bet_sizes=[1, 5, 10]):
        self.deck = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 10, 10, 10] * 4
        random.shuffle(self.deck)
        self.dealer = None
        self.player = None
        self.budget = initial_budget
        self.current_bet = 1
        self.bet_sizes = bet_sizes
        self.action_space = spaces.Discrete(len(bet_sizes) * 2)  # hit or stick for each bet size

        # Define observation space
        # The observation will be an array containing:
        # - The player's current hand total (could be between 0 to 31 - allowing for usable ace)
        # - The dealer's showing card (could be between 1 to 10)
        # - Whether the player has a usable ace (0 or 1)
        self.observation_space = spaces.Box(low=np.array([0, 1, 0]), high=np.array([31, 10, 1]), dtype=np.int32)

    def set_bet(self, bet):
        if bet in self.bet_sizes:
            self.current_bet = bet
        else:
            raise ValueError("Bet size not in allowed bet sizes")

    def draw_card(self):
        return self.deck.pop()

    def draw_hand(self):
        return [self.draw_card(), self.draw_card()]

    def usable_ace(self, hand):
        return 1 in hand and sum(hand) + 10 <= 21

    def sum_hand(self, hand):
        return sum(hand) + 10 if self.usable_ace(hand) else sum(hand)

    def is_bust(self, hand):
        return self.sum_hand(hand) > 21

    def score(self, hand):
        return 0 if self.is_bust(hand) else self.sum_hand(hand)

    def reset(self, seed=None):
            # Simply ignore the seed and reset the environment as usual
            if len(self.deck) < 15:
                self.deck = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 10, 10, 10] * 4
                random.shuffle(self.deck)
            self.dealer = self.draw_hand()
            self.player = self.draw_hand()
            self.current_bet = 1
            return self._get_observation()

    def step(self, action):
        # Determine bet size and action from the single 'action' argument
        bet_action = action // 2
        hit_or_stick = action % 2
        self.current_bet = self.bet_sizes[bet_action]

        if self.budget <= 0:
            return self._get_observation(), 0.0, True, {'message': 'Budget depleted'}

        # Player decision: hit
        if hit_or_stick == 1:
            self.player.append(self.draw_card())
            if self.is_bust(self.player):
                done = True
                reward = -self.current_bet
                self.budget += reward
            else:
                done = False
                reward = 0.0
        # Player decision: stick
        else:
            done = True
            while self.sum_hand(self.dealer) < 17:
                self.dealer.append(self.draw_card())
            game_outcome = cmp(self.score(self.player), self.score(self.dealer))
            reward = self.current_bet if game_outcome == 1 else -self.current_bet if game_outcome == -1 else 0
            self.budget += reward

        if self.budget <= 0:
            done = True

        return self._get_observation(), reward, done, {'budget': self.budget}

    def _get_observation(self):
        player_obs = self.player + [0] * (11 - len(self.player))
        dealer_obs = self.dealer + [0] * (11 - len(self.dealer))
        usable_ace_obs = [1] if self.usable_ace(self.player) else [0]
        return np.array(player_obs + dealer_obs + usable_ace_obs, dtype=np.int32)

    def render(self, mode='human'):
        if mode != 'human':
            raise NotImplementedError()
        print(f"Player hand: {self.player} (score: {self.score(self.player)})")
        print(f"Dealer hand: {self.dealer} (score: {self.score(self.dealer)})")
        print(f"Current bet: {self.current_bet}")
        print(f"Budget: {self.budget}")

    def close(self):
        pass


# To create an instance of the environment and test it:
env = SimpleBlackjackEnv(initial_budget=100, bet_sizes=[1, 5, 10])
env.reset()
env.render()

# Example of a single step with the action 'hit with a bet size of 5'
# (assuming '1' corresponds to 'hit' and the second bet size is '5')
action = 3  # This corresponds to the second bet size with a 'hit' action
observation, reward, done, info = env.step(action)
env.render()



# Testing the environment to ensure it initializes and steps correctly
env = SimpleBlackjackEnv()
obs = env.reset()
print(env.render())
obs, reward, done, _ = env.step(1)
print(env.render())
obs, reward, done, _ = env.step(0)
print(env.render())

Player hand: [10, 4] (score: 14)
Dealer hand: [10, 7] (score: 17)
Current bet: 1
Budget: 100
Player hand: [10, 4, 2] (score: 16)
Dealer hand: [10, 7] (score: 17)
Current bet: 5
Budget: 100
Player hand: [4, 7] (score: 11)
Dealer hand: [10, 1] (score: 21)
Current bet: 1
Budget: 100
None
Player hand: [4, 7, 6] (score: 17)
Dealer hand: [10, 1] (score: 21)
Current bet: 1
Budget: 100
None
Player hand: [4, 7, 6] (score: 17)
Dealer hand: [10, 1] (score: 21)
Current bet: 1
Budget: 99
None


In [60]:
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import DummyVecEnv

# Create and wrap the environment
env = DummyVecEnv([lambda: SimpleBlackjackEnv(initial_budget=100)])

# Instantiate the model
model = PPO("MlpPolicy", env, verbose=1)

# Train the model
model.learn(total_timesteps=10000)






Using cpu device


ValueError: too many values to unpack (expected 2)

In [25]:
obs = env.reset()
for _ in range(1000):  # Run for some steps to see the agent's behavior
    action, _states = model.predict(obs, deterministic=True)
    obs, rewards, dones, info = env.step(action)
    env.render()  # Visualize the game

    if dones:
        obs = env.reset()

Player hand: [2, 10, 3] (score: 15)
Dealer hand: [10, 3] (score: 13)
Current bet: 1
Budget: 100
Player hand: [2, 10, 3, 10] (score: 0)
Dealer hand: [10, 3] (score: 13)
Current bet: 1
Budget: 99
Player hand: [10, 10, 5] (score: 0)
Dealer hand: [8, 8] (score: 16)
Current bet: 1
Budget: 98
Player hand: [6, 7] (score: 13)
Dealer hand: [3, 6, 1] (score: 20)
Current bet: 1
Budget: 97
Player hand: [10, 5] (score: 15)
Dealer hand: [7, 9, 7] (score: 0)
Current bet: 1
Budget: 98
Player hand: [10, 3, 7] (score: 20)
Dealer hand: [4, 1] (score: 15)
Current bet: 1
Budget: 98
Player hand: [10, 3, 7, 10] (score: 0)
Dealer hand: [4, 1] (score: 15)
Current bet: 1
Budget: 97
Player hand: [10, 4, 4] (score: 18)
Dealer hand: [1, 1] (score: 12)
Current bet: 1
Budget: 97
Player hand: [10, 4, 4, 8] (score: 0)
Dealer hand: [1, 1] (score: 12)
Current bet: 1
Budget: 96
Player hand: [8, 10, 2] (score: 20)
Dealer hand: [9, 4] (score: 13)
Current bet: 1
Budget: 96
Player hand: [8, 10, 2, 10] (score: 0)
Dealer hand: