In [12]:
# import kagglehub

# # Download latest version
# path = kagglehub.dataset_download("dennisho/blackjack-hands")

# print("Path to dataset files:", path)

import pandas as pd
import numpy as np
from collections import defaultdict
from sklearn.model_selection import train_test_split
import ast

In [4]:
# Load the dataset (replace with your actual file path)
data = pd.read_csv("blackjack_simulator.csv")  # Replace 'your_data.csv' with the actual filename
data

Unnamed: 0,shoe_id,cards_remaining,dealer_up,initial_hand,dealer_final,dealer_final_value,player_final,player_final_value,actions_taken,run_count,true_count,win
0,0,416,10,"[10, 11]","[10, 4, 10]",24,"[[10, 11]]",['BJ'],[['S']],1,0,1.5
1,0,411,10,"[5, 5]","[10, 8]",18,"[[5, 5, 11]]",[21],"[['H', 'S']]",-2,0,1.0
2,0,406,6,"[3, 10]","[6, 6, 10]",22,"[[3, 10]]",[13],[['S']],-2,0,1.0
3,0,401,10,"[5, 9]","[10, 8]",18,"[[5, 9, 11, 3]]",[18],"[['H', 'H', 'S']]",-1,0,0.0
4,0,395,8,"[6, 10]","[8, 2, 10]",20,"[[6, 10, 10]]",[26],[['H']],-1,0,-1.0
...,...,...,...,...,...,...,...,...,...,...,...,...
49999995,822844,108,10,"[7, 9]","[10, 3, 10]",23,"[[7, 9]]",[16],[['R']],14,6,-0.5
49999996,822844,103,5,"[9, 9]","[5, 2, 5, 9]",21,"[[9, 10], [9, 10]]","[19, 19]","[['P', 'S'], ['S']]",13,6,-2.0
49999997,822844,95,10,"[10, 10]","[10, 10]",20,"[[10, 10]]",[20],[['S']],14,7,0.0
49999998,822844,91,9,"[4, 10]","[9, 10]",19,"[[4, 10, 9]]",[23],[['H']],10,5,-1.0


In [10]:
data.dtypes

shoe_id                 int64
cards_remaining         int64
dealer_up               int64
initial_hand           object
dealer_final           object
dealer_final_value     object
player_final           object
player_final_value     object
actions_taken          object
run_count               int64
true_count              int64
win                   float64
dtype: object

In [16]:
# --- Data Preprocessing (List Handling & Minimal) ---

# Keep all columns
df = data.copy()

# Columns to treat as lists
list_cols = ['initial_hand', 'dealer_final', 'player_final', 'actions_taken']

# Convert string representations of lists to actual lists
for col in list_cols:
    try:
        df[col] = df[col].apply(ast.literal_eval)
    except (ValueError, SyntaxError) as e:
        print(f"Warning: Could not convert column {col} to list. Imputing with empty list. Error {e}")
        df[col] = [[] for _ in range(len(df))] # impute empty lists if parse fails

# Convert 'win' to categorical: win, lose, push
df['win'] = df['win'].apply(lambda x: 'win' if x > 0 else ('lose' if x < 0 else 'push'))

# For simplicity, convert numerical columns into categorical columns with bins
for col in df.select_dtypes(include=np.number):
    df[col] = pd.cut(df[col], bins=5, labels=False, include_lowest=True) # Convert to categorical

# Also treat list-like columns as categorical by converting the lists to strings
for col in list_cols:
    df[col] = df[col].astype(str)

# Convert all object columns to strings
for col in df.select_dtypes(include='object'):
    df[col] = df[col].astype(str)

# Drop any remaining NaN - can cause issues with CPT calculation
df.dropna(inplace=True)

KeyboardInterrupt: 

In [None]:
df

Unnamed: 0,shoe_id,cards_remaining,dealer_up,initial_hand,dealer_final,dealer_final_value,player_final,player_final_value,actions_taken,run_count,true_count,win
0,0,4,4,"[10, 11]","[10, 4, 10]",24,"[[10, 11]]",['BJ'],[['S']],2,2,win
1,0,4,4,"[5, 5]","[10, 8]",18,"[[5, 5, 11]]",[21],"[['H', 'S']]",2,2,win
2,0,4,2,"[3, 10]","[6, 6, 10]",22,"[[3, 10]]",[13],[['S']],2,2,win
3,0,4,4,"[5, 9]","[10, 8]",18,"[[5, 9, 11, 3]]",[18],"[['H', 'H', 'S']]",2,2,push
4,0,4,3,"[6, 10]","[8, 2, 10]",20,"[[6, 10, 10]]",[26],[['H']],2,2,lose
...,...,...,...,...,...,...,...,...,...,...,...,...
49999995,4,0,4,"[7, 9]","[10, 3, 10]",23,"[[7, 9]]",[16],[['R']],3,3,lose
49999996,4,0,1,"[9, 9]","[5, 2, 5, 9]",21,"[[9, 10], [9, 10]]","[19, 19]","[['P', 'S'], ['S']]",3,3,lose
49999997,4,0,4,"[10, 10]","[10, 10]",20,"[[10, 10]]",[20],[['S']],3,3,push
49999998,4,0,3,"[4, 10]","[9, 10]",19,"[[4, 10, 9]]",[23],[['H']],3,3,lose


In [None]:
# --- Split into Training and Testing Sets ---
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)  # 80% train, 20% test

ValueError: With n_samples=0, test_size=0.2 and train_size=None, the resulting train set will be empty. Adjust any of the aforementioned parameters.

In [None]:
# --- Build Bayesian Network (From Scratch) ---

class BayesianNode:
    def __init__(self, name, parents=None):
        self.name = name
        self.parents = parents if parents else []
        self.cpt = {}  # Conditional Probability Table

    def compute_cpt(self, data):
        """Computes the CPT based on the data."""
        if not self.parents:
            # If no parents, calculate marginal probabilities
            counts = data[self.name].value_counts(normalize=True)
            self.cpt = counts.to_dict()
        else:
            # Calculate conditional probabilities
            grouped = data.groupby([parent.name for parent in self.parents] + [self.name])
            total_counts = data.groupby([parent.name for parent in self.parents]).size().to_dict()
            counts = grouped.size().to_dict()

            for cond, count in counts.items():
                condition = cond[:-1]
                outcome = cond[-1]

                if isinstance(condition, str):
                    condition = (condition,)

                condition_key = condition
                if len(condition) == 1:
                    condition_key = condition[0]
                elif len(condition) > 1:
                    condition_key = tuple(condition)

                if condition_key not in self.cpt:
                    self.cpt[condition_key] = {}

                # Safe check
                if condition_key not in total_counts:
                    print(f"Warning: Condition {condition_key} not found in total counts for node{self.name} with condition {condition_key}")
                    self.cpt[condition_key] = {}  # empty dictionary
                else:
                    self.cpt[condition_key][outcome] = count / total_counts[condition_key]

    def predict_proba(self, evidence):
        """Predicts probabilities given evidence."""
        if not self.parents:
            return self.cpt  # Return marginal probabilities

        # Find relevant condition based on evidence
        condition = tuple(evidence[parent.name] for parent in self.parents)

        if condition in self.cpt:
            return self.cpt[condition]
        else:
            # Handle unseen conditions (e.g., return a uniform distribution)
            print(f"Warning: Condition {condition} not found in CPT for node {self.name}. Returning uniform distribution.")
            outcomes = train_df[self.name].unique()  # Important - uses train_df
            return {outcome: 1.0 / len(outcomes) for outcome in outcomes}

# --- Build the Network Structure (Define Dependencies) ---

# Create all nodes
nodes = {col: BayesianNode(col) for col in df.columns}

# Define a simple structure: Make each column a parent of 'win'
for col in df.columns:
    if col != 'win':
        nodes['win'].parents.append(nodes[col])

# --- Compute CPTs (Train on Training Data) ---
for node in nodes.values():
    node.compute_cpt(train_df)  # Compute CPTs using ONLY training data

# --- Inference Function ---
def predict_win_probability(evidence):
    """Predicts the probability of winning given evidence."""
    win_probs = nodes['win'].predict_proba(evidence)
    return win_probs

KeyboardInterrupt: 

In [None]:
# --- Evaluation ---
def evaluate_model(test_df):
    """Evaluates the model on the test data."""
    correct_predictions = 0
    total_predictions = len(test_df)

    for index, row in test_df.iterrows():
        # Prepare evidence from the test data row
        evidence = row.drop('win').to_dict()  # All columns except 'win'

        # Convert types to string
        evidence = {str(k): str(v) for k, v in evidence.items()}

        # Make prediction
        probabilities = predict_win_probability(evidence)

        # Determine predicted win category (most probable)
        predicted_win = max(probabilities, key=probabilities.get)

        # Get actual win category from the test data
        actual_win = str(row['win'])  # Ensure it's a string

        # Compare prediction with actual
        if predicted_win == actual_win:
            correct_predictions += 1

    accuracy = correct_predictions / total_predictions
    return accuracy

In [None]:
# --- Example Usage and Evaluation ---

# Evaluate the model
accuracy = evaluate_model(test_df)
print(f"Accuracy on the test set: {accuracy:.4f}")