# NBA Betting Simulator

Maanas Manoj, 
mm3054, 
Section 04

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import log_loss, brier_score_loss, roc_auc_score
from sklearn.calibration import CalibratedClassifierCV, calibration_curve
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer
import warnings
from tqdm import tqdm
import os

# Suppress warnings for cleaner output
warnings.filterwarnings('ignore')

# Create visualizations directory if it doesn't exist
os.makedirs('visualizations', exist_ok=True)

## NBA Betting Simulator Class

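We'll define the main `NBABettingSimulator` class that handles all aspects of the betting simulation. The class body below contains only the constructor; each later section defines one method as a plain function and then attaches it to the class.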

In [2]:
class NBABettingSimulator:
    """Back-test a value-betting strategy on NBA moneyline odds.

    The constructor only stores configuration and creates empty state; the
    pipeline steps are attached to the class as methods in later cells.
    """

    def __init__(self, data_path='oddsData.csv', value_threshold=0.05, initial_bankroll=1000, bet_size=0.02):
        """Store the run configuration and initialise empty model state.

        Args:
            data_path: Path of the CSV file read by ``load_and_preprocess_data``.
            value_threshold: Minimum ``value`` (model probability minus market
                probability) a row needs before a bet is placed on it.
            initial_bankroll: Bankroll at the start of the simulation.
            bet_size: Fraction of ``initial_bankroll`` staked on every bet.
        """
        # State filled in later by the pipeline
        self.model = None
        self.label_encoders = {}
        self.imputer = SimpleImputer(strategy='mean')

        # User-supplied configuration
        self.data_path = data_path
        self.value_threshold = value_threshold
        self.initial_bankroll = initial_bankroll
        self.bet_size = bet_size

## Data Loading and Preprocessing

The following method loads and preprocesses the betting data, handling missing values, encoding categorical variables, and creating derived features.

In [3]:
def load_and_preprocess_data(self):
    """Load the odds CSV and derive the columns used for modelling and betting.

    Reads ``self.data_path``, sorts the rows by ``date``, label-encodes
    ``team``, ``opponent`` and ``home/visitor`` (the fitted encoders are kept
    in ``self.label_encoders``) and adds:

    * ``win`` - 1 when ``score`` exceeds ``opponentScore``, else 0.
    * ``implied_prob`` / ``opp_implied_prob`` - probabilities implied by
      ``moneyLine`` / ``opponentMoneyLine``.
    * ``market_win_prob`` - ``implied_prob`` where the encoded
      ``home/visitor`` equals 1, otherwise ``1 - implied_prob``.
    * ``point_diff``, ``total_points``, ``pace`` and ``days_rest``.
    * ``win_pct_N``, ``point_diff_N``, ``points_for_N`` and
      ``points_against_N`` for N in 5, 10, 20 - per-team rolling means over
      the games played *before* the current row.

    Returns:
        pandas.DataFrame: the enriched rows, sorted by ``date``.

    Raises:
        ValueError: if the CSV contains no rows.
    """
    print("Loading and preprocessing data...")
    df = pd.read_csv(self.data_path)
    if df.empty:
        raise ValueError(f"No rows found in {self.data_path!r}")

    # Convert date and sort
    df['date'] = pd.to_datetime(df['date'])
    df = df.sort_values('date')

    # Encode categorical variables
    for col in ['team', 'opponent', 'home/visitor']:
        self.label_encoders[col] = LabelEncoder()
        df[col] = self.label_encoders[col].fit_transform(df[col].astype(str))

    # Calculate target variable (win/loss)
    df['win'] = (df['score'] > df['opponentScore']).astype(int)

    def moneyline_to_prob(moneyline):
        """Convert an American moneyline to its implied win probability."""
        try:
            if moneyline > 0:
                return 100 / (moneyline + 100)
            # NaN also lands here (NaN > 0 is False) and propagates as NaN.
            return -moneyline / (-moneyline + 100)
        except TypeError:
            # Non-numeric entry (None, stray text): treat as missing.
            return np.nan

    df['implied_prob'] = df['moneyLine'].apply(moneyline_to_prob)
    df['opp_implied_prob'] = df['opponentMoneyLine'].apply(moneyline_to_prob)

    # Market's win probability (adjust for home/away)
    df['market_win_prob'] = np.where(
        df['home/visitor'] == 1,
        df['implied_prob'],
        1 - df['implied_prob']
    )

    df['point_diff'] = df['score'] - df['opponentScore']
    df['total_points'] = df['score'] + df['opponentScore']
    # NOTE(review): pace is built from this game's final score, so it is not
    # known before the game is played; using it as a model feature leaks
    # outcome information. Consider replacing it with a lagged value.
    df['pace'] = df['total_points'] / (df['total'].replace(0, np.nan) / 2)

    # Days since the team's previous game. A team's first game has no
    # predecessor: back-fill it from the team's next gap, then fall back to 5.
    df['days_rest'] = df.groupby('team')['date'].diff().dt.days
    df['days_rest'] = df.groupby('team')['days_rest'].bfill().fillna(5)

    # Rolling form statistics per team. shift(1) drops the current game from
    # every window; without it win_pct_* and point_diff_* would contain the
    # very result the model is asked to predict.
    rolling_sources = {
        'win_pct': 'win',
        'point_diff': 'point_diff',
        'points_for': 'score',
        'points_against': 'opponentScore',
    }
    rolling_stats = []
    for team in df['team'].unique():
        team_df = df[df['team'] == team].sort_values('date')
        prior_games = team_df[list(rolling_sources.values())].shift(1)
        for window in [5, 10, 20]:
            window_means = prior_games.rolling(window, min_periods=3).mean()
            for prefix, source in rolling_sources.items():
                team_df[f'{prefix}_{window}'] = window_means[source]
        rolling_stats.append(team_df)

    df = pd.concat(rolling_stats).sort_values('date')

    return df

# Add the method to the class: the function above was defined at module level,
# so bind it to NBABettingSimulator to make it callable on instances.
NBABettingSimulator.load_and_preprocess_data = load_and_preprocess_data

## Feature Preparation

This method prepares the feature matrix for model training and evaluation, with proper handling of missing values.

In [4]:
def prepare_features(self, df):
    """Build the imputed feature matrix and the target vector from ``df``.

    Rows whose ``win`` value is missing are dropped. ``self.imputer`` is
    fitted on the first matrix it sees (detected through the absence of its
    ``statistics_`` attribute) and reused unchanged on later calls.

    Args:
        df: Frame holding the feature columns listed below plus ``win``.

    Returns:
        tuple: ``(X, y)`` where ``X`` is a DataFrame of imputed features that
        keeps the index of the retained rows and ``y`` is the ``win`` Series.
    """
    print("Preparing features...")
    feature_columns = [
        'market_win_prob',
        'total',
        'spread',
        'secondHalfTotal',
        'days_rest',
        'pace',
        'win_pct_5',
        'win_pct_10',
        'point_diff_5',
        'point_diff_10',
        'points_for_5',
        'points_against_5',
        'home/visitor'
    ]

    # Keep only the rows that have a target value
    has_target = df['win'].notna()
    target = df.loc[has_target, 'win'].copy()
    raw_matrix = df.loc[has_target, feature_columns].copy()

    # Fit once, then reuse the learned fill values on every later call
    already_fitted = hasattr(self.imputer, 'statistics_')
    if not already_fitted:
        self.imputer.fit(raw_matrix)
    filled_values = self.imputer.transform(raw_matrix)

    feature_frame = pd.DataFrame(
        filled_values,
        columns=feature_columns,
        index=raw_matrix.index,
    )
    return feature_frame, target

# Bind the module-level function above as a method of NBABettingSimulator.
NBABettingSimulator.prepare_features = prepare_features

## Model Training

This method trains a calibrated Gradient Boosting Classifier model to predict NBA game outcomes.

In [5]:
def train_model(self, X_train, y_train):
    """Fit an isotonic-calibrated gradient-boosting classifier.

    The fitted estimator is stored in ``self.model``; nothing is returned.

    Args:
        X_train: Training feature matrix.
        y_train: Training labels.
    """
    print("Training model...")

    booster_settings = {
        'n_estimators': 200,
        'learning_rate': 0.05,
        'max_depth': 3,
        'random_state': 42,
    }
    booster = GradientBoostingClassifier(**booster_settings)

    # Wrap the booster so its scores are calibrated with 5-fold isotonic regression
    self.model = CalibratedClassifierCV(booster, method='isotonic', cv=5)
    self.model.fit(X_train, y_train)

# Bind the module-level function above as a method of NBABettingSimulator.
NBABettingSimulator.train_model = train_model

## Model Evaluation

This method evaluates the trained model's performance using various metrics.

In [6]:
def evaluate_model(self, X_test, y_test):
    """Print probabilistic quality metrics for ``self.model`` on held-out data.

    Prints log loss, Brier score and ROC AUC, followed by a calibration table
    giving the observed win rate inside each 0.1-wide probability bucket.

    Args:
        X_test: Feature matrix accepted by ``self.model.predict_proba``.
        y_test: True 0/1 outcomes for the rows of ``X_test``.
    """
    print("\nModel Evaluation Metrics:")
    proba = self.model.predict_proba(X_test)[:, 1]

    print(f"Log Loss: {log_loss(y_test, proba):.4f}")
    print(f"Brier Score: {brier_score_loss(y_test, proba):.4f}")
    print(f"ROC AUC: {roc_auc_score(y_test, proba):.4f}")

    # Calibration check
    calibration_df = pd.DataFrame({'prob': proba, 'actual': y_test})
    calibration_df['prob_bucket'] = pd.cut(
        calibration_df['prob'],
        # linspace yields exact 0.0 ... 1.0 edges (arange accumulates float error)
        bins=np.linspace(0, 1, 11),
        # Buckets are right-closed, so without this a probability of exactly
        # 0.0 falls outside every bucket and is silently dropped.
        include_lowest=True,
    )
    # observed=False keeps all ten buckets in the table, empty ones as NaN
    calibration = calibration_df.groupby('prob_bucket', observed=False)['actual'].mean()
    print("\nCalibration:")
    print(calibration)

# Bind the module-level function above as a method of NBABettingSimulator.
NBABettingSimulator.evaluate_model = evaluate_model

## Betting Simulation

This method simulates a betting strategy based on finding value bets where the model's predicted probability exceeds the market's implied probability by a certain threshold.

In [7]:
def simulate_betting(self, X_test, df_test):
    """Simulate flat-stake value betting on the test rows.

    For every row the bet considered is on the row's own team when the
    encoded ``home/visitor`` equals 1 (priced at ``moneyLine``) and on the
    opponent otherwise (priced at ``opponentMoneyLine``).
    NOTE(review): the code treats 1 as "home"; that depends on how
    LabelEncoder ordered the raw values - confirm against the data.

    ``self.model`` predicts the probability that the row's own team wins, so
    for opponent bets the complement is used. ``model_prob``, ``value`` and
    ``expected_value`` therefore always describe the side actually backed.
    A bet is placed when ``value`` (model probability minus
    ``market_win_prob``) exceeds ``self.value_threshold`` and the bet has a
    usable, non-zero price. Every bet stakes
    ``self.initial_bankroll * self.bet_size``.

    Args:
        X_test: Feature matrix for the test rows; its index selects the rows
            of ``df_test`` to bet on.
        df_test: Frame with ``date``, ``season``, ``team``, ``opponent``,
            ``home/visitor``, ``moneyLine``, ``opponentMoneyLine``,
            ``market_win_prob`` and ``win``.

    Returns:
        tuple: ``(bets_df, bankroll_history)``. ``bets_df`` has one row per
        bet in date order with columns ``date, season, team, vs, location,
        odds, bet_amount, result, payout, net, bankroll, model_prob,
        market_prob, value, expected_value`` (present even when no bet was
        placed). ``location`` is the side of the source row's team, not of
        the team backed. ``bankroll_history`` starts with the initial
        bankroll followed by the bankroll after each bet.
    """
    print("\nSimulating betting strategy...")

    # Probability that the row's own team wins
    team_win_prob = self.model.predict_proba(X_test)[:, 1]

    test_df = df_test.loc[X_test.index].copy()
    backs_row_team = (test_df['home/visitor'] == 1).to_numpy()

    # Price and model probability of the side that would be backed
    own_line = pd.to_numeric(test_df['moneyLine'], errors='coerce')
    opp_line = pd.to_numeric(test_df['opponentMoneyLine'], errors='coerce')
    test_df['odds'] = own_line.where(backs_row_team, opp_line)
    test_df['model_prob'] = np.where(backs_row_team, team_win_prob, 1 - team_win_prob)
    test_df['value'] = test_df['model_prob'] - test_df['market_win_prob']

    # Profit per unit staked on a winning bet at American odds
    odds = test_df['odds']
    test_df['unit_profit'] = np.where(odds > 0, odds / 100, 100 / -odds)
    test_df['expected_value'] = (
        test_df['model_prob'] * test_df['unit_profit'] - (1 - test_df['model_prob'])
    )

    # A missing or zero price cannot be settled and would turn the running
    # bankroll into NaN/inf, so such rows are never bet on.
    has_price = odds.notna() & (odds != 0)
    is_value = test_df['value'] > self.value_threshold
    value_bets = test_df[has_price & is_value].copy()

    # Decode team names
    for col in ['team', 'opponent']:
        if col in self.label_encoders:
            value_bets[col] = self.label_encoders[col].inverse_transform(value_bets[col])

    # Sort by date for proper bankroll tracking
    value_bets = value_bets.sort_values('date')

    on_row_team = value_bets['home/visitor'] == 1
    value_bets['bet_on'] = np.where(on_row_team, value_bets['team'], value_bets['opponent'])
    value_bets['vs'] = np.where(on_row_team, value_bets['opponent'], value_bets['team'])
    value_bets['location'] = np.where(on_row_team, 'home', 'away')

    # Settle every bet with the same fixed stake
    stake = self.initial_bankroll * self.bet_size
    bet_won = np.where(on_row_team, value_bets['win'] == 1, value_bets['win'] == 0)
    net = np.where(bet_won, stake * value_bets['unit_profit'], -stake)

    value_bets['bet_amount'] = stake
    value_bets['result'] = np.where(bet_won, 'WIN', 'LOSS')
    value_bets['payout'] = np.where(bet_won, net + stake, 0)
    value_bets['net'] = net
    value_bets['bankroll'] = self.initial_bankroll + np.cumsum(net)

    bets_df = (
        value_bets[[
            'date', 'season', 'bet_on', 'vs', 'location', 'odds', 'bet_amount',
            'result', 'payout', 'net', 'bankroll', 'model_prob',
            'market_win_prob', 'value', 'expected_value',
        ]]
        .rename(columns={'bet_on': 'team', 'market_win_prob': 'market_prob'})
        .reset_index(drop=True)
    )
    bankroll_history = [self.initial_bankroll] + bets_df['bankroll'].tolist()

    if bets_df.empty:
        # The plots need at least one bet
        print("No value bets found; skipping visualizations.")
    else:
        self._generate_visualizations(bets_df, bankroll_history, X_test, df_test)

    return bets_df, bankroll_history

# Bind the module-level function above as a method of NBABettingSimulator.
NBABettingSimulator.simulate_betting = simulate_betting

## Visualization Generation

This method generates visualizations of the betting performance and model evaluation.

In [8]:
def _generate_visualizations(self, bets_df, bankroll_history, X_test, df_test):
    """Save the four diagnostic plots for a finished simulation.

    Writes ``bankroll_growth.png``, ``calibration_curve.png``,
    ``winrate_by_odds.png`` and ``value_distribution.png`` into the
    ``visualizations`` directory. Nothing is drawn when ``bets_df`` is empty.

    Side effect: adds an ``odds_group`` column to ``bets_df`` in place.

    Args:
        bets_df: Bet log with ``result``, ``model_prob``, ``odds`` and
            ``value`` columns.
        bankroll_history: Bankroll before the first bet and after each bet.
        X_test: Unused; kept so existing callers keep working.
        df_test: Unused; kept so existing callers keep working.
    """
    if bets_df.empty:
        # The calibration curve and the grouped bar chart need at least one bet
        print("No bets to visualize; skipping plot generation.")
        return

    os.makedirs('visualizations', exist_ok=True)

    # Bankroll Growth Over Time
    fig, ax = plt.subplots(figsize=(12, 6))
    ax.plot(bankroll_history, label='Bankroll', color='#2ecc71', linewidth=2)
    ax.axhline(y=self.initial_bankroll, color='red', linestyle='--', label='Initial Bankroll')
    ax.set_title('Bankroll Growth Over Time', fontsize=16)
    ax.set_xlabel('Bet Number', fontsize=12)
    ax.set_ylabel('Bankroll ($)', fontsize=12)
    ax.legend()
    ax.grid(True, alpha=0.3)
    fig.savefig('visualizations/bankroll_growth.png', bbox_inches='tight', dpi=300)
    plt.close(fig)

    # Win Probability Calibration
    prob_true, prob_pred = calibration_curve(
        bets_df['result'].map({'WIN': 1, 'LOSS': 0}),
        bets_df['model_prob'],
        n_bins=10
    )
    fig, ax = plt.subplots(figsize=(8, 6))
    ax.plot([0, 1], [0, 1], 'k:', label='Perfectly Calibrated')
    ax.plot(prob_pred, prob_true, 's-', label='Model')
    ax.set_title('Probability Calibration', fontsize=16)
    ax.set_xlabel('Predicted Probability', fontsize=12)
    ax.set_ylabel('Actual Probability', fontsize=12)
    ax.legend()
    fig.savefig('visualizations/calibration_curve.png', bbox_inches='tight', dpi=300)
    plt.close(fig)

    # Bet Outcomes by Odds Range. The open-ended outer bins keep bets priced
    # beyond +/-500 in the chart instead of silently discarding them.
    odds_bins = [-np.inf, -500, -200, -150, -100, 0, 100, 200, 500, np.inf]
    bets_df['odds_group'] = pd.cut(bets_df['odds'], bins=odds_bins)
    outcome_by_odds = (
        bets_df.groupby('odds_group', observed=True)['result']
        .value_counts(normalize=True)
        .unstack(fill_value=0)
    )
    # Colour by label, not by position: if only one outcome occurs, a
    # positional list would paint WIN in the LOSS colour.
    result_colors = {'LOSS': '#e74c3c', 'WIN': '#2ecc71'}
    bar_colors = [result_colors.get(label, '#95a5a6') for label in outcome_by_odds.columns]

    # DataFrame.plot opens its own figure unless it is given an axis; passing
    # ax makes figsize apply and leaves no stray blank figure open.
    fig, ax = plt.subplots(figsize=(12, 6))
    outcome_by_odds.plot(kind='bar', stacked=True, color=bar_colors, ax=ax, rot=45)
    ax.set_title('Win Rate by Odds Range', fontsize=16)
    ax.set_xlabel('Moneyline Odds Range', fontsize=12)
    ax.set_ylabel('Percentage', fontsize=12)
    ax.legend(title='Result')
    fig.savefig('visualizations/winrate_by_odds.png', bbox_inches='tight', dpi=300)
    plt.close(fig)

    # Value Distribution
    fig, ax = plt.subplots(figsize=(10, 6))
    sns.histplot(bets_df['value'], bins=20, kde=True, color='#3498db', ax=ax)
    ax.axvline(x=0, color='red', linestyle='--')
    ax.set_title('Distribution of Model Value Over Market', fontsize=16)
    ax.set_xlabel('Model Probability - Market Probability', fontsize=12)
    ax.set_ylabel('Count', fontsize=12)
    fig.savefig('visualizations/value_distribution.png', bbox_inches='tight', dpi=300)
    plt.close(fig)

# Bind the module-level function above as a method of NBABettingSimulator.
NBABettingSimulator._generate_visualizations = _generate_visualizations

## Performance Analysis

This method analyzes the betting performance and outputs key metrics.

In [9]:
def analyze_performance(self, bets_df, bankroll_history):
    """Print summary statistics for a simulation and save the raw results.

    Writes the bet log to ``betting_simulation_results.csv`` and the bankroll
    trajectory to ``bankroll_history.csv`` in the working directory. An empty
    bet log (no value bets found) is reported as zero bets instead of raising.

    Args:
        bets_df: Bet log with ``result``, ``bet_amount``, ``net`` and ``odds``
            columns; may be empty, with or without columns.
        bankroll_history: Bankroll before the first bet and after each bet.
    """
    print("\nBetting Performance Analysis:")

    # Basic metrics
    total_bets = len(bets_df)
    if total_bets > 0:
        wins = int((bets_df['result'] == 'WIN').sum())
        total_wagered = bets_df['bet_amount'].sum()
        total_return = bets_df['net'].sum()
        # NOTE(review): a plain mean of American odds mixes negative and
        # positive prices, so this figure is only a rough indicator.
        avg_odds = bets_df['odds'].mean()
    else:
        # An empty frame may not even carry the expected columns
        wins = 0
        total_wagered = 0.0
        total_return = 0.0
        avg_odds = float('nan')

    losses = total_bets - wins
    win_pct = wins / total_bets if total_bets > 0 else 0
    roi = (total_return / total_wagered) * 100 if total_wagered > 0 else 0

    final_bankroll = bankroll_history[-1] if len(bankroll_history) > 0 else self.initial_bankroll
    profit = final_bankroll - self.initial_bankroll
    growth = (final_bankroll / self.initial_bankroll - 1) * 100 if self.initial_bankroll else 0

    print(f"Total Bets: {total_bets}")
    print(f"Wins: {wins} ({win_pct:.1%})")
    print(f"Losses: {losses}")
    print(f"Average Odds: {avg_odds:.1f}")
    print(f"Total Wagered: ${total_wagered:,.2f}")
    print(f"Total Return: ${total_return:,.2f}")
    print(f"ROI: {roi:.2f}%")
    print(f"\nStarting Bankroll: ${self.initial_bankroll:,.2f}")
    print(f"Final Bankroll: ${final_bankroll:,.2f}")
    print(f"Profit: ${profit:,.2f}")
    print(f"Growth: {growth:.2f}%")

    # Save detailed results
    bets_df.to_csv('betting_simulation_results.csv', index=False)
    pd.DataFrame({'bankroll': bankroll_history}).to_csv('bankroll_history.csv', index=False)

    print("\nDetailed results saved to 'betting_simulation_results.csv'")
    print("Bankroll history saved to 'bankroll_history.csv'")
    if total_bets > 0:
        # The plots are only produced when at least one bet was placed
        print("\nVisualizations Generated:")
        print("- Bankroll growth: visualizations/bankroll_growth.png")
        print("- Probability calibration: visualizations/calibration_curve.png")
        print("- Win rates by odds: visualizations/winrate_by_odds.png")
        print("- Value distribution: visualizations/value_distribution.png")

# Bind the module-level function above as a method of NBABettingSimulator.
NBABettingSimulator.analyze_performance = analyze_performance

## Complete Pipeline

This method runs the complete betting simulation pipeline from data loading to performance analysis.

In [10]:
def run(self):
    """Execute the whole pipeline, from loading data to the final report.

    The data is split chronologically at the 0.8 quantile of ``date``: rows
    on or before that moment train the model, later rows are used for
    evaluation and for the betting simulation.

    Returns:
        tuple: ``(bets_df, bankroll_history)`` as produced by
        ``simulate_betting``.
    """
    data = self.load_and_preprocess_data()

    # Chronological split: earliest 80% of dates for training, the rest for testing
    game_dates = data['date']
    cutoff = game_dates.quantile(0.8)
    train_df = data.loc[game_dates <= cutoff]
    test_df = data.loc[game_dates > cutoff]

    # Training rows go first so the imputer is fitted on them
    train_features, train_target = self.prepare_features(train_df)
    test_features, test_target = self.prepare_features(test_df)

    self.train_model(train_features, train_target)
    self.evaluate_model(test_features, test_target)

    bets, history = self.simulate_betting(test_features, test_df)
    self.analyze_performance(bets, history)

    return bets, history

# Bind the module-level function above as a method of NBABettingSimulator.
NBABettingSimulator.run = run

## Run the Simulator

Now let's run the complete NBA betting simulation pipeline and examine the results.

In [None]:
# Create and run the simulator with its default settings (oddsData.csv,
# 0.05 value threshold, bankroll of 1000, 0.02 stake fraction). The returned
# bet log and bankroll trajectory are reused by the cells below.
simulator = NBABettingSimulator()
bets_df, bankroll_history = simulator.run()

## Visualize Sample Bets

Let's take a look at the first 10 bets to understand the betting strategy in action.

In [None]:
# Show the key columns of the first ten bets. The bare expression on the last
# line is not printed by Python itself; presumably this relies on the notebook
# rendering a cell's final expression.
print("\nSample Bets:")
bets_df[['date', 'team', 'vs', 'odds', 'bet_amount', 'result', 'net']].head(10)

## Examine Visualizations

Let's display some of the generated visualizations to better understand the betting performance.

In [None]:
# Display bankroll growth visualization (the PNG saved during the simulation).
# Image is imported here and reused by the following display cells.
from IPython.display import Image
Image('visualizations/bankroll_growth.png')

In [None]:
# Display probability calibration curve.
# Requires Image from IPython.display, imported in the bankroll-growth cell.
Image('visualizations/calibration_curve.png')

In [None]:
# Display win rate by odds range.
# Requires Image from IPython.display, imported in the bankroll-growth cell.
Image('visualizations/winrate_by_odds.png')

In [None]:
# Display value distribution.
# Requires Image from IPython.display, imported in the bankroll-growth cell.
Image('visualizations/value_distribution.png')

## Additional Analysis

Let's perform some additional analysis on the betting results to gain more insights.

In [None]:
# Analyze betting performance by month: amount wagered, net profit, win rate
# and ROI for each calendar month, plus a bar chart of the monthly ROI.
if len(bets_df) > 0:
    bets_df['month'] = bets_df['date'].dt.to_period('M')
    monthly_performance = (
        bets_df.groupby('month')
        .agg({
            'bet_amount': 'sum',
            'net': 'sum',
            # Share of the month's bets that won
            'result': lambda outcomes: (outcomes == 'WIN').mean(),
        })
        # Rename by name, not by position, so the labels cannot drift from
        # the columns they describe.
        .rename(columns={
            'bet_amount': 'Amount Wagered',
            'net': 'Net Profit',
            'result': 'Win Rate',
        })
    )
    monthly_performance['ROI'] = monthly_performance['Net Profit'] / monthly_performance['Amount Wagered'] * 100

    # Plot monthly ROI
    plt.figure(figsize=(12, 6))
    monthly_performance['ROI'].plot(kind='bar', color='#3498db')
    plt.title('Monthly ROI', fontsize=16)
    plt.xlabel('Month', fontsize=12)
    plt.ylabel('ROI (%)', fontsize=12)
    plt.axhline(y=0, color='red', linestyle='--')
    plt.grid(axis='y', alpha=0.3)
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()

    # Display the monthly performance table. A bare expression inside an
    # `if` block is never echoed, so the table is printed explicitly.
    print(monthly_performance)
else:
    print("No bets were placed; skipping monthly analysis.")