In [None]:
# NBA Betting Model Analysis & Visualization

This notebook provides comprehensive analysis and visualization of the NBA betting model data.

**Current Section**: Data Loading & Overview  
**Last Updated**: August 2025


In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import warnings
from pathlib import Path
import sqlite3
from datetime import datetime, date
import yaml

# Plot appearance: apply the matplotlib style sheet, then the seaborn palette
# (kept in this order so the palette call runs after the style sheet).
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

# Suppress every warning for the remainder of the session.
warnings.filterwarnings('ignore')

# Pandas display: no cap on the number of columns shown, wide output lines.
pd.set_option('display.width', 1000)
pd.set_option('display.max_columns', None)

# Data directories, all derived from the parent of the current working directory.
PROJECT_ROOT = Path.cwd().parent
DATA_PATH = PROJECT_ROOT.joinpath('data')
PROCESSED_PATH = DATA_PATH.joinpath('processed')
RAW_PATH = DATA_PATH.joinpath('raw')

# Setup banner: one call, one line per argument.
print(
    "🏀 NBA Betting Analysis Setup Complete",
    f"Project Root: {PROJECT_ROOT}",
    f"Data Path: {DATA_PATH}",
    "="*50,
    sep="\n",
)


In [None]:
# Load every dataset the notebook uses. A missing file is reported and the
# corresponding variable is set to None so later cells can check for it.

def _load_games_csv(csv_path, found_label, missing_message):
    """Read a games CSV and convert its ``game_date`` column to datetimes.

    Args:
        csv_path: ``Path`` of the CSV file to read.
        found_label: Text inserted into the "✅ Loaded ...: <shape>" status line.
        missing_message: Full status line printed when the file does not exist.

    Returns:
        The loaded DataFrame, or ``None`` when ``csv_path`` does not exist.
    """
    if not csv_path.exists():
        print(missing_message)
        return None
    frame = pd.read_csv(csv_path)
    frame['game_date'] = pd.to_datetime(frame['game_date'])
    print(f"✅ Loaded {found_label}: {frame.shape}")
    return frame


# Processed feature matrix (parquet)
features_file = PROCESSED_PATH / 'nba_features.parquet'
features_df = pd.read_parquet(features_file) if features_file.exists() else None
if features_df is None:
    print("❌ No processed features found")
else:
    print(f"✅ Loaded processed features: {features_df.shape}")

# Recent games (2024-2025 season)
games_file = PROCESSED_PATH / 'games_2024_2025.csv'
games_df = _load_games_csv(games_file, "2024-2025 games", "❌ No recent games data found")

# Historical games (2020-2023)
historical_file = PROCESSED_PATH / 'games_2020_2023.csv'
historical_df = _load_games_csv(historical_file, "historical games", "❌ No historical games data found")

# Performance database: only its presence is checked here, it is not opened
performance_db = DATA_PATH / 'performance.db'
performance_available = performance_db.exists()
if performance_available:
    print(f"✅ Found performance database: {performance_db}")
else:
    print("❌ No performance database found")

# Summary of whatever was found
print(f"\n{'=' * 50}")
print("📊 Data Summary:")
if features_df is not None:
    n_records, n_features = features_df.shape
    print(f"• Features: {n_records:,} records, {n_features} features")
if games_df is not None:
    print(f"• Recent Games: {len(games_df):,} games (2024-2025)")
if historical_df is not None:
    print(f"• Historical Games: {len(historical_df):,} games (2020-2023)")
tracking_status = 'Available' if performance_available else 'Not Available'
print(f"• Performance Tracking: {tracking_status}")
print("=" * 50)


In [None]:
# Explore features data structure: size, date span, column categories,
# a sample of rows and the distribution of the target variable.
if features_df is None:
    print("❌ No features data available for exploration")
else:
    print("FEATURES DATA EXPLORATION")
    print("=" * 50)

    print("Dataset Info:")
    print(f"• Shape: {features_df.shape}")
    print(f"• Date Range: {features_df['game_date'].min()} to {features_df['game_date'].max()}")
    print(f"• Unique Teams: {features_df['team_name'].nunique()}")
    print(f"• Total Games: {features_df['game_id'].nunique()}")

    print("\nFeature Categories:")
    feature_cols = list(features_df.columns)

    # Bucket columns by naming convention; anything unmatched lands in "other"
    basic_names = {'game_id', 'game_date', 'team_name', 'opponent', 'venue', 'is_home', 'target_win'}
    rolling_features = [name for name in feature_cols if 'last_' in name]
    season_features = [name for name in feature_cols if 'season_' in name]
    basic_features = [name for name in feature_cols if name in basic_names]
    already_categorized = set(rolling_features) | set(season_features) | set(basic_features)
    other_features = [name for name in feature_cols if name not in already_categorized]

    print(f"• Basic Info: {len(basic_features)} features")
    print(f"• Rolling Stats: {len(rolling_features)} features")
    print(f"• Season Stats: {len(season_features)} features")
    print(f"• Other Features: {len(other_features)} features")

    print("\nSample Data:")
    display(features_df.head())

    print("\nTarget Variable Distribution:")
    win_rate = features_df['target_win'].mean()
    print(f"• Overall Win Rate: {win_rate:.1%}")
    if 'is_home' in features_df.columns:
        # Split the target by the is_home flag (1 = home rows, 0 = away rows)
        home_rate = features_df.loc[features_df['is_home'] == 1, 'target_win'].mean()
        print(f"• Home Win Rate: {home_rate:.1%}")
        away_rate = features_df.loc[features_df['is_home'] == 0, 'target_win'].mean()
        print(f"• Away Win Rate: {away_rate:.1%}")


In [None]:
# Feature Correlation Analysis
#
# Ranks every numeric feature by the magnitude of its correlation with
# `target_win` (DataFrame.corr with its default method), prints the strongest
# and weakest features, and draws a heatmap of the ten strongest.
#
# Features whose correlation is undefined (NaN, e.g. a constant column) are
# dropped from the ranking; otherwise they sort to the end and are printed as
# "+nan" in the bottom list.
if features_df is not None:
    print("FEATURE CORRELATION ANALYSIS")
    print("="*50)

    # Select numeric features for correlation analysis
    numeric_cols = features_df.select_dtypes(include=[np.number]).columns.tolist()
    # Remove non-feature columns (identifier and the target itself)
    exclude_cols = ['game_id', 'target_win']
    numeric_features = [col for col in numeric_cols if col not in exclude_cols]

    print(f"Analyzing correlations for {len(numeric_features)} numeric features")

    # Calculate correlation matrix (target appended so its column is available)
    corr_matrix = features_df[numeric_features + ['target_win']].corr()

    # Correlations with the target, strongest magnitude first, NaN removed
    raw_target_correlations = corr_matrix['target_win'].drop('target_win')
    undefined_count = int(raw_target_correlations.isna().sum())
    target_correlations = raw_target_correlations.dropna().sort_values(key=abs, ascending=False)
    if undefined_count:
        print(f"Skipped {undefined_count} features with undefined correlation (e.g. constant columns)")

    if target_correlations.empty:
        # idxmax/idxmin below would raise on an empty series
        print("No numeric features with a defined correlation to target_win")
    else:
        print("\nTOP 15 FEATURES CORRELATED WITH WINS:")
        print("-" * 50)
        for i, (feature, corr) in enumerate(target_correlations.head(15).items(), 1):
            print(f"{i:2d}. {feature:<30} {corr:+.4f}")

        # The series is ordered by |correlation|, so the tail holds the features
        # closest to zero correlation, not the most negative ones.
        print("\nBOTTOM 10 FEATURES CORRELATED WITH WINS:")
        print("-" * 50)
        for i, (feature, corr) in enumerate(target_correlations.tail(10).items(), 1):
            print(f"{i:2d}. {feature:<30} {corr:+.4f}")

        # Create correlation heatmap for top features
        top_features = target_correlations.head(10).index.tolist() + ['target_win']
        top_corr_matrix = features_df[top_features].corr()

        plt.figure(figsize=(12, 10))
        sns.heatmap(top_corr_matrix, annot=True, cmap='RdBu_r', center=0,
                    square=True, fmt='.3f', cbar_kws={'label': 'Correlation Coefficient'})
        plt.title('Correlation Matrix: Top 10 Features vs Target Win', fontsize=14, fontweight='bold')
        plt.tight_layout()
        plt.show()

        # Feature importance insights (signed extremes, independent of sort order)
        print("\nKEY INSIGHTS:")
        print("-" * 50)
        strongest_positive = target_correlations.idxmax()
        strongest_negative = target_correlations.idxmin()
        print(f"• Strongest positive predictor: {strongest_positive} ({target_correlations[strongest_positive]:+.4f})")
        print(f"• Strongest negative predictor: {strongest_negative} ({target_correlations[strongest_negative]:+.4f})")
        print(f"• Features with correlation > 0.1: {int((target_correlations > 0.1).sum())}")
        print(f"• Features with correlation < -0.1: {int((target_correlations < -0.1).sum())}")

else:
    print("❌ No features data available for correlation analysis")

In [None]:
# Team Performance Visualization
#
# Aggregates features_df per team and draws a 2x2 figure:
#   [0,0] win rate vs point differential   [0,1] points scored vs allowed
#   [1,0] season vs last-10 win percentage [1,1] teams ranked by win rate
# Both colour-coded scatter panels get a colorbar so the colour encoding named
# in their titles can actually be read.
if features_df is not None:
    print("TEAM PERFORMANCE VISUALIZATIONS")
    print("="*50)

    # Calculate team-level aggregations for visualization
    team_performance = features_df.groupby('team_name').agg({
        'target_win': ['count', 'mean'],
        'season_win_pct': 'mean',
        'season_avg_pts': 'mean',
        'season_avg_pts_allowed': 'mean',
        'win_pct_last_10': 'mean',
        'is_home': 'mean'
    }).round(3)

    # Flatten column names (order must match the aggregation spec above)
    team_performance.columns = ['games_played', 'win_rate', 'season_win_pct',
                               'avg_pts_scored', 'avg_pts_allowed', 'recent_win_pct', 'home_pct']
    team_performance = team_performance.reset_index()

    # Add point differential
    team_performance['point_differential'] = team_performance['avg_pts_scored'] - team_performance['avg_pts_allowed']

    # Create subplots for multiple visualizations
    fig, axes = plt.subplots(2, 2, figsize=(16, 12))
    fig.suptitle('NBA Team Performance Analysis', fontsize=16, fontweight='bold')

    # 1. Win Rate vs Point Differential
    scatter = axes[0,0].scatter(team_performance['point_differential'],
                               team_performance['win_rate'],
                               s=team_performance['games_played']*2,
                               alpha=0.7, c=team_performance['avg_pts_scored'],
                               cmap='viridis')
    axes[0,0].set_xlabel('Point Differential (PPG)')
    axes[0,0].set_ylabel('Win Rate')
    axes[0,0].set_title('Win Rate vs Point Differential\n(Size = Games Played, Color = Avg Points)')
    fig.colorbar(scatter, ax=axes[0,0], label='Avg Points Scored')

    # Label only the outliers: |point differential| above 8, or win rate
    # above 70% / below 30%
    is_extreme = (
        (team_performance['point_differential'] > 8)
        | (team_performance['point_differential'] < -8)
        | (team_performance['win_rate'] > 0.7)
        | (team_performance['win_rate'] < 0.3)
    )
    for team in team_performance[is_extreme].itertuples(index=False):
        axes[0,0].annotate(team.team_name, (team.point_differential, team.win_rate),
                          xytext=(5, 5), textcoords='offset points', fontsize=8)

    # 2. Offensive vs Defensive Performance
    offense_defense_scatter = axes[0,1].scatter(team_performance['avg_pts_scored'],
                                               team_performance['avg_pts_allowed'],
                                               s=100, alpha=0.7,
                                               c=team_performance['win_rate'], cmap='RdYlGn')
    axes[0,1].set_xlabel('Average Points Scored')
    axes[0,1].set_ylabel('Average Points Allowed')
    axes[0,1].set_title('Offensive vs Defensive Performance\n(Color = Win Rate)')
    fig.colorbar(offense_defense_scatter, ax=axes[0,1], label='Win Rate')

    # Add diagonal line for reference (points scored == points allowed)
    min_pts = min(team_performance['avg_pts_scored'].min(), team_performance['avg_pts_allowed'].min())
    max_pts = max(team_performance['avg_pts_scored'].max(), team_performance['avg_pts_allowed'].max())
    axes[0,1].plot([min_pts, max_pts], [min_pts, max_pts], 'k--', alpha=0.5, label='Equal Offense/Defense')
    axes[0,1].legend()

    # 3. Season vs Recent Performance
    axes[1,0].scatter(team_performance['season_win_pct'],
                     team_performance['recent_win_pct'],
                     s=100, alpha=0.7)
    axes[1,0].set_xlabel('Season Win Percentage')
    axes[1,0].set_ylabel('Recent Win Percentage (Last 10)')
    axes[1,0].set_title('Season vs Recent Performance')

    # Add diagonal line (recent == season)
    axes[1,0].plot([0, 1], [0, 1], 'k--', alpha=0.5, label='Equal Performance')
    axes[1,0].legend()

    # Teams whose last-10 win percentage is more than 0.1 above / below their
    # season win percentage
    improving = team_performance[team_performance['recent_win_pct'] > team_performance['season_win_pct'] + 0.1]
    declining = team_performance[team_performance['recent_win_pct'] < team_performance['season_win_pct'] - 0.1]

    for team in improving.itertuples(index=False):
        axes[1,0].annotate(f"{team.team_name} ↑",
                          (team.season_win_pct, team.recent_win_pct),
                          xytext=(5, 5), textcoords='offset points',
                          fontsize=8, color='green')

    for team in declining.itertuples(index=False):
        axes[1,0].annotate(f"{team.team_name} ↓",
                          (team.season_win_pct, team.recent_win_pct),
                          xytext=(5, -15), textcoords='offset points',
                          fontsize=8, color='red')

    # 4. Team win rates, ranked (red < 0.4, orange < 0.6, green otherwise)
    team_performance_sorted = team_performance.sort_values('win_rate', ascending=True)
    colors = ['red' if x < 0.4 else 'orange' if x < 0.6 else 'green' for x in team_performance_sorted['win_rate']]

    bars = axes[1,1].barh(range(len(team_performance_sorted)),
                         team_performance_sorted['win_rate'],
                         color=colors, alpha=0.7)
    axes[1,1].set_yticks(range(len(team_performance_sorted)))
    axes[1,1].set_yticklabels(team_performance_sorted['team_name'], fontsize=8)
    axes[1,1].set_xlabel('Win Rate')
    axes[1,1].set_title('Team Win Rates (Ranked)')
    axes[1,1].axvline(x=0.5, color='black', linestyle='--', alpha=0.5, label='0.500')
    axes[1,1].legend()

    plt.tight_layout()
    plt.show()

    # Summary insights
    print("\nKEY PERFORMANCE INSIGHTS:")
    print("-" * 50)
    best_team = team_performance.loc[team_performance['win_rate'].idxmax()]
    worst_team = team_performance.loc[team_performance['win_rate'].idxmin()]

    print(f"• Best Team: {best_team['team_name']} ({best_team['win_rate']:.1%} win rate)")
    print(f"• Worst Team: {worst_team['team_name']} ({worst_team['win_rate']:.1%} win rate)")
    print(f"• Teams improving recently: {len(improving)}")
    print(f"• Teams declining recently: {len(declining)}")
    print(f"• Average point differential: {team_performance['point_differential'].mean():+.1f}")

else:
    print("❌ No features data available for team performance visualization")

In [None]:
## 2. Team Performance Analysis

## Let's analyze team performance across different metrics and create visualizations to understand team strengths and weaknesses.


In [None]:
# Team Performance Analysis: build the per-team `team_stats` table.
if features_df is None:
    print("No features data available for team analysis")
else:
    # One row per team. Named aggregation yields the final column names
    # directly; values are rounded to 3 decimals before the differentials
    # are derived from them.
    team_stats = (
        features_df
        .groupby('team_name')
        .agg(
            games_played=('target_win', 'count'),
            win_rate=('target_win', 'mean'),
            season_win_pct=('season_win_pct', 'mean'),
            season_avg_pts=('season_avg_pts', 'mean'),
            season_avg_pts_allowed=('season_avg_pts_allowed', 'mean'),
            recent_avg_pts=('avg_pts_last_10', 'mean'),
            recent_avg_pts_allowed=('avg_pts_allowed_last_10', 'mean'),
            recent_win_pct=('win_pct_last_10', 'mean'),
            home_game_pct=('is_home', 'mean'),  # share of rows flagged is_home
        )
        .round(3)
        .reset_index()
        .assign(
            # Point differential = points scored minus points allowed
            season_point_diff=lambda t: t['season_avg_pts'] - t['season_avg_pts_allowed'],
            recent_point_diff=lambda t: t['recent_avg_pts'] - t['recent_avg_pts_allowed'],
        )
    )

    print("TEAM STATISTICS CALCULATED")
    print("=" * 50)
    print(f"Analyzed {len(team_stats)} teams")
    # Every column except team_name counts as a metric
    print(f"Calculated {team_stats.shape[1] - 1} performance metrics per team")

    # League-wide means of the per-team values
    print("\nLeague Averages:")
    league_win_rate = team_stats['win_rate'].mean()
    league_pts = team_stats['season_avg_pts'].mean()
    league_pts_allowed = team_stats['season_avg_pts_allowed'].mean()
    league_point_diff = team_stats['season_point_diff'].mean()
    print(f"• Average Win Rate: {league_win_rate:.1%}")
    print(f"• Average Points Per Game: {league_pts:.1f}")
    print(f"• Average Points Allowed: {league_pts_allowed:.1f}")
    print(f"• Average Point Differential: {league_point_diff:+.1f}")


In [None]:
# Display top teams rankings and performance tables.
# Requires `team_stats` to have been built by an earlier cell.
if features_df is None or 'team_stats' not in locals():
    print("Team statistics not available for display")
else:
    # Each table: pick the columns to show, then keep the 10 best rows.
    print("TOP TEAMS BY WIN RATE")
    print("=" * 50)
    win_rate_cols = ['team_name', 'games_played', 'win_rate', 'season_win_pct', 'season_point_diff']
    top_win_rate = team_stats[win_rate_cols].nlargest(10, 'win_rate')
    display(top_win_rate)

    print("\nHIGHEST SCORING TEAMS")
    print("=" * 50)
    scoring_cols = ['team_name', 'season_avg_pts', 'season_avg_pts_allowed', 'season_point_diff']
    top_scoring = team_stats[scoring_cols].nlargest(10, 'season_avg_pts')
    display(top_scoring)

    print("\nBEST DEFENSIVE TEAMS (Lowest Points Allowed)")
    print("=" * 50)
    defense_cols = ['team_name', 'season_avg_pts_allowed', 'season_avg_pts', 'season_point_diff']
    best_defense = team_stats[defense_cols].nsmallest(10, 'season_avg_pts_allowed')
    display(best_defense)

    print("\nLARGEST POINT DIFFERENTIALS")
    print("=" * 50)
    diff_cols = ['team_name', 'season_point_diff', 'season_avg_pts', 'season_avg_pts_allowed', 'win_rate']
    best_diff = team_stats[diff_cols].nlargest(10, 'season_point_diff')
    display(best_diff)

    # League-wide ranges across all teams
    print("\nLEAGUE SUMMARY STATISTICS")
    print("=" * 50)
    win_rates = team_stats['win_rate']
    scoring = team_stats['season_avg_pts']
    point_diffs = team_stats['season_point_diff']
    print(f"Teams analyzed: {len(team_stats)}")
    print(f"Win rate range: {win_rates.min():.1%} - {win_rates.max():.1%}")
    print(f"Scoring range: {scoring.min():.1f} - {scoring.max():.1f} PPG")
    print(f"Point diff range: {point_diffs.min():+.1f} to {point_diffs.max():+.1f}")
