In [None]:
# NBA Betting Model Analysis & Visualization

This notebook provides comprehensive analysis and visualization of the NBA betting model data.

**Current Section**: Data Loading & Overview  
**Last Updated**: July 2025


In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import warnings
from pathlib import Path
import sqlite3
from datetime import datetime, date
import yaml

# Plot appearance: matplotlib style sheet first, then the seaborn colour palette.
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

# Suppress all warnings for the remainder of the session.
warnings.filterwarnings('ignore')

# pandas display options: no cap on displayed columns, wide output.
pd.set_option('display.width', 1000)
pd.set_option('display.max_columns', None)

# Directory layout, anchored one level above the current working directory.
PROJECT_ROOT = Path.cwd().parent
DATA_PATH = PROJECT_ROOT.joinpath('data')
PROCESSED_PATH = DATA_PATH.joinpath('processed')
RAW_PATH = DATA_PATH.joinpath('raw')

# Banner confirming the setup ran and showing the resolved locations.
print(
    "🏀 NBA Betting Analysis Setup Complete",
    f"Project Root: {PROJECT_ROOT}",
    f"Data Path: {DATA_PATH}",
    "=" * 50,
    sep="\n",
)


In [None]:
def _load_games_csv(csv_path, loaded_label, missing_label):
    """Read a games CSV and convert its ``game_date`` column to datetimes.

    Prints a one-line status message either way, so the notebook output
    shows which inputs were found.

    Parameters
    ----------
    csv_path : pathlib.Path
        Location of the CSV file to read.
    loaded_label : str
        Text inserted into the success message ("✅ Loaded <label>: <shape>").
    missing_label : str
        Text inserted into the not-found message ("❌ No <label> data found").

    Returns
    -------
    pandas.DataFrame or None
        The loaded frame, or ``None`` when ``csv_path`` does not exist.
    """
    if not csv_path.exists():
        print(f"❌ No {missing_label} data found")
        return None
    games = pd.read_csv(csv_path)
    games['game_date'] = pd.to_datetime(games['game_date'])
    print(f"✅ Loaded {loaded_label}: {games.shape}")
    return games


# Load processed features data
features_file = PROCESSED_PATH / 'nba_features.parquet'
if features_file.exists():
    features_df = pd.read_parquet(features_file)
    print(f"✅ Loaded processed features: {features_df.shape}")
else:
    print("❌ No processed features found")
    features_df = None

# Load recent games data (2024-2025 season)
games_file = PROCESSED_PATH / 'games_2024_2025.csv'
games_df = _load_games_csv(games_file, "2024-2025 games", "recent games")

# Load historical games data (2020-2023)
historical_file = PROCESSED_PATH / 'games_2020_2023.csv'
historical_df = _load_games_csv(historical_file, "historical games", "historical games")

# Check for performance database (only its presence is recorded here; it is not opened)
performance_db = DATA_PATH / 'performance.db'
performance_available = performance_db.exists()
if performance_available:
    print(f"✅ Found performance database: {performance_db}")
else:
    print("❌ No performance database found")

# Summary of everything that was found; datasets that failed to load are omitted.
print("\n" + "="*50)
print("📊 Data Summary:")
if features_df is not None:
    print(f"• Features: {features_df.shape[0]:,} records, {features_df.shape[1]} features")
if games_df is not None:
    print(f"• Recent Games: {games_df.shape[0]:,} games (2024-2025)")
if historical_df is not None:
    print(f"• Historical Games: {historical_df.shape[0]:,} games (2020-2023)")
print(f"• Performance Tracking: {'Available' if performance_available else 'Not Available'}")
print("="*50)


In [None]:
# Explore features data structure: overall shape, date/team/game coverage,
# a rough grouping of columns by naming pattern, a sample of rows, and win rates.
if features_df is not None:
    print("FEATURES DATA EXPLORATION")
    print("="*50)

    print("Dataset Info:")
    print(f"• Shape: {features_df.shape}")
    print(f"• Date Range: {features_df['game_date'].min()} to {features_df['game_date'].max()}")
    print(f"• Unique Teams: {features_df['team_name'].nunique()}")
    print(f"• Total Games: {features_df['game_id'].nunique()}")

    print("\nFeature Categories:")
    feature_cols = features_df.columns.tolist()

    # Categorize features by substring match on the column name.
    # A column whose name contains both 'last_' and 'season_' lands in both lists.
    rolling_features = [col for col in feature_cols if 'last_' in col]
    season_features = [col for col in feature_cols if 'season_' in col]
    basic_features = [col for col in feature_cols if col in ['game_id', 'game_date', 'team_name', 'opponent', 'venue', 'is_home', 'target_win']]

    # Build the set of already-categorized names once, instead of re-concatenating
    # the three lists for every column inside the comprehension.
    categorized_features = set(rolling_features) | set(season_features) | set(basic_features)
    other_features = [col for col in feature_cols if col not in categorized_features]

    print(f"• Basic Info: {len(basic_features)} features")
    print(f"• Rolling Stats: {len(rolling_features)} features")
    print(f"• Season Stats: {len(season_features)} features")
    print(f"• Other Features: {len(other_features)} features")

    print("\nSample Data:")
    display(features_df.head())

    print("\nTarget Variable Distribution:")
    win_rate = features_df['target_win'].mean()
    print(f"• Overall Win Rate: {win_rate:.1%}")
    if 'is_home' in features_df.columns:
        # Select rows and the target column in a single .loc call rather than
        # chaining a boolean filter with a second column lookup.
        home_rows = features_df['is_home'] == 1
        away_rows = features_df['is_home'] == 0
        print(f"• Home Win Rate: {features_df.loc[home_rows, 'target_win'].mean():.1%}")
        print(f"• Away Win Rate: {features_df.loc[away_rows, 'target_win'].mean():.1%}")
else:
    print("❌ No features data available for exploration")
