In [None]:
# NBA Betting Model Analysis & Visualization

This notebook provides comprehensive analysis and visualization of the NBA betting model data.

**Current Section**: Data Loading & Overview  
**Last Updated**: July 2025


In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import warnings
from pathlib import Path
import sqlite3
from datetime import datetime, date
import yaml

# Plotting style. The 'seaborn-v0_8' name only exists on matplotlib >= 3.6;
# older releases ship the same style sheet under the legacy name 'seaborn'.
plt.style.use('seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn')
sns.set_palette("husl")
# NOTE(review): this silences every warning for the rest of the session,
# including deprecation notices from pandas/matplotlib. Consider narrowing it
# to specific categories once the noisy sources are known.
warnings.filterwarnings('ignore')

# Pandas display: show all columns and allow wide console output.
pd.set_option('display.max_columns', None)
pd.set_option('display.width', 1000)

# Paths are resolved relative to the kernel's working directory: the project
# root is taken to be its parent directory.
PROJECT_ROOT = Path('.').absolute().parent
DATA_PATH = PROJECT_ROOT / 'data'
PROCESSED_PATH = DATA_PATH / 'processed'
RAW_PATH = DATA_PATH / 'raw'

print("🏀 NBA Betting Analysis Setup Complete")
print(f"Project Root: {PROJECT_ROOT}")
print(f"Data Path: {DATA_PATH}")
print("="*50)


In [None]:
# Load every dataset used by the cells below. Each loader prints a status line
# and leaves its variable set to None when the file is absent, so downstream
# cells can guard with `is not None`.


def _load_games_csv(csv_path, loaded_label, missing_message):
    """Read a games CSV and parse its ``game_date`` column as datetimes.

    Args:
        csv_path: ``pathlib.Path`` of the CSV file to read.
        loaded_label: Text inserted into the success message.
        missing_message: Text printed (after the cross mark) when the file
            does not exist.

    Returns:
        The loaded DataFrame, or ``None`` when ``csv_path`` does not exist.
    """
    if not csv_path.exists():
        print(f"❌ {missing_message}")
        return None
    games = pd.read_csv(csv_path)
    games['game_date'] = pd.to_datetime(games['game_date'])
    print(f"✅ Loaded {loaded_label}: {games.shape}")
    return games


# Load processed features data
features_file = PROCESSED_PATH / 'nba_features.parquet'
if features_file.exists():
    features_df = pd.read_parquet(features_file)
    print(f"✅ Loaded processed features: {features_df.shape}")
else:
    print("❌ No processed features found")
    features_df = None

# Load recent games data (2024-2025 season)
games_file = PROCESSED_PATH / 'games_2024_2025.csv'
games_df = _load_games_csv(games_file, "2024-2025 games", "No recent games data found")

# Load historical games data (2020-2023)
historical_file = PROCESSED_PATH / 'games_2020_2023.csv'
historical_df = _load_games_csv(historical_file, "historical games", "No historical games data found")

# Check for performance database (only its presence is checked here; it is not opened)
performance_db = DATA_PATH / 'performance.db'
performance_available = performance_db.exists()
if performance_available:
    print(f"✅ Found performance database: {performance_db}")
else:
    print("❌ No performance database found")

print("\n" + "="*50)
print("📊 Data Summary:")
if features_df is not None:
    print(f"• Features: {features_df.shape[0]:,} records, {features_df.shape[1]} features")
if games_df is not None:
    print(f"• Recent Games: {games_df.shape[0]:,} games (2024-2025)")
if historical_df is not None:
    print(f"• Historical Games: {historical_df.shape[0]:,} games (2020-2023)")
print(f"• Performance Tracking: {'Available' if performance_available else 'Not Available'}")
print("="*50)


In [None]:
# Explore features data structure: overall shape, column categories, a sample
# of rows and the distribution of the `target_win` column.
if features_df is not None:
    print("FEATURES DATA EXPLORATION")
    print("="*50)

    print("Dataset Info:")
    print(f"• Shape: {features_df.shape}")
    print(f"• Date Range: {features_df['game_date'].min()} to {features_df['game_date'].max()}")
    print(f"• Unique Teams: {features_df['team_name'].nunique()}")
    print(f"• Total Games: {features_df['game_id'].nunique()}")

    print("\nFeature Categories:")
    feature_cols = features_df.columns.tolist()

    # Categorize features by substring / exact name. A column whose name
    # contains both 'last_' and 'season_' is counted in both of those groups.
    basic_info_names = {'game_id', 'game_date', 'team_name', 'opponent', 'venue', 'is_home', 'target_win'}
    rolling_features = [col for col in feature_cols if 'last_' in col]
    season_features = [col for col in feature_cols if 'season_' in col]
    basic_features = [col for col in feature_cols if col in basic_info_names]
    # Build the lookup once instead of concatenating three lists per column.
    categorized = set(rolling_features) | set(season_features) | set(basic_features)
    other_features = [col for col in feature_cols if col not in categorized]

    print(f"• Basic Info: {len(basic_features)} features")
    print(f"• Rolling Stats: {len(rolling_features)} features")
    print(f"• Season Stats: {len(season_features)} features")
    print(f"• Other Features: {len(other_features)} features")

    print("\nSample Data:")
    display(features_df.head())

    print("\nTarget Variable Distribution:")
    win_rate = features_df['target_win'].mean()
    print(f"• Overall Win Rate: {win_rate:.1%}")
    if 'is_home' in features_df.columns:
        # Single .loc selection (row mask + column) rather than chained indexing.
        home_win_rate = features_df.loc[features_df['is_home'] == 1, 'target_win'].mean()
        away_win_rate = features_df.loc[features_df['is_home'] == 0, 'target_win'].mean()
        print(f"• Home Win Rate: {home_win_rate:.1%}")
        print(f"• Away Win Rate: {away_win_rate:.1%}")
else:
    print("❌ No features data available for exploration")


In [None]:
## 2. Team Performance Analysis

## Let's analyze team performance across different metrics and create visualizations to understand team strengths and weaknesses.


In [None]:
# Team Performance Analysis: one row per team with per-team means of the
# feature columns, plus derived point differentials.
if features_df is not None:
    # Named aggregation assigns each output column its final name directly, so
    # the labels cannot drift out of sync with the aggregation spec (the
    # previous version flattened a MultiIndex and then renamed by position).
    team_stats = (
        features_df
        .groupby('team_name')
        .agg(
            games_played=('target_win', 'count'),
            win_rate=('target_win', 'mean'),
            season_win_pct=('season_win_pct', 'mean'),
            season_avg_pts=('season_avg_pts', 'mean'),
            season_avg_pts_allowed=('season_avg_pts_allowed', 'mean'),
            recent_avg_pts=('avg_pts_last_10', 'mean'),
            recent_avg_pts_allowed=('avg_pts_allowed_last_10', 'mean'),
            recent_win_pct=('win_pct_last_10', 'mean'),
            home_game_pct=('is_home', 'mean'),  # Home game percentage
        )
        .round(3)
        .reset_index()
    )

    # Point differentials are computed from the already-rounded averages.
    team_stats['season_point_diff'] = team_stats['season_avg_pts'] - team_stats['season_avg_pts_allowed']
    team_stats['recent_point_diff'] = team_stats['recent_avg_pts'] - team_stats['recent_avg_pts_allowed']

    print("TEAM STATISTICS CALCULATED")
    print("="*50)
    print(f"Analyzed {len(team_stats)} teams")
    # Every column except `team_name` counts as a metric.
    print(f"Calculated {len(team_stats.columns)-1} performance metrics per team")

    # Unweighted means across teams (each team counts once).
    print("\nLeague Averages:")
    print(f"• Average Win Rate: {team_stats['win_rate'].mean():.1%}")
    print(f"• Average Points Per Game: {team_stats['season_avg_pts'].mean():.1f}")
    print(f"• Average Points Allowed: {team_stats['season_avg_pts_allowed'].mean():.1f}")
    print(f"• Average Point Differential: {team_stats['season_point_diff'].mean():+.1f}")

else:
    print("No features data available for team analysis")


In [None]:
# Ranking tables: top-10 teams by win rate, scoring, defense and point
# differential, followed by league-wide ranges. Requires `team_stats` from the
# team-statistics cell.
if features_df is None or 'team_stats' not in locals():
    print("Team statistics not available for display")
else:
    _divider = "=" * 50

    # Best win rates.
    print("TOP TEAMS BY WIN RATE", _divider, sep="\n")
    top_win_rate = team_stats.nlargest(10, 'win_rate').loc[
        :, ['team_name', 'games_played', 'win_rate', 'season_win_pct', 'season_point_diff']
    ]
    display(top_win_rate)

    # Most points scored per game.
    print("\nHIGHEST SCORING TEAMS", _divider, sep="\n")
    top_scoring = team_stats.nlargest(10, 'season_avg_pts').loc[
        :, ['team_name', 'season_avg_pts', 'season_avg_pts_allowed', 'season_point_diff']
    ]
    display(top_scoring)

    # Fewest points allowed per game.
    print("\nBEST DEFENSIVE TEAMS (Lowest Points Allowed)", _divider, sep="\n")
    best_defense = team_stats.nsmallest(10, 'season_avg_pts_allowed').loc[
        :, ['team_name', 'season_avg_pts_allowed', 'season_avg_pts', 'season_point_diff']
    ]
    display(best_defense)

    # Widest scoring margin.
    print("\nLARGEST POINT DIFFERENTIALS", _divider, sep="\n")
    best_diff = team_stats.nlargest(10, 'season_point_diff').loc[
        :, ['team_name', 'season_point_diff', 'season_avg_pts', 'season_avg_pts_allowed', 'win_rate']
    ]
    display(best_diff)

    # League-wide min/max ranges.
    print("\nLEAGUE SUMMARY STATISTICS", _divider, sep="\n")
    print(
        f"Teams analyzed: {len(team_stats)}",
        f"Win rate range: {team_stats['win_rate'].min():.1%} - {team_stats['win_rate'].max():.1%}",
        f"Scoring range: {team_stats['season_avg_pts'].min():.1f} - {team_stats['season_avg_pts'].max():.1f} PPG",
        f"Point diff range: {team_stats['season_point_diff'].min():+.1f} to {team_stats['season_point_diff'].max():+.1f}",
        sep="\n",
    )
