# Phase 1A: Exploratory Data Analysis

Analyze 3 seasons of historical NBA data to identify fantasy value drivers.

**Prerequisites:** Run `python scripts/fetch_historical_data.py` first to populate `data/`.

In [None]:
import sys
sys.path.insert(0, '..')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

from src.data.data_loader import load_dataset

# Global plotting defaults for every figure in this notebook:
# seaborn dark-grid theme with the "muted" colour palette.
sns.set_theme(style='darkgrid', palette='muted')
# Default figure size and DPI; cells that pass figsize= to plt.subplots override the size.
plt.rcParams['figure.figsize'] = (12, 6)
plt.rcParams['figure.dpi'] = 100

%matplotlib inline

In [None]:
# Load combined dataset; stop immediately if the loader signals that nothing is available.
df = load_dataset()
if df is None:
    raise FileNotFoundError('Run scripts/fetch_historical_data.py first')

# Gather the headline facts first, then report them.
season_labels = sorted(df["SEASON_YEAR"].unique())
n_players = df["PLAYER_ID"].nunique()
first_game = df["game_date_parsed"].min().date()
last_game = df["game_date_parsed"].max().date()

print(f'Shape: {df.shape}')
print(f'Seasons: {season_labels}')
print(f'Players: {n_players:,}')
print(f'Date range: {first_game} to {last_game}')
print()
# Last expression of the cell: rich preview of the first rows.
df.head()

## 1. Fantasy Points Distribution

In [None]:
# Drop missing FPTS once so both panels (and the marker lines) describe the same sample.
fpts_all = df['fpts'].dropna()
fpts_mean = fpts_all.mean()
fpts_median = fpts_all.median()

fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Left panel: overall FPTS distribution with mean/median markers.
axes[0].hist(fpts_all, bins=80, edgecolor='white', alpha=0.8)
axes[0].axvline(fpts_mean, color='red', linestyle='--', label=f'Mean: {fpts_mean:.1f}')
axes[0].axvline(fpts_median, color='orange', linestyle='--', label=f'Median: {fpts_median:.1f}')
axes[0].set_xlabel('Fantasy Points')
axes[0].set_ylabel('Frequency')
axes[0].set_title('FPTS Distribution (All Games)')
axes[0].legend()

# Right panel: FPTS by season.
# All seasons share one set of bin edges; with per-season automatic bins the
# overlaid bars would cover different intervals and not be directly comparable.
season_edges = np.histogram_bin_edges(fpts_all, bins=60)
for season in sorted(df['SEASON_YEAR'].unique()):
    # dropna() here mirrors the left panel's handling of missing values.
    season_fpts = df.loc[df['SEASON_YEAR'] == season, 'fpts'].dropna()
    axes[1].hist(season_fpts, bins=season_edges, alpha=0.5, label=season)
axes[1].set_xlabel('Fantasy Points')
axes[1].set_ylabel('Frequency')
axes[1].set_title('FPTS Distribution by Season')
axes[1].legend()

plt.tight_layout()
plt.savefig('../data/fpts_distribution.png', bbox_inches='tight')
plt.show()

print(df['fpts'].describe())

## 2. Top Players by Mean FPTS

In [None]:
# Per-player aggregates across all seasons.
# `games` is the count of non-null FPTS rows; `seasons` is the number of distinct seasons played.
player_stats = df.groupby(['PLAYER_ID', 'PLAYER_NAME']).agg(
    mean_fpts=('fpts', 'mean'),
    std_fpts=('fpts', 'std'),
    games=('fpts', 'count'),
    seasons=('SEASON_YEAR', 'nunique'),
).reset_index()

# Top 20 players by mean FPTS (min 100 games across all seasons)
top_20 = player_stats[player_stats['games'] >= 100].nlargest(20, 'mean_fpts')

# Horizontal bars with +/- one standard deviation as error bars.
fig, ax = plt.subplots(figsize=(12, 8))
ax.barh(range(len(top_20)), top_20['mean_fpts'], xerr=top_20['std_fpts'],
        capsize=3, color=sns.color_palette('muted')[0])
ax.set_yticks(range(len(top_20)))
ax.set_yticklabels(top_20['PLAYER_NAME'])
ax.set_xlabel('Mean Fantasy Points per Game')
ax.set_title('Top 20 Players by Mean FPTS (min 100 games, 3 seasons)')
ax.invert_yaxis()  # highest mean at the top
plt.tight_layout()
plt.show()

# print() is required: a bare to_string() as the cell's last expression is echoed
# as an escaped string repr ('...\n...') rather than as an aligned table.
print(top_20[['PLAYER_NAME', 'mean_fpts', 'std_fpts', 'games', 'seasons']].to_string(index=False))

## 3. Rolling Average Behavior

In [None]:
# Pick 3 representative players: top scorer, mid-tier, high-variance
qualified = player_stats[player_stats['games'] >= 100]
if qualified.empty:
    # Fail with a clear message instead of an opaque IndexError from .iloc[0] below.
    raise ValueError('No players with at least 100 games; cannot pick sample players')

star = qualified.nlargest(1, 'mean_fpts').iloc[0]
# Mid-tier = the player whose mean FPTS is closest to the median of qualified players.
median_fpts = qualified['mean_fpts'].median()
mid = qualified.loc[(qualified['mean_fpts'] - median_fpts).abs().idxmin()]
volatile = qualified.nlargest(1, 'std_fpts').iloc[0]

sample_players = [star, mid, volatile]
labels = ['Star', 'Mid-tier', 'High-variance']

# Loop-invariant: only the most recent season in the dataset is plotted.
latest_season = df['SEASON_YEAR'].max()

fig, axes = plt.subplots(3, 1, figsize=(14, 12), sharex=False)

for ax, player, label in zip(axes, sample_players, labels):
    pdata = df[(df['PLAYER_ID'] == player['PLAYER_ID']) &
               (df['SEASON_YEAR'] == latest_season)]
    pdata = pdata.sort_values('game_date_parsed')
    game_idx = range(len(pdata))

    ax.scatter(game_idx, pdata['fpts'], s=15, alpha=0.4, label='Actual FPTS')
    # Overlay whichever precomputed rolling-average columns exist in the dataset.
    for window, color in [(3, 'orange'), (5, 'red'), (10, 'green')]:
        col = f'fpts_roll_{window}'
        if col in pdata.columns:
            ax.plot(game_idx, pdata[col], label=f'L{window} avg', color=color, linewidth=1.5)
    if pdata.empty:
        # Players are selected on multi-season totals, so they may have no games
        # in the latest season; say so instead of leaving an unexplained blank panel.
        ax.text(0.5, 0.5, 'No games in latest season', transform=ax.transAxes,
                ha='center', va='center')
    ax.set_title(f'{label}: {player["PLAYER_NAME"]} (std={player["std_fpts"]:.1f})')
    ax.set_ylabel('FPTS')
    ax.legend(loc='upper right', fontsize=8)

axes[-1].set_xlabel('Game Number')
plt.tight_layout()
plt.show()

## 4. Trend Predictiveness

In [None]:
# Does the trend predict next-game performance?
# NOTE(review): each feature is correlated with `fpts` from the SAME row. That only
# measures next-game predictiveness if the rolling/trend columns are built from
# prior games only (i.e. exclude the current game) — confirm in the feature pipeline.
trend_cols = ['fpts_trend_3v10', 'fpts_trend_5v15', 'fpts_roll_3', 'fpts_roll_5', 'fpts_roll_10']
existing_cols = [c for c in trend_cols if c in df.columns]

if existing_cols:
    corr_with_fpts = df[existing_cols + ['fpts']].corr()['fpts'].drop('fpts').sort_values(ascending=False)
    print('Correlation of rolling/trend features with actual FPTS:')
    print(corr_with_fpts.to_string())
    print()

    fig, ax = plt.subplots(figsize=(8, 4))
    corr_with_fpts.plot(kind='barh', ax=ax)
    ax.set_xlabel('Correlation with FPTS')
    ax.set_title('Rolling/Trend Feature Correlation with Actual Game FPTS')
    plt.tight_layout()
    plt.show()
else:
    # The columns are treated as optional above; plotting an empty Series would raise.
    print('Rolling/trend feature columns not found in dataset')

## 5. Home vs Away Effect

In [None]:
# Split FPTS into home and away samples using the is_home flag.
home_mask = df['is_home'] == True
away_mask = df['is_home'] == False
home = df.loc[home_mask, 'fpts']
away = df.loc[away_mask, 'fpts']

home_mean = home.mean()
away_mean = away.mean()
print(f'Home mean FPTS: {home_mean:.2f} (n={len(home):,})')
print(f'Away mean FPTS: {away_mean:.2f} (n={len(away):,})')
print(f'Difference: {home_mean - away_mean:.2f}')

# Independent two-sample t-test on the non-missing values of each group.
t_stat, p_val = stats.ttest_ind(home.dropna(), away.dropna())
print(f'T-test: t={t_stat:.3f}, p={p_val:.4f}')
print(f'Significant at 0.05: {p_val < 0.05}')

# Box plot of FPTS grouped by the flag.
fig, ax = plt.subplots(figsize=(8, 5))
df.boxplot(column='fpts', by='is_home', ax=ax)
plt.suptitle('')  # clear the figure-level title that the grouped boxplot adds
ax.set_xticklabels(['Away', 'Home'])
ax.set_title('FPTS: Home vs Away')
ax.set_ylabel('Fantasy Points')
plt.tight_layout()
plt.show()

## 6. Back-to-Back Impact

In [None]:
# Split FPTS by the is_back_to_back flag.
b2b_flag = df['is_back_to_back']
b2b = df.loc[b2b_flag == True, 'fpts']
non_b2b = df.loc[b2b_flag == False, 'fpts']

b2b_mean = b2b.mean()
non_b2b_mean = non_b2b.mean()
print(f'B2B mean FPTS: {b2b_mean:.2f} (n={len(b2b):,})')
print(f'Non-B2B mean FPTS: {non_b2b_mean:.2f} (n={len(non_b2b):,})')
print(f'B2B penalty: {b2b_mean - non_b2b_mean:.2f}')

# Independent two-sample t-test on the non-missing values of each group.
t_stat, p_val = stats.ttest_ind(b2b.dropna(), non_b2b.dropna())
print(f'T-test: t={t_stat:.3f}, p={p_val:.4f}')
print(f'Significant at 0.05: {p_val < 0.05}')

## 7. Rest Day Patterns

In [None]:
# FPTS by days of rest (cap at 5+)
rest_df = df[df['days_rest'].notna()].copy()
rest_df['rest_bucket'] = rest_df['days_rest'].clip(upper=5).astype(int)
rest_df.loc[rest_df['days_rest'] >= 5, 'rest_bucket'] = 5

rest_means = rest_df.groupby('rest_bucket')['fpts'].agg(['mean', 'count', 'std'])
print(rest_means)
print()

fig, ax = plt.subplots(figsize=(8, 5))
ax.bar(rest_means.index, rest_means['mean'], yerr=rest_means['std'] / np.sqrt(rest_means['count']),
       capsize=5, color=sns.color_palette('muted'))
ax.set_xticks(rest_means.index)
ax.set_xticklabels(['1 (B2B)', '2', '3', '4', '5+'])
ax.set_xlabel('Days of Rest')
ax.set_ylabel('Mean FPTS')
ax.set_title('Fantasy Points by Days of Rest')
plt.tight_layout()
plt.show()

## 8. Opponent Defense Impact

In [None]:
if 'opp_def_pts' in df.columns:
    # Correlate every opp_def_* column with FPTS.
    opp_cols = [c for c in df.columns if c.startswith('opp_def_')]
    corr_opp = df[opp_cols + ['fpts']].corr()['fpts'].drop('fpts').sort_values(ascending=False)
    print('Correlation of opponent defense features with FPTS:')
    print(corr_opp.to_string())
    print()

    # FPTS against the lowest 20% vs the highest 20% of opp_def_pts
    # (quantile cut-offs, not a fixed number of teams).
    strong_cutoff = df['opp_def_pts'].quantile(0.2)
    weak_cutoff = df['opp_def_pts'].quantile(0.8)
    strong_def = df[df['opp_def_pts'] <= strong_cutoff]['fpts']
    weak_def = df[df['opp_def_pts'] >= weak_cutoff]['fpts']
    print(f'FPTS vs strong defense (bottom 20% pts allowed): {strong_def.mean():.2f}')
    print(f'FPTS vs weak defense (top 20% pts allowed): {weak_def.mean():.2f}')
    print(f'Difference: {weak_def.mean() - strong_def.mean():.2f}')

    # Argument order (weak, strong) matches the printed difference, so the sign
    # of t agrees with the sign of "Difference"; the p-value is unaffected.
    t_stat, p_val = stats.ttest_ind(weak_def.dropna(), strong_def.dropna())
    print(f'T-test: t={t_stat:.3f}, p={p_val:.4f}')
else:
    print('Opponent defense columns not found in dataset')

## 9. Advanced Stats as Predictors

In [None]:
adv_cols = [c for c in df.columns if c.startswith('adv_')]
if adv_cols:
    # Correlate every adv_* column with FPTS.
    corr_adv = df[adv_cols + ['fpts']].corr()['fpts'].drop('fpts').sort_values(ascending=False)
    print('Correlation of advanced stats with FPTS:')
    print(corr_adv.to_string())

    # One per-player scatter panel per spec: (source column, aggregate name, x label, title).
    scatter_specs = [
        ('adv_usg_pct', 'usg', 'Usage Rate', 'Usage Rate vs Fantasy Points'),
        ('adv_ts_pct', 'ts', 'True Shooting %', 'True Shooting vs Fantasy Points'),
    ]

    fig, axes = plt.subplots(1, 2, figsize=(12, 5))
    for ax, (col, alias, xlabel, title) in zip(axes, scatter_specs):
        if col not in df.columns:
            # Hide the panel rather than drawing an empty, unlabeled axes.
            ax.set_visible(False)
            continue
        # Average the stat over the same rows as mean_fpts. Taking 'first' would
        # pair a multi-season FPTS mean with whichever row happens to come first
        # for the player, making the plot depend on row order.
        player_avg = df.groupby('PLAYER_ID').agg(
            mean_fpts=('fpts', 'mean'), **{alias: (col, 'mean')}
        ).dropna()
        ax.scatter(player_avg[alias], player_avg['mean_fpts'], s=5, alpha=0.3)
        ax.set_xlabel(xlabel)
        ax.set_ylabel('Mean FPTS')
        ax.set_title(title)

    plt.tight_layout()
    plt.show()
else:
    print('No advanced stat columns found')

## 10. Feature Correlation Heatmap

In [None]:
# Select top numeric features for heatmap
feature_cols = [
    'fpts', 'PTS', 'REB', 'AST', 'STL', 'BLK', 'TOV', 'MIN', 'FG_PCT', 'FG3M',
    'fpts_roll_3', 'fpts_roll_5', 'fpts_roll_10',
    'fpts_std_10', 'fpts_trend_3v10',
    'days_rest',
]
# Add advanced/opponent cols if they exist
for c in ['adv_usg_pct', 'adv_ts_pct', 'adv_pace', 'opp_def_pts', 'opp_def_fg_pct']:
    if c in df.columns:
        feature_cols.append(c)

existing_features = [c for c in feature_cols if c in df.columns]
corr_matrix = df[existing_features].corr()

fig, ax = plt.subplots(figsize=(14, 10))
sns.heatmap(corr_matrix, annot=True, fmt='.2f', cmap='RdBu_r', center=0,
            square=True, ax=ax, cbar_kws={'shrink': 0.8})
ax.set_title('Feature Correlation Heatmap')
plt.tight_layout()
plt.savefig('../data/correlation_heatmap.png', bbox_inches='tight')
plt.show()

## 11. Missing Data Audit

In [None]:
# Null count per column, and the same figure as a percentage of all rows.
missing = df.isna().sum()
missing_pct = missing.div(len(df)).mul(100).round(2)
missing_report = pd.DataFrame({'count': missing, 'pct': missing_pct})

# Keep only columns that have at least one null, worst offenders first.
has_missing = missing_report['count'].gt(0)
missing_nonzero = missing_report.loc[has_missing].sort_values('pct', ascending=False)

print(f'Columns with missing data: {len(missing_nonzero)} / {len(df.columns)}')
print()
print(missing_nonzero.to_string() if len(missing_nonzero) > 0 else 'No missing data!')

## 12. Season Phase Analysis

In [None]:
if 'season_phase' in df.columns:
    # Fixed display order for the bars. reindex() keeps exactly these rows:
    # a phase missing from the data becomes a NaN row, and any phase value
    # not listed here is dropped from the table.
    phase_order = ['early', 'mid', 'late', 'final_stretch']
    phase_stats = df.groupby('season_phase')['fpts'].agg(['mean', 'std', 'count'])
    phase_stats = phase_stats.reindex(phase_order)
    print(phase_stats)
    print()

    fig, ax = plt.subplots(figsize=(8, 5))
    # Error bars show the standard error of the mean (std / sqrt(n)).
    ax.bar(range(len(phase_stats)), phase_stats['mean'],
           yerr=phase_stats['std'] / np.sqrt(phase_stats['count']),
           capsize=5, color=sns.color_palette('muted'))
    ax.set_xticks(range(len(phase_stats)))
    ax.set_xticklabels(phase_order)
    ax.set_xlabel('Season Phase')
    ax.set_ylabel('Mean FPTS')
    ax.set_title('Fantasy Points by Season Phase')
    plt.tight_layout()
    plt.show()
else:
    # Report the skip explicitly, as the other optional-column sections do.
    print('season_phase column not found in dataset')

## 13. Summary & Key Insights

**Fill in after running all cells:**

- Most predictive rolling window: _____
- Home advantage effect: _____ FPTS (significant: yes/no)
- Back-to-back penalty: _____ FPTS (significant: yes/no)
- Opponent defense impact: _____ FPTS difference (strong vs weak)
- Most correlated advanced stat: _____
- Key non-obvious finding: _____