In [121]:
import pandas as pd

data_path = 'data/processed'

games = pd.read_csv(f'{data_path}/games_cleaned.csv')

games['GAME_DATE_EST'] = pd.to_datetime(games['GAME_DATE_EST'])



In [122]:
# Home games
home_games = games[[
    'GAME_DATE_EST', 'SEASON', 'GAME_ID', 'HOME_TEAM_ID',
    'PTS_home', 'FG_PCT_home', 'FT_PCT_home', 'FG3_PCT_home',
    'AST_home', 'REB_home'
]].copy()

home_games.columns = [
    'date', 'season', 'game_id', 'team_id',
    'pts', 'fg_pct', 'ft_pct', 'fg3_pct', 'ast', 'reb'
]
home_games['is_home'] = 1

# Away games
away_games = games[[
    'GAME_DATE_EST', 'SEASON', 'GAME_ID', 'VISITOR_TEAM_ID',
    'PTS_away', 'FG_PCT_away', 'FT_PCT_away', 'FG3_PCT_away',
    'AST_away', 'REB_away'
]].copy()

away_games.columns = [
    'date', 'season', 'game_id', 'team_id',
    'pts', 'fg_pct', 'ft_pct', 'fg3_pct', 'ast', 'reb'
]
away_games['is_home'] = 0

team_games = pd.concat([home_games, away_games], ignore_index=True)
team_games = team_games.sort_values(['team_id', 'date']).reset_index(drop=True)

print(f"\nTeam-game records: {len(team_games):,}")



Team-game records: 53,046


In [None]:
WINDOW = 10  # Last N games

stat_cols = ['pts', 'fg_pct', 'ft_pct', 'fg3_pct', 'ast', 'reb']
print(f"\nCalculating {WINDOW}-game rolling averages...")
for col in stat_cols:
    # CRITICAL: shift(1) ensures we only use PAST games
    team_games[f'{col}_avg_{WINDOW}'] = team_games.groupby('team_id')[col].transform(
        lambda x: x.rolling(window=WINDOW, min_periods=1).mean().shift(1)
    )

print("✓ Rolling averages calculated")



Calculating 20-game rolling averages...
✓ Rolling averages calculated


In [124]:
# ============================================
# Calculate Rest Days (WITHIN SEASON ONLY)
# ============================================

print("\nCalculating rest days (within season)...")

# Sort by season, team, and date
team_games = team_games.sort_values(['season', 'team_id', 'date'])

# Calculate days since previous game (WITHIN EACH SEASON)
team_games['rest_days'] = team_games.groupby(['season', 'team_id'])['date'].diff().dt.days

# Cap at 7 days (anything longer is an aberration/offseason)
team_games['rest_days'] = team_games['rest_days'].clip(upper=7)

# Fill NaN (first game of season) with 3 days
team_games['rest_days'] = team_games['rest_days'].fillna(3)

# Create back-to-back indicator
team_games['back_to_back'] = (team_games['rest_days'] == 1).astype(int)

print(f"✓ Rest days calculated")
print(f"  Mean: {team_games['rest_days'].mean():.2f}")
print(f"  Std:  {team_games['rest_days'].std():.2f}")
print(f"  Max:  {team_games['rest_days'].max():.0f}")
print(f"  Back-to-backs: {team_games['back_to_back'].sum()} ({team_games['back_to_back'].mean()*100:.1f}%)")

# Split into home and away
rest_cols = ['rest_days', 'back_to_back']

home_rest = team_games[team_games['is_home'] == 1][
    ['game_id'] + rest_cols
].copy()
home_rest.columns = ['GAME_ID'] + [f'{col}_home' for col in rest_cols]

away_rest = team_games[team_games['is_home'] == 0][
    ['game_id'] + rest_cols
].copy()  
away_rest.columns = ['GAME_ID'] + [f'{col}_away' for col in rest_cols]


Calculating rest days (within season)...
✓ Rest days calculated
  Mean: 2.19
  Std:  1.04
  Max:  7
  Back-to-backs: 10733 (20.2%)


In [125]:
avg_cols = [f'{col}_avg_{WINDOW}' for col in stat_cols]

home_avgs = team_games[team_games['is_home'] == 1][
    ['date', 'game_id', 'team_id'] + avg_cols
].copy()
home_avgs.columns = ['GAME_DATE_EST', 'GAME_ID', 'HOME_TEAM_ID'] + \
                    [f'{col}_home' for col in avg_cols]

# Away team averages
away_avgs = team_games[team_games['is_home'] == 0][
    ['date', 'game_id', 'team_id'] + avg_cols
].copy()
away_avgs.columns = ['GAME_DATE_EST', 'GAME_ID', 'VISITOR_TEAM_ID'] + \
                    [f'{col}_away' for col in avg_cols]
                    
                    
games_with_features = games.merge(
    home_avgs,
    on=['GAME_DATE_EST', 'GAME_ID', 'HOME_TEAM_ID'],
    how='left'
)

games_with_features = games_with_features.merge(
    away_avgs,
    on=['GAME_DATE_EST', 'GAME_ID', 'VISITOR_TEAM_ID'],
    how='left'
)

games_with_features = games_with_features.merge(
    home_rest,
    on=['GAME_ID'],
    how='left'
)

games_with_features = games_with_features.merge(
    away_rest,
    on=['GAME_ID'],
    how='left'
)

                   
print(f"\n✓ Features merged back to games")



✓ Features merged back to games


In [126]:
league_home_win_rate = games['target'].mean()
games_with_features['league_home_advantage'] = league_home_win_rate

print(f"✓ Added league home advantage: {league_home_win_rate:.3f}")

✓ Added league home advantage: 0.589


In [127]:
feature_cols = [f'{col}_home' for col in avg_cols] + \
               [f'{col}_away' for col in avg_cols] + \
               ['rest_days_home', 'rest_days_away', 'back_to_back_home', 'back_to_back_away', 'league_home_advantage']


# Metadata + features + target
final_cols = ['GAME_DATE_EST', 'GAME_ID', 'SEASON', 'HOME_TEAM_ID', 'VISITOR_TEAM_ID'] + \
             feature_cols + ['target']

final_data = games_with_features[final_cols].copy()

print(f"\nFeatures for prediction ({len(feature_cols)}):")
for f in feature_cols:
    print(f"  • {f}")


Features for prediction (17):
  • pts_avg_20_home
  • fg_pct_avg_20_home
  • ft_pct_avg_20_home
  • fg3_pct_avg_20_home
  • ast_avg_20_home
  • reb_avg_20_home
  • pts_avg_20_away
  • fg_pct_avg_20_away
  • ft_pct_avg_20_away
  • fg3_pct_avg_20_away
  • ast_avg_20_away
  • reb_avg_20_away
  • rest_days_home
  • rest_days_away
  • back_to_back_home
  • back_to_back_away
  • league_home_advantage


In [128]:
print(f"\nBefore dropping NaN: {len(final_data):,} games")
final_data = final_data.dropna(subset=feature_cols)
print(f"After dropping NaN: {len(final_data):,} games")
print(f"Lost: {len(games_with_features) - len(final_data):,} games (early season)")



Before dropping NaN: 26,523 games
After dropping NaN: 26,502 games
Lost: 21 games (early season)


In [129]:
print(f"\n{'='*60}")
print(f"FEATURE VALIDATION")
print(f"{'='*60}")

# Check that features look reasonable
print(f"\nSample feature values:")
print(final_data[feature_cols].describe().round(3))

# Check no data leakage - features should be different from actual game stats
sample_game = final_data.iloc[100]
print(f"\nExample: Game on {sample_game['GAME_DATE_EST'].date()}")
print(f"  Home team avg FG%: {sample_game['fg_pct_avg_10_home']:.3f} (from past games)")
print(f"  Away team avg FG%: {sample_game['fg_pct_avg_10_away']:.3f} (from past games)")



FEATURE VALIDATION

Sample feature values:
       pts_avg_20_home  fg_pct_avg_20_home  ft_pct_avg_20_home  \
count        26502.000           26502.000           26502.000   
mean           101.869               0.455               0.759   
std              7.454               0.020               0.037   
min             58.000               0.324               0.538   
25%             96.400               0.442               0.736   
50%            101.150               0.455               0.761   
75%            107.150               0.468               0.785   
max            125.250               0.532               0.897   

       fg3_pct_avg_20_home  ast_avg_20_home  reb_avg_20_home  pts_avg_20_away  \
count            26502.000        26502.000        26502.000        26502.000   
mean                 0.353           22.119           42.713          101.949   
std                  0.030            2.516            2.551            7.485   
min                  0.127           

KeyError: 'fg_pct_avg_10_home'

In [None]:
output_file = 'data/processed/games_with_features.csv'
final_data.to_csv(output_file, index=False)

print(f"\n{'='*60}")
print(f"✓ FEATURE ENGINEERING COMPLETE")
print(f"{'='*60}")
print(f"Saved to: {output_file}")
print(f"Final shape: {final_data.shape}")
print(f"Features: {len(feature_cols)}")
print(f"Samples: {len(final_data):,}")
print(f"{'='*60}")
