In [2]:
import pandas as pd

data_path = 'data/raw'

teams = pd.read_csv(f'{data_path}/teams.csv')
games = pd.read_csv(f'{data_path}/games.csv')
games['target'] = (games['PTS_home'] > games['PTS_away']).astype(int)
print(f"\nTarget created. Home win rate: {games['target'].mean():.3f}")



Target created. Home win rate: 0.587


This creates our baseline. We can predict that the home team will win 58.7% of the time so anything we do must be better than that.

In [8]:
feature_columns = [
    # Home team
    'FG_PCT_home',
    'FT_PCT_home', 
    'FG3_PCT_home',
    'AST_home',
    'REB_home',
    
    # Away team
    'FG_PCT_away',
    'FT_PCT_away',
    'FG3_PCT_away', 
    'AST_away',
    'REB_away',
]

columns_to_keep = ['GAME_DATE_EST', 'SEASON', 'HOME_TEAM_ID', 'VISITOR_TEAM_ID'] + feature_columns + ['target']
clean_data = games[columns_to_keep].copy()
print(f"\nBefore cleaning: {len(clean_data)} games")
print(f"\nMissing Data:")
print(clean_data.isnull().sum())


Before cleaning: 26651 games

Missing Data:
GAME_DATE_EST       0
SEASON              0
HOME_TEAM_ID        0
VISITOR_TEAM_ID     0
FG_PCT_home        99
FT_PCT_home        99
FG3_PCT_home       99
AST_home           99
REB_home           99
FG_PCT_away        99
FT_PCT_away        99
FG3_PCT_away       99
AST_away           99
REB_away           99
target              0
dtype: int64


This shows the number of missing data per feature.

In [None]:
clean_data = clean_data.dropna(subset=columns_to_keep)
print(f"After cleaning: {len(clean_data)} games")
print(f"Dropped: {len(games) - len(clean_data)} games")


After cleaning: 26552 games
Dropped: 99 games


In [11]:
if 'GAME_DATE_EST' in clean_data.columns:
    clean_data = clean_data.sort_values('GAME_DATE_EST').reset_index(drop=True)
    print("\n✓ Data sorted chronologically")



✓ Data sorted chronologically


In [12]:
import os
os.makedirs('data/processed', exist_ok=True)

output_file = 'data/processed/games_basic.csv'
clean_data.to_csv(output_file, index=False)

print(f"\n{'='*60}")
print(f"✓ PREPROCESSING COMPLETE")
print(f"{'='*60}")
print(f"Saved to: {output_file}")
print(f"Final shape: {clean_data.shape}")
print(f"Features: {len(columns_to_keep)}")
print(f"Samples: {len(clean_data)}")
print(f"{'='*60}")


✓ PREPROCESSING COMPLETE
Saved to: data/processed/games_basic.csv
Final shape: (26552, 15)
Features: 15
Samples: 26552
