In [1]:
# Import pandas library for data manipulation
import pandas as pd
import numpy as np
import os

In [2]:
# Load NBA games dataset from CSV file
df = pd.read_csv("nba_games.csv", index_col=0)

In [3]:
# Sort games by date to ensure chronological order for time series analysis
df = df.sort_values("date")

In [4]:
# Reset index after sorting to have clean sequential indices
df = df.reset_index(drop=True)

In [5]:
# Remove duplicate columns that aren't needed for analysis
del df["mp.1"]
del df["mp_opp.1"]
del df["index_opp"]

In [6]:
# Create target variable: whether the team won their NEXT game
# This shifts the "won" column by -1 for each team, so we're predicting future outcomes
def add_target(group):
    group["target"] = group["won"].shift(-1)
    return group

df = df.groupby("team", group_keys=False).apply(add_target)

  group["target"] = group["won"].shift(-1)
  group["target"] = group["won"].shift(-1)
  group["target"] = group["won"].shift(-1)
  group["target"] = group["won"].shift(-1)
  group["target"] = group["won"].shift(-1)
  group["target"] = group["won"].shift(-1)
  group["target"] = group["won"].shift(-1)
  group["target"] = group["won"].shift(-1)
  group["target"] = group["won"].shift(-1)
  group["target"] = group["won"].shift(-1)
  group["target"] = group["won"].shift(-1)
  group["target"] = group["won"].shift(-1)
  group["target"] = group["won"].shift(-1)
  group["target"] = group["won"].shift(-1)
  group["target"] = group["won"].shift(-1)
  group["target"] = group["won"].shift(-1)
  group["target"] = group["won"].shift(-1)
  group["target"] = group["won"].shift(-1)
  group["target"] = group["won"].shift(-1)
  group["target"] = group["won"].shift(-1)
  group["target"] = group["won"].shift(-1)
  group["target"] = group["won"].shift(-1)
  group["target"] = group["won"].shift(-1)
  group["ta

In [7]:
# Handle missing target values (last game of each team's season)
# Replace NaN values with 2 and convert target to integer type
df.loc[pd.isnull(df["target"]), "target"] = 2
df["target"] = df["target"].astype(int, errors="ignore")

In [8]:
# Find columns with null/missing values
nulls = pd.isnull(df).sum()
nulls = nulls[nulls > 0]

In [9]:
# Get list of valid columns (those without any null values)
valid_columns = df.columns[~df.columns.isin(nulls.index)]

In [10]:
# Keep only valid columns (remove columns with missing values)
df = df[valid_columns].copy()

In [11]:
#FEATURE ENGINEERING

In [12]:
# Define which columns to exclude from model features
# Remove metadata and target-related columns
removed_columns = ["season", "date", "won", "target", "team", "team_opp"]
selected_columns = df.columns[~df.columns.isin(removed_columns)]

In [13]:
#MOMENTUM & QUALITY FEATURES

df =  df.sort_values(["team", "date"])

# rest days and back to backs
df['date_dt'] = pd.to_datetime(df['date'], format='mixed')
df['days_rest'] = df.groupby('team')['date_dt'].diff().dt.days.fillna(3)
df['back_to_back'] = (df["days_rest"] ==1).astype(int)
df["b2b_away"] = ((df['back_to_back'] == 1) & (df['home'] == 0)).astype(int)

# season record (culmulative)
df['season_wins'] = df.groupby(['team', 'season'])['won'].cumsum()
df['season_games'] = df.groupby(['team', 'season']).cumcount() + 1
df['season_win_pct'] = df['season_wins'] / df['season_games']

#opponent win percentage (use map)
opp_lookup = df.set_index(['team', df.index])['season_win_pct']
df['opp_win_pct'] = df.apply(
    lambda row: df.loc[
        (df['team'] == row['team_opp']) & (df['season'] == row['season']) & (df.index <= row.name),
        'season_win_pct'
    ].iloc[-1] if len(df.loc[
        (df['team'] == row['team_opp']) & (df['season'] == row['season']) & (df.index <= row.name)
    ]) > 0 else 0.5,
    axis=1
)
df['win_pct_diff'] = df['season_win_pct'] - df['opp_win_pct']
#head to head record
df = df.sort_values(['team', 'team_opp', 'date'])
df['h2h_wins'] = df.groupby(['team', 'team_opp'])['won'].cumsum()
df['h2h_games'] = df.groupby(['team', 'team_opp']).cumcount() + 1
df['h2h_win_pct'] = df['h2h_wins'] / df['h2h_games']

# win/loss streaks

df = df.sort_values(["team", "date"])
def calc_streak(won_series):
    streak = []
    current = 0
    for w in won_series:
        if w == 1 or w == True:
            current = max(0, current) + 1
        elif w == 0 or w == False:
            current = min(0, current) - 1
        streak.append(current)
    return streak


df = df.sort_values(['team', 'date']).reset_index(drop=True)
streaks = []
for team in df['team'].unique():
    team_mask = df['team'] == team
    team_won = df.loc[team_mask, 'won'].tolist()
    streaks.extend(calc_streak(team_won))

df['streak'] = streaks
df['win_streak'] = df['streak'].clip(lower=0)
df['losing_streak'] = (-df['streak']).clip(lower=0)
df['hot_streak'] = (df['win_streak'] > 3).astype(int)

In [14]:
# Create new dataframe with only the features needed for rolling averages
df_rolling = df[list(selected_columns) + ["won", "team", "season"]].copy()

In [15]:
# Calculate rolling averages for each team over their last 10 games
# This captures recent team performance trends
def find_team_averages(team):
    # Only calculate rolling for numeric columns
    numeric_cols = team[selected_columns].select_dtypes(include=['number']).columns
    rolling = team[numeric_cols].rolling(10).mean()
    return rolling

df_rolling = df_rolling.groupby(["team", "season"], group_keys=False).apply(find_team_averages)

  df_rolling = df_rolling.groupby(["team", "season"], group_keys=False).apply(find_team_averages)


In [16]:
# Create EWM features dataframe
df_ewm = df[list(selected_columns) + ["won", "team", "season"]].copy()

In [17]:
# Calculate exponentially weighted moving averages
# Recent games are weighted MORE heavily than older games
def find_team_ewm(team):
    # Only calculate EWM for numeric columns
    numeric_cols = team[selected_columns].select_dtypes(include=['number']).columns
    ewm = team[numeric_cols].ewm(span=10, adjust=False).mean()
    return ewm

df_ewm = df_ewm.groupby(["team", "season"], group_keys=False).apply(find_team_ewm)

  df_ewm = df_ewm.groupby(["team", "season"], group_keys=False).apply(find_team_ewm)


In [18]:
# Rename EWM columns with _ewm suffix
ewm_cols = [f"{col}_ewm" for col in df_ewm.columns]
df_ewm.columns = ewm_cols

In [19]:
# Concatenate rolling and EWM features to main dataframe
df = pd.concat([df, df_rolling, df_ewm], axis=1)

# Remove duplicate columns created by concat
df = df.loc[:, ~df.columns.duplicated()]

In [20]:
df = df.dropna()

In [21]:
# Sort by team and date
df = df.sort_values(["team", "date"])

# Add next game columns using simple groupby shift
df["home_next"] = df.groupby("team")["home"].shift(-1)
df["team_opp_next"] = df.groupby("team")["team_opp"].shift(-1)
df["date_next"] = df.groupby("team")["date"].shift(-1)

In [22]:
# Get actual column names for rolling and EWM features
rolling_cols = [col for col in df.columns if col.endswith('_10')]
ewm_cols = [col for col in df.columns if col.endswith('_ewm')]

In [23]:
df = df.drop(columns=['date_dt'])

In [24]:
# Merge to create full dataset with both team's and opponent's features
full = df.merge(
  df[rolling_cols + ewm_cols + ["team_opp_next", "date_next", "team"]], 
  left_on=["team", "date_next"], 
  right_on=["team_opp_next", "date_next"]
)

In [25]:
#FEATURE SELECTION

In [26]:
# Define columns to remove (metadata and text columns)
removed_columns = ["season", "date", "won", "target", "team", "team_opp"]
removed_columns = list(full.columns[full.dtypes == "object"]) + removed_columns

In [27]:
# Get numeric feature columns only
selected_columns = full.columns[~full.columns.isin(removed_columns)]

In [28]:
# Use SelectKBest for fast feature selection (takes seconds instead of hours)
from sklearn.feature_selection import SelectKBest, f_classif

selector = SelectKBest(f_classif, k=50)
selector.fit(full[selected_columns], full["target"])

  f = msb / msw


In [29]:
# Get the selected features
predictors = list(selected_columns[selector.get_support()])

In [30]:
# MODEL TRAINING & EVALUATION

In [31]:

# Trains on past seasons and predicts future seasons
def backtest(data, model, predictors, start=2, step=1):
    all_predictions = []
    seasons = sorted(data["season"].unique())
    
    # loop through seasons, train on past data, test on current season
    for i in range(start, len(seasons), step):
        season = seasons[i]
        train = data[data["season"] < season]
        test = data[data["season"] == season]
        
        selector = SelectKBest(f_classif, k=50)
        selector.fit(train[predictors], train["target"])
        selected = list(np.array(predictors)[selector.get_support()])

        model.fit(train[selected], train["target"])
        preds = model.predict(test[selected])
        preds = pd.Series(preds, index=test.index)
        combined = pd.concat([test["target"], preds], axis=1)
        combined.columns = ["actual", "prediction"]
        
        all_predictions.append(combined)
    return pd.concat(all_predictions)

In [32]:
# Initialize Ridge Classifier
from sklearn.linear_model import RidgeClassifier
from sklearn.metrics import accuracy_score

rr = RidgeClassifier(alpha=1)

In [33]:
# Run backtest with Ridge Classifier
# Pass ALL candidate features (selected_columns), not pre-filtered predictors
all_features = list(selected_columns)
predictions = backtest(full, rr, all_features)

  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw


In [34]:
# Calculate Ridge Classifier accuracy
ridge_accuracy = accuracy_score(predictions["actual"], predictions["prediction"])
print(f"Ridge Classifier Accuracy: {ridge_accuracy:.4f} ({ridge_accuracy * 100:.2f}%)")


Ridge Classifier Accuracy: 0.6602 (66.02%)


In [35]:
# Initialize XGBoost Classifier with tuned hyperparameters
from xgboost import XGBClassifier

xgb = XGBClassifier(
    n_estimators=200,
    max_depth=4,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    min_child_weight=3,
    random_state=42,
    eval_metric="logloss"
)

In [36]:
# Run backtest with XGBoost
xgb_predictions = backtest(full, xgb, all_features)

  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw


In [37]:
# Calculate XGBoost accuracy and compare
xgb_accuracy = accuracy_score(xgb_predictions["actual"], xgb_predictions["prediction"])
print(f"Ridge Classifier Accuracy: {ridge_accuracy:.4f} ({ridge_accuracy * 100:.2f}%)")
print(f"XGBoost Accuracy:          {xgb_accuracy:.4f} ({xgb_accuracy * 100:.2f}%)")
print(f"Difference:                {(xgb_accuracy - ridge_accuracy) * 100:+.2f}%")

Ridge Classifier Accuracy: 0.6602 (66.02%)
XGBoost Accuracy:          0.6528 (65.28%)
Difference:                -0.73%


In [38]:
TEAM_MAP = {
    'Atlanta Hawks': 'ATL', 'Boston Celtics': 'BOS', 'Brooklyn Nets': 'BRK',
    'Charlotte Hornets': 'CHO', 'Chicago Bulls': 'CHI', 'Cleveland Cavaliers': 'CLE',
    'Dallas Mavericks': 'DAL', 'Denver Nuggets': 'DEN', 'Detroit Pistons': 'DET',
    'Golden State Warriors': 'GSW', 'Houston Rockets': 'HOU', 'Indiana Pacers': 'IND',
    'Los Angeles Clippers': 'LAC', 'Los Angeles Lakers': 'LAL', 'Memphis Grizzlies': 'MEM',
    'Miami Heat': 'MIA', 'Milwaukee Bucks': 'MIL', 'Minnesota Timberwolves': 'MIN',
    'New Orleans Pelicans': 'NOP', 'New York Knicks': 'NYK', 'Oklahoma City Thunder': 'OKC',
    'Orlando Magic': 'ORL', 'Philadelphia 76ers': 'PHI', 'Phoenix Suns': 'PHO',
    'Portland Trail Blazers': 'POR', 'Sacramento Kings': 'SAC', 'San Antonio Spurs': 'SAS',
    'Toronto Raptors': 'TOR', 'Utah Jazz': 'UTA', 'Washington Wizards': 'WAS'
  }
  

In [39]:
def archive_completed_predictions():
    if not os.path.exists('data/predictions.csv'):
        return
    
    predictions = pd.read_csv('data/predictions.csv')
    
    # Separate completed and upcoming
    completed = predictions[predictions['result'] != 'not_played']
    upcoming = predictions[predictions['result'] == 'not_played']
    
    if len(completed) > 0:
        # Append completed to history
        if os.path.exists('data/prediction_history.csv'):
            history = pd.read_csv('data/prediction_history.csv')
            history = pd.concat([history, completed], ignore_index=True)
            # Remove duplicates based on date, home, visitor
            history = history.drop_duplicates(subset=['date', 'home', 'visitor'], keep='last')
        else:
            history = completed
        
        history.to_csv('data/prediction_history.csv', index=False)
        print(f"Archived {len(completed)} completed predictions to history")
    
    # Keep only upcoming in predictions.csv
    upcoming.to_csv('data/predictions.csv', index=False)
    return upcoming


def generate_predictions(model, df, predictors, upcoming_file, team=None, n_games=3):

    # first, archive any completed predictions
    archive_completed_predictions()
    
    # load upcoming games
    upcoming = pd.read_csv(upcoming_file)
    upcoming['date'] = pd.to_datetime(upcoming['date'], format='mixed')
    
    # map team names to abbreviations
    upcoming['home_abbrev'] = upcoming['home'].map(TEAM_MAP)
    upcoming['visitor_abbrev'] = upcoming['visitor'].map(TEAM_MAP)
    
    # filter for specific team if requested
    if team:
        upcoming = upcoming[
            (upcoming['home_abbrev'] == team) | 
            (upcoming['visitor_abbrev'] == team)
        ]
    
    # sort by date and get next n_games per team
    upcoming = upcoming.sort_values('date')
    
    if team:
        upcoming = upcoming.head(n_games)
    else:
        # get next n_games for each team
        games_list = []
        for t in df['team'].unique():
            team_games = upcoming[
                (upcoming['home_abbrev'] == t) | 
                (upcoming['visitor_abbrev'] == t)
            ].head(n_games)
            games_list.append(team_games)
        upcoming = pd.concat(games_list).drop_duplicates(subset=['date', 'home', 'visitor'])
    
    # load existing predictions to check what's already predicted
    existing_keys = set()
    if os.path.exists('data/predictions.csv'):
        existing_predictions = pd.read_csv('data/predictions.csv')
        if len(existing_predictions) > 0:
            existing_predictions['date'] = pd.to_datetime(existing_predictions['date']).dt.strftime('%Y-%m-%d')
            existing_keys = set(zip(existing_predictions['date'], existing_predictions['home'], existing_predictions['visitor']))
    
    # get rolling/EWM columns
    rolling_cols = [col for col in df.columns if col.endswith('_10')]
    ewm_cols = [col for col in df.columns if col.endswith('_ewm')]
    
    # get most recent stats for each team
    df_sorted = df.sort_values(['team', 'date'])
    latest_stats = df_sorted.groupby('team').last().reset_index()
    
    # separate predictors into home (_x or no suffix) and visitor (_y) features
    home_predictors = [p for p in predictors if not p.endswith('_y')]
    visitor_predictors = [p for p in predictors if p.endswith('_y')]
    
    predictions_list = []
    skipped = 0
    
    for _, game in upcoming.iterrows():
        home = game['home_abbrev']
        visitor = game['visitor_abbrev']
        game_date = game['date'].strftime('%Y-%m-%d')
        
        # skip if already predicted
        if (game_date, game['home'], game['visitor']) in existing_keys:
            skipped += 1
            continue
        
        # get team stats
        home_stats = latest_stats[latest_stats['team'] == home]
        visitor_stats = latest_stats[latest_stats['team'] == visitor]
        
        if len(home_stats) == 0 or len(visitor_stats) == 0:
            print(f"Skipping {visitor} @ {home} - missing team data")
            continue
        
        # build feature row
        feature_row = {}
        
        # match predictor columns to home team's stats
        for col in home_predictors:
            # try exact match first (e.g. 'home', 'season_win_pct')
            if col in home_stats.columns:
                feature_row[col] = home_stats[col].values[0]
            # try without _x suffix (from merge)
            elif col.endswith('_x') and col[:-2] in home_stats.columns:
                feature_row[col] = home_stats[col[:-2]].values[0]
        
        # _y columns map to visitor's base stats
        for col in visitor_predictors:
            # strip _y suffix to find the base column name in visitor stats
            base_col = col[:-2]  # remove '_y'
            if base_col in visitor_stats.columns:
                feature_row[col] = visitor_stats[base_col].values[0]
        
        pre_df = pd.DataFrame([feature_row])
        
        # fill any remaining missing predictors with 0
        for col in predictors:
            if col not in pre_df.columns:
                pre_df[col] = 0
        
        pre_df = pre_df[predictors]
        
        pred = model.predict(pre_df)[0]
        
        # calculate confidence 
        if hasattr(model, 'decision_function'):
            raw_score = abs(model.decision_function(pre_df)[0])
            confidence = 1 / (1 + np.exp(-raw_score))  # maps to 0.5-1.0
        else:
            confidence = None
        
        predictions_list.append({
            'date': game_date,
            'home': game['home'],
            'home_abbrev': home,
            'visitor': game['visitor'],
            'visitor_abbrev': visitor,
            'predicted_winner': game['home'] if pred == 1 else game['visitor'],
            'predicted_winner_abbrev': home if pred == 1 else visitor,
            'confidence': confidence,
            'result': 'not_played',
            'actual_winner': ''
        })
    
    # Combine with existing upcoming predictions
    new_predictions = pd.DataFrame(predictions_list)
    
    if os.path.exists('data/predictions.csv'):
        existing = pd.read_csv('data/predictions.csv')
        if len(existing) > 0:
            predictions_df = pd.concat([existing, new_predictions], ignore_index=True)
        else:
            predictions_df = new_predictions
    else:
        predictions_df = new_predictions
    
    # Remove duplicates and sort
    predictions_df = predictions_df.drop_duplicates(subset=['date', 'home', 'visitor'], keep='first')
    predictions_df = predictions_df.sort_values('date')
    
    # Save only upcoming predictions
    predictions_df.to_csv('data/predictions.csv', index=False)
    print(f"Added {len(predictions_list)} new predictions (skipped {skipped} existing)")
    print(f"Total upcoming predictions: {len(predictions_df)}")
    
    return predictions_df

In [40]:

#train the model on all data first
rr.fit(full[predictors], full["target"])

#generate predictions for all teams (next 3 games each)
all_predictions = generate_predictions(rr, df, predictors, 'data/upcoming_games_2026.csv', n_games=3)

# Display predictions nicely
print("=" * 70)
print("UPCOMING GAME PREDICTIONS")
print("=" * 70)
for _, row in all_predictions.iterrows():
    print(f"{row['date']}  {row['visitor_abbrev']} @ {row['home_abbrev']}  -->  {row['predicted_winner_abbrev']} wins")
print("=" * 70)
print(f"Total predictions: {len(all_predictions)}")

Archived 44 completed predictions to history
Added 43 new predictions (skipped 4 existing)
Total upcoming predictions: 47
UPCOMING GAME PREDICTIONS
2026-02-19  BOS @ GSW  -->  BOS wins
2026-02-19  TOR @ CHI  -->  TOR wins
2026-02-19  DET @ NYK  -->  DET wins
2026-02-19  ATL @ PHI  -->  ATL wins
2026-02-19  IND @ WAS  -->  IND wins
2026-02-19  ORL @ SAC  -->  SAC wins
2026-02-19  BRK @ CLE  -->  CLE wins
2026-02-19  DEN @ LAC  -->  LAC wins
2026-02-19  PHO @ SAS  -->  SAS wins
2026-02-19  HOU @ CHO  -->  HOU wins
2026-02-20  UTA @ MEM  -->  MEM wins
2026-02-20  LAC @ LAL  -->  LAL wins
2026-02-20  IND @ WAS  -->  IND wins
2026-02-20  DEN @ POR  -->  POR wins
2026-02-20  DAL @ MIN  -->  DAL wins
2026-02-20  MIL @ NOP  -->  MIL wins
2026-02-20  BRK @ OKC  -->  OKC wins
2026-02-20  MIA @ ATL  -->  ATL wins
2026-02-20  CLE @ CHO  -->  CLE wins
2026-02-21  HOU @ NYK  -->  HOU wins
2026-02-21  MEM @ MIA  -->  MEM wins
2026-02-21  PHI @ NOP  -->  PHI wins
2026-02-21  DET @ CHI  -->  DET wins
2