In [8]:
# Import pandas library for data manipulation
import pandas as pd

In [9]:
# Load NBA games dataset from CSV file
df = pd.read_csv("nba_games.csv", index_col=0)

In [10]:
# Sort games by date to ensure chronological order for time series analysis
df = df.sort_values("date")

In [11]:
# Reset index after sorting to have clean sequential indices
df = df.reset_index(drop=True)

In [12]:
# Remove duplicate columns that aren't needed for analysis
del df["mp.1"]
del df["mp_opp.1"]
del df["index_opp"]

In [None]:
# Create target variable: whether the team won their NEXT game
# This shifts the "won" column by -1 for each team, so we're predicting future outcomes
def add_target(group):
    group["target"] = group["won"].shift(-1)
    return group

df = df.groupby("team", group_keys=False).apply(add_target)

In [14]:
# Handle missing target values (last game of each team's season)
# Replace NaN values with 2 and convert target to integer type
df.loc[pd.isnull(df["target"]), "target"] = 2
df["target"] = df["target"].astype(int, errors="ignore")

In [15]:
# Find columns with null/missing values
nulls = pd.isnull(df).sum()
nulls = nulls[nulls > 0]

In [16]:
# Get list of valid columns (those without any null values)
valid_columns = df.columns[~df.columns.isin(nulls.index)]

In [17]:
# Keep only valid columns (remove columns with missing values)
df = df[valid_columns].copy()

In [None]:
#FEATURE ENGINEERING

In [18]:
# Define which columns to exclude from model features
# Remove metadata and target-related columns
removed_columns = ["season", "date", "won", "target", "team", "team_opp"]
selected_columns = df.columns[~df.columns.isin(removed_columns)]

In [20]:
# Create new dataframe with only the features needed for rolling averages
df_rolling = df[list(selected_columns) + ["won", "team", "season"]].copy()

In [21]:
# Calculate rolling averages for each team over their last 10 games
# This captures recent team performance trends
def find_team_averages(team):
    # Only calculate rolling for numeric columns
    numeric_cols = team[selected_columns].select_dtypes(include=['number']).columns
    rolling = team[numeric_cols].rolling(10).mean()
    return rolling

df_rolling = df_rolling.groupby(["team", "season"], group_keys=False).apply(find_team_averages)

  df_rolling = df_rolling.groupby(["team", "season"], group_keys=False).apply(find_team_averages)


In [25]:
# Create EWM features dataframe
df_ewm = df[list(selected_columns) + ["won", "team", "season"]].copy()

In [26]:
# Calculate exponentially weighted moving averages
# Recent games are weighted MORE heavily than older games
def find_team_ewm(team):
    # Only calculate EWM for numeric columns
    numeric_cols = team[selected_columns].select_dtypes(include=['number']).columns
    ewm = team[numeric_cols].ewm(span=10, adjust=False).mean()
    return ewm

df_ewm = df_ewm.groupby(["team", "season"], group_keys=False).apply(find_team_ewm)

  df_ewm = df_ewm.groupby(["team", "season"], group_keys=False).apply(find_team_ewm)


In [27]:
# Rename EWM columns with _ewm suffix
ewm_cols = [f"{col}_ewm" for col in df_ewm.columns]
df_ewm.columns = ewm_cols

In [28]:
# Concatenate rolling and EWM features to main dataframe
df = pd.concat([df, df_rolling, df_ewm], axis=1)

In [29]:
df = df.dropna()

In [31]:
def shift_col(team, col_name):
    next_col = team[col_name].shift(-1)
    return next_col

def add_col(df, col_name):
    return df.groupby("team", group_keys=False).apply(lambda x: shift_col(x, col_name))

df["home_next"] = add_col(df, "home")
df["team_opp_next"] = add_col(df, "team_opp")
df["date_next"] = add_col(df, "date")

  return df.groupby("team", group_keys=False).apply(lambda x: shift_col(x, col_name))
  return df.groupby("team", group_keys=False).apply(lambda x: shift_col(x, col_name))
  return df.groupby("team", group_keys=False).apply(lambda x: shift_col(x, col_name))


In [32]:
# Get actual column names for rolling and EWM features
rolling_cols = [col for col in df.columns if col.endswith('_10')]
ewm_cols = [col for col in df.columns if col.endswith('_ewm')]

In [33]:
# Merge to create full dataset with both team's and opponent's features
full = df.merge(
  df[rolling_cols + ewm_cols + ["team_opp_next", "date_next", "team"]], 
  left_on=["team", "date_next"], 
  right_on=["team_opp_next", "date_next"]
)

In [None]:
#FEATURE SELECTION

In [34]:
# Define columns to remove (metadata and text columns)
removed_columns = ["season", "date", "won", "target", "team", "team_opp"]
removed_columns = list(full.columns[full.dtypes == "object"]) + removed_columns

In [35]:
# Get numeric feature columns only
selected_columns = full.columns[~full.columns.isin(removed_columns)]

In [36]:
# Use SelectKBest for fast feature selection (takes seconds instead of hours)
from sklearn.feature_selection import SelectKBest, f_classif

selector = SelectKBest(f_classif, k=50)
selector.fit(full[selected_columns], full["target"])

  f = msb / msw


In [37]:
# Get the selected features
predictors = list(selected_columns[selector.get_support()])
print(f"Selected {len(predictors)} features")

Selected 50 features


In [None]:
# MODEL TRAINING & EVALUATION

In [38]:
# Backtesting function: simulates real-world predictions over time
# Trains on past seasons and predicts future seasons
def backtest(data, model, predictors, start=2, step=1):
    all_predictions = []
    
    seasons = sorted(data["season"].unique())
    
    # Loop through seasons, train on past data, test on current season
    for i in range(start, len(seasons), step):
        season = seasons[i]
        train = data[data["season"] < season]
        test = data[data["season"] == season]
        
        model.fit(train[predictors], train["target"])
        
        preds = model.predict(test[predictors])
        preds = pd.Series(preds, index=test.index)
        combined = pd.concat([test["target"], preds], axis=1)
        combined.columns = ["actual", "prediction"]
        
        all_predictions.append(combined)
    return pd.concat(all_predictions)

In [39]:
# Initialize Ridge Classifier
from sklearn.linear_model import RidgeClassifier
from sklearn.metrics import accuracy_score

rr = RidgeClassifier(alpha=1)

In [40]:
# Run backtest with Ridge Classifier
predictions = backtest(full, rr, predictors)

In [41]:
# Calculate Ridge Classifier accuracy
ridge_accuracy = accuracy_score(predictions["actual"], predictions["prediction"])
print(f"Ridge Classifier Accuracy: {ridge_accuracy:.4f} ({ridge_accuracy * 100:.2f}%)")


Ridge Classifier Accuracy: 0.6325 (63.25%)


In [42]:
# Initialize Gradient Boosting Classifier
from sklearn.ensemble import GradientBoostingClassifier

gb = GradientBoostingClassifier(
    n_estimators=100,
    max_depth=5,
    learning_rate=0.1,
    random_state=42
)

In [43]:
# Run backtest with Gradient Boosting
gb_predictions = backtest(full, gb, predictors)

In [44]:
gb_accuracy = accuracy_score(gb_predictions["actual"], gb_predictions["prediction"])
print(f"Gradient Boosting Accuracy: {gb_accuracy:.4f} ({gb_accuracy * 100:.2f}%)")

Gradient Boosting Accuracy: 0.6112 (61.12%)


In [45]:
# Compare both models
print("=" * 50)
print("MODEL COMPARISON")
print("=" * 50)
print(f"Ridge Classifier:     {ridge_accuracy:.4f} ({ridge_accuracy * 100:.2f}%)")
print(f"Gradient Boosting:    {gb_accuracy:.4f} ({gb_accuracy * 100:.2f}%)")
print(f"Baseline (home team): ~56%")
print("=" * 50)

MODEL COMPARISON
Ridge Classifier:     0.6325 (63.25%)
Gradient Boosting:    0.6112 (61.12%)
Baseline (home team): ~56%
