In [2]:
# Import pandas library for data manipulation
import pandas as pd
import os

In [3]:
# Load NBA games dataset from CSV file
df = pd.read_csv("nba_games.csv", index_col=0)

In [4]:
# Sort games by date to ensure chronological order for time series analysis
df = df.sort_values("date")

In [5]:
# Reset index after sorting to have clean sequential indices
df = df.reset_index(drop=True)

In [6]:
# Remove duplicate columns that aren't needed for analysis
del df["mp.1"]
del df["mp_opp.1"]
del df["index_opp"]

In [7]:
# Create target variable: whether the team won their NEXT game
# This shifts the "won" column by -1 for each team, so we're predicting future outcomes
def add_target(group):
    group["target"] = group["won"].shift(-1)
    return group

df = df.groupby("team", group_keys=False).apply(add_target)

  group["target"] = group["won"].shift(-1)
  group["target"] = group["won"].shift(-1)
  group["target"] = group["won"].shift(-1)
  group["target"] = group["won"].shift(-1)
  group["target"] = group["won"].shift(-1)
  group["target"] = group["won"].shift(-1)
  group["target"] = group["won"].shift(-1)
  group["target"] = group["won"].shift(-1)
  group["target"] = group["won"].shift(-1)
  group["target"] = group["won"].shift(-1)
  group["target"] = group["won"].shift(-1)
  group["target"] = group["won"].shift(-1)
  group["target"] = group["won"].shift(-1)
  group["target"] = group["won"].shift(-1)
  group["target"] = group["won"].shift(-1)
  group["target"] = group["won"].shift(-1)
  group["target"] = group["won"].shift(-1)
  group["target"] = group["won"].shift(-1)
  group["target"] = group["won"].shift(-1)
  group["target"] = group["won"].shift(-1)
  group["target"] = group["won"].shift(-1)
  group["target"] = group["won"].shift(-1)
  group["target"] = group["won"].shift(-1)
  group["ta

In [8]:
# Handle missing target values (last game of each team's season)
# Replace NaN values with 2 and convert target to integer type
df.loc[pd.isnull(df["target"]), "target"] = 2
df["target"] = df["target"].astype(int, errors="ignore")

In [9]:
# Find columns with null/missing values
nulls = pd.isnull(df).sum()
nulls = nulls[nulls > 0]

In [10]:
# Get list of valid columns (those without any null values)
valid_columns = df.columns[~df.columns.isin(nulls.index)]

In [11]:
# Keep only valid columns (remove columns with missing values)
df = df[valid_columns].copy()

In [12]:
#FEATURE ENGINEERING

In [13]:
# Define which columns to exclude from model features
# Remove metadata and target-related columns
removed_columns = ["season", "date", "won", "target", "team", "team_opp"]
selected_columns = df.columns[~df.columns.isin(removed_columns)]

In [14]:
# Create new dataframe with only the features needed for rolling averages
df_rolling = df[list(selected_columns) + ["won", "team", "season"]].copy()

In [15]:
# Calculate rolling averages for each team over their last 10 games
# This captures recent team performance trends
def find_team_averages(team):
    # Only calculate rolling for numeric columns
    numeric_cols = team[selected_columns].select_dtypes(include=['number']).columns
    rolling = team[numeric_cols].rolling(10).mean()
    return rolling

df_rolling = df_rolling.groupby(["team", "season"], group_keys=False).apply(find_team_averages)

  df_rolling = df_rolling.groupby(["team", "season"], group_keys=False).apply(find_team_averages)


In [16]:
# Create EWM features dataframe
df_ewm = df[list(selected_columns) + ["won", "team", "season"]].copy()

In [17]:
# Calculate exponentially weighted moving averages
# Recent games are weighted MORE heavily than older games
def find_team_ewm(team):
    # Only calculate EWM for numeric columns
    numeric_cols = team[selected_columns].select_dtypes(include=['number']).columns
    ewm = team[numeric_cols].ewm(span=10, adjust=False).mean()
    return ewm

df_ewm = df_ewm.groupby(["team", "season"], group_keys=False).apply(find_team_ewm)

  df_ewm = df_ewm.groupby(["team", "season"], group_keys=False).apply(find_team_ewm)


In [18]:
# Rename EWM columns with _ewm suffix
ewm_cols = [f"{col}_ewm" for col in df_ewm.columns]
df_ewm.columns = ewm_cols

In [19]:
# Concatenate rolling and EWM features to main dataframe
df = pd.concat([df, df_rolling, df_ewm], axis=1)

# Remove duplicate columns created by concat
df = df.loc[:, ~df.columns.duplicated()]

In [20]:
df = df.dropna()

In [21]:
# Sort by team and date
df = df.sort_values(["team", "date"])

# Add next game columns using simple groupby shift
df["home_next"] = df.groupby("team")["home"].shift(-1)
df["team_opp_next"] = df.groupby("team")["team_opp"].shift(-1)
df["date_next"] = df.groupby("team")["date"].shift(-1)

In [22]:
# Get actual column names for rolling and EWM features
rolling_cols = [col for col in df.columns if col.endswith('_10')]
ewm_cols = [col for col in df.columns if col.endswith('_ewm')]

In [23]:
# Merge to create full dataset with both team's and opponent's features
full = df.merge(
  df[rolling_cols + ewm_cols + ["team_opp_next", "date_next", "team"]], 
  left_on=["team", "date_next"], 
  right_on=["team_opp_next", "date_next"]
)

In [24]:
#FEATURE SELECTION

In [25]:
# Define columns to remove (metadata and text columns)
removed_columns = ["season", "date", "won", "target", "team", "team_opp"]
removed_columns = list(full.columns[full.dtypes == "object"]) + removed_columns

In [26]:
# Get numeric feature columns only
selected_columns = full.columns[~full.columns.isin(removed_columns)]

In [27]:
# Use SelectKBest for fast feature selection (takes seconds instead of hours)
from sklearn.feature_selection import SelectKBest, f_classif

selector = SelectKBest(f_classif, k=50)
selector.fit(full[selected_columns], full["target"])

  f = msb / msw


In [28]:
# Get the selected features
predictors = list(selected_columns[selector.get_support()])
print(f"Selected {len(predictors)} features")

Selected 50 features


In [29]:
# MODEL TRAINING & EVALUATION

In [30]:
# Backtesting function: simulates real-world predictions over time
# Trains on past seasons and predicts future seasons
def backtest(data, model, predictors, start=2, step=1):
    all_predictions = []
    
    seasons = sorted(data["season"].unique())
    
    # Loop through seasons, train on past data, test on current season
    for i in range(start, len(seasons), step):
        season = seasons[i]
        train = data[data["season"] < season]
        test = data[data["season"] == season]
        
        model.fit(train[predictors], train["target"])
        
        preds = model.predict(test[predictors])
        preds = pd.Series(preds, index=test.index)
        combined = pd.concat([test["target"], preds], axis=1)
        combined.columns = ["actual", "prediction"]
        
        all_predictions.append(combined)
    return pd.concat(all_predictions)

In [31]:
# Initialize Ridge Classifier
from sklearn.linear_model import RidgeClassifier
from sklearn.metrics import accuracy_score

rr = RidgeClassifier(alpha=1)

In [32]:
# Run backtest with Ridge Classifier
predictions = backtest(full, rr, predictors)

In [33]:
# Calculate Ridge Classifier accuracy
ridge_accuracy = accuracy_score(predictions["actual"], predictions["prediction"])
print(f"Ridge Classifier Accuracy: {ridge_accuracy:.4f} ({ridge_accuracy * 100:.2f}%)")


Ridge Classifier Accuracy: 0.6560 (65.60%)


In [35]:
TEAM_MAP = {
    'Atlanta Hawks': 'ATL', 'Boston Celtics': 'BOS', 'Brooklyn Nets': 'BRK',
    'Charlotte Hornets': 'CHO', 'Chicago Bulls': 'CHI', 'Cleveland Cavaliers': 'CLE',
    'Dallas Mavericks': 'DAL', 'Denver Nuggets': 'DEN', 'Detroit Pistons': 'DET',
    'Golden State Warriors': 'GSW', 'Houston Rockets': 'HOU', 'Indiana Pacers': 'IND',
    'Los Angeles Clippers': 'LAC', 'Los Angeles Lakers': 'LAL', 'Memphis Grizzlies': 'MEM',
    'Miami Heat': 'MIA', 'Milwaukee Bucks': 'MIL', 'Minnesota Timberwolves': 'MIN',
    'New Orleans Pelicans': 'NOP', 'New York Knicks': 'NYK', 'Oklahoma City Thunder': 'OKC',
    'Orlando Magic': 'ORL', 'Philadelphia 76ers': 'PHI', 'Phoenix Suns': 'PHO',
    'Portland Trail Blazers': 'POR', 'Sacramento Kings': 'SAC', 'San Antonio Spurs': 'SAS',
    'Toronto Raptors': 'TOR', 'Utah Jazz': 'UTA', 'Washington Wizards': 'WAS'
  }
  

In [36]:
def generate_predictions(model, df, predictors, upcoming_file, team=None, n_games=3, append=True):
    """
    Generate predictions for upcoming games.
    Set append=True to add to existing predictions (default)
    Set append=False to overwrite predictions file
    """
    
    # Load upcoming games
    upcoming = pd.read_csv(upcoming_file)
    upcoming['date'] = pd.to_datetime(upcoming['date'], format='mixed')
    
    # Map team names to abbreviations
    upcoming['home_abbrev'] = upcoming['home'].map(TEAM_MAP)
    upcoming['visitor_abbrev'] = upcoming['visitor'].map(TEAM_MAP)
    
    # Filter for specific team if requested
    if team:
        upcoming = upcoming[
            (upcoming['home_abbrev'] == team) | 
            (upcoming['visitor_abbrev'] == team)
        ]
    
    # Sort by date and get next n_games per team
    upcoming = upcoming.sort_values('date')
    
    if team:
        upcoming = upcoming.head(n_games)
    else:
        # Get next n_games for each team
        games_list = []
        for t in df['team'].unique():
            team_games = upcoming[
                (upcoming['home_abbrev'] == t) | 
                (upcoming['visitor_abbrev'] == t)
            ].head(n_games)
            games_list.append(team_games)
        upcoming = pd.concat(games_list).drop_duplicates(subset=['date', 'home', 'visitor'])
    
    # Load existing predictions to avoid duplicates
    existing_predictions = None
    if append and os.path.exists('data/predictions.csv'):
        existing_predictions = pd.read_csv('data/predictions.csv')
        existing_predictions['date'] = pd.to_datetime(existing_predictions['date']).dt.strftime('%Y-%m-%d')
        existing_keys = set(zip(existing_predictions['date'], existing_predictions['home'], existing_predictions['visitor']))
    else:
        existing_keys = set()
    
    # Get rolling/EWM columns
    rolling_cols = [col for col in df.columns if col.endswith('_10')]
    ewm_cols = [col for col in df.columns if col.endswith('_ewm')]
    
    # Get most recent stats for each team
    df_sorted = df.sort_values(['team', 'date'])
    latest_stats = df_sorted.groupby('team').last().reset_index()
    
    predictions_list = []
    skipped = 0
    
    for _, game in upcoming.iterrows():
        home = game['home_abbrev']
        visitor = game['visitor_abbrev']
        game_date = game['date'].strftime('%Y-%m-%d')
        
        # Skip if already predicted
        if (game_date, game['home'], game['visitor']) in existing_keys:
            skipped += 1
            continue
        
        # Get team stats
        home_stats = latest_stats[latest_stats['team'] == home]
        visitor_stats = latest_stats[latest_stats['team'] == visitor]
        
        if len(home_stats) == 0 or len(visitor_stats) == 0:
            print(f"Skipping {visitor} @ {home} - missing team data")
            continue
        
        # Build feature row
        feature_row = {}
        
        for col in rolling_cols + ewm_cols:
            if col in home_stats.columns:
                feature_row[col] = home_stats[col].values[0]
        
        for col in rolling_cols + ewm_cols:
            col_y = f"{col}_y"
            if col in visitor_stats.columns and col_y in predictors:
                feature_row[col_y] = visitor_stats[col].values[0]
        
        feature_row['home'] = 1
        
        pre_df = pd.DataFrame([feature_row])
        
        for col in predictors:
            if col not in pre_df.columns:
                pre_df[col] = 0
        
        pre_df = pre_df[predictors]
        
        pred = model.predict(pre_df)[0]
        
        if hasattr(model, 'decision_function'):
            confidence = abs(model.decision_function(pre_df)[0])
        else:
            confidence = None
        
        predictions_list.append({
            'date': game_date,
            'home': game['home'],
            'home_abbrev': home,
            'visitor': game['visitor'],
            'visitor_abbrev': visitor,
            'predicted_winner': game['home'] if pred == 1 else game['visitor'],
            'predicted_winner_abbrev': home if pred == 1 else visitor,
            'confidence': confidence,
            'result': 'not_played'
        })
    
    # Combine with existing predictions
    new_predictions = pd.DataFrame(predictions_list)
    
    if append and existing_predictions is not None:
        predictions_df = pd.concat([existing_predictions, new_predictions], ignore_index=True)
    else:
        predictions_df = new_predictions
    
    # Save
    predictions_df.to_csv('data/predictions.csv', index=False)
    print(f"Added {len(predictions_list)} new predictions (skipped {skipped} existing)")
    print(f"Total predictions: {len(predictions_df)}")
    
    return predictions_df

In [37]:
#train the model on all data first
rr.fit(full[predictors], full["target"])

#generate predictions for all teams (next 3 games each)
all_predictions = generate_predictions(rr, df, predictors, 'data/upcoming_games_2026.csv', n_games=3)

# Display predictions nicely
print("=" * 70)
print("UPCOMING GAME PREDICTIONS")
print("=" * 70)
for _, row in all_predictions.iterrows():
    print(f"{row['date']}  {row['visitor_abbrev']} @ {row['home_abbrev']}  -->  {row['predicted_winner_abbrev']} wins")
print("=" * 70)
print(f"Total predictions: {len(all_predictions)}")





Added 47 new predictions (skipped 2 existing)
Total predictions: 50
UPCOMING GAME PREDICTIONS
2026-01-17  OKC @ MIA  -->  MIA wins
2026-01-19  OKC @ CLE  -->  CLE wins
2026-01-21  OKC @ MIL  -->  MIL wins
2026-01-19  MIL @ ATL  -->  ATL wins
2026-01-21  ATL @ MEM  -->  MEM wins
2026-01-23  PHO @ ATL  -->  ATL wins
2026-01-19  BOS @ DET  -->  DET wins
2026-01-21  IND @ BOS  -->  BOS wins
2026-01-23  BOS @ BRK  -->  BRK wins
2026-01-19  PHO @ BRK  -->  BRK wins
2026-01-21  BRK @ NYK  -->  NYK wins
2026-01-20  LAC @ CHI  -->  CHI wins
2026-01-22  CHI @ MIN  -->  MIN wins
2026-01-24  BOS @ CHI  -->  CHI wins
2026-01-21  CLE @ CHO  -->  CHO wins
2026-01-22  CHO @ ORL  -->  ORL wins
2026-01-24  WAS @ CHO  -->  CHO wins
2026-01-23  SAC @ CLE  -->  CLE wins
2026-01-19  DAL @ NYK  -->  NYK wins
2026-01-22  GSW @ DAL  -->  DAL wins
2026-01-24  LAL @ DAL  -->  DAL wins
2026-01-20  LAL @ DEN  -->  DEN wins
2026-01-22  DEN @ WAS  -->  WAS wins
2026-01-23  DEN @ MIL  -->  MIL wins
2026-01-21  DET @ 

In [38]:
# get predictions for a specific team
team_predictions = generate_predictions(rr, df, predictors, 'data/upcoming_games_2026.csv', team='TOR', n_games=3)
print(team_predictions)

Added 0 new predictions (skipped 3 existing)
Total predictions: 50
          date                    home home_abbrev                 visitor  \
0   2026-01-17              Miami Heat         MIA   Oklahoma City Thunder   
1   2026-01-19     Cleveland Cavaliers         CLE   Oklahoma City Thunder   
2   2026-01-21         Milwaukee Bucks         MIL   Oklahoma City Thunder   
3   2026-01-19           Atlanta Hawks         ATL         Milwaukee Bucks   
4   2026-01-21       Memphis Grizzlies         MEM           Atlanta Hawks   
5   2026-01-23           Atlanta Hawks         ATL            Phoenix Suns   
6   2026-01-19         Detroit Pistons         DET          Boston Celtics   
7   2026-01-21          Boston Celtics         BOS          Indiana Pacers   
8   2026-01-23           Brooklyn Nets         BRK          Boston Celtics   
9   2026-01-19           Brooklyn Nets         BRK            Phoenix Suns   
10  2026-01-21         New York Knicks         NYK           Brooklyn Nets 