In [36]:
# ------------------------
# Data Handling and Preparation with Days Since Last Game
# ------------------------
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import itertools
from collections import deque

# Read and sort data
full25season = pd.read_csv("/Users/tristanpoul/Desktop/Personal code/NBAMar25Season.csv")
full25season['Date_clean'] = pd.to_datetime(full25season['Date'], format='%a %b %d %Y', errors='coerce')
full25season = full25season.sort_values('Date_clean')
full25season = full25season[full25season['HPTS'].notna()]

# Create Season column (if month >= 10, season = year, else season = year - 1)
full25season['Season'] = full25season['Date_clean'].apply(lambda d: d.year if d.month >= 10 else d.year - 1)

# Initialize starting Elo ratings for first season (everyone at 1000)
teams = pd.unique(pd.concat([full25season['Home'], full25season['Visitor']]))
elo_vec = {team: 1000 for team in teams}

# Prepare Elo columns
for col in ['visitor_elo_pre', 'home_elo_pre', 'visitor_elo_post', 'home_elo_post']:
    full25season[col] = np.nan

# Prepare additional columns for season metrics
full25season['home_game_num'] = np.nan
full25season['visitor_game_num'] = np.nan
full25season['home_win_pct'] = np.nan
full25season['visitor_win_pct'] = np.nan
# NEW: Add columns for days since last game
full25season['home_days_since_last'] = np.nan
full25season['visitor_days_since_last'] = np.nan

# Track overall games played (for dynamic K), season-specific game counts, and season wins
games_played_vec = {team: 0 for team in teams}
season_game_count = {}  # key = (season, team)
season_wins = {}        # key = (season, team)

# Initialize dictionary to store the last game date for each team.
last_game_date = {team: None for team in teams}

# Initialize the current season based on the first game
current_season = full25season.iloc[0]['Season']

def get_dynamic_K(gp):
    if gp < 5:
        return 20
    elif gp < 10:
        return 15
    else:
        return 12

# Process games to update Elo ratings, track season game numbers, win percentages, and days since last game.
for i, game in full25season.iterrows():
    game_season = game['Season']
    current_date = game['Date_clean']
    
    # If season changes, reset Elo ratings and season-specific counts (but leave last_game_date to capture off-season gap if desired)
    if game_season != current_season:
        sorted_teams = sorted(elo_vec.keys(), key=lambda team: elo_vec[team], reverse=True)
        total_teams = len(sorted_teams)
        for rank, team in enumerate(sorted_teams, start=1):
            elo_vec[team] = 1000 + ((total_teams / 2) - rank) * 3
        season_game_count = {}
        season_wins = {}
        current_season = game_season

    # Define keys for season-specific tracking
    visitor_team = game['Visitor']
    home_team = game['Home']
    visitor_key = (game_season, visitor_team)
    home_key = (game_season, home_team)
    
    # Compute pre-game win percentage for each team (using previous games only)
    pre_visitor_games = season_game_count.get(visitor_key, 0)
    pre_home_games = season_game_count.get(home_key, 0)
    visitor_win_pct = season_wins.get(visitor_key, 0) / pre_visitor_games if pre_visitor_games > 0 else 0.5
    home_win_pct = season_wins.get(home_key, 0) / pre_home_games if pre_home_games > 0 else 0.5
    full25season.at[i, 'visitor_win_pct'] = visitor_win_pct
    full25season.at[i, 'home_win_pct'] = home_win_pct
    
    # Update season-specific game numbers
    visitor_game_num = pre_visitor_games + 1
    home_game_num = pre_home_games + 1
    season_game_count[visitor_key] = visitor_game_num
    season_game_count[home_key] = home_game_num
    full25season.at[i, 'visitor_game_num'] = visitor_game_num
    full25season.at[i, 'home_game_num'] = home_game_num
    
    # Compute days since last game for each team
    if last_game_date[home_team] is None:
        home_days_since = 7  # default value if no previous game (e.g., assume 7 days)
    else:
        home_days_since = (current_date - last_game_date[home_team]).days
        
    if last_game_date[visitor_team] is None:
        visitor_days_since = 7
    else:
        visitor_days_since = (current_date - last_game_date[visitor_team]).days
    
    full25season.at[i, 'home_days_since_last'] = home_days_since
    full25season.at[i, 'visitor_days_since_last'] = visitor_days_since
    
    # Update last game dates for both teams
    last_game_date[home_team] = current_date
    last_game_date[visitor_team] = current_date
    
    # Retrieve current Elo ratings
    visitor_elo_pre = elo_vec[visitor_team]
    home_elo_pre = elo_vec[home_team]
    full25season.at[i, 'visitor_elo_pre'] = visitor_elo_pre
    full25season.at[i, 'home_elo_pre'] = home_elo_pre
    
    # Determine dynamic K values (using overall games played)
    K_visitor = get_dynamic_K(games_played_vec[visitor_team])
    K_home = get_dynamic_K(games_played_vec[home_team])
    
    # Calculate expected win probabilities
    exp_visitor = 1 / (1 + 10 ** ((home_elo_pre - visitor_elo_pre) / 400))
    exp_home = 1 - exp_visitor
    
    # Determine actual outcomes (1 if win, 0 if loss)
    if game['VPTS'] > game['HPTS']:
        actual_visitor, actual_home = 1, 0
    else:
        actual_visitor, actual_home = 0, 1
    
    # Update Elo ratings
    visitor_elo_post = visitor_elo_pre + K_visitor * (actual_visitor - exp_visitor)
    home_elo_post = home_elo_pre + K_home * (actual_home - exp_home)
    elo_vec[visitor_team] = visitor_elo_post
    elo_vec[home_team] = home_elo_post
    full25season.at[i, 'visitor_elo_post'] = visitor_elo_post
    full25season.at[i, 'home_elo_post'] = home_elo_post
    
    # Update overall game counters
    games_played_vec[visitor_team] += 1
    games_played_vec[home_team] += 1
    
    # Update season wins based on the outcome of this game
    season_wins[visitor_key] = season_wins.get(visitor_key, 0) + actual_visitor
    season_wins[home_key] = season_wins.get(home_key, 0) + actual_home

# Compute recent win margins (rolling last 5 games)
recent_margins = {team: deque(maxlen=5) for team in teams}
full25season['home_recent_margin'] = 0.0
full25season['visitor_recent_margin'] = 0.0

for i, game in full25season.iterrows():
    home_team = game['Home']
    visitor_team = game['Visitor']
    home_recent = np.mean(recent_margins[home_team]) if len(recent_margins[home_team]) > 0 else 0.0
    visitor_recent = np.mean(recent_margins[visitor_team]) if len(recent_margins[visitor_team]) > 0 else 0.0
    full25season.at[i, 'home_recent_margin'] = home_recent
    full25season.at[i, 'visitor_recent_margin'] = visitor_recent
    
    # Compute margin for current game
    home_margin = game['HPTS'] - game['VPTS']
    visitor_margin = game['VPTS'] - game['HPTS']
    
    recent_margins[home_team].append(home_margin)
    recent_margins[visitor_team].append(visitor_margin)

# Create target variable: home win (1 if HPTS > VPTS, else 0)
full25season['home_win'] = (full25season['HPTS'] > full25season['VPTS']).astype(int)


In [None]:
# ------------------------
# Graphing Elo Ratings Over Time (All Seasons)
# ------------------------
# Prepare Elo data by combining visitor and home records
visitor_elo_time = full25season[['Date_clean', 'Visitor', 'visitor_elo_post']].rename(
    columns={'Visitor': 'Team', 'visitor_elo_post': 'Elo'})
home_elo_time = full25season[['Date_clean', 'Home', 'home_elo_post']].rename(
    columns={'Home': 'Team', 'home_elo_post': 'Elo'})
elo_time = pd.concat([visitor_elo_time, home_elo_time], ignore_index=True)
elo_time = elo_time.sort_values(by=['Team', 'Date_clean'])

# Plot Elo ratings over time
plt.figure(figsize=(10, 6))
for team, group in elo_time.groupby('Team'):
    plt.plot(group['Date_clean'], group['Elo'], label=team, alpha=0.8)
plt.title("Elo Ratings Throughout All Seasons")
plt.xlabel("Date")
plt.ylabel("Elo Rating")
plt.legend(loc='lower center', bbox_to_anchor=(0.5, -0.3), ncol=5)
plt.tight_layout()
plt.show()


In [37]:
# ------------------------
# Machine Learning Training and Testing with Days Since Last Game
# ------------------------
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score, brier_score_loss

# Exclude each team's first 5 games per season (if needed)
ml_data = full25season[(full25season['home_game_num'] > 5) & (full25season['visitor_game_num'] > 5)].copy()

# Create new features using pre-game Elo ratings
ml_data['diff_elo'] = ml_data['home_elo_pre'] - ml_data['visitor_elo_pre']

# Additional features:
#   - home_recent_margin and visitor_recent_margin (from recent performance)
#   - win_pct_diff: Difference in season win percentages
#   - days_since_last: Use the new columns for days since last game
ml_data['win_pct_diff'] = ml_data['home_win_pct'] - ml_data['visitor_win_pct']

# Define the features to use (remove game_num_diff, add days since last game for both teams)
features = ['diff_elo', 'home_recent_margin', 'visitor_recent_margin', 'win_pct_diff', 
            'home_days_since_last', 'visitor_days_since_last']

X = ml_data[features]
y = ml_data['home_win']

# Standardize features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Split data into training and test sets (80/20 split; adjust test_size as needed)
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.01, random_state=42)

# Train the logistic regression model
model = LogisticRegression()
model.fit(X_train, y_train)

# Generate predictions and evaluate the model
y_prob = model.predict_proba(X_test)[:, 1]
y_pred = model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
brier = brier_score_loss(y_test, y_prob)
cm = confusion_matrix(y_test, y_pred)
coefficients = model.coef_[0]
intercept = model.intercept_[0]

# Output evaluation metrics and model parameters in formatted columns.
col_width = 20
num_width = 10
print(f"{'Metric':<{col_width}}{'Value':>{num_width}}")
print("-" * (col_width + num_width))
print(f"{'Accuracy':<{col_width}}{accuracy:>{num_width}.3f}")
print(f"{'Brier Score':<{col_width}}{brier:>{num_width}.3f}")
print(f"{'Intercept':<{col_width}}{intercept:>{num_width}.3f}")
print("\nFeature Coefficients:")
print(f"{'Feature':<{col_width}}{'Coefficient':>{num_width}}")
print("-" * (col_width + num_width))
for feature, coef in zip(features, coefficients):
    print(f"{feature:<{col_width}}{coef:>{num_width}.3f}")

print("\nConfusion Matrix:")
for row in np.round(cm, 3):
    print(" ".join(f"{val:>{num_width}.3f}" for val in row))


Metric                   Value
------------------------------
Accuracy                 0.629
Brier Score              0.219
Intercept                0.280

Feature Coefficients:
Feature             Coefficient
------------------------------
diff_elo                 0.503
home_recent_margin       0.134
visitor_recent_margin    -0.070
win_pct_diff             0.171
home_days_since_last     0.173
visitor_days_since_last    -0.161

Confusion Matrix:
     9.000      7.000
     6.000     13.000


In [38]:
# ------------------------
# Create Matchups DataFrame Using Logistic Regression Probabilities (with Days Since Last Game)
# ------------------------

# Define the feature names used in the logistic regression model for matchups (excluding sum_elo).
features = ['diff_elo', 'home_recent_margin', 'visitor_recent_margin', 'win_pct_diff', 
            'home_days_since_last', 'visitor_days_since_last']

# For each team, extract the most recent metrics from the full dataset.
latest_home = full25season.sort_values('Date_clean').groupby('Home').last().reset_index()
latest_away = full25season.sort_values('Date_clean').groupby('Visitor').last().reset_index()

# Create dictionaries mapping team to their latest metrics for home and away.
home_metrics = {row['Home']: row for _, row in latest_home.iterrows()}
away_metrics = {row['Visitor']: row for _, row in latest_away.iterrows()}

matchup_list = []
teams_list = list(elo_vec.keys())

# Loop over every possible ordered pair (i.e. both home/away roles)
for home_team in teams_list:
    for away_team in teams_list:
        if home_team != away_team:
            # Retrieve current Elo ratings from elo_vec
            home_elo = elo_vec[home_team]
            away_elo = elo_vec[away_team]
            diff_elo = home_elo - away_elo
            
            # Get home metrics (or use defaults if not available)
            if home_team in home_metrics:
                home_recent_margin = home_metrics[home_team].get('home_recent_margin', 0.0)
                home_win_pct = home_metrics[home_team].get('home_win_pct', 0.5)
                home_days_since_last = home_metrics[home_team].get('home_days_since_last', 7)
            else:
                home_recent_margin, home_win_pct, home_days_since_last = 0.0, 0.5, 7

            # Get away metrics (or use defaults if not available)
            if away_team in away_metrics:
                visitor_recent_margin = away_metrics[away_team].get('visitor_recent_margin', 0.0)
                visitor_win_pct = away_metrics[away_team].get('visitor_win_pct', 0.5)
                visitor_days_since_last = away_metrics[away_team].get('visitor_days_since_last', 7)
            else:
                visitor_recent_margin, visitor_win_pct, visitor_days_since_last = 0.0, 0.5, 7

            win_pct_diff = home_win_pct - visitor_win_pct

            # Form the feature vector according to the model.
            # Order: [diff_elo, home_recent_margin, visitor_recent_margin, win_pct_diff, home_days_since_last, visitor_days_since_last]
            feature_vector = [diff_elo, home_recent_margin, visitor_recent_margin, win_pct_diff, 
                              home_days_since_last, visitor_days_since_last]
            
            # Create a DataFrame with the feature vector to preserve feature names.
            feature_df = pd.DataFrame([feature_vector], columns=features)
            # Standardize using the previously fitted scaler.
            feature_vector_scaled = scaler.transform(feature_df)
            
            # Predict the home win probability using the logistic regression model.
            home_win_prob = model.predict_proba(feature_vector_scaled)[0, 1]
            away_win_prob = 1 - home_win_prob
            
            matchup_list.append({
                "Home": home_team,
                "Away": away_team,
                "home_win_prob": home_win_prob,
                "away_win_prob": away_win_prob
            })

# Create the matchups DataFrame.
matchups_df = pd.DataFrame(matchup_list)
print(matchups_df.head(20))


              Home                    Away  home_win_prob  away_win_prob
0   Boston Celtics   Golden State Warriors       0.733719       0.266281
1   Boston Celtics       Memphis Grizzlies       0.762658       0.237342
2   Boston Celtics              Miami Heat       0.877624       0.122376
3   Boston Celtics         Toronto Raptors       0.909032       0.090968
4   Boston Celtics  Minnesota Timberwolves       0.700479       0.299521
5   Boston Celtics       San Antonio Spurs       0.852951       0.147049
6   Boston Celtics               Utah Jazz       0.929754       0.070246
7   Boston Celtics            Phoenix Suns       0.815722       0.184278
8   Boston Celtics        Sacramento Kings       0.846788       0.153212
9   Boston Celtics           Atlanta Hawks       0.821003       0.178997
10  Boston Celtics           Brooklyn Nets       0.902003       0.097997
11  Boston Celtics         Detroit Pistons       0.767547       0.232453
12  Boston Celtics          Indiana Pacers       0.