This notebook trains classification models to predict Super Bowl winners.

**Goals:**
- Train logistic regression and random forest classifiers.
- Evaluate performance using accuracy, precision, recall, and F1-score.
- Save the best-performing model for future use.

**Tools:**
- scikit-learn for the classifiers and evaluation metrics
- joblib for saving the trained model

In [16]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, RocCurveDisplay
import matplotlib.pyplot as plt


In [17]:
# Load the cleaned team-season table, add the per-game / per-play ratio
# features, and zero-fill missing values, all as one chain so re-running the
# cell always starts again from the file on disk.
df = (
    pd.read_csv('../data/processed/nfl_team_stats_2003_2023_cleaned.csv')
    .assign(
        # Scoring normalised by games played.
        points_per_game=lambda t: t['points'] / t['g'],
        # Offensive efficiency: yards gained per offensive play.
        yards_per_play=lambda t: t['total_yards'] / t['plays_offense'],
        # Share of pass attempts that were completed.
        completion_rate=lambda t: t['pass_cmp'] / t['pass_att'],
        # Average yards per rushing attempt.
        rush_avg=lambda t: t['rush_yds'] / t['rush_att'],
    )
    # Every remaining NaN in any column (including NaN produced by missing
    # inputs to the ratios above) becomes 0.
    .fillna(0)
)


# Columns fed to the classifiers, in the order the models will see them.
features = (
    # Columns taken from the table as loaded (not derived in this notebook).
    ['points_diff', 'score_pct', 'turnover_pct']
    + ['pass_td', 'rush_td', 'penalties']
    # Ratio features computed when the data was loaded.
    + ['points_per_game', 'yards_per_play', 'completion_rate', 'rush_avg']
)

# Super Bowl champion for each season, keyed by the value in df['year'];
# team names must match df['team'] exactly.
winners = {
    2003: 'New England Patriots',
    2004: 'New England Patriots',
    2005: 'Pittsburgh Steelers',
    2006: 'Indianapolis Colts',
    2007: 'New York Giants',
    2008: 'Pittsburgh Steelers',
    2009: 'New Orleans Saints',
    2010: 'Green Bay Packers',
    2011: 'New York Giants',
    2012: 'Baltimore Ravens',
    2013: 'Seattle Seahawks',
    2014: 'New England Patriots',
    2015: 'Denver Broncos',
    2016: 'New England Patriots',
    2017: 'Philadelphia Eagles',
    2018: 'New England Patriots',
    2019: 'Kansas City Chiefs',
    2020: 'Tampa Bay Buccaneers',
    2021: 'Los Angeles Rams',
    2022: 'Kansas City Chiefs',
    2023: 'Kansas City Chiefs',
}

# Target column: 1 when the row's team is the champion of the row's year,
# otherwise 0. Looking the champion up per row replaces the per-year loop and
# rebuilds the whole column in one assignment, so re-running the cell is safe.
df['superbowl_winner'] = (df['year'].map(winners) == df['team']).astype(int)

# Fail loudly instead of training on silently wrong labels: every season that
# appears in the data and has an entry in `winners` must have produced a
# labeled row. A miss means the team name (or the year dtype) does not match.
seasons_in_data = set(df['year'].unique())
labeled_seasons = set(df.loc[df['superbowl_winner'] == 1, 'year'])
unmatched_seasons = sorted(
    season for season in winners
    if season in seasons_in_data and season not in labeled_seasons
)
if unmatched_seasons:
    raise ValueError(
        "No row matched the listed Super Bowl winner for season(s): "
        f"{unmatched_seasons}. Check team-name spelling against df['team']."
    )

    
# Binary target (1 = won the Super Bowl that season) and the feature matrix
# restricted to the columns named in `features`.
y = df['superbowl_winner']
X = df.loc[:, features]

In [18]:
# Hold out 20% of the rows for testing. Winners are rare, so stratify on y to
# keep their share similar in both splits; the fixed seed makes the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

In [19]:

# Both classifiers use class_weight='balanced' so the few winner rows are
# weighted up against the many non-winner rows.

# Logistic regression; max_iter is raised to 1000 to give the solver room to
# converge. fit() returns the estimator itself, so it can be chained.
log_model = LogisticRegression(class_weight='balanced', max_iter=1000).fit(X_train, y_train)

# Random forest of 100 trees with a fixed seed for reproducible results.
rf_model = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
    class_weight='balanced',
)
rf_model.fit(X_train, y_train)


In [20]:
import joblib
import os

# Create ../models on first run; exist_ok=True makes this a no-op afterwards.
os.makedirs(name='../models', exist_ok=True)

# Serialize the fitted random forest. joblib.dump returns the list of files it
# wrote, which the notebook shows as this cell's output.
joblib.dump(value=rf_model, filename='../models/rf_model.pkl')


['../models/rf_model.pkl']

Print the test-set results for the logistic regression and random forest models to decide which model to use.

In [21]:

def print_test_metrics(label, model, X_eval, y_eval):
    """Print the classification report and confusion matrix for one fitted model.

    Parameters
    ----------
    label : str
        Model name used in the "<label> Results:" header line.
    model : fitted estimator
        Any object exposing ``predict(X)``.
    X_eval, y_eval
        Features and true labels to score the model against.

    Returns
    -------
    The predictions returned by ``model.predict(X_eval)``.
    """
    print(f"{label} Results:")
    y_pred = model.predict(X_eval)
    # zero_division=0 still reports 0.00 for a class the model never predicts,
    # but without an UndefinedMetricWarning per metric (the random forest here
    # predicts no winners at all on the test split).
    print(classification_report(y_eval, y_pred, zero_division=0))
    print(confusion_matrix(y_eval, y_pred))
    return y_pred


# Same report for both models; predictions keep their original variable names.
y_pred_log = print_test_metrics("Logistic Regression", log_model, X_test, y_test)
y_pred_rf = print_test_metrics("Random Forest", rf_model, X_test, y_test)


Logistic Regression Results:
              precision    recall  f1-score   support

           0       1.00      0.78      0.88       131
           1       0.12      1.00      0.22         4

    accuracy                           0.79       135
   macro avg       0.56      0.89      0.55       135
weighted avg       0.97      0.79      0.86       135

[[102  29]
 [  0   4]]
Random Forest Results:
              precision    recall  f1-score   support

           0       0.97      1.00      0.98       131
           1       0.00      0.00      0.00         4

    accuracy                           0.97       135
   macro avg       0.49      0.50      0.49       135
weighted avg       0.94      0.97      0.96       135

[[131   0]
 [  4   0]]


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [22]:
# Probability of the positive class (column 1 of predict_proba) for every row.
# NOTE(review): X contains the rows the forest was trained on, so most of these
# probabilities are in-sample rather than held-out estimates.
winner_proba = rf_model.predict_proba(X)[:, 1]
df['win_probability'] = winner_proba

# Rank the 2023 teams from most to least likely winner and show the top ten.
is_2023 = df['year'] == 2023
df_2023 = df.loc[is_2023].sort_values(by='win_probability', ascending=False)
df_2023.loc[:, ['team', 'win_probability']].head(10)


Unnamed: 0,team,win_probability
652,Kansas City Chiefs,0.56
645,Cleveland Browns,0.06
644,Baltimore Ravens,0.03
656,Dallas Cowboys,0.03
669,Los Angeles Rams,0.02
668,San Francisco 49ers,0.02
640,Buffalo Bills,0.02
641,Miami Dolphins,0.01
660,Detroit Lions,0.01
661,Green Bay Packers,0.01


In [23]:
# Write the full table, now including the label and win_probability columns,
# next to the input file; index=False leaves the row index out of the CSV.
df.to_csv(
    path_or_buf='../data/processed/nfl_team_stats_2003_2023_with_probs.csv',
    index=False,
)

