In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix
import xgboost as xgb

In [2]:
from google.colab import drive
import os

# Mount Google Drive
drive.mount('/content/drive', force_remount=True)

# Path to the shared folder
dataset_path = '/content/drive/My Drive/cfb_data'
os.chdir(dataset_path)

Mounted at /content/drive


Importing Dataset

In [3]:
cfp_data = pd.read_csv("all_data.csv")
display(cfp_data.head())
display(cfp_data.shape)
display(cfp_data.info())


Unnamed: 0,home_offense_ppa,home_offense_success_rate,home_offense_explosiveness,home_offense_power_success,home_offense_stuff_rate,home_offense_line_yards,home_offense_second_level_yards,home_offense_open_field_yards,home_offense_points_per_opportunity,home_offense_field_position_average_start,...,away_defense_rushing_plays_success_rate,away_defense_rushing_plays_explosiveness,away_defense_passing_plays_rate,away_defense_passing_plays_ppa,away_defense_passing_plays_success_rate,away_defense_passing_plays_explosiveness,home_team,home_points,away_team,away_points
0,0.307093,0.520231,0.990004,1.0,0.073826,3.81745,1.194631,1.161074,5.0,67.4,...,0.552239,0.901251,0.221591,-0.181437,0.333333,1.371141,Air Force,48.0,Georgia State,14.0
1,0.159582,0.385159,1.441978,0.818182,0.231884,2.67971,1.202899,2.42029,3.4,71.3,...,0.279412,1.056908,0.623656,0.261043,0.336207,2.11421,Utah State,20.0,Air Force,27.0
2,0.233796,0.490323,0.997993,0.823529,0.106719,3.398024,1.051383,0.86166,4.2,66.5,...,0.418182,0.912447,0.378182,0.373564,0.432692,1.734858,Air Force,28.0,Navy,14.0
3,0.149251,0.42217,1.226632,0.777778,0.224335,2.827376,1.087452,1.403042,3.871795,67.0,...,0.296552,1.160356,0.554545,0.20149,0.355191,1.915481,Wyoming,35.0,Air Force,26.0
4,0.238808,0.45618,1.167261,0.756098,0.140299,3.194925,1.044776,1.155224,4.297297,68.7,...,0.36,1.198131,0.466837,0.443473,0.448087,1.863659,Air Force,40.0,New Mexico,45.0


(5959, 124)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5959 entries, 0 to 5958
Columns: 124 entries, home_offense_ppa to away_points
dtypes: float64(122), object(2)
memory usage: 5.6+ MB


None

Adding a Boolean Win/Loss Column

In [4]:
cfp_data['winner'] = (cfp_data['home_points'] > cfp_data['away_points']).astype(int)


Variable Selection

In [5]:
import pandas as pd

# Select only numerical columns
numerical_data = cfp_data.select_dtypes(include=['number'])

# Calculate correlation matrix
correlation_matrix = numerical_data.corr()

# Identify highly correlated features (e.g., above 0.8)
correlated_features = set()
for i in range(len(correlation_matrix.columns)):
    for j in range(i):
        if abs(correlation_matrix.iloc[i, j]) > 0.8:
            correlated_features.add(correlation_matrix.columns[i])

# Exclude specific columns from removal
protected_columns = {'home_points', 'away_points', 'winner'}
correlated_features -= protected_columns  # Remove protected columns from the set

# Drop correlated features
data_reduced = cfp_data.drop(columns=correlated_features)

# Print the reduced dataset
display(data_reduced)

# Print remaining columns
display(data_reduced.columns)
display(data_reduced.shape)


#


Unnamed: 0,home_offense_ppa,home_offense_explosiveness,home_offense_power_success,home_offense_stuff_rate,home_offense_second_level_yards,home_offense_open_field_yards,home_offense_points_per_opportunity,home_offense_field_position_average_start,home_offense_havoc_total,home_offense_havoc_db,...,away_defense_passing_downs_explosiveness,away_defense_rushing_plays_rate,away_defense_rushing_plays_ppa,away_defense_rushing_plays_explosiveness,away_defense_passing_plays_explosiveness,home_team,home_points,away_team,away_points,winner
0,0.307093,0.990004,1.000000,0.073826,1.194631,1.161074,5.000000,67.4,0.069364,0.011561,...,1.522826,0.761364,0.327729,0.901251,1.371141,Air Force,48.0,Georgia State,14.0,1
1,0.159582,1.441978,0.818182,0.231884,1.202899,2.420290,3.400000,71.3,0.226148,0.095406,...,2.362168,0.365591,-0.109887,1.056908,2.114210,Utah State,20.0,Air Force,27.0,0
2,0.233796,0.997993,0.823529,0.106719,1.051383,0.861660,4.200000,66.5,0.074194,0.016129,...,1.604963,0.600000,0.087281,0.912447,1.734858,Air Force,28.0,Navy,14.0,1
3,0.149251,1.226632,0.777778,0.224335,1.087452,1.403042,3.871795,67.0,0.160377,0.044811,...,2.451553,0.439394,-0.009056,1.160356,1.915481,Wyoming,35.0,Air Force,26.0,1
4,0.238808,1.167261,0.756098,0.140299,1.044776,1.155224,4.297297,68.7,0.107865,0.026966,...,2.206819,0.510204,0.082097,1.198131,1.863659,Air Force,40.0,New Mexico,45.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5954,0.206092,1.293547,0.666667,0.200000,0.929730,0.762162,4.162162,70.1,0.163366,0.071782,...,2.092247,0.605701,0.210852,1.050796,1.664180,Western Kentucky,44.0,UTEP,17.0,1
5955,0.303245,1.135907,0.857143,0.076923,1.323077,1.015385,4.066667,73.8,0.100000,0.040000,...,2.110354,0.507042,0.136678,0.855070,1.435484,Wake Forest,30.0,Virginia,31.0,0
5956,0.176130,1.277665,0.771930,0.185446,1.150235,1.676056,4.223881,69.8,0.184073,0.053525,...,2.065585,0.496871,0.147410,0.982519,1.627343,Virginia Tech,37.0,Virginia,17.0,1
5957,0.426361,1.132529,0.833333,0.089888,1.662921,1.921348,4.333333,75.6,0.117021,0.021277,...,2.398876,0.443966,0.096702,1.031799,1.667135,Washington,19.0,Washington State,24.0,0


Index(['home_offense_ppa', 'home_offense_explosiveness',
       'home_offense_power_success', 'home_offense_stuff_rate',
       'home_offense_second_level_yards', 'home_offense_open_field_yards',
       'home_offense_points_per_opportunity',
       'home_offense_field_position_average_start', 'home_offense_havoc_total',
       'home_offense_havoc_db', 'home_offense_standard_downs_explosiveness',
       'home_offense_passing_downs_ppa',
       'home_offense_passing_downs_explosiveness',
       'home_offense_rushing_plays_rate', 'home_offense_rushing_plays_ppa',
       'home_offense_rushing_plays_explosiveness',
       'home_offense_passing_plays_explosiveness', 'home_defense_ppa',
       'home_defense_explosiveness', 'home_defense_power_success',
       'home_defense_stuff_rate', 'home_defense_second_level_yards',
       'home_defense_open_field_yards', 'home_defense_points_per_opportunity',
       'home_defense_field_position_average_start', 'home_defense_havoc_total',
       'home_def

(5959, 73)

Using a simple XGBoost model to further reduce the number of columns

In [6]:
from xgboost import XGBClassifier

# Partition the data into features (X) and target (y)
X = data_reduced .drop(columns=['winner', 'home_points', 'away_points'])
y = data_reduced['winner']

# Store original columns to re-add them later
home_away_columns = ['home_team', 'away_team']

# Temporarily remove 'home_team' and 'away_team' columns for training
X_reduced = X.drop(columns=home_away_columns)

# Apply One-Hot Encoding or Label Encoding to any categorical columns in the reduced dataset
X_encoded = pd.get_dummies(X_reduced, drop_first=True)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.2, random_state=42)

# Train an XGBoost model on the training data
model = XGBClassifier(random_state=42)
model.fit(X_train, y_train)

# Feature importance
importance = pd.DataFrame({'Feature': X_train.columns, 'Importance': model.feature_importances_})
importance = importance.sort_values(by='Importance', ascending=False)

# Select top features (e.g., top 20 features)
top_features = importance['Feature'][:20].tolist()

# Reduce the data to only the top features
X_train_reduced = X_train[top_features]
X_test_reduced = X_test[top_features]

print(f"Reduced training data to {X_train_reduced.shape[1]} features.")

Reduced training data to 20 features.


Create XGBoost Model

In [8]:
import xgboost as xgb
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split

# Convert datasets to DMatrix, required for xgboost.cv
dtrain = xgb.DMatrix(data=X_train_reduced, label=y_train)
dtest = xgb.DMatrix(data=X_test_reduced, label=y_test)

# Define a parameter grid to test
param_grid = {
    'objective': 'binary:logistic',
    'eval_metric': 'logloss',
    'max_depth': 5,
    'learning_rate': 0.1,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'n_estimators': 100,
    'gamma': 0,
    'reg_alpha': 0.01,
    'reg_lambda': 1
}

# Cross-Validation Parameters
cv_results = xgb.cv(
    params=param_grid,
    dtrain=dtrain,
    num_boost_round=200,
    nfold=3,
    metrics="logloss",
    early_stopping_rounds=10,
    verbose_eval=True
)

# Train the final model using the best number of rounds from CV
best_num_boost_round = len(cv_results)
print(f"Best number of boosting rounds: {best_num_boost_round}")

final_model = xgb.train(
    params=param_grid,
    dtrain=dtrain,
    num_boost_round=best_num_boost_round
)

# Make predictions on the test set
y_pred_probs = final_model.predict(dtest)
y_pred = [1 if prob > 0.5 else 0 for prob in y_pred_probs]

# Evaluate performance
accuracy = accuracy_score(y_test, y_pred)
print(f"Test Set Accuracy: {accuracy}")
print("Classification Report:\n", classification_report(y_test, y_pred))


[0]	train-logloss:0.64883+0.00123	test-logloss:0.65624+0.00191
[1]	train-logloss:0.62176+0.00114	test-logloss:0.63611+0.00177
[2]	train-logloss:0.59481+0.00115	test-logloss:0.61439+0.00106
[3]	train-logloss:0.57158+0.00095	test-logloss:0.59655+0.00138


Parameters: { "n_estimators" } are not used.



[4]	train-logloss:0.55080+0.00057	test-logloss:0.58047+0.00073
[5]	train-logloss:0.53252+0.00136	test-logloss:0.56695+0.00079
[6]	train-logloss:0.51604+0.00149	test-logloss:0.55517+0.00167
[7]	train-logloss:0.50080+0.00135	test-logloss:0.54444+0.00198
[8]	train-logloss:0.48729+0.00187	test-logloss:0.53505+0.00230
[9]	train-logloss:0.47506+0.00142	test-logloss:0.52724+0.00289
[10]	train-logloss:0.46362+0.00190	test-logloss:0.51963+0.00351
[11]	train-logloss:0.45302+0.00196	test-logloss:0.51298+0.00379
[12]	train-logloss:0.44312+0.00189	test-logloss:0.50699+0.00444
[13]	train-logloss:0.43441+0.00190	test-logloss:0.50141+0.00448
[14]	train-logloss:0.42536+0.00264	test-logloss:0.49714+0.00379
[15]	train-logloss:0.41790+0.00244	test-logloss:0.49237+0.00433
[16]	train-logloss:0.41009+0.00203	test-logloss:0.48869+0.00550
[17]	train-logloss:0.40321+0.00247	test-logloss:0.48541+0.00579
[18]	train-logloss:0.39659+0.00250	test-logloss:0.48157+0.00572
[19]	train-logloss:0.39066+0.00259	test-loglos

Parameters: { "n_estimators" } are not used.



Test Set Accuracy: 0.7869127516778524
Classification Report:
               precision    recall  f1-score   support

           0       0.76      0.73      0.74       509
           1       0.80      0.83      0.82       683

    accuracy                           0.79      1192
   macro avg       0.78      0.78      0.78      1192
weighted avg       0.79      0.79      0.79      1192



Allow user to select matchups and use XGBoost model to make predictions

In [9]:
import pickle
import xgboost as xgb
import numpy as np

# Load the data and the model
with open("/content/drive/My Drive/cfb_data/current_stats.pkl", "rb") as f:
    data = pickle.load(f)


# Function to get statistics for the given teams
def get_stats(home_team, away_team):
    home_team, away_team = home_team.strip(), away_team.strip()

    away_data = {}
    for key, val in data[away_team].items():
        new_key = key.replace("home", "away")
        away_data[new_key] = val

    return data[home_team] | away_data

def predict(home_team, away_team):
    stats = get_stats(home_team.strip(), away_team.strip())

    stats = {key: stats[key] for key in top_features}

    # Filter the stats based on the top features
    stats_list = list(stats.values())

    # Create the DMatrix from the 2D array
    dsample = xgb.DMatrix(data=[stats_list], feature_names=top_features)

    prediction = final_model.predict(dsample)

    # Output the prediction
    return prediction[0]


In [11]:
print(f'Teams: {data.keys()}')

while True:
    home_team = input("Enter Home Team: ")

    if home_team == 'End':
        break

    away_team = input("Enter Away Team: ")

    result = predict(home_team, away_team)

    if result > 0.5:
        print(f'The Model Predicts {home_team} to win with {result:.2f} probability')
    else:
        print(f'The Model Predicts {away_team} to win with {1-result:.2f} probability')



Teams: dict_keys(['Air Force', 'Akron', 'Alabama', 'App State', 'Arizona', 'Arizona State', 'Arkansas', 'Arkansas State', 'Army', 'Auburn', 'Ball State', 'Baylor', 'Boise State', 'Boston College', 'Bowling Green', 'Buffalo', 'BYU', 'California', 'Central Michigan', 'Charlotte', 'Cincinnati', 'Clemson', 'Coastal Carolina', 'Colorado', 'Colorado State', 'Duke', 'East Carolina', 'Eastern Michigan', 'Florida', 'Florida Atlantic', 'Florida International', 'Florida State', 'Fresno State', 'Georgia', 'Georgia Southern', 'Georgia State', 'Georgia Tech', "Hawai'i", 'Houston', 'Illinois', 'Indiana', 'Iowa', 'Iowa State', 'Jacksonville State', 'James Madison', 'Kansas', 'Kansas State', 'Kennesaw State', 'Kent State', 'Kentucky', 'Liberty', 'Louisiana', 'Louisiana Tech', 'Louisville', 'LSU', 'Marshall', 'Maryland', 'Massachusetts', 'Memphis', 'Miami', 'Miami (OH)', 'Michigan', 'Michigan State', 'Middle Tennessee', 'Minnesota', 'Mississippi State', 'Missouri', 'Navy', 'NC State', 'Nebraska', 'Nevad

Create Power Rankings

In [12]:
from collections import defaultdict

teams = list(data.keys())

diffs = defaultdict(float)

for team in teams:
  total_diff = 0
  for op in teams:
    if team == op:
      continue

    prob = predict(team, op)
    diff = prob

    prob = predict(op, team)
    diff += -1 * prob

    total_diff += diff

  diffs[team] += total_diff


for i, (team, _) in enumerate(sorted(diffs.items(), key=lambda x: -x[1])):
  print(f"{i+1}: {team}")

1: Notre Dame
2: Ohio State
3: Indiana
4: Ole Miss
5: Oregon
6: Penn State
7: Tennessee
8: Miami
9: Tulane
10: UNLV
11: Memphis
12: Boise State
13: SMU
14: Army
15: Clemson
16: Texas
17: Alabama
18: South Alabama
19: James Madison
20: Louisville
21: Arizona State
22: Georgia
23: South Carolina
24: TCU
25: Navy
26: Texas A&M
27: Iowa
28: Washington State
29: Georgia Tech
30: Texas State
31: Baylor
32: Colorado
33: Kansas State
34: Ohio
35: Marshall
36: USC
37: Illinois
38: Minnesota
39: BYU
40: Jacksonville State
41: Liberty
42: Missouri
43: Miami (OH)
44: Northern Illinois
45: UConn
46: Kansas
47: Boston College
48: Syracuse
49: Virginia Tech
50: Texas Tech
51: UCF
52: Utah
53: Florida
54: Michigan
55: Duke
56: Old Dominion
57: Buffalo
58: South Florida
59: Louisiana
60: LSU
61: California
62: Nebraska
63: UTSA
64: Fresno State
65: Arkansas
66: NC State
67: Vanderbilt
68: East Carolina
69: Auburn
70: Troy
71: Bowling Green
72: North Carolina
73: Iowa State
74: Louisiana Tech
75: Toledo