In [77]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor
import numpy as np

pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", None)

In [78]:
threshold = 6
team_data = pd.read_csv("./test_data.csv")
display(team_data.head())

Unnamed: 0,game_id,date,Team,Opponent,Team_Score,Opponent_Score,Location,ADJO,ADJD,EFF,EFG_pct,TO_Pct,OR_Pct,FTR_Pct,ThreePointer_Pct,FG_Pct,opp_ADJO,opp_ADJD,Opp_EFF,Opp_EFG_Pct,Opp_TO_Pct,Opp_OR_Pct,Opp_FTR_Pct,Opp_ThreePointer_Pct,Opp_FG_Pct
0,Cal St. BakersfieldNorth Dakota St.12-23,2024-12-23,Cal St. Bakersfield,North Dakota St.,60,94,A,85.2,125.3,87.8,46.3,16.1,21.2,38.9,28.57,42.59,133.4,89.4,137.6,62.7,8.8,20.0,68.6,37.04,52.94
1,CharlestonLoyola Chicago12-23,2024-12-23,Charleston,Loyola Chicago,77,68,N,101.7,91.9,100.6,56.0,20.9,9.7,27.6,42.31,46.55,95.7,97.7,88.8,46.6,24.8,31.6,34.5,25.0,41.38
2,CharlotteMurray St.12-23,2024-12-23,Charlotte,Murray St.,94,90,N,118.7,110.4,116.6,50.0,17.4,36.4,60.0,22.22,46.67,104.9,115.0,111.7,46.4,8.7,28.2,52.2,23.08,42.03
3,Middle TennesseeTennessee12-23,2024-12-23,Middle Tennessee,Tennessee,64,82,A,116.4,98.5,91.1,48.3,15.7,15.8,23.7,30.0,40.68,112.6,95.3,116.7,57.1,15.7,25.8,41.1,45.45,48.21
4,NebraskaHawaii12-23,2024-12-23,Nebraska,Hawaii,69,55,A,107.8,93.4,111.0,53.6,19.3,33.3,20.0,41.18,47.27,103.0,102.5,88.5,47.9,25.7,32.1,31.9,25.0,42.55


In [79]:
team_data["Location"] = np.where(
    team_data["Location"] == "N", 0, np.where(team_data["Location"] == "H", 1, -1)
)

In [80]:
columns_to_convert = [
    "Location",
    "ADJO",
    "ADJD",
    "EFG_pct",
    "TO_Pct",
    "OR_Pct",
    "FTR_Pct",
    
    "opp_ADJO",
    "opp_ADJD",
    "Opp_EFG_Pct",
    "Opp_TO_Pct",
    "Opp_OR_Pct",
    "Opp_FTR_Pct",
    
]
for col in columns_to_convert:
    team_data[col] = pd.to_numeric(team_data[col], errors="coerce")

In [81]:
team_data_cleaned = team_data.dropna()

X = team_data_cleaned[columns_to_convert]
y_team = team_data_cleaned["Team_Score"]
y_opp = team_data_cleaned["Opponent_Score"]

X_train, X_test, y_team_train, y_team_test = train_test_split(
    X, y_team, test_size=0.2, random_state=42
)
y_opp_train, y_opp_test = train_test_split(y_opp, test_size=0.2, random_state=42)

model_team = LinearRegression()
model_team.fit(X_train, y_team_train)
team_pred = model_team.predict(X_test)

model_opp = LinearRegression()
model_opp.fit(X_train, y_opp_train)
opp_pred = model_opp.predict(X_test)

team_rmse = mean_squared_error(y_team_test, team_pred, squared=False)
opp_rmse = mean_squared_error(y_opp_test, opp_pred, squared=False)
print(f"Team Score RMSE: {team_rmse}, Opponent Score RMSE: {opp_rmse}")

Team Score RMSE: 22.193593904211234, Opponent Score RMSE: 7.667345461015503




In [82]:
team_accuracy = (abs(team_pred - y_team_test) <= threshold).mean() * 100
opp_accuracy = (abs(opp_pred - y_opp_test) <= threshold).mean() * 100
print(f"Team Score Accuracy: {team_accuracy:.2f}%")
print(f"Opponent Score Accuracy: {opp_accuracy:.2f}%")

Team Score Accuracy: 0.00%
Opponent Score Accuracy: 50.00%


In [83]:
predict_games = pd.read_csv("./test_data.csv")

In [84]:
predict_games["Location"] = np.where(
    predict_games["Location"] == "Neutral",
    0,
    np.where(predict_games["Location"] == "Home", 1, -1),
)

In [85]:
predict_games

Unnamed: 0,game_id,date,Team,Opponent,Team_Score,Opponent_Score,Location,ADJO,ADJD,EFF,EFG_pct,TO_Pct,OR_Pct,FTR_Pct,ThreePointer_Pct,FG_Pct,opp_ADJO,opp_ADJD,Opp_EFF,Opp_EFG_Pct,Opp_TO_Pct,Opp_OR_Pct,Opp_FTR_Pct,Opp_ThreePointer_Pct,Opp_FG_Pct
0,Cal St. BakersfieldNorth Dakota St.12-23,2024-12-23,Cal St. Bakersfield,North Dakota St.,60,94,-1,85.2,125.3,87.8,46.3,16.1,21.2,38.9,28.57,42.59,133.4,89.4,137.6,62.7,8.8,20.0,68.6,37.04,52.94
1,CharlestonLoyola Chicago12-23,2024-12-23,Charleston,Loyola Chicago,77,68,-1,101.7,91.9,100.6,56.0,20.9,9.7,27.6,42.31,46.55,95.7,97.7,88.8,46.6,24.8,31.6,34.5,25.0,41.38
2,CharlotteMurray St.12-23,2024-12-23,Charlotte,Murray St.,94,90,-1,118.7,110.4,116.6,50.0,17.4,36.4,60.0,22.22,46.67,104.9,115.0,111.7,46.4,8.7,28.2,52.2,23.08,42.03
3,Middle TennesseeTennessee12-23,2024-12-23,Middle Tennessee,Tennessee,64,82,-1,116.4,98.5,91.1,48.3,15.7,15.8,23.7,30.0,40.68,112.6,95.3,116.7,57.1,15.7,25.8,41.1,45.45,48.21
4,NebraskaHawaii12-23,2024-12-23,Nebraska,Hawaii,69,55,-1,107.8,93.4,111.0,53.6,19.3,33.3,20.0,41.18,47.27,103.0,102.5,88.5,47.9,25.7,32.1,31.9,25.0,42.55
5,OaklandOregon St.12-23,2024-12-23,Oakland,Oregon St.,74,80,-1,120.1,108.1,108.5,50.9,20.5,38.2,42.9,19.23,46.43,110.7,112.4,117.3,50.8,16.1,46.2,28.8,45.83,42.42
6,SeattleWashington12-23,2024-12-23,Seattle,Washington,79,70,-1,120.1,92.5,109.6,44.6,11.1,21.9,57.1,22.22,41.07,91.8,112.1,97.1,47.4,16.7,21.6,40.4,25.0,42.11


In [86]:
X = predict_games[columns_to_convert]
display(X)

Unnamed: 0,Location,ADJO,ADJD,EFG_pct,TO_Pct,OR_Pct,FTR_Pct,opp_ADJO,opp_ADJD,Opp_EFG_Pct,Opp_TO_Pct,Opp_OR_Pct,Opp_FTR_Pct
0,-1,85.2,125.3,46.3,16.1,21.2,38.9,133.4,89.4,62.7,8.8,20.0,68.6
1,-1,101.7,91.9,56.0,20.9,9.7,27.6,95.7,97.7,46.6,24.8,31.6,34.5
2,-1,118.7,110.4,50.0,17.4,36.4,60.0,104.9,115.0,46.4,8.7,28.2,52.2
3,-1,116.4,98.5,48.3,15.7,15.8,23.7,112.6,95.3,57.1,15.7,25.8,41.1
4,-1,107.8,93.4,53.6,19.3,33.3,20.0,103.0,102.5,47.9,25.7,32.1,31.9
5,-1,120.1,108.1,50.9,20.5,38.2,42.9,110.7,112.4,50.8,16.1,46.2,28.8
6,-1,120.1,92.5,44.6,11.1,21.9,57.1,91.8,112.1,47.4,16.7,21.6,40.4


In [87]:
from pygam import LinearGAM, s, f
from sklearn.metrics import mean_squared_error

# Fit PyGAM for Team Score
gam_team = LinearGAM(
    s(0) + s(1) + s(2) + s(3) + s(4) + s(5) + s(6) + s(7) + s(8) + s(9) + s(10)
).fit(X_train, y_team_train)
team_gam_pred = gam_team.predict(X_test)

# Fit PyGAM for Opponent Score
gam_opp = LinearGAM(
    s(0) + s(1) + s(2) + s(3) + s(4) + s(5) + s(6) + s(7) + s(8) + s(9) + s(10)
).fit(X_train, y_opp_train)
opp_gam_pred = gam_opp.predict(X_test)

# Calculate RMSE for PyGAM
team_gam_rmse = mean_squared_error(y_team_test, team_gam_pred, squared=False)
opp_gam_rmse = mean_squared_error(y_opp_test, opp_gam_pred, squared=False)
print(f"Team Score RMSE (PyGAM): {team_gam_rmse}")
print(f"Opponent Score RMSE (PyGAM): {opp_gam_rmse}")

# Calculate accuracy for PyGAM
team_gam_accuracy = (abs(team_gam_pred - y_team_test) <= threshold).mean() * 100
opp_gam_accuracy = (abs(opp_gam_pred - y_opp_test) <= threshold).mean() * 100
print(f"Team Score Accuracy (PyGAM): {team_gam_accuracy:.2f}%")
print(f"Opponent Score Accuracy (PyGAM): {opp_gam_accuracy:.2f}%")

# Predict scores using PyGAM
team_gam_pred_new = gam_team.predict(X)
opp_gam_pred_new = gam_opp.predict(X)

# Add PyGAM predictions to the table
predictions_gam = pd.DataFrame(
    {
        "Predicted Team Score GAM": team_gam_pred_new,
        "Predicted Opponent Score GAM": opp_gam_pred_new,
    }
)

predict_games = pd.concat([predict_games, predictions_gam], axis=1)

Team Score RMSE (PyGAM): 20.303894843655566
Opponent Score RMSE (PyGAM): 35.185444916123714
Team Score Accuracy (PyGAM): 0.00%
Opponent Score Accuracy (PyGAM): 0.00%




In [88]:
team_pred_new = model_team.predict(X)
opp_pred_new = model_opp.predict(X)

predictions = pd.DataFrame(
    {
        "Predicted Team Score LR": team_pred_new,
        "Predicted Opponent Score LR": opp_pred_new,
    }
)

In [89]:
scores = team_data_cleaned[["Team", "Opponent", "Team_Score", "Opponent_Score"]]
display(scores)

Unnamed: 0,Team,Opponent,Team_Score,Opponent_Score
0,Cal St. Bakersfield,North Dakota St.,60,94
1,Charleston,Loyola Chicago,77,68
2,Charlotte,Murray St.,94,90
3,Middle Tennessee,Tennessee,64,82
4,Nebraska,Hawaii,69,55
5,Oakland,Oregon St.,74,80
6,Seattle,Washington,79,70


In [90]:
predict_games = pd.concat([predict_games, predictions], axis=1)

In [91]:
predict_games[
    [
        "Team",
        "Opponent",
        "Predicted Team Score LR",
        "Predicted Opponent Score LR",
    ]
]

Unnamed: 0,Team,Opponent,Predicted Team Score LR,Predicted Opponent Score LR
0,Cal St. Bakersfield,North Dakota St.,88.908696,95.410622
1,Charleston,Loyola Chicago,64.775363,57.410993
2,Charlotte,Murray St.,93.998231,90.16211
3,Middle Tennessee,Tennessee,64.0,82.0
4,Nebraska,Hawaii,69.0,55.0
5,Oakland,Oregon St.,73.998231,80.16211
6,Seattle,Washington,79.0,70.0


In [92]:
# Add columns for averages
predict_games["Average Predicted Team Score"] = predict_games[
    ["Predicted Team Score LR", "Predicted Team Score GAM"]
].mean(axis=1)

predict_games["Average Predicted Opponent Score"] = predict_games[
    ["Predicted Opponent Score LR", "Predicted Opponent Score GAM"]
].mean(axis=1)

# Select columns to display
columns_to_display = [
    "Team",
    "Opponent",
    "Team_Score",
    "Opponent_Score",
    "Predicted Team Score LR",
    "Predicted Opponent Score LR",
    "Predicted Team Score GAM",
    "Predicted Opponent Score GAM",
    "Average Predicted Team Score",
    "Average Predicted Opponent Score",
]

display(predict_games[columns_to_display])

Unnamed: 0,Team,Opponent,Team_Score,Opponent_Score,Predicted Team Score LR,Predicted Opponent Score LR,Predicted Team Score GAM,Predicted Opponent Score GAM,Average Predicted Team Score,Average Predicted Opponent Score
0,Cal St. Bakersfield,North Dakota St.,60,94,88.908696,95.410622,47.645649,47.361145,68.277172,71.385883
1,Charleston,Loyola Chicago,77,68,64.775363,57.410993,54.16461,53.732079,59.469986,55.571536
2,Charlotte,Murray St.,94,90,93.998231,90.16211,59.235701,58.376422,76.616966,74.269266
3,Middle Tennessee,Tennessee,64,82,64.0,82.0,62.631917,63.005566,63.315958,72.502783
4,Nebraska,Hawaii,69,55,69.0,55.0,62.393468,61.413751,65.696734,58.206875
5,Oakland,Oregon St.,74,80,73.998231,80.16211,73.857716,73.431952,73.927974,76.797031
6,Seattle,Washington,79,70,79.0,70.0,76.037235,75.077694,77.518617,72.538847
