In [93]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor
import numpy as np

pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", None)

In [94]:
threshold = 6
team_data = pd.read_csv("./train_data.csv")
display(team_data.head())

Unnamed: 0,game_id,date,Team,Opponent,Team_Score,Opponent_Score,Location,ADJO,ADJD,EFF,EFG_pct,TO_Pct,OR_Pct,FTR_Pct,ThreePointer_Pct,opp_ADJO,opp_ADJD,Opp_EFF,Opp_EFG_Pct,Opp_TO_Pct,Opp_OR_Pct,Opp_FTR_Pct,Opp_ThreePointer_Pct
0,Abilene ChristianBaylor12-9,2024-12-09,Abilene Christian,Baylor,57,88,A,94.8,112.7,86.7,36.4,15.2,27.0,41.8,14.29,130.2,93.9,133.8,66.1,13.7,35.7,27.1,44.44
1,Abilene ChristianKennesaw St.11-20,2024-11-20,Abilene Christian,Kennesaw St.,78,84,A,108.4,115.4,107.1,50.0,9.6,20.0,50.8,33.33,112.7,114.4,115.3,62.2,26.1,31.8,100.0,43.75
2,Abilene ChristianMontana St.11-26,2024-11-26,Abilene Christian,Montana St.,59,85,A,90.6,129.1,91.6,44.0,12.4,20.6,20.7,16.67,131.3,95.8,131.9,75.0,20.2,25.0,26.0,65.38
3,Abilene ChristianNebraska Omaha11-30,2024-11-30,Abilene Christian,Nebraska Omaha,71,55,A,105.4,84.1,107.2,51.9,15.1,24.1,35.2,30.77,81.2,114.1,83.1,42.0,24.2,17.2,54.5,31.25
4,Abilene ChristianNew Mexico St.12-4,2024-12-04,Abilene Christian,New Mexico St.,78,70,A,117.5,106.9,117.8,58.9,16.6,32.0,77.8,43.75,105.8,123.0,105.7,51.0,18.1,27.6,58.0,33.33


In [95]:
team_data["Location"] = np.where(
    team_data["Location"] == "N", 0, np.where(team_data["Location"] == "H", 1, -1)
)

In [96]:
columns_to_convert = [
    "Location",
    "ADJO",
    "ADJD",
    "EFG_pct",
    "TO_Pct",
    "OR_Pct",
    "FTR_Pct",
    
    "opp_ADJO",
    "opp_ADJD",
    "Opp_EFG_Pct",
    "Opp_TO_Pct",
    "Opp_OR_Pct",
    "Opp_FTR_Pct",
    
]
for col in columns_to_convert:
    team_data[col] = pd.to_numeric(team_data[col], errors="coerce")

In [97]:
team_data_cleaned = team_data.dropna()

X = team_data_cleaned[columns_to_convert]
y_team = team_data_cleaned["Team_Score"]
y_opp = team_data_cleaned["Opponent_Score"]

X_train, X_test, y_team_train, y_team_test = train_test_split(
    X, y_team, test_size=0.2, random_state=42
)
y_opp_train, y_opp_test = train_test_split(y_opp, test_size=0.2, random_state=42)

model_team = LinearRegression()
model_team.fit(X_train, y_team_train)
team_pred = model_team.predict(X_test)

model_opp = LinearRegression()
model_opp.fit(X_train, y_opp_train)
opp_pred = model_opp.predict(X_test)

team_rmse = mean_squared_error(y_team_test, team_pred, squared=False)
opp_rmse = mean_squared_error(y_opp_test, opp_pred, squared=False)
print(f"Team Score RMSE: {team_rmse}, Opponent Score RMSE: {opp_rmse}")

Team Score RMSE: 6.204308023366582, Opponent Score RMSE: 6.6469850727915585




In [98]:
team_accuracy = (abs(team_pred - y_team_test) <= threshold).mean() * 100
opp_accuracy = (abs(opp_pred - y_opp_test) <= threshold).mean() * 100
print(f"Team Score Accuracy: {team_accuracy:.2f}%")
print(f"Opponent Score Accuracy: {opp_accuracy:.2f}%")

Team Score Accuracy: 70.12%
Opponent Score Accuracy: 66.42%


In [99]:
predict_games = pd.read_csv("./test_file_29th.csv")

In [100]:
predict_games

Unnamed: 0,date,Team,Opponent,Location,ADJO,ADJD,EFG_pct,TO_Pct,OR_Pct,FTR_Pct,ThreePointer_Pct,Opp_ADJO,Opp_ADJD,Opp_EFG_pct,Opp_TO_Pct,Opp_OR_Pct,Opp_FTR_Pct,Opp_ThreePointer_Pct
0,2024-12-29,Texas St.,UT Arlington,H,109.504793,104.248766,51.8,20.1,35.0,39.5,39.0,108.781738,110.471294,51.4,18.0,33.4,34.3,37.8
1,2024-12-29,Washington,NJIT,H,103.623921,100.980171,49.2,17.1,31.0,41.7,32.4,90.805077,110.393403,44.2,19.5,27.1,27.5,29.7
2,2024-12-29,North Carolina,Campbell,H,116.097482,99.425272,52.8,15.0,28.7,39.8,31.6,96.043679,108.132847,47.5,17.7,22.6,35.9,30.7
3,2024-12-29,Iona,Harvard,H,96.975713,106.396593,44.8,23.8,41.3,34.3,30.3,98.117396,109.223416,48.7,18.9,26.2,24.7,30.1
4,2024-12-29,Army,UTSA,H,104.290586,114.008871,49.4,14.0,28.6,26.3,31.9,104.279709,109.569165,48.3,18.8,30.4,29.8,34.0
5,2024-12-29,Northwestern,Northeastern,H,108.543184,92.738343,49.4,13.5,29.9,33.5,33.2,101.571496,101.445543,51.9,18.5,29.3,31.3,32.4
6,2024-12-29,Alabama,South Dakota St.,H,122.500118,95.996245,54.7,17.6,36.7,43.4,31.2,109.213988,102.139328,52.4,18.3,31.1,29.4,32.7
7,2024-12-29,Ohio St.,Indiana St.,H,114.330437,97.647266,57.4,15.3,28.1,35.2,39.9,109.379733,113.091958,59.1,21.2,26.5,29.6,38.0
8,2024-12-29,Illinois,Chicago St.,H,116.083343,90.659572,51.5,16.0,36.4,39.9,32.3,88.940352,110.366604,40.9,20.5,24.3,27.8,29.1
9,2024-12-29,Oregon,Weber St.,H,116.967225,95.369118,52.2,16.4,34.4,40.3,33.6,104.761323,111.655595,49.4,16.6,29.0,33.3,30.8


In [101]:
predict_games["Location"] = np.where(
    predict_games["Location"] == "N",
    0,
    np.where(predict_games["Location"] == "H", 1, -1),
)

In [102]:
predict_games

Unnamed: 0,date,Team,Opponent,Location,ADJO,ADJD,EFG_pct,TO_Pct,OR_Pct,FTR_Pct,ThreePointer_Pct,Opp_ADJO,Opp_ADJD,Opp_EFG_pct,Opp_TO_Pct,Opp_OR_Pct,Opp_FTR_Pct,Opp_ThreePointer_Pct
0,2024-12-29,Texas St.,UT Arlington,1,109.504793,104.248766,51.8,20.1,35.0,39.5,39.0,108.781738,110.471294,51.4,18.0,33.4,34.3,37.8
1,2024-12-29,Washington,NJIT,1,103.623921,100.980171,49.2,17.1,31.0,41.7,32.4,90.805077,110.393403,44.2,19.5,27.1,27.5,29.7
2,2024-12-29,North Carolina,Campbell,1,116.097482,99.425272,52.8,15.0,28.7,39.8,31.6,96.043679,108.132847,47.5,17.7,22.6,35.9,30.7
3,2024-12-29,Iona,Harvard,1,96.975713,106.396593,44.8,23.8,41.3,34.3,30.3,98.117396,109.223416,48.7,18.9,26.2,24.7,30.1
4,2024-12-29,Army,UTSA,1,104.290586,114.008871,49.4,14.0,28.6,26.3,31.9,104.279709,109.569165,48.3,18.8,30.4,29.8,34.0
5,2024-12-29,Northwestern,Northeastern,1,108.543184,92.738343,49.4,13.5,29.9,33.5,33.2,101.571496,101.445543,51.9,18.5,29.3,31.3,32.4
6,2024-12-29,Alabama,South Dakota St.,1,122.500118,95.996245,54.7,17.6,36.7,43.4,31.2,109.213988,102.139328,52.4,18.3,31.1,29.4,32.7
7,2024-12-29,Ohio St.,Indiana St.,1,114.330437,97.647266,57.4,15.3,28.1,35.2,39.9,109.379733,113.091958,59.1,21.2,26.5,29.6,38.0
8,2024-12-29,Illinois,Chicago St.,1,116.083343,90.659572,51.5,16.0,36.4,39.9,32.3,88.940352,110.366604,40.9,20.5,24.3,27.8,29.1
9,2024-12-29,Oregon,Weber St.,1,116.967225,95.369118,52.2,16.4,34.4,40.3,33.6,104.761323,111.655595,49.4,16.6,29.0,33.3,30.8


In [103]:
predict_games.rename(columns={
    'Opp_ADJO': 'opp_ADJO',
    'Opp_ADJD': 'opp_ADJD',
    'Opp_EFG_pct': 'Opp_EFG_Pct'
}, inplace=True)

In [104]:
X = predict_games[columns_to_convert]
display(X)

Unnamed: 0,Location,ADJO,ADJD,EFG_pct,TO_Pct,OR_Pct,FTR_Pct,opp_ADJO,opp_ADJD,Opp_EFG_Pct,Opp_TO_Pct,Opp_OR_Pct,Opp_FTR_Pct
0,1,109.504793,104.248766,51.8,20.1,35.0,39.5,108.781738,110.471294,51.4,18.0,33.4,34.3
1,1,103.623921,100.980171,49.2,17.1,31.0,41.7,90.805077,110.393403,44.2,19.5,27.1,27.5
2,1,116.097482,99.425272,52.8,15.0,28.7,39.8,96.043679,108.132847,47.5,17.7,22.6,35.9
3,1,96.975713,106.396593,44.8,23.8,41.3,34.3,98.117396,109.223416,48.7,18.9,26.2,24.7
4,1,104.290586,114.008871,49.4,14.0,28.6,26.3,104.279709,109.569165,48.3,18.8,30.4,29.8
5,1,108.543184,92.738343,49.4,13.5,29.9,33.5,101.571496,101.445543,51.9,18.5,29.3,31.3
6,1,122.500118,95.996245,54.7,17.6,36.7,43.4,109.213988,102.139328,52.4,18.3,31.1,29.4
7,1,114.330437,97.647266,57.4,15.3,28.1,35.2,109.379733,113.091958,59.1,21.2,26.5,29.6
8,1,116.083343,90.659572,51.5,16.0,36.4,39.9,88.940352,110.366604,40.9,20.5,24.3,27.8
9,1,116.967225,95.369118,52.2,16.4,34.4,40.3,104.761323,111.655595,49.4,16.6,29.0,33.3


In [105]:
from pygam import LinearGAM, s, f
from sklearn.metrics import mean_squared_error

# Fit PyGAM for Team Score
gam_team = LinearGAM(
    s(0) + s(1) + s(2) + s(3) + s(4) + s(5) + s(6) + s(7) + s(8) + s(9) + s(10)
).fit(X_train, y_team_train)
team_gam_pred = gam_team.predict(X_test)

# Fit PyGAM for Opponent Score
gam_opp = LinearGAM(
    s(0) + s(1) + s(2) + s(3) + s(4) + s(5) + s(6) + s(7) + s(8) + s(9) + s(10)
).fit(X_train, y_opp_train)
opp_gam_pred = gam_opp.predict(X_test)

# Calculate RMSE for PyGAM
team_gam_rmse = mean_squared_error(y_team_test, team_gam_pred, squared=False)
opp_gam_rmse = mean_squared_error(y_opp_test, opp_gam_pred, squared=False)
print(f"Team Score RMSE (PyGAM): {team_gam_rmse}")
print(f"Opponent Score RMSE (PyGAM): {opp_gam_rmse}")

# Calculate accuracy for PyGAM
team_gam_accuracy = (abs(team_gam_pred - y_team_test) <= threshold).mean() * 100
opp_gam_accuracy = (abs(opp_gam_pred - y_opp_test) <= threshold).mean() * 100
print(f"Team Score Accuracy (PyGAM): {team_gam_accuracy:.2f}%")
print(f"Opponent Score Accuracy (PyGAM): {opp_gam_accuracy:.2f}%")

# Predict scores using PyGAM
team_gam_pred_new = gam_team.predict(X)
opp_gam_pred_new = gam_opp.predict(X)

# Add PyGAM predictions to the table
predictions_gam = pd.DataFrame(
    {
        "Predicted Team Score GAM": team_gam_pred_new,
        "Predicted Opponent Score GAM": opp_gam_pred_new,
    }
)

predict_games = pd.concat([predict_games, predictions_gam], axis=1)

Team Score RMSE (PyGAM): 6.316753294080223
Opponent Score RMSE (PyGAM): 7.49093610551907
Team Score Accuracy (PyGAM): 69.63%
Opponent Score Accuracy (PyGAM): 60.49%




In [106]:
team_pred_new = model_team.predict(X)
opp_pred_new = model_opp.predict(X)

predictions = pd.DataFrame(
    {
        "Predicted Team Score LR": team_pred_new,
        "Predicted Opponent Score LR": opp_pred_new,
    }
)

In [107]:
scores = team_data_cleaned[["Team", "Opponent", "Team_Score", "Opponent_Score"]]

In [108]:
predict_games = pd.concat([predict_games, predictions], axis=1)

In [109]:
predict_games[
    [
        "Team",
        "Opponent",
        "Predicted Team Score LR",
        "Predicted Opponent Score LR",
    ]
]

Unnamed: 0,Team,Opponent,Predicted Team Score LR,Predicted Opponent Score LR
0,Texas St.,UT Arlington,74.618049,73.674042
1,Washington,NJIT,72.862243,61.473652
2,North Carolina,Campbell,78.440134,65.566165
3,Iona,Harvard,66.134812,65.476205
4,Army,UTSA,71.652346,67.805974
5,Northwestern,Northeastern,74.356999,70.669792
6,Alabama,South Dakota St.,81.053669,72.837062
7,Ohio St.,Indiana St.,81.037897,75.057919
8,Illinois,Chicago St.,78.554663,55.619463
9,Oregon,Weber St.,78.389449,70.188665


In [110]:
# Add columns for averages
predict_games["Average Predicted Team Score"] = predict_games[
    ["Predicted Team Score LR", "Predicted Team Score GAM"]
].mean(axis=1)

predict_games["Average Predicted Opponent Score"] = predict_games[
    ["Predicted Opponent Score LR", "Predicted Opponent Score GAM"]
].mean(axis=1)

# Select columns to display
columns_to_display = [
    "Team",
    "Opponent",
    #"Team_Score",
    #"Opponent_Score",
    "Predicted Team Score LR",
    "Predicted Opponent Score LR",
    "Predicted Team Score GAM",
    "Predicted Opponent Score GAM",
    "Average Predicted Team Score",
    "Average Predicted Opponent Score",
]

display(predict_games[columns_to_display])

Unnamed: 0,Team,Opponent,Predicted Team Score LR,Predicted Opponent Score LR,Predicted Team Score GAM,Predicted Opponent Score GAM,Average Predicted Team Score,Average Predicted Opponent Score
0,Texas St.,UT Arlington,74.618049,73.674042,76.148729,73.056603,75.383389,73.365322
1,Washington,NJIT,72.862243,61.473652,76.321223,65.670266,74.591733,63.571959
2,North Carolina,Campbell,78.440134,65.566165,80.331642,66.557623,79.385888,66.061894
3,Iona,Harvard,66.134812,65.476205,68.214366,68.406531,67.174589,66.941368
4,Army,UTSA,71.652346,67.805974,72.955482,71.645544,72.303914,69.725759
5,Northwestern,Northeastern,74.356999,70.669792,75.265997,66.273845,74.811498,68.471819
6,Alabama,South Dakota St.,81.053669,72.837062,82.407994,69.354297,81.730832,71.095679
7,Ohio St.,Indiana St.,81.037897,75.057919,82.204232,72.96066,81.621064,74.009289
8,Illinois,Chicago St.,78.554663,55.619463,78.68321,56.454626,78.618936,56.037045
9,Oregon,Weber St.,78.389449,70.188665,80.558706,68.912061,79.474077,69.550363
