In [21]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor
import numpy as np

pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", None)

In [22]:
threshold = 6
team_data = pd.read_csv("./train_data.csv")
display(team_data.head())

Unnamed: 0,game_id,date,Team,Opponent,Team_Score,Opponent_Score,Location,ADJO,ADJD,EFF,EFG_pct,TO_Pct,OR_Pct,FTR_Pct,ThreePointer_Pct,opp_ADJO,opp_ADJD,Opp_EFF,Opp_EFG_Pct,Opp_TO_Pct,Opp_OR_Pct,Opp_FTR_Pct,Opp_ThreePointer_Pct
0,Abilene ChristianBaylor12-9,2024-12-09,Abilene Christian,Baylor,57,88,A,94.8,112.8,86.7,36.4,15.2,27.0,41.8,14.29,129.2,94.3,133.8,66.1,13.7,35.7,27.1,44.44
1,Abilene ChristianKennesaw St.11-20,2024-11-20,Abilene Christian,Kennesaw St.,78,84,A,108.5,115.6,107.1,50.0,9.6,20.0,50.8,33.33,111.8,114.9,115.3,62.2,26.1,31.8,100.0,43.75
2,Abilene ChristianMontana St.11-26,2024-11-26,Abilene Christian,Montana St.,59,85,A,90.5,129.0,91.6,44.0,12.4,20.6,20.7,16.67,130.3,96.3,131.9,75.0,20.2,25.0,26.0,65.38
3,Abilene ChristianNebraska Omaha11-30,2024-11-30,Abilene Christian,Nebraska Omaha,71,55,A,105.5,84.2,107.2,51.9,15.1,24.1,35.2,30.77,80.6,114.6,83.1,42.0,24.2,17.2,54.5,31.25
4,Abilene ChristianNew Mexico St.12-4,2024-12-04,Abilene Christian,New Mexico St.,78,70,A,117.8,107.1,117.8,58.9,16.6,32.0,77.8,43.75,105.0,123.6,105.7,51.0,18.1,27.6,58.0,33.33


In [23]:
team_data["Location"] = np.where(
    team_data["Location"] == "N", 0, np.where(team_data["Location"] == "H", 1, -1)
)

In [24]:
columns_to_convert = [
    "Location",
    "ADJO",
    "ADJD",
    "EFG_pct",
    "TO_Pct",
    "OR_Pct",
    "FTR_Pct",
    
    "opp_ADJO",
    "opp_ADJD",
    "Opp_EFG_Pct",
    "Opp_TO_Pct",
    "Opp_OR_Pct",
    "Opp_FTR_Pct",
    
]
for col in columns_to_convert:
    team_data[col] = pd.to_numeric(team_data[col], errors="coerce")

In [25]:
team_data_cleaned = team_data.dropna()

X = team_data_cleaned[columns_to_convert]
y_team = team_data_cleaned["Team_Score"]
y_opp = team_data_cleaned["Opponent_Score"]

X_train, X_test, y_team_train, y_team_test = train_test_split(
    X, y_team, test_size=0.2, random_state=42
)
y_opp_train, y_opp_test = train_test_split(y_opp, test_size=0.2, random_state=42)

model_team = LinearRegression()
model_team.fit(X_train, y_team_train)
team_pred = model_team.predict(X_test)

model_opp = LinearRegression()
model_opp.fit(X_train, y_opp_train)
opp_pred = model_opp.predict(X_test)

team_rmse = mean_squared_error(y_team_test, team_pred, squared=False)
opp_rmse = mean_squared_error(y_opp_test, opp_pred, squared=False)
print(f"Team Score RMSE: {team_rmse}, Opponent Score RMSE: {opp_rmse}")

Team Score RMSE: 6.231575506991563, Opponent Score RMSE: 6.6764070173370635




In [26]:
team_accuracy = (abs(team_pred - y_team_test) <= threshold).mean() * 100
opp_accuracy = (abs(opp_pred - y_opp_test) <= threshold).mean() * 100
print(f"Team Score Accuracy: {team_accuracy:.2f}%")
print(f"Opponent Score Accuracy: {opp_accuracy:.2f}%")

Team Score Accuracy: 72.66%
Opponent Score Accuracy: 67.73%


In [27]:
predict_games = pd.read_csv("./test_file_28th.csv")

In [28]:
predict_games

Unnamed: 0,date,Team,Opponent,Location,ADJO,ADJD,EFG_pct,TO_Pct,OR_Pct,FTR_Pct,ThreePointer_Pct,Opp_ADJO,Opp_ADJD,Opp_EFG_pct,Opp_TO_Pct,Opp_OR_Pct,Opp_FTR_Pct,Opp_ThreePointer_Pct
0,2024-12-28,Richmond,Florida Gulf Coast,H,101.758087,109.741049,48.3,15.0,20.7,40.6,29.2,105.710407,106.895812,51.6,17.2,25.6,18.1,33.1
1,2024-12-28,San Diego St.,Utah St.,H,110.402016,93.446262,51.3,16.0,30.4,26.3,36.1,116.010122,97.818219,53.5,17.3,39.5,38.1,31.6
2,2024-12-28,Georgia Tech,Alabama A&M,H,105.714147,101.88589,48.7,16.0,29.8,31.8,32.7,94.540931,118.331617,43.9,20.0,32.8,36.4,31.9
3,2024-12-28,San Jose St.,Boise St.,H,108.957965,108.64251,51.3,15.4,26.7,38.0,35.2,113.108156,97.76246,51.5,16.0,33.9,42.7,30.0
4,2024-12-28,Colorado St.,New Mexico,H,105.539411,100.325033,51.8,16.8,23.1,27.9,30.6,112.930069,100.27481,50.0,15.6,34.2,41.0,34.6
5,2024-12-28,UCLA,Gonzaga,N,109.930056,87.869252,54.3,16.4,34.2,34.9,35.8,124.1186,93.745664,55.7,13.5,32.9,32.9,34.8
6,2024-12-28,Wyoming,Nevada,H,104.934785,103.629449,52.9,21.1,34.5,36.9,34.8,115.399439,101.214327,57.4,15.9,29.7,38.8,41.6
7,2024-12-28,Texas A&M,Abilene Christian,H,112.02133,92.040856,47.8,18.4,44.5,39.8,31.6,100.344692,105.008776,48.3,19.1,28.6,43.6,27.2
8,2024-12-28,Louisville,Eastern Kentucky,H,114.541145,98.320567,51.2,17.1,33.9,37.1,29.8,110.376078,107.352351,48.5,15.4,33.5,22.1,32.0
9,2024-12-28,Maryland,Maryland Eastern Shore,H,117.891812,89.667452,57.3,13.4,33.9,29.2,37.1,99.924968,120.725914,43.8,19.2,22.0,35.3,30.0


In [29]:
predict_games["Location"] = np.where(
    predict_games["Location"] == "N",
    0,
    np.where(predict_games["Location"] == "H", 1, -1),
)

In [30]:
predict_games

Unnamed: 0,date,Team,Opponent,Location,ADJO,ADJD,EFG_pct,TO_Pct,OR_Pct,FTR_Pct,ThreePointer_Pct,Opp_ADJO,Opp_ADJD,Opp_EFG_pct,Opp_TO_Pct,Opp_OR_Pct,Opp_FTR_Pct,Opp_ThreePointer_Pct
0,2024-12-28,Richmond,Florida Gulf Coast,1,101.758087,109.741049,48.3,15.0,20.7,40.6,29.2,105.710407,106.895812,51.6,17.2,25.6,18.1,33.1
1,2024-12-28,San Diego St.,Utah St.,1,110.402016,93.446262,51.3,16.0,30.4,26.3,36.1,116.010122,97.818219,53.5,17.3,39.5,38.1,31.6
2,2024-12-28,Georgia Tech,Alabama A&M,1,105.714147,101.88589,48.7,16.0,29.8,31.8,32.7,94.540931,118.331617,43.9,20.0,32.8,36.4,31.9
3,2024-12-28,San Jose St.,Boise St.,1,108.957965,108.64251,51.3,15.4,26.7,38.0,35.2,113.108156,97.76246,51.5,16.0,33.9,42.7,30.0
4,2024-12-28,Colorado St.,New Mexico,1,105.539411,100.325033,51.8,16.8,23.1,27.9,30.6,112.930069,100.27481,50.0,15.6,34.2,41.0,34.6
5,2024-12-28,UCLA,Gonzaga,0,109.930056,87.869252,54.3,16.4,34.2,34.9,35.8,124.1186,93.745664,55.7,13.5,32.9,32.9,34.8
6,2024-12-28,Wyoming,Nevada,1,104.934785,103.629449,52.9,21.1,34.5,36.9,34.8,115.399439,101.214327,57.4,15.9,29.7,38.8,41.6
7,2024-12-28,Texas A&M,Abilene Christian,1,112.02133,92.040856,47.8,18.4,44.5,39.8,31.6,100.344692,105.008776,48.3,19.1,28.6,43.6,27.2
8,2024-12-28,Louisville,Eastern Kentucky,1,114.541145,98.320567,51.2,17.1,33.9,37.1,29.8,110.376078,107.352351,48.5,15.4,33.5,22.1,32.0
9,2024-12-28,Maryland,Maryland Eastern Shore,1,117.891812,89.667452,57.3,13.4,33.9,29.2,37.1,99.924968,120.725914,43.8,19.2,22.0,35.3,30.0


In [31]:
predict_games.rename(columns={
    'Opp_ADJO': 'opp_ADJO',
    'Opp_ADJD': 'opp_ADJD',
    'Opp_EFG_pct': 'Opp_EFG_Pct'
}, inplace=True)

In [32]:
X = predict_games[columns_to_convert]
display(X)

Unnamed: 0,Location,ADJO,ADJD,EFG_pct,TO_Pct,OR_Pct,FTR_Pct,opp_ADJO,opp_ADJD,Opp_EFG_Pct,Opp_TO_Pct,Opp_OR_Pct,Opp_FTR_Pct
0,1,101.758087,109.741049,48.3,15.0,20.7,40.6,105.710407,106.895812,51.6,17.2,25.6,18.1
1,1,110.402016,93.446262,51.3,16.0,30.4,26.3,116.010122,97.818219,53.5,17.3,39.5,38.1
2,1,105.714147,101.88589,48.7,16.0,29.8,31.8,94.540931,118.331617,43.9,20.0,32.8,36.4
3,1,108.957965,108.64251,51.3,15.4,26.7,38.0,113.108156,97.76246,51.5,16.0,33.9,42.7
4,1,105.539411,100.325033,51.8,16.8,23.1,27.9,112.930069,100.27481,50.0,15.6,34.2,41.0
5,0,109.930056,87.869252,54.3,16.4,34.2,34.9,124.1186,93.745664,55.7,13.5,32.9,32.9
6,1,104.934785,103.629449,52.9,21.1,34.5,36.9,115.399439,101.214327,57.4,15.9,29.7,38.8
7,1,112.02133,92.040856,47.8,18.4,44.5,39.8,100.344692,105.008776,48.3,19.1,28.6,43.6
8,1,114.541145,98.320567,51.2,17.1,33.9,37.1,110.376078,107.352351,48.5,15.4,33.5,22.1
9,1,117.891812,89.667452,57.3,13.4,33.9,29.2,99.924968,120.725914,43.8,19.2,22.0,35.3


In [33]:
from pygam import LinearGAM, s, f
from sklearn.metrics import mean_squared_error

# Fit PyGAM for Team Score
gam_team = LinearGAM(
    s(0) + s(1) + s(2) + s(3) + s(4) + s(5) + s(6) + s(7) + s(8) + s(9) + s(10)
).fit(X_train, y_team_train)
team_gam_pred = gam_team.predict(X_test)

# Fit PyGAM for Opponent Score
gam_opp = LinearGAM(
    s(0) + s(1) + s(2) + s(3) + s(4) + s(5) + s(6) + s(7) + s(8) + s(9) + s(10)
).fit(X_train, y_opp_train)
opp_gam_pred = gam_opp.predict(X_test)

# Calculate RMSE for PyGAM
team_gam_rmse = mean_squared_error(y_team_test, team_gam_pred, squared=False)
opp_gam_rmse = mean_squared_error(y_opp_test, opp_gam_pred, squared=False)
print(f"Team Score RMSE (PyGAM): {team_gam_rmse}")
print(f"Opponent Score RMSE (PyGAM): {opp_gam_rmse}")

# Calculate accuracy for PyGAM
team_gam_accuracy = (abs(team_gam_pred - y_team_test) <= threshold).mean() * 100
opp_gam_accuracy = (abs(opp_gam_pred - y_opp_test) <= threshold).mean() * 100
print(f"Team Score Accuracy (PyGAM): {team_gam_accuracy:.2f}%")
print(f"Opponent Score Accuracy (PyGAM): {opp_gam_accuracy:.2f}%")

# Predict scores using PyGAM
team_gam_pred_new = gam_team.predict(X)
opp_gam_pred_new = gam_opp.predict(X)

# Add PyGAM predictions to the table
predictions_gam = pd.DataFrame(
    {
        "Predicted Team Score GAM": team_gam_pred_new,
        "Predicted Opponent Score GAM": opp_gam_pred_new,
    }
)

predict_games = pd.concat([predict_games, predictions_gam], axis=1)

Team Score RMSE (PyGAM): 6.295343605206589
Opponent Score RMSE (PyGAM): 7.050226750362496
Team Score Accuracy (PyGAM): 72.66%
Opponent Score Accuracy (PyGAM): 63.55%




In [34]:
team_pred_new = model_team.predict(X)
opp_pred_new = model_opp.predict(X)

predictions = pd.DataFrame(
    {
        "Predicted Team Score LR": team_pred_new,
        "Predicted Opponent Score LR": opp_pred_new,
    }
)

In [35]:
scores = team_data_cleaned[["Team", "Opponent", "Team_Score", "Opponent_Score"]]

In [36]:
predict_games = pd.concat([predict_games, predictions], axis=1)

In [37]:
predict_games[
    [
        "Team",
        "Opponent",
        "Predicted Team Score LR",
        "Predicted Opponent Score LR",
    ]
]

Unnamed: 0,Team,Opponent,Predicted Team Score LR,Predicted Opponent Score LR
0,Richmond,Florida Gulf Coast,69.353458,69.820268
1,San Diego St.,Utah St.,73.373351,79.003429
2,Georgia Tech,Alabama A&M,71.861372,62.553392
3,San Jose St.,Boise St.,73.755773,77.264403
4,Colorado St.,New Mexico,70.93222,76.205586
5,UCLA,Gonzaga,77.702174,83.569214
6,Wyoming,Nevada,74.156027,81.117827
7,Texas A&M,Abilene Christian,76.090714,67.092243
8,Louisville,Eastern Kentucky,75.131021,71.071076
9,Maryland,Maryland Eastern Shore,83.463836,59.37908


In [38]:
# Add columns for averages
predict_games["Average Predicted Team Score"] = predict_games[
    ["Predicted Team Score LR", "Predicted Team Score GAM"]
].mean(axis=1)

predict_games["Average Predicted Opponent Score"] = predict_games[
    ["Predicted Opponent Score LR", "Predicted Opponent Score GAM"]
].mean(axis=1)

# Select columns to display
columns_to_display = [
    "Team",
    "Opponent",
    #"Team_Score",
    #"Opponent_Score",
    "Predicted Team Score LR",
    "Predicted Opponent Score LR",
    "Predicted Team Score GAM",
    "Predicted Opponent Score GAM",
    "Average Predicted Team Score",
    "Average Predicted Opponent Score",
]

display(predict_games[columns_to_display])

Unnamed: 0,Team,Opponent,Predicted Team Score LR,Predicted Opponent Score LR,Predicted Team Score GAM,Predicted Opponent Score GAM,Average Predicted Team Score,Average Predicted Opponent Score
0,Richmond,Florida Gulf Coast,69.353458,69.820268,71.96846,75.09185,70.660959,72.456059
1,San Diego St.,Utah St.,73.373351,79.003429,75.622432,73.297351,74.497892,76.15039
2,Georgia Tech,Alabama A&M,71.861372,62.553392,73.946974,64.736861,72.904173,63.645127
3,San Jose St.,Boise St.,73.755773,77.264403,76.703117,77.395369,75.229445,77.329886
4,Colorado St.,New Mexico,70.93222,76.205586,71.293445,74.099146,71.112832,75.152366
5,UCLA,Gonzaga,77.702174,83.569214,80.17917,76.120794,78.940672,79.845004
6,Wyoming,Nevada,74.156027,81.117827,74.946571,77.672925,74.551299,79.395376
7,Texas A&M,Abilene Christian,76.090714,67.092243,76.347987,61.804466,76.21935,64.448354
8,Louisville,Eastern Kentucky,75.131021,71.071076,77.186915,71.16704,76.158968,71.119058
9,Maryland,Maryland Eastern Shore,83.463836,59.37908,83.878335,59.215811,83.671086,59.297446


In [53]:
actual_scores = pd.read_csv("./test_data.csv")

result = actual_scores[
    ((actual_scores['Team'].isin(predict_games['Team']) | actual_scores['Team'].isin(predict_games['Opponent'])) &
     (actual_scores['Opponent'].isin(predict_games['Team']) | actual_scores['Opponent'].isin(predict_games['Opponent'])))
]
result = result[['Team', 'Opponent', 'Team_Score', 'Opponent_Score']]

join1 = pd.merge(result, predict_games, left_on=['Team', 'Opponent'], right_on=['Team', 'Opponent'], how='outer', suffixes=('_df1', '_df2'))
join2 = pd.merge(result, predict_games, left_on=['Team', 'Opponent'], right_on=['Opponent', 'Team'], how='outer', suffixes=('_df1', '_df2'))
#final_result = pd.concat([join1, join2])
#final_result = final_result.drop_duplicates().reset_index(drop=True)
display(join1)

display(join2)
display(result)

Unnamed: 0,Team,Opponent,Team_Score,Opponent_Score,date,Location,ADJO,ADJD,EFG_pct,TO_Pct,OR_Pct,FTR_Pct,ThreePointer_Pct,opp_ADJO,opp_ADJD,Opp_EFG_Pct,Opp_TO_Pct,Opp_OR_Pct,Opp_FTR_Pct,Opp_ThreePointer_Pct,Predicted Team Score GAM,Predicted Opponent Score GAM,Predicted Team Score LR,Predicted Opponent Score LR,Average Predicted Team Score,Average Predicted Opponent Score
0,Abilene Christian,Texas A&M,54.0,92.0,,,,,,,,,,,,,,,,,,,,,,
1,Alabama A&M,Georgia Tech,49.0,92.0,,,,,,,,,,,,,,,,,,,,,,
2,Boise St.,San Jose St.,73.0,71.0,,,,,,,,,,,,,,,,,,,,,,
3,Boston College,Fairleigh Dickinson,,,2024-12-28,1.0,104.983408,106.031757,47.5,18.0,34.5,34.3,33.9,103.233639,114.72275,49.7,17.3,24.9,26.8,34.9,73.019461,71.169732,71.190804,67.022395,72.105132,69.096064
4,Bucknell,Syracuse,63.0,75.0,,,,,,,,,,,,,,,,,,,,,,
5,Colorado St.,New Mexico,,,2024-12-28,1.0,105.539411,100.325033,51.8,16.8,23.1,27.9,30.6,112.930069,100.27481,50.0,15.6,34.2,41.0,34.6,71.293445,74.099146,70.93222,76.205586,71.112832,75.152366
6,Eastern Kentucky,Louisville,76.0,78.0,,,,,,,,,,,,,,,,,,,,,,
7,Fairleigh Dickinson,Boston College,70.0,78.0,,,,,,,,,,,,,,,,,,,,,,
8,Florida Gulf Coast,Richmond,75.0,57.0,,,,,,,,,,,,,,,,,,,,,,
9,Georgia St.,Mercer,,,2024-12-28,1.0,99.210865,108.261554,46.2,19.7,32.4,37.6,31.0,103.091061,112.357757,51.6,18.1,28.7,28.2,31.5,70.877093,74.099157,68.190579,70.302649,69.533836,72.200903


Unnamed: 0,Team_df1,Opponent_df1,Team_Score,Opponent_Score,date,Team_df2,Opponent_df2,Location,ADJO,ADJD,EFG_pct,TO_Pct,OR_Pct,FTR_Pct,ThreePointer_Pct,opp_ADJO,opp_ADJD,Opp_EFG_Pct,Opp_TO_Pct,Opp_OR_Pct,Opp_FTR_Pct,Opp_ThreePointer_Pct,Predicted Team Score GAM,Predicted Opponent Score GAM,Predicted Team Score LR,Predicted Opponent Score LR,Average Predicted Team Score,Average Predicted Opponent Score
0,Abilene Christian,Texas A&M,54.0,92.0,2024-12-28,Texas A&M,Abilene Christian,1,112.02133,92.040856,47.8,18.4,44.5,39.8,31.6,100.344692,105.008776,48.3,19.1,28.6,43.6,27.2,76.347987,61.804466,76.090714,67.092243,76.21935,64.448354
1,Alabama A&M,Georgia Tech,49.0,92.0,2024-12-28,Georgia Tech,Alabama A&M,1,105.714147,101.88589,48.7,16.0,29.8,31.8,32.7,94.540931,118.331617,43.9,20.0,32.8,36.4,31.9,73.946974,64.736861,71.861372,62.553392,72.904173,63.645127
2,Boise St.,San Jose St.,73.0,71.0,2024-12-28,San Jose St.,Boise St.,1,108.957965,108.64251,51.3,15.4,26.7,38.0,35.2,113.108156,97.76246,51.5,16.0,33.9,42.7,30.0,76.703117,77.395369,73.755773,77.264403,75.229445,77.329886
3,Bucknell,Syracuse,63.0,75.0,2024-12-28,Syracuse,Bucknell,1,109.385629,108.150278,50.7,15.5,28.4,35.0,28.9,93.922636,104.917114,49.4,20.5,25.1,39.0,30.2,75.776224,67.995975,74.101592,65.207726,74.938908,66.60185
4,Eastern Kentucky,Louisville,76.0,78.0,2024-12-28,Louisville,Eastern Kentucky,1,114.541145,98.320567,51.2,17.1,33.9,37.1,29.8,110.376078,107.352351,48.5,15.4,33.5,22.1,32.0,77.186915,71.16704,75.131021,71.071076,76.158968,71.119058
5,Fairleigh Dickinson,Boston College,70.0,78.0,2024-12-28,Boston College,Fairleigh Dickinson,1,104.983408,106.031757,47.5,18.0,34.5,34.3,33.9,103.233639,114.72275,49.7,17.3,24.9,26.8,34.9,73.019461,71.169732,71.190804,67.022395,72.105132,69.096064
6,Florida Gulf Coast,Richmond,75.0,57.0,2024-12-28,Richmond,Florida Gulf Coast,1,101.758087,109.741049,48.3,15.0,20.7,40.6,29.2,105.710407,106.895812,51.6,17.2,25.6,18.1,33.1,71.96846,75.09185,69.353458,69.820268,70.660959,72.456059
7,Gonzaga,UCLA,62.0,65.0,2024-12-28,UCLA,Gonzaga,0,109.930056,87.869252,54.3,16.4,34.2,34.9,35.8,124.1186,93.745664,55.7,13.5,32.9,32.9,34.8,80.17917,76.120794,77.702174,83.569214,78.940672,79.845004
8,Maryland Eastern Shore,Maryland,66.0,81.0,2024-12-28,Maryland,Maryland Eastern Shore,1,117.891812,89.667452,57.3,13.4,33.9,29.2,37.1,99.924968,120.725914,43.8,19.2,22.0,35.3,30.0,83.878335,59.215811,83.463836,59.37908,83.671086,59.297446
9,Mercer,Georgia St.,71.0,68.0,2024-12-28,Georgia St.,Mercer,1,99.210865,108.261554,46.2,19.7,32.4,37.6,31.0,103.091061,112.357757,51.6,18.1,28.7,28.2,31.5,70.877093,74.099157,68.190579,70.302649,69.533836,72.200903


Unnamed: 0,Team,Opponent,Team_Score,Opponent_Score
0,Abilene Christian,Texas A&M,54,92
1,Alabama A&M,Georgia Tech,49,92
2,Boise St.,San Jose St.,73,71
3,Bucknell,Syracuse,63,75
7,Eastern Kentucky,Louisville,76,78
10,Fairleigh Dickinson,Boston College,70,78
11,Florida Gulf Coast,Richmond,75,57
13,Gonzaga,UCLA,62,65
20,Maryland Eastern Shore,Maryland,66,81
21,Mercer,Georgia St.,71,68
