In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor
import numpy as np

team_data = pd.read_csv("../Stats_competition-/final_opponent_and_team_data.csv")

In [3]:
threshold = 6

In [4]:
team_data

Unnamed: 0,Date,Team,Opponent,Location,WAB,ADJO,ADJD,EFF,EFG%,TO%,...,Opp EFG%,Opp TO%,Opp OR%,Opp FTR,Opp 2P,Opp 3P,Opponent_score,Team_score,opp_adj_o,opp_adj_d
0,2024-11-04,Duke,Maine,H,0.1,125.3,95.2,130.6,64.3,17.7,...,39.5,19.0,19.4,31.6,18-43,3-14,62,96,108.9,110.8
1,2024-11-08,Duke,Army,H,0.1,124.7,92.3,141.0,61.3,11.3,...,39.1,18.3,23.3,14.1,13-35,8-29,58,100,107.2,119.0
2,2024-11-12,Duke,Kentucky,N,-0.2,106.8,86.4,95.7,42.3,9.3,...,47.6,14.6,23.3,38.1,15-38,10-25,77,72,119.6,88.1
3,2024-11-16,Duke,Wofford,H,-0.1,124.7,56.9,133.7,61.3,15.5,...,28.9,29.5,29.3,5.3,9-24,5-33,35,86,69.8,115.5
4,2024-11-22,Duke,Arizona,A,0.6,111.7,75.9,101.9,50.0,20.7,...,45.3,22.2,16.7,20.8,15-30,6-23,55,69,98.9,90.4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
297,2024-11-27,Michigan,Xavier,N,0.3,110.9,73.9,109.9,58.2,14.1,...,41.4,26.8,25.6,15.5,12-35,8-23,53,78,85.8,96.0
298,2024-12-03,Michigan,Wisconsin,A,1.0,102.7,84.4,99.3,44.1,17.8,...,39.1,14.8,34.8,29.7,16-37,6-27,64,67,103.9,89.6
299,2024-12-07,Michigan,Iowa,H,1.4,121.1,99.5,114.5,58.6,22.9,...,47.4,5.4,30.6,22.1,23-50,9-27,83,85,120.5,104.8
300,2024-12-10,Michigan,Arkansas,N,0.9,123.6,108.4,112.3,64.4,21.9,...,56.8,18.1,33.3,33.3,24-43,9-23,89,87,124.8,102.1


In [5]:
team_data["Location"] = np.where(
    team_data["Location"] == "N", 0, np.where(team_data["Location"] == "H", 1, -1)
)

In [6]:
columns_to_convert = [
    "Location",
    "ADJO",
    "ADJD",
    "EFG%",
    "TO%",
    "OR%",
    "FTR",
    "Opp EFG%",
    "Opp TO%",
    "Opp OR%",
    "Opp FTR",
]
for col in columns_to_convert:
    team_data[col] = pd.to_numeric(team_data[col], errors="coerce")

In [7]:
team_data_cleaned = team_data.dropna()

X = team_data_cleaned[columns_to_convert]
y_team = team_data_cleaned["Team_score"]
y_opp = team_data_cleaned["Opponent_score"]

X_train, X_test, y_team_train, y_team_test = train_test_split(
    X, y_team, test_size=0.2, random_state=42
)
y_opp_train, y_opp_test = train_test_split(y_opp, test_size=0.2, random_state=42)

model_team = LinearRegression()
model_team.fit(X_train, y_team_train)
team_pred = model_team.predict(X_test)

model_opp = LinearRegression()
model_opp.fit(X_train, y_opp_train)
opp_pred = model_opp.predict(X_test)

team_rmse = mean_squared_error(y_team_test, team_pred, squared=False)
opp_rmse = mean_squared_error(y_opp_test, opp_pred, squared=False)
print(f"Team Score RMSE: {team_rmse}, Opponent Score RMSE: {opp_rmse}")

Team Score RMSE: 5.694257875441654, Opponent Score RMSE: 5.397831922401339




In [8]:
team_accuracy = (abs(team_pred - y_team_test) <= threshold).mean() * 100
opp_accuracy = (abs(opp_pred - y_opp_test) <= threshold).mean() * 100
print(f"Team Score Accuracy: {team_accuracy:.2f}%")
print(f"Opponent Score Accuracy: {opp_accuracy:.2f}%")

Team Score Accuracy: 68.85%
Opponent Score Accuracy: 73.77%


In [9]:
rf_model_team = RandomForestRegressor(random_state=42)
rf_model_team.fit(X_train, y_team_train)
rf_team_pred = rf_model_team.predict(X_test)

rf_model_opp = RandomForestRegressor(random_state=42)
rf_model_opp.fit(X_train, y_opp_train)
rf_opp_pred = rf_model_opp.predict(X_test)

team_rf_rmse = mean_squared_error(y_team_test, rf_team_pred, squared=False)
opp_rf_rmse = mean_squared_error(y_opp_test, rf_opp_pred, squared=False)
print(f"Team Score RMSE: {team_rf_rmse}, Opponent Score RMSE: {opp_rf_rmse}")

Team Score RMSE: 6.759762096890494, Opponent Score RMSE: 5.717229353052915




In [10]:
team_accuracy = (abs(rf_team_pred - y_team_test) <= threshold).mean() * 100
opp_accuracy = (abs(rf_opp_pred - y_opp_test) <= threshold).mean() * 100
print(f"Team Score Accuracy: {team_accuracy:.2f}%")
print(f"Opponent Score Accuracy: {opp_accuracy:.2f}%")

Team Score Accuracy: 55.74%
Opponent Score Accuracy: 70.49%


In [11]:
import xgboost as xgb

X_train, X_test, y_team_train, y_team_test = train_test_split(
    X, y_team, test_size=0.2, random_state=42
)
X_train_opp, X_test_opp, y_opp_train, y_opp_test = train_test_split(
    X, y_opp, test_size=0.2, random_state=42
)

dtrain_team = xgb.DMatrix(X_train, label=y_team_train)
dtest_team = xgb.DMatrix(X_test, label=y_team_test)

dtrain_opp = xgb.DMatrix(X_train_opp, label=y_opp_train)
dtest_opp = xgb.DMatrix(X_test_opp, label=y_opp_test)

params = {
    "objective": "reg:squarederror",
    "eval_metric": "rmse",
    "learning_rate": 0.1,
    "max_depth": 6,
    "subsample": 0.8,
    "colsample_bytree": 0.8,
}

team_model = xgb.train(
    params,
    dtrain_team,
    num_boost_round=100,
    evals=[(dtest_team, "test")],
    early_stopping_rounds=10,
)

opp_model = xgb.train(
    params,
    dtrain_opp,
    num_boost_round=100,
    evals=[(dtest_opp, "test")],
    early_stopping_rounds=10,
)

xg_team_pred = team_model.predict(dtest_team)
xg_opp_pred = opp_model.predict(dtest_opp)

team_rmse = mean_squared_error(y_team_test, xg_team_pred, squared=False)
opp_rmse = mean_squared_error(y_opp_test, xg_opp_pred, squared=False)

print(f"Team Score RMSE: {team_rmse}")
print(f"Opponent Score RMSE: {opp_rmse}")

[0]	test-rmse:12.18444
[1]	test-rmse:11.58740
[2]	test-rmse:10.96294
[3]	test-rmse:10.70617
[4]	test-rmse:10.26874
[5]	test-rmse:9.94852
[6]	test-rmse:9.53174
[7]	test-rmse:9.28654
[8]	test-rmse:9.09317
[9]	test-rmse:8.83542
[10]	test-rmse:8.54354
[11]	test-rmse:8.26151
[12]	test-rmse:8.04741
[13]	test-rmse:7.83750
[14]	test-rmse:7.70025
[15]	test-rmse:7.62646
[16]	test-rmse:7.47966
[17]	test-rmse:7.38738
[18]	test-rmse:7.27690
[19]	test-rmse:7.19261
[20]	test-rmse:7.20109
[21]	test-rmse:7.13479
[22]	test-rmse:7.06218
[23]	test-rmse:7.01368
[24]	test-rmse:6.96618
[25]	test-rmse:6.93723
[26]	test-rmse:6.91986
[27]	test-rmse:6.90199
[28]	test-rmse:6.90731
[29]	test-rmse:6.89476
[30]	test-rmse:6.87245
[31]	test-rmse:6.81690
[32]	test-rmse:6.80684
[33]	test-rmse:6.78039
[34]	test-rmse:6.77995
[35]	test-rmse:6.79721
[36]	test-rmse:6.80123
[37]	test-rmse:6.77188
[38]	test-rmse:6.76807
[39]	test-rmse:6.75825
[40]	test-rmse:6.74601
[41]	test-rmse:6.72463
[42]	test-rmse:6.71903
[43]	test-rmse:6



In [12]:
team_accuracy = (abs(xg_team_pred - y_team_test) <= threshold).mean() * 100
opp_accuracy = (abs(xg_opp_pred - y_opp_test) <= threshold).mean() * 100
print(f"Team Score Accuracy: {team_accuracy:.2f}%")
print(f"Opponent Score Accuracy: {opp_accuracy:.2f}%")

Team Score Accuracy: 60.66%
Opponent Score Accuracy: 67.21%


In [13]:
predict_games = pd.read_csv("../Stats_competition-/basketball_games_data.csv")

In [14]:
predict_games["Location"] = np.where(
    predict_games["Location"] == "Neutral",
    0,
    np.where(predict_games["Location"] == "Home", 1, -1),
)

In [15]:
predict_games

Unnamed: 0,date,Location,Team,Opponent,ADJO,ADJD,EFG%,TO%,OR%,FTR,Opp EFG%,Opp TO%,Opp OR%,Opp FTR,opp_adj_o,opp_adj_d
0,2024-12-21,0,Auburn,Purdue,134.7,91.3,62.1,13.3,37.5,13.6,45.8,17.8,34.1,39.0,115.0,102.8
1,2024-12-21,1,Gonzaga,Bucknell,108.6,95.5,56.3,15.6,26.3,12.7,49.2,24.7,16.2,16.9,96.6,93.4
2,2024-12-21,0,San Diego St.,California,102.3,73.2,44.0,19.5,47.7,22.4,30.0,27.0,36.4,38.2,85.7,101.6
3,2024-12-21,1,Michigan St.,Florida Atlantic,112.2,89.6,50.7,13.5,35.0,29.0,42.6,12.1,22.7,20.6,105.4,103.5
4,2024-12-21,0,Kentucky,Ohio St.,105.0,114.2,33.3,14.8,31.7,56.1,60.4,10.4,26.9,50.9,133.1,85.4
5,2024-12-21,0,North Carolina,UCLA,118.9,99.2,55.3,25.0,30.0,74.5,57.5,20.8,21.9,41.5,112.3,93.6
6,2024-12-21,0,Stanford,Oregon,99.6,100.6,43.5,13.2,23.8,21.0,49.1,13.2,26.5,52.7,113.9,85.2
7,2024-12-21,1,Mississippi,Queens,107.8,94.5,53.1,11.3,22.9,21.5,55.9,29.6,21.9,21.6,96.6,99.7
8,2024-12-21,1,Houston,Texas A&M Corpus Chris,137.6,90.5,62.3,13.4,43.3,47.2,36.6,21.8,42.5,28.6,109.8,121.9
9,2024-12-21,1,Clemson,Wake Forest,112.1,91.7,47.5,17.1,27.0,26.2,53.0,27.1,18.5,24.0,97.1,95.9


In [16]:
X = predict_games[columns_to_convert]

In [17]:
X

Unnamed: 0,Location,ADJO,ADJD,EFG%,TO%,OR%,FTR,Opp EFG%,Opp TO%,Opp OR%,Opp FTR
0,0,134.7,91.3,62.1,13.3,37.5,13.6,45.8,17.8,34.1,39.0
1,1,108.6,95.5,56.3,15.6,26.3,12.7,49.2,24.7,16.2,16.9
2,0,102.3,73.2,44.0,19.5,47.7,22.4,30.0,27.0,36.4,38.2
3,1,112.2,89.6,50.7,13.5,35.0,29.0,42.6,12.1,22.7,20.6
4,0,105.0,114.2,33.3,14.8,31.7,56.1,60.4,10.4,26.9,50.9
5,0,118.9,99.2,55.3,25.0,30.0,74.5,57.5,20.8,21.9,41.5
6,0,99.6,100.6,43.5,13.2,23.8,21.0,49.1,13.2,26.5,52.7
7,1,107.8,94.5,53.1,11.3,22.9,21.5,55.9,29.6,21.9,21.6
8,1,137.6,90.5,62.3,13.4,43.3,47.2,36.6,21.8,42.5,28.6
9,1,112.1,91.7,47.5,17.1,27.0,26.2,53.0,27.1,18.5,24.0


In [18]:
from pygam import LinearGAM, s, f
from sklearn.metrics import mean_squared_error

# Fit PyGAM for Team Score
gam_team = LinearGAM(
    s(0) + s(1) + s(2) + s(3) + s(4) + s(5) + s(6) + s(7) + s(8) + s(9) + s(10)
).fit(X_train, y_team_train)
team_gam_pred = gam_team.predict(X_test)

# Fit PyGAM for Opponent Score
gam_opp = LinearGAM(
    s(0) + s(1) + s(2) + s(3) + s(4) + s(5) + s(6) + s(7) + s(8) + s(9) + s(10)
).fit(X_train, y_opp_train)
opp_gam_pred = gam_opp.predict(X_test)

# Calculate RMSE for PyGAM
team_gam_rmse = mean_squared_error(y_team_test, team_gam_pred, squared=False)
opp_gam_rmse = mean_squared_error(y_opp_test, opp_gam_pred, squared=False)
print(f"Team Score RMSE (PyGAM): {team_gam_rmse}")
print(f"Opponent Score RMSE (PyGAM): {opp_gam_rmse}")

# Calculate accuracy for PyGAM
team_gam_accuracy = (abs(team_gam_pred - y_team_test) <= threshold).mean() * 100
opp_gam_accuracy = (abs(opp_gam_pred - y_opp_test) <= threshold).mean() * 100
print(f"Team Score Accuracy (PyGAM): {team_gam_accuracy:.2f}%")
print(f"Opponent Score Accuracy (PyGAM): {opp_gam_accuracy:.2f}%")

# Predict scores using PyGAM
team_gam_pred_new = gam_team.predict(X)
opp_gam_pred_new = gam_opp.predict(X)

# Add PyGAM predictions to the table
predictions_gam = pd.DataFrame(
    {
        "Predicted Team Score GAM": team_gam_pred_new,
        "Predicted Opponent Score GAM": opp_gam_pred_new,
    }
)

predict_games = pd.concat([predict_games, predictions_gam], axis=1)

Team Score RMSE (PyGAM): 6.986779995501826
Opponent Score RMSE (PyGAM): 6.7824075879007
Team Score Accuracy (PyGAM): 65.57%
Opponent Score Accuracy (PyGAM): 68.85%




In [19]:
team_pred_new = model_team.predict(X)
opp_pred_new = model_opp.predict(X)

predictions = pd.DataFrame(
    {
        "Predicted Team Score LR": team_pred_new,
        "Predicted Opponent Score LR": opp_pred_new,
    }
)

In [20]:
rf_team_pred_new = rf_model_team.predict(X)
rf_opp_pred_new = rf_model_opp.predict(X)

predictions_rf = pd.DataFrame(
    {
        "Predicted Team Score RF": rf_team_pred_new,
        "Predicted Opponent Score RF": rf_team_pred_new,
    }
)

In [21]:
predict_games = pd.concat([predict_games, predictions], axis=1)
predict_games = pd.concat([predict_games, predictions_rf], axis=1)

In [22]:
predict_games[
    [
        "Team",
        "Opponent",
        "Predicted Team Score LR",
        "Predicted Opponent Score LR",
        "Predicted Team Score RF",
        "Predicted Opponent Score RF",
    ]
]

Unnamed: 0,Team,Opponent,Predicted Team Score LR,Predicted Opponent Score LR,Predicted Team Score RF,Predicted Opponent Score RF
0,Auburn,Purdue,88.958644,69.196933,92.88,92.88
1,Gonzaga,Bucknell,75.489987,57.822061,78.12,78.12
2,San Diego St.,California,68.264099,45.258941,70.09,70.09
3,Michigan St.,Florida Atlantic,78.373663,63.571918,75.97,75.97
4,Kentucky,Ohio St.,65.784275,92.545323,70.7,70.7
5,North Carolina,UCLA,81.393071,79.21374,81.68,81.68
6,Stanford,Oregon,67.011688,76.863495,63.34,63.34
7,Mississippi,Queens,76.918037,64.268468,77.94,77.94
8,Houston,Texas A&M Corpus Chris,94.460109,58.669339,93.23,93.23
9,Clemson,Wake Forest,69.165477,62.317847,73.6,73.6


In [23]:
# Add columns for averages
predict_games["Average Predicted Team Score"] = predict_games[
    ["Predicted Team Score LR", "Predicted Team Score GAM"]
].mean(axis=1)

predict_games["Average Predicted Opponent Score"] = predict_games[
    ["Predicted Opponent Score RF", "Predicted Opponent Score GAM"]
].mean(axis=1)

# Select columns to display
columns_to_display = [
    "Team",
    "Opponent",
    "Predicted Team Score LR",
    "Predicted Opponent Score LR",
    "Predicted Team Score GAM",
    "Predicted Opponent Score GAM",
    "Average Predicted Team Score",
    "Average Predicted Opponent Score",
]

# Display the updated table
print(predict_games[columns_to_display].to_string(index=False))

          Team               Opponent  Predicted Team Score LR  Predicted Opponent Score LR  Predicted Team Score GAM  Predicted Opponent Score GAM  Average Predicted Team Score  Average Predicted Opponent Score
        Auburn                 Purdue                88.958644                    69.196933                 86.013954                     65.526426                     87.486299                         79.203213
       Gonzaga               Bucknell                75.489987                    57.822061                 73.149932                     57.770330                     74.319959                         67.945165
 San Diego St.             California                68.264099                    45.258941                 75.394710                     56.043385                     71.829404                         63.066693
  Michigan St.       Florida Atlantic                78.373663                    63.571918                 79.448484                     63.573657     