In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor
import numpy as np

team_data = pd.read_csv("../Stats_competition-/team_data_collected_df.csv")

In [2]:
threshold = 6

In [3]:
team_data

Unnamed: 0,Location,Team,Opponent,WAB,ADJO,ADJD,EFF,EFG%,TO%,OR%,...,3P,Opp EFF,Opp EFG%,Opp TO%,Opp OR%,Opp FTR,Opp 2P,Opp 3P,Opponent_score,Team_score
0,H,DUKE,Maine,0.1,125.3,95.2,130.6,64.3,17.7,35.5,...,11-29,84.3,39.5,19.0,19.4,31.6,18-43,3-14,62,96
1,H,DUKE,Army,0.1,124.7,92.3,141.0,61.3,11.3,43.6,...,17-38,81.8,39.1,18.3,23.3,14.1,13-35,8-29,58,100
2,N,DUKE,Kentucky,-0.2,106.8,86.4,95.7,42.3,9.3,25.0,...,4-24,102.3,47.6,14.6,23.3,38.1,15-38,10-25,77,72
3,H,DUKE,Wofford,-0.1,124.7,56.9,133.7,61.3,15.5,45.2,...,16-38,54.4,28.9,29.5,29.3,5.3,9-24,5-33,35,86
4,A,DUKE,Arizona,0.6,111.7,75.9,101.9,50.0,20.7,35.1,...,9-25,81.2,45.3,22.2,16.7,20.8,15-30,6-23,55,69
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
203,H,Miami FL,Charleston Southern,-2.6,109.1,130.5,116.3,57.6,14.7,25.8,...,12-33,122.2,59.5,17.7,36.4,17.5,21-39,11-24,83,79
204,H,Miami FL,Arkansas,-3.3,120.1,105.1,108.1,55.8,16.3,25.0,...,9-29,112.5,57.3,16.3,26.5,14.5,19-39,11-23,76,73
205,H,Miami FL,Clemson,-3.9,102.7,100.1,92.8,44.6,21.9,35.3,...,4-25,109.7,47.4,11.8,33.3,20.7,14-31,9-27,65,55
206,N,Miami FL,Tennessee,-4.0,109.0,99.2,91.6,45.9,20.7,26.8,...,8-33,110.9,57.7,16.3,18.5,34.6,15-27,10-25,75,62


In [4]:
team_data["Location"] = np.where(
    team_data["Location"] == "N", 0, np.where(team_data["Location"] == "H", 1, -1)
)

In [5]:
columns_to_convert = [
    "Location",
    "ADJO",
    "ADJD",
    "EFG%",
    "TO%",
    "OR%",
    "FTR",
    "Opp EFG%",
    "Opp TO%",
    "Opp OR%",
    "Opp FTR",
]
for col in columns_to_convert:
    team_data[col] = pd.to_numeric(team_data[col], errors="coerce")

In [6]:
team_data_cleaned = team_data.dropna()

X = team_data_cleaned[columns_to_convert]
y_team = team_data_cleaned["Team_score"]
y_opp = team_data_cleaned["Opponent_score"]

X_train, X_test, y_team_train, y_team_test = train_test_split(
    X, y_team, test_size=0.2, random_state=42
)
y_opp_train, y_opp_test = train_test_split(y_opp, test_size=0.2, random_state=42)

model_team = LinearRegression()
model_team.fit(X_train, y_team_train)
team_pred = model_team.predict(X_test)

model_opp = LinearRegression()
model_opp.fit(X_train, y_opp_train)
opp_pred = model_opp.predict(X_test)

team_rmse = mean_squared_error(y_team_test, team_pred, squared=False)
opp_rmse = mean_squared_error(y_opp_test, opp_pred, squared=False)
print(f"Team Score RMSE: {team_rmse}, Opponent Score RMSE: {opp_rmse}")

Team Score RMSE: 6.045373373192233, Opponent Score RMSE: 6.957248475591879




In [7]:
team_accuracy = (abs(team_pred - y_team_test) <= threshold).mean() * 100
opp_accuracy = (abs(opp_pred - y_opp_test) <= threshold).mean() * 100
print(f"Team Score Accuracy: {team_accuracy:.2f}%")
print(f"Opponent Score Accuracy: {opp_accuracy:.2f}%")

Team Score Accuracy: 64.29%
Opponent Score Accuracy: 59.52%


In [8]:
rf_model_team = RandomForestRegressor(random_state=42)
rf_model_team.fit(X_train, y_team_train)
rf_team_pred = rf_model_team.predict(X_test)

rf_model_opp = RandomForestRegressor(random_state=42)
rf_model_opp.fit(X_train, y_opp_train)
rf_opp_pred = rf_model_opp.predict(X_test)

team_rf_rmse = mean_squared_error(y_team_test, rf_team_pred, squared=False)
opp_rf_rmse = mean_squared_error(y_opp_test, rf_opp_pred, squared=False)
print(f"Team Score RMSE: {team_rf_rmse}, Opponent Score RMSE: {opp_rf_rmse}")

Team Score RMSE: 7.185105957398791, Opponent Score RMSE: 7.918119816507668




In [9]:
team_accuracy = (abs(rf_team_pred - y_team_test) <= threshold).mean() * 100
opp_accuracy = (abs(rf_opp_pred - y_opp_test) <= threshold).mean() * 100
print(f"Team Score Accuracy: {team_accuracy:.2f}%")
print(f"Opponent Score Accuracy: {opp_accuracy:.2f}%")

Team Score Accuracy: 69.05%
Opponent Score Accuracy: 52.38%


In [10]:
import xgboost as xgb

X_train, X_test, y_team_train, y_team_test = train_test_split(
    X, y_team, test_size=0.2, random_state=42
)
X_train_opp, X_test_opp, y_opp_train, y_opp_test = train_test_split(
    X, y_opp, test_size=0.2, random_state=42
)

dtrain_team = xgb.DMatrix(X_train, label=y_team_train)
dtest_team = xgb.DMatrix(X_test, label=y_team_test)

dtrain_opp = xgb.DMatrix(X_train_opp, label=y_opp_train)
dtest_opp = xgb.DMatrix(X_test_opp, label=y_opp_test)

params = {
    "objective": "reg:squarederror",
    "eval_metric": "rmse",
    "learning_rate": 0.1,
    "max_depth": 6,
    "subsample": 0.8,
    "colsample_bytree": 0.8,
}

team_model = xgb.train(
    params,
    dtrain_team,
    num_boost_round=100,
    evals=[(dtest_team, "test")],
    early_stopping_rounds=10,
)

opp_model = xgb.train(
    params,
    dtrain_opp,
    num_boost_round=100,
    evals=[(dtest_opp, "test")],
    early_stopping_rounds=10,
)

xg_team_pred = team_model.predict(dtest_team)
xg_opp_pred = opp_model.predict(dtest_opp)

team_rmse = mean_squared_error(y_team_test, xg_team_pred, squared=False)
opp_rmse = mean_squared_error(y_opp_test, xg_opp_pred, squared=False)

print(f"Team Score RMSE: {team_rmse}")
print(f"Opponent Score RMSE: {opp_rmse}")

[0]	test-rmse:12.48788
[1]	test-rmse:11.98595
[2]	test-rmse:11.47672
[3]	test-rmse:11.21937
[4]	test-rmse:10.67003
[5]	test-rmse:10.34327
[6]	test-rmse:9.96716
[7]	test-rmse:9.64412
[8]	test-rmse:9.51223
[9]	test-rmse:9.32098
[10]	test-rmse:9.04378
[11]	test-rmse:8.73916
[12]	test-rmse:8.58418
[13]	test-rmse:8.41762
[14]	test-rmse:8.22502
[15]	test-rmse:8.04169
[16]	test-rmse:8.04801
[17]	test-rmse:8.00354
[18]	test-rmse:7.88499
[19]	test-rmse:7.75807
[20]	test-rmse:7.68730
[21]	test-rmse:7.55569
[22]	test-rmse:7.49807
[23]	test-rmse:7.48297
[24]	test-rmse:7.44488
[25]	test-rmse:7.45027
[26]	test-rmse:7.41096
[27]	test-rmse:7.39751
[28]	test-rmse:7.37592
[29]	test-rmse:7.32260
[30]	test-rmse:7.29530
[31]	test-rmse:7.27623
[32]	test-rmse:7.26329
[33]	test-rmse:7.24360
[34]	test-rmse:7.23065
[35]	test-rmse:7.22353
[36]	test-rmse:7.20527
[37]	test-rmse:7.20784
[38]	test-rmse:7.18609
[39]	test-rmse:7.17125
[40]	test-rmse:7.19506
[41]	test-rmse:7.18388
[42]	test-rmse:7.16079
[43]	test-rmse:



In [11]:
team_accuracy = (abs(xg_team_pred - y_team_test) <= threshold).mean() * 100
opp_accuracy = (abs(xg_opp_pred - y_opp_test) <= threshold).mean() * 100
print(f"Team Score Accuracy: {team_accuracy:.2f}%")
print(f"Opponent Score Accuracy: {opp_accuracy:.2f}%")

Team Score Accuracy: 66.67%
Opponent Score Accuracy: 54.76%


In [12]:
predict_games = pd.read_csv("../Stats_competition-/basketball_games_data.csv")

In [13]:
predict_games["Location"] = np.where(
    predict_games["Location"] == "Neutral",
    0,
    np.where(predict_games["Location"] == "Home", 1, -1),
)

In [14]:
predict_games

Unnamed: 0,Location,Team,Opponent,ADJO,ADJD,EFG%,TO%,OR%,FTR,Opp EFG%,Opp TO%,Opp OR%,Opp FTR
0,0,Michigan,Oklahoma,116.4,97.1,58.0,21.4,36.2,33.2,56.1,16.4,30.5,36.1
1,-1,Butler,Marquette,112.2,102.3,53.0,18.6,30.7,49.7,55.0,13.9,30.2,28.5
2,1,Connecticut,Xavier,121.8,99.6,58.3,15.2,35.2,33.9,54.3,16.6,29.5,35.8
3,-1,Toledo,Houston,110.2,114.1,51.4,12.9,25.9,32.6,52.6,12.9,35.4,29.6
4,-1,Memphis,Virginia,117.7,101.1,54.8,19.6,32.6,46.4,52.0,20.1,24.2,25.8


In [15]:
X = predict_games[columns_to_convert]

In [16]:
X

Unnamed: 0,Location,ADJO,ADJD,EFG%,TO%,OR%,FTR,Opp EFG%,Opp TO%,Opp OR%,Opp FTR
0,0,116.4,97.1,58.0,21.4,36.2,33.2,56.1,16.4,30.5,36.1
1,-1,112.2,102.3,53.0,18.6,30.7,49.7,55.0,13.9,30.2,28.5
2,1,121.8,99.6,58.3,15.2,35.2,33.9,54.3,16.6,29.5,35.8
3,-1,110.2,114.1,51.4,12.9,25.9,32.6,52.6,12.9,35.4,29.6
4,-1,117.7,101.1,54.8,19.6,32.6,46.4,52.0,20.1,24.2,25.8


In [17]:
from pygam import LinearGAM, s, f
from sklearn.metrics import mean_squared_error

# Fit PyGAM for Team Score
gam_team = LinearGAM(
    s(0) + s(1) + s(2) + s(3) + s(4) + s(5) + s(6) + s(7) + s(8) + s(9) + s(10)
).fit(X_train, y_team_train)
team_gam_pred = gam_team.predict(X_test)

# Fit PyGAM for Opponent Score
gam_opp = LinearGAM(
    s(0) + s(1) + s(2) + s(3) + s(4) + s(5) + s(6) + s(7) + s(8) + s(9) + s(10)
).fit(X_train, y_opp_train)
opp_gam_pred = gam_opp.predict(X_test)

# Calculate RMSE for PyGAM
team_gam_rmse = mean_squared_error(y_team_test, team_gam_pred, squared=False)
opp_gam_rmse = mean_squared_error(y_opp_test, opp_gam_pred, squared=False)
print(f"Team Score RMSE (PyGAM): {team_gam_rmse}")
print(f"Opponent Score RMSE (PyGAM): {opp_gam_rmse}")

# Calculate accuracy for PyGAM
team_gam_accuracy = (abs(team_gam_pred - y_team_test) <= threshold).mean() * 100
opp_gam_accuracy = (abs(opp_gam_pred - y_opp_test) <= threshold).mean() * 100
print(f"Team Score Accuracy (PyGAM): {team_gam_accuracy:.2f}%")
print(f"Opponent Score Accuracy (PyGAM): {opp_gam_accuracy:.2f}%")

# Predict scores using PyGAM
team_gam_pred_new = gam_team.predict(X)
opp_gam_pred_new = gam_opp.predict(X)

# Add PyGAM predictions to the table
predictions_gam = pd.DataFrame(
    {
        "Predicted Team Score GAM": team_gam_pred_new,
        "Predicted Opponent Score GAM": opp_gam_pred_new,
    }
)

predict_games = pd.concat([predict_games, predictions_gam], axis=1)

Team Score RMSE (PyGAM): 8.015398275152359
Opponent Score RMSE (PyGAM): 8.426216627069767
Team Score Accuracy (PyGAM): 59.52%
Opponent Score Accuracy (PyGAM): 52.38%




In [18]:
team_pred_new = model_team.predict(X)
opp_pred_new = model_opp.predict(X)

predictions = pd.DataFrame(
    {
        "Predicted Team Score LR": team_pred_new,
        "Predicted Opponent Score LR": opp_pred_new,
    }
)

In [19]:
rf_team_pred_new = rf_model_team.predict(X)
rf_opp_pred_new = rf_model_opp.predict(X)

predictions_rf = pd.DataFrame(
    {
        "Predicted Team Score RF": rf_team_pred_new,
        "Predicted Opponent Score RF": rf_team_pred_new,
    }
)

In [20]:
predict_games = pd.concat([predict_games, predictions], axis=1)
predict_games = pd.concat([predict_games, predictions_rf], axis=1)

In [25]:
predict_games[
    [
        "Team",
        "Opponent",
        "Predicted Team Score LR",
        "Predicted Opponent Score LR",
        "Predicted Team Score RF",
        "Predicted Opponent Score RF",
    ]
]

Unnamed: 0,Team,Opponent,Predicted Team Score LR,Predicted Opponent Score LR,Predicted Team Score RF,Predicted Opponent Score RF
0,Michigan,Oklahoma,80.92906,80.975608,80.85,80.85
1,Butler,Marquette,77.681284,80.163101,78.43,78.43
2,Connecticut,Xavier,86.850363,78.853839,88.04,88.04
3,Toledo,Houston,73.370242,77.781194,81.07,81.07
4,Memphis,Virginia,80.12087,69.905718,77.85,77.85


In [33]:
# Add columns for averages
predict_games["Average Predicted Team Score"] = predict_games[
    ["Predicted Team Score LR", "Predicted Team Score GAM"]
].mean(axis=1)

predict_games["Average Predicted Opponent Score"] = predict_games[
    ["Predicted Opponent Score RF", "Predicted Opponent Score GAM"]
].mean(axis=1)

# Select columns to display
columns_to_display = [
    "Team",
    "Opponent",
    "Predicted Team Score LR",
    "Predicted Opponent Score LR",
    "Predicted Team Score GAM",
    "Predicted Opponent Score GAM",
    "Average Predicted Team Score",
    "Average Predicted Opponent Score",
]

# Display the updated table
print(predict_games[columns_to_display].to_string(index=False))

       Team  Opponent  Predicted Team Score LR  Predicted Opponent Score LR  Predicted Team Score GAM  Predicted Opponent Score GAM  Average Predicted Team Score  Average Predicted Opponent Score
   Michigan  Oklahoma                80.929060                    80.975608                 83.203849                     86.766694                     82.066454                         83.808347
     Butler Marquette                77.681284                    80.163101                 79.581254                     86.762465                     78.631269                         82.596232
Connecticut    Xavier                86.850363                    78.853839                 93.476375                     78.881936                     90.163369                         83.460968
     Toledo   Houston                73.370242                    77.781194                 72.522422                     84.268520                     72.946332                         82.669260
    Memphis  Virgini