In [7]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from pygam import LinearGAM, s
import lightgbm as lgb
from catboost import CatBoostRegressor

In [8]:
# Display all columns and rows
pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", None)

# Load and display dataset
team_data = pd.read_csv("./ACC_train_data_till_5Feb.csv")
display(team_data.head())

# Data Cleaning and Feature Engineering
team_data["Location"] = np.where(
    team_data["Location"] == "N", 0, np.where(team_data["Location"] == "H", 1, -1)
)

columns_to_convert = [
    "Location",
    "ADJO",
    "ADJD",
    "EFG_pct",
    "TO_Pct",
    "OR_Pct",
    "FTR_Pct",
    "opp_ADJO",
    "opp_ADJD",
    "Opp_EFG_Pct",
    "Opp_TO_Pct",
    "Opp_OR_Pct",
    "Opp_FTR_Pct",
]

for col in columns_to_convert:
    team_data[col] = pd.to_numeric(team_data[col], errors="coerce")

# Drop rows with missing values
team_data_cleaned = team_data.dropna()

# Define features (X) and target variables (y)
X = team_data_cleaned[columns_to_convert]
y_team = team_data_cleaned["Team_Score"]
y_opp = team_data_cleaned["Opponent_Score"]

# Train-test split
X_train, X_test, y_team_train, y_team_test = train_test_split(
    X, y_team, test_size=0.2, random_state=42
)
y_opp_train, y_opp_test = train_test_split(y_opp, test_size=0.2, random_state=42)

Unnamed: 0,game_id,date,Team,Opponent,Team_Score,Opponent_Score,Location,ADJO,ADJD,EFF,EFG_pct,TO_Pct,OR_Pct,FTR_Pct,ThreePointer_Pct,opp_ADJO,opp_ADJD,Opp_EFF,Opp_EFG_Pct,Opp_TO_Pct,Opp_OR_Pct,Opp_FTR_Pct,Opp_ThreePointer_Pct
0,Air ForceCalifornia11-21,2024-11-21,California,California,78,69,H,113.2,112.6,118.1,57.7,22.7,43.3,51.9,30.77,113.2,112.6,118.1,57.7,22.7,43.3,51.9,30.77
1,Alabama A&MGeorgia Tech12-28,2024-12-28,Georgia Tech,Georgia Tech,92,49,H,99.6,76.6,115.7,62.3,23.9,34.5,34.4,47.62,99.6,76.6,115.7,62.3,23.9,34.5,34.4,47.62
2,Alabama St.SMU12-3,2024-12-03,SMU,SMU,101,72,H,122.6,110.6,138.6,55.1,11.0,47.6,49.3,34.48,122.6,110.6,138.6,55.1,11.0,47.6,49.3,34.48
3,AlabamaNorth Carolina12-4,2024-12-04,North Carolina,North Carolina,79,94,H,103.1,95.8,94.7,44.2,15.6,31.9,20.5,17.86,103.1,95.8,94.7,44.2,15.6,31.9,20.5,17.86
4,AlbanySyracuse12-10,2024-12-10,Syracuse,Syracuse,102,85,H,130.2,124.9,140.1,68.3,15.1,37.5,28.6,40.0,130.2,124.9,140.1,68.3,15.1,37.5,28.6,40.0


In [9]:
# Linear Regression Models
model_team = LinearRegression()
model_team.fit(X_train, y_team_train)
team_pred = model_team.predict(X_test)

model_opp = LinearRegression()
model_opp.fit(X_train, y_opp_train)
opp_pred = model_opp.predict(X_test)

# Calculate RMSE and Accuracy for Linear Regression
threshold = 6
team_rmse = mean_squared_error(y_team_test, team_pred, squared=False)
opp_rmse = mean_squared_error(y_opp_test, opp_pred, squared=False)
print(f"Team Score RMSE: {team_rmse}, Opponent Score RMSE: {opp_rmse}")

team_accuracy = (abs(team_pred - y_team_test) <= threshold).mean() * 100
opp_accuracy = (abs(opp_pred - y_opp_test) <= threshold).mean() * 100
print(f"Team Score Accuracy: {team_accuracy:.2f}%")
print(f"Opponent Score Accuracy: {opp_accuracy:.2f}%")

# PyGAM Models
# Fit PyGAM for Team Score
gam_team = LinearGAM(
    s(0) + s(1) + s(2) + s(3) + s(4) + s(5) + s(6) + s(7) + s(8) + s(9) + s(10)
).fit(X_train, y_team_train)
team_gam_pred = gam_team.predict(X_test)

# Fit PyGAM for Opponent Score
gam_opp = LinearGAM(
    s(0) + s(1) + s(2) + s(3) + s(4) + s(5) + s(6) + s(7) + s(8) + s(9) + s(10)
).fit(X_train, y_opp_train)
opp_gam_pred = gam_opp.predict(X_test)

# Calculate RMSE and Accuracy for PyGAM
team_gam_rmse = mean_squared_error(y_team_test, team_gam_pred, squared=False)
opp_gam_rmse = mean_squared_error(y_opp_test, opp_gam_pred, squared=False)
print(f"Team Score RMSE (PyGAM): {team_gam_rmse}")
print(f"Opponent Score RMSE (PyGAM): {opp_gam_rmse}")

team_gam_accuracy = (abs(team_gam_pred - y_team_test) <= threshold).mean() * 100
opp_gam_accuracy = (abs(opp_gam_pred - y_opp_test) <= threshold).mean() * 100
print(f"Team Score Accuracy (PyGAM): {team_gam_accuracy:.2f}%")
print(f"Opponent Score Accuracy (PyGAM): {opp_gam_accuracy:.2f}%")

# LightGBM Models
# LightGBM model for Team Score prediction
lgb_model_team = lgb.LGBMRegressor(
    objective="regression",
    boosting_type="gbdt",
    num_leaves=31,
    learning_rate=0.1,
    n_estimators=100,
)
lgb_model_team.fit(X_train, y_team_train)
team_lgb_pred = lgb_model_team.predict(X_test)

# LightGBM model for Opponent Score prediction
lgb_model_opp = lgb.LGBMRegressor(
    objective="regression",
    boosting_type="gbdt",
    num_leaves=31,
    learning_rate=0.1,
    n_estimators=100,
)
lgb_model_opp.fit(X_train, y_opp_train)
opp_lgb_pred = lgb_model_opp.predict(X_test)

# RMSE and Accuracy for LightGBM
team_lgb_rmse = mean_squared_error(y_team_test, team_lgb_pred, squared=False)
opp_lgb_rmse = mean_squared_error(y_opp_test, opp_lgb_pred, squared=False)
print(f"Team Score RMSE (LightGBM): {team_lgb_rmse}")
print(f"Opponent Score RMSE (LightGBM): {opp_lgb_rmse}")

team_lgb_accuracy = (abs(team_lgb_pred - y_team_test) <= threshold).mean() * 100
opp_lgb_accuracy = (abs(opp_lgb_pred - y_opp_test) <= threshold).mean() * 100
print(f"Team Score Accuracy (LightGBM): {team_lgb_accuracy:.2f}%")
print(f"Opponent Score Accuracy (LightGBM): {opp_lgb_accuracy:.2f}%")

# CatBoost Models
# CatBoost model for Team Score prediction
cat_model_team = CatBoostRegressor(
    iterations=500, depth=6, learning_rate=0.01, loss_function="RMSE", verbose=0
)
cat_model_team.fit(X_train, y_team_train)
team_cat_pred = cat_model_team.predict(X_test)

# CatBoost model for Opponent Score prediction
cat_model_opp = CatBoostRegressor(
    iterations=500, depth=6, learning_rate=0.01, loss_function="RMSE", verbose=0
)
cat_model_opp.fit(X_train, y_opp_train)
opp_cat_pred = cat_model_opp.predict(X_test)

# RMSE and Accuracy for CatBoost
team_cat_rmse = mean_squared_error(y_team_test, team_cat_pred, squared=False)
opp_cat_rmse = mean_squared_error(y_opp_test, opp_cat_pred, squared=False)
print(f"Team Score RMSE (CatBoost): {team_cat_rmse}")
print(f"Opponent Score RMSE (CatBoost): {opp_cat_rmse}")

team_cat_accuracy = (abs(team_cat_pred - y_team_test) <= threshold).mean() * 100
opp_cat_accuracy = (abs(opp_cat_pred - y_opp_test) <= threshold).mean() * 100
print(f"Team Score Accuracy (CatBoost): {team_cat_accuracy:.2f}%")
print(f"Opponent Score Accuracy (CatBoost): {opp_cat_accuracy:.2f}%")

Team Score RMSE: 6.619979780920167, Opponent Score RMSE: 7.326035317696381
Team Score Accuracy: 63.16%
Opponent Score Accuracy: 64.91%




Team Score RMSE (PyGAM): 7.798550326600104
Opponent Score RMSE (PyGAM): 8.516207616091442
Team Score Accuracy (PyGAM): 66.67%
Opponent Score Accuracy (PyGAM): 50.88%
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000669 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 799
[LightGBM] [Info] Number of data points in the train set: 228, number of used features: 13
[LightGBM] [Info] Start training from score 75.442982
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000295 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 799
[LightGBM] [Info] Number of data points in the train set: 228, number of used features: 13
[LightGBM] [Info] Start training from score 70.890351
Team Score RMSE (LightGBM): 8.887302434027776
Opponent Score RMSE (LightGBM): 8.185335801885852
Team Score Accuracy (LightGBM): 56.14%
Opponent Score Accu



Team Score RMSE (CatBoost): 8.787217023382302
Opponent Score RMSE (CatBoost): 7.506470109107117
Team Score Accuracy (CatBoost): 52.63%
Opponent Score Accuracy (CatBoost): 61.40%




In [10]:
# Predict scores using all models
predict_games = pd.read_csv("./ACC_test_file_from_5thFeb_14thFeb.csv")
predict_games["Location"] = np.where(
    predict_games["Location"] == "N",
    0,
    np.where(predict_games["Location"] == "H", 1, -1),
)

predict_games.rename(
    columns={
        "Opp_ADJO": "opp_ADJO",
        "Opp_ADJD": "opp_ADJD",
        "Opp_EFG_pct": "Opp_EFG_Pct",
    },
    inplace=True,
)

X = predict_games[columns_to_convert]

# Predictions
team_pred_new = model_team.predict(X)
opp_pred_new = model_opp.predict(X)
team_gam_pred_new = gam_team.predict(X)
opp_gam_pred_new = gam_opp.predict(X)
team_lgb_pred_new = lgb_model_team.predict(X)
opp_lgb_pred_new = lgb_model_opp.predict(X)
team_cat_pred_new = cat_model_team.predict(X)
opp_cat_pred_new = cat_model_opp.predict(X)

# Add predictions to DataFrame
predict_games["Predicted Team Score LR"] = team_pred_new
predict_games["Predicted Opponent Score LR"] = opp_pred_new
predict_games["Predicted Team Score GAM"] = team_gam_pred_new
predict_games["Predicted Opponent Score GAM"] = opp_gam_pred_new
predict_games["Predicted Team Score LightGBM"] = team_lgb_pred_new
predict_games["Predicted Opponent Score LightGBM"] = opp_lgb_pred_new
predict_games["Predicted Team Score CatBoost"] = team_cat_pred_new
predict_games["Predicted Opponent Score CatBoost"] = opp_cat_pred_new

# Calculate average predictions
predict_games["Average Predicted Team Score"] = predict_games[
    [
        "Predicted Team Score LR",
        "Predicted Team Score GAM",
        "Predicted Team Score LightGBM",
        "Predicted Team Score CatBoost",
    ]
].mean(axis=1)

predict_games["Average Predicted Opponent Score"] = predict_games[
    [
        "Predicted Opponent Score LR",
        "Predicted Opponent Score GAM",
        "Predicted Opponent Score LightGBM",
        "Predicted Opponent Score CatBoost",
    ]
].mean(axis=1)

# Display final predictions
predicted_score = predict_games[
    [
        "Team",
        "Opponent",
        "Predicted Team Score LR",
        "Predicted Opponent Score LR",
        "Predicted Team Score GAM",
        "Predicted Opponent Score GAM",
        "Predicted Team Score LightGBM",
        "Predicted Opponent Score LightGBM",
        "Predicted Team Score CatBoost",
        "Predicted Opponent Score CatBoost",
        "Average Predicted Team Score",
        "Average Predicted Opponent Score",
    ]
]

display(predicted_score)

Unnamed: 0,Team,Opponent,Predicted Team Score LR,Predicted Opponent Score LR,Predicted Team Score GAM,Predicted Opponent Score GAM,Predicted Team Score LightGBM,Predicted Opponent Score LightGBM,Predicted Team Score CatBoost,Predicted Opponent Score CatBoost,Average Predicted Team Score,Average Predicted Opponent Score
0,Syracuse,Duke,75.237107,72.923289,74.787023,73.708358,73.409977,73.686835,74.139258,73.851656,74.393341,73.542534
1,California,North Carolina St.,75.993674,71.428339,76.03769,71.546153,75.174529,71.437538,76.86185,73.191517,76.016936,71.900887
2,Clemson,Duke,81.378137,66.069356,82.753916,68.19642,80.433306,64.882915,83.086342,64.071677,81.912925,65.805092
3,Stanford,North Carolina St.,77.599869,68.594256,77.565173,68.662921,74.232154,66.910952,78.270841,69.438498,76.917009,68.401657
4,North Carolina,Pittsburgh,77.728642,69.951487,80.650972,70.530054,77.739998,67.035345,79.761538,68.082475,78.970287,68.89984
5,Clemson,North Carolina,80.151239,65.951135,79.069559,65.966923,78.034606,62.63452,81.434603,64.790525,79.672502,64.835776
6,Duke,California,82.64058,61.3031,81.965718,59.763318,81.334602,57.193519,83.957559,58.587119,82.474615,59.211764
7,North Carolina St.,Louisville,75.95282,71.073865,73.397903,66.709958,77.222496,70.756966,77.156425,69.924227,75.932411,69.616254


In [11]:
predicted_score.to_csv("ACC_testing_results_5th_to_14th.csv", index=False)

In [12]:
# Merge and display scores with actual scores as before
actual_scores1 = pd.read_csv("./test_file_Jan07to09.csv")

actual_scores = actual_scores1[
    (
        (
            actual_scores1["Team"].isin(predicted_score["Team"])
            | actual_scores1["Team"].isin(predicted_score["Opponent"])
        )
        & (
            actual_scores1["Opponent"].isin(predicted_score["Team"])
            | actual_scores1["Opponent"].isin(predicted_score["Opponent"])
        )
    )
]

actual_scores = actual_scores[["Team", "Opponent", "Team_Score", "Opponent_Score"]]

score1 = pd.merge(
    predicted_score,
    actual_scores,
    left_on=["Team", "Opponent"],
    right_on=["Team", "Opponent"],
    how="inner",
)
score2 = pd.merge(
    predicted_score, actual_scores, left_on=["Team"], right_on=["Opponent"], how="inner"
)

score2.rename(
    columns={
        "Team_x": "Team",
        "Opponent_x": "Opponent",
        "Team_y": "Opponent1",
        "Opponent_y": "Team1",
        "Team_Score": "Opponent_Score",
        "Opponent_Score": "Team_Score",
    },
    inplace=True,
)

score1 = score1[
    [
        "Team",
        "Opponent",
        "Team_Score",
        "Opponent_Score",
        "Predicted Team Score LR",
        "Predicted Opponent Score LR",
        "Predicted Team Score GAM",
        "Predicted Opponent Score GAM",
        "Predicted Team Score LightGBM",
        "Predicted Opponent Score LightGBM",
        "Predicted Team Score CatBoost",
        "Predicted Opponent Score CatBoost",
        "Average Predicted Team Score",
        "Average Predicted Opponent Score",
    ]
]

score2 = score2[
    [
        "Team",
        "Opponent",
        "Team_Score",
        "Opponent_Score",
        "Predicted Team Score LR",
        "Predicted Opponent Score LR",
        "Predicted Team Score GAM",
        "Predicted Opponent Score GAM",
        "Predicted Team Score LightGBM",
        "Predicted Opponent Score LightGBM",
        "Predicted Team Score CatBoost",
        "Predicted Opponent Score CatBoost",
        "Average Predicted Team Score",
        "Average Predicted Opponent Score",
    ]
]

scores = pd.concat([score1, score2], ignore_index=True)

display(scores)

FileNotFoundError: [Errno 2] No such file or directory: './test_file_Jan07to09.csv'

In [9]:
pd.set_option("display.width", 500)
delta_scores = pd.DataFrame()

delta_scores["Team"] = scores["Team"]
delta_scores["Opponent"] = scores["Opponent"]

# delta_scores["Team_Score"] = scores["Team_Score"]
# delta_scores["Opponent_Score"] = scores["Opponent_Score"]

delta_scores["delta_LR_Team"] = scores["Team_Score"] - scores["Predicted Team Score LR"]
delta_scores["delta_LR_Opponent"] = (
    scores["Opponent_Score"] - scores["Predicted Opponent Score LR"]
)
delta_scores["delta_GAM_Team"] = (
    scores["Team_Score"] - scores["Predicted Team Score GAM"]
)
delta_scores["delta_GAM_Opponent"] = (
    scores["Opponent_Score"] - scores["Predicted Opponent Score GAM"]
)

delta_scores["delta_LightGBM_Team"] = (
    scores["Team_Score"] - scores["Predicted Team Score LightGBM"]
)
delta_scores["delta_LightGBM_Opponent"] = (
    scores["Opponent_Score"] - scores["Predicted Opponent Score LightGBM"]
)
delta_scores["delta_CATBoost_Team"] = (
    scores["Team_Score"] - scores["Predicted Team Score CatBoost"]
)
delta_scores["delta_CATBoost_Opponent"] = (
    scores["Opponent_Score"] - scores["Predicted Opponent Score CatBoost"]
)

delta_scores["delta_Average_Team"] = (
    scores["Team_Score"] - scores["Average Predicted Team Score"]
)
delta_scores["delta_Average_Opponent"] = (
    scores["Opponent_Score"] - scores["Average Predicted Opponent Score"]
)
display(delta_scores)

# Calculate absolute averages for all columns
absolute_averages = (
    delta_scores[
        [
            "delta_LR_Team",
            "delta_LR_Opponent",
            "delta_GAM_Team",
            "delta_GAM_Opponent",
            "delta_LightGBM_Team",
            "delta_LightGBM_Opponent",
            "delta_CATBoost_Team",
            "delta_CATBoost_Opponent",
            "delta_Average_Team",
            "delta_Average_Opponent",
        ]
    ]
    .abs()
    .mean()
)

# Display the result
print(absolute_averages)

Unnamed: 0,Team,Opponent,delta_LR_Team,delta_LR_Opponent,delta_GAM_Team,delta_GAM_Opponent,delta_LightGBM_Team,delta_LightGBM_Opponent,delta_CATBoost_Team,delta_CATBoost_Opponent,delta_Average_Team,delta_Average_Opponent
0,Richmond,Florida Gulf Coast,-11.962304,5.433847,-15.571775,0.282942,-14.80175,1.94663,-16.387859,1.375219,-14.680922,2.259659
1,San Diego St.,Utah St.,-6.411362,-10.982591,-7.549167,-4.978822,-7.497518,-10.941018,-5.778877,-9.732662,-6.809231,-9.158773
2,Georgia Tech,Alabama A&M,20.779075,-13.028484,19.547656,-14.911546,21.545942,-15.586851,17.532549,-16.997775,19.851305,-15.131164
3,San Jose St.,Boise St.,-2.539018,-4.020569,-4.484949,-2.620047,-4.471292,-5.944647,-2.4537,-6.521045,-3.48724,-4.776577
4,Colorado St.,New Mexico,-2.23183,0.642207,-3.458083,1.833217,-3.439744,-1.632521,-3.174226,-1.042513,-3.075971,-0.049903
5,UCLA,Gonzaga,-11.860674,-20.679081,-12.768793,-12.742317,-9.720624,-15.176006,-8.422871,-15.638352,-10.69324,-16.058939
6,Wyoming,Nevada,-7.424835,-17.340145,-7.902518,-13.043515,-6.27293,-18.688164,-6.850501,-18.16842,-7.112696,-16.810061
7,Texas A&M,Abilene Christian,16.502828,-12.725882,15.245615,-9.051629,15.816132,-13.742042,15.780095,-13.07466,15.836167,-12.148553
8,Louisville,Eastern Kentucky,3.655902,5.49384,0.609693,5.172751,0.241759,2.884434,0.042442,3.401229,1.137449,4.238064
9,Maryland,Maryland Eastern Shore,-1.766879,7.218221,-2.45832,5.933816,-1.740017,4.793086,-2.352031,4.160383,-2.079312,5.526376


delta_LR_Team              8.192577
delta_LR_Opponent          7.935679
delta_GAM_Team             8.214028
delta_GAM_Opponent         5.732896
delta_LightGBM_Team        8.043358
delta_LightGBM_Opponent    7.490098
delta_CATBoost_Team        7.494337
delta_CATBoost_Opponent    7.467735
delta_Average_Team         7.947286
delta_Average_Opponent     6.866761
dtype: float64
