In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from pygam import LinearGAM, s
import lightgbm as lgb
from catboost import CatBoostRegressor

In [2]:
# Display all columns and rows
pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", None)

# Load and display dataset
team_data = pd.read_csv("./train_data.csv")
display(team_data.head())

# Data Cleaning and Feature Engineering
team_data["Location"] = np.where(
    team_data["Location"] == "N", 0, np.where(team_data["Location"] == "H", 1, -1)
)

columns_to_convert = [
    "Location",
    "ADJO",
    "ADJD",
    "EFG_pct",
    "TO_Pct",
    "OR_Pct",
    "FTR_Pct",
    "opp_ADJO",
    "opp_ADJD",
    "Opp_EFG_Pct",
    "Opp_TO_Pct",
    "Opp_OR_Pct",
    "Opp_FTR_Pct",
]

for col in columns_to_convert:
    team_data[col] = pd.to_numeric(team_data[col], errors="coerce")

# Drop rows with missing values
team_data_cleaned = team_data.dropna()

# Define features (X) and target variables (y)
X = team_data_cleaned[columns_to_convert]
y_team = team_data_cleaned["Team_Score"]
y_opp = team_data_cleaned["Opponent_Score"]

# Train-test split
X_train, X_test, y_team_train, y_team_test = train_test_split(
    X, y_team, test_size=0.2, random_state=42
)
y_opp_train, y_opp_test = train_test_split(y_opp, test_size=0.2, random_state=42)


Unnamed: 0,game_id,date,Team,Opponent,Team_Score,Opponent_Score,Location,ADJO,ADJD,EFF,EFG_pct,TO_Pct,OR_Pct,FTR_Pct,ThreePointer_Pct,opp_ADJO,opp_ADJD,Opp_EFF,Opp_EFG_Pct,Opp_TO_Pct,Opp_OR_Pct,Opp_FTR_Pct,Opp_ThreePointer_Pct
0,Abilene ChristianBaylor12-9,2024-12-09,Abilene Christian,Baylor,57,88,A,96.7,113.1,86.7,36.4,15.2,27.0,41.8,14.29,130.4,95.9,133.8,66.1,13.7,35.7,27.1,44.44
1,Abilene ChristianKennesaw St.11-20,2024-11-20,Abilene Christian,Kennesaw St.,78,84,A,108.2,115.5,107.1,50.0,9.6,20.0,50.8,33.33,112.9,116.9,115.3,62.2,26.1,31.8,100.0,43.75
2,Abilene ChristianMontana St.11-26,2024-11-26,Abilene Christian,Montana St.,59,85,A,90.7,129.0,91.6,44.0,12.4,20.6,20.7,16.67,131.5,97.9,131.9,75.0,20.2,25.0,26.0,65.38
3,Abilene ChristianNebraska Omaha11-30,2024-11-30,Abilene Christian,Nebraska Omaha,71,55,A,105.3,84.1,107.2,51.9,15.1,24.1,35.2,30.77,81.3,116.6,83.1,42.0,24.2,17.2,54.5,31.25
4,Abilene ChristianNew Mexico St.12-4,2024-12-04,Abilene Christian,New Mexico St.,78,70,A,117.7,107.1,117.8,58.9,16.6,32.0,77.8,43.75,106.0,125.6,105.7,51.0,18.1,27.6,58.0,33.33


In [3]:
# Linear Regression Models
model_team = LinearRegression()
model_team.fit(X_train, y_team_train)
team_pred = model_team.predict(X_test)

model_opp = LinearRegression()
model_opp.fit(X_train, y_opp_train)
opp_pred = model_opp.predict(X_test)

# Calculate RMSE and Accuracy for Linear Regression
threshold = 6
team_rmse = mean_squared_error(y_team_test, team_pred, squared=False)
opp_rmse = mean_squared_error(y_opp_test, opp_pred, squared=False)
print(f"Team Score RMSE: {team_rmse}, Opponent Score RMSE: {opp_rmse}")

team_accuracy = (abs(team_pred - y_team_test) <= threshold).mean() * 100
opp_accuracy = (abs(opp_pred - y_opp_test) <= threshold).mean() * 100
print(f"Team Score Accuracy: {team_accuracy:.2f}%")
print(f"Opponent Score Accuracy: {opp_accuracy:.2f}%")

# PyGAM Models
# Fit PyGAM for Team Score
gam_team = LinearGAM(s(0) + s(1) + s(2) + s(3) + s(4) + s(5) + s(6) + s(7) + s(8) + s(9) + s(10)).fit(X_train, y_team_train)
team_gam_pred = gam_team.predict(X_test)

# Fit PyGAM for Opponent Score
gam_opp = LinearGAM(s(0) + s(1) + s(2) + s(3) + s(4) + s(5) + s(6) + s(7) + s(8) + s(9) + s(10)).fit(X_train, y_opp_train)
opp_gam_pred = gam_opp.predict(X_test)

# Calculate RMSE and Accuracy for PyGAM
team_gam_rmse = mean_squared_error(y_team_test, team_gam_pred, squared=False)
opp_gam_rmse = mean_squared_error(y_opp_test, opp_gam_pred, squared=False)
print(f"Team Score RMSE (PyGAM): {team_gam_rmse}")
print(f"Opponent Score RMSE (PyGAM): {opp_gam_rmse}")

team_gam_accuracy = (abs(team_gam_pred - y_team_test) <= threshold).mean() * 100
opp_gam_accuracy = (abs(opp_gam_pred - y_opp_test) <= threshold).mean() * 100
print(f"Team Score Accuracy (PyGAM): {team_gam_accuracy:.2f}%")
print(f"Opponent Score Accuracy (PyGAM): {opp_gam_accuracy:.2f}%")

# LightGBM Models
# LightGBM model for Team Score prediction
lgb_model_team = lgb.LGBMRegressor(objective='regression', boosting_type='gbdt', num_leaves=31, learning_rate=0.1, n_estimators=100)
lgb_model_team.fit(X_train, y_team_train)
team_lgb_pred = lgb_model_team.predict(X_test)

# LightGBM model for Opponent Score prediction
lgb_model_opp = lgb.LGBMRegressor(objective='regression', boosting_type='gbdt', num_leaves=31, learning_rate=0.1, n_estimators=100)
lgb_model_opp.fit(X_train, y_opp_train)
opp_lgb_pred = lgb_model_opp.predict(X_test)

# RMSE and Accuracy for LightGBM
team_lgb_rmse = mean_squared_error(y_team_test, team_lgb_pred, squared=False)
opp_lgb_rmse = mean_squared_error(y_opp_test, opp_lgb_pred, squared=False)
print(f"Team Score RMSE (LightGBM): {team_lgb_rmse}")
print(f"Opponent Score RMSE (LightGBM): {opp_lgb_rmse}")

team_lgb_accuracy = (abs(team_lgb_pred - y_team_test) <= threshold).mean() * 100
opp_lgb_accuracy = (abs(opp_lgb_pred - y_opp_test) <= threshold).mean() * 100
print(f"Team Score Accuracy (LightGBM): {team_lgb_accuracy:.2f}%")
print(f"Opponent Score Accuracy (LightGBM): {opp_lgb_accuracy:.2f}%")

# CatBoost Models
# CatBoost model for Team Score prediction
cat_model_team = CatBoostRegressor(iterations=500, depth=6, learning_rate=0.01, loss_function='RMSE', verbose=0)
cat_model_team.fit(X_train, y_team_train)
team_cat_pred = cat_model_team.predict(X_test)

# CatBoost model for Opponent Score prediction
cat_model_opp = CatBoostRegressor(iterations=500, depth=6, learning_rate=0.01, loss_function='RMSE', verbose=0)
cat_model_opp.fit(X_train, y_opp_train)
opp_cat_pred = cat_model_opp.predict(X_test)

# RMSE and Accuracy for CatBoost
team_cat_rmse = mean_squared_error(y_team_test, team_cat_pred, squared=False)
opp_cat_rmse = mean_squared_error(y_opp_test, opp_cat_pred, squared=False)
print(f"Team Score RMSE (CatBoost): {team_cat_rmse}")
print(f"Opponent Score RMSE (CatBoost): {opp_cat_rmse}")

team_cat_accuracy = (abs(team_cat_pred - y_team_test) <= threshold).mean() * 100
opp_cat_accuracy = (abs(opp_cat_pred - y_opp_test) <= threshold).mean() * 100
print(f"Team Score Accuracy (CatBoost): {team_cat_accuracy:.2f}%")
print(f"Opponent Score Accuracy (CatBoost): {opp_cat_accuracy:.2f}%")




Team Score RMSE: 6.052539345145718, Opponent Score RMSE: 6.48926925035759
Team Score Accuracy: 71.10%
Opponent Score Accuracy: 70.41%




Team Score RMSE (PyGAM): 6.289460689750158
Opponent Score RMSE (PyGAM): 7.430902958909432
Team Score Accuracy (PyGAM): 68.12%
Opponent Score Accuracy (PyGAM): 61.01%
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000242 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2961
[LightGBM] [Info] Number of data points in the train set: 1743, number of used features: 13
[LightGBM] [Info] Start training from score 68.782559
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000152 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2961
[LightGBM] [Info] Number of data points in the train set: 1743, number of used features: 13
[LightGBM] [Info] Start training from score 77.218015




Team Score RMSE (LightGBM): 6.317128123514561
Opponent Score RMSE (LightGBM): 6.890362671734341
Team Score Accuracy (LightGBM): 71.79%
Opponent Score Accuracy (LightGBM): 66.28%
Team Score RMSE (CatBoost): 6.201656894951222
Opponent Score RMSE (CatBoost): 7.032734009789203
Team Score Accuracy (CatBoost): 70.64%
Opponent Score Accuracy (CatBoost): 65.14%




In [5]:
# Predict scores using all models
predict_games = pd.read_csv("./test_file_main.csv")
predict_games["Location"] = np.where(
    predict_games["Location"] == "N",
    0,
    np.where(predict_games["Location"] == "H", 1, -1),
)

predict_games.rename(columns={
    'Opp_ADJO': 'opp_ADJO',
    'Opp_ADJD': 'opp_ADJD',
    'Opp_EFG_pct': 'Opp_EFG_Pct'
}, inplace=True)

X = predict_games[columns_to_convert]

# Predictions
team_pred_new = model_team.predict(X)
opp_pred_new = model_opp.predict(X)
team_gam_pred_new = gam_team.predict(X)
opp_gam_pred_new = gam_opp.predict(X)
team_lgb_pred_new = lgb_model_team.predict(X)
opp_lgb_pred_new = lgb_model_opp.predict(X)
team_cat_pred_new = cat_model_team.predict(X)
opp_cat_pred_new = cat_model_opp.predict(X)

# Add predictions to DataFrame
predict_games["Predicted Team Score LR"] = team_pred_new
predict_games["Predicted Opponent Score LR"] = opp_pred_new
predict_games["Predicted Team Score GAM"] = team_gam_pred_new
predict_games["Predicted Opponent Score GAM"] = opp_gam_pred_new
predict_games["Predicted Team Score LightGBM"] = team_lgb_pred_new
predict_games["Predicted Opponent Score LightGBM"] = opp_lgb_pred_new
predict_games["Predicted Team Score CatBoost"] = team_cat_pred_new
predict_games["Predicted Opponent Score CatBoost"] = opp_cat_pred_new

# Calculate average predictions
predict_games["Average Predicted Team Score"] = predict_games[
    ["Predicted Team Score LR", "Predicted Team Score GAM", "Predicted Team Score LightGBM", "Predicted Team Score CatBoost"]
].mean(axis=1)

predict_games["Average Predicted Opponent Score"] = predict_games[
    ["Predicted Opponent Score LR", "Predicted Opponent Score GAM", "Predicted Opponent Score LightGBM", "Predicted Opponent Score CatBoost"]
].mean(axis=1)

# Display final predictions
predicted_score = predict_games[[
    "Team",
    "Opponent",
    "Predicted Team Score LR",
    "Predicted Opponent Score LR",
    "Predicted Team Score GAM",
    "Predicted Opponent Score GAM",
    "Predicted Team Score LightGBM",
    "Predicted Opponent Score LightGBM",
    "Predicted Team Score CatBoost",
    "Predicted Opponent Score CatBoost",
    "Average Predicted Team Score",
    "Average Predicted Opponent Score",
]]

display(predicted_score)


Unnamed: 0,Team,Opponent,Predicted Team Score LR,Predicted Opponent Score LR,Predicted Team Score GAM,Predicted Opponent Score GAM,Predicted Team Score LightGBM,Predicted Opponent Score LightGBM,Predicted Team Score CatBoost,Predicted Opponent Score CatBoost,Average Predicted Team Score,Average Predicted Opponent Score
0,SMU,Duke,81.41109,80.60651,85.162318,74.055174,85.886213,84.734464,78.087805,80.092448,82.636856,79.872149
1,Wake Forest,North Carolina St.,69.392043,76.052867,71.082098,73.621332,72.181967,79.710919,71.943166,76.393686,71.149819,76.444701
2,Notre Dame,North Carolina,76.506965,76.867189,77.888599,75.441758,76.301835,78.078416,75.822235,79.080202,76.629909,77.366891
3,Duke,Pittsburgh,80.048328,80.214896,82.856343,72.33112,83.569422,78.882684,78.904621,76.180036,81.344678,76.902184
4,North Carolina,SMU,77.391904,82.674131,78.99232,77.470287,78.980004,83.177494,77.794774,81.25338,78.28975,81.143823
5,North Carolina St.,Notre Dame,76.216467,75.993104,77.597073,74.276666,77.810816,78.395106,77.403317,77.311496,77.256918,76.494093
6,Duke,Notre Dame,80.160752,74.934662,82.499782,69.584704,82.168849,75.423775,81.490636,73.08196,81.580005,73.256275
7,North Carolina St.,North Carolina,76.433701,77.043486,77.495403,74.896823,77.721491,79.884584,75.704856,78.230997,76.838863,77.513973
8,Duke,Miami FL,80.305613,75.157259,82.787091,70.684262,82.29029,77.897784,81.953672,73.627438,81.834167,74.341686
9,North Carolina,California,77.787922,75.837705,79.129343,73.538099,79.344088,78.719742,79.447678,76.184895,78.927258,76.07011


In [6]:
# Merge and display scores with actual scores as before
actual_scores1 = pd.read_csv("./test_data.csv")

actual_scores = actual_scores1[
    (
        (actual_scores1['Team'].isin(predicted_score['Team']) | actual_scores1['Team'].isin(predicted_score['Opponent'])) &
        (actual_scores1['Opponent'].isin(predicted_score['Team']) | actual_scores1['Opponent'].isin(predicted_score['Opponent']))
    )
]

actual_scores = actual_scores[['Team', 'Opponent', 'Team_Score', 'Opponent_Score']]

score1 = pd.merge(predicted_score, actual_scores, left_on=["Team", "Opponent"], right_on=["Team", "Opponent"], how="inner")
score2 = pd.merge(predicted_score, actual_scores, left_on=["Team"], right_on=["Opponent"], how="inner")

score2.rename(columns={"Team_x": "Team", "Opponent_x": "Opponent",
                       "Team_y": "Opponent1", "Opponent_y": "Team1",
                       "Team_Score": "Opponent_Score",
                       "Opponent_Score": "Team_Score"}, inplace=True)

score1 = score1[["Team", "Opponent", "Team_Score", "Opponent_Score",
                 "Predicted Team Score LR", "Predicted Opponent Score LR", 
                 "Predicted Team Score GAM", "Predicted Opponent Score GAM",
                 "Predicted Team Score LightGBM", "Predicted Opponent Score LightGBM",
                 "Predicted Team Score CatBoost", "Predicted Opponent Score CatBoost",
                 "Average Predicted Team Score", "Average Predicted Opponent Score"]]

score2 = score2[["Team", "Opponent", "Team_Score", "Opponent_Score",
                 "Predicted Team Score LR", "Predicted Opponent Score LR", 
                 "Predicted Team Score GAM", "Predicted Opponent Score GAM",
                 "Predicted Team Score LightGBM", "Predicted Opponent Score LightGBM",
                 "Predicted Team Score CatBoost", "Predicted Opponent Score CatBoost",
                 "Average Predicted Team Score", "Average Predicted Opponent Score"]]

scores = pd.concat([score1, score2], ignore_index=True)

display(scores)


Unnamed: 0,Team,Opponent,Team_Score,Opponent_Score,Predicted Team Score LR,Predicted Opponent Score LR,Predicted Team Score GAM,Predicted Opponent Score GAM,Predicted Team Score LightGBM,Predicted Opponent Score LightGBM,Predicted Team Score CatBoost,Predicted Opponent Score CatBoost,Average Predicted Team Score,Average Predicted Opponent Score
0,Richmond,Florida Gulf Coast,57,75,68.962304,69.566153,72.571775,74.717058,71.80175,73.05337,73.387859,73.624781,71.680922,72.740341
1,San Diego St.,Utah St.,66,67,72.411362,77.982591,73.549167,71.978822,73.497518,77.941018,71.778877,76.732662,72.809231,76.158773
2,Georgia Tech,Alabama A&M,92,49,71.220925,62.028484,72.452344,63.911546,70.454058,64.586851,74.467451,65.997775,72.148695,64.131164
3,San Jose St.,Boise St.,71,73,73.539018,77.020569,75.484949,75.620047,75.471292,78.944647,73.4537,79.521045,74.48724,77.776577
4,Colorado St.,New Mexico,68,76,70.23183,75.357793,71.458083,74.166783,71.439744,77.632521,71.174226,77.042513,71.075971,76.049903
5,UCLA,Gonzaga,65,62,76.860674,82.679081,77.768793,74.742317,74.720624,77.176006,73.422871,77.638352,75.69324,78.058939
6,Wyoming,Nevada,66,63,73.424835,80.340145,73.902518,76.043515,72.27293,81.688164,72.850501,81.16842,73.112696,79.810061
7,Texas A&M,Abilene Christian,92,54,75.497172,66.725882,76.754385,63.051629,76.183868,67.742042,76.219905,67.07466,76.163833,66.148553
8,Louisville,Eastern Kentucky,78,76,74.344098,70.50616,77.390307,70.827249,77.758241,73.115566,77.957558,72.598771,76.862551,71.761936
9,Maryland,Maryland Eastern Shore,81,66,82.766879,58.781779,83.45832,60.066184,82.740017,61.206914,83.352031,61.839617,83.079312,60.473624


In [9]:
pd.set_option('display.width', 500)
delta_scores = pd.DataFrame()

delta_scores["Team"] = scores["Team"]
delta_scores["Opponent"] = scores["Opponent"]

#delta_scores["Team_Score"] = scores["Team_Score"]
#delta_scores["Opponent_Score"] = scores["Opponent_Score"]

delta_scores["delta_LR_Team"] = scores["Team_Score"] - scores["Predicted Team Score LR"]
delta_scores["delta_LR_Opponent"] = scores["Opponent_Score"] - scores["Predicted Opponent Score LR"]
delta_scores["delta_GAM_Team"] = scores["Team_Score"] - scores["Predicted Team Score GAM"]
delta_scores["delta_GAM_Opponent"] = scores["Opponent_Score"] - scores["Predicted Opponent Score GAM"]

delta_scores["delta_LightGBM_Team"] = scores["Team_Score"] - scores["Predicted Team Score LightGBM"]
delta_scores["delta_LightGBM_Opponent"] = scores["Opponent_Score"] - scores["Predicted Opponent Score LightGBM"]
delta_scores["delta_CATBoost_Team"] = scores["Team_Score"] - scores["Predicted Team Score CatBoost"]
delta_scores["delta_CATBoost_Opponent"] = scores["Opponent_Score"] - scores["Predicted Opponent Score CatBoost"]

delta_scores["delta_Average_Team"] = scores["Team_Score"] - scores["Average Predicted Team Score"]
delta_scores["delta_Average_Opponent"] = scores["Opponent_Score"] - scores["Average Predicted Opponent Score"]
display(delta_scores)

# Calculate absolute averages for all columns
absolute_averages = delta_scores[["delta_LR_Team", "delta_LR_Opponent",
                                  "delta_GAM_Team", "delta_GAM_Opponent",
                                  "delta_LightGBM_Team", "delta_LightGBM_Opponent",
                                  "delta_CATBoost_Team", "delta_CATBoost_Opponent",
                                  "delta_Average_Team", "delta_Average_Opponent"]].abs().mean()

# Display the result
print(absolute_averages)

Unnamed: 0,Team,Opponent,delta_LR_Team,delta_LR_Opponent,delta_GAM_Team,delta_GAM_Opponent,delta_LightGBM_Team,delta_LightGBM_Opponent,delta_CATBoost_Team,delta_CATBoost_Opponent,delta_Average_Team,delta_Average_Opponent
0,Richmond,Florida Gulf Coast,-11.962304,5.433847,-15.571775,0.282942,-14.80175,1.94663,-16.387859,1.375219,-14.680922,2.259659
1,San Diego St.,Utah St.,-6.411362,-10.982591,-7.549167,-4.978822,-7.497518,-10.941018,-5.778877,-9.732662,-6.809231,-9.158773
2,Georgia Tech,Alabama A&M,20.779075,-13.028484,19.547656,-14.911546,21.545942,-15.586851,17.532549,-16.997775,19.851305,-15.131164
3,San Jose St.,Boise St.,-2.539018,-4.020569,-4.484949,-2.620047,-4.471292,-5.944647,-2.4537,-6.521045,-3.48724,-4.776577
4,Colorado St.,New Mexico,-2.23183,0.642207,-3.458083,1.833217,-3.439744,-1.632521,-3.174226,-1.042513,-3.075971,-0.049903
5,UCLA,Gonzaga,-11.860674,-20.679081,-12.768793,-12.742317,-9.720624,-15.176006,-8.422871,-15.638352,-10.69324,-16.058939
6,Wyoming,Nevada,-7.424835,-17.340145,-7.902518,-13.043515,-6.27293,-18.688164,-6.850501,-18.16842,-7.112696,-16.810061
7,Texas A&M,Abilene Christian,16.502828,-12.725882,15.245615,-9.051629,15.816132,-13.742042,15.780095,-13.07466,15.836167,-12.148553
8,Louisville,Eastern Kentucky,3.655902,5.49384,0.609693,5.172751,0.241759,2.884434,0.042442,3.401229,1.137449,4.238064
9,Maryland,Maryland Eastern Shore,-1.766879,7.218221,-2.45832,5.933816,-1.740017,4.793086,-2.352031,4.160383,-2.079312,5.526376


delta_LR_Team              8.192577
delta_LR_Opponent          7.935679
delta_GAM_Team             8.214028
delta_GAM_Opponent         5.732896
delta_LightGBM_Team        8.043358
delta_LightGBM_Opponent    7.490098
delta_CATBoost_Team        7.494337
delta_CATBoost_Opponent    7.467735
delta_Average_Team         7.947286
delta_Average_Opponent     6.866761
dtype: float64
