In [11]:
# Implementation of LightGBM model. Also include other models from model_test.ipynb for comparison

In [12]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from pygam import LinearGAM, s
import lightgbm as lgb

In [13]:
# Display all columns and rows
pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", None)

# Load and display dataset
team_data = pd.read_csv("./train_data.csv")
display(team_data.head())

Unnamed: 0,game_id,date,Team,Opponent,Team_Score,Opponent_Score,Location,ADJO,ADJD,EFF,EFG_pct,TO_Pct,OR_Pct,FTR_Pct,ThreePointer_Pct,opp_ADJO,opp_ADJD,Opp_EFF,Opp_EFG_Pct,Opp_TO_Pct,Opp_OR_Pct,Opp_FTR_Pct,Opp_ThreePointer_Pct
0,Abilene ChristianBaylor12-9,2024-12-09,Abilene Christian,Baylor,57,88,A,94.8,112.7,86.7,36.4,15.2,27.0,41.8,14.29,129.2,94.3,133.8,66.1,13.7,35.7,27.1,44.44
1,Abilene ChristianKennesaw St.11-20,2024-11-20,Abilene Christian,Kennesaw St.,78,84,A,108.5,115.6,107.1,50.0,9.6,20.0,50.8,33.33,111.8,115.0,115.3,62.2,26.1,31.8,100.0,43.75
2,Abilene ChristianMontana St.11-26,2024-11-26,Abilene Christian,Montana St.,59,85,A,90.6,129.0,91.6,44.0,12.4,20.6,20.7,16.67,130.3,96.3,131.9,75.0,20.2,25.0,26.0,65.38
3,Abilene ChristianNebraska Omaha11-30,2024-11-30,Abilene Christian,Nebraska Omaha,71,55,A,105.5,84.2,107.2,51.9,15.1,24.1,35.2,30.77,80.6,114.7,83.1,42.0,24.2,17.2,54.5,31.25
4,Abilene ChristianNew Mexico St.12-4,2024-12-04,Abilene Christian,New Mexico St.,78,70,A,117.8,107.2,117.8,58.9,16.6,32.0,77.8,43.75,105.0,123.6,105.7,51.0,18.1,27.6,58.0,33.33


In [14]:
# Data Cleaning and Feature Engineering
team_data["Location"] = np.where(
    team_data["Location"] == "N", 0, np.where(team_data["Location"] == "H", 1, -1)
)

columns_to_convert = [
    "Location",
    "ADJO",
    "ADJD",
    "EFG_pct",
    "TO_Pct",
    "OR_Pct",
    "FTR_Pct",
    "opp_ADJO",
    "opp_ADJD",
    "Opp_EFG_Pct",
    "Opp_TO_Pct",
    "Opp_OR_Pct",
    "Opp_FTR_Pct",
]

for col in columns_to_convert:
    team_data[col] = pd.to_numeric(team_data[col], errors="coerce")

# Drop rows with missing values
team_data_cleaned = team_data.dropna()

# Define features (X) and target variables (y)
X = team_data_cleaned[columns_to_convert]
y_team = team_data_cleaned["Team_Score"]
y_opp = team_data_cleaned["Opponent_Score"]

# Train-test split
X_train, X_test, y_team_train, y_team_test = train_test_split(
    X, y_team, test_size=0.2, random_state=42
)
y_opp_train, y_opp_test = train_test_split(y_opp, test_size=0.2, random_state=42)

In [15]:
# Linear Regression Models
model_team = LinearRegression()
model_team.fit(X_train, y_team_train)
team_pred = model_team.predict(X_test)

model_opp = LinearRegression()
model_opp.fit(X_train, y_opp_train)
opp_pred = model_opp.predict(X_test)

# Calculate RMSE and Accuracy for Linear Regression
threshold = 6
team_rmse = mean_squared_error(y_team_test, team_pred, squared=False)
opp_rmse = mean_squared_error(y_opp_test, opp_pred, squared=False)
print(f"Team Score RMSE: {team_rmse}, Opponent Score RMSE: {opp_rmse}")

team_accuracy = (abs(team_pred - y_team_test) <= threshold).mean() * 100
opp_accuracy = (abs(opp_pred - y_opp_test) <= threshold).mean() * 100
print(f"Team Score Accuracy: {team_accuracy:.2f}%")
print(f"Opponent Score Accuracy: {opp_accuracy:.2f}%")

# PyGAM Models
# Fit PyGAM for Team Score
gam_team = LinearGAM(s(0) + s(1) + s(2) + s(3) + s(4) + s(5) + s(6) + s(7) + s(8) + s(9) + s(10)).fit(X_train, y_team_train)
team_gam_pred = gam_team.predict(X_test)

# Fit PyGAM for Opponent Score
gam_opp = LinearGAM(s(0) + s(1) + s(2) + s(3) + s(4) + s(5) + s(6) + s(7) + s(8) + s(9) + s(10)).fit(X_train, y_opp_train)
opp_gam_pred = gam_opp.predict(X_test)

# Calculate RMSE and Accuracy for PyGAM
team_gam_rmse = mean_squared_error(y_team_test, team_gam_pred, squared=False)
opp_gam_rmse = mean_squared_error(y_opp_test, opp_gam_pred, squared=False)
print(f"Team Score RMSE (PyGAM): {team_gam_rmse}")
print(f"Opponent Score RMSE (PyGAM): {opp_gam_rmse}")

team_gam_accuracy = (abs(team_gam_pred - y_team_test) <= threshold).mean() * 100
opp_gam_accuracy = (abs(opp_gam_pred - y_opp_test) <= threshold).mean() * 100
print(f"Team Score Accuracy (PyGAM): {team_gam_accuracy:.2f}%")
print(f"Opponent Score Accuracy (PyGAM): {opp_gam_accuracy:.2f}%")





Team Score RMSE: 6.281968476868378, Opponent Score RMSE: 6.947681965849563
Team Score Accuracy: 71.91%
Opponent Score Accuracy: 64.41%
Team Score RMSE (PyGAM): 6.3413843407155674
Opponent Score RMSE (PyGAM): 7.479263572159282
Team Score Accuracy (PyGAM): 70.94%
Opponent Score Accuracy (PyGAM): 60.29%




In [16]:
# LightGBM Models
# LightGBM model for Team Score prediction
lgb_model_team = lgb.LGBMRegressor(objective='regression', boosting_type='gbdt', num_leaves=31, learning_rate=0.1, n_estimators=100)
lgb_model_team.fit(X_train, y_team_train)
team_lgb_pred = lgb_model_team.predict(X_test)

# LightGBM model for Opponent Score prediction
lgb_model_opp = lgb.LGBMRegressor(objective='regression', boosting_type='gbdt', num_leaves=31, learning_rate=0.1, n_estimators=100)
lgb_model_opp.fit(X_train, y_opp_train)
opp_lgb_pred = lgb_model_opp.predict(X_test)

# RMSE and Accuracy for LightGBM
team_lgb_rmse = mean_squared_error(y_team_test, team_lgb_pred, squared=False)
opp_lgb_rmse = mean_squared_error(y_opp_test, opp_lgb_pred, squared=False)
print(f"Team Score RMSE (LightGBM): {team_lgb_rmse}")
print(f"Opponent Score RMSE (LightGBM): {opp_lgb_rmse}")

team_lgb_accuracy = (abs(team_lgb_pred - y_team_test) <= threshold).mean() * 100
opp_lgb_accuracy = (abs(opp_lgb_pred - y_opp_test) <= threshold).mean() * 100
print(f"Team Score Accuracy (LightGBM): {team_lgb_accuracy:.2f}%")
print(f"Opponent Score Accuracy (LightGBM): {opp_lgb_accuracy:.2f}%")

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000135 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2947
[LightGBM] [Info] Number of data points in the train set: 1648, number of used features: 13
[LightGBM] [Info] Start training from score 68.727549
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000300 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2947
[LightGBM] [Info] Number of data points in the train set: 1648, number of used features: 13
[LightGBM] [Info] Start training from score 77.342840
Team Score RMSE (LightGBM): 6.344907203844792
Opponent Score RMSE (LightGBM): 7.011691025960997
Team Score Accuracy (LightGBM): 69.25%
Opponent Score Accuracy (LightGBM): 65.62%




In [17]:
# Predict scores using all models
predict_games = pd.read_csv("./test_file_29th.csv")
predict_games["Location"] = np.where(
    predict_games["Location"] == "N",
    0,
    np.where(predict_games["Location"] == "H", 1, -1),
)

predict_games.rename(columns={
    'Opp_ADJO': 'opp_ADJO',
    'Opp_ADJD': 'opp_ADJD',
    'Opp_EFG_pct': 'Opp_EFG_Pct'
}, inplace=True)

X = predict_games[columns_to_convert]

# Predictions
team_pred_new = model_team.predict(X)
opp_pred_new = model_opp.predict(X)
team_gam_pred_new = gam_team.predict(X)
opp_gam_pred_new = gam_opp.predict(X)
team_lgb_pred_new = lgb_model_team.predict(X)
opp_lgb_pred_new = lgb_model_opp.predict(X)

# Add predictions to DataFrame
predict_games["Predicted Team Score LR"] = team_pred_new
predict_games["Predicted Opponent Score LR"] = opp_pred_new
predict_games["Predicted Team Score GAM"] = team_gam_pred_new
predict_games["Predicted Opponent Score GAM"] = opp_gam_pred_new
predict_games["Predicted Team Score LightGBM"] = team_lgb_pred_new
predict_games["Predicted Opponent Score LightGBM"] = opp_lgb_pred_new

# Calculate average predictions
predict_games["Average Predicted Team Score"] = predict_games[
    ["Predicted Team Score LR", "Predicted Team Score GAM", "Predicted Team Score LightGBM"]
].mean(axis=1)

predict_games["Average Predicted Opponent Score"] = predict_games[
    ["Predicted Opponent Score LR", "Predicted Opponent Score GAM", "Predicted Opponent Score LightGBM"]
].mean(axis=1)


In [18]:
# Display final predictions
predicted_score = predict_games[[
    "Team",
    "Opponent",
    "Predicted Team Score LR",
    "Predicted Opponent Score LR",
    "Predicted Team Score GAM",
    "Predicted Opponent Score GAM",
    "Predicted Team Score LightGBM",
    "Predicted Opponent Score LightGBM",
    "Average Predicted Team Score",
    "Average Predicted Opponent Score",
]]

display(predicted_score)

Unnamed: 0,Team,Opponent,Predicted Team Score LR,Predicted Opponent Score LR,Predicted Team Score GAM,Predicted Opponent Score GAM,Predicted Team Score LightGBM,Predicted Opponent Score LightGBM,Average Predicted Team Score,Average Predicted Opponent Score
0,Texas St.,UT Arlington,74.582368,73.312475,76.493272,73.296332,74.50499,76.463615,75.193543,74.357474
1,Washington,NJIT,72.879839,60.979325,76.233826,64.495323,75.693793,63.981615,74.935819,63.152088
2,North Carolina,Campbell,78.453123,65.318175,80.537679,66.658136,83.065096,69.168325,80.685299,67.048212
3,Iona,Harvard,66.211335,64.792823,68.197577,68.188348,67.442181,65.299251,67.283698,66.093474
4,Army,UTSA,71.628707,67.579903,73.158456,70.42991,72.956726,69.314077,72.581296,69.107963
5,Northwestern,Northeastern,74.405105,70.470294,75.736617,65.502182,72.308022,67.890735,74.149915,67.954404
6,Alabama,South Dakota St.,80.993224,72.690678,82.526425,68.938729,83.931118,72.553478,82.483589,71.394295
7,Ohio St.,Indiana St.,81.245352,74.920731,82.744055,73.330155,80.995413,72.30059,81.661607,73.517159
8,Illinois,Chicago St.,78.537471,55.120266,79.378805,57.296688,81.336127,61.027251,79.750801,57.814735
9,Oregon,Weber St.,78.373003,69.810048,81.490669,69.130538,82.233911,72.677654,80.699194,70.539413


In [20]:
# Merge and display scores with actual scores as before
actual_scores1 = pd.read_csv("./test_data.csv")

actual_scores = actual_scores1[
    (
        (actual_scores1['Team'].isin(predicted_score['Team']) | actual_scores1['Team'].isin(predicted_score['Opponent'])) &
        (actual_scores1['Opponent'].isin(predicted_score['Team']) | actual_scores1['Opponent'].isin(predicted_score['Opponent']))
    )
]

actual_scores = actual_scores[['Team', 'Opponent', 'Team_Score', 'Opponent_Score']]

score1 = pd.merge(predicted_score, actual_scores, left_on=["Team", "Opponent"], right_on=["Team", "Opponent"], how="inner")
score2 = pd.merge(predicted_score, actual_scores, left_on=["Team"], right_on=["Opponent"], how="inner")

score2.rename(columns={"Team_x": "Team", "Opponent_x": "Opponent",
                       "Team_y": "Opponent1", "Opponent_y": "Team1",
                       "Team_Score": "Opponent_Score",
                       "Opponent_Score": "Team_Score"}, inplace=True)

score1 = score1[["Team", "Opponent", "Team_Score", "Opponent_Score",
                 "Predicted Team Score LR", "Predicted Opponent Score LR", 
                 "Predicted Team Score GAM", "Predicted Opponent Score GAM",
                 "Predicted Team Score LightGBM", "Predicted Opponent Score LightGBM",
                 "Average Predicted Team Score", "Average Predicted Opponent Score"]]

score2 = score2[["Team", "Opponent", "Team_Score", "Opponent_Score",
                 "Predicted Team Score LR", "Predicted Opponent Score LR", 
                 "Predicted Team Score GAM", "Predicted Opponent Score GAM",
                 "Predicted Team Score LightGBM", "Predicted Opponent Score LightGBM",
                 "Average Predicted Team Score", "Average Predicted Opponent Score"]]

scores = pd.concat([score1, score2], ignore_index=True)

display(scores)


Unnamed: 0,Team,Opponent,Team_Score,Opponent_Score,Predicted Team Score LR,Predicted Opponent Score LR,Predicted Team Score GAM,Predicted Opponent Score GAM,Predicted Team Score LightGBM,Predicted Opponent Score LightGBM,Average Predicted Team Score,Average Predicted Opponent Score
0,Texas St.,UT Arlington,72,80,74.582368,73.312475,76.493272,73.296332,74.50499,76.463615,75.193543,74.357474
1,Washington,NJIT,90,53,72.879839,60.979325,76.233826,64.495323,75.693793,63.981615,74.935819,63.152088
2,North Carolina,Campbell,97,81,78.453123,65.318175,80.537679,66.658136,83.065096,69.168325,80.685299,67.048212
3,Iona,Harvard,61,67,66.211335,64.792823,68.197577,68.188348,67.442181,65.299251,67.283698,66.093474
4,Army,UTSA,78,75,71.628707,67.579903,73.158456,70.42991,72.956726,69.314077,72.581296,69.107963
5,Northwestern,Northeastern,85,60,74.405105,70.470294,75.736617,65.502182,72.308022,67.890735,74.149915,67.954404
6,Alabama,South Dakota St.,105,82,80.993224,72.690678,82.526425,68.938729,83.931118,72.553478,82.483589,71.394295
7,Ohio St.,Indiana St.,103,83,81.245352,74.920731,82.744055,73.330155,80.995413,72.30059,81.661607,73.517159
8,Illinois,Chicago St.,117,64,78.537471,55.120266,79.378805,57.296688,81.336127,61.027251,79.750801,57.814735
9,Oregon,Weber St.,89,49,78.373003,69.810048,81.490669,69.130538,82.233911,72.677654,80.699194,70.539413
