In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor
import numpy as np

team_data = pd.read_csv('../Stats_competition-/team_data_collected_df.csv')

In [2]:
threshold = 6

In [3]:
team_data

Unnamed: 0,Location,Team,Opponent,WAB,ADJO,ADJD,EFF,EFG%,TO%,OR%,...,3P,Opp EFF,Opp EFG%,Opp TO%,Opp OR%,Opp FTR,Opp 2P,Opp 3P,Opponent_score,Team_score
0,H,DUKE,Maine,0.1,125.3,95.2,130.6,64.3,17.7,35.5,...,11-29,84.3,39.5,19.0,19.4,31.6,18-43,3-14,62,96
1,H,DUKE,Army,0.1,124.7,92.3,141.0,61.3,11.3,43.6,...,17-38,81.8,39.1,18.3,23.3,14.1,13-35,8-29,58,100
2,N,DUKE,Kentucky,-0.2,106.8,86.4,95.7,42.3,9.3,25.0,...,4-24,102.3,47.6,14.6,23.3,38.1,15-38,10-25,77,72
3,H,DUKE,Wofford,-0.1,124.7,56.9,133.7,61.3,15.5,45.2,...,16-38,54.4,28.9,29.5,29.3,5.3,9-24,5-33,35,86
4,A,DUKE,Arizona,0.6,111.7,75.9,101.9,50.0,20.7,35.1,...,9-25,81.2,45.3,22.2,16.7,20.8,15-30,6-23,55,69
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
300,N,Michigan,Xavier,0.3,110.9,73.9,109.9,58.2,14.1,20.0,...,11-22,74.7,41.4,26.8,25.6,15.5,12-35,8-23,53,78
301,A,Michigan,Wisconsin,1.0,102.7,84.4,99.3,44.1,17.8,33.3,...,6-25,94.8,39.1,14.8,34.8,29.7,16-37,6-27,64,67
302,H,Michigan,Iowa,1.4,121.1,99.5,114.5,58.6,22.9,42.3,...,4-20,111.8,47.4,5.4,30.6,22.1,23-50,9-27,83,85
303,N,Michigan,Arkansas,0.9,123.6,108.4,112.3,64.4,21.9,26.7,...,8-18,114.8,56.8,18.1,33.3,33.3,24-43,9-23,89,87


In [4]:
team_data['Location'] = np.where(team_data['Location'] == 'N', 0, np.where(team_data['Location'] == 'H', 1, -1))

In [5]:
columns_to_convert = ['Location','ADJO', 'ADJD', 'EFG%',
       'TO%', 'OR%', 'FTR', 'Opp EFG%', 'Opp TO%',
       'Opp OR%', 'Opp FTR']
for col in columns_to_convert:
    team_data[col] = pd.to_numeric(team_data[col], errors='coerce')

In [6]:
team_data_cleaned = team_data.dropna()

X = team_data_cleaned[columns_to_convert]
y_team = team_data_cleaned['Team_score']
y_opp = team_data_cleaned['Opponent_score']

X_train, X_test, y_team_train, y_team_test = train_test_split(X, y_team, test_size=0.2, random_state=42)
y_opp_train, y_opp_test = train_test_split(y_opp, test_size=0.2, random_state=42)

model_team = LinearRegression()
model_team.fit(X_train, y_team_train)
team_pred = model_team.predict(X_test)

model_opp = LinearRegression()
model_opp.fit(X_train, y_opp_train)
opp_pred = model_opp.predict(X_test)

team_rmse = mean_squared_error(y_team_test, team_pred, squared=False)
opp_rmse = mean_squared_error(y_opp_test, opp_pred, squared=False)
print(f"Team Score RMSE: {team_rmse}, Opponent Score RMSE: {opp_rmse}")


Team Score RMSE: 5.694257875441654, Opponent Score RMSE: 5.397831922401339




In [7]:

team_accuracy = (abs(team_pred - y_team_test) <= threshold).mean() * 100
opp_accuracy = (abs(opp_pred - y_opp_test) <= threshold).mean() * 100
print(f"Team Score Accuracy: {team_accuracy:.2f}%")
print(f"Opponent Score Accuracy: {opp_accuracy:.2f}%")

Team Score Accuracy: 68.85%
Opponent Score Accuracy: 73.77%


In [8]:
rf_model_team = RandomForestRegressor(random_state=42)
rf_model_team.fit(X_train, y_team_train)
rf_team_pred = rf_model_team.predict(X_test)

rf_model_opp = RandomForestRegressor(random_state=42)
rf_model_opp.fit(X_train, y_opp_train)
rf_opp_pred = rf_model_opp.predict(X_test)

team_rf_rmse = mean_squared_error(y_team_test, rf_team_pred, squared=False)
opp_rf_rmse = mean_squared_error(y_opp_test, rf_opp_pred, squared=False)
print(f"Team Score RMSE: {team_rf_rmse}, Opponent Score RMSE: {opp_rf_rmse}")


Team Score RMSE: 6.759762096890494, Opponent Score RMSE: 5.717229353052915




In [9]:

team_accuracy = (abs(rf_team_pred - y_team_test) <= threshold).mean() * 100
opp_accuracy = (abs(rf_opp_pred - y_opp_test) <= threshold).mean() * 100
print(f"Team Score Accuracy: {team_accuracy:.2f}%")
print(f"Opponent Score Accuracy: {opp_accuracy:.2f}%")

Team Score Accuracy: 55.74%
Opponent Score Accuracy: 70.49%


In [10]:
import xgboost as xgb

X_train, X_test, y_team_train, y_team_test = train_test_split(X, y_team, test_size=0.2, random_state=42)
X_train_opp, X_test_opp, y_opp_train, y_opp_test = train_test_split(X, y_opp, test_size=0.2, random_state=42)

dtrain_team = xgb.DMatrix(X_train, label=y_team_train)
dtest_team = xgb.DMatrix(X_test, label=y_team_test)

dtrain_opp = xgb.DMatrix(X_train_opp, label=y_opp_train)
dtest_opp = xgb.DMatrix(X_test_opp, label=y_opp_test)

params = {
    'objective': 'reg:squarederror',  
    'eval_metric': 'rmse',           
    'learning_rate': 0.1,            
    'max_depth': 6,                  
    'subsample': 0.8,                
    'colsample_bytree': 0.8          
}

team_model = xgb.train(params, dtrain_team, num_boost_round=100, evals=[(dtest_team, 'test')], early_stopping_rounds=10)

opp_model = xgb.train(params, dtrain_opp, num_boost_round=100, evals=[(dtest_opp, 'test')], early_stopping_rounds=10)

xg_team_pred = team_model.predict(dtest_team)
xg_opp_pred = opp_model.predict(dtest_opp)

team_rmse = mean_squared_error(y_team_test, xg_team_pred, squared=False)
opp_rmse = mean_squared_error(y_opp_test, xg_opp_pred, squared=False)

print(f"Team Score RMSE: {team_rmse}")
print(f"Opponent Score RMSE: {opp_rmse}")

XGBoostError: 
XGBoost Library (libxgboost.dylib) could not be loaded.
Likely causes:
  * OpenMP runtime is not installed
    - vcomp140.dll or libgomp-1.dll for Windows
    - libomp.dylib for Mac OSX
    - libgomp.so for Linux and other UNIX-like OSes
    Mac OSX users: Run `brew install libomp` to install OpenMP runtime.

  * You are running 32-bit Python on a 64-bit OS

Error message(s): ["dlopen(/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/xgboost/lib/libxgboost.dylib, 0x0006): Library not loaded: @rpath/libomp.dylib\n  Referenced from: <F2F42313-BF4F-3B95-A853-AE1DE94D4C87> /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/xgboost/lib/libxgboost.dylib\n  Reason: tried: '/opt/homebrew/opt/libomp/lib/libomp.dylib' (no such file), '/System/Volumes/Preboot/Cryptexes/OS/opt/homebrew/opt/libomp/lib/libomp.dylib' (no such file), '/opt/homebrew/opt/libomp/lib/libomp.dylib' (no such file), '/System/Volumes/Preboot/Cryptexes/OS/opt/homebrew/opt/libomp/lib/libomp.dylib' (no such file)"]


In [11]:

team_accuracy = (abs(xg_team_pred - y_team_test) <= threshold).mean() * 100
opp_accuracy = (abs(xg_opp_pred - y_opp_test) <= threshold).mean() * 100
print(f"Team Score Accuracy: {team_accuracy:.2f}%")
print(f"Opponent Score Accuracy: {opp_accuracy:.2f}%")

Team Score Accuracy: 85.71%
Opponent Score Accuracy: 78.57%


In [12]:
predict_games = pd.read_csv('../Stats_competition-/basketball_games_data.csv')

In [13]:
predict_games['Location'] = np.where(predict_games['Location'] == 'Neutral', 0, np.where(predict_games['Location'] == 'Home', 1, -1))

In [14]:
predict_games

Unnamed: 0,Location,Team,Opponent,ADJO,ADJD,EFG%,TO%,OR%,FTR,Opp EFG%,Opp TO%,Opp OR%,Opp FTR
0,0,Michigan,Oklahoma,116.4,97.1,58.0,21.4,36.2,33.2,56.1,16.4,30.5,36.1
1,-1,Butler,Marquette,112.2,102.3,53.0,18.6,30.7,49.7,55.0,13.9,30.2,28.5
2,1,Connecticut,Xavier,121.8,99.6,58.3,15.2,35.2,33.9,54.3,16.6,29.5,35.8
3,-1,Toledo,Houston,110.2,114.1,51.4,12.9,25.9,32.6,52.6,12.9,35.4,29.6
4,-1,Memphis,Virginia,117.7,101.1,54.8,19.6,32.6,46.4,52.0,20.1,24.2,25.8


In [15]:
X = predict_games[columns_to_convert]

In [16]:
X

Unnamed: 0,Location,ADJO,ADJD,EFG%,TO%,OR%,FTR,Opp EFG%,Opp TO%,Opp OR%,Opp FTR
0,0,116.4,97.1,58.0,21.4,36.2,33.2,56.1,16.4,30.5,36.1
1,-1,112.2,102.3,53.0,18.6,30.7,49.7,55.0,13.9,30.2,28.5
2,1,121.8,99.6,58.3,15.2,35.2,33.9,54.3,16.6,29.5,35.8
3,-1,110.2,114.1,51.4,12.9,25.9,32.6,52.6,12.9,35.4,29.6
4,-1,117.7,101.1,54.8,19.6,32.6,46.4,52.0,20.1,24.2,25.8


In [17]:

team_pred_new = model_team.predict(X)
opp_pred_new = model_opp.predict(X)

predictions = pd.DataFrame({
    'Predicted Team Score LR': team_pred_new,
    'Predicted Opponent Score LR': opp_pred_new
})

In [18]:

rf_team_pred_new = rf_model_team.predict(X)
rf_opp_pred_new = rf_model_opp.predict(X)

predictions_rf = pd.DataFrame({
    'Predicted Team Score RF': rf_team_pred_new,
    'Predicted Opponent Score RF': rf_team_pred_new
})

In [19]:
predict_games = pd.concat([predict_games, predictions], axis=1)
predict_games = pd.concat([predict_games, predictions_rf], axis=1)

In [21]:
predict_games[['Team', 'Opponent', 'Predicted Team Score LR', 'Predicted Opponent Score LR','Predicted Team Score RF','Predicted Opponent Score RF']]

Unnamed: 0,Team,Opponent,Predicted Team Score LR,Predicted Opponent Score LR,Predicted Team Score RF,Predicted Opponent Score RF
0,Michigan,Oklahoma,80.92906,80.975608,80.85,80.85
1,Butler,Marquette,77.681284,80.163101,78.43,78.43
2,Connecticut,Xavier,86.850363,78.853839,88.04,88.04
3,Toledo,Houston,73.370242,77.781194,81.07,81.07
4,Memphis,Virginia,80.12087,69.905718,77.85,77.85
