In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor

team_data = pd.read_csv('../Stats_competition-/team_data_collected_df.csv')

In [2]:
team_data

Unnamed: 0,Location,Team,Opponent,WAB,ADJO,ADJD,EFF,EFG%,TO%,OR%,...,3P,Opp EFF,Opp EFG%,Opp TO%,Opp OR%,Opp FTR,Opp 2P,Opp 3P,Opponent_score,Team_score
0,H,DUKE,Maine,0.1,125.3,95.2,130.6,64.3,17.7,35.5,...,11-29,84.3,39.5,19.0,19.4,31.6,18-43,3-14,62,96
1,H,DUKE,Army,0.1,124.7,92.3,141.0,61.3,11.3,43.6,...,17-38,81.8,39.1,18.3,23.3,14.1,13-35,8-29,58,100
2,N,DUKE,Kentucky,-0.2,106.8,86.4,95.7,42.3,9.3,25.0,...,4-24,102.3,47.6,14.6,23.3,38.1,15-38,10-25,77,72
3,H,DUKE,Wofford,-0.1,124.7,56.9,133.7,61.3,15.5,45.2,...,16-38,54.4,28.9,29.5,29.3,5.3,9-24,5-33,35,86
4,A,DUKE,Arizona,0.6,111.7,75.9,101.9,50.0,20.7,35.1,...,9-25,81.2,45.3,22.2,16.7,20.8,15-30,6-23,55,69
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
203,H,Miami FL,Charleston Southern,-2.6,109.1,130.5,116.3,57.6,14.7,25.8,...,12-33,122.2,59.5,17.7,36.4,17.5,21-39,11-24,83,79
204,H,Miami FL,Arkansas,-3.3,120.1,105.1,108.1,55.8,16.3,25.0,...,9-29,112.5,57.3,16.3,26.5,14.5,19-39,11-23,76,73
205,H,Miami FL,Clemson,-3.9,102.7,100.1,92.8,44.6,21.9,35.3,...,4-25,109.7,47.4,11.8,33.3,20.7,14-31,9-27,65,55
206,N,Miami FL,Tennessee,-4.0,109.0,99.2,91.6,45.9,20.7,26.8,...,8-33,110.9,57.7,16.3,18.5,34.6,15-27,10-25,75,62


In [3]:
columns_to_convert = ['WAB', 'ADJO', 'ADJD', 'EFF', 'EFG%',
       'TO%', 'OR%', 'FTR', 'Opp EFF', 'Opp EFG%', 'Opp TO%',
       'Opp OR%', 'Opp FTR']
for col in columns_to_convert:
    team_data[col] = pd.to_numeric(team_data[col], errors='coerce')

In [4]:
team_data_cleaned = team_data.dropna()

X = team_data_cleaned[columns_to_convert]
y_team = team_data_cleaned['Team_score']
y_opp = team_data_cleaned['Opponent_score']

X_train, X_test, y_team_train, y_team_test = train_test_split(X, y_team, test_size=0.2, random_state=42)
y_opp_train, y_opp_test = train_test_split(y_opp, test_size=0.2, random_state=42)

model_team = LinearRegression()
model_team.fit(X_train, y_team_train)
team_pred = model_team.predict(X_test)

model_opp = LinearRegression()
model_opp.fit(X_train, y_opp_train)
opp_pred = model_opp.predict(X_test)

team_rmse = mean_squared_error(y_team_test, team_pred, squared=False)
opp_rmse = mean_squared_error(y_opp_test, opp_pred, squared=False)
print(f"Team Score RMSE: {team_rmse}, Opponent Score RMSE: {opp_rmse}")


Team Score RMSE: 6.042031229716957, Opponent Score RMSE: 5.482247151711573




In [5]:
threshold = 6

team_accuracy = (abs(team_pred - y_team_test) <= threshold).mean() * 100
opp_accuracy = (abs(opp_pred - y_opp_test) <= threshold).mean() * 100
print(f"Team Score Accuracy: {team_accuracy:.2f}%")
print(f"Opponent Score Accuracy: {opp_accuracy:.2f}%")

Team Score Accuracy: 71.43%
Opponent Score Accuracy: 73.81%


In [6]:
rf_model_team = RandomForestRegressor(random_state=42)
rf_model_team.fit(X_train, y_team_train)
rf_team_pred = rf_model_team.predict(X_test)

rf_model_opp = RandomForestRegressor(random_state=42)
rf_model_opp.fit(X_train, y_opp_train)
rf_opp_pred = rf_model_opp.predict(X_test)

team_rf_rmse = mean_squared_error(y_team_test, rf_team_pred, squared=False)
opp_rf_rmse = mean_squared_error(y_opp_test, rf_opp_pred, squared=False)
print(f"Team Score RMSE: {team_rf_rmse}, Opponent Score RMSE: {opp_rf_rmse}")


Team Score RMSE: 6.848895635689847, Opponent Score RMSE: 5.515149697920827




In [7]:
threshold = 6

team_accuracy = (abs(rf_team_pred - y_team_test) <= threshold).mean() * 100
opp_accuracy = (abs(rf_opp_pred - y_opp_test) <= threshold).mean() * 100
print(f"Team Score Accuracy: {team_accuracy:.2f}%")
print(f"Opponent Score Accuracy: {opp_accuracy:.2f}%")

Team Score Accuracy: 64.29%
Opponent Score Accuracy: 73.81%


In [8]:
import xgboost as xgb

X_train, X_test, y_team_train, y_team_test = train_test_split(X, y_team, test_size=0.2, random_state=42)
X_train_opp, X_test_opp, y_opp_train, y_opp_test = train_test_split(X, y_opp, test_size=0.2, random_state=42)

dtrain_team = xgb.DMatrix(X_train, label=y_team_train)
dtest_team = xgb.DMatrix(X_test, label=y_team_test)

dtrain_opp = xgb.DMatrix(X_train_opp, label=y_opp_train)
dtest_opp = xgb.DMatrix(X_test_opp, label=y_opp_test)

params = {
    'objective': 'reg:squarederror',  
    'eval_metric': 'rmse',           
    'learning_rate': 0.1,            
    'max_depth': 6,                  
    'subsample': 0.8,                
    'colsample_bytree': 0.8          
}

team_model = xgb.train(params, dtrain_team, num_boost_round=100, evals=[(dtest_team, 'test')], early_stopping_rounds=10)

opp_model = xgb.train(params, dtrain_opp, num_boost_round=100, evals=[(dtest_opp, 'test')], early_stopping_rounds=10)

xg_team_pred = team_model.predict(dtest_team)
xg_opp_pred = opp_model.predict(dtest_opp)

team_rmse = mean_squared_error(y_team_test, xg_team_pred, squared=False)
opp_rmse = mean_squared_error(y_opp_test, xg_opp_pred, squared=False)

print(f"Team Score RMSE: {team_rmse}")
print(f"Opponent Score RMSE: {opp_rmse}")

[0]	test-rmse:12.22225
[1]	test-rmse:11.36482
[2]	test-rmse:10.72736
[3]	test-rmse:10.16507
[4]	test-rmse:9.79569
[5]	test-rmse:9.20061
[6]	test-rmse:8.83179
[7]	test-rmse:8.63630
[8]	test-rmse:8.36079
[9]	test-rmse:8.18608
[10]	test-rmse:7.89600
[11]	test-rmse:7.67426
[12]	test-rmse:7.45242
[13]	test-rmse:7.32343
[14]	test-rmse:7.28128
[15]	test-rmse:7.23096
[16]	test-rmse:7.15244
[17]	test-rmse:7.11076
[18]	test-rmse:7.12298
[19]	test-rmse:7.05691
[20]	test-rmse:7.04043
[21]	test-rmse:7.00271
[22]	test-rmse:6.99123
[23]	test-rmse:6.99681
[24]	test-rmse:6.98287
[25]	test-rmse:6.98273
[26]	test-rmse:6.99547
[27]	test-rmse:7.02367
[28]	test-rmse:7.02914
[29]	test-rmse:7.03097
[30]	test-rmse:7.03707
[31]	test-rmse:7.04666
[32]	test-rmse:7.03709
[33]	test-rmse:7.03870
[34]	test-rmse:7.03964
[35]	test-rmse:7.05179
[0]	test-rmse:13.38104
[1]	test-rmse:12.72558
[2]	test-rmse:11.90560
[3]	test-rmse:11.21446
[4]	test-rmse:10.63679
[5]	test-rmse:10.05780
[6]	test-rmse:9.55879
[7]	test-rmse:9.02



In [9]:
threshold = 6

team_accuracy = (abs(xg_team_pred - y_team_test) <= threshold).mean() * 100
opp_accuracy = (abs(xg_opp_pred - y_opp_test) <= threshold).mean() * 100
print(f"Team Score Accuracy: {team_accuracy:.2f}%")
print(f"Opponent Score Accuracy: {opp_accuracy:.2f}%")

Team Score Accuracy: 64.29%
Opponent Score Accuracy: 71.43%
