In [18]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
import numpy as np

# Lift pandas' truncation limits so rendered frames show every column and row.
pd.options.display.max_columns = None
pd.options.display.max_rows = None

In [19]:
# Read the training workbook and render it for a first look.
data = pd.read_excel('../Stats_competition-/Teams_Data.xlsx')
display(data)

# Keep only the rows that have an Opponent_WAB value, then list the columns.
data = data.loc[data['Opponent_WAB'].notna()]
print(data.columns)

# Every column except the ones listed here is used as a model feature.
# Index.difference returns the remaining names sorted, which fixes the
# feature order used by the cells below.
feature_cols = data.columns.difference([
    'Team',
    'Opponent',
    'Team Score',
    'Opponent Score',
    'Team_TEAM',
    'Team_CONF',
    'Team_G',
    'Opponent_TEAM',
    'Opponent_CONF',
    'Opponent_G',
])
X = data.loc[:, feature_cols]
# Two targets, one column each; they are modelled separately later on.
y = data.loc[:, ['Team Score', 'Opponent Score']]

In [24]:
def train_and_select_best_models(X, y):
    """Fit several regressors per target and keep the one with the lowest hold-out RMSE.

    Each column of ``y`` is treated as an independent regression target. For
    every target the rows are split 80/20 (fixed seed 42), a random forest, a
    gradient-boosting model and a linear regression are fitted on the 80%
    portion, and their RMSE on the remaining 20% is printed.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix, one row per observation.
    y : pandas.DataFrame
        Target columns; one model is selected per column.

    Returns
    -------
    dict
        Maps each column name of ``y`` to the fitted estimator with the lowest
        hold-out RMSE. The returned estimator is the one fitted on the
        training portion only; it is not refitted on all rows.
    """
    models = {}
    for target in y.columns:
        y_target = y[target]
        X_train, X_test, y_train, y_test = train_test_split(
            X, y_target, test_size=0.2, random_state=42
        )

        candidate_models = {
            'RandomForest': RandomForestRegressor(n_estimators=100, random_state=42),
            'GradientBoosting': GradientBoostingRegressor(random_state=42),
            'LinearRegression': LinearRegression()
        }

        best_model = None
        best_score = float('inf')
        for name, model in candidate_models.items():
            model.fit(X_train, y_train)
            y_pred = model.predict(X_test)
            # Take the square root of the MSE explicitly: the `squared=False`
            # keyword was deprecated in scikit-learn 1.4 and removed in 1.6,
            # so relying on it raises TypeError on current releases.
            rmse = float(np.sqrt(mean_squared_error(y_test, y_pred)))
            print(f"{target} - {name}: RMSE = {rmse:.4f}")
            if rmse < best_score:
                best_score = rmse
                best_model = model

        models[target] = best_model

    return models

In [25]:
# Fit the three candidate regressors for each score column and keep the
# lowest-RMSE estimator per target (the RMSE of every candidate is printed).
models = train_and_select_best_models(X, y)

Team Score - RandomForest: RMSE = 14.1992
Team Score - GradientBoosting: RMSE = 16.2793
Team Score - LinearRegression: RMSE = 10.1629
Opponent Score - RandomForest: RMSE = 7.5801
Opponent Score - GradientBoosting: RMSE = 14.8409
Opponent Score - LinearRegression: RMSE = 40.2279




In [26]:
# Show which estimator was selected for each target.
models

{'Team Score': LinearRegression(),
 'Opponent Score': RandomForestRegressor(random_state=42)}

In [27]:
def predict_and_simulate(models, X, n_simulations=10000, residual_std=10, rng=None):
    """Predict every target for every row of ``X`` and attach a simulated 95% interval.

    For each row and each target, the model's point prediction is used as the
    mean of a normal distribution with standard deviation ``residual_std``;
    ``n_simulations`` draws are taken and their 2.5th and 97.5th percentiles
    are reported as the interval.

    Parameters
    ----------
    models : dict
        Maps a target name to a fitted estimator exposing ``predict``.
    X : pandas.DataFrame
        Feature rows to score. It is passed to ``predict`` as a DataFrame so
        estimators fitted on named columns receive the same column names.
    n_simulations : int, default 10000
        Number of normal draws per (row, target) pair.
    residual_std : float, default 10
        Standard deviation assumed for the prediction error. The default is
        the value that was previously hard-coded; it is not estimated from
        the fitted models.
    rng : numpy.random.Generator or numpy.random.RandomState, optional
        Source of randomness. When omitted the global ``np.random`` state is
        used, so results are only repeatable if that state was seeded.

    Returns
    -------
    pandas.DataFrame
        One row per row of ``X`` with an ``Index`` column (the original row
        label) and, per target, ``<target>_Predicted_Score`` and
        ``<target>_Confidence_Interval`` (a ``(lower, upper)`` tuple).
        An empty ``X`` yields an empty frame.
    """
    sampler = np.random if rng is None else rng

    # Nothing to score: return the same empty frame the row loop would produce
    # instead of handing an empty matrix to the estimators.
    if len(X) == 0:
        return pd.DataFrame([])

    # One batched call per model on the DataFrame itself. Predicting from
    # `[row]` (a list holding a Series) dropped the column names, which makes
    # scikit-learn warn for estimators fitted on a DataFrame, and iterrows()
    # additionally coerces every row to a single dtype.
    predictions = {
        target: np.asarray(model.predict(X)) for target, model in models.items()
    }

    results = []
    for pos, idx in enumerate(X.index):
        row_results = {'Index': idx}

        # Draw order (row by row, target by target) matches the original loop.
        for target in models:
            score_pred = predictions[target][pos]

            simulated_scores = sampler.normal(score_pred, residual_std, n_simulations)
            lower, upper = np.percentile(simulated_scores, [2.5, 97.5])

            row_results[f'{target}_Predicted_Score'] = score_pred
            row_results[f'{target}_Confidence_Interval'] = (lower, upper)

        results.append(row_results)

    return pd.DataFrame(results)

In [28]:
# Load the workbook whose rows are scored by the selected models in the cells below.
test_df = pd.read_excel('../Stats_competition-/Teams_Data_test.xlsx')

In [29]:
# Inspect the loaded test rows.
test_df

Unnamed: 0,Team,Opponent,Team_ADJOE,Team_ADJDE,Team_BARTHAG,Team_EFG_O,Team_EFG_D,Team_TOR,Team_TORD,Team_ORB,Team_DRB,Team_FTR,Team_FTRD,Team_2P_O,Team_2P_D,Team_3P_O,Team_3P_D,Team_ADJ_T,Team_WAB,Opponent_ADJOE,Opponent_ADJDE,Opponent_BARTHAG,Opponent_EFG_O,Opponent_EFG_D,Opponent_TOR,Opponent_TORD,Opponent_ORB,Opponent_DRB,Opponent_FTR,Opponent_FTRD,Opponent_2P_O,Opponent_2P_D,Opponent_3P_O,Opponent_3P_D,Opponent_ADJ_T,Opponent_WAB
0,Duke,Auburn,117.516,88.61,0.96261,55.352,40.75,16.612,21.354,33.694,24.025,26.7306,27.162,56.457,40.34,36.176,27.527,68.0197,0.4543,123.71,94.411,0.95714,60.64,45.042,13.929,16.0252,32.3123,25.443,29.6264,30.6121,65.71,42.825,36.373,33.9211,68.9139,3.01
1,Duke,Louisville,117.516,88.61,0.96261,55.352,40.75,16.612,21.354,33.694,24.025,26.7306,27.162,56.457,40.34,36.176,27.527,68.0197,0.4543,113.054,97.95,0.838645,52.113,46.257,18.5222,24.95,36.241,30.922,33.9174,33.2172,61.68,43.933,28.7298,33.1179,70.178,0.1953
2,North Carolina,Alabama,119.16,98.759,0.896518,54.666,50.2171,12.74,14.4315,25.9279,28.0113,43.244,28.181,53.5121,49.1136,37.746,34.6237,73.09,0.0463,121.73,96.833,0.93278,54.764,45.551,16.9132,13.334,35.65,28.714,46.323,38.6269,60.314,46.269,32.5194,29.57,73.27,1.86
3,North Carolina,Georgia Tech,119.16,98.759,0.896518,54.666,50.2171,12.74,14.4315,25.9279,28.0113,43.244,28.181,53.5121,49.1136,37.746,34.6237,73.09,0.0463,109.691,103.3127,0.664711,50.4179,47.8101,14.237,14.6305,32.5121,25.646,31.5224,37.4249,51.4173,44.946,32.6192,35.3255,70.847,-1.9194
4,N.C. State,Texas,110.285,98.355,0.787168,52.312,45.145,15.157,22.133,29.5192,33.529,37.499,30.0109,56.167,48.8125,29.5284,26.014,68.4171,-0.594,115.036,95.519,0.894621,56.729,42.619,12.74,19.4102,28.3216,27.184,38.091,28.79,58.234,38.21,35.79,33.6196,68.616,0.3049
5,N.C. State,Florida State,110.285,98.355,0.787168,52.312,45.145,15.157,22.133,29.5192,33.529,37.499,30.0109,56.167,48.8125,29.5284,26.014,68.4171,-0.594,107.6121,96.024,0.78687,52.6108,44.837,17.9186,25.43,29.9181,30.4201,45.527,40.4296,54.798,49.4142,32.5194,26.417,72.12,0.1854


In [30]:
# Select the same feature columns, in the same order, that X was built from for training.
test_X = test_df[feature_cols] 

In [31]:
# Inspect the feature matrix that is handed to the models.
test_X

Unnamed: 0,Opponent_2P_D,Opponent_2P_O,Opponent_3P_D,Opponent_3P_O,Opponent_ADJDE,Opponent_ADJOE,Opponent_ADJ_T,Opponent_BARTHAG,Opponent_DRB,Opponent_EFG_D,Opponent_EFG_O,Opponent_FTR,Opponent_FTRD,Opponent_ORB,Opponent_TOR,Opponent_TORD,Opponent_WAB,Team_2P_D,Team_2P_O,Team_3P_D,Team_3P_O,Team_ADJDE,Team_ADJOE,Team_ADJ_T,Team_BARTHAG,Team_DRB,Team_EFG_D,Team_EFG_O,Team_FTR,Team_FTRD,Team_ORB,Team_TOR,Team_TORD,Team_WAB
0,42.825,65.71,33.9211,36.373,94.411,123.71,68.9139,0.95714,25.443,45.042,60.64,29.6264,30.6121,32.3123,13.929,16.0252,3.01,40.34,56.457,27.527,36.176,88.61,117.516,68.0197,0.96261,24.025,40.75,55.352,26.7306,27.162,33.694,16.612,21.354,0.4543
1,43.933,61.68,33.1179,28.7298,97.95,113.054,70.178,0.838645,30.922,46.257,52.113,33.9174,33.2172,36.241,18.5222,24.95,0.1953,40.34,56.457,27.527,36.176,88.61,117.516,68.0197,0.96261,24.025,40.75,55.352,26.7306,27.162,33.694,16.612,21.354,0.4543
2,46.269,60.314,29.57,32.5194,96.833,121.73,73.27,0.93278,28.714,45.551,54.764,46.323,38.6269,35.65,16.9132,13.334,1.86,49.1136,53.5121,34.6237,37.746,98.759,119.16,73.09,0.896518,28.0113,50.2171,54.666,43.244,28.181,25.9279,12.74,14.4315,0.0463
3,44.946,51.4173,35.3255,32.6192,103.3127,109.691,70.847,0.664711,25.646,47.8101,50.4179,31.5224,37.4249,32.5121,14.237,14.6305,-1.9194,49.1136,53.5121,34.6237,37.746,98.759,119.16,73.09,0.896518,28.0113,50.2171,54.666,43.244,28.181,25.9279,12.74,14.4315,0.0463
4,38.21,58.234,33.6196,35.79,95.519,115.036,68.616,0.894621,27.184,42.619,56.729,38.091,28.79,28.3216,12.74,19.4102,0.3049,48.8125,56.167,26.014,29.5284,98.355,110.285,68.4171,0.787168,33.529,45.145,52.312,37.499,30.0109,29.5192,15.157,22.133,-0.594
5,49.4142,54.798,26.417,32.5194,96.024,107.6121,72.12,0.78687,30.4201,44.837,52.6108,45.527,40.4296,29.9181,17.9186,25.43,0.1854,48.8125,56.167,26.014,29.5284,98.355,110.285,68.4171,0.787168,33.529,45.145,52.312,37.499,30.0109,29.5192,15.157,22.133,-0.594


In [32]:
# Predict both scores for every test row and attach a simulated interval
# (2.5th-97.5th percentile of the default 10,000 normal draws per prediction).
results_df = predict_and_simulate(models, test_X)



In [33]:
# Put the prediction columns next to the original test rows; the helper
# 'Index' column is dropped first and the two frames are joined on row labels.
df_combined = pd.concat(
    [test_df, results_df.drop(columns='Index')],
    axis=1,
)

In [34]:
# Show only the two team-name columns with each side's predicted score and interval.
df_combined[[
    'Team',
    'Opponent',
    'Team Score_Predicted_Score',
    'Team Score_Confidence_Interval',
    'Opponent Score_Predicted_Score',
    'Opponent Score_Confidence_Interval',
]]

Unnamed: 0,Team,Opponent,Team Score_Predicted_Score,Team Score_Confidence_Interval,Opponent Score_Predicted_Score,Opponent Score_Confidence_Interval
0,Duke,Auburn,55.0,"(34.83037428908654, 74.29454176321431)",78.66,"(59.05704225510345, 98.25232738339608)"
1,Duke,Louisville,96.802176,"(77.17922515368281, 116.58692270852146)",73.18,"(53.7570645001215, 92.23538362600029)"
2,North Carolina,Alabama,66.726569,"(47.448444148932296, 86.12624019304728)",77.6,"(57.454986632525625, 97.11397485418486)"
3,North Carolina,Georgia Tech,84.338049,"(64.65532049913325, 104.11011046911825)",65.73,"(46.22729595130238, 85.33924657193872)"
4,N.C. State,Texas,66.677328,"(47.2583872532297, 85.74360609003529)",79.15,"(59.58906284750917, 98.25523849430033)"
5,N.C. State,Florida State,82.207192,"(62.8661142970186, 101.53363657163501)",73.99,"(54.16249654140884, 93.23483996962798)"
