In [18]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
import numpy as np

# Lift both pandas display limits so frames are shown without truncating
# columns or rows.
pd.options.display.max_columns = None
pd.options.display.max_rows = None

In [19]:
data = pd.read_excel('../Stats_competition-/Teams_Data.xlsx')
# Preview the first rows and the shape instead of the whole table:
# display.max_rows is unlimited in this notebook, so displaying the full
# frame would render every single row into the cell output.
display(data.head())
print(data.shape)

# Rows without an Opponent_WAB value are discarded before modelling.
data = data.dropna(subset=['Opponent_WAB'])
print(data.columns)

# The two columns the models are trained to predict.
TARGET_COLS = ['Team Score', 'Opponent Score']
# Columns deliberately left out of the feature matrix (in addition to the targets).
EXCLUDED_COLS = ['Team', 'Opponent', 'Team_TEAM', 'Team_CONF', 'Team_G',
                 'Opponent_TEAM', 'Opponent_CONF', 'Opponent_G']

# Every remaining column is used as a feature. Names listed above that are
# absent from the frame are simply ignored by Index.difference.
feature_cols = data.columns.difference(EXCLUDED_COLS + TARGET_COLS)
X = data[feature_cols]
y = data[TARGET_COLS]

In [24]:
def train_and_select_best_models(X, y):
    """Fit several regressors per target column and keep the most accurate one.

    For every column of ``y`` the data is split 80/20 (``random_state=42``);
    a random forest, a gradient-boosting model and a linear regression are
    fitted on the training part, and each model's RMSE on the held-out part
    is printed. The model with the lowest RMSE is kept for that target.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix, one row per sample.
    y : pandas.DataFrame
        One column per target to model.

    Returns
    -------
    dict
        Maps each column name of ``y`` to its best fitted estimator. The
        estimator is fitted on the training split only. The value is ``None``
        if no candidate produced an RMSE below infinity.
    """
    models = {}
    for target in y.columns:
        y_target = y[target]
        X_train, X_test, y_train, y_test = train_test_split(X, y_target, test_size=0.2, random_state=42)

        # Fresh estimators for every target, so a model fitted for one
        # target is never refitted (and overwritten) for the next one.
        candidate_models = {
            'RandomForest': RandomForestRegressor(n_estimators=100, random_state=42),
            'GradientBoosting': GradientBoostingRegressor(random_state=42),
            'LinearRegression': LinearRegression()
        }

        best_model = None
        best_score = float('inf')
        for name, model in candidate_models.items():
            model.fit(X_train, y_train)
            y_pred = model.predict(X_test)
            # Take the square root explicitly: the `squared=False` keyword of
            # mean_squared_error was deprecated in scikit-learn 1.4 and
            # removed in 1.6, whereas this form works on every version.
            rmse = float(np.sqrt(mean_squared_error(y_test, y_pred)))
            print(f"{target} - {name}: RMSE = {rmse:.4f}")
            if rmse < best_score:
                best_score = rmse
                best_model = model

        models[target] = best_model

    return models

In [25]:
# Train the candidate regressors and keep the best one per target column.
models = train_and_select_best_models(X=X, y=y)

Team Score - RandomForest: RMSE = 14.1992
Team Score - GradientBoosting: RMSE = 16.2793
Team Score - LinearRegression: RMSE = 10.1629
Opponent Score - RandomForest: RMSE = 7.5801
Opponent Score - GradientBoosting: RMSE = 14.8409
Opponent Score - LinearRegression: RMSE = 40.2279




In [27]:
def predict_and_simulate(models, X, n_simulations=10000, residual_std=10, random_state=None):
    """Predict every target for every row of ``X`` and attach a simulated 95% interval.

    For each row and each target, ``n_simulations`` values are drawn from a
    normal distribution centred on the model's prediction, and the 2.5th and
    97.5th percentiles of those draws are reported as the interval.

    Parameters
    ----------
    models : dict
        Maps a target name to a fitted estimator exposing ``predict``.
    X : pandas.DataFrame
        Feature rows to predict for.
    n_simulations : int, default 10000
        Number of normal draws per row and target.
    residual_std : float or dict, default 10
        Standard deviation of the simulated noise. A dict is looked up by
        target name, which allows a different spread per target.
    random_state : int or None, default None
        Seed for a dedicated ``numpy.random.Generator``. With ``None`` the
        draws come from NumPy's global random state, so ``np.random.seed``
        still controls them.

    Returns
    -------
    pandas.DataFrame
        One row per row of ``X`` with an ``Index`` column (the row label) and,
        per target, ``<target>_Predicted_Score`` and
        ``<target>_Confidence_Interval`` (a ``(lower, upper)`` tuple). An
        empty frame is returned when ``X`` has no rows.
    """
    results = []
    if len(X) == 0:
        # Nothing to predict; return the same empty frame an empty loop yields.
        return pd.DataFrame(results)

    # Both the numpy.random module and a Generator expose `.normal`.
    rng = np.random if random_state is None else np.random.default_rng(random_state)

    # One batched predict per model on the frame itself: this keeps the column
    # names the estimator was fitted with and avoids one predict call per row.
    predictions = {target: np.asarray(model.predict(X)) for target, model in models.items()}

    # Rows outermost, targets innermost, so random draws are consumed in a
    # fixed, documented order.
    for position, idx in enumerate(X.index):
        row_results = {'Index': idx}

        for target in models:
            score_pred = predictions[target][position]
            std = residual_std[target] if isinstance(residual_std, dict) else residual_std

            simulated_scores = rng.normal(score_pred, std, n_simulations)
            lower, upper = np.percentile(simulated_scores, [2.5, 97.5])

            row_results[f'{target}_Predicted_Score'] = score_pred
            row_results[f'{target}_Confidence_Interval'] = (lower, upper)

        results.append(row_results)

    return pd.DataFrame(results)



In [30]:
test_df = pd.read_excel('../Stats_competition-/Teams_Data_test.xlsx')
# Select the same feature columns the models were trained on.
test_X = test_df[feature_cols]
# The intervals come from random normal draws taken from NumPy's global
# random state; seed it so re-running this cell reproduces the same intervals.
np.random.seed(42)
results_df = predict_and_simulate(models, test_X)

In [33]:
# Attach the predictions by the row label recorded in 'Index' rather than by
# concatenating side by side, which only lines up when test_df happens to
# carry a default 0..n-1 index.
predictions = results_df.set_index('Index')
df_combined = test_df.join(predictions)
final = df_combined[['Team', 'Opponent','Team Score_Predicted_Score',
       'Team Score_Confidence_Interval', 'Opponent Score_Predicted_Score',
       'Opponent Score_Confidence_Interval']]