In [56]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import numpy as np
from sklearn.ensemble import StackingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Ridge, BayesianRidge
from sklearn.metrics import mean_squared_error

# Lift pandas' display limits so displayed frames show all columns and all rows.
for display_option in ("display.max_columns", "display.max_rows"):
    pd.set_option(display_option, None)

In [57]:
# Read the per-game team/opponent table and preview its first rows.
team_data_path = '../Stats_competition-/final_opponent_and_team_data.csv'
team_data = pd.read_csv(team_data_path)
display(team_data.head())

Unnamed: 0,Date,Team,Opponent,Location,WAB,ADJO,ADJD,EFF,EFG%,TO%,OR%,FTR,2P,3P,Opp EFF,Opp EFG%,Opp TO%,Opp OR%,Opp FTR,Opp 2P,Opp 3P,Opponent_score,Team_score,opp_adj_o,opp_adj_d
0,2024-11-04,Duke,Maine,H,0.1,125.3,95.2,130.6,64.3,17.7,35.5,31.7,24-34,11-29,84.3,39.5,19.0,19.4,31.6,18-43,3-14,62,96,108.9,110.8
1,2024-11-08,Duke,Army,H,0.1,124.7,92.3,141.0,61.3,11.3,43.6,25.4,18-33,17-38,81.8,39.1,18.3,23.3,14.1,13-35,8-29,58,100,107.2,119.0
2,2024-11-12,Duke,Kentucky,N,-0.2,106.8,86.4,95.7,42.3,9.3,25.0,23.9,24-47,4-24,102.3,47.6,14.6,23.3,38.1,15-38,10-25,77,72,119.6,88.1
3,2024-11-16,Duke,Wofford,H,-0.1,124.7,56.9,133.7,61.3,15.5,45.2,17.7,14-24,16-38,54.4,28.9,29.5,29.3,5.3,9-24,5-33,35,86,69.8,115.5
4,2024-11-22,Duke,Arizona,A,0.6,111.7,75.9,101.9,50.0,20.7,35.1,21.3,17-36,9-25,81.2,45.3,22.2,16.7,20.8,15-30,6-23,55,69,98.9,90.4


In [58]:
# Encode the game location as a number: 'N' -> 0, 'H' -> 1, anything else -> -1.
# The encoding overwrites the column in place, so it is only applied while the
# column still holds the raw labels. Without this guard, re-running the cell
# would compare the already-encoded numbers against 'N'/'H' and silently turn
# every game into -1.
if not pd.api.types.is_numeric_dtype(team_data['Location']):
    team_data['Location'] = np.where(team_data['Location'] == 'N', 0, np.where(team_data['Location'] == 'H', 1, -1))

# Force these columns to numeric; values that cannot be parsed become NaN.
columns_to_convert = ['Location','ADJO', 'ADJD', 'EFG%', 'TO%', 'OR%', 'FTR', 'Opp EFG%', 'Opp TO%', 'Opp OR%', 'Opp FTR']
for col in columns_to_convert:
    team_data[col] = pd.to_numeric(team_data[col], errors='coerce')

# Keep only the rows that have no missing value in any column.
cleanDate = team_data.dropna()

In [59]:
# Columns that are not used as model inputs (identifiers, targets, and
# other columns left out of the predictions).
non_feature_cols = [
    'Date', 'Team', 'Opponent', 'Team_score', 'Opponent_score',
    '2P', '3P', 'Opp 2P', 'Opp 3P',
    'EFF', 'Opp EFF', 'WAB',
]
feature_cols = cleanDate.columns.difference(non_feature_cols)

# Feature matrix (X) and the two-column target matrix (y)
X = cleanDate[feature_cols]
y = cleanDate[['Team_score', 'Opponent_score']]

display(X.head(), y.head())

Unnamed: 0,ADJD,ADJO,EFG%,FTR,Location,OR%,Opp EFG%,Opp FTR,Opp OR%,Opp TO%,TO%,opp_adj_d,opp_adj_o
0,95.2,125.3,64.3,31.7,1,35.5,39.5,31.6,19.4,19.0,17.7,110.8,108.9
1,92.3,124.7,61.3,25.4,1,43.6,39.1,14.1,23.3,18.3,11.3,119.0,107.2
2,86.4,106.8,42.3,23.9,0,25.0,47.6,38.1,23.3,14.6,9.3,88.1,119.6
3,56.9,124.7,61.3,17.7,1,45.2,28.9,5.3,29.3,29.5,15.5,115.5,69.8
4,75.9,111.7,50.0,21.3,-1,35.1,45.3,20.8,16.7,22.2,20.7,90.4,98.9


Unnamed: 0,Team_score,Opponent_score
0,96,62
1,100,58
2,72,77
3,86,35
4,69,55


## Train and Backtest Model

In [60]:
# Train and evaluate Bayesian Ridge regression models for multiple target variables.
def trainAndTestModel(X_train, y_train, X_test, y_test, threshold=6):
    """Fit one BayesianRidge per target column and print hold-out metrics.

    Parameters
    ----------
    X_train, X_test : DataFrame
        Feature matrices used for fitting and for evaluation.
    y_train, y_test : DataFrame
        One column per target; every column of ``y_train`` must also be
        present in ``y_test``.
    threshold : int or float, optional
        A prediction counts as accurate when its absolute error is at most
        this value. Default is 6.

    Returns
    -------
    dict
        Maps each target column name to its fitted BayesianRidge model.
    """
    models = {}
    for target in y_train.columns:
        print(f"Training model for {target}...")

        # Fit a fresh model for this target and keep it for the caller.
        model = BayesianRidge()
        model.fit(X_train, y_train[target])
        models[target] = model

        y_pred = model.predict(X_test)

        # Percentage of test rows predicted within `threshold` of the actual value.
        accuracy = (abs(y_pred - y_test[target]) <= threshold).mean() * 100
        print(f"Accuracy for {target} within {threshold} points: {accuracy:.2f}%")

        # Take the square root explicitly: the `squared=False` keyword of
        # mean_squared_error was deprecated in scikit-learn 1.4 and removed in
        # 1.6, so relying on it raises a TypeError on current releases.
        rmse = np.sqrt(mean_squared_error(y_test[target], y_pred))
        print(f"RMSE for {target}: {rmse:.4f}\n")

    return models


# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)
model_main = trainAndTestModel(X_train, y_train, X_test, y_test)
print(model_main)

Training model for Team_score...
Accuracy for Team_score within 6 points: 67.21%
RMSE for Team_score: 5.6670

Training model for Opponent_score...
Accuracy for Opponent_score within 6 points: 68.85%
RMSE for Opponent_score: 5.4352

{'Team_score': BayesianRidge(), 'Opponent_score': BayesianRidge()}




## Train Model on the Full Dataset

In [61]:
## Train Bayesian Ridge regression models for multiple target variables
def trainModel(X, y):
    """Fit one BayesianRidge model per column of ``y``.

    Parameters
    ----------
    X : DataFrame
        Feature matrix shared by all targets.
    y : DataFrame
        One column per target variable.

    Returns
    -------
    dict
        Maps each target column name to its fitted BayesianRidge model.
    """
    fitted = {}
    for name, values in y.items():
        print(f"Training model for {name}...")

        # `fit` returns the estimator itself, so it can be stored directly.
        fitted[name] = BayesianRidge().fit(X, values)

        print(f"Model for {name} trained successfully.\n")

    return fitted

# Fit the models on every row of X / y (no hold-out).
model1 = trainModel(X=X, y=y)
print(model1)

Training model for Team_score...
Model for Team_score trained successfully.

Training model for Opponent_score...
Model for Opponent_score trained successfully.

{'Team_score': BayesianRidge(), 'Opponent_score': BayesianRidge()}


In [62]:
# Preview the first rows of the feature matrix, then of the targets.
for frame in (X, y):
    display(frame.head())

Unnamed: 0,ADJD,ADJO,EFG%,FTR,Location,OR%,Opp EFG%,Opp FTR,Opp OR%,Opp TO%,TO%,opp_adj_d,opp_adj_o
0,95.2,125.3,64.3,31.7,1,35.5,39.5,31.6,19.4,19.0,17.7,110.8,108.9
1,92.3,124.7,61.3,25.4,1,43.6,39.1,14.1,23.3,18.3,11.3,119.0,107.2
2,86.4,106.8,42.3,23.9,0,25.0,47.6,38.1,23.3,14.6,9.3,88.1,119.6
3,56.9,124.7,61.3,17.7,1,45.2,28.9,5.3,29.3,29.5,15.5,115.5,69.8
4,75.9,111.7,50.0,21.3,-1,35.1,45.3,20.8,16.7,22.2,20.7,90.4,98.9


Unnamed: 0,Team_score,Opponent_score
0,96,62
1,100,58
2,72,77
3,86,35
4,69,55


## Predict scores for new games with the trained models

In [63]:
def predictModel(models, data):
    """Predict every target in ``models`` for the games in ``data``.

    Parameters
    ----------
    models : dict
        Maps a target name to a fitted estimator exposing ``predict``.
    data : DataFrame
        One row per game. Must contain 'Team', 'Opponent', 'Location' and the
        feature columns listed in ``columns_to_convert`` below. 'Location' is
        encoded as 0 for 'Neutral'/'N', 1 for 'Home'/'H', and -1 for any
        other value. The frame is not modified.

    Returns
    -------
    DataFrame
        'Team', 'Opponent' and one prediction column per target, on the same
        index as ``data``.
    """
    # Preprocess the data
    print("Preprocessing data for prediction...")

    # Work on a copy: encoding 'Location' in the caller's frame would make a
    # second call on the same frame see numbers instead of labels and encode
    # every game as -1.
    data = data.copy()

    # Accept the long labels as well as the 'N'/'H' abbreviations.
    location = data['Location']
    data['Location'] = np.where(location.isin(['Neutral', 'N']), 0,
                                np.where(location.isin(['Home', 'H']), 1, -1))

    columns_to_convert = ['ADJD', 'ADJO', 'EFG%', 'FTR', 'Location', 'OR%',
                        'Opp EFG%', 'Opp FTR', 'Opp OR%', 'Opp TO%', 'TO%',
                        'opp_adj_d', 'opp_adj_o']

    # Unparseable values become NaN rather than raising here.
    for col in columns_to_convert:
        data[col] = pd.to_numeric(data[col], errors='coerce')

    # Extract the features for prediction
    X = data[columns_to_convert]

    # Make predictions using the models
    print("Making predictions...")
    # Build the predictions on the input's index: concat(axis=1) aligns on the
    # index, so a default 0..n-1 index would misalign (and NaN-pad) the result
    # whenever `data` has been filtered or re-indexed.
    predictions = pd.DataFrame(index=data.index)
    for target, model in models.items():
        print(f"Predicting {target}...")
        predictions[target] = model.predict(X)

    print("Predictions completed.\n")

    teams = data[['Team', 'Opponent']]
    finalPredictions = pd.concat([teams, predictions], axis=1)

    return finalPredictions

# Example usage:
predict_data_path = '../Stats_competition-/basketball_games_data.csv'
predict_data = pd.read_csv(predict_data_path)
predictions = predictModel(model1, predict_data)
display(predictions)


Preprocessing data for prediction...
Making predictions...
Predicting Team_score...
Predicting Opponent_score...
Predictions completed.



Unnamed: 0,Team,Opponent,Team_score,Opponent_score
0,Auburn,Purdue,89.430534,79.165254
1,Houston,Texas A&M Corpus Christi,81.128646,72.195457
2,Duke,Georgia Tech,79.623114,69.422654
3,Gonzaga,Bucknell,85.874906,66.704418
4,Florida,North Florida,83.399218,72.800038
5,Kentucky,Ohio State,84.551605,77.921747
6,Marquette,Xavier,80.815527,75.748384
7,UCLA,North Carolina,80.615577,76.135407
8,Maryland,Syracuse,85.109556,73.897145
9,Connecticut,Butler,86.992765,76.944019


In [64]:
def trainAndTestEnsembleModel(X_train, y_train, X_test, y_test, threshold=6):
    """
    Train and evaluate a StackingRegressor for multiple target variables.

    Each target gets its own stack: Ridge and BayesianRidge as base models,
    combined by a BayesianRidge final estimator. Accuracy within ``threshold``
    and RMSE on the test set are printed for every target.

    Parameters:
    X_train (DataFrame): Training feature set.
    y_train (DataFrame): Training target set (multi-target).
    X_test (DataFrame): Test feature set.
    y_test (DataFrame): Test target set (multi-target).
    threshold (int, optional): Threshold for accuracy evaluation. Default is 6.

    Returns:
    dict: A dictionary containing trained ensemble models for each target variable.
    """
    models = {}
    for target in y_train.columns:
        print(f"Training StackingRegressor model for {target}...")

        # Define base models and stacking regressor
        base_models = [
            ('ridge', Ridge()),
            ('bayesian_ridge', BayesianRidge())
        ]
        stacking_model = StackingRegressor(estimators=base_models, final_estimator=BayesianRidge())

        # Train the stacking model
        stacking_model.fit(X_train, y_train[target])
        models[target] = stacking_model

        # Make predictions
        y_pred = stacking_model.predict(X_test)

        # Percentage of test rows predicted within `threshold` of the actual value.
        accuracy = (abs(y_pred - y_test[target]) <= threshold).mean() * 100
        print(f"Accuracy for {target} within {threshold} points: {accuracy:.2f}%")

        # Take the square root explicitly: the `squared=False` keyword of
        # mean_squared_error was deprecated in scikit-learn 1.4 and removed in
        # 1.6, so relying on it raises a TypeError on current releases.
        rmse = np.sqrt(mean_squared_error(y_test[target], y_pred))
        print(f"RMSE for {target}: {rmse:.4f}\n")

    return models

# Example Usage
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)
model2 = trainAndTestEnsembleModel(X_train, y_train, X_test, y_test)
print(model2)


Training StackingRegressor model for Team_score...
Accuracy for Team_score within 6 points: 67.21%
RMSE for Team_score: 5.6741

Training StackingRegressor model for Opponent_score...
Accuracy for Opponent_score within 6 points: 68.85%
RMSE for Opponent_score: 5.3860

{'Team_score': StackingRegressor(estimators=[('ridge', Ridge()),
                              ('bayesian_ridge', BayesianRidge())],
                  final_estimator=BayesianRidge()), 'Opponent_score': StackingRegressor(estimators=[('ridge', Ridge()),
                              ('bayesian_ridge', BayesianRidge())],
                  final_estimator=BayesianRidge())}




In [65]:
#Train and evaluate a StackingRegressor combining Bayesian Ridge and Random Forest for multiple target variables.

def trainAndTestEnsembleModelWithRF(X_train, y_train, X_test, y_test, threshold=6):
    """Train and evaluate a BayesianRidge + RandomForest stack per target.

    Each target gets its own StackingRegressor whose base models are a
    BayesianRidge and a 100-tree RandomForestRegressor (seeded with 42),
    combined by a BayesianRidge final estimator. Accuracy within
    ``threshold`` and RMSE on the test set are printed for every target.

    Parameters:
    X_train (DataFrame): Training feature set.
    y_train (DataFrame): Training target set (one column per target).
    X_test (DataFrame): Test feature set.
    y_test (DataFrame): Test target set (same columns as y_train).
    threshold (int, optional): Threshold for accuracy evaluation. Default is 6.

    Returns:
    dict: Maps each target column name to its fitted StackingRegressor.
    """
    models = {}
    for target in y_train.columns:
        print(f"Training StackingRegressor model for {target}...")

        # Define base models and stacking regressor
        base_models = [
            ('bayesian_ridge', BayesianRidge()),
            ('random_forest', RandomForestRegressor(n_estimators=100, random_state=42))
        ]
        stacking_model = StackingRegressor(estimators=base_models, final_estimator=BayesianRidge())

        # Train the stacking model
        stacking_model.fit(X_train, y_train[target])
        models[target] = stacking_model

        # Make predictions
        y_pred = stacking_model.predict(X_test)

        # Percentage of test rows predicted within `threshold` of the actual value.
        accuracy = (abs(y_pred - y_test[target]) <= threshold).mean() * 100
        print(f"Accuracy for {target} within {threshold} points: {accuracy:.2f}%")

        # Take the square root explicitly: the `squared=False` keyword of
        # mean_squared_error was deprecated in scikit-learn 1.4 and removed in
        # 1.6, so relying on it raises a TypeError on current releases.
        rmse = np.sqrt(mean_squared_error(y_test[target], y_pred))
        print(f"RMSE for {target}: {rmse:.4f}\n")

    return models

# Example Usage
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)
model3 = trainAndTestEnsembleModelWithRF(X_train, y_train, X_test, y_test)
print(model3)


Training StackingRegressor model for Team_score...
Accuracy for Team_score within 6 points: 67.21%
RMSE for Team_score: 5.6690

Training StackingRegressor model for Opponent_score...




Accuracy for Opponent_score within 6 points: 68.85%
RMSE for Opponent_score: 5.3803

{'Team_score': StackingRegressor(estimators=[('bayesian_ridge', BayesianRidge()),
                              ('random_forest',
                               RandomForestRegressor(random_state=42))],
                  final_estimator=BayesianRidge()), 'Opponent_score': StackingRegressor(estimators=[('bayesian_ridge', BayesianRidge()),
                              ('random_forest',
                               RandomForestRegressor(random_state=42))],
                  final_estimator=BayesianRidge())}


