In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, RandomizedSearchCV, GridSearchCV
from sklearn.metrics import mean_squared_error
import numpy as np
from sklearn.ensemble import StackingRegressor, RandomForestRegressor
from sklearn.linear_model import Ridge, BayesianRidge
from sklearn.datasets import make_regression

# Lift pandas' truncation limits so displayed frames show every column and row.
for _display_option in ("display.max_columns", "display.max_rows"):
    pd.set_option(_display_option, None)

In [2]:
# Read the historical game table.
team_data = pd.read_csv('../Stats_competition-/final_opponent_and_team_data.csv')

# Encode the venue flag numerically: 'N' -> 0, 'H' -> 1, every other value -> -1.
team_data['Location'] = np.select(
    [team_data['Location'] == 'N', team_data['Location'] == 'H'],
    [0, 1],
    default=-1,
)

# Force the statistic columns to numeric dtype; unparsable entries become NaN.
columns_to_convert = ['Location','ADJO', 'ADJD', 'EFG%', 'TO%', 'OR%', 'FTR', 'Opp EFG%', 'Opp TO%', 'Opp OR%', 'Opp FTR']
team_data = team_data.assign(
    **{name: pd.to_numeric(team_data[name], errors='coerce') for name in columns_to_convert}
)

# Keep only the rows that have no missing value in any column.
cleanDate = team_data.dropna()

In [None]:
# Columns that are not used as model inputs (identifiers, the two score targets,
# and the other columns excluded from prediction).
_non_feature_cols = ['Date', 'Team', 'Opponent', 'Team_score', 'Opponent_score',
                     '2P', '3P', 'Opp 2P', 'Opp 3P',
                     'EFF', 'Opp EFF', 'WAB']

# Everything that remains is a feature column.
feature_cols = cleanDate.columns.difference(_non_feature_cols)

# Feature matrix (X) and two-column target frame (y).
X = cleanDate.loc[:, feature_cols]
y = cleanDate.loc[:, ['Team_score', 'Opponent_score']]

# Preview the first rows of both frames.
display(X.head(), y.head())

In [None]:
# Preview the first rows of the feature matrix and of the target frame.
display(X.head(), y.head())

## Train Model

In [None]:
def train_BaseModel(X, y):
    """Fit one default BayesianRidge regressor per target column, without evaluation.

    Parameters:
    X (DataFrame): Feature matrix shared by every target.
    y (DataFrame): Target frame; each column is fitted independently.

    Returns:
    dict: Maps each column name of ``y`` to its fitted BayesianRidge model.
    """
    fitted = {}
    for name in y.columns:
        print(f"Training model for {name}...")

        # fit() returns the estimator itself, so it can be stored directly.
        fitted[name] = BayesianRidge().fit(X, y[name])

        print(f"Model for {name} trained successfully.\n")

    return fitted

def train_EnsembleModel(X, y, threshold=6):
    """Fit one Ridge + BayesianRidge stacking regressor per target column, without evaluation.

    Parameters:
    X (DataFrame): Feature matrix shared by every target.
    y (DataFrame): Target frame; each column is fitted independently.
    threshold (int, optional): Accepted for signature parity with the evaluating
        variants; not used by this function.

    Returns:
    dict: Maps each column name of ``y`` to its fitted StackingRegressor.
    """
    def _new_stack():
        # A fresh, unfitted stack for every target so no state is shared.
        return StackingRegressor(
            estimators=[('ridge', Ridge()), ('bayesian_ridge', BayesianRidge())],
            final_estimator=BayesianRidge(),
        )

    fitted = {}
    for name in y.columns:
        print(f"Training StackingRegressor model for {name}...")

        fitted[name] = _new_stack().fit(X, y[name])

        print(f"Model for {name} trained successfully.\n")

    return fitted


def train_EnsembleModelWithRF(X_train, y_train, threshold=6):
    """Fit one BayesianRidge + RandomForest stacking regressor per target column.

    Parameters:
    X_train (DataFrame): Feature matrix shared by every target.
    y_train (DataFrame): Target frame; each column is fitted independently.
    threshold (int, optional): Accepted for signature parity with the evaluating
        variants; not used by this function.

    Returns:
    dict: Maps each column name of ``y_train`` to its fitted StackingRegressor.
    """
    def _new_stack():
        # The forest is seeded (random_state=42) so repeated runs give the same fit.
        return StackingRegressor(
            estimators=[
                ('bayesian_ridge', BayesianRidge()),
                ('random_forest', RandomForestRegressor(n_estimators=100, random_state=42)),
            ],
            final_estimator=BayesianRidge(),
        )

    fitted = {}
    for name in y_train.columns:
        print(f"Training StackingRegressor model for {name}...")

        fitted[name] = _new_stack().fit(X_train, y_train[name])

        print(f"Model for {name} trained successfully.\n")

    return fitted

def train_baseModelWithHyperparameterTuning(X_train, y_train, threshold=6):
    """Tune and fit one BayesianRidge regressor per target column.

    For every target the search runs in two stages: a randomized search over a
    log-spaced range of the four prior hyperparameters, then an exhaustive grid
    search over a linear range spanning one tenth to ten times the randomized
    optimum. A final model is fitted with the grid-search optimum.

    Parameters:
    X_train (DataFrame): Feature matrix shared by every target.
    y_train (DataFrame): Target frame; each column is tuned independently.
    threshold (int, optional): Accepted for signature parity with the evaluating
        variants; not used by this function.

    Returns:
    dict: Maps each column name of ``y_train`` to its tuned, fitted BayesianRidge.
    """
    prior_names = ('alpha_1', 'alpha_2', 'lambda_1', 'lambda_2')
    solver_options = {'tol': [1e-4, 1e-3, 1e-2], 'max_iter': [100, 300, 500]}
    scoring = 'neg_mean_squared_error'

    tuned = {}
    for name in y_train.columns:
        print(f"Training model for {name}...\n")
        target_values = y_train[name]
        estimator = BayesianRidge()

        # Stage 1: coarse randomized search, 50 draws over log-spaced priors.
        coarse_space = {prior: np.logspace(-6, -2, 20) for prior in prior_names}
        coarse_space.update(solver_options)
        coarse_search = RandomizedSearchCV(
            estimator, coarse_space, scoring=scoring, cv=5, n_iter=50, verbose=1, random_state=42
        )
        coarse_search.fit(X_train, target_values)
        coarse_best = coarse_search.best_params_
        print(f"Best parameters from RandomizedSearchCV for {name}:", coarse_best)

        # Stage 2: exhaustive grid between best/10 and best*10 for each prior.
        fine_space = {
            prior: np.linspace(coarse_best[prior] / 10, coarse_best[prior] * 10, 10)
            for prior in prior_names
        }
        fine_space.update(solver_options)
        fine_search = GridSearchCV(estimator, fine_space, scoring=scoring, cv=5, verbose=1)
        fine_search.fit(X_train, target_values)
        fine_best = fine_search.best_params_
        print(f"Best parameters from GridSearchCV for {name}:", fine_best)

        # Stage 3: fit the final model with the winning parameters.
        tuned[name] = BayesianRidge(**fine_best).fit(X_train, target_values)

        print(f"Model for {name} trained successfully.\n")

    return tuned

# Fit all four model families on the full cleaned data set (no hold-out split).
# Each result is a dict mapping a target column name to its fitted model, and is
# printed right after training.
# NOTE(review): the backtesting cells further down rebind `model2` and `model3`
# to models fitted on the training split only; re-run this cell before predicting
# if those cells have been executed.

# Plain BayesianRidge per target.
model1 = train_BaseModel(X, y)
print(model1)

# Ridge + BayesianRidge stack per target.
model2 = train_EnsembleModel(X, y)
print(model2)

# BayesianRidge + RandomForest stack per target.
model3 = train_EnsembleModelWithRF(X, y)
print(model3)

# BayesianRidge with randomized + grid hyperparameter search per target.
model4 = train_baseModelWithHyperparameterTuning(X, y)
print(model4)


## Predict scores with the trained models

In [None]:
def predictModel(models, data):
    """Predict every target with its fitted model and attach the team names.

    The input frame is preprocessed on a private copy, so the caller's ``data``
    is left untouched and the function can be called repeatedly on the same raw
    frame. (Encoding 'Location' in place would turn every row into -1 on the
    second call, because the column no longer holds 'Neutral'/'Home' strings.)

    Parameters:
    models (dict): Maps a target name to a fitted estimator exposing ``predict``.
    data (DataFrame): Raw games frame. Must contain 'Team', 'Opponent', a
        'Location' column holding 'Neutral' / 'Home' / other, and the feature
        columns listed in ``columns_to_convert`` below.

    Returns:
    DataFrame: 'Team', 'Opponent' and one prediction column per key of
        ``models``, indexed like ``data``.
    """
    # Preprocess the data
    print("Preprocessing data for prediction...")
    data = data.copy()

    # Venue encoding: 'Neutral' -> 0, 'Home' -> 1, any other value -> -1.
    data['Location'] = np.where(data['Location'] == 'Neutral', 0,
                                np.where(data['Location'] == 'Home', 1, -1))

    # Feature columns, in the order they are handed to the models.
    columns_to_convert = ['ADJD', 'ADJO', 'EFG%', 'FTR', 'Location', 'OR%',
                        'Opp EFG%', 'Opp FTR', 'Opp OR%', 'Opp TO%', 'TO%',
                        'opp_adj_d', 'opp_adj_o']

    # Unparsable entries become NaN.
    for col in columns_to_convert:
        data[col] = pd.to_numeric(data[col], errors='coerce')

    # Extract the features for prediction
    X = data[columns_to_convert]

    # Make predictions using the models
    print("Making predictions...")
    # Share the input's index so the concat below lines predictions up with
    # their own rows even when `data` does not use a default RangeIndex.
    predictions = pd.DataFrame(index=data.index)
    for target, model in models.items():
        print(f"Predicting {target}...")
        predictions[target] = model.predict(X)

    print("Predictions completed.\n")

    teams = data[['Team', 'Opponent']]
    finalPredictions = pd.concat([teams, predictions], axis=1)

    return finalPredictions

# Load the games to score.
predict_data = pd.read_csv('../Stats_competition-/basketball_games_data.csv')

# Hand each call its own copy of the raw frame: predictModel encodes the string
# 'Location' column, and a frame that has already been encoded would map every
# row to -1 on the next call.
predict1 = predictModel(model1, predict_data.copy())
predict2 = predictModel(model2, predict_data.copy())
predict3 = predictModel(model3, predict_data.copy())
predict4 = predictModel(model4, predict_data.copy())

# Tag the prediction columns with the model family that produced them.
predict1 = predict1.rename(columns={'Team_score': 'Team_score_base', 'Opponent_score': 'Opponent_score_base'})
predict2 = predict2.rename(columns={'Team_score': 'Team_score_ensemble', 'Opponent_score': 'Opponent_score_ensemble'})
predict3 = predict3.rename(columns={'Team_score': 'Team_score_ensembleRF', 'Opponent_score': 'Opponent_score_ensembleRF'})
predict4 = predict4.rename(columns={'Team_score': 'Team_score_fineTuned', 'Opponent_score': 'Opponent_score_fineTuned'})

# Two pairwise outer joins on ('Team', 'Opponent'):
#   dfFinalPrediction1 = base vs. Ridge/BayesianRidge ensemble
#   dfFinalPrediction2 = RandomForest ensemble vs. fine-tuned base
# NOTE(review): if the same (Team, Opponent) pair occurs more than once in the
# input, the merge yields one row per combination of the duplicates - confirm
# the pairs are unique in the games file.
dfFinalPrediction1 = predict1.merge(predict2, on=['Team', 'Opponent'], how='outer')
dfFinalPrediction2 = predict3.merge(predict4, on=['Team', 'Opponent'], how='outer')

In [None]:
# Render both side-by-side comparison tables, one after the other.
display(dfFinalPrediction1, dfFinalPrediction2)

# Backtesting Models

In [None]:
def baseModel(X_train, y_train, X_test, y_test, threshold=6):
    """Train and evaluate one default BayesianRidge regressor per target column.

    Parameters:
    X_train (DataFrame): Training feature set.
    y_train (DataFrame): Training target set (multi-target).
    X_test (DataFrame): Test feature set.
    y_test (DataFrame): Test target set; must contain the columns of ``y_train``.
    threshold (int, optional): A prediction counts as accurate when its absolute
        error is at most this many points. Default is 6.

    Returns:
    dict: Maps each target column name to its fitted BayesianRidge model.
    """
    models = {}
    for target in y_train.columns:
        print(f"Training model for {target}...")

        # Train the model
        model = BayesianRidge()
        model.fit(X_train, y_train[target])
        models[target] = model

        # Make predictions
        y_pred = model.predict(X_test)

        # Share of test rows whose absolute error is within the threshold, in percent
        accuracy = (abs(y_pred - y_test[target]) <= threshold).mean() * 100
        print(f"Accuracy for {target} within {threshold} points: {accuracy:.2f}%")

        # RMSE as sqrt(MSE): the `squared=False` keyword of mean_squared_error was
        # deprecated in scikit-learn 1.4 and removed in 1.6, so it is not used here.
        rmse = np.sqrt(mean_squared_error(y_test[target], y_pred))
        print(f"RMSE for {target}: {rmse:.4f}\n")

    return models


# Hold out 20% of the rows (seeded split, random_state=42) and backtest the plain
# BayesianRidge models on them; the returned dict of fitted models is printed.
# NOTE(review): `model_main` is rebound by the hyperparameter-tuning backtest
# cell later in this notebook.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
model_main = baseModel(X_train, y_train, X_test, y_test)
print(model_main)

In [None]:
def EnsembleModel(X_train, y_train, X_test, y_test, threshold=6):
    """
    Train and evaluate a StackingRegressor for multiple target variables.

    Each target gets its own stack of Ridge and BayesianRidge base learners with
    a BayesianRidge final estimator. Accuracy within ``threshold`` and RMSE on
    the test set are printed for every target.

    Parameters:
    X_train (DataFrame): Training feature set.
    y_train (DataFrame): Training target set (multi-target).
    X_test (DataFrame): Test feature set.
    y_test (DataFrame): Test target set (multi-target).
    threshold (int, optional): Threshold for accuracy evaluation. Default is 6.

    Returns:
    dict: A dictionary containing trained ensemble models for each target variable.
    """
    models = {}
    for target in y_train.columns:
        print(f"Training StackingRegressor model for {target}...")

        # Define base models and stacking regressor
        base_models = [
            ('ridge', Ridge()),
            ('bayesian_ridge', BayesianRidge())
        ]
        stacking_model = StackingRegressor(estimators=base_models, final_estimator=BayesianRidge())

        # Train the stacking model
        stacking_model.fit(X_train, y_train[target])
        models[target] = stacking_model

        # Make predictions
        y_pred = stacking_model.predict(X_test)

        # Share of test rows whose absolute error is within the threshold, in percent
        accuracy = (abs(y_pred - y_test[target]) <= threshold).mean() * 100
        print(f"Accuracy for {target} within {threshold} points: {accuracy:.2f}%")

        # RMSE as sqrt(MSE): the `squared=False` keyword of mean_squared_error was
        # deprecated in scikit-learn 1.4 and removed in 1.6, so it is not used here.
        rmse = np.sqrt(mean_squared_error(y_test[target], y_pred))
        print(f"RMSE for {target}: {rmse:.4f}\n")

    return models

# Backtest the Ridge + BayesianRidge stack on a seeded 80/20 split (random_state=42).
# NOTE(review): this rebinds `model2`, which the full-data training cell also
# assigns; after this cell runs, `model2` holds models fitted on the training
# split only.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
model2 = EnsembleModel(X_train, y_train, X_test, y_test)
print(model2)


In [None]:
def EnsembleModelWithRF(X_train, y_train, X_test, y_test, threshold=6):
    """Train and evaluate a BayesianRidge + RandomForest stack per target column.

    Each target gets its own StackingRegressor with a BayesianRidge and a seeded
    100-tree RandomForestRegressor as base learners and a BayesianRidge final
    estimator. Accuracy within ``threshold`` and RMSE on the test set are printed
    for every target.

    Parameters:
    X_train (DataFrame): Training feature set.
    y_train (DataFrame): Training target set (multi-target).
    X_test (DataFrame): Test feature set.
    y_test (DataFrame): Test target set; must contain the columns of ``y_train``.
    threshold (int, optional): A prediction counts as accurate when its absolute
        error is at most this many points. Default is 6.

    Returns:
    dict: Maps each target column name to its fitted StackingRegressor.
    """
    models = {}
    for target in y_train.columns:
        print(f"Training StackingRegressor model for {target}...")

        # Define base models and stacking regressor
        base_models = [
            ('bayesian_ridge', BayesianRidge()),
            ('random_forest', RandomForestRegressor(n_estimators=100, random_state=42))
        ]
        stacking_model = StackingRegressor(estimators=base_models, final_estimator=BayesianRidge())

        # Train the stacking model
        stacking_model.fit(X_train, y_train[target])
        models[target] = stacking_model

        # Make predictions
        y_pred = stacking_model.predict(X_test)

        # Share of test rows whose absolute error is within the threshold, in percent
        accuracy = (abs(y_pred - y_test[target]) <= threshold).mean() * 100
        print(f"Accuracy for {target} within {threshold} points: {accuracy:.2f}%")

        # RMSE as sqrt(MSE): the `squared=False` keyword of mean_squared_error was
        # deprecated in scikit-learn 1.4 and removed in 1.6, so it is not used here.
        rmse = np.sqrt(mean_squared_error(y_test[target], y_pred))
        print(f"RMSE for {target}: {rmse:.4f}\n")

    return models

# Backtest the BayesianRidge + RandomForest stack on a seeded 80/20 split (random_state=42).
# NOTE(review): this rebinds `model3`, which the full-data training cell also
# assigns; after this cell runs, `model3` holds models fitted on the training
# split only.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
model3 = EnsembleModelWithRF(X_train, y_train, X_test, y_test)
print(model3)


In [None]:
def baseModelWithHyperparameterTuning(X_train, y_train, X_test, y_test, threshold=6):
    """Tune, train and evaluate one BayesianRidge regressor per target column.

    For every target: a randomized search over log-spaced prior hyperparameters,
    then a grid search over a linear range from one tenth to ten times the
    randomized optimum, then a final fit with the grid-search optimum. Accuracy
    within ``threshold`` and RMSE on the test set are printed for every target.

    Parameters:
    X_train (DataFrame): Training feature set.
    y_train (DataFrame): Training target set (multi-target).
    X_test (DataFrame): Test feature set.
    y_test (DataFrame): Test target set; must contain the columns of ``y_train``.
    threshold (int, optional): A prediction counts as accurate when its absolute
        error is at most this many points. Default is 6.

    Returns:
    dict: Maps each target column name to its tuned, fitted BayesianRidge model.
    """
    models = {}

    for target in y_train.columns:
        print(f"Training model for {target}...\n")

        # Step 1: Broad parameter ranges for RandomizedSearchCV
        random_param_grid = {
            'alpha_1': np.logspace(-6, -2, 20),  # Wide range
            'alpha_2': np.logspace(-6, -2, 20),  # Wide range
            'lambda_1': np.logspace(-6, -2, 20), # Wide range
            'lambda_2': np.logspace(-6, -2, 20), # Wide range
            'tol': [1e-4, 1e-3, 1e-2],
            'max_iter': [100, 300, 500]
        }

        model = BayesianRidge()
        scorer = 'neg_mean_squared_error'

        random_search = RandomizedSearchCV(
            model, random_param_grid, scoring=scorer, cv=5, n_iter=50, verbose=1, random_state=42
        )
        random_search.fit(X_train, y_train[target])
        best_params_random = random_search.best_params_
        print(f"Best parameters from RandomizedSearchCV for {target}:", best_params_random)

        # Step 2: Define a narrower grid around the best parameters
        grid_param_grid = {
            'alpha_1': np.linspace(
                best_params_random['alpha_1'] / 10, best_params_random['alpha_1'] * 10, 10
            ),
            'alpha_2': np.linspace(
                best_params_random['alpha_2'] / 10, best_params_random['alpha_2'] * 10, 10
            ),
            'lambda_1': np.linspace(
                best_params_random['lambda_1'] / 10, best_params_random['lambda_1'] * 10, 10
            ),
            'lambda_2': np.linspace(
                best_params_random['lambda_2'] / 10, best_params_random['lambda_2'] * 10, 10
            ),
            'tol': [1e-4, 1e-3, 1e-2],
            'max_iter': [100, 300, 500]
        }

        grid_search = GridSearchCV(model, grid_param_grid, scoring=scorer, cv=5, verbose=1)
        grid_search.fit(X_train, y_train[target])
        best_params_grid = grid_search.best_params_
        print(f"Best parameters from GridSearchCV for {target}:", best_params_grid)

        # Step 3: Train the model with the best parameters
        final_model = BayesianRidge(**best_params_grid)
        final_model.fit(X_train, y_train[target])
        models[target] = final_model

        # Step 4: Make predictions
        y_pred = final_model.predict(X_test)

        # Share of test rows whose absolute error is within the threshold, in percent
        accuracy = (abs(y_pred - y_test[target]) <= threshold).mean() * 100
        print(f"Accuracy for {target} within {threshold} points: {accuracy:.2f}%")

        # RMSE as sqrt(MSE): the `squared=False` keyword of mean_squared_error was
        # deprecated in scikit-learn 1.4 and removed in 1.6, so it is not used here.
        rmse = np.sqrt(mean_squared_error(y_test[target], y_pred))
        print(f"RMSE for {target}: {rmse:.4f}\n")

    return models

# Split into training and testing sets (seeded 80/20 split, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train and evaluate the tuned BayesianRidge models; the returned dict is printed.
# NOTE(review): this rebinds `model_main`, which the plain BayesianRidge backtest
# cell also assigns.
model_main = baseModelWithHyperparameterTuning(X_train, y_train, X_test, y_test)
print(model_main)
