In [93]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import BayesianRidge
from sklearn.metrics import mean_squared_error
import numpy as np

pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", None)

In [94]:
team_data = pd.read_csv('../Stats_competition-/team_data_collected_df.csv')
display(team_data.head())

Unnamed: 0,Location,Team,Opponent,WAB,ADJO,ADJD,EFF,EFG%,TO%,OR%,FTR,2P,3P,Opp EFF,Opp EFG%,Opp TO%,Opp OR%,Opp FTR,Opp 2P,Opp 3P,Opponent_score,Team_score
0,H,DUKE,Maine,0.1,125.3,95.2,130.6,64.3,17.7,35.5,31.7,24-34,11-29,84.3,39.5,19.0,19.4,31.6,18-43,3-14,62,96
1,H,DUKE,Army,0.1,124.7,92.3,141.0,61.3,11.3,43.6,25.4,18-33,17-38,81.8,39.1,18.3,23.3,14.1,13-35,8-29,58,100
2,N,DUKE,Kentucky,-0.2,106.8,86.4,95.7,42.3,9.3,25.0,23.9,24-47,4-24,102.3,47.6,14.6,23.3,38.1,15-38,10-25,77,72
3,H,DUKE,Wofford,-0.1,124.7,56.9,133.7,61.3,15.5,45.2,17.7,14-24,16-38,54.4,28.9,29.5,29.3,5.3,9-24,5-33,35,86
4,A,DUKE,Arizona,0.6,111.7,75.9,101.9,50.0,20.7,35.1,21.3,17-36,9-25,81.2,45.3,22.2,16.7,20.8,15-30,6-23,55,69


In [95]:
team_data['Location'] = np.where(team_data['Location'] == 'N', 0, np.where(team_data['Location'] == 'H', 1, -1))
columns_to_convert = ['Location','ADJO', 'ADJD', 'EFG%', 'TO%', 'OR%', 'FTR', 'Opp EFG%', 'Opp TO%', 'Opp OR%', 'Opp FTR']
for col in columns_to_convert:
    team_data[col] = pd.to_numeric(team_data[col], errors='coerce')

In [96]:
cleanDate = team_data.dropna()

X = cleanDate[columns_to_convert]
y_team = cleanDate['Team_score']
y_opp = cleanDate['Opponent_score']



In [97]:
def process_ratio_column(df, column):
    """Convert a column with string ratios (e.g., '13-25') to numerical percentages."""
    return df[column].str.split('-').apply(lambda x: int(x[0]) / int(x[1]) if len(x) == 2 and int(x[1]) != 0 else 0)

ratio_columns = ['2P', '3P', 'Opp 2P', 'Opp 3P']  # Add all ratio columns
#for col in ratio_columns:
 #   cleanDate[col] = process_ratio_column(cleanDate, col)

In [98]:

# Removing columns not needed for predictions
feature_cols = cleanDate.columns.difference(['Team', 'Opponent', 'Team_score', 'Opponent_score',
                                             '2P', '3P', 'Opp 2P', 'Opp 3P',
                                             'EFF', 'Opp EFF', 'WAB'])

# Defining the feature matrix (X) and target matrix (y)
X = cleanDate[feature_cols]
y = cleanDate[['Team_score', 'Opponent_score']]

display(X.head())
display(y.head())

Unnamed: 0,ADJD,ADJO,EFG%,FTR,Location,OR%,Opp EFG%,Opp FTR,Opp OR%,Opp TO%,TO%
0,95.2,125.3,64.3,31.7,1,35.5,39.5,31.6,19.4,19.0,17.7
1,92.3,124.7,61.3,25.4,1,43.6,39.1,14.1,23.3,18.3,11.3
2,86.4,106.8,42.3,23.9,0,25.0,47.6,38.1,23.3,14.6,9.3
3,56.9,124.7,61.3,17.7,1,45.2,28.9,5.3,29.3,29.5,15.5
4,75.9,111.7,50.0,21.3,-1,35.1,45.3,20.8,16.7,22.2,20.7


Unnamed: 0,Team_score,Opponent_score
0,96,62
1,100,58
2,72,77
3,86,35
4,69,55


## Train and Backtest Model

In [103]:

def train_and_evaluate_models(X_train, y_train, X_test, y_test, threshold=6):
    """
    Train and evaluate Bayesian Ridge regression models for multiple target variables.

    Parameters:
    X_train (DataFrame): Training feature set.
    y_train (DataFrame): Training target set (multi-target).
    X_test (DataFrame): Test feature set.
    y_test (DataFrame): Test target set (multi-target).
    threshold (int, optional): Threshold for accuracy evaluation. Default is 6.

    Returns:
    dict: A dictionary containing trained models for each target variable.
    """
    models = {}
    for target in y_train.columns:
        print(f"Training model for {target}...")

        # Initialize the model
        model = BayesianRidge()

        # Train the model
        model.fit(X_train, y_train[target])
        models[target] = model

        # Make predictions
        y_pred = model.predict(X_test)

        # Evaluate accuracy within the threshold
        accuracy = (abs(y_pred - y_test[target]) <= threshold).mean() * 100
        print(f"Accuracy for {target} within {threshold} points: {accuracy:.2f}%")

        # Evaluate RMSE
        rmse = mean_squared_error(y_test[target], y_pred, squared=False)
        print(f"RMSE for {target}: {rmse:.4f}\n")

        # Optional: Uncomment to print predictions
        # print("Actual:", y_test[target].values)
        # print("Predicted values:", y_pred)

    return models


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
model_main = train_and_evaluate_models(X_train, y_train, X_test, y_test)
print(model_main)

Training model for Team_score...
Accuracy for Team_score within 6 points: 67.21%
RMSE for Team_score: 5.6774

Training model for Opponent_score...
Accuracy for Opponent_score within 6 points: 73.77%
RMSE for Opponent_score: 5.3658

{'Team_score': BayesianRidge(), 'Opponent_score': BayesianRidge()}




## Train Model

In [100]:
def train_models(X, y):
    """
    Train Bayesian Ridge regression models for multiple target variables.

    Parameters:
    X (DataFrame): Feature set.
    y (DataFrame): Target set (multi-target).

    Returns:
    dict: A dictionary containing trained models for each target variable.
    """
    models = {}
    for target in y.columns:
        print(f"Training model for {target}...")

        # Initialize the model
        model = BayesianRidge()

        # Train the model
        model.fit(X, y[target])
        models[target] = model

        print(f"Model for {target} trained successfully.\n")

    return models

models = train_models(X, y)

Training model for Team_score...
Model for Team_score trained successfully.

Training model for Opponent_score...
Model for Opponent_score trained successfully.



## Get Test data and test model

In [101]:
def predict_and_test(models, data):
    """
    Preprocess data and test trained models on it.

    Parameters:
    models (dict): A dictionary of trained models for each target variable.
    data (DataFrame): The raw DataFrame containing the data for prediction.

    Returns:
    DataFrame: A DataFrame containing predictions for each target variable.
    """
    # Preprocess the data
    print("Preprocessing data for prediction...")
    data['Location'] = np.where(data['Location'] == 'Neutral', 0, 
                                np.where(data['Location'] == 'Home', 1, -1))
    columns_to_convert = ['Location', 'ADJO', 'ADJD', 'EFG%', 'TO%', 'OR%', 
                          'FTR', 'Opp EFG%', 'Opp TO%', 'Opp OR%', 'Opp FTR']
    for col in columns_to_convert:
        data[col] = pd.to_numeric(data[col], errors='coerce')
    
    # Extract the features for prediction
    X = data[columns_to_convert]

    # Make predictions using the models
    print("Making predictions...")
    predictions = pd.DataFrame()
    for target, model in models.items():
        print(f"Predicting {target}...")
        predictions[target] = model.predict(X)

    print("Predictions completed.\n")
    return predictions

# Example usage:
predict_data = pd.read_csv('../Stats_competition-/basketball_games_data.csv')
predictions = predict_and_test(models, predict_data)
print(predictions.head())


Preprocessing data for prediction...
Making predictions...
Predicting Team_score...


ValueError: The feature names should match those that were passed during fit.
Feature names must be in the same order as they were in fit.


In [None]:
Location,Team,Opponent,ADJO,ADJD,EFG%,TO%,OR%,FTR,Opp EFG%,Opp TO%,Opp OR%,Opp FTR
