# Model Selection

1. Data Preparation
   - Load the data
   - Split into features and target
   - Create train/validation/test split (by season)

2. Define Evaluation Metrics
   - Accuracy, Precision, Recall, F1-score for win prediction
   - Mean Absolute Error, Mean Squared Error for score prediction

3. Model Comparison (for win prediction)
   - Logistic Regression
   - Decision Trees
   - Random Forest
   - Gradient Boosting (e.g., XGBoost)
   - Support Vector Machines

4. Model Comparison (for score prediction)
   - Linear Regression
   - Decision Trees
   - Random Forest
   - Gradient Boosting

5. Cross-Validation
   - Implement k-fold cross-validation for each model

6. Hyperparameter Tuning
   - Use GridSearchCV or RandomizedSearchCV for best models

7. Final Model Selection
   - Choose the best model based on cross-validation results
   - Evaluate on the test set

8. Save Best Models
   - Save the best models and their corresponding scalers

In [5]:
%load_ext autoreload
%autoreload 2

import sys
import os
import pandas as pd
import joblib
from datetime import datetime
from sklearn.model_selection import train_test_split

# Add the project root to the Python path so project-local modules can be imported.
# NOTE: '__file__' is deliberately a quoted string here, so abspath() resolves it
# against the current working directory; notebook_dir is therefore the working
# directory (presumably because notebooks do not define __file__ -- confirm).
notebook_dir = os.path.dirname(os.path.abspath('__file__'))
project_root = os.path.dirname(notebook_dir)
# Re-running this cell must not pile up duplicate entries on sys.path.
if project_root not in sys.path:
    sys.path.append(project_root)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## 1. Data Preparation

In [4]:
# Read both preprocessed datasets from the processed-data directory.
all_years_path = '../data/03_processed/preprocessed_all_years.parquet'
recent_years_path = '../data/03_processed/preprocessed_2016_plus.parquet'

df_all_years = pd.read_parquet(all_years_path)
df_2016_plus = pd.read_parquet(recent_years_path)

In [7]:
def split_data(df, target_column, drop_columns=None, test_size=0.2, val_size=0.2, random_state=42):
    """
    Split the data into train, validation, and test sets based on seasons.

    Whole seasons are assigned to a split: the most recent seasons form the
    test set, the seasons immediately before them form the validation set,
    and all earlier seasons form the training set. The split sizes are
    computed on the number of distinct seasons, not on the number of rows,
    and at least one season is requested for each of test and validation.

    Parameters:
    df (pd.DataFrame): The input DataFrame; must contain a 'season' column and the target column
    target_column (str): The name of the target column
    drop_columns (iterable of str, str, or None): Column names to drop from features.
        The target column and 'season' are always dropped. If None, only those two are dropped
    test_size (float): Proportion of seasons to use for test set
    val_size (float): Proportion of non-test seasons to use for validation set
    random_state (int): Unused; the split is deterministic. Kept so existing callers keep working

    Returns:
    tuple: (X_train, X_val, X_test, y_train, y_val, y_test)

    Warns:
    UserWarning: If there are too few seasons, so the train or validation split is empty
    """
    import warnings

    # Sort chronologically by season. A stable sort keeps the original row
    # order within each season, so the output order is reproducible.
    df = df.sort_values('season', kind='mergesort')

    # Accept None, a single column name, or any iterable of column names.
    if drop_columns is None:
        drop_columns = []
    elif isinstance(drop_columns, str):
        drop_columns = [drop_columns]
    excluded = set(drop_columns) | {target_column, 'season'}

    # Select feature columns (all columns except the excluded ones)
    feature_columns = [col for col in df.columns if col not in excluded]

    # Split features (X) and target (y)
    X = df[feature_columns]
    y = df[target_column]

    # Distinct seasons, in ascending order because df is sorted by season
    seasons = df['season'].unique()

    # Calculate the number of seasons for test and validation (at least one each)
    n_seasons = len(seasons)
    n_test_seasons = max(1, int(n_seasons * test_size))
    n_val_seasons = max(1, int((n_seasons - n_test_seasons) * val_size))
    n_held_out = n_test_seasons + n_val_seasons

    # Latest seasons -> test, the ones just before -> validation, the rest -> train
    test_seasons = seasons[-n_test_seasons:]
    val_seasons = seasons[-n_held_out:-n_test_seasons]
    train_seasons = seasons[:-n_held_out]

    # With very few seasons the slices above can come back empty; surface that
    # instead of silently returning empty frames.
    if len(train_seasons) == 0 or len(val_seasons) == 0:
        warnings.warn(
            f"Only {n_seasons} distinct season(s) available; "
            "the train and/or validation split is empty.",
            UserWarning,
            stacklevel=2,
        )

    # Create masks for each split
    test_mask = df['season'].isin(test_seasons)
    val_mask = df['season'].isin(val_seasons)
    train_mask = df['season'].isin(train_seasons)

    # Split the data
    X_train, y_train = X[train_mask], y[train_mask]
    X_val, y_val = X[val_mask], y[val_mask]
    X_test, y_test = X[test_mask], y[test_mask]

    return X_train, X_val, X_test, y_train, y_val, y_test

In [13]:
# Columns excluded from the feature matrix (in addition to the target and
# 'season', which split_data always removes).
excluded_columns = [
    'season_type',
    'team_id',
    'opponent_id',
    'team_conference',
    'opponent_conference',
    'start_date',
]

# Season-based train/validation/test split for the 'win' target.
X_train, X_val, X_test, y_train, y_val, y_test = split_data(
    df_all_years, 'win', drop_columns=excluded_columns
)

## 3. Model Comparison

### 3.1 Logistic Regression

In [15]:
from sklearn.linear_model import LogisticRegression
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import GridSearchCV

def create_improved_logistic_regression_model(X_train, X_val, X_test, y_train, y_val, y_test):
    """
    Tune a logistic-regression pipeline with a grid search and report its accuracy.

    The pipeline imputes missing values with the column mean, standardizes the
    features, expands them with polynomial features, and fits a logistic
    regression. The grid search is fit on the training data only, using 5-fold
    cross-validation scored on accuracy.

    Parameters:
    X_train, X_val, X_test: Feature sets for the train, validation, and test splits
    y_train, y_val, y_test: Targets for the train, validation, and test splits

    Returns:
    The best estimator found by the grid search

    Side effects:
    Prints the best parameters, the validation classification report, and the
    accuracy on each of the three splits.
    """
    # Preprocessing steps followed by the classifier
    steps = [
        ('imputer', SimpleImputer(strategy='mean')),
        ('scaler', StandardScaler()),
        ('poly', PolynomialFeatures(degree=2, include_bias=False)),
        ('logistic_regression', LogisticRegression(random_state=42, max_iter=1000)),
    ]

    # Hyperparameter combinations explored by the search
    search_space = {
        'poly__degree': [1, 2],
        'logistic_regression__C': [0.001, 0.01, 0.1, 1, 10, 100],
        'logistic_regression__penalty': ['l1', 'l2'],
        'logistic_regression__solver': ['liblinear', 'saga'],
    }

    search = GridSearchCV(Pipeline(steps), search_space, cv=5, scoring='accuracy', n_jobs=-1)
    search.fit(X_train, y_train)
    winner = search.best_estimator_

    # Predict and score every split with the selected model
    splits = {
        'train': (X_train, y_train),
        'val': (X_val, y_val),
        'test': (X_test, y_test),
    }
    predicted = {name: winner.predict(features) for name, (features, _) in splits.items()}
    accuracy = {
        name: accuracy_score(targets, predicted[name])
        for name, (_, targets) in splits.items()
    }

    # Report
    print("Best parameters:", search.best_params_)
    print("\nValidation Set Classification Report:")
    print(classification_report(y_val, predicted['val']))
    print(f"Training Accuracy: {accuracy['train']:.4f}")
    print(f"Validation Accuracy: {accuracy['val']:.4f}")
    print(f"Test Accuracy: {accuracy['test']:.4f}")

    return winner

# Usage: tune the pipeline on the training split and report accuracy on all splits
improved_model = create_improved_logistic_regression_model(
    X_train=X_train,
    X_val=X_val,
    X_test=X_test,
    y_train=y_train,
    y_val=y_val,
    y_test=y_test,
)