# Model Selection

1. Data Preparation
   - Load the data
   - Split into features and target
   - Create train/test split

2. Define Evaluation Metrics
   - Accuracy, Precision, Recall, F1-score for win prediction
   - Mean Absolute Error, Mean Squared Error for score prediction

3. Model Comparison (for win prediction)
   - Logistic Regression
   - Random Forest
   - Gradient Boosting (e.g., XGBoost)
   - Support Vector Machines

4. Model Comparison (for score prediction)
   - Linear Regression
   - Decision Trees
   - Random Forest
   - Gradient Boosting

5. Cross-Validation
   - Implement k-fold cross-validation for each model

6. Hyperparameter Tuning
   - Use GridSearchCV or RandomizedSearchCV for best models

7. Final Model Selection
   - Choose the best model based on cross-validation results
   - Evaluate on the test set

8. Save Best Models
   - Save the best models and their corresponding scalers

In [2]:
%load_ext autoreload
%autoreload 2

import sys
import os
import pandas as pd
import joblib
from datetime import datetime
from sklearn.model_selection import train_test_split

# Add the project root to the Python path
notebook_dir = os.path.dirname(os.path.abspath('__file__'))
project_root = os.path.dirname(notebook_dir)
sys.path.append(project_root)

## 1.   Data Preparation

In [12]:
# Load the data
df_all_years = pd.read_parquet('../data/03_processed/preprocessed_all_years.parquet')
df_2016_plus = pd.read_parquet('../data/03_processed/preprocessed_2016_plus.parquet')

In [4]:
def split_data(df, target_column, drop_columns=None, test_size=0.2, val_size=0.2, random_state=42):
    """
    Split the data into train, validation, and test sets based on seasons.
    
    Parameters:
    df (pd.DataFrame): The input DataFrame
    target_column (str): The name of the target column
    drop_columns (list): List of column names to drop from features. If None, use all columns except target and 'season'
    test_size (float): Proportion of data to use for test set
    val_size (float): Proportion of non-test data to use for validation set
    random_state (int): Random state for reproducibility
    
    Returns:
    tuple: (X_train, X_val, X_test, y_train, y_val, y_test)
    """
    
    # Sort the DataFrame by season to ensure chronological order
    df = df.sort_values('season')
    
    # Define columns to drop
    if drop_columns is None:
        drop_columns = []
    drop_columns = set(drop_columns + [target_column, 'season'])
    
    # Select feature columns (all columns except those in drop_columns)
    feature_columns = [col for col in df.columns if col not in drop_columns]
    
    # Split features (X) and target (y)
    X = df[feature_columns]
    y = df[target_column]
    
    # Get unique seasons
    seasons = df['season'].unique()
    
    # Calculate the number of seasons for test and validation
    n_seasons = len(seasons)
    n_test_seasons = max(1, int(n_seasons * test_size))
    n_val_seasons = max(1, int((n_seasons - n_test_seasons) * val_size))
    
    # Split seasons into train, validation, and test
    test_seasons = seasons[-n_test_seasons:]
    val_seasons = seasons[-(n_test_seasons + n_val_seasons):-n_test_seasons]
    train_seasons = seasons[:-(n_test_seasons + n_val_seasons)]
    
    # Create masks for each split
    test_mask = df['season'].isin(test_seasons)
    val_mask = df['season'].isin(val_seasons)
    train_mask = df['season'].isin(train_seasons)
    
    # Split the data
    X_train, y_train = X[train_mask], y[train_mask]
    X_val, y_val = X[val_mask], y[val_mask]
    X_test, y_test = X[test_mask], y[test_mask]
    
    return X_train, X_val, X_test, y_train, y_val, y_test

In [13]:
X_train, X_val, X_test, y_train, y_val, y_test = split_data(
    df_all_years,
    target_column='win',
    drop_columns=[
        'season_type',
        'team_id',
        'opponent_id',
        'team_conference',
        'opponent_conference',
        'start_date'
        ]
    )

##  3.  Model Comparison

### 3.1 Logistic Regression

In [19]:
from sklearn.linear_model import LogisticRegression
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import GridSearchCV

def create_improved_logistic_regression_model(X_train, X_val, X_test, y_train, y_val, y_test):
    # Create a pipeline with more steps and options
    pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy='mean')),
        ('scaler', StandardScaler()),
        ('poly', PolynomialFeatures(degree=2, include_bias=False)),
        ('logistic_regression', LogisticRegression(random_state=42, max_iter=1000))
    ])
    
    # Define parameter grid for GridSearchCV
    param_grid = {
        'poly__degree': [1],
        'logistic_regression__C': [0.01, 0.1, 1, 10],
        'logistic_regression__penalty': ['l2'],
        'logistic_regression__solver': ['lbfgs']
    }
    
    # Perform grid search
    grid_search = GridSearchCV(pipeline, param_grid, cv=3, scoring='accuracy', n_jobs=-1)
    grid_search.fit(X_train, y_train)
    
    # Get the best model
    best_model = grid_search.best_estimator_
    
    # Make predictions
    train_predictions = best_model.predict(X_train)
    val_predictions = best_model.predict(X_val)
    test_predictions = best_model.predict(X_test)
    
    # Calculate accuracy
    train_accuracy = accuracy_score(y_train, train_predictions)
    val_accuracy = accuracy_score(y_val, val_predictions)
    test_accuracy = accuracy_score(y_test, test_predictions)
    
    # Print results
    print("Best parameters:", grid_search.best_params_)
    print("\nValidation Set Classification Report:")
    print(classification_report(y_val, val_predictions))
    print(f"Training Accuracy: {train_accuracy:.4f}")
    print(f"Validation Accuracy: {val_accuracy:.4f}")
    print(f"Test Accuracy: {test_accuracy:.4f}")
    
    # Get feature importances
    feature_importances = abs(best_model.named_steps['logistic_regression'].coef_[0])
    feature_names = X_train.columns
    
    # Print top 10 feature importances
    print("\nTop 10 Feature Importances:")
    for name, importance in sorted(zip(feature_names, feature_importances), key=lambda x: x[1], reverse=True)[:10]:
        print(f"Feature '{name}': {importance:.4f}")
    
    return best_model

# Usage
improved_model = create_improved_logistic_regression_model(X_train, X_val, X_test, y_train, y_val, y_test)

Best parameters: {'logistic_regression__C': 0.1, 'logistic_regression__penalty': 'l2', 'logistic_regression__solver': 'lbfgs', 'poly__degree': 1}

Validation Set Classification Report:
              precision    recall  f1-score   support

           0       0.72      0.72      0.72      1405
           1       0.72      0.72      0.72      1405

    accuracy                           0.72      2810
   macro avg       0.72      0.72      0.72      2810
weighted avg       0.72      0.72      0.72      2810

Training Accuracy: 0.7276
Validation Accuracy: 0.7185
Test Accuracy: 0.6978

Top 10 Feature Importances:
Feature 'is_home': 0.4589
Feature 'conference_game': 0.3897
Feature 'defense_total_ppa_weighted': 0.3354
Feature 'defense_passing_plays.ppa_weighted': 0.2818
Feature 'offense_passing_plays.ppa_last_3': 0.2706
Feature 'defense_passing_plays.ppa_last_3': 0.2546
Feature 'offense_total_ppa_weighted': 0.2511
Feature 'offense_passing_plays.total_ppa_last_3': 0.2418
Feature 'offense_ppa_

### 3.2 Random Forest

In [15]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.metrics import classification_report, accuracy_score

def create_lightweight_random_forest(X_train, X_val, X_test, y_train, y_val, y_test):
    # Create a pipeline with reduced complexity
    pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy='mean')),  # Handle null values
        ('scaler', StandardScaler()),  # Scale features
        ('rf', RandomForestClassifier(n_estimators=50, max_depth=10, random_state=42))  # Reduced parameters
    ])
    
    # Fit the model
    pipeline.fit(X_train, y_train)
    
    # Make predictions
    train_predictions = pipeline.predict(X_train)
    val_predictions = pipeline.predict(X_val)
    test_predictions = pipeline.predict(X_test)
    
    # Calculate accuracies
    train_accuracy = accuracy_score(y_train, train_predictions)
    val_accuracy = accuracy_score(y_val, val_predictions)
    test_accuracy = accuracy_score(y_test, test_predictions)
    
    # Print results
    print("\nValidation Set Classification Report:")
    print(classification_report(y_val, val_predictions))
    print(f"Training Accuracy: {train_accuracy:.4f}")
    print(f"Validation Accuracy: {val_accuracy:.4f}")
    print(f"Test Accuracy: {test_accuracy:.4f}")
    
    # Feature importance (top 10)
    feature_importance = pipeline.named_steps['rf'].feature_importances_
    feature_names = X_train.columns
    print("\nTop 10 Feature Importances:")
    for name, importance in sorted(zip(feature_names, feature_importance), key=lambda x: x[1], reverse=True)[:10]:
        print(f"Feature '{name}': {importance:.4f}")
    
    return pipeline

# Usage
lightweight_model = create_lightweight_random_forest(X_train, X_val, X_test, y_train, y_val, y_test)


Validation Set Classification Report:
              precision    recall  f1-score   support

           0       0.69      0.67      0.68      1405
           1       0.68      0.69      0.68      1405

    accuracy                           0.68      2810
   macro avg       0.68      0.68      0.68      2810
weighted avg       0.68      0.68      0.68      2810

Training Accuracy: 0.8824
Validation Accuracy: 0.6801
Test Accuracy: 0.6661

Top 10 Feature Importances:
Feature 'is_home': 0.0374
Feature 'yardsPerPass_weighted': 0.0216
Feature 'kickingPoints_weighted': 0.0173
Feature 'puntReturnYards_weighted': 0.0166
Feature 'defense_second_level_yards_weighted': 0.0162
Feature 'defense_standard_downs.success_rate_weighted': 0.0161
Feature 'defense_success_rate_weighted': 0.0159
Feature 'offense_success_rate_weighted': 0.0152
Feature 'totalYards_weighted': 0.0146
Feature 'rushingTDs_weighted': 0.0145


### 3.3 XGBoost

In [18]:
import pandas as pd
import numpy as np
from xgboost import XGBClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.metrics import classification_report, accuracy_score

def create_xgboost_model(X_train, X_val, X_test, y_train, y_val, y_test):
    # Create a pipeline
    pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy='mean')),  # Handle null values
        ('scaler', StandardScaler()),  # Scale features
        ('xgb', XGBClassifier(
            n_estimators=100,  # Number of boosting rounds
            max_depth=3,       # Maximum tree depth
            learning_rate=0.1, # Learning rate
            subsample=0.8,     # Subsample ratio of the training instances
            colsample_bytree=0.8, # Subsample ratio of columns when constructing each tree
            random_state=42,
            use_label_encoder=False,  # Avoid warning about label encoder
            eval_metric='logloss'     # Evaluation metric
        ))
    ])
    
    # Fit the model
    pipeline.fit(X_train, y_train)
    
    # Make predictions
    train_predictions = pipeline.predict(X_train)
    val_predictions = pipeline.predict(X_val)
    test_predictions = pipeline.predict(X_test)
    
    # Calculate accuracies
    train_accuracy = accuracy_score(y_train, train_predictions)
    val_accuracy = accuracy_score(y_val, val_predictions)
    test_accuracy = accuracy_score(y_test, test_predictions)
    
    # Print results
    print("\nValidation Set Classification Report:")
    print(classification_report(y_val, val_predictions))
    print(f"Training Accuracy: {train_accuracy:.4f}")
    print(f"Validation Accuracy: {val_accuracy:.4f}")
    print(f"Test Accuracy: {test_accuracy:.4f}")
    
    # Feature importance (top 10)
    feature_importance = pipeline.named_steps['xgb'].feature_importances_
    feature_names = X_train.columns
    print("\nTop 10 Feature Importances:")
    for name, importance in sorted(zip(feature_names, feature_importance), key=lambda x: x[1], reverse=True)[:10]:
        print(f"Feature '{name}': {importance:.4f}")
    
    return pipeline

# Usage
xgboost_model = create_xgboost_model(X_train, X_val, X_test, y_train, y_val, y_test)

Parameters: { "use_label_encoder" } are not used.




Validation Set Classification Report:
              precision    recall  f1-score   support

           0       0.71      0.73      0.72      1405
           1       0.72      0.70      0.71      1405

    accuracy                           0.71      2810
   macro avg       0.72      0.71      0.71      2810
weighted avg       0.72      0.71      0.71      2810

Training Accuracy: 0.7537
Validation Accuracy: 0.7149
Test Accuracy: 0.7026

Top 10 Feature Importances:
Feature 'win_rate_last_5': 0.1065
Feature 'yardsPerPass_weighted': 0.0653
Feature 'win_rate_last_10': 0.0588
Feature 'kickingPoints_weighted': 0.0523
Feature 'is_home': 0.0236
Feature 'rushingTDs_weighted': 0.0235
Feature 'defense_success_rate_weighted': 0.0192
Feature 'offense_success_rate_weighted': 0.0183
Feature 'offense_passing_plays.success_rate_weighted': 0.0164
Feature 'defense_success_rate_last_3': 0.0130


In [23]:
from xgboost import plot_importance
import matplotlib.pyplot as plt

plot_importance(xgboost_model)
plt.show()

ValueError: tree must be Booster, XGBModel or dict instance