# Model Selection

1. Data Preparation
   - Load the data
   - Split into features and target
   - Create train/test split

2. Define Evaluation Metrics
   - Accuracy, Precision, Recall, F1-score for win prediction
   - Mean Absolute Error, Mean Squared Error for score prediction

3. Model Comparison (for win prediction)
   - Logistic Regression
   - Random Forest
   - Gradient Boosting (e.g., XGBoost)
   - Support Vector Machines

4. Model Comparison (for score prediction)
   - Linear Regression
   - Decision Trees
   - Random Forest
   - Gradient Boosting

5. Cross-Validation
   - Implement k-fold cross-validation for each model

6. Hyperparameter Tuning
   - Use GridSearchCV or RandomizedSearchCV for best models

7. Final Model Selection
   - Choose the best model based on cross-validation results
   - Evaluate on the test set

8. Save Best Models
   - Save the best models and their corresponding scalers

In [27]:
%load_ext autoreload
%autoreload 2
# Add the project root to the Python path
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath('__file__'))))

import sys
import os
import sqlite3
import pandas as pd
import joblib
from datetime import datetime
from sklearn.model_selection import train_test_split
from src.visualization.distribution_plots import visualize_null_values

# Add the project root to the Python path
notebook_dir = os.path.dirname(os.path.abspath('__file__'))
project_root = os.path.dirname(notebook_dir)
sys.path.append(project_root)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## 1.   Data Preparation

In [8]:
target_db_path = '../data/04_features/features_teams.db'
conn = sqlite3.connect(target_db_path)

# Read the data into a DataFrame
df_all_years = pd.read_sql_query("SELECT * FROM features_teams", conn)

# Close the connection
conn.close()

In [12]:
# Load the data
df_all_years = pd.read_parquet('../data/03_processed/preprocessed_all_years.parquet')
df_2016_plus = pd.read_parquet('../data/03_processed/preprocessed_2016_plus.parquet')

In [5]:
def split_data(df, target_column, drop_columns=None, test_size=0.2, val_size=0.2, random_state=42):
    """
    Split the data into train, validation, and test sets based on years.
    
    Parameters:
    df (pd.DataFrame): The input DataFrame
    target_column (str): The name of the target column
    drop_columns (list): List of column names to drop from features. If None, use all columns except target and 'season'
    test_size (float): Proportion of data to use for test set
    val_size (float): Proportion of non-test data to use for validation set
    random_state (int): Random state for reproducibility
    
    Returns:
    tuple: (X_train, X_val, X_test, y_train, y_val, y_test)
    """
    
    # Sort the DataFrame by year to ensure chronological order
    df = df.sort_values('year')
    
    # Define columns to drop
    if drop_columns is None:
        drop_columns = []
    drop_columns = set(drop_columns + [target_column, 'year'])
    
    # Select feature columns (all columns except those in drop_columns)
    feature_columns = [col for col in df.columns if col not in drop_columns]
    
    # Split features (X) and target (y)
    X = df[feature_columns]
    y = df[target_column]
    
    # Get unique years
    years = df['year'].unique()
    
    # Calculate the number of years for test and validation
    n_years = len(years)
    n_test_years = max(1, int(n_years * test_size))
    n_val_years = max(1, int((n_years - n_test_years) * val_size))
    
    # Split years into train, validation, and test
    test_years = years[-n_test_years:]
    val_years = years[-(n_test_years + n_val_years):-n_test_years]
    train_years = years[:-(n_test_years + n_val_years)]
    
    # Create masks for each split
    test_mask = df['year'].isin(test_years)
    val_mask = df['year'].isin(val_years)
    train_mask = df['year'].isin(train_years)
    
    # Split the data
    X_train, y_train = X[train_mask], y[train_mask]
    X_val, y_val = X[val_mask], y[val_mask]
    X_test, y_test = X[test_mask], y[test_mask]
    
    return X_train, X_val, X_test, y_train, y_val, y_test

In [13]:
X_train, X_val, X_test, y_train, y_val, y_test = split_data(
    df_all_years[df_all_years['year'] < 2024],
    target_column='win',
    drop_columns=[
        'season_type',
        'team_id',
        'opponent_id',
        'team_conference',
        'opponent_conference',
        'start_date'
        ]
    )

##  3.  Model Comparison

### 3.1 Logistic Regression

In [18]:
from sklearn.linear_model import LogisticRegression
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import GridSearchCV

def create_improved_logistic_regression_model(X_train, X_val, X_test, y_train, y_val, y_test):
    # Create a pipeline with more steps and options
    pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy='mean')),
        ('scaler', StandardScaler()),
        ('poly', PolynomialFeatures(degree=2, include_bias=False)),
        ('logistic_regression', LogisticRegression(random_state=42, max_iter=1000))
    ])
    
    # Define parameter grid for GridSearchCV
    param_grid = {
        'poly__degree': [1],
        'logistic_regression__C': [0.01, 0.1, 1, 10],
        'logistic_regression__penalty': ['l2'],
        'logistic_regression__solver': ['lbfgs']
    }
    
    # Perform grid search
    grid_search = GridSearchCV(pipeline, param_grid, cv=3, scoring='accuracy', n_jobs=-1)
    grid_search.fit(X_train, y_train)
    
    # Get the best model
    best_model = grid_search.best_estimator_
    
    # Make predictions
    train_predictions = best_model.predict(X_train)
    val_predictions = best_model.predict(X_val)
    test_predictions = best_model.predict(X_test)
    
    # Calculate accuracy
    train_accuracy = accuracy_score(y_train, train_predictions)
    val_accuracy = accuracy_score(y_val, val_predictions)
    test_accuracy = accuracy_score(y_test, test_predictions)
    
    # Print results
    print("Best parameters:", grid_search.best_params_)
    print("\nValidation Set Classification Report:")
    print(classification_report(y_val, val_predictions))
    print(f"Training Accuracy: {train_accuracy:.4f}")
    print(f"Validation Accuracy: {val_accuracy:.4f}")
    print(f"Test Accuracy: {test_accuracy:.4f}")
    
    # Get feature importances
    feature_importances = abs(best_model.named_steps['logistic_regression'].coef_[0])
    feature_names = X_train.columns
    
    # Print top 10 feature importances
    print("\nTop 10 Feature Importances:")
    for name, importance in sorted(zip(feature_names, feature_importances), key=lambda x: x[1], reverse=True)[:10]:
        print(f"Feature '{name}': {importance:.4f}")
    
    return best_model

# Usage
logistic_model = create_improved_logistic_regression_model(X_train, X_val, X_test, y_train, y_val, y_test)

Best parameters: {'logistic_regression__C': 0.01, 'logistic_regression__penalty': 'l2', 'logistic_regression__solver': 'lbfgs', 'poly__degree': 1}

Validation Set Classification Report:
              precision    recall  f1-score   support

           0       0.68      0.68      0.68      1485
           1       0.68      0.68      0.68      1485

    accuracy                           0.68      2970
   macro avg       0.68      0.68      0.68      2970
weighted avg       0.68      0.68      0.68      2970

Training Accuracy: 0.6849
Validation Accuracy: 0.6805
Test Accuracy: 0.6696

Top 10 Feature Importances:
Feature 'is_home': 0.5039
Feature 'points_allowed_last_3': 0.2823
Feature 'points_scored_last_3': 0.2387
Feature 'conference_game': 0.2068
Feature 'rushingAttempts_last_3': 0.1565
Feature 'win_rate_last_10': 0.1515
Feature 'yardsPerRushAttempt_last_3': 0.1037
Feature 'win_rate_last_3': 0.1030
Feature 'rushingYards_last_3': 0.1027
Feature 'offense_line_yards_last_10': 0.1004


### 3.2 Random Forest

In [23]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.metrics import classification_report, accuracy_score

def create_lightweight_random_forest(X_train, X_val, X_test, y_train, y_val, y_test):
    # Create a pipeline with reduced complexity
    pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy='mean')),  # Handle null values
        ('scaler', StandardScaler()),  # Scale features
        ('rf', RandomForestClassifier(n_estimators=1000, max_depth=6, random_state=42))  # Reduced parameters
    ])
    
    # Fit the model
    pipeline.fit(X_train, y_train)
    
    # Make predictions
    train_predictions = pipeline.predict(X_train)
    val_predictions = pipeline.predict(X_val)
    test_predictions = pipeline.predict(X_test)
    
    # Calculate accuracies
    train_accuracy = accuracy_score(y_train, train_predictions)
    val_accuracy = accuracy_score(y_val, val_predictions)
    test_accuracy = accuracy_score(y_test, test_predictions)
    
    # Print results
    print("\nValidation Set Classification Report:")
    print(classification_report(y_val, val_predictions))
    print(f"Training Accuracy: {train_accuracy:.4f}")
    print(f"Validation Accuracy: {val_accuracy:.4f}")
    print(f"Test Accuracy: {test_accuracy:.4f}")
    
    # Feature importance (top 10)
    feature_importance = pipeline.named_steps['rf'].feature_importances_
    feature_names = X_train.columns
    print("\nTop 10 Feature Importances:")
    for name, importance in sorted(zip(feature_names, feature_importance), key=lambda x: x[1], reverse=True)[:10]:
        print(f"Feature '{name}': {importance:.4f}")
    
    return pipeline

# Usage
forest_model = create_lightweight_random_forest(X_train, X_val, X_test, y_train, y_val, y_test)


Validation Set Classification Report:
              precision    recall  f1-score   support

           0       0.70      0.57      0.63      1485
           1       0.64      0.76      0.69      1485

    accuracy                           0.66      2970
   macro avg       0.67      0.66      0.66      2970
weighted avg       0.67      0.66      0.66      2970

Training Accuracy: 0.7113
Validation Accuracy: 0.6646
Test Accuracy: 0.6641

Top 10 Feature Importances:
Feature 'is_home': 0.1050
Feature 'win_rate_last_10': 0.0623
Feature 'win_rate_last_5': 0.0568
Feature 'points_allowed_last_3': 0.0473
Feature 'win_rate_last_3': 0.0427
Feature 'points_scored_last_3': 0.0368
Feature 'win_rate_last_1': 0.0311
Feature 'offense_success_rate_last_10': 0.0227
Feature 'points_allowed_last_1': 0.0174
Feature 'defense_ppa_last_10': 0.0160


### 3.3 XGBoost

In [25]:
import pandas as pd
import numpy as np
from xgboost import XGBClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.metrics import classification_report, accuracy_score

def create_xgboost_model(X_train, X_val, X_test, y_train, y_val, y_test):
    # Create a pipeline
    pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy='mean')),  # Handle null values
        ('scaler', StandardScaler()),  # Scale features
        ('xgb', XGBClassifier(
            n_estimators=500,  # Number of boosting rounds
            max_depth=4,       # Maximum tree depth
            learning_rate=0.1, # Learning rate
            subsample=0.8,     # Subsample ratio of the training instances
            colsample_bytree=0.8, # Subsample ratio of columns when constructing each tree
            random_state=42,
            use_label_encoder=False,  # Avoid warning about label encoder
            eval_metric='logloss'     # Evaluation metric
        ))
    ])
    
    # Fit the model
    pipeline.fit(X_train, y_train)
    
    # Make predictions
    train_predictions = pipeline.predict(X_train)
    val_predictions = pipeline.predict(X_val)
    test_predictions = pipeline.predict(X_test)
    
    # Calculate accuracies
    train_accuracy = accuracy_score(y_train, train_predictions)
    val_accuracy = accuracy_score(y_val, val_predictions)
    test_accuracy = accuracy_score(y_test, test_predictions)
    
    # Print results
    print("\nValidation Set Classification Report:")
    print(classification_report(y_val, val_predictions))
    print(f"Training Accuracy: {train_accuracy:.4f}")
    print(f"Validation Accuracy: {val_accuracy:.4f}")
    print(f"Test Accuracy: {test_accuracy:.4f}")
    
    # Feature importance (top 10)
    feature_importance = pipeline.named_steps['xgb'].feature_importances_
    feature_names = X_train.columns
    print("\nTop 10 Feature Importances:")
    for name, importance in sorted(zip(feature_names, feature_importance), key=lambda x: x[1], reverse=True)[:10]:
        print(f"Feature '{name}': {importance:.4f}")
    
    return pipeline

# Usage
xgboost_model = create_xgboost_model(X_train, X_val, X_test, y_train, y_val, y_test)

Parameters: { "use_label_encoder" } are not used.




Validation Set Classification Report:
              precision    recall  f1-score   support

           0       0.68      0.63      0.65      1485
           1       0.65      0.70      0.68      1485

    accuracy                           0.66      2970
   macro avg       0.67      0.66      0.66      2970
weighted avg       0.67      0.66      0.66      2970

Training Accuracy: 0.9181
Validation Accuracy: 0.6646
Test Accuracy: 0.6533

Top 10 Feature Importances:
Feature 'win_rate_last_5': 0.0683
Feature 'is_home': 0.0577
Feature 'win_rate_last_10': 0.0469
Feature 'conference_game': 0.0177
Feature 'points_allowed_last_3': 0.0134
Feature 'offense_success_rate_last_10': 0.0118
Feature 'neutral_site': 0.0113
Feature 'win_rate_last_1': 0.0111
Feature 'points_scored_last_3': 0.0111
Feature 'win_rate_last_3': 0.0108


## Scratch

In [29]:
from src.utils.team_pairs import get_team_pairs

# Get team pairs
team_pairs = dict(get_team_pairs())

# Filter the data for 2024 week 2 games
df_2024_week2 = df_all_years[(df_all_years['year'] == 2024) & (df_all_years['week'] == 2)]

# Prepare the features for prediction
X_predict = df_2024_week2.drop(['win', 'year', 'week', 'season_type', 'team_id', 'opponent_id', 'team_conference', 'opponent_conference', 'start_date'], axis=1)

# Ensure X_predict has the same columns as X_train
missing_cols = set(X_train.columns) - set(X_predict.columns)
for col in missing_cols:
    X_predict[col] = 0

X_predict = X_predict[X_train.columns]

# Predict probabilities for 2024 week 2 games using all three models
logistic_probabilities = logistic_model.predict_proba(X_predict)[:, 1]
forest_probabilities = forest_model.predict_proba(X_predict)[:, 1]
xgboost_probabilities = xgboost_model.predict_proba(X_predict)[:, 1]

# Create the result DataFrame
result_df = pd.DataFrame({
    'year': df_2024_week2['year'],
    'week': df_2024_week2['week'],
    'team_id': df_2024_week2['team_id'],
    'team': df_2024_week2['team_id'].map(team_pairs),
    'opponent_id': df_2024_week2['opponent_id'],
    'opponent': df_2024_week2['opponent_id'].map(team_pairs),
    'logistic_win_probability': logistic_probabilities,
    'forest_win_probability': forest_probabilities,
    'xgboost_win_probability': xgboost_probabilities,
    'avg_win_probability': (logistic_probabilities + forest_probabilities + xgboost_probabilities) / 3
})

# Sort by xgboost win probability in descending order
result_df = result_df.sort_values('xgboost_win_probability', ascending=False)

# Display the result
print(result_df)

# Save the result as a parquet file
output_dir = '../models/win_probability'
os.makedirs(output_dir, exist_ok=True)
output_file = f'{output_dir}/prediction_{df_2024_week2["year"].iloc[0]}_{df_2024_week2["week"].iloc[0]}.parquet'
result_df.to_parquet(output_file, index=False)

print(f"Predictions saved to: {output_file}")

       year  week  team_id              team  opponent_id            opponent  \
9890   2024     2      130          Michigan          251               Texas   
10025  2024     2      264        Washington         2199    Eastern Michigan   
10149  2024     2       97        Louisville           55  Jacksonville State   
9902   2024     2      201          Oklahoma          248             Houston   
10156  2024     2      228           Clemson         2026   Appalachian State   
...     ...   ...      ...               ...          ...                 ...   
20526  2024     2     2184          Duquesne          103      Boston College   
20393  2024     2     2710  Western Illinois           84             Indiana   
20266  2024     2     2377           McNeese          245           Texas A&M   
20276  2024     2     2016      Alcorn State          238          Vanderbilt   
20643  2024     2      399            Albany          277       West Virginia   

       logistic_win_probabi