# Model Selection

-   Compare different algorithms (logistic, trees, gradient boosting, etc.)
-   Perform initial cross-validation
-   Select best models for further tuning

In [1]:
%load_ext autoreload
%autoreload 2

import sys
import os
import pandas as pd

# Add the project root to the Python path
notebook_dir = os.path.dirname(os.path.abspath('__file__'))
project_root = os.path.dirname(notebook_dir)
sys.path.append(project_root)

In [2]:
# Load features from the parquet file
df = pd.read_parquet('../data/mvp_features.parquet')

## MVP Game Win Predictor

In [5]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
import numpy as np

# Select features for the model
features = ['totalYards', 'rushingYards', 'netPassingYards', 'turnovers', 'firstDowns', 
            'offense_success_rate', 'defense_success_rate', 'offense_ppa', 'defense_ppa', 
            'team_talent', 'all_time_win_rate', 'season_win_rate']

X = df[features]
y = df['win']

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Train the model
model = LogisticRegression(random_state=42)
model.fit(X_train_scaled, y_train)

def predict_winner(home_team_id, away_team_id):
    home_data = df[df['team_id'] == home_team_id].iloc[-1][features]
    away_data = df[df['team_id'] == away_team_id].iloc[-1][features]
    
    # Create a feature vector considering home advantage
    combined_data = np.array([home_data.values - away_data.values])
    scaled_data = scaler.transform(combined_data)
    
    probability = model.predict_proba(scaled_data)[0][1]
    winner = home_team_id if probability > 0.5 else away_team_id
    win_probability = probability if probability > 0.5 else 1 - probability
    
    return winner, win_probability

In [8]:
def predict_game(home_team_id, away_team_id):
    winner, probability = predict_winner(home_team_id, away_team_id)

    print(f"\nPrediction for Team ID {home_team_id} (Home) vs Team ID {away_team_id} (Away):")
    print(f"Predicted winner: Team ID {winner}")
    print(f"Win probability: {probability:.2f}")

In [9]:
# miami vs florida
predict_game(2390, 57)


Prediction for Team ID 2390 (Home) vs Team ID 57 (Away):
Predicted winner: Team ID 2390
Win probability: 0.94




In [10]:
# north texas and south alabama
predict_game(249, 6)


Prediction for Team ID 249 (Home) vs Team ID 6 (Away):
Predicted winner: Team ID 6
Win probability: 1.00




In [11]:
# notre dame vs texas A&M
predict_game(87, 245)


Prediction for Team ID 87 (Home) vs Team ID 245 (Away):
Predicted winner: Team ID 87
Win probability: 1.00




# MVP Game Score Predictor

In [13]:
df.columns

Index(['season', 'week', 'team_id', 'opponent_id', 'matchup', 'is_home',
       'neutral_site', 'conference_game', 'team_points', 'opponent_points',
       'totalYards', 'rushingYards', 'netPassingYards', 'turnovers',
       'firstDowns', 'offense_success_rate', 'defense_success_rate',
       'offense_explosiveness', 'defense_explosiveness', 'offense_ppa',
       'defense_ppa', 'team_talent', 'opponent_talent', 'win',
       'team_vs_team_win_rate', 'games_played_in_season', 'all_time_win_rate',
       'season_win_rate'],
      dtype='object')

In [12]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
import numpy as np

# Select features for the score prediction model
score_features = ['totalYards', 'rushingYards', 'netPassingYards', 'turnovers', 'firstDowns', 
                  'offense_success_rate', 'defense_success_rate', 'offense_ppa', 'defense_ppa', 
                  'team_talent', 'all_time_win_rate', 'season_win_rate']

X_score = df[score_features]
y_score = df[['points', 'opponent_points']]

# Split the data
X_train_score, X_test_score, y_train_score, y_test_score = train_test_split(X_score, y_score, test_size=0.2, random_state=42)

# Scale the features
scaler_score = StandardScaler()
X_train_score_scaled = scaler_score.fit_transform(X_train_score)
X_test_score_scaled = scaler_score.transform(X_test_score)

# Train the score prediction model
score_model = RandomForestRegressor(n_estimators=100, random_state=42)
score_model.fit(X_train_score_scaled, y_train_score)

def predict_score(home_team_id, away_team_id):
    home_data = df[df['team_id'] == home_team_id].iloc[-1][score_features]
    away_data = df[df['team_id'] == away_team_id].iloc[-1][score_features]
    
    # Create feature vectors considering home advantage
    home_features = np.array([home_data.values])
    away_features = np.array([away_data.values])
    
    scaled_home = scaler_score.transform(home_features)
    scaled_away = scaler_score.transform(away_features)
    
    home_score, away_score = score_model.predict(scaled_home)[0]
    away_score_2, home_score_2 = score_model.predict(scaled_away)[0]
    
    # Average the predictions for better accuracy
    home_score = (home_score + home_score_2) / 2
    away_score = (away_score + away_score_2) / 2
    
    return round(home_score), round(away_score)

KeyError: "['points'] not in index"