In [457]:
import pandas as pd
from numpy import array 
from scipy.stats import skew
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.impute import KNNImputer
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold, RepeatedKFold, cross_val_score
import random
import warnings

import sklearn.linear_model as lm, pandas as pd, sklearn.ensemble as se, numpy as np
from sklearn.model_selection import KFold, RepeatedKFold, cross_val_score
from numpy import mean, std
from sklearn import svm
from sklearn import gaussian_process
from sklearn.gaussian_process.kernels import DotProduct, WhiteKernel
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn import svm
from sklearn.naive_bayes import CategoricalNB
from xgboost import XGBClassifier
from sklearn import preprocessing
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import SelectFromModel
import pandas as pd
from numpy import mean, std
warnings.filterwarnings('ignore')
pd.set_option('display.max_columns', None)

YEAR = 2020


In [458]:
def get_data()->pd.DataFrame:
    df = pd.read_csv('data/data.csv')
    df.drop(columns=['wins', 'losses'], inplace=True)
    df['comp_pct'] = df['pass_cmp']/df['pass_att']
    df = df[df['year'] > YEAR]
    df.columns = df.columns.str.replace('_y', 'y')

    return df



def prep_season_data()->pd.DataFrame:
    df = get_data()
    to_not_average = ['year', 'team', 'ties', 'win_loss_perc', 'yds_per_play_offense', 'pass_net_yds_per_att', 'rush_yds_per_att','score_pct', 'turnover_pct','g', "comp_pct", "points_diff", "mov" ]
    for col in df.columns:
        if col not in to_not_average:
            df[col] = df[col]/df['g']
    df['mov'] = df['points_diff']/ df['g']
    return df.drop(columns=['g'])


In [459]:
def get_team_df():
    team_df = pd.read_csv('data/nfl_teams.csv')
    team_df.drop(columns=["team_id_pfr","team_conference_pre2002", "team_division_pre2002" ], inplace=True)
    return team_df

def get_games_df():
    games= pd.read_csv('data/spreadspoke_scores.csv')
    games = games[games['schedule_season'] > YEAR]
    games = games[games['schedule_week'] != '1']
    games = games[games['schedule_week'] != '2']
    games = games[games['schedule_week'] !='3']
    games = games[games['schedule_week'] !='4']
    games = games[games['schedule_week'] !='5']
    games = games[games['schedule_week'] !='6']

    games['spread_favorite_sort'] = abs(games['spread_favorite'])
    
    return games
    

def get_stadiums():
    stadiums = pd.read_csv('data/nfl_stadiums.csv')
    return stadiums

def mege_dfs():
    team_df = get_team_df()
    df = prep_season_data()
    df = df.merge(team_df, left_on='team', right_on='team_name', how='left')
    stadiums = get_stadiums()
    games_df = get_games_df()
    games_df = games_df.merge(stadiums, left_on='stadium', right_on='stadium_name', how='left')
    games_df =games_df[games_df['stadium_neutral'] == False]

    games_df.drop(columns=['stadium_name', 'stadium_location', 'stadium_open', 'stadium_close', 'stadium_type', 'stadium_address', 'stadium_weather_station_zipcode', 'stadium_surface', 'stadium_weather_station', 'stadium_weather_station_name', 'stadium_latitude', 'stadium_longitude', 'stadium_azimuthangle', 'stadium_elevation', 'weather_temperature', 'weather_wind_mph', 'weather_humidity', 'weather_detail', 'stadium_capacity', "stadium"], inplace=True)
    games_df = df.merge(games_df, left_on=['year', 'team'], right_on=['schedule_season', 'team_home' ], how='left')
    games_df = df.merge(games_df, left_on=['year', 'team'], right_on=['schedule_season', 'team_away' ], how='left')
    games_df.columns = games_df.columns.str.replace('_x', '_home')
    games_df.columns = games_df.columns.str.replace('_y', '_away')
    games_df['home_fav'] = games_df['team_favorite_id'] == games_df['team_id_home']
    games_df['home_win'] = games_df['score_home'] > games_df['score_away']
    games_df = games_df.sort_values(by='spread_favorite_sort', ascending=True)
    games_df['over_under_line'] = games_df['over_under_line'].astype(float)
    games_df['over_under_line'] = games_df['over_under_line'].fillna(games_df['over_under_line'].mean())

    columns_to_drop = [
    'spread_favorite_sort', 'stadium_neutral', 'team_favorite_id', 'team_away', 
    'schedule_playoff', 'team_home', 'schedule_season', 'schedule_week', 
    'team_name_away', 'team_name_short_away', 'team_id_away', 
    'team_conference_away', 'team_division_away', 'schedule_date', 
    'team_away', 'team_name_home', 'team_name_short_home', 'team_id_home', 
    'team_conference_home', 'team_division_home', 'year_away', 'year_home','over_under_line', 'score_home', 'score_away'
    ]
    games_df.drop(columns=columns_to_drop, inplace=True)
    return games_df.sample(n=550, random_state=1)

def bin_data(df):
    df_weather = pd.DataFrame({"Count": df['stadium_weather_type'].value_counts()})
    df_weather['Proportion'] = df_weather['Count'] / df.shape[0]
    #bin moderate and warm together since they make up a smaller portion and warmer weather is not 
    # a bigger factor in october
    df['stadium_weather_type'] = df['stadium_weather_type'].replace(['moderate', 'warm'], 'moderate/warm')
    return df


def check_skew(df):
    cols_to_drop = []
    for col in df.columns:
        if df[col].dtype in ['int64', 'float64']:
            skewness = skew(df[col])
            if skewness > 1 or skewness < -1:
                print(f'{col} has a skew of {skewness}')
                cols_to_drop.append(col)
    if len(cols_to_drop) > 0:
        df.drop(columns=cols_to_drop, inplace=True)
    return df

def missing_data(df):
    missing_data = pd.DataFrame({"Count": df.isnull().sum()})
    missing_data['Proportion'] = missing_data['Count'] / df.shape[0]
    missing_data = missing_data[missing_data['Count'] > 0]
    for col in missing_data.index:
        df[col] = df[col].fillna(0)
    return df


def Xandy(df, label):
    y = df[label]
    X = df.drop(columns=[label])
    return X, y

def dummy_code(X):
    X = pd.get_dummies(X, drop_first=True, dtype=float)
    return X
    
def minmax(X):
    X = pd.DataFrame(MinMaxScaler().fit_transform(X.copy()), columns=X.columns, index=X.index)
    return X

def impute_KNN(df, label, neighbors=5 ):
    df = dummy_code(df.copy())
    X, y = Xandy(df, label)
    X = minmax(X.copy())
    imp = KNNImputer(n_neighbors=neighbors, weights="uniform")
    X = pd.DataFrame(imp.fit_transform(X), columns=X.columns, index=X.index)
    return X.merge(y, left_index=True, right_index=True)


def select_features(df, label,  max='auto'):

    X, y = Xandy(df, label)
    clf = ExtraTreesClassifier(n_estimators=100)
    clf = clf.fit(X, y)
    model = SelectFromModel(clf, prefit=True)
    X_new = model.transform(X)
    selected_columns = X.columns[model.get_support()]
    return df[selected_columns].merge(y, left_index=True, right_index=True)


df = mege_dfs()
df = missing_data(df)
df = bin_data(df)
df= check_skew(df)
df = impute_KNN(df, 'home_win')
df.head()
df = select_features(df, 'home_win')
df

ties_home has a skew of 3.3982205579576967
ties_away has a skew of 3.7748213284572896
spread_favorite has a skew of -1.1136612667391173


Unnamed: 0,win_loss_perc_home,points_home,points_diff_home,totalyards_home,yds_per_play_offense_home,first_down_home,pass_td_home,pass_netyds_per_att_home,score_pct_home,exp_pts_tot_home,win_loss_perc_away,points_away,points_opp_away,points_diff_away,mov_away,pass_netyds_per_att_away,score_pct_away,exp_pts_tot_away,home_fav,home_win
224,0.665722,0.387755,0.587224,0.495638,0.56,0.484277,0.25000,0.380952,0.621262,0.515714,0.500000,0.261905,0.336100,0.390663,0.390663,0.404762,0.392027,0.417663,0.0,False
179,0.665722,0.649660,0.653563,0.744251,0.80,0.635220,0.46875,0.833333,0.697674,0.685983,0.832861,1.000000,0.336100,0.923833,0.923833,0.690476,0.784053,0.744285,0.0,False
113,0.165722,0.074830,0.113022,0.148295,0.24,0.245283,0.12500,0.214286,0.242525,0.207251,0.916431,0.935374,0.315353,0.889435,0.889435,0.642857,0.800664,0.865091,0.0,True
563,0.165722,0.319728,0.194103,0.396114,0.44,0.440252,0.21875,0.261905,0.491694,0.409302,0.582153,0.435374,0.518672,0.407862,0.407862,0.523810,0.617940,0.545167,0.0,True
401,0.749292,0.544218,0.584767,0.475813,0.28,0.408805,0.40625,0.309524,0.421927,0.185577,0.916431,0.840136,0.012448,1.000000,1.000000,0.619048,0.760797,0.687252,0.0,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
501,0.582153,0.500000,0.582310,0.540444,0.60,0.490566,0.65625,0.547619,0.647841,0.729892,0.832861,0.867347,0.087137,0.975430,0.975430,1.000000,0.833887,0.896999,0.0,True
253,0.208215,0.180272,0.162162,0.312450,0.28,0.314465,0.18750,0.238095,0.299003,0.185765,0.832861,0.785714,0.269710,0.808354,0.808354,0.523810,0.707641,0.649188,0.0,True
55,0.582153,0.731293,0.712531,0.551546,0.60,0.591195,0.50000,0.452381,0.767442,0.679805,0.665722,0.649660,0.365145,0.653563,0.653563,0.833333,0.697674,0.685983,0.0,False
123,0.500000,0.642857,0.498771,0.657415,0.64,0.452830,0.71875,0.595238,0.634551,0.565385,0.124646,0.302721,0.788382,0.152334,0.152334,0.357143,0.468439,0.429811,1.0,True


In [460]:
def fit_cv_classification_expanded(df, label, k=3, r=10, repeat=True, random_state=1):
        X = df.drop(columns=[label])
        y = df[label]
      
        if repeat:
          cv = RepeatedKFold(n_splits=k, n_repeats=r, random_state=random_state)
        else:
          cv = KFold(n_splits=k, random_state=random_state, shuffle=True)
        
        fit = {}    # Use this to store each of the fit metrics
        models = {} # Use this to store each of the models
        
        # Create the model objects
        model_log = lm.LogisticRegression(max_iter=100)
        model_logcv = lm.RidgeClassifier()
        model_sgd = lm.SGDClassifier(max_iter=1000, tol=1e-3)
        model_pa = lm.PassiveAggressiveClassifier(max_iter=1000, random_state=random_state, tol=1e-3)
        model_per = lm.Perceptron(fit_intercept=False, max_iter=10, tol=None, shuffle=False)
        model_knn = KNeighborsClassifier(n_neighbors=3)
        model_svm = svm.SVC(decision_function_shape='ovo') # Remove the parameter for two-class model
        model_nb = CategoricalNB()
        model_bag = se.BaggingClassifier(KNeighborsClassifier(), max_samples=0.5, max_features=0.5)
        model_ada = se.AdaBoostClassifier(n_estimators=100, random_state=random_state)
        model_ext = se.ExtraTreesClassifier(n_estimators=100, random_state=random_state)
        model_rf = se.RandomForestClassifier(n_estimators=10)
        model_hgb = se.HistGradientBoostingClassifier(max_iter=100)
        model_vot = se.VotingClassifier(estimators=[('lr', model_log), ('rf', model_ext), ('gnb', model_hgb)], voting='hard')
        model_gb = se.GradientBoostingClassifier(n_estimators=100, learning_rate=1.0, max_depth=1, random_state=0)
        estimators = [('ridge', lm.RidgeCV()), ('lasso', lm.LassoCV(random_state=random_state)), ('knr', KNeighborsRegressor(n_neighbors=20, metric='euclidean'))]
        final_estimator = se.GradientBoostingRegressor(n_estimators=25, subsample=0.5, min_samples_leaf=25, max_features=1, random_state=random_state)
        model_st = se.StackingRegressor(estimators=estimators, final_estimator=final_estimator)
        model_xgb = XGBClassifier( random_state=random_state, use_label_encoder=True, eval_metric='mlogloss')
        model_nn = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(5, 2), random_state=random_state)
      
        # Fit a cross-validated R squared score and add it to the dict
        fit['Logistic'] = mean(cross_val_score(model_log, X, y, scoring='accuracy', cv=cv, n_jobs=-1))
        fit['Ridge'] = mean(cross_val_score(model_logcv, X, y, scoring='accuracy', cv=cv, n_jobs=-1))
        fit['SGD'] = mean(cross_val_score(model_sgd, X, y, scoring='accuracy', cv=cv, n_jobs=-1))
        fit['PassiveAggressive'] = mean(cross_val_score(model_pa, X, y, scoring='accuracy', cv=cv, n_jobs=-1))
        fit['Perceptron'] = mean(cross_val_score(model_per, X, y, scoring='accuracy', cv=cv, n_jobs=-1))
        fit['KNN'] = mean(cross_val_score(model_knn, X, y, scoring='accuracy', cv=cv, n_jobs=-1))
        fit['SVM'] = mean(cross_val_score(model_svm, X, y, scoring='accuracy', cv=cv, n_jobs=-1))
        fit['NaiveBayes'] = mean(cross_val_score(model_nb, X, y, scoring='accuracy', cv=cv, n_jobs=-1))
        fit['Bagging'] = mean(cross_val_score(model_bag, X, y, scoring='accuracy', cv=cv, n_jobs=-1))
        fit['AdaBoost'] = mean(cross_val_score(model_ada, X, y, scoring='accuracy', cv=cv, n_jobs=-1))
        fit['ExtraTrees'] = mean(cross_val_score(model_ext, X, y, scoring='accuracy', cv=cv, n_jobs=-1))
        fit['RandomForest'] = mean(cross_val_score(model_rf, X, y, scoring='accuracy', cv=cv, n_jobs=-1))
        fit['HistGradient'] = mean(cross_val_score(model_hgb, X, y, scoring='accuracy', cv=cv, n_jobs=-1))
        fit['Voting'] = mean(cross_val_score(model_vot, X, y, scoring='accuracy', cv=cv, n_jobs=-1))
        fit['GradBoost'] = mean(cross_val_score(model_gb, X, y, scoring='accuracy', cv=cv, n_jobs=-1))
        fit['XGBoost'] = mean(cross_val_score(model_xgb, X, y, scoring='accuracy', cv=cv, n_jobs=-1))
        fit['NeuralN'] = mean(cross_val_score(model_nn, X, y, scoring='accuracy', cv=cv, n_jobs=-1))
        
        # XGBoost needs to LabelEncode the y before fitting the model
        from sklearn.preprocessing import LabelEncoder
        le = LabelEncoder().fit(y)
        y_encoded = le.transform(y.copy())
        fit['XGBoost'] = mean(cross_val_score(model_xgb, X, y_encoded, scoring='accuracy', cv=cv, n_jobs=-1))
      
        # Add the model to another dictionary; make sure the keys have the same names as the list above
        models['Logistic'] = model_log
        models['Ridge'] = model_logcv
        models['SGD'] = model_sgd
        models['PassiveAggressive'] = model_pa
        models['Perceptron'] = model_per
        models['KNN'] = model_knn
        models['SVM'] = model_svm
        models['NaiveBayes'] = model_nb
        models['Bagging'] = model_bag
        models['AdaBoost'] = model_ada
        models['ExtraTrees'] = model_ext
        models['RandomForest'] = model_rf
        models['HistGradient'] = model_hgb
        models['Voting'] = model_vot
        models['GradBoost'] = model_gb
        models['XGBoost'] = model_xgb
        models['NeuralN'] = model_nn
      
        # Add the fit dictionary to a new DataFrame, sort, extract the top row, use it to retrieve the model object from the models dictionary
        df_fit = pd.DataFrame({'Accuracy':fit})
        df_fit.sort_values(by=['Accuracy'], ascending=False, inplace=True)
        best_model = df_fit.index[0]
        print(df_fit)
      
        return models[best_model].fit(X, y)

model = fit_cv_classification_expanded(df, 'home_win')

NameError: name 'Logistic' is not defined