In [193]:
import pandas as pd
from numpy import array 
from scipy.stats import skew
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.impute import KNNImputer
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import ExtraTreesClassifier
import random
pd.set_option('display.max_columns', None)




In [194]:
def get_data()->pd.DataFrame:
    df = pd.read_csv('data/data.csv')
    df.drop(columns=['wins', 'losses'], inplace=True)
    df['comp_pct'] = df['pass_cmp']/df['pass_att']
    df = df[df['year'] > 2018]
    df.columns = df.columns.str.replace('_y', 'y')

    return df



def prep_season_data()->pd.DataFrame:
    df = get_data()
    to_not_average = ['year', 'team', 'ties', 'win_loss_perc', 'yds_per_play_offense', 'pass_net_yds_per_att', 'rush_yds_per_att','score_pct', 'turnover_pct','g', "comp_pct", "points_diff", "mov" ]
    for col in df.columns:
        if col not in to_not_average:
            df[col] = df[col]/df['g']
    df['mov'] = df['points_diff']/ df['g']
    return df.drop(columns=['g'])


In [195]:
def get_team_df():
    team_df = pd.read_csv('data/nfl_teams.csv')
    team_df.drop(columns=["team_id_pfr","team_conference_pre2002", "team_division_pre2002" ], inplace=True)
    return team_df

def get_games_df():
    games= pd.read_csv('data/spreadspoke_scores.csv')
    games = games[games['schedule_season'] >2018]
    games = games[games['schedule_week'] != '1']
    games = games[games['schedule_week'] != '2']
    games = games[games['schedule_week'] !='3']
    games = games[games['schedule_week'] !='4']
    games = games[games['schedule_week'] !='5']

    games['spread_favorite_sort'] = abs(games['spread_favorite'])
    
    return games
    

def get_stadiums():
    stadiums = pd.read_csv('data/nfl_stadiums.csv')
    return stadiums

def mege_dfs():
    team_df = get_team_df()
    df = prep_season_data()
    df = df.merge(team_df, left_on='team', right_on='team_name', how='left')
    stadiums = get_stadiums()
    games_df = get_games_df()
    games_df = games_df.merge(stadiums, left_on='stadium', right_on='stadium_name', how='left')
    games_df =games_df[games_df['stadium_neutral'] == False]

    games_df.drop(columns=['stadium_name', 'stadium_location', 'stadium_open', 'stadium_close', 'stadium_type', 'stadium_address', 'stadium_weather_station_zipcode', 'stadium_surface', 'stadium_weather_station', 'stadium_weather_station_name', 'stadium_latitude', 'stadium_longitude', 'stadium_azimuthangle', 'stadium_elevation', 'weather_temperature', 'weather_wind_mph', 'weather_humidity', 'weather_detail', 'stadium_capacity', "stadium"], inplace=True)
    games_df = df.merge(games_df, left_on=['year', 'team'], right_on=['schedule_season', 'team_home' ], how='left')
    games_df = df.merge(games_df, left_on=['year', 'team'], right_on=['schedule_season', 'team_away' ], how='left')
    games_df.columns = games_df.columns.str.replace('_x', '_home')
    games_df.columns = games_df.columns.str.replace('_y', '_away')
    games_df['home_fav'] = games_df['team_favorite_id'] == games_df['team_id_home']
    games_df['home_win'] = games_df['score_home'] > games_df['score_away']
    games_df = games_df.sort_values(by='spread_favorite_sort', ascending=True)
    games_df['over_under_line'] = games_df['over_under_line'].astype(float)
    games_df['over_under_line'] = games_df['over_under_line'].fillna(games_df['over_under_line'].mean())

    columns_to_drop = [
    'spread_favorite_sort', 'stadium_neutral', 'team_favorite_id', 'team_away', 
    'schedule_playoff', 'team_home', 'schedule_season', 'schedule_week', 
    'team_name_away', 'team_name_short_away', 'team_id_away', 
    'team_conference_away', 'team_division_away', 'schedule_date', 
    'team_away', 'team_name_home', 'team_name_short_home', 'team_id_home', 
    'team_conference_home', 'team_division_home', 'year_away', 'year_home','over_under_line', 'score_home', 'score_away'
    ]
    games_df.drop(columns=columns_to_drop, inplace=True)
    return games_df.head(550)

def bin_data(df):
    df_weather = pd.DataFrame({"Count": df['stadium_weather_type'].value_counts()})
    df_weather['Proportion'] = df_weather['Count'] / df.shape[0]
    #bin moderate and warm together since they make up a smaller portion and warmer weather is not 
    # a bigger factor in october
    df['stadium_weather_type'] = df['stadium_weather_type'].replace(['moderate', 'warm'], 'moderate/warm')
    return df


def check_skew(df):
    cols_to_drop = []
    for col in df.columns:
        if df[col].dtype in ['int64', 'float64']:
            skewness = skew(df[col])
            if skewness > 1 or skewness < -1:
                print(f'{col} has a skew of {skewness}')
                cols_to_drop.append(col)
    if len(cols_to_drop) > 0:
        df.drop(columns=cols_to_drop, inplace=True)
    return df

def missing_data(df):
    missing_data = pd.DataFrame({"Count": df.isnull().sum()})
    missing_data['Proportion'] = missing_data['Count'] / df.shape[0]
    missing_data = missing_data[missing_data['Count'] > 0]
    for col in missing_data.index:
        df[col] = df[col].fillna(0)
    return df


def Xandy(df, label):
    y = df[label]
    X = df.drop(columns=[label])
    return X, y

def dummy_code(X):
    X = pd.get_dummies(X, drop_first=True, dtype=float)
    return X
    
def minmax(X):
    X = pd.DataFrame(MinMaxScaler().fit_transform(X.copy()), columns=X.columns, index=X.index)
    return X

def impute_KNN(df, label, neighbors=5 ):
    df = dummy_code(df.copy())
    X, y = Xandy(df, label)
    X = minmax(X.copy())
    imp = KNNImputer(n_neighbors=neighbors, weights="uniform")
    X = pd.DataFrame(imp.fit_transform(X), columns=X.columns, index=X.index)
    return X.merge(y, left_index=True, right_index=True)



def fs_select_trees(X, y, label="home_win"):
    X, y = Xandy(df, label)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=random.randint(0,1000))
    clf = ExtraTreesClassifier(criterion='log_loss',n_estimators=1000)
    clf.fit(X_train, y_train)
    print(clf.score(X_test, y_test))
    clf = clf.fit(X, y)
    sel = SelectFromModel(clf, prefit=True)
    sel.transform(X)
    columns = list(X.columns[sel.get_support()])
    columns.append(label)
    return df[columns]




df = mege_dfs()
df = missing_data(df)
df = bin_data(df)
df= check_skew(df)
df = impute_KNN(df, 'home_win')
df = fs_select_trees(df, 'home_win')
df

ties_home has a skew of 4.17396641210445
ties_away has a skew of 3.847293299756682
0.49696969696969695




Unnamed: 0,win_loss_perc_home,points_home,points_diff_home,mov_home,yds_per_play_offense_home,score_pct_home,comp_pct_home,win_loss_perc_away,points_opp_away,points_diff_away,mov_away,penaltiesyds_away,pen_fd_away,turnover_pct_away,spread_favorite,home_fav,home_win
53,0.587847,0.582365,0.615551,0.615551,0.80,0.354430,0.648878,0.461823,0.503401,0.423841,0.408163,0.215531,0.937269,0.450617,1.0,1.0,False
679,0.232497,0.155399,0.306695,0.315843,0.40,0.265823,0.283834,0.429803,0.431973,0.390728,0.377751,0.439452,0.265683,0.345679,1.0,1.0,True
639,0.388375,0.188916,0.330454,0.338204,0.40,0.243671,0.413070,0.176108,0.579232,0.161148,0.155796,0.114291,0.383764,0.592593,1.0,1.0,False
87,0.257596,0.371929,0.444924,0.444924,0.72,0.588608,0.630360,0.461823,0.659864,0.216336,0.195011,0.957211,0.090406,0.351852,1.0,1.0,True
36,0.339498,0.365454,0.336933,0.336933,0.60,0.433544,0.195457,0.538177,0.265306,0.419426,0.403628,0.568938,0.905904,0.740741,1.0,1.0,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
746,0.388375,0.286422,0.429806,0.431711,0.60,0.363924,0.631539,0.937192,0.335934,0.743929,0.719221,0.170970,0.413284,0.382716,0.0,0.0,False
754,0.388375,0.393068,0.416847,0.419515,0.52,0.610759,0.377507,0.429803,0.339136,0.417219,0.403361,0.408129,0.177122,0.549383,0.0,0.0,True
155,0.257596,0.381642,0.181425,0.181425,0.40,0.382911,0.279312,0.923645,0.299320,0.589404,0.578231,0.380349,0.811808,0.172840,0.0,0.0,True
879,0.698811,0.600267,0.473002,0.472367,0.52,0.718354,0.602091,0.357143,0.537615,0.139073,0.134454,0.214226,0.413284,0.277778,0.0,1.0,True
