In [724]:
import pandas as pd
import random
import warnings
from numpy import array 
warnings.filterwarnings("ignore", category=DeprecationWarning) 
warnings.simplefilter(action='ignore', category=FutureWarning)





In [725]:
def get_data(path: str = 'data.csv') -> pd.DataFrame:
    """Load the team-season statistics table and keep seasons after 2018.

    Steps performed:
      * read the CSV at ``path`` (defaults to 'data.csv');
      * drop the 'wins' and 'losses' columns;
      * add 'comp_pct' = 'pass_cmp' / 'pass_att';
      * keep only rows whose 'year' is greater than 2018;
      * remove the underscore from every literal '_y' in the column names
        (for example a column named 'pass_yds' becomes 'passyds').

    Parameters
    ----------
    path : str
        Location of the CSV file to read.

    Returns
    -------
    pd.DataFrame
        An independent copy, so callers can add or overwrite columns without
        triggering pandas' SettingWithCopyWarning.
    """
    stats = pd.read_csv(path).drop(columns=['wins', 'losses'])
    stats['comp_pct'] = stats['pass_cmp'] / stats['pass_att']

    # Boolean filtering returns a slice of the original frame; copy it because
    # prep_season_data() assigns new columns to the result.
    stats = stats[stats['year'] > 2018].copy()

    # NOTE(review): presumably done so the later '_y' -> suffix rename in
    # mege_dfs() does not touch stat columns such as '*_yds' -- confirm.
    # regex=False makes the literal match explicit across pandas versions.
    stats.columns = stats.columns.str.replace('_y', 'y', regex=False)

    return stats

# Lift the column cap so wide frames are rendered with every column visible.
pd.options.display.max_columns = None


def prep_season_data() -> pd.DataFrame:
    """Turn the season table from get_data() into per-game values.

    Every column that is not in the exclusion list is divided by the 'g'
    column (presumably games played -- confirm against the data source).
    Missing 'ties' are filled with 0, 'mov' is (re)computed as
    'points_diff' / 'g', and 'g' itself is dropped from the result.

    Returns
    -------
    pd.DataFrame
        The per-game season table without the 'g' column.
    """
    season = get_data()
    season['ties'] = season['ties'].fillna(0)

    to_not_average = ['year', 'team', 'ties', 'win_loss_perc', 'yds_per_play_offense', 'pass_net_yds_per_att', 'rush_yds_per_att','score_pct', 'turnover_pct','g', "comp_pct", "points_diff", "mov" ]

    # get_data() rewrites '_y' -> 'y' in column names, so entries such as
    # 'rush_yds_per_att' arrive here as 'rushyds_per_att'. Exclude both
    # spellings; otherwise those rate columns would be divided by 'g' too.
    excluded = set(to_not_average) | {name.replace('_y', 'y') for name in to_not_average}

    per_game_cols = [col for col in season.columns if col not in excluded]
    if per_game_cols:
        season[per_game_cols] = season[per_game_cols].div(season['g'], axis=0)

    season['mov'] = season['points_diff'] / season['g']
    return season.drop(columns=['g'])


In [726]:
def get_team_df():
    """Read 'nfl_teams.csv' and return it without three unused columns.

    The columns removed are 'team_id_pfr', 'team_conference_pre2002' and
    'team_division_pre2002'; everything else is returned as read.
    """
    unused_columns = ["team_id_pfr", "team_conference_pre2002", "team_division_pre2002"]
    return pd.read_csv('nfl_teams.csv').drop(columns=unused_columns)

def get_games_df():
    """Load game results/lines and keep post-2018 games from week 6 onward.

    Reads 'spreadspoke_scores.csv', keeps rows with 'schedule_season' > 2018,
    removes regular-season weeks 1-5 (too little in-season data to base a
    prediction on), and adds 'spread_favorite_sort', the absolute value of
    'spread_favorite'.

    Returns
    -------
    pd.DataFrame
        An independent copy of the filtered games with the extra
        'spread_favorite_sort' column.
    """
    games = pd.read_csv('spreadspoke_scores.csv')

    # 'schedule_week' may be parsed as text (when non-numeric labels are
    # present) or as numbers (when they are not). Comparing numerically
    # handles both; non-numeric labels become NaN and are therefore kept.
    week_number = pd.to_numeric(games['schedule_week'], errors='coerce')
    is_early_week = week_number.isin([1, 2, 3, 4, 5])
    is_recent_season = games['schedule_season'] > 2018

    # Copy the filtered slice so adding a column below does not raise
    # SettingWithCopyWarning.
    games = games[is_recent_season & ~is_early_week].copy()
    games['spread_favorite_sort'] = games['spread_favorite'].abs()

    return games
    

def get_stadiums():
    """Return the contents of 'nfl_stadiums.csv' as a DataFrame, unmodified."""
    return pd.read_csv('nfl_stadiums.csv')

def mege_dfs():
    """Build one row per game with the home and away teams' season stats.

    Pipeline:
      1. Season stats (prep_season_data) are joined to team metadata
         (get_team_df) on 'team' == 'team_name'.
      2. Games (get_games_df) are joined to stadium info (get_stadiums),
         neutral-site games are removed, and unused stadium/weather columns
         are dropped.
      3. The season table is left-joined to the games twice: first on the
         home team, then on the away team. Overlapping columns from the two
         joins are labelled with '_home' / '_away'.
      4. 'home_fav' and 'home_win' flags are added, rows are sorted by
         'spread_favorite_sort', and missing 'over_under_line' values are
         filled with the column mean.

    Returns
    -------
    pd.DataFrame
        The merged frame. Because both joins are left joins from the season
        table, team-seasons without a matching game still appear, with
        missing values in the game columns.
    """
    team_df = get_team_df()
    # NOTE(review): this previously called `prep_data()`, which is not defined
    # in the cells shown here and fails on a fresh kernel. `prep_season_data()`
    # is the season-level loader defined above -- confirm it is the intended one.
    season_df = prep_season_data()
    season_df = season_df.merge(team_df, left_on='team', right_on='team_name', how='left')

    stadiums = get_stadiums()
    games_df = get_games_df()
    games_df = games_df.merge(stadiums, left_on='stadium', right_on='stadium_name', how='left')
    games_df = games_df[games_df['stadium_neutral'] == False]

    # Non-inplace drop: the frame above is a filtered slice.
    games_df = games_df.drop(columns=['stadium_name', 'stadium_location', 'stadium_open', 'stadium_close', 'stadium_type', 'stadium_address', 'stadium_weather_station_zipcode', 'stadium_surface', 'stadium_weather_station', 'stadium_weather_station_name', 'stadium_latitude', 'stadium_longitude', 'stadium_azimuthangle', 'stadium_elevation', 'weather_temperature', 'weather_wind_mph', 'weather_humidity', 'weather_detail', 'stadium_capacity', "stadium"])

    # First join: season stats of the HOME team (no column overlap yet, so no
    # suffixes are added here).
    games_df = season_df.merge(games_df, left_on=['year', 'team'], right_on=['schedule_season', 'team_home' ], how='left')
    # Second join: season stats of the AWAY team. The left frame is matched on
    # 'team_away', so pandas' default '_x' suffix marks the away team's
    # columns and '_y' marks the home team's columns from the first join.
    games_df = season_df.merge(games_df, left_on=['year', 'team'], right_on=['schedule_season', 'team_away' ], how='left')

    # Rename the default merge suffixes: '_x' (left = away) -> '_away',
    # '_y' (right = home) -> '_home'. The mapping used to be the other way
    # round, which swapped the two teams' stats and made 'home_fav' compare
    # against the away team's id.
    games_df.columns = games_df.columns.str.replace('_x', '_away', regex=False)
    games_df.columns = games_df.columns.str.replace('_y', '_home', regex=False)

    games_df['home_fav'] = games_df['team_favorite_id'] == games_df['team_id_home']
    games_df['home_win'] = games_df['score_home'] > games_df['score_away']
    games_df = games_df.sort_values(by='spread_favorite_sort', ascending=True)
    games_df['over_under_line'] = games_df['over_under_line'].astype(float)
    games_df['over_under_line'] = games_df['over_under_line'].fillna(games_df['over_under_line'].mean())

    return games_df

df = mege_dfs()

# Columns removed from the merged frame, listed in groups. Order and contents
# are kept exactly as before ('team_away' is listed twice, which is harmless
# for DataFrame.drop).
columns_to_drop = (
    # betting/schedule bookkeeping
    ['spread_favorite_sort', 'stadium_neutral', 'team_favorite_id', 'team_away',
     'schedule_playoff', 'team_home', 'schedule_season', 'schedule_week']
    # away-team metadata
    + ['team_name_away', 'team_name_short_away', 'team_id_away',
       'team_conference_away', 'team_division_away']
    + ['schedule_date', 'team_away']
    # home-team metadata
    + ['team_name_home', 'team_name_short_home', 'team_id_home',
       'team_conference_home', 'team_division_home']
    # duplicated join keys and ties
    + ['year_away', 'year_home', 'ties_home']
)

df = df.drop(columns=columns_to_drop)
df


# Inspect the categories of 'stadium_weather_type' and how often each occurs,
# as a first step toward binning rare group values.
# Only the last expression in a cell is rendered, so the output below is the
# value_counts() result; the unique() call on its own displays nothing.
df['stadium_weather_type'].unique()
df['stadium_weather_type'].value_counts()
# Most games are either cold or indoor.
# TODO: the original note breaks off after "Since these both" -- finish the thought.

stadium_weather_type
cold        379
indoor      262
moderate    189
warm        150
Name: count, dtype: int64