In [30]:
import numpy as np
import pandas as pd
import datetime

### Cleaning
  * game data from https://github.com/ryurko/nflscrapR-data/tree/master/games_data
  * elo data from https://github.com/fivethirtyeight/data/tree/master/nfl-elo

In [31]:
# Load the raw inputs from the local ./data directory.
# Elo ratings file (source linked in the Cleaning notes above).
elo = pd.read_csv('./data/nfl_elo_update.csv')
# Game results file; it carries the game_id field used later for the weather join.
games = pd.read_csv('./data/all_games.csv')

#### simple cleaning tasks
  * remove old games from the elo csv (all_games.csv only has games from 2009 onward)
  * drop rows with NA values from games (e.g., NA in a score field)
  * the games file is needed for its game_id field --> it also maps to the weather and stadium datasets

In [32]:
# Only keep Elo rows dated 2009-01-01 or later, because that is all the
# game data we have.
start = datetime.datetime(2009, 1, 1)
elo['date'] = pd.to_datetime(elo['date'])
# Use the games dataset's column names so the two frames can be joined on them.
elo = elo.rename(columns={"team1": "home_team", "team2": "away_team"})
# `>=` keeps a game dated exactly 2009-01-01 ("2009 onward").
# .copy() makes the filtered frame independent, so later column assignments
# on `elo` do not trigger SettingWithCopyWarning.
elo = elo[elo['date'] >= start].copy()
elo.head()

Unnamed: 0,date,season,neutral,playoff,home_team,away_team,elo1_pre,elo2_pre,elo_prob1,elo_prob2,...,qb2_game_value,qb1_value_post,qb2_value_post,qbelo1_post,qbelo2_post,score1,score2,quality,importance,total_rating
13593,2009-01-03,2008,0,w,LAC,IND,1593.456,1691.593,0.452456,0.547544,...,175.85798,162.089678,214.250408,1592.437892,1641.64418,23.0,17.0,96.0,,
13594,2009-01-03,2008,0,w,ARI,ATL,1493.926,1558.51,0.500599,0.499401,...,22.757157,173.533778,111.792835,1496.953402,1523.763584,30.0,24.0,58.0,,
13595,2009-01-04,2008,0,w,MIA,BAL,1545.051,1644.229,0.450972,0.549028,...,85.936527,136.71657,92.921789,1496.142146,1647.734835,9.0,27.0,85.0,,
13596,2009-01-04,2008,0,w,MIN,PHI,1583.539,1616.177,0.546438,0.453562,...,156.518539,100.661065,160.098468,1551.382149,1633.941356,14.0,26.0,93.0,,
13597,2009-01-10,2008,0,d,TEN,BAL,1653.015,1670.38,0.568126,0.431874,...,126.14533,94.427242,96.244143,1623.516815,1665.774048,10.0,13.0,97.0,,


In [33]:
# Keep only games where both the home and the away score are present,
# then remove the 'Unnamed: 0' column.
games = (
    games
    .loc[games[['home_score', 'away_score']].notna().all(axis=1)]
    .drop(columns='Unnamed: 0')
)
games.head()

Unnamed: 0,type,game_id,home_team,away_team,week,season,state_of_game,game_url,home_score,away_score
0,post,2017010700,HOU,OAK,18,2016,POST,http://www.nfl.com/liveupdate/game-center/2017...,27.0,14.0
1,post,2017010701,SEA,DET,18,2016,POST,http://www.nfl.com/liveupdate/game-center/2017...,26.0,6.0
2,post,2017010800,PIT,MIA,18,2016,POST,http://www.nfl.com/liveupdate/game-center/2017...,30.0,12.0
3,post,2017010801,GB,NYG,18,2016,POST,http://www.nfl.com/liveupdate/game-center/2017...,38.0,13.0
4,post,2017011400,ATL,SEA,18,2016,POST,http://www.nfl.com/liveupdate/game-center/2017...,36.0,20.0


### Ensure team abbreviations have 1:1 mapping across datasets

In [34]:
# Team abbreviations must map 1:1 across datasets, so compare the
# home-team codes used by each one (elo codes are upper-cased first).
elo_teams = [abbr.upper() for abbr in elo['home_team'].unique()]
games_teams = games['home_team'].unique()

# Set differences in both directions.
elo_diff = set(elo_teams) - set(games_teams)   # in elo but not in games
game_diff = set(games_teams) - set(elo_teams)  # in games but not in elo
print(f'whats in elo and not in games: {elo_diff}')
print(f'whats in games and not elo: {game_diff}\n')

whats in elo and not in games: {'WSH', 'LAR'}
whats in games and not elo: {'CRT', 'STL', 'WAS', 'NPR', 'JAC', 'APR', 'SAN', 'LA', 'SD', 'RIC'}



#### Mappings that need to be fixed
 * SD --> chargers (games)
 * JAC --> jaguars (games)
 * LA --> rams (games)
 * WAS --> commanders (games)
 * STL --> rams (games) 

 * map rams to --> LAR
 * map commanders to --> WSH
 * map chargers to --> LAC

 * drop APR, CRT, NPR, RIC, SAN (games)

 * ELO dataset is clean
 * LAR --> rams (elo)
 * WSH --> commanders (elo)

In [35]:
# Abbreviations that appear only in the games data and are dropped
# (each one listed once).
nonsense_teams = ['APR', 'CRT', 'NPR', 'RIC', 'SAN']
# Remove every game in which either side is one of those teams.
# .copy() makes the filtered frame independent, so later column assignments
# on `games` do not trigger SettingWithCopyWarning.
games = games[
    ~(games['home_team'].isin(nonsense_teams) | games['away_team'].isin(nonsense_teams))
].copy()
# Convert elo abbreviations to upper case. The .str accessor leaves missing
# values as NaN instead of raising the way str.upper would on a float.
elo['home_team'] = elo['home_team'].str.upper()
elo['away_team'] = elo['away_team'].str.upper()

In [36]:
# correct mapping abbreviations
def correct_mapping(name):
    """Translate a games-dataset team abbreviation to the one used by elo.

    'LA' and 'STL' become 'LAR', 'WAS' becomes 'WSH', 'JAC' becomes 'JAX'
    and 'SD' becomes 'LAC'. Any other value is returned unchanged.
    """
    renamed = {
        'LA': 'LAR',   # rams
        'STL': 'LAR',  # rams
        'WAS': 'WSH',
        'JAC': 'JAX',
        'SD': 'LAC',
    }
    return renamed.get(name, name)

# Rewrite the games abbreviations so both datasets use the same codes.
games['home_team'] = games['home_team'].apply(correct_mapping)
games['away_team'] = games['away_team'].apply(correct_mapping)

# sanity check: recompute the differences in both directions after remapping
elo_teams = elo['home_team'].unique()
games_teams = games['home_team'].unique()
elo_diff = set(elo_teams) - set(games_teams)   # in elo but not in games
game_diff = set(games_teams) - set(elo_teams)  # in games but not in elo
print(f'whats in elo and not in games: {elo_diff}')
print(f'whats in games and not elo: {game_diff}\n')

whats in elo and not in games: set()
whats in games and not elo: set()



### Join ELO and games datasets using dates and team abbreviations

In [37]:
# create date column in games dataset
# parse date from url
def date_from_url(url):
    """
    Extract the game date from a game url.

    The url is expected to contain ``...center/<id>/...`` where ``<id>`` has
    the form YYYYMMDDXX. Only the leading 8 characters (YYYYMMDD) are
    returned; the trailing XX is not needed.

    Parameters
    ----------
    url : str
        Game url containing the substring 'center'.

    Returns
    -------
    str
        The 8-digit date string, e.g. '20170107'.

    Raises
    ------
    ValueError
        If 'center' does not occur in the url, or if the 8 characters that
        follow 'center/' are not all digits.
    """
    marker = 'center'
    i = url.index(marker) + len(marker) + 1  # +1 because of /
    date = url[i:i+8]
    # Fail loudly here instead of handing a truncated or non-numeric string
    # on to a later datetime conversion.
    if len(date) != 8 or not date.isdigit():
        raise ValueError(f'no YYYYMMDD date found after "center/" in url: {url!r}')
    return date

# Build a datetime column from the YYYYMMDD string embedded in each game url.
# The explicit format avoids relying on pandas' format inference for the
# compact 8-digit strings.
games['date'] = pd.to_datetime(games['game_url'].apply(date_from_url), format='%Y%m%d')

In [38]:
# Join games with elo on the shared date and team-abbreviation columns.
# No `how` is given, so pandas' default join type applies.
complete_df = games.merge(elo, on=['date', 'home_team', 'away_team'])

#### Join weather data
 * data from https://www.datawithbliss.com/weather-data

In [39]:
# preprocessing of weather dataset
weather = pd.read_csv('./data/weather_data/games_weather.csv')
weather_cols = ['game_id', 'TimeMeasure', 'Temperature', 'DewPoint', 'Humidity',
                'Precipitation', 'WindSpeed', 'WindDirection', 'Pressure',
                'EstimatedCondition']
# Keep only the desired columns. .copy() makes the subset an independent
# frame, so adding the 'date' column below does not trigger
# SettingWithCopyWarning.
weather = weather[weather_cols].copy()

# Parse the measurement timestamp and keep measurements from 2009-01-01 onward.
start = datetime.datetime(2009, 1, 1)
weather['date'] = pd.to_datetime(weather['TimeMeasure'])
weather = weather[weather['date'] >= start]

# Each game has multiple weather measurements (before, during, after the game);
# keep the earliest one. Sorting by measurement time first guarantees that
# keep='first' selects the earliest row rather than whichever row happens to
# come first in the csv. The stable sort preserves file order among ties.
print(f'shape prior to dropping duplicate game_ids: {weather.shape}')
weather = (
    weather
    .sort_values('date', kind='stable')
    .drop_duplicates(subset=['game_id'], keep='first')
)
print(f'shape after dropping duplicate game_ids: {weather.shape}')

shape prior to dropping duplicate game_ids: (21842, 11)
shape after dropping duplicate game_ids: (3207, 11)


In [40]:
# Left join: every row of complete_df is kept, with the weather columns
# attached wherever a weather row shares its game_id.
complete_df = complete_df.merge(weather, how='left', on='game_id')
complete_df.head()

Unnamed: 0,type,game_id,home_team,away_team,week,season_x,state_of_game,game_url,home_score,away_score,...,TimeMeasure,Temperature,DewPoint,Humidity,Precipitation,WindSpeed,WindDirection,Pressure,EstimatedCondition,date_y
0,post,2017010700,HOU,OAK,18,2016,POST,http://www.nfl.com/liveupdate/game-center/2017...,27.0,14.0,...,1/7/2017 15:00,41.0,13.28,32.0,0.0,11.43,330.0,30.6728,Clear,2017-01-07 15:00:00
1,post,2017010701,SEA,DET,18,2016,POST,http://www.nfl.com/liveupdate/game-center/2017...,26.0,6.0,...,1/7/2017 16:58,28.4,24.8,86.0,,9.29,,29.8149,,2017-01-07 16:58:00
2,post,2017010800,PIT,MIA,18,2016,POST,http://www.nfl.com/liveupdate/game-center/2017...,30.0,12.0,...,1/8/2017 13:00,15.08,1.04,53.0,0.0,18.33,310.0,30.6137,Clear,2017-01-08 13:00:00
3,post,2017010801,GB,NYG,18,2016,POST,http://www.nfl.com/liveupdate/game-center/2017...,38.0,13.0,...,1/8/2017 15:00,12.92,3.02,64.0,0.0,8.08,210.0,30.5901,Clear,2017-01-08 15:00:00
4,post,2017011400,ATL,SEA,18,2016,POST,http://www.nfl.com/liveupdate/game-center/2017...,36.0,20.0,...,1/14/2017 16:00,73.04,49.82,44.0,0.0,3.36,300.0,30.3509,Clear,2017-01-14 16:00:00


In [41]:
# address NA values
# fill precipitation with 0 and pressure with avg pressure
complete_df['Precipitation'] = complete_df['Precipitation'].fillna(value=0)
complete_df['Pressure'] = complete_df['Pressure'].fillna(value=complete_df['Pressure'].mean())

# drop WindDirection and EstimatedCondition (too many NAs to deal with),
# together with the elo columns importance and total_rating
drop_cols = ['WindDirection', 'EstimatedCondition', 'importance', 'total_rating']
complete_df = complete_df.drop(columns=drop_cols)

# one-hot encode the playoff column; a missing value is encoded as "n"
complete_df['playoff'] = complete_df['playoff'].fillna('n')
# NOTE: the prefix 'Playoff_' plus pandas' own '_' separator yields column
# names with a double underscore (e.g. 'Playoff__n').
playoff_one_hot = pd.get_dummies(complete_df['playoff'], prefix='Playoff_')
complete_df = complete_df.drop(columns='playoff').join(playoff_one_hot)

# binary indicator of whether home team won or not (a tie counts as 0);
# vectorised comparison instead of a row-wise apply
complete_df['home_win'] = (complete_df['home_score'] > complete_df['away_score']).astype(int)
complete_df.head(10)

Unnamed: 0,type,game_id,home_team,away_team,week,season_x,state_of_game,game_url,home_score,away_score,...,Precipitation,WindSpeed,Pressure,date_y,Playoff__c,Playoff__d,Playoff__n,Playoff__s,Playoff__w,home_win
0,post,2017010700,HOU,OAK,18,2016,POST,http://www.nfl.com/liveupdate/game-center/2017...,27.0,14.0,...,0.0,11.43,30.6728,2017-01-07 15:00:00,0,0,0,0,1,1
1,post,2017010701,SEA,DET,18,2016,POST,http://www.nfl.com/liveupdate/game-center/2017...,26.0,6.0,...,0.0,9.29,29.8149,2017-01-07 16:58:00,0,0,0,0,1,1
2,post,2017010800,PIT,MIA,18,2016,POST,http://www.nfl.com/liveupdate/game-center/2017...,30.0,12.0,...,0.0,18.33,30.6137,2017-01-08 13:00:00,0,0,0,0,1,1
3,post,2017010801,GB,NYG,18,2016,POST,http://www.nfl.com/liveupdate/game-center/2017...,38.0,13.0,...,0.0,8.08,30.5901,2017-01-08 15:00:00,0,0,0,0,1,1
4,post,2017011400,ATL,SEA,18,2016,POST,http://www.nfl.com/liveupdate/game-center/2017...,36.0,20.0,...,0.0,3.36,30.3509,2017-01-14 16:00:00,0,1,0,0,0,1
5,post,2017011401,NE,HOU,18,2016,POST,http://www.nfl.com/liveupdate/game-center/2017...,34.0,16.0,...,0.0,6.4,30.3849,2017-01-14 19:35:00,0,1,0,0,0,1
6,post,2017011501,DAL,GB,18,2016,POST,http://www.nfl.com/liveupdate/game-center/2017...,31.0,34.0,...,0.0,6.96,29.9582,2017-01-15 15:00:00,0,1,0,0,0,0
7,post,2017011500,KC,PIT,18,2016,POST,http://www.nfl.com/liveupdate/game-center/2017...,16.0,18.0,...,0.0,8.08,30.1944,2017-01-15 19:00:00,0,1,0,0,0,0
8,post,2017012200,ATL,GB,18,2016,POST,http://www.nfl.com/liveupdate/game-center/2017...,44.0,21.0,...,0.012,10.31,29.2701,2017-01-22 15:00:00,1,0,0,0,0,1
9,post,2017012201,NE,PIT,18,2016,POST,http://www.nfl.com/liveupdate/game-center/2017...,36.0,17.0,...,0.012,11.43,29.8814,2017-01-22 18:00:00,1,0,0,0,0,1


In [42]:
# sanity check
print(f'Total NA values in df: {complete_df.isna().sum().sum()}')
# Compare both teams' scores between the two sources; checking only the home
# score would let an away-score mismatch go unnoticed.
scores_match = (
    (complete_df['home_score'] == complete_df['score1'])
    & (complete_df['away_score'] == complete_df['score2'])
).all()
print(f"scores from games dataset match elo: {scores_match}")

Total NA values in df: 0
scores from games dataset match elo: True


In [44]:
# Columns retained in the final dataset: feature columns first, then labels.
ftrs = [
    # game context
    'type', 'home_team', 'away_team', 'week', 'date_x',
    # one-hot playoff indicators
    'Playoff__c', 'Playoff__d', 'Playoff__n', 'Playoff__s', 'Playoff__w',
    # elo columns
    'elo1_pre', 'elo2_pre', 'elo_prob1', 'elo_prob2',
    'qbelo1_pre', 'qbelo2_pre', 'qb1', 'qb2',
    'qb1_value_pre', 'qb2_value_pre', 'qb1_adj', 'qb2_adj',
    'qbelo_prob1', 'qbelo_prob2', 'quality',
    # weather columns
    'Temperature', 'DewPoint', 'Humidity',
    'Precipitation', 'WindSpeed', 'Pressure',
]
labels = ['home_win', 'home_score', 'away_score']
complete_df = complete_df.loc[:, ftrs + labels]

In [45]:
# Persist the joined, encoded dataset as a csv (relative path, so it lands in
# the current working directory).
# NOTE(review): no `index` argument is passed, so pandas' default applies and
# the row index is written as the first column. Confirm downstream readers
# expect that.
complete_df.to_csv('weather_game_elo_playoff_encoded.csv')

In [46]:
# latest nfl elo scores for games that haven't happened yet (to test the model in real time for the rest of the season)
# to be joined with weather prediction data to make predictions
latest = pd.read_csv('./data/nfl_elo_latest.csv')
# show the last rows of the file
latest.tail()

Unnamed: 0,date,season,neutral,playoff,team1,team2,elo1_pre,elo2_pre,elo_prob1,elo_prob2,...,qb2_game_value,qb1_value_post,qb2_value_post,qbelo1_post,qbelo2_post,score1,score2,quality,importance,total_rating
267,2023-01-08,2022,0,,CHI,MIN,1412.056659,1570.130531,0.369171,0.630829,...,,,,,,,,40,12,26
268,2023-01-08,2022,0,,WSH,DAL,1502.073155,1637.80913,0.399586,0.600414,...,,,,,,,,85,74,80
269,2023-01-08,2022,0,,JAX,TEN,1377.594909,1616.208521,0.269059,0.730941,...,,,,,,,,54,7,31
270,2023-01-08,2022,0,,MIA,NYJ,1555.234243,1473.171051,0.699846,0.300154,...,,,,,,,,61,85,73
271,2023-01-08,2022,0,,PHI,NYG,1593.300018,1435.429491,0.782951,0.217049,...,,,,,,,,66,78,72
