In [1]:
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
from datetime import datetime
from datetime import time
import pickle

In [2]:
# Setting pandas to display columns
pd.set_option('display.max_columns', None)

In [3]:
nfl_small = pd.read_csv('nfl_small.csv')
nfl_small.shape

(178962, 51)

In [4]:
# Dropping unused index column and sorting to allow for future calculations
nfl_small = nfl_small.drop(columns=['Unnamed: 0']).sort_values(by=['game_id','play_id'])

In [5]:
nfl_small.head(5)

Unnamed: 0,index,play_id,game_id,game_date,time,quarter_seconds_remaining,half_seconds_remaining,game_seconds_remaining,game_half,quarter_end,qtr,home_team,away_team,posteam,posteam_type,defteam,side_of_field,yardline_100,drive,sp,down,goal_to_go,yrdln,ydstogo,ydsnet,desc,play_type,yards_gained,pass_length,air_yards,yards_after_catch,field_goal_result,kick_distance,extra_point_result,two_point_conv_result,td_team,total_home_score,total_away_score,posteam_score,defteam_score,sack,touchdown,pass_touchdown,rush_touchdown,return_touchdown,extra_point_attempt,two_point_attempt,field_goal_attempt,fumble,complete_pass
0,270409,36,2015091000,2015-09-10,15:00,900.0,1800.0,3600.0,Half1,0,1,NE,PIT,PIT,away,NE,NE,35.0,1,0,,0.0,NE 35,0,0,S.Gostkowski kicks 65 yards from NE 35 to end ...,kickoff,0.0,,,,,,,,,0,0,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,270410,51,2015091000,2015-09-10,15:00,900.0,1800.0,3600.0,Half1,0,1,NE,PIT,PIT,away,NE,PIT,80.0,1,0,1.0,0.0,PIT 20,10,18,(15:00) De.Williams right tackle to PIT 38 for...,run,18.0,,,,,,,,,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,270411,72,2015091000,2015-09-10,14:21,861.0,1761.0,3561.0,Half1,0,1,NE,PIT,PIT,away,NE,PIT,62.0,1,0,1.0,0.0,PIT 38,10,31,(14:21) B.Roethlisberger pass short right to A...,pass,9.0,short,-4.0,13.0,,,,,,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,270412,101,2015091000,2015-09-10,14:04,844.0,1744.0,3544.0,Half1,0,1,NE,PIT,PIT,away,NE,PIT,53.0,1,0,2.0,0.0,PIT 47,1,31,(14:04) De.Williams right guard to NE 49 for 4...,run,4.0,,,,,,,,,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,270413,122,2015091000,2015-09-10,13:26,806.0,1706.0,3506.0,Half1,0,1,NE,PIT,PIT,away,NE,NE,49.0,1,0,1.0,0.0,NE 49,10,45,(13:26) B.Roethlisberger pass short right to H...,pass,14.0,short,9.0,5.0,,,,,,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


<h2>Data Cleaning</h2>

In [6]:
# Changing date object to date format
nfl_small['game_date'] = pd.to_datetime(nfl_small['game_date'])
nfl_small.head(2)

Unnamed: 0,index,play_id,game_id,game_date,time,quarter_seconds_remaining,half_seconds_remaining,game_seconds_remaining,game_half,quarter_end,qtr,home_team,away_team,posteam,posteam_type,defteam,side_of_field,yardline_100,drive,sp,down,goal_to_go,yrdln,ydstogo,ydsnet,desc,play_type,yards_gained,pass_length,air_yards,yards_after_catch,field_goal_result,kick_distance,extra_point_result,two_point_conv_result,td_team,total_home_score,total_away_score,posteam_score,defteam_score,sack,touchdown,pass_touchdown,rush_touchdown,return_touchdown,extra_point_attempt,two_point_attempt,field_goal_attempt,fumble,complete_pass
0,270409,36,2015091000,2015-09-10,15:00,900.0,1800.0,3600.0,Half1,0,1,NE,PIT,PIT,away,NE,NE,35.0,1,0,,0.0,NE 35,0,0,S.Gostkowski kicks 65 yards from NE 35 to end ...,kickoff,0.0,,,,,,,,,0,0,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,270410,51,2015091000,2015-09-10,15:00,900.0,1800.0,3600.0,Half1,0,1,NE,PIT,PIT,away,NE,PIT,80.0,1,0,1.0,0.0,PIT 20,10,18,(15:00) De.Williams right tackle to PIT 38 for...,run,18.0,,,,,,,,,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [7]:
# Inspecting datatypes
nfl_small.dtypes

index                                 int64
play_id                               int64
game_id                               int64
game_date                    datetime64[ns]
time                                 object
quarter_seconds_remaining           float64
half_seconds_remaining              float64
game_seconds_remaining              float64
game_half                            object
quarter_end                           int64
qtr                                   int64
home_team                            object
away_team                            object
posteam                              object
posteam_type                         object
defteam                              object
side_of_field                        object
yardline_100                        float64
drive                                 int64
sp                                    int64
down                                float64
goal_to_go                          float64
yrdln                           

In [7]:
# Creating a function that examines missing values in a dataframe easily. This can re-used later.
def null_count_func(dataframe):
    columns = list(dataframe.columns)

    for column in columns:
        total = dataframe[column].count()
        null_cnt = dataframe[column].isnull().sum()
        print('Column {} has {} real values, and {} null values'.format(column,total,null_cnt))

In [8]:
null_count_func(nfl_small)

Column index has 178962 real values, and 0 null values
Column play_id has 178962 real values, and 0 null values
Column game_id has 178962 real values, and 0 null values
Column game_date has 178962 real values, and 0 null values
Column time has 178891 real values, and 71 null values
Column quarter_seconds_remaining has 178891 real values, and 71 null values
Column half_seconds_remaining has 178867 real values, and 95 null values
Column game_seconds_remaining has 178869 real values, and 93 null values
Column game_half has 178962 real values, and 0 null values
Column quarter_end has 178962 real values, and 0 null values
Column qtr has 178962 real values, and 0 null values
Column home_team has 178962 real values, and 0 null values
Column away_team has 178962 real values, and 0 null values
Column posteam has 172898 real values, and 6064 null values
Column posteam_type has 173058 real values, and 5904 null values
Column defteam has 173058 real values, and 5904 null values
Column side_of_fiel

From the null check function, we see quite a few columns with nulls, but I am unsure if that is on purpose or just data gaps.

I plan to explore:   
1. time - Done - fill forward
2. quarter_seconds_remaining - Done - fill forward
3. half_seconds_remaining - Done - fill forward
4. game_seconds_remaining - Done - fill forward
5. posteam - Done - fill forward
6. posteam_type - Done - fill forward
7. defteam - Done - fill forward
8. yardline_100 - Done - dropping nulls
9. down - Done - dropping nulls
10. goal_to_go - Done - dropping nulls
11. play_type - Done - dropping nulls
12. pass_length (ensure it's de-duped from runs) - Done - Fill with zero. Most nulls are kicks or runs. Sacks are also null
13. air_yards (ensure it's de-duped from runs) - Done - Same as pass_length. Ensure we can use this for incomplete passes?
14. yards_after_catch (ensure it's de-duped from runs) - Done - Mostly the same as above, but also includes nulls for incomplete passes
15. posteam_score - Done - beginning of game, so fill with zero
16. defteam_score - Done - beginning of game, so fill with zero

In [9]:
nfl_small.shape

(178962, 50)

In [10]:
# Removing BLANK PLAY
blank_play_drop = list(nfl_small[nfl_small['desc']=='BLANK PLAY'].index)
nfl_small.drop(nfl_small.index[blank_play_drop], inplace=True)
nfl_small.shape

(178956, 50)

In [11]:
# Forward fill time for weather delays ***Potential Improvement - Incorprate Delay Times***
nfl_small['time'] = nfl_small['time'].fillna(method='pad')
nfl_small['quarter_seconds_remaining'] = nfl_small['quarter_seconds_remaining'].fillna(method='pad')
nfl_small['half_seconds_remaining'] = nfl_small['half_seconds_remaining'].fillna(method='pad')
nfl_small['game_seconds_remaining'] = nfl_small['game_seconds_remaining'].fillna(method='pad')

In [12]:
# Forward filling the possession columns
nfl_small['posteam'] = nfl_small['posteam'].fillna(method='pad')
nfl_small['posteam_type'] = nfl_small['posteam_type'].fillna(method='pad')
nfl_small['defteam'] = nfl_small['defteam'].fillna(method='pad')

In [13]:
# Other fill operations
nfl_small['yardline_100'] = nfl_small['yardline_100'].fillna(method='pad')
nfl_small['pass_length'] = nfl_small['pass_length'].fillna(0)
nfl_small['air_yards'] = nfl_small['air_yards'].fillna(0)
nfl_small['yards_after_catch'] = nfl_small['yards_after_catch'].fillna(0)
nfl_small['posteam_score'] = nfl_small['posteam_score'].fillna(0)
nfl_small['defteam_score'] = nfl_small['posteam_score'].fillna(0)

In [14]:
null_count_func(nfl_small)

Column index has 178956 real values, and 0 null values
Column play_id has 178956 real values, and 0 null values
Column game_id has 178956 real values, and 0 null values
Column game_date has 178956 real values, and 0 null values
Column time has 178956 real values, and 0 null values
Column quarter_seconds_remaining has 178956 real values, and 0 null values
Column half_seconds_remaining has 178956 real values, and 0 null values
Column game_seconds_remaining has 178956 real values, and 0 null values
Column game_half has 178956 real values, and 0 null values
Column quarter_end has 178956 real values, and 0 null values
Column qtr has 178956 real values, and 0 null values
Column home_team has 178956 real values, and 0 null values
Column away_team has 178956 real values, and 0 null values
Column posteam has 178956 real values, and 0 null values
Column posteam_type has 178956 real values, and 0 null values
Column defteam has 178956 real values, and 0 null values
Column side_of_field has 178683 

<h2>Field Exploration is Below. Summarizing and applying changes above</h2>

In [15]:
# Null time also affects other time columns so will fill all
nfl_small[nfl_small['time'].isnull()].head()

Unnamed: 0,index,play_id,game_id,game_date,time,quarter_seconds_remaining,half_seconds_remaining,game_seconds_remaining,game_half,quarter_end,qtr,home_team,away_team,posteam,posteam_type,defteam,side_of_field,yardline_100,drive,sp,down,goal_to_go,yrdln,ydstogo,ydsnet,desc,play_type,yards_gained,pass_length,air_yards,yards_after_catch,field_goal_result,kick_distance,extra_point_result,two_point_conv_result,td_team,total_home_score,total_away_score,posteam_score,defteam_score,sack,touchdown,pass_touchdown,rush_touchdown,return_touchdown,extra_point_attempt,two_point_attempt,field_goal_attempt,fumble,complete_pass


In [20]:
# Most look like stoppage in play. I will try to backfill from previous time
# nfl_small[nfl_small['time'].isnull()].groupby('desc')['desc'].count()

In [21]:
# Exploring blank plays, it looks like we can remove the rows from the dataset
nfl_small[(nfl_small['game_id']==2017091000) & (nfl_small['play_id'].between(1062,1102))]

Unnamed: 0,index,play_id,game_id,game_date,time,quarter_seconds_remaining,half_seconds_remaining,game_seconds_remaining,game_half,quarter_end,qtr,home_team,away_team,posteam,posteam_type,defteam,side_of_field,yardline_100,drive,sp,down,goal_to_go,yrdln,ydstogo,ydsnet,desc,play_type,yards_gained,pass_length,air_yards,yards_after_catch,field_goal_result,kick_distance,extra_point_result,two_point_conv_result,td_team,total_home_score,total_away_score,posteam_score,defteam_score,sack,touchdown,pass_touchdown,rush_touchdown,return_touchdown,extra_point_attempt,two_point_attempt,field_goal_attempt,fumble,complete_pass
242,362336,1062,2017091000,2017-09-10,13:59,839.0,839.0,2639.0,Half1,0,2,BUF,NYJ,BUF,home,NYJ,NYJ,34.0,5,0,2.0,0.0,NYJ 34,10,77,(13:59) (Shotgun) T.Taylor pass short middle t...,pass,12.0,short,4.0,8.0,,,,,,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
244,362338,1102,2017091000,2017-09-10,13:22,802.0,802.0,2602.0,Half1,0,2,BUF,NYJ,BUF,home,NYJ,NYJ,22.0,5,0,1.0,0.0,NYJ 22,10,71,(13:22) M.Tolbert right tackle to NYJ 8 for 14...,run,14.0,0,0.0,0.0,,,,,,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [20]:
blank_play_drop = list(nfl_small[nfl_small['desc']=='BLANK PLAY'].index)
blank_play_drop

[81929, 103172, 104369]

In [23]:
nfl_small.drop(nfl_small.index[blank_play_drop], inplace=True)

In [16]:
list(nfl_small[nfl_small['desc']=='BLANK PLAY'].index)

[81929, 103172, 104369]

In [17]:
# Exploring delays, it looks like we will need to account for delay times, but can drop rows after
# nfl_small[(nfl_small['game_id']==2017091710) & (nfl_small['play_id'].between(750,850))]

In [18]:
# Exploring end of half/game, it looks like we should also account for time, but can drop rows after
# nfl_small[(nfl_small['game_id']==2017092401) & (nfl_small['play_id'].between(2500,2600))]

In [21]:
# Forward fill time for weather delays ***Potential Improvement - Incorprate Delay Times***
nfl_small['time'] = nfl_small['time'].fillna(method='pad')
nfl_small['quarter_seconds_remaining'] = nfl_small['quarter_seconds_remaining'].fillna(method='pad')
nfl_small['half_seconds_remaining'] = nfl_small['half_seconds_remaining'].fillna(method='pad')
nfl_small['game_seconds_remaining'] = nfl_small['game_seconds_remaining'].fillna(method='pad')

<h2>Exploring posteam nulls</h2>
5. posteam
6. posteam_type
7. defteam

In [22]:
# All null posteam is due to stoppage in play. Will forward fill team from previous record
# nfl_small[nfl_small['posteam'].isnull()].groupby('desc')['desc'].count()

In [23]:
# Same as posteam
# nfl_small[nfl_small['posteam_type'].isnull()].groupby('desc')['desc'].count()

In [24]:
# Same as both above
# nfl_small[nfl_small['defteam'].isnull()].groupby('desc')['desc'].count()

In [25]:
# Forward filling the possession columns
nfl_small['posteam'] = nfl_small['posteam'].fillna(method='pad')
nfl_small['posteam_type'] = nfl_small['posteam_type'].fillna(method='pad')
nfl_small['defteam'] = nfl_small['defteam'].fillna(method='pad')

<h2>Exploring yardline_100 nulls</h2>

In [26]:
# Null yardline_100 appears to match time nulls and are stoppages in play
# nfl_small[nfl_small['yardline_100'].isnull()].head()

In [27]:
# nfl_small[nfl_small['yardline_100'].isnull()]['desc'].unique()

In [28]:
nfl_small['yardline_100'] = nfl_small['yardline_100'].fillna(method='pad')

<h2>Exploring goal_to_go nulls</h2>

In [29]:
# Same as yardline
nfl_small[nfl_small['goal_to_go'].isnull()].head()

Unnamed: 0,index,play_id,game_id,game_date,time,quarter_seconds_remaining,half_seconds_remaining,game_seconds_remaining,game_half,quarter_end,qtr,home_team,away_team,posteam,posteam_type,defteam,side_of_field,yardline_100,drive,sp,down,goal_to_go,yrdln,ydstogo,ydsnet,desc,play_type,yards_gained,pass_length,air_yards,yards_after_catch,field_goal_result,kick_distance,extra_point_result,two_point_conv_result,td_team,total_home_score,total_away_score,posteam_score,defteam_score,sack,touchdown,pass_touchdown,rush_touchdown,return_touchdown,extra_point_attempt,two_point_attempt,field_goal_attempt,fumble,complete_pass
32,270441,836,2015091000,2015-09-10,00:00,0.0,900.0,2700.0,Half1,1,1,NE,PIT,NE,home,PIT,NE,81.0,4,0,,,NE 19,0,18,END QUARTER 1,,0.0,0,0.0,0.0,,,,,,0,0,0.0,0.0,,,,,,,,,,
65,270474,1592,2015091000,2015-09-10,02:00,120.0,120.0,1920.0,Half1,0,2,NE,PIT,PIT,away,NE,PIT,79.0,7,0,,,PIT 21,0,41,Two-Minute Warning,,0.0,0,0.0,0.0,,,,,,14,0,0.0,0.0,,,,,,,,,,
80,270489,1910,2015091000,2015-09-10,00:00,0.0,0.0,1800.0,Half1,1,2,NE,PIT,NE,home,PIT,NE,80.0,8,0,,,NE 20,0,-2,END QUARTER 2,,0.0,0,0.0,0.0,,,,,,14,3,0.0,0.0,,,,,,,,,,
116,270525,2693,2015091000,2015-09-10,00:00,0.0,900.0,900.0,Half2,1,3,NE,PIT,PIT,away,NE,NE,14.0,12,0,,,NE 14,0,65,END QUARTER 3,,0.0,0,0.0,0.0,,,,,,21,11,0.0,0.0,,,,,,,,,,
157,270566,3745,2015091000,2015-09-10,02:00,120.0,120.0,120.0,Half2,0,4,NE,PIT,PIT,away,NE,PIT,61.0,18,0,,,PIT 39,0,16,Two-Minute Warning,,0.0,0,0.0,0.0,,,,,,28,14,0.0,0.0,,,,,,,,,,


In [36]:
nfl_small[nfl_small['goal_to_go'].isnull()]['play_id'].count()

2864

In [37]:
# All play types are NAN
nfl_small[nfl_small['goal_to_go'].isnull()].groupby('play_type')['play_type'].count()

Series([], Name: play_type, dtype: int64)

In [38]:
# goal_to_go is an indicator
nfl_small.groupby('goal_to_go')['goal_to_go'].count()

goal_to_go
0.0    80170
1.0     4236
Name: goal_to_go, dtype: int64

In [40]:
null_goal_to_go_drop = list(nfl_small[nfl_small['goal_to_go'].isnull()].index)
len(null_goal_to_go_drop)

2864

In [43]:
# nfl_small.drop(nfl_small.index[null_goal_to_go_drop], inplace=True)

<h2>Exploring play_type nulls</h2>

In [44]:
# All are stoppages of play. Will exclude
# nfl_small[nfl_small['play_type'].isnull()].head()
# All play types are NAN
# nfl_small[nfl_small['play_type'].isnull()].groupby('desc')['desc'].count()
nfl_small.shape

(87270, 50)

In [45]:
nfl_small = nfl_small[nfl_small['play_type'].notna()]
nfl_small.shape

(84371, 50)

<h2>Exploring pass_length nulls</h2>

In [46]:
# Majority are kicks, stoppages of play (will be excluded) or runs
# nfl_small[nfl_small['pass_length'].isnull()].head()
nfl_small[nfl_small['pass_length'].isnull()].groupby('play_type')['play_type'].count()

Series([], Name: play_type, dtype: int64)

In [48]:
# Exploring null pass_length for pass plays, which appear to be sacks
nfl_small[(nfl_small['pass_length'].isnull()) & (nfl_small['play_type'] == 'pass')].head()

Unnamed: 0,index,play_id,game_id,game_date,time,quarter_seconds_remaining,half_seconds_remaining,game_seconds_remaining,game_half,quarter_end,qtr,home_team,away_team,posteam,posteam_type,defteam,side_of_field,yardline_100,drive,sp,down,goal_to_go,yrdln,ydstogo,ydsnet,desc,play_type,yards_gained,pass_length,air_yards,yards_after_catch,field_goal_result,kick_distance,extra_point_result,two_point_conv_result,td_team,total_home_score,total_away_score,posteam_score,defteam_score,sack,touchdown,pass_touchdown,rush_touchdown,return_touchdown,extra_point_attempt,two_point_attempt,field_goal_attempt,fumble,complete_pass


In [49]:
# All null pass_length plays for play type = pass are sacks. 
# Will fill all nulls with zero and need to use play_type to evaluate.
regex = 'sacked'
series = nfl_small[(nfl_small['pass_length'].isnull()) & (nfl_small['play_type'] == 'pass')]['desc']
series.count(), series.str.contains(regex).count()

(0, 0)

In [50]:
nfl_small['pass_length'] = nfl_small['pass_length'].fillna(0)

<h2>Exploring air_yards nulls</h2>

In [51]:
# Same distribution so will fill with zero
nfl_small[nfl_small['air_yards'].isnull()].groupby('play_type')['play_type'].count()

Series([], Name: play_type, dtype: int64)

In [52]:
nfl_small['air_yards'] = nfl_small['air_yards'].fillna(0)

<h2>Exploring yards_after_catch nulls</h2>

In [53]:
# A lot more nulls for pass plays, so will investigate non-sacks
nfl_small[nfl_small['yards_after_catch'].isnull()].groupby('play_type')['play_type'].count()

Series([], Name: play_type, dtype: int64)

In [54]:
regex = 'sacked'
series = nfl_small[(nfl_small['yards_after_catch'].isnull()) & (nfl_small['play_type'] == 'pass')]['desc']
series.count(), series.str.contains(regex).count()

(0, 0)

In [55]:
# Look like incomplete passes
nfl_small[(nfl_small['yards_after_catch'].isnull()) & (nfl_small['play_type'] == 'pass')\
         & (~nfl_small['desc'].str.contains(regex))
         ].head()

Unnamed: 0,index,play_id,game_id,game_date,time,quarter_seconds_remaining,half_seconds_remaining,game_seconds_remaining,game_half,quarter_end,qtr,home_team,away_team,posteam,posteam_type,defteam,side_of_field,yardline_100,drive,sp,down,goal_to_go,yrdln,ydstogo,ydsnet,desc,play_type,yards_gained,pass_length,air_yards,yards_after_catch,field_goal_result,kick_distance,extra_point_result,two_point_conv_result,td_team,total_home_score,total_away_score,posteam_score,defteam_score,sack,touchdown,pass_touchdown,rush_touchdown,return_touchdown,extra_point_attempt,two_point_attempt,field_goal_attempt,fumble,complete_pass


In [56]:
nfl_small[(nfl_small['yards_after_catch'].isnull()) & (nfl_small['play_type'] == 'pass')\
         & (~nfl_small['desc'].str.contains(regex)) & (nfl_small['desc'].str.contains('incomplete'))]['play_id'].count()

0

In [57]:
nfl_small['yards_after_catch'] = nfl_small['yards_after_catch'].fillna(0)

<h2>Exploring posteam_score nulls</h2>


In [58]:
nfl_small.head(5)

Unnamed: 0,index,play_id,game_id,game_date,time,quarter_seconds_remaining,half_seconds_remaining,game_seconds_remaining,game_half,quarter_end,qtr,home_team,away_team,posteam,posteam_type,defteam,side_of_field,yardline_100,drive,sp,down,goal_to_go,yrdln,ydstogo,ydsnet,desc,play_type,yards_gained,pass_length,air_yards,yards_after_catch,field_goal_result,kick_distance,extra_point_result,two_point_conv_result,td_team,total_home_score,total_away_score,posteam_score,defteam_score,sack,touchdown,pass_touchdown,rush_touchdown,return_touchdown,extra_point_attempt,two_point_attempt,field_goal_attempt,fumble,complete_pass
0,362094,44,2017090700,2017-09-07,15:00,900.0,1800.0,3600.0,Half1,0,1,NE,KC,NE,home,KC,KC,35.0,1,0,,0.0,KC 35,0,73,C.Santos kicks 64 yards from KC 35 to NE 1. D....,kickoff,0.0,0,0.0,0.0,,64.0,,,,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,362095,68,2017090700,2017-09-07,14:55,895.0,1795.0,3595.0,Half1,0,1,NE,KC,NE,home,KC,NE,73.0,1,0,1.0,0.0,NE 27,10,0,(14:55) NE 12-Brady 18th season as Patriots QB...,pass,0.0,deep,27.0,0.0,,,,,,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,362096,94,2017090700,2017-09-07,14:49,889.0,1789.0,3589.0,Half1,0,1,NE,KC,NE,home,KC,NE,73.0,1,0,2.0,0.0,NE 27,10,8,(14:49) T.Brady pass short right to R.Burkhead...,pass,8.0,short,1.0,7.0,,,,,,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,362097,118,2017090700,2017-09-07,14:14,854.0,1754.0,3554.0,Half1,0,1,NE,KC,NE,home,KC,NE,65.0,1,0,3.0,0.0,NE 35,2,73,(14:14) (Shotgun) J.White left guard to NE 43 ...,run,8.0,0,0.0,0.0,,,,,,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,362098,139,2017090700,2017-09-07,13:52,832.0,1732.0,3532.0,Half1,0,1,NE,KC,NE,home,KC,NE,57.0,1,0,1.0,0.0,NE 43,10,19,"(13:52) (No Huddle, Shotgun) J.White up the mi...",run,3.0,0,0.0,0.0,,,,,,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [30]:
# All are beginning of game, so will fill with zero
nfl_small[nfl_small['posteam_score'].isnull()].groupby(['play_type','qtr','game_seconds_remaining'])['play_type'].count()

Series([], Name: play_type, dtype: int64)

In [31]:
nfl_small['posteam_score'] = nfl_small['posteam_score'].fillna(0)

<h2>Exploring defteam_score nulls</h2>

In [32]:
# All are beginning of game, so will fill with zero
nfl_small[nfl_small['defteam_score'].isnull()].groupby(['play_type','qtr','game_seconds_remaining'])['play_type'].count()

Series([], Name: play_type, dtype: int64)

In [62]:
nfl_small['defteam_score'] = nfl_small['defteam_score'].fillna(0)

In [63]:
null_count_func(nfl_small)

Column index has 84371 real values, and 0 null values
Column play_id has 84371 real values, and 0 null values
Column game_id has 84371 real values, and 0 null values
Column game_date has 84371 real values, and 0 null values
Column time has 84371 real values, and 0 null values
Column quarter_seconds_remaining has 84371 real values, and 0 null values
Column half_seconds_remaining has 84371 real values, and 0 null values
Column game_seconds_remaining has 84371 real values, and 0 null values
Column game_half has 84371 real values, and 0 null values
Column quarter_end has 84371 real values, and 0 null values
Column qtr has 84371 real values, and 0 null values
Column home_team has 84371 real values, and 0 null values
Column away_team has 84371 real values, and 0 null values
Column posteam has 84371 real values, and 0 null values
Column posteam_type has 84371 real values, and 0 null values
Column defteam has 84371 real values, and 0 null values
Column side_of_field has 84282 real values, and 

<h2>Exploring down nulls - Removing null downs since vast majoirty are Kickoffs and PAT plays</h2>

In [64]:
#Mostly kickoffs and timeouts which will be out of scope for our analysis. Removing
# nfl_small[nfl_small['down'].isnull()].groupby('desc')['desc'].count()
null_down_drop = list(nfl_small[nfl_small['down'].isnull()].index)
len(null_down_drop)

11006

In [65]:
nfl_small[nfl_small['down'].isnull()].groupby('play_type')['play_type'].count()

play_type
extra_point    2098
kickoff        4941
no_play        3752
pass            139
qb_kneel         28
run              48
Name: play_type, dtype: int64

In [66]:
nfl_small.drop(nfl_small.index[null_down_drop], inplace=True)

IndexError: index 84373 is out of bounds for axis 0 with size 84371

<h2> Adding Time: We must add a column that is directly or approximates time of day, so we are able to join with our weather data</h2>

In [67]:
# Need team names to create time DF
# list(nfl_small['home_team'].unique())

In [33]:
def game_schedule(year):
    
    import requests
    import urllib.request
    import time
    import pandas as pd
    
    url = 'https://www.pro-football-reference.com/years/' + year + '/games.htm'
    
    df = pd.read_html(url)[0]
    
    # Renaming column that determine location of game
    df = df.rename(columns={"Unnamed: 5": "location"})
    
    df['year'] = year
    
    df['home_team'] = df.apply(lambda x : x['Loser/tie'] if x['location'] == '@' else x['Winner/tie'],axis=1)
    df['away_team'] = df.apply(lambda x : x['Winner/tie'] if x['location'] == '@' else x['Loser/tie'],axis=1)
    
    team_mapping = {
    'Arizona Cardinals':'ARI',
    'Atlanta Falcons':'ATL',
    'Baltimore Ravens':'BAL',
    'Buffalo Bills':'BUF',
    'Carolina Panthers':'CAR',
    'Chicago Bears':'CHI',
    'Cincinnati Bengals':'CIN',
    'Cleveland Browns':'CLE',
    'Dallas Cowboys':'DAL',
    'Denver Broncos':'DEN',
    'Detroit Lions':'DET',
    'Green Bay Packers':'GB',
    'Houston Texans':'HOU',
    'Indianapolis Colts':'IND',
    'Jacksonville Jaguars':'JAX',
    'Kansas City Chiefs':'KC',
    'Los Angeles Rams':'LA',
    'Los Angeles Chargers':'LAC',
    'Miami Dolphins':'MIA',
    'Minnesota Vikings':'MIN',
    'New England Patriots':'NE',
    'New Orleans Saints':'NO',
    'New York Giants':'NYG',
    'New York Jets':'NYJ',
    'Oakland Raiders':'OAK',
    'Philadelphia Eagles':'PHI',
    'Pittsburgh Steelers':'PIT',
    'Seattle Seahawks':'SEA',
    'San Francisco 49ers':'SF',
    'Tampa Bay Buccaneers':'TB',
    'Tennessee Titans':'TEN',
    'Washington Redskins':'WAS'
    }
    
    # Replacing team names to be consistent with play-by-play data
    df = df.replace({'home_team':team_mapping}).replace({'away_team':team_mapping})
    
    # Removing date headers
    df = df[~df['Date'].isin(['Date','Playoffs'])]
    
    # Casting date object as date
    df['Date'] = pd.to_datetime(df['Date'] + ', ' + year)
    
#     df['Date_time'] = df['Date'] + ', 2017 ' + df['Time']
#     df['Datetime'] = pd.to_datetime(df['Date_time'])
    
    # Casting Time object as time
    df['Start_Time'] = pd.to_datetime(df['Time']).dt.time
    
    return df[['Date','Start_Time','home_team','away_team']]

In [35]:
sch_15 = game_schedule('2015')
sch_16 = game_schedule('2016')
sch_17 = game_schedule('2017')
sch_18 = game_schedule('2018')
sched = sch_15.append([sch_16, sch_17, sch_18])

In [70]:
schd_17.dtypes

Date          datetime64[ns]
Start_Time            object
home_team             object
away_team             object
dtype: object

In [36]:
# Adding join values to add time to kickoff
sched['qtr'] = 1
sched['play_type'] = 'kickoff'
sched['time'] = '15:00'
sched.head(5)

Unnamed: 0,Date,Start_Time,home_team,away_team,qtr,play_type,time
0,2015-09-10,20:40:00,NE,PIT,1,kickoff,15:00
1,2015-09-13,13:00:00,St. Louis Rams,SEA,1,kickoff,15:00
2,2015-09-13,13:00:00,CHI,GB,1,kickoff,15:00
3,2015-09-13,13:02:00,WAS,MIA,1,kickoff,15:00
4,2015-09-13,13:02:00,HOU,KC,1,kickoff,15:00


In [84]:
schd_17.dtypes

Date          datetime64[ns]
Start_Time            object
home_team             object
away_team             object
qtr                    int64
play_type             object
time                  object
dtype: object

In [37]:
# Merging schedule data to get start of game
nfl_small2 = nfl_small.merge(sched, how = 'left', left_on = ['game_date','home_team','away_team','qtr','play_type','time'],\
                                        right_on = ['Date','home_team','away_team','qtr','play_type','time'])

In [38]:
nfl_small2.drop(columns=['Date'],axis=1,inplace=True)

In [39]:
nfl_small2.dtypes

index                                 int64
play_id                               int64
game_id                               int64
game_date                    datetime64[ns]
time                                 object
quarter_seconds_remaining           float64
half_seconds_remaining              float64
game_seconds_remaining              float64
game_half                            object
quarter_end                           int64
qtr                                   int64
home_team                            object
away_team                            object
posteam                              object
posteam_type                         object
defteam                              object
side_of_field                        object
yardline_100                        float64
drive                                 int64
sp                                    int64
down                                float64
goal_to_go                          float64
yrdln                           

In [88]:
# If records are dropped above, time fill will not work
nfl_small2['Start_Time']

0        20:30:00
1             NaN
2             NaN
3             NaN
4             NaN
           ...   
84366         NaN
84367         NaN
84368         NaN
84369         NaN
84370         NaN
Name: Start_Time, Length: 84371, dtype: object

In [40]:
# Filling start time for all game rows to add cummulative seconds
nfl_small2['Start_Time'] = nfl_small2['Start_Time'].fillna(method='pad')
nfl_small2['Start_Time'] = nfl_small2.apply(lambda x :\
                                        datetime.combine(x['game_date'],x['Start_Time']),axis=1)

In [41]:
# Adding time difference from each play. Assuming a 3X increase in actual time
nfl_small2['diff'] = nfl_small2.groupby(['game_id'])['game_seconds_remaining'].diff().fillna(0)
nfl_small2['diff'] = nfl_small2['diff']*-3

In [42]:
# Updating start of overtime to have diff equal to zero to account for unexpected time remaining
nfl_small2.loc[(nfl_small2['diff'] < 0)\
               & (nfl_small2['qtr'] == 5)\
               & (nfl_small2['game_seconds_remaining'] == 600),'diff'] = 0

In [43]:
nfl_small2[(nfl_small2['diff'] < 0) & (nfl_small2['qtr'] == 5) & (nfl_small2['game_seconds_remaining'] == 600)]

Unnamed: 0,index,play_id,game_id,game_date,time,quarter_seconds_remaining,half_seconds_remaining,game_seconds_remaining,game_half,quarter_end,qtr,home_team,away_team,posteam,posteam_type,defteam,side_of_field,yardline_100,drive,sp,down,goal_to_go,yrdln,ydstogo,ydsnet,desc,play_type,yards_gained,pass_length,air_yards,yards_after_catch,field_goal_result,kick_distance,extra_point_result,two_point_conv_result,td_team,total_home_score,total_away_score,posteam_score,defteam_score,sack,touchdown,pass_touchdown,rush_touchdown,return_touchdown,extra_point_attempt,two_point_attempt,field_goal_attempt,fumble,complete_pass,Start_Time,diff


In [44]:
# Summing up all time diff
nfl_small2['diff'] = nfl_small2.groupby(['game_id'])['diff'].cumsum()

In [45]:
# Sorting data by game_id and play_id removes negative diff, but we still need to see how overtime looks
nfl_small2[nfl_small2['diff'] < 0].head()

Unnamed: 0,index,play_id,game_id,game_date,time,quarter_seconds_remaining,half_seconds_remaining,game_seconds_remaining,game_half,quarter_end,qtr,home_team,away_team,posteam,posteam_type,defteam,side_of_field,yardline_100,drive,sp,down,goal_to_go,yrdln,ydstogo,ydsnet,desc,play_type,yards_gained,pass_length,air_yards,yards_after_catch,field_goal_result,kick_distance,extra_point_result,two_point_conv_result,td_team,total_home_score,total_away_score,posteam_score,defteam_score,sack,touchdown,pass_touchdown,rush_touchdown,return_touchdown,extra_point_attempt,two_point_attempt,field_goal_attempt,fumble,complete_pass,Start_Time,diff


In [46]:
# Convert time diff to timedelta to allow for adding to current time
from datetime import timedelta
nfl_small2['diff'] = nfl_small2.apply(lambda x : timedelta(seconds=x['diff']),axis=1)

In [47]:
nfl_small2[['Start_Time','diff']].dtypes

Start_Time     datetime64[ns]
diff          timedelta64[ns]
dtype: object

In [48]:
nfl_small2['Estimated_Time'] = nfl_small2['Start_Time'] + nfl_small2['diff']

In [49]:
# Exploring overtime diff, finding overtime records
nfl_small2[nfl_small2['qtr']==5].head(3)

Unnamed: 0,index,play_id,game_id,game_date,time,quarter_seconds_remaining,half_seconds_remaining,game_seconds_remaining,game_half,quarter_end,qtr,home_team,away_team,posteam,posteam_type,defteam,side_of_field,yardline_100,drive,sp,down,goal_to_go,yrdln,ydstogo,ydsnet,desc,play_type,yards_gained,pass_length,air_yards,yards_after_catch,field_goal_result,kick_distance,extra_point_result,two_point_conv_result,td_team,total_home_score,total_away_score,posteam_score,defteam_score,sack,touchdown,pass_touchdown,rush_touchdown,return_touchdown,extra_point_attempt,two_point_attempt,field_goal_attempt,fumble,complete_pass,Start_Time,diff,Estimated_Time
526,271815,3967,2015091301,2015-09-13,15:00,900.0,900.0,900.0,Overtime,0,5,STL,SEA,STL,home,SEA,SEA,35.0,21,0,,0.0,SEA 35,0,0,S.Hauschka kicks onside 14 yards from SEA 35 t...,kickoff,0.0,0,0.0,0.0,,14.0,,,,30,29,30.0,30.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2015-09-13 13:00:00,02:15:00,2015-09-13 15:15:00
527,271816,4000,2015091301,2015-09-13,15:00,900.0,900.0,900.0,Overtime,0,5,STL,SEA,STL,home,SEA,SEA,49.0,21,0,1.0,0.0,SEA 49,10,2,(15:00) B.Cunningham up the middle to SEA 47 f...,run,2.0,0,0.0,0.0,,,,,,30,29,30.0,30.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2015-09-13 13:00:00,02:15:00,2015-09-13 15:15:00
528,271817,4030,2015091301,2015-09-13,14:27,867.0,867.0,867.0,Overtime,0,5,STL,SEA,STL,home,SEA,SEA,47.0,21,0,2.0,0.0,SEA 47,8,24,(14:27) (Shotgun) N.Foles pass deep right to S...,pass,22.0,deep,20.0,2.0,,,,,,30,29,30.0,30.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2015-09-13 13:00:00,02:16:39,2015-09-13 15:16:39


In [99]:
# Exploring overtime diff, exploring one game
nfl_small2[(nfl_small2['game_id']==2017091702) & (nfl_small2['play_id'].between(4100,4400))].head(5)

Unnamed: 0,index,play_id,game_id,game_date,time,quarter_seconds_remaining,half_seconds_remaining,game_seconds_remaining,game_half,quarter_end,qtr,home_team,away_team,posteam,posteam_type,defteam,side_of_field,yardline_100,drive,sp,down,goal_to_go,yrdln,ydstogo,ydsnet,desc,play_type,yards_gained,pass_length,air_yards,yards_after_catch,field_goal_result,kick_distance,extra_point_result,two_point_conv_result,td_team,total_home_score,total_away_score,posteam_score,defteam_score,sack,touchdown,pass_touchdown,rush_touchdown,return_touchdown,extra_point_attempt,two_point_attempt,field_goal_attempt,fumble,complete_pass,Start_Time,diff,Estimated_Time
3181,366113,4128,2017091702,2017-09-17,00:17,17.0,17.0,17.0,Half2,0,4,IND,ARI,ARI,away,IND,IND,31.0,22,0,1.0,0.0,IND 31,10,23,(:17) (Run formation) C.Palmer left guard to I...,run,3.0,0,0.0,0.0,,,,,,13,13,13.0,13.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2017-09-17 13:00:00,02:59:09,2017-09-17 15:59:09
3182,366114,4149,2017091702,2017-09-17,00:12,12.0,12.0,12.0,Half2,0,4,IND,ARI,ARI,away,IND,IND,31.0,22,0,,0.0,IND 31,0,23,Timeout #2 by IND at 00:12.,no_play,0.0,0,0.0,0.0,,,,,,13,13,13.0,13.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2017-09-17 13:00:00,02:59:24,2017-09-17 15:59:24
3183,366115,4166,2017091702,2017-09-17,00:12,12.0,12.0,12.0,Half2,0,4,IND,ARI,ARI,away,IND,IND,28.0,22,0,2.0,0.0,IND 28,7,27,(:12) (Run formation) C.Palmer kneels to IND 2...,no_play,0.0,0,0.0,0.0,,,,,,13,13,13.0,13.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2017-09-17 13:00:00,02:59:24,2017-09-17 15:59:24
3184,366116,4209,2017091702,2017-09-17,00:09,9.0,9.0,9.0,Half2,0,4,IND,ARI,ARI,away,IND,IND,23.0,22,0,2.0,0.0,IND 23,2,27,(:09) C.Palmer kneels to IND 24 for -1 yards.,qb_kneel,-1.0,0,0.0,0.0,,,,,,13,13,13.0,13.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2017-09-17 13:00:00,02:59:33,2017-09-17 15:59:33
3185,366117,4230,2017091702,2017-09-17,00:03,3.0,3.0,3.0,Half2,0,4,IND,ARI,ARI,away,IND,IND,23.0,22,0,,0.0,IND 23,0,27,Timeout #3 by ARI at 00:03.,no_play,0.0,0,0.0,0.0,,,,,,13,13,13.0,13.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2017-09-17 13:00:00,02:59:51,2017-09-17 15:59:51


In [50]:
# Columns for dropping null records  ***Cannot drop these rows yet. It breaks our time cleaning below***
nfl_small = nfl_small[nfl_small['down'].notna()]
nfl_small.shape

(150943, 50)

In [51]:
# Columns for dropping null records  ***Cannot drop these rows yet. It breaks our time cleaning below***
nfl_small = nfl_small[nfl_small['goal_to_go'].notna()]
nfl_small.shape

(150724, 50)

In [52]:
# Columns for dropping null records  ***Cannot drop these rows yet. It breaks our time cleaning below***
nfl_small = nfl_small[nfl_small['play_type'].notna()]
nfl_small.shape

(150720, 50)

In [53]:
# Saving nfl_small to csv to skip loading full file
nfl_small2.to_csv('nfl_small_cleaned.csv')