In [1]:
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
from datetime import datetime
from datetime import time

In [2]:
# Setting pandas to display columns
pd.set_option('display.max_columns', None)

In [None]:
# Setting pandas to display rows
pd.set_option('display.max_rows', None)

In [8]:
# Pulling in NFL Play-by-Play Data
# nfl_data_df = pd.read_csv('NFL Play by Play 2009-2018 (v5).csv')

In [214]:
# Looking at size of dataframe
nfl_data_df.shape

(449371, 255)

In [7]:
# Checking out data types
# nfl_data_df.dtypes

In [6]:
# nfl_data_df.head()

In [5]:
# Getting a list of columns in the dataset to allow me to trim down to most important
# list(nfl_data_df.columns)

In [4]:
# Getting smaller set of columns to use for analysis
# cols = ['play_id','game_id','game_date','time','quarter_seconds_remaining','half_seconds_remaining',
#  'game_seconds_remaining','game_half','quarter_end','qtr','home_team','away_team','posteam','posteam_type',
#  'defteam','side_of_field','yardline_100','drive','sp','down','goal_to_go','yrdln','ydstogo','ydsnet',
#  'desc','play_type','yards_gained','pass_length','air_yards','yards_after_catch','field_goal_result',
#  'kick_distance','extra_point_result','two_point_conv_result','td_team','total_home_score','total_away_score',
#  'posteam_score','defteam_score','sack','touchdown','pass_touchdown','rush_touchdown','return_touchdown',
#  'extra_point_attempt','two_point_attempt','field_goal_attempt','fumble','complete_pass']

In [3]:
# Creating dataframe with most important columns and only 2017 and 2018
# nfl_small = nfl_data_df[nfl_data_df['game_date']>='2017-04-01'][cols].reset_index() #Keeping original index column unless needed later
# nfl_small.head(3)

In [220]:
# Saving nfl_small to csv to skip loading full file
# nfl_small.to_csv('nfl_small.csv')

In [9]:
nfl_small = pd.read_csv('nfl_small.csv')
nfl_small.shape

(87277, 51)

In [10]:
# Dropping unused index column and sorting to allow for future calculations
nfl_small = nfl_small.drop(columns=['Unnamed: 0']).sort_values(by=['game_id','play_id'])

In [12]:
nfl_small.head(5)

Unnamed: 0,index,play_id,game_id,game_date,time,quarter_seconds_remaining,half_seconds_remaining,game_seconds_remaining,game_half,quarter_end,qtr,home_team,away_team,posteam,posteam_type,defteam,side_of_field,yardline_100,drive,sp,down,goal_to_go,yrdln,ydstogo,ydsnet,desc,play_type,yards_gained,pass_length,air_yards,yards_after_catch,field_goal_result,kick_distance,extra_point_result,two_point_conv_result,td_team,total_home_score,total_away_score,posteam_score,defteam_score,sack,touchdown,pass_touchdown,rush_touchdown,return_touchdown,extra_point_attempt,two_point_attempt,field_goal_attempt,fumble,complete_pass
0,362094,44,2017090700,2017-09-07,15:00,900.0,1800.0,3600.0,Half1,0,1,NE,KC,NE,home,KC,KC,35.0,1,0,,0.0,KC 35,0,73,C.Santos kicks 64 yards from KC 35 to NE 1. D....,kickoff,0.0,,,,,64.0,,,,0,0,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,362095,68,2017090700,2017-09-07,14:55,895.0,1795.0,3595.0,Half1,0,1,NE,KC,NE,home,KC,NE,73.0,1,0,1.0,0.0,NE 27,10,0,(14:55) NE 12-Brady 18th season as Patriots QB...,pass,0.0,deep,27.0,,,,,,,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,362096,94,2017090700,2017-09-07,14:49,889.0,1789.0,3589.0,Half1,0,1,NE,KC,NE,home,KC,NE,73.0,1,0,2.0,0.0,NE 27,10,8,(14:49) T.Brady pass short right to R.Burkhead...,pass,8.0,short,1.0,7.0,,,,,,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,362097,118,2017090700,2017-09-07,14:14,854.0,1754.0,3554.0,Half1,0,1,NE,KC,NE,home,KC,NE,65.0,1,0,3.0,0.0,NE 35,2,73,(14:14) (Shotgun) J.White left guard to NE 43 ...,run,8.0,,,,,,,,,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,362098,139,2017090700,2017-09-07,13:52,832.0,1732.0,3532.0,Half1,0,1,NE,KC,NE,home,KC,NE,57.0,1,0,1.0,0.0,NE 43,10,19,"(13:52) (No Huddle, Shotgun) J.White up the mi...",run,3.0,,,,,,,,,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


<h2>Data Cleaning</h2>

In [13]:
# Changing date object to date format
nfl_small['game_date'] = pd.to_datetime(nfl_small['game_date'])
nfl_small.head(2)

Unnamed: 0,index,play_id,game_id,game_date,time,quarter_seconds_remaining,half_seconds_remaining,game_seconds_remaining,game_half,quarter_end,qtr,home_team,away_team,posteam,posteam_type,defteam,side_of_field,yardline_100,drive,sp,down,goal_to_go,yrdln,ydstogo,ydsnet,desc,play_type,yards_gained,pass_length,air_yards,yards_after_catch,field_goal_result,kick_distance,extra_point_result,two_point_conv_result,td_team,total_home_score,total_away_score,posteam_score,defteam_score,sack,touchdown,pass_touchdown,rush_touchdown,return_touchdown,extra_point_attempt,two_point_attempt,field_goal_attempt,fumble,complete_pass
0,362094,44,2017090700,2017-09-07,15:00,900.0,1800.0,3600.0,Half1,0,1,NE,KC,NE,home,KC,KC,35.0,1,0,,0.0,KC 35,0,73,C.Santos kicks 64 yards from KC 35 to NE 1. D....,kickoff,0.0,,,,,64.0,,,,0,0,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,362095,68,2017090700,2017-09-07,14:55,895.0,1795.0,3595.0,Half1,0,1,NE,KC,NE,home,KC,NE,73.0,1,0,1.0,0.0,NE 27,10,0,(14:55) NE 12-Brady 18th season as Patriots QB...,pass,0.0,deep,27.0,,,,,,,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [14]:
# Inspecting datatypes
nfl_small.dtypes

index                                 int64
play_id                               int64
game_id                               int64
game_date                    datetime64[ns]
time                                 object
quarter_seconds_remaining           float64
half_seconds_remaining              float64
game_seconds_remaining              float64
game_half                            object
quarter_end                           int64
qtr                                   int64
home_team                            object
away_team                            object
posteam                              object
posteam_type                         object
defteam                              object
side_of_field                        object
yardline_100                        float64
drive                                 int64
sp                                    int64
down                                float64
goal_to_go                          float64
yrdln                           

In [15]:
# Creating a function that examines missing values in a dataframe easily. This can re-used later.
def null_count_func(dataframe):
    columns = list(dataframe.columns)

    for column in columns:
        total = dataframe[column].count()
        null_cnt = dataframe[column].isnull().sum()
        print('Column {} has {} real values, and {} null values'.format(column,total,null_cnt))

In [26]:
null_count_func(nfl_small)

Column index has 87277 real values, and 0 null values
Column play_id has 87277 real values, and 0 null values
Column game_id has 87277 real values, and 0 null values
Column game_date has 87277 real values, and 0 null values
Column time has 87277 real values, and 0 null values
Column quarter_seconds_remaining has 87277 real values, and 0 null values
Column half_seconds_remaining has 87277 real values, and 0 null values
Column game_seconds_remaining has 87277 real values, and 0 null values
Column game_half has 87277 real values, and 0 null values
Column quarter_end has 87277 real values, and 0 null values
Column qtr has 87277 real values, and 0 null values
Column home_team has 87277 real values, and 0 null values
Column away_team has 87277 real values, and 0 null values
Column posteam has 84286 real values, and 2991 null values
Column posteam_type has 84375 real values, and 2902 null values
Column defteam has 84375 real values, and 2902 null values
Column side_of_field has 87132 real val

From the null check function, we see quite a few columns with nulls, but I am unsure if that is on purpose or just data gaps.

I plan to explore:   
1. time - Done
2. quarter_seconds_remaining - Done
3. half_seconds_remaining - Done
4. game_seconds_remaining - Done
5. posteam
6. posteam_type
7. defteam
8. yardline_100
9. down
10. goal_to_go
11. play_type
12. pass_length (ensure it's de-duped from runs)
13. air_yards (ensure it's de-duped from runs)
14. yards_after_catch (ensure it's de-duped from runs)
15. posteam_score
16. defteam_score

In [24]:
# Null time also affects other time columns so will fill all
nfl_small[nfl_small['time'].isnull()]

Unnamed: 0,index,play_id,game_id,game_date,time,quarter_seconds_remaining,half_seconds_remaining,game_seconds_remaining,game_half,quarter_end,qtr,home_team,away_team,posteam,posteam_type,defteam,side_of_field,yardline_100,drive,sp,down,goal_to_go,yrdln,ydstogo,ydsnet,desc,play_type,yards_gained,pass_length,air_yards,yards_after_catch,field_goal_result,kick_distance,extra_point_result,two_point_conv_result,td_team,total_home_score,total_away_score,posteam_score,defteam_score,sack,touchdown,pass_touchdown,rush_touchdown,return_touchdown,extra_point_attempt,two_point_attempt,field_goal_attempt,fumble,complete_pass
243,362337,1090,2017091000,2017-09-10,,,,,Half1,0,2,BUF,NYJ,,,,NYJ,,5,0,,,NYJ 34,0,77,BLANK PLAY,,0.0,,,,,,,,,0,0,,,,,,,,,,,,
2861,364955,2154,2017091700,2017-09-17,,,,,Half1,0,2,BAL,CLE,,,,CLE,,14,0,,,CLE 2,0,40,BLANK PLAY,,0.0,,,,,,,,,20,7,,,,,,,,,,,,
4902,366996,815,2017091710,2017-09-17,,,,,Half1,0,1,DEN,DAL,,,,DEN,,5,0,,,DEN 46,0,15,The game has been suspended. Game halted due t...,,0.0,,,,,,,,,7,0,,,,,,,,,,,,
4903,366997,831,2017091710,2017-09-17,,,,,Half1,0,1,DEN,DAL,,,,,,5,0,,,,0,15,The game has resumed. Game re-started at 3:58 PM,,0.0,,,,,,,,,7,0,,,,,,,,,,,,
6429,368523,2528,2017092401,2017-09-24,,,,,Half1,0,2,BUF,DEN,,,,BUF,,13,0,,,BUF 35,0,0,End of half - 2.34 pm,,0.0,,,,,,,,,13,13,,,,,,,,,,,,
6519,368613,4662,2017092401,2017-09-24,,,,,Half2,0,4,BUF,DEN,,,,BUF,,24,0,,,BUF 40,0,-4,End of game - 4.05 pm,,0.0,,,,,,,,,26,16,,,,,,,,,,,,
8316,370410,1027,2017092800,2017-09-28,,,,,Half1,0,2,GB,CHI,,,,,,8,0,,,,0,11,The game has been suspended. Field cleared tem...,,0.0,,,,,,,,,14,0,,,,,,,,,,,,
8317,370411,1060,2017092800,2017-09-28,,,,,Half1,0,2,GB,CHI,,,,,,8,0,,,,0,11,The game has resumed.,,0.0,,,,,,,,,14,0,,,,,,,,,,,,
11487,373581,3809,2017100801,2017-10-08,,,,,Half2,0,4,CLE,NYJ,,,,NYJ,,20,0,,,NYJ 41,0,86,BLANK PLAY,,0.0,,,,,,,,,13,17,,,,,,,,,,,,
12684,374778,2030,2017100806,2017-10-08,,,,,Half1,0,2,PHI,ARI,,,,ARI,,13,0,,,ARI 33,0,47,BLANK PLAY,,0.0,,,,,,,,,21,7,,,,,,,,,,,,


In [19]:
# Most look like stoppage in play. I will try to backfill from previous time
nfl_small[nfl_small['time'].isnull()].groupby('desc')['desc'].count()

desc
BLANK PLAY                                                                                                                                                                                   5
End of game - 11.29 pm                                                                                                                                                                       1
End of game - 3.59 pm                                                                                                                                                                        2
End of game - 4.00 pm                                                                                                                                                                        1
End of game - 4.04 pm                                                                                                                                                                        1
End of game - 4.05 pm                   

In [111]:
# Exploring blank plays, it looks like we can remove the rows from the dataset
nfl_small[(nfl_small['game_id']==2017091000) & (nfl_small['play_id'].between(1062,1102))]

Unnamed: 0,index,play_id,game_id,game_date,time,quarter_seconds_remaining,half_seconds_remaining,game_seconds_remaining,game_half,quarter_end,qtr,home_team,away_team,posteam,posteam_type,defteam,side_of_field,yardline_100,drive,sp,down,goal_to_go,yrdln,ydstogo,ydsnet,desc,play_type,yards_gained,pass_length,air_yards,yards_after_catch,field_goal_result,kick_distance,extra_point_result,two_point_conv_result,td_team,total_home_score,total_away_score,posteam_score,defteam_score,sack,touchdown,pass_touchdown,rush_touchdown,return_touchdown,extra_point_attempt,two_point_attempt,field_goal_attempt,fumble,complete_pass
242,362336,1062,2017091000,2017-09-10,13:59,839.0,839.0,2639.0,Half1,0,2,BUF,NYJ,BUF,home,NYJ,NYJ,34.0,5,0,2.0,0.0,NYJ 34,10,77,(13:59) (Shotgun) T.Taylor pass short middle t...,pass,12.0,short,4.0,8.0,,,,,,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
243,362337,1090,2017091000,2017-09-10,,,,,Half1,0,2,BUF,NYJ,,,,NYJ,,5,0,,,NYJ 34,0,77,BLANK PLAY,,0.0,,,,,,,,,0,0,,,,,,,,,,,,
244,362338,1102,2017091000,2017-09-10,13:22,802.0,802.0,2602.0,Half1,0,2,BUF,NYJ,BUF,home,NYJ,NYJ,22.0,5,0,1.0,0.0,NYJ 22,10,71,(13:22) M.Tolbert right tackle to NYJ 8 for 14...,run,14.0,,,,,,,,,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [112]:
blank_play_drop = list(nfl_small[nfl_small['desc']=='BLANK PLAY'].index)
blank_play_drop

[243, 2861, 11487, 12684, 26364]

In [113]:
nfl_small.drop(nfl_small.index[blank_play_drop], inplace=True)

In [114]:
list(nfl_small[nfl_small['desc']=='BLANK PLAY'].index)

[11487, 12684]

In [28]:
# Exploring delays, it looks like we will need to account for delay times, but can drop rows after
# nfl_small[(nfl_small['game_id']==2017091710) & (nfl_small['play_id'].between(750,850))]

In [29]:
# Exploring end of half/game, it looks like we should also account for time, but can drop rows after
# nfl_small[(nfl_small['game_id']==2017092401) & (nfl_small['play_id'].between(2500,2600))]

In [25]:
# Forward fill time for weather delays ***Potential Improvement - Incorprate Delay Times***
nfl_small['time'] = nfl_small['time'].fillna(method='pad')
nfl_small['quarter_seconds_remaining'] = nfl_small['quarter_seconds_remaining'].fillna(method='pad')
nfl_small['half_seconds_remaining'] = nfl_small['half_seconds_remaining'].fillna(method='pad')
nfl_small['game_seconds_remaining'] = nfl_small['game_seconds_remaining'].fillna(method='pad')

<h2> Adding Time: We must add a column that is directly or approximates time of day, so we are able to join with our weather data</h2>

In [30]:
# Need team names to create time DF
# list(nfl_small['home_team'].unique())

In [31]:
def game_schedule(year):
    
    import requests
    import urllib.request
    import time
    import pandas as pd
    
    url = 'https://www.pro-football-reference.com/years/' + year + '/games.htm'
    
    df = pd.read_html(url)[0]
    
    # Renaming column that determine location of game
    df = df.rename(columns={"Unnamed: 5": "location"})
    
    df['year'] = year
    
    df['home_team'] = df.apply(lambda x : x['Loser/tie'] if x['location'] == '@' else x['Winner/tie'],axis=1)
    df['away_team'] = df.apply(lambda x : x['Winner/tie'] if x['location'] == '@' else x['Loser/tie'],axis=1)
    
    team_mapping = {
    'Arizona Cardinals':'ARI',
    'Atlanta Falcons':'ATL',
    'Baltimore Ravens':'BAL',
    'Buffalo Bills':'BUF',
    'Carolina Panthers':'CAR',
    'Chicago Bears':'CHI',
    'Cincinnati Bengals':'CIN',
    'Cleveland Browns':'CLE',
    'Dallas Cowboys':'DAL',
    'Denver Broncos':'DEN',
    'Detroit Lions':'DET',
    'Green Bay Packers':'GB',
    'Houston Texans':'HOU',
    'Indianapolis Colts':'IND',
    'Jacksonville Jaguars':'JAX',
    'Kansas City Chiefs':'KC',
    'Los Angeles Rams':'LA',
    'Los Angeles Chargers':'LAC',
    'Miami Dolphins':'MIA',
    'Minnesota Vikings':'MIN',
    'New England Patriots':'NE',
    'New Orleans Saints':'NO',
    'New York Giants':'NYG',
    'New York Jets':'NYJ',
    'Oakland Raiders':'OAK',
    'Philadelphia Eagles':'PHI',
    'Pittsburgh Steelers':'PIT',
    'Seattle Seahawks':'SEA',
    'San Francisco 49ers':'SF',
    'Tampa Bay Buccaneers':'TB',
    'Tennessee Titans':'TEN',
    'Washington Redskins':'WAS'
    }
    
    # Replacing team names to be consistent with play-by-play data
    df = df.replace({'home_team':team_mapping}).replace({'away_team':team_mapping})
    
    # Removing date headers
    df = df[~df['Date'].isin(['Date','Playoffs'])]
    
    # Casting date object as date
    df['Date'] = pd.to_datetime(df['Date'] + ', ' + year)
    
#     df['Date_time'] = df['Date'] + ', 2017 ' + df['Time']
#     df['Datetime'] = pd.to_datetime(df['Date_time'])
    
    # Casting Time object as time
    df['Start_Time'] = pd.to_datetime(df['Time']).dt.time
    
    return df[['Date','Start_Time','home_team','away_team']]

In [32]:
schd_17 = game_schedule('2017')
schd_17.head(4)

Unnamed: 0,Date,Start_Time,home_team,away_team
0,2017-09-07,20:30:00,NE,KC
1,2017-09-10,13:00:00,TEN,OAK
2,2017-09-10,13:00:00,CHI,ATL
3,2017-09-10,13:00:00,BUF,NYJ


In [33]:
schd_17.dtypes

Date          datetime64[ns]
Start_Time            object
home_team             object
away_team             object
dtype: object

In [34]:
# Adding join values to add time to kickoff
schd_17['qtr'] = 1
schd_17['play_type'] = 'kickoff'
schd_17['time'] = '15:00'
schd_17.head(3)

Unnamed: 0,Date,Start_Time,home_team,away_team,qtr,play_type,time
0,2017-09-07,20:30:00,NE,KC,1,kickoff,15:00
1,2017-09-10,13:00:00,TEN,OAK,1,kickoff,15:00
2,2017-09-10,13:00:00,CHI,ATL,1,kickoff,15:00


In [35]:
schd_17.dtypes

Date          datetime64[ns]
Start_Time            object
home_team             object
away_team             object
qtr                    int64
play_type             object
time                  object
dtype: object

In [36]:
# Merging schedule data to get start of game
nfl_small2 = nfl_small.merge(schd_17, how = 'left', left_on = ['game_date','home_team','away_team','qtr','play_type','time'],\
                                        right_on = ['Date','home_team','away_team','qtr','play_type','time'])

In [37]:
nfl_small2.drop(columns=['Date'],axis=1,inplace=True)

In [38]:
# Filling start time for all game rows to add cummulative seconds
nfl_small2['Start_Time'] = nfl_small2['Start_Time'].fillna(method='pad')
nfl_small2['Start_Time'] = nfl_small2.apply(lambda x :\
                                        datetime.combine(x['game_date'],x['Start_Time']),axis=1)

In [39]:
# Adding time difference from each play. Assuming a 3X increase in actual time
nfl_small2['diff'] = nfl_small2.groupby(['game_id'])['game_seconds_remaining'].diff().fillna(0)
nfl_small2['diff'] = nfl_small2['diff']*-3

In [40]:
# Updating start of overtime to have diff equal to zero to account for unexpected time remaining
nfl_small2.loc[(nfl_small2['diff'] < 0)\
               & (nfl_small2['qtr'] == 5)\
               & (nfl_small2['game_seconds_remaining'] == 600),'diff'] = 0

In [41]:
nfl_small2[(nfl_small2['diff'] < 0) & (nfl_small2['qtr'] == 5) & (nfl_small2['game_seconds_remaining'] == 600)]

Unnamed: 0,index,play_id,game_id,game_date,time,quarter_seconds_remaining,half_seconds_remaining,game_seconds_remaining,game_half,quarter_end,qtr,home_team,away_team,posteam,posteam_type,defteam,side_of_field,yardline_100,drive,sp,down,goal_to_go,yrdln,ydstogo,ydsnet,desc,play_type,yards_gained,pass_length,air_yards,yards_after_catch,field_goal_result,kick_distance,extra_point_result,two_point_conv_result,td_team,total_home_score,total_away_score,posteam_score,defteam_score,sack,touchdown,pass_touchdown,rush_touchdown,return_touchdown,extra_point_attempt,two_point_attempt,field_goal_attempt,fumble,complete_pass,Start_Time,diff


In [42]:
# Summing up all time diff
nfl_small2['diff'] = nfl_small2.groupby(['game_id'])['diff'].cumsum()

In [43]:
# Sorting data by game_id and play_id removes negative diff, but we still need to see how overtime looks
nfl_small2[nfl_small2['diff'] < 0].head()

Unnamed: 0,index,play_id,game_id,game_date,time,quarter_seconds_remaining,half_seconds_remaining,game_seconds_remaining,game_half,quarter_end,qtr,home_team,away_team,posteam,posteam_type,defteam,side_of_field,yardline_100,drive,sp,down,goal_to_go,yrdln,ydstogo,ydsnet,desc,play_type,yards_gained,pass_length,air_yards,yards_after_catch,field_goal_result,kick_distance,extra_point_result,two_point_conv_result,td_team,total_home_score,total_away_score,posteam_score,defteam_score,sack,touchdown,pass_touchdown,rush_touchdown,return_touchdown,extra_point_attempt,two_point_attempt,field_goal_attempt,fumble,complete_pass,Start_Time,diff


In [44]:
# Convert time diff to timedelta to allow for adding to current time
from datetime import timedelta
nfl_small2['diff'] = nfl_small2.apply(lambda x : timedelta(seconds=x['diff']),axis=1)

In [45]:
nfl_small2[['Start_Time','diff']].dtypes

Start_Time     datetime64[ns]
diff          timedelta64[ns]
dtype: object

In [46]:
nfl_small2['Estimated_Time'] = nfl_small2['Start_Time'] + nfl_small2['diff']

In [138]:
# Exploring overtime diff, finding overtime records
# nfl_small2[nfl_small2['qtr']==5].head(3)

In [137]:
# Exploring overtime diff, exploring one game
# nfl_small2[(nfl_small2['game_id']==2017091702) & (nfl_small2['play_id'].between(4100,4400))]

<h2>Additional Time Analysis Exploration</h2>

In [79]:
nfl_small[nfl_small['play_type'].isnull()].head(5)

Unnamed: 0.1,Unnamed: 0,index,play_id,game_id,game_date,time,quarter_seconds_remaining,half_seconds_remaining,game_seconds_remaining,game_half,quarter_end,qtr,home_team,away_team,posteam,posteam_type,defteam,side_of_field,yardline_100,drive,sp,down,goal_to_go,yrdln,ydstogo,ydsnet,desc,play_type,yards_gained,pass_length,air_yards,yards_after_catch,field_goal_result,kick_distance,extra_point_result,two_point_conv_result,td_team,total_home_score,total_away_score,posteam_score,defteam_score,sack,touchdown,pass_touchdown,rush_touchdown,return_touchdown,extra_point_attempt,two_point_attempt,field_goal_attempt,fumble,complete_pass,Time
9,9,362103,279,2017090700,2017-09-07,12:21,741.0,1641.0,3441.0,Half1,0,1,NE,KC,,,,KC,,1,0,1.0,,KC 2,2,71,(12:21) N.Solder reported in as eligible. M.G...,,0.0,,,,,,,,,0,0,,,,,,,,,,,,,
45,45,362139,1202,2017090700,2017-09-07,00:00,0.0,900.0,2700.0,Half1,1,1,NE,KC,,,,KC,,5,0,,,KC 8,0,67,END QUARTER 1,,0.0,,,,,,,,,7,7,,,,,,,,,,,,,
80,80,362174,2017,2017090700,2017-09-07,02:00,120.0,120.0,1920.0,Half1,0,2,NE,KC,,,,KC,,10,0,,,KC 11,0,12,Two-Minute Warning,,0.0,,,,,,,,,17,7,,,,,,,,,,,,,
97,97,362191,2398,2017090700,2017-09-07,00:00,0.0,0.0,1800.0,Half1,1,2,NE,KC,,,,NE,,11,0,,,NE 18,0,-1,END QUARTER 2,,0.0,,,,,,,,,17,14,,,,,,,,,,,,,
100,100,362194,2451,2017090700,2017-09-07,14:21,861.0,1761.0,1761.0,Half2,0,3,NE,KC,,,,KC,,12,0,1.0,,KC 35,10,10,(14:21),,0.0,,,,,,,,,17,14,,,,,,,,,,,,,


In [73]:
nfl_small['play_type'].unique()

array(['kickoff', 'pass', 'run', 'no_play', nan, 'extra_point',
       'field_goal', 'punt', 'qb_kneel', 'qb_spike'], dtype=object)

In [77]:
for row in nfl_small[nfl_small['play_type'].isnull()]['desc']:
    print(row)

(12:21) N.Solder reported in as eligible.  M.Gillislee right tackle for 2 yards, TOUCHDOWN NULLIFIED by Penalty. Penalty on NE-N.Solder, Offensive Holding, offsetting, enforced at KC 2 - No Play. Penalty on KC-A.Bailey, Defensive Holding, offsetting.
END QUARTER 1
END QUARTER 2
(14:21)
END QUARTER 3
END GAME
END QUARTER 1 - 1.36 pm
END QUARTER 2 - 2.19 pm
END QUARTER 3 - 3.12 pm
END GAME
END QUARTER 1
END QUARTER 3
END GAME
END QUARTER 1
END GAME
END QUARTER 1
END QUARTER 2
END QUARTER 3
END GAME
END QUARTER 1
END QUARTER 3
END GAME
END QUARTER 1
END QUARTER 2
END QUARTER 3
END GAME
END QUARTER 1
END QUARTER 2
END QUARTER 3
END GAME
END QUARTER 1
END QUARTER 2
END QUARTER 3
END GAME
END QUARTER 1
END QUARTER 2
END QUARTER 3
END GAME
END QUARTER 1
END QUARTER 3
END GAME
END QUARTER 1
(1:03) (Punt formation) Penalty on GB-J.Vogel, Delay of Game, declined.
END QUARTER 3
(5:28) (Shotgun) A.Rodgers pass short left to T.Montgomery to GB 39 for 12 yards (R.Sherman) [S.Richardson]. Penalty on 

END QUARTER 3
END GAME
END QUARTER 1
END QUARTER 2
END QUARTER 3
END GAME
END QUARTER 1
END QUARTER 3
END GAME
END QUARTER 1
END QUARTER 2
END QUARTER 3
(10:58) (Shotgun) T.Brady pass short left to J.White to IND 37 for 11 yards (Z.Franklin; K.Turay). Penalty on NE-M.Cannon, Tripping, offsetting, enforced at IND 48 - No Play. Penalty on IND-L.Pipkins, Unnecessary Roughness, offsetting.
END GAME NE 12-Brady 226th win as starter (including postseason), ties most all-time (Vinatieri).
End of quarter - 1.37 pm
END QUARTER 1
End of quarter - 3.10 pm
END QUARTER 3
END QUARTER 1
END QUARTER 2
END QUARTER 3
END GAME
END QUARTER 1
B.McManus kicks 62 yards from DEN 35 to NYJ 3. A.Roberts to NYJ 18 for 15 yards (D.Booker). Penalty on DEN-D.Booker, Illegal Formation, offsetting, enforced at DEN 35 - No Play. Penalty on NYJ-C.Herndon, Illegal Block Above the Waist, offsetting.
END QUARTER 2
END QUARTER 1
END QUARTER 2
END QUARTER 3
END GAME
END QUARTER 1
(9:13) M.Stafford pass deep right to M.Jones

<h2>Individual Game Exploration</h2>

In [None]:
# Getting data from one game to explore further
ten_pit_game = nfl_small[nfl_small['game_id']==2009091000].sort_values(by='play_id',ascending=True)

In [None]:
ten_pit_game.head()

In [None]:
ten_pit_game.shape

In [None]:
# It looks like games don't have consecutive play_ids
nfl_small[(nfl_small['play_id']==47) & (nfl_small['game_id'] == 2009091000)]

In [None]:
# Creating end of drive column to easily sum data

for i in range(0, len(ten_pit_game)-1):
    ten_pit_game.loc[i,'end_of_drive'] = ten_pit_game.loc[i+1,'drive'] - ten_pit_game.loc[i,'drive']

# Adding one more for last drive of game
ten_pit_game.loc[len(ten_pit_game)-1,'end_of_drive'] = 1

In [None]:
# Ensuring end of drive column is created correctly
ten_pit_game.head(10)

In [None]:
# Pulling end of drive rows only
ten_pit_game[ten_pit_game['end_of_drive'] == 1]

In [None]:
# There appears to be something wrong with yardage data for TEN
# https://www.espn.com/nfl/playbyplay?gameId=290910023
ten_pit_game[(ten_pit_game['end_of_drive'] == 1)&(ten_pit_game['game_half'] == 'Half1')]\
                                            .groupby('posteam').agg({'ydsnet':np.sum})

In [None]:
# Reset Fig
plt.close('all')
plt.style.use('seaborn')
fig, ax1 = plt.subplots(figsize=(20,7))
# plt.gca().invert_xaxis()

line1 = ax1.plot(ten_pit_game['drive'],ten_pit_game['total_home_score'],'y--',label='PIT')
line2 = ax1.plot(ten_pit_game['drive'],ten_pit_game['total_away_score'],'b-',label='TEN')

ax1.set_xlabel('Drive Number',fontsize=14)
ax1.set_ylabel('End of Drive Score',fontsize=14)
ax1.set_title("End of Drive Score",fontsize=14,fontweight='bold')

ax1.legend(loc=4,fontsize=16)

plt.show()