In [7]:
import pandas as pd
import numpy as np
import random 

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.grid_search import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix

import matplotlib
%matplotlib inline

In [66]:
def clean(filename):
    pbp = pd.read_csv(filename)

    # convert column names to upper case (since they are inconsistent in the data by season)
    pbp.columns = [x.upper() for x in pbp.columns]
    
    # drop all columns with no info, we won't use, or aren't in all the data
    pbp.drop(
        ['UNNAMED: 10', \
         'UNNAMED: 12', \
         'UNNAMED: 16',\
         'UNNAMED: 17', \
         'CHALLENGER', \
         'ISMEASUREMENT', \
         'NEXTSCORE', \
         'TEAMWIN',\
         'ISINCOMPLETE', \
         'ISTOUCHDOWN', \
         'ISSACK', \
         'ISCHALLENGE', \
         'ISCHALLENGEREVERSED', \
         'ISINTERCEPTION', \
         'ISPENALTY', \
         'ISTWOPOINTCONVERSION', \
         'SERIESFIRSTDOWN', \
         'ISTWOPOINTCONVERSIONSUCCESSFUL', \
         'ISPENALTYACCEPTED', \
         'PENALTYTEAM', \
         'ISFUMBLE', \
         'PENALTYTYPE', \
         'PENALTYYARDS', \
         'SEASONYEAR', \
         'GAMEID', \
         'GAMEDATE', \
         'ISNOPLAY', \
         'DEFENSESCORE',
         'ISMESUREMENT',
         'ISPRESEASON',
         'OFFENSESCORE',
         'PENALIZEDPLAYER',
         'PLAYID',
         'SCORECHANGE',
         'SCOREDIFF'], \
          axis=1, inplace=True,  errors='ignore')
    
    # rename the columns to be more friendly
    pbp.columns = ['Quarter', \
                    'Minute', \
                    'Second', \
                    'OffenseTeam', \
                    'DefenseTeam', \
                    'Down', \
                    'ToGo', \
                    'YardLine', \
                    'Description', \
                    'Yards', \
                    'Formation', \
                    'PlayType', \
                    'IsRush', \
                    'IsPass', \
                    'PassType', \
                    'RushDirection', \
                    'YardLineFixed', \
                    'YardLineDirection']

    # get rid of all the kicks except punts and field goals
    pbp = pbp[(pbp.PlayType != 'KICK OFF') & (pbp.PlayType != 'EXTRA POINT') & \
              (pbp.PlayType != 'TWO-POINT CONVERSION')]

    # get rid of all the timeout plays
    pbp = pbp[pbp.OffenseTeam.notnull()]

    # get rid of all of the no-plays
    pbp = pbp[(pbp.PlayType != 'NO PLAY') & (pbp.PlayType != 'EXCEPTION') & \
              (pbp.PlayType != 'CLOCK STOP')& (pbp.PlayType != 'PENALTY')]

    # get rid of all the malformed pass types
    passtypes = ['DEEP MIDDLE', 'SHORT LEFT', 'SHORT RIGHT', 'SHORT MIDDLE', \
                 'DEEP LEFT', 'DEEP RIGHT']
    pbp = pbp[(pbp['PassType'].isin(passtypes)) | (pbp['PassType'].isnull())]

    # replace the nan PlayTypes with 'DIRECT SNAP'
    pbp.PlayType = pbp.PlayType.fillna('DIRECT SNAP')

    # drop the existing IsRush/IsPass and create new ones
    pbp.drop(['IsRush', 'IsPass'], axis=1, inplace=True)

    play_to_rush = {
        'RUSH': 1,
        'PASS' : 0,
        'PUNT' : 0,
        'QB KNEEL' : 1,
        'SCRAMBLE' : 0,
        'FIELD GOAL' : 0,
        'SACK' : 0,
        'FUMBLES' : 1,
        'DIRECT SNAP' : 1
    }

    pbp['IsRush'] = pbp['PlayType'].map(play_to_rush)

    play_to_pass = {
        'RUSH': 0,
        'PASS' : 1,
        'PUNT' : 0,
        'QB KNEEL' : 0,
        'SCRAMBLE' : 1,
        'FIELD GOAL' : 0,
        'SACK' : 1,
        'FUMBLES' : 0,
        'DIRECT SNAP' : 0
    }

    pbp['IsPass'] = pbp['PlayType'].map(play_to_pass)

    play_to_kick = {
        'RUSH': 0,
        'PASS' : 0,
        'PUNT' : 1,
        'QB KNEEL' : 0,
        'SCRAMBLE' : 0,
        'FIELD GOAL' : 1,
        'SACK' : 0,
        'FUMBLES' : 0,
        'DIRECT SNAP' : 0
    }

    pbp['IsKick'] = pbp['PlayType'].map(play_to_kick)


    # Combine the dummy classes into one var and drop the dummies
    def play_type(x):
        if x.IsRush == 1:
            return 'RUSH'
        if x.IsPass == 1:
            return 'PASS'
        if x.IsKick == 1:
            return 'KICK'
        else:
            return 'NaN'

    pbp['Play'] = pbp.apply(lambda x: play_type(x), axis=1)

    pbp.drop(['IsRush', 'IsPass', 'IsKick'], axis=1, inplace=True)

    # Convert some columns to categorical
    pbp.Formation = pbp.Formation.astype("category")
    pbp.OffenseTeam = pbp.OffenseTeam.astype("category")
    pbp.DefenseTeam = pbp.DefenseTeam.astype("category")
    pbp.Play = pbp.Play.astype("category")

    return pbp

In [9]:
def prep(df):

    # A few columns have to go for us to build a model (though we use them for user presentation)
    df.drop(['YardLineFixed', 'YardLineDirection','Description', 'PlayType', \
             'PassType', 'RushDirection', 'Yards', 'DefenseTeam'], axis=1, inplace=True)

    # create dummy variables for formations
    form_dummies = pd.get_dummies(df.Formation)
    form_dummies.columns = map(lambda x: 'FORMATION_' + x.replace (' ', '_'), form_dummies.columns)

    # create dummy variables for teams
    team_dummies = pd.get_dummies(df.OffenseTeam)
    team_dummies.columns = map(lambda x: 'TEAM_' + str(x), team_dummies.columns)

    # combine the dummy variables and drop the categorical versions
    df_prepped = pd.concat(
        [df.ix[:,['Quarter', 'Minute', 'Second', 'Down', 'ToGo', 'YardLine', 'Play']],
        team_dummies,
        form_dummies], axis=1)

    return df_prepped

In [10]:
def combine(filenames):

    dfs = []
    for filename in filenames:
        dfs.append(pd.read_csv(filename))

    df = pd.concat(dfs)

    return df

In [11]:
def prep_record(record):
    '''
    INPUT: One play record as a single row DataFrame in "clean form"
    OUTPUT: The record in "model form"

    This will work for multiple records, although it's really meant for one.
    '''

    formations = [
    'FIELD_GOAL',
    'NO_HUDDLE',
    'NO_HUDDLE_SHOTGUN',
    'PUNT',
    'SHOTGUN',
    'UNDER_CENTER',
    'WILDCAT']

    teams = [
    'ARI',
    'ATL',
    'BAL',
    'BUF',
    'CAR',
    'CHI',
    'CIN',
    'CLE',
    'DAL',
    'DEN',
    'DET',
    'GB',
    'HOU',
    'IND',
    'JAX',
    'KC',
    'LA',
    'MIA',
    'MIN',
    'NE',
    'NO',
    'NYG',
    'NYJ',
    'OAK',
    'PHI',
    'PIT',
    'SD',
    'SEA',
    'SF',
    'TB',
    'TEN',
    'WAS']

    # Dummy the team
    df2 = pd.get_dummies(record.OffenseTeam)
    dummies_frame = pd.get_dummies(teams)
    df2 = df2.reindex(columns=dummies_frame.columns, fill_value=0)
    df2.columns = map(lambda x: 'TEAM_' + str(x), df2.columns)

    # Dummy the formation
    df1 = pd.get_dummies(record.Formation)
    dummies_frame = pd.get_dummies(formations)
    df1 = df1.reindex(columns=dummies_frame.columns, fill_value=0)
    df1.columns = map(lambda x: 'FORMATION_' + x.replace (' ', '_'), df1.columns)

    # Combine the dummy variables and drop the categorical versions
    record = pd.concat(
        [record.ix[:,['Quarter', 'Minute', 'Second', 'Down', 'ToGo', 'YardLine', 'Play']],
        df2,
        df1], axis=1)

    return record

In [67]:
pbp2016 = clean('../data/pbp-2016.csv')

In [74]:
pbp2016.to_csv('../data/pbp2016-clean.csv', index=False)

In [31]:
# this would have been amazing to have... so, of course it's not filled in
pbp2016['scorediff'].unique()

array([0])

In [44]:
pbp2016.columns = [x.upper() for x in pbp2016.columns]
pbp2015.columns = [x.upper() for x in pbp2015.columns]

In [45]:
cols2016 = set(pbp2016.columns)
cols2015 = set(pbp2015.columns)

In [50]:
cols2016.difference(cols2015)

{'DEFENSESCORE',
 'ISMESUREMENT',
 'ISPRESEASON',
 'OFFENSESCORE',
 'PENALIZEDPLAYER',
 'PLAYID',
 'SCORECHANGE',
 'SCOREDIFF'}

In [55]:
pbp2016.columns = [x.upper() for x in pbp2016.columns]
    
# drop all columns with no info, we won't use, or aren't in all the data
pbp2016.drop(
    ['UNNAMED: 10', \
     'UNNAMED: 12', \
     'UNNAMED: 16',\
     'UNNAMED: 17', \
     'CHALLENGER', \
     'ISMEASUREMENT', \
     'NEXTSCORE', \
     'TEAMWIN',\
     'ISINCOMPLETE', \
     'ISTOUCHDOWN', \
     'ISSACK', \
     'ISCHALLENGE', \
     'ISCHALLENGEREVERSED', \
     'ISINTERCEPTION', \
     'ISPENALTY', \
     'ISTWOPOINTCONVERSION', \
     'SERIESFIRSTDOWN', \
     'ISTWOPOINTCONVERSIONSUCCESSFUL', \
     'ISPENALTYACCEPTED', \
     'PENALTYTEAM', \
     'ISFUMBLE', \
     'PENALTYTYPE', \
     'PENALTYYARDS', \
     'SEASONYEAR', \
     'GAMEID', \
     'GAMEDATE', \
     'ISNOPLAY', \
     'DEFENSESCORE',
     'ISMESUREMENT',
     'ISPRESEASON',
     'OFFENSESCORE',
     'PENALIZEDPLAYER',
     'PLAYID',
     'SCORECHANGE',
     'SCOREDIFF'], \
      axis=1, inplace=True,  errors='ignore')

In [57]:
for c in pbp2016.columns:
    print c

QUARTER
MINUTE
SECOND
OFFENSETEAM
DEFENSETEAM
DOWN
TOGO
YARDLINE
DESCRIPTION
YARDS
FORMATION
PLAYTYPE
ISRUSH
ISPASS
PASSTYPE
RUSHDIRECTION
YARDLINEFIXED
YARDLINEDIRECTION


In [63]:
pbp2016.columns = ['Quarter', \
 'Minute', \
 'Second', \
 'OffenseTeam', \
 'DefenseTeam', \
 'Down', \
 'ToGo', \
 'YardLine', \
 'Description', \
 'Yards', \
 'Formation', \
 'PlayType', \
 'IsRush', \
 'IsPass', \
 'PassType', \
 'RushDirection', \
 'YardLineFixed', \
 'YardLineDirection']

In [64]:
pbp2016.columns

Index([u'Quarter', u'Minute', u'Second', u'OffenseTeam', u'DefenseTeam',
       u'Down', u'ToGo', u'YardLine', u'Description', u'Yards', u'Formation',
       u'PlayType', u'IsRush', u'IsPass', u'PassType', u'RushDirection',
       u'YardLineFixed', u'YardLineDirection'],
      dtype='object')

In [85]:
# read in the cleaned data and combine it
filenames = ['../data/pbp2016-clean.csv', '../data/pbp2015-clean.csv', '../data/pbp2014-clean.csv', '../data/pbp2013-clean.csv']
pbp = combine(filenames)

In [91]:
pbp.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 134998 entries, 0 to 39489
Data columns (total 9 columns):
Quarter        134998 non-null int64
Minute         134998 non-null int64
Second         134998 non-null int64
OffenseTeam    134998 non-null object
Down           134998 non-null int64
ToGo           134998 non-null int64
YardLine       134998 non-null int64
Formation      134993 non-null object
Play           134998 non-null object
dtypes: int64(6), object(3)
memory usage: 10.3+ MB


In [87]:
prep_pbp = prep(pbp)

In [88]:
# prep the data for modeling and then save it to file
prep_pbp.to_csv('../data/pbp-prepped.csv', index=False)

In [92]:
prep_pbp.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 134998 entries, 0 to 39489
Data columns (total 46 columns):
Quarter                        134998 non-null int64
Minute                         134998 non-null int64
Second                         134998 non-null int64
Down                           134998 non-null int64
ToGo                           134998 non-null int64
YardLine                       134998 non-null int64
Play                           134998 non-null object
TEAM_ARI                       134998 non-null uint8
TEAM_ATL                       134998 non-null uint8
TEAM_BAL                       134998 non-null uint8
TEAM_BUF                       134998 non-null uint8
TEAM_CAR                       134998 non-null uint8
TEAM_CHI                       134998 non-null uint8
TEAM_CIN                       134998 non-null uint8
TEAM_CLE                       134998 non-null uint8
TEAM_DAL                       134998 non-null uint8
TEAM_DEN                       134998 non