In [17]:
# typical imports
import pandas as pd
import matplotlib

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.cross_validation import train_test_split
from sklearn.cross_validation import KFold
from sklearn.grid_search import GridSearchCV
from sklearn.cross_validation import cross_val_score
from sklearn.preprocessing import StandardScaler

%matplotlib inline



In [190]:
# Read each season's CSV into its own DataFrame; the 2013 load below is left disabled.
season_files = ('pbp-2015.csv', 'pbp-2014.csv')
pbp2015, pbp2014 = map(pd.read_csv, season_files)
#pbp2013 = pd.read_csv('pbp-2013.csv')

- Check descriptions for NaN formations --> They were timeouts or 2-min warnings
* We could bin up ToGo into non-overlapping ranges: 1, 2-5, 6-10, 11-15, 16+
- Combine No Huddle with corresponding formations --> There's a straight 'NO HUDDLE' formation, so nothing would be left
- Combine IsRush/IsPass with other outcomes (punt, field goal, etc.) and make that the target
* Train GBC on it
* Bayesian classifier?

In [191]:
# Print every column name followed by its distinct values to get a feel for the data.
# A single pre-formatted string is handed to print(...) so this line is valid, and
# emits the same "name values" text, under both the Python 2 print statement and
# the Python 3 print function (the bare `print a, b` form is a SyntaxError on Python 3).
for column in pbp2015:
    print("%s %s" % (column, pbp2015[column].unique()))

GameId [2015091000 2015091300 2015091301 2015091302 2015091303 2015091304
 2015091305 2015091306 2015091307 2015091308 2015091309 2015091310
 2015091311 2015091312 2015091400 2015091401 2015091700 2015092000
 2015092001 2015092002 2015092003 2015092004 2015092005 2015092006
 2015092007 2015092008 2015092009 2015092010 2015092011 2015092012
 2015092013 2015092100 2015092400 2015092700 2015092701 2015092702
 2015092703 2015092704 2015092705 2015092706 2015092707 2015092708
 2015092709 2015092710 2015092711 2015092712 2015092713 2015092800
 2015100100 2015100400 2015100401 2015100402 2015100403 2015100404
 2015100405 2015100406 2015100407 2015100408 2015100409 2015100410
 2015100411 2015100412 2015100500 2015100800 2015101100 2015101101
 2015101102 2015101103 2015101104 2015101105 2015101106 2015101107
 2015101108 2015101109 2015101110 2015101111 2015101200 2015101500
 2015101800 2015101801 2015101802 2015101803 2015101804 2015101805
 2015101806 2015101807 2015101808 2015101809 2015101810

In [192]:
# Remove the columns that carry no information, one group at a time (in place).
empty_column_groups = [
    ['Unnamed: 10', 'Unnamed: 12', 'Unnamed: 16', 'Unnamed: 17', 'Challenger'],
    ['IsMeasurement', 'NextScore', 'TeamWin'],
]
for empty_columns in empty_column_groups:
    pbp2015.drop(empty_columns, axis=1, inplace=True)

In [193]:
# Remove the columns this analysis does not use, one group at a time (in place).
unused_column_groups = [
    ['IsIncomplete', 'IsTouchdown', 'IsSack', 'IsChallenge', 'IsChallengeReversed'],
    ['IsInterception', 'IsPenalty', 'IsTwoPointConversion', 'SeriesFirstDown'],
    ['IsTwoPointConversionSuccessful', 'IsPenaltyAccepted', 'PenaltyTeam'],
    ['IsFumble', 'PenaltyType', 'PenaltyYards', 'SeasonYear', 'GameId', 'GameDate'],
]
for unused_columns in unused_column_groups:
    pbp2015.drop(unused_columns, axis=1, inplace=True)

In [194]:
# Distinct PlayType values (including NaN) before any rows are filtered out.
pbp2015['PlayType'].unique()

array([nan, 'RUSH', 'PASS', 'TIMEOUT', 'PUNT', 'QB KNEEL', 'KICK OFF',
       'NO PLAY', 'SCRAMBLE', 'FIELD GOAL', 'SACK', 'EXTRA POINT',
       'TWO-POINT CONVERSION', 'EXCEPTION', 'FUMBLES', 'CLOCK STOP',
       'PENALTY'], dtype=object)

In [195]:
# Play types removed from the data set:
#   - all kicks except punts and field goals (plus two-point conversion tries)
#   - no-plays and other rows that are not an actual snap
excluded_play_types = [
    'KICK OFF', 'EXTRA POINT', 'TWO-POINT CONVERSION',
    'NO PLAY', 'EXCEPTION', 'CLOCK STOP', 'PENALTY',
]

# isin() is False for NaN, so rows with a missing PlayType survive this mask,
# exactly as they did with the original chain of `!=` comparisons.
is_wanted_play_type = ~pbp2015.PlayType.isin(excluded_play_types)

# get rid of all the timeout plays (per the original note, these are the rows with no OffenseTeam)
has_offense = pbp2015.OffenseTeam.notnull()

# Take an explicit copy of the filtered rows: later cells assign into and drop
# columns from this frame, which on a boolean-indexed slice can raise
# SettingWithCopyWarning and leave it unclear which object is being modified.
pbp2015 = pbp2015[is_wanted_play_type & has_offense].copy()

# replace the nan PlayTypes with 'DIRECT SNAP' (they are...)
pbp2015['PlayType'] = pbp2015['PlayType'].fillna('DIRECT SNAP')

In [196]:
# Distinct PlayType values that remain after filtering and filling NaN.
pbp2015['PlayType'].unique()

array(['RUSH', 'PASS', 'PUNT', 'QB KNEEL', 'SCRAMBLE', 'FIELD GOAL',
       'SACK', 'FUMBLES', 'DIRECT SNAP'], dtype=object)

In [197]:
# Discard the IsRush/IsPass columns that came with the file; they are rebuilt
# below from PlayType, together with a new IsKick flag.
pbp2015.drop(['IsRush', 'IsPass'], axis=1, inplace=True)

# Every PlayType value the lookup tables cover.
all_play_types = [
    'RUSH', 'PASS', 'PUNT', 'QB KNEEL', 'SCRAMBLE',
    'FIELD GOAL', 'SACK', 'FUMBLES', 'DIRECT SNAP',
]

# Which play types count towards each flag; each type appears in exactly one set.
rush_plays = {'RUSH', 'QB KNEEL', 'FUMBLES', 'DIRECT SNAP'}
pass_plays = {'PASS', 'SCRAMBLE', 'SACK'}
kick_plays = {'PUNT', 'FIELD GOAL'}

# PlayType -> 0/1 lookup tables, one per flag.
play_to_rush = {play: int(play in rush_plays) for play in all_play_types}
play_to_pass = {play: int(play in pass_plays) for play in all_play_types}
play_to_kick = {play: int(play in kick_plays) for play in all_play_types}

# Columns are appended in the order IsRush, IsPass, IsKick.
pbp2015['IsRush'] = pbp2015['PlayType'].map(play_to_rush)
pbp2015['IsPass'] = pbp2015['PlayType'].map(play_to_pass)
pbp2015['IsKick'] = pbp2015['PlayType'].map(play_to_kick)

In [198]:
# Summarize the frame: per-column non-null counts and dtypes, to confirm the
# flag columns were added and to see which columns still contain missing values.
pbp2015.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 38027 entries, 5 to 46276
Data columns (total 20 columns):
Quarter              38027 non-null int64
Minute               38027 non-null int64
Second               38027 non-null int64
OffenseTeam          38027 non-null object
DefenseTeam          38027 non-null object
Down                 38027 non-null int64
ToGo                 38027 non-null int64
YardLine             38027 non-null int64
Description          38027 non-null object
Yards                38027 non-null int64
Formation            38027 non-null object
PlayType             38027 non-null object
PassType             19351 non-null object
RushDirection        12543 non-null object
YardLineFixed        38027 non-null int64
YardLineDirection    38027 non-null object
IsNoPlay             38027 non-null int64
IsRush               38027 non-null int64
IsPass               38027 non-null int64
IsKick               38027 non-null int64
dtypes: int64(12), object(8)
memory usage: 

In [199]:
# (rows, columns) of the frame at this point in the notebook.
pbp2015.shape

(38027, 20)