In [1]:
import nflgame
import mirofeatures as mf
import pandas as pd
import numpy as np
from sklearn import linear_model

In [57]:
# Fetch the per-play feature records (ftrs2) and the matching isPass labels
# from the project helper module.
# NOTE(review): presumably 2009 and 2016 bound the seasons to load; whether the
# end year is inclusive depends on mf.getFeatures — confirm.
ftrs2, isPass = mf.getFeatures(2009,2016)

In [58]:
# Tabular views of the feature records and of the labels.
df_ftrs = pd.DataFrame(data=ftrs2)
df_isPass = pd.DataFrame(data=isPass)
# Keep the labels as an ndarray as well so they can be indexed with a boolean mask.
isPass = np.array(isPass)
# Preview the first 30 feature rows.
df_ftrs.head(n=30)

Unnamed: 0,down,isHome,opponent,position,shotgun,team,time,togo
0,1,True,TEN,58,0,PIT,1793.0,10
1,2,True,TEN,53,0,PIT,1756.0,5
2,3,True,TEN,56,1,PIT,1715.0,8
3,1,False,PIT,98,0,TEN,1696.0,10
4,2,False,PIT,98,0,TEN,1660.0,10
5,3,False,PIT,94,1,TEN,1631.0,6
6,1,True,TEN,43,0,PIT,1584.0,10
7,2,True,TEN,40,0,PIT,1548.0,7
8,1,True,TEN,30,0,PIT,1521.0,10
9,2,True,TEN,31,0,PIT,1483.0,11


In [59]:
# ndarray built from the raw feature records; in the cells visible here it is
# only used for its length when building the train/test mask.
ftrs = np.array(ftrs2)

## Using a simple logistic classifier

To start, we have to drop the categorical features (`opponent` and `team`) because they don't play well with linear classification unless they are first encoded numerically.

In [60]:
# Design matrix for the models: every feature column except the two team
# identifier columns, which the linear model cannot consume as-is.
X = df_ftrs.drop(labels=['opponent', 'team'], axis=1)

In [88]:
# Random 75% / 25% train/test split expressed as a boolean row mask
# (True = training row, False = test row).
np.random.seed(12346)  # fixed seed so the shuffled split is reproducible
# Explicit bool dtype: the mask is used for boolean indexing, and older NumPy
# releases did not infer the dtype of np.full from the fill value.
mask = np.full(len(ftrs), False, dtype=bool)
# Floor division keeps the slice bound an integer; `3/4` yields a float under
# Python 3 true division, which is not a valid slice index.
mask[:len(ftrs) * 3 // 4] = True
# Scatter the True entries uniformly at random over all rows.
np.random.shuffle(mask)

In [75]:
# Linear classifier trained by stochastic gradient descent with the logistic
# ('log') loss, fit on the rows selected by `mask` (the training rows).
# NOTE(review): X is passed in unscaled; the scikit-learn docs describe SGD
# estimators as sensitive to feature scaling, so standardising the columns
# before fitting may be worth trying — confirm.
clf = linear_model.SGDClassifier(loss='log')
clf.fit(X[mask],isPass[mask])

SGDClassifier(alpha=0.0001, average=False, class_weight=None, epsilon=0.1,
       eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='log', max_iter=None, n_iter=None,
       n_jobs=1, penalty='l2', power_t=0.5, random_state=None,
       shuffle=True, tol=None, verbose=0, warm_start=False)

In [76]:
# Accuracy of the SGD classifier on the training rows (mask) and on the
# held-out rows (~mask).
print("Success rate in predicting pass (for training data) is %f%%" % (100 * clf.score(X[mask], isPass[mask])))
print("Success rate in predicting pass (for test data) is %f%%" % (100 * clf.score(X[~mask], isPass[~mask])))

Success rate in predicting pass (for training data) is 57.041950%
Success rate in predicting pass (for test data) is 57.002515%


Before modification: 57%! That result was based on plain linear regression (not the log loss).

In [64]:
# Baseline: fraction of plays labelled as a pass, i.e. the accuracy obtained
# by always guessing "pass".
np.mean(isPass)

0.57018679877115996

Big surprise! Our simple logistic classifier is essentially no better than just always guessing "pass": about 57.0% accuracy on the test data versus a 57.0% share of pass plays.

In [65]:
# Hand-written baseline: predict "not a pass" (False) when `togo` is 3 or
# less, and "pass" (True) otherwise.
# Computed as one vectorised comparison instead of a per-row Python loop with
# chained `df['togo'][i]` lookups. `~(togo <= 3)` is used rather than
# `togo > 3` so that rows with a missing `togo` stay True, exactly as the
# loop left them. The result is still a plain list of bools.
# Assumes df_ftrs has one row per entry of isPass (both come from the same
# mf.getFeatures call) — TODO confirm.
naive = (~(df_ftrs['togo'] <= 3)).tolist()

In [66]:
# Accuracy of the `togo`-threshold heuristic: share of plays where its
# prediction matches the isPass label.
print(np.mean(naive == isPass))

0.585199435035


In [13]:
# Show the fitted weights of the SGD classifier: one row with one weight per
# column of X.
# NOTE(review): presumably the weights follow the column order of X (down,
# isHome, position, shotgun, time, togo) — confirm. The magnitudes are not
# comparable across features because the columns were not standardised.
print(clf.coef_)

[[ 525.84629903  -93.71808626  -23.27111979  798.66119171   -0.82831947
   100.42345504]]


## Using a tree based model

In [77]:
from sklearn.ensemble import RandomForestClassifier

In [89]:
# Random forest with scikit-learn's default hyper-parameters, fit on the rows
# selected by `mask` (the training rows).
# NOTE(review): no random_state is passed, so the fitted forest presumably
# depends on the state of the global NumPy RNG when this cell runs — confirm
# that is intended, or pass a fixed random_state.
# NOTE(review): the defaults put no limit on tree depth, which may explain the
# large gap between training and test accuracy — consider tuning.
rf_model = RandomForestClassifier()
rf_model.fit(X[mask],isPass[mask])

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [90]:
# Accuracy of the random forest on the training rows (mask) and on the
# held-out rows (~mask).
print("Success rate in predicting pass (for training data) is %f%%" % (100 * rf_model.score(X[mask], isPass[mask])))
print("Success rate in predicting pass (for test data) is %f%%" % (100 * rf_model.score(X[~mask], isPass[~mask])))

Success rate in predicting pass (for training data) is 95.210367%
Success rate in predicting pass (for test data) is 65.290863%


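Naively, it seems like there is a substantial improvement in predicting whether a play is a pass: 65% on the test data is not terrible. Note, however, the large gap to the 95% training accuracy, which suggests the forest is overfitting.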
