In [3]:
import nflgame
import mirofeatures as mf
import morefeatures as mf2
import pandas as pd
import numpy as np
from sklearn import linear_model

In [4]:
# Load the feature rows and the matching pass labels for the 2009-2016 data
# set from the project's `mirofeatures` helper.
# NOTE(review): whether the 2016 bound is inclusive is decided inside
# mf.getFeatures, which is not shown here -- confirm.
ftrs2, isPass = mf.getFeatures(2009,2016)

In [5]:
# Labels first: keep a DataFrame copy, then rebind `isPass` to an ndarray so
# it can be indexed with the boolean train/test mask built further down.
df_isPass = pd.DataFrame(isPass)
isPass = np.array(isPass)

# Features as a DataFrame; display the first ten rows as a sanity check.
df_ftrs = pd.DataFrame(ftrs2)
df_ftrs.head(n=10)

Unnamed: 0,down,isHome,opponent,position,quarter,shotgun,team,time,togo
0,1,True,TEN,58,1,0,PIT,893.0,10
1,2,True,TEN,53,1,0,PIT,856.0,5
2,3,True,TEN,56,1,1,PIT,815.0,8
3,1,False,PIT,98,1,0,TEN,796.0,10
4,2,False,PIT,98,1,0,TEN,760.0,10
5,3,False,PIT,94,1,1,TEN,731.0,6
6,1,True,TEN,43,1,0,PIT,684.0,10
7,2,True,TEN,40,1,0,PIT,648.0,7
8,1,True,TEN,30,1,0,PIT,621.0,10
9,2,True,TEN,31,1,0,PIT,583.0,11


In [6]:
# ndarray copy of the raw feature records; the cells shown here only use it
# for its length when sizing the train/test mask.
ftrs = np.array(ftrs2, copy=True)

## Using a simple logistic classifier

To start, we have to drop the categorical features because they don't play well with linear classification. 

In [7]:
# 'opponent' and 'team' are categorical (team abbreviations), so they are
# left out of the design matrix used by the linear models.
X = df_ftrs.drop(labels=['opponent', 'team'], axis='columns')

In [8]:
# Reproducible 75/25 train/test split: flag the first three quarters of the
# entries as training rows, then shuffle the flags in place.
np.random.seed(123456)
# dtype is given explicitly so the mask is boolean on every NumPy version
# instead of relying on inference from the fill value.
mask = np.full(len(ftrs), False, dtype=bool)
# Floor division keeps the slice bound an integer; `len(ftrs)*3/4` is a float
# under Python 3 and is rejected as a slice index.
mask[:len(ftrs) * 3 // 4] = True
np.random.shuffle(mask)

In [9]:
# Logistic-loss linear classifier trained by SGD on the training rows only.
# NOTE(review): the features are not standardised before fitting and SGD is
# sensitive to feature scale -- worth checking against the reported accuracy.
clf = linear_model.SGDClassifier(loss='log')
clf.fit(X=X[mask], y=isPass[mask])



SGDClassifier(alpha=0.0001, average=False, class_weight=None, epsilon=0.1,
       eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='log', max_iter=None, n_iter=None,
       n_jobs=1, penalty='l2', power_t=0.5, random_state=None,
       shuffle=True, tol=None, verbose=0, warm_start=False)

In [10]:
# Accuracy of the SGD classifier on the rows it was fitted on versus the
# held-out rows (the complement of the training mask).
test_rows = ~mask
train_pct = 100 * clf.score(X[mask], isPass[mask])
test_pct = 100 * clf.score(X[test_rows], isPass[test_rows])
print("Success rate in predicting pass (for training data) is %f%%" % train_pct)
print("Success rate in predicting pass (for test data) is %f%%" % test_pct)

Success rate in predicting pass (for training data) is 47.702977%
Success rate in predicting pass (for test data) is 48.033529%


Before modification: 57%! That was based on plain linear regression (not logistic). There is a huge amount of variance depending on what our cut is.


In [11]:
# Fraction of plays labelled as a pass, i.e. the accuracy of always
# guessing "pass".
np.mean(isPass)

0.57018679877116

Big surprise! Our naive classifier is only barely better than just always guessing "pass."

Let's compare to a decision tree that Tynan thought up (I know nothing about football so couldn't really help - Miro)

In [31]:
# Hand-written decision rule: predict "pass" (True) by default and "run"
# (False) in short-yardage situations, with a separate yards-to-go cutoff for
# each down. Evaluated on whole columns at once rather than one row at a
# time, which avoids a Python-level loop with two Series lookups per play.
down = df_ftrs['down'].values
togo = df_ftrs['togo'].values
short_yardage = (
    ((down == 1) & (togo <= 5))
    | ((down == 2) & (togo < 6))
    | ((down == 3) & (togo < 4))
    | ((down == 4) & (togo < 3))
)
# Kept as a plain list of bools, matching what the row-by-row version built.
naive = (~short_yardage).tolist()

In [14]:
# Share of plays where the hand-written rule agrees with the true label.
print(np.mean(np.asarray(naive) == isPass))

0.5894366698938386


Ashwin's decision tree yields a 2% higher accuracy:

In [41]:
# Second hand-written decision rule: same shape as the first one (predict
# "pass" unless it is a short-yardage situation) but with tighter cutoffs on
# 3rd and 4th down. Evaluated on whole columns at once rather than one row at
# a time, which avoids a Python-level loop with two Series lookups per play.
down = df_ftrs['down'].values
togo = df_ftrs['togo'].values
short_yardage = (
    ((down == 1) & (togo <= 5))
    | ((down == 2) & (togo < 6))
    | ((down == 3) & (togo < 3))
    | ((down == 4) & (togo < 2))
)
# Kept as a plain list of bools, matching what the row-by-row version built.
naive2 = (~short_yardage).tolist()

In [42]:
# Share of plays where the second hand-written rule agrees with the true label.
print(np.mean(np.asarray(naive2) == isPass))

0.6004719215084724


Linear regression just barely outperforms this!

## Using LDA (Linear Discriminant Analysis)

In [15]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

In [16]:
# Linear discriminant analysis with default settings, fitted on the
# training rows only.
lda_model = LinearDiscriminantAnalysis()
lda_model.fit(X=X[mask], y=isPass[mask])

LinearDiscriminantAnalysis(n_components=None, priors=None, shrinkage=None,
              solver='svd', store_covariance=False, tol=0.0001)

In [17]:
# Accuracy of the LDA model on the training rows versus the held-out rows.
test_rows = ~mask
train_pct = 100 * lda_model.score(X[mask], isPass[mask])
test_pct = 100 * lda_model.score(X[test_rows], isPass[test_rows])
print("Success rate in predicting pass (for training data) is %f%%" % train_pct)
print("Success rate in predicting pass (for test data) is %f%%" % test_pct)

Success rate in predicting pass (for training data) is 70.707296%
Success rate in predicting pass (for test data) is 70.990780%


## Using a tree based model

In [29]:
from sklearn.ensemble import RandomForestClassifier

In [30]:
# Random forest with default settings, fitted on the training rows only.
# `mask` must have one entry per row of X for the boolean indexing to work.
rf_model = RandomForestClassifier()
rf_model.fit(X=X[mask], y=isPass[mask])

ValueError: Item wrong length 367758 instead of 238599.

In [None]:
# Accuracy of the random forest on the training rows versus the held-out rows.
test_rows = ~mask
train_pct = 100 * rf_model.score(X[mask], isPass[mask])
test_pct = 100 * rf_model.score(X[test_rows], isPass[test_rows])
print("Success rate in predicting pass (for training data) is %f%%" % train_pct)
print("Success rate in predicting pass (for test data) is %f%%" % test_pct)

Naively, it seems like there is a substantial improvement in predicting play/pass. 66% is not terrible! But 96% training accuracy? Clearly some overfitting is happening here. Let's try dropping the time feature.


In [None]:
# Rebuild the design matrix without the categorical columns and without
# 'time'. This rebinds `X`, so later cells see the reduced feature set.
X = df_ftrs.drop(labels=['opponent', 'team', 'time'], axis='columns')

In [None]:
# Second random forest, fitted on the training rows of the current `X`.
rf_model2 = RandomForestClassifier()
rf_model2.fit(X=X[mask], y=isPass[mask])

In [None]:
# Accuracy of the second random forest on the training rows versus the
# held-out rows.
test_rows = ~mask
train_pct = 100 * rf_model2.score(X[mask], isPass[mask])
test_pct = 100 * rf_model2.score(X[test_rows], isPass[test_rows])
print("Success rate in predicting pass (for training data) is %f%%" % train_pct)
print("Success rate in predicting pass (for test data) is %f%%" % test_pct)

Nice! We've increased test accuracy and looks like at least some of the overfitting for training data has been resolved. What if we include team and opponent?

## Expanding our feature set (and also changing data set to be 2002-2012 rather than 2009-2016)

In [None]:
# Load the expanded feature rows and the matching pass labels from the
# project's `morefeatures` helper.
# NOTE(review): the section heading says this data set covers 2002-2012;
# the range is chosen inside mf2.getAllFeatures, which is not shown -- confirm.
nftrs2, nisPass = mf2.getAllFeatures()

In [None]:
# Labels first: keep a DataFrame copy, then rebind `nisPass` to an ndarray so
# it can be indexed with the boolean train/test mask.
ndf_isPass = pd.DataFrame(nisPass)
nisPass = np.array(nisPass)

# Expanded features as a DataFrame; display the first ten rows.
ndf_ftrs = pd.DataFrame(nftrs2)
ndf_ftrs.head(n=10)

In [None]:
# Design matrix for the expanded data set: every column except 'opponent',
# 'team' and 'togoal'.
nX = ndf_ftrs.drop(labels=['opponent', 'team', 'togoal'], axis='columns')

In [None]:
# Fresh 75/25 train/test split sized for the expanded data set, with its own
# seed.
# NOTE: this rebinds `mask`, the same name the 2009-2016 models index with.
# Re-running one of those earlier cells after this one fails with a
# length-mismatch ValueError unless the earlier split cell is re-run first.
np.random.seed(126)
# dtype is given explicitly so the mask is boolean on every NumPy version.
mask = np.full(len(nftrs2), False, dtype=bool)
# Floor division keeps the slice bound an integer; `len(nftrs2)*3/4` is a
# float under Python 3 and is rejected as a slice index.
mask[:len(nftrs2) * 3 // 4] = True
np.random.shuffle(mask)

# Random forest on the expanded feature set, fitted on the training rows only.
rf_model3 = RandomForestClassifier()
rf_model3.fit(nX[mask],nisPass[mask])

In [32]:
# Accuracy of the expanded-feature random forest on the training rows versus
# the held-out rows.
test_rows = ~mask
train_pct = 100 * rf_model3.score(nX[mask], nisPass[mask])
test_pct = 100 * rf_model3.score(nX[test_rows], nisPass[test_rows])
print("Success rate in predicting pass (for training data) is %f%%" % train_pct)
print("Success rate in predicting pass (for test data) is %f%%" % test_pct)

Success rate in predicting pass (for training data) is 73.306311%
Success rate in predicting pass (for test data) is 69.731347%
