Predicting Pass Result on Short vs Long Passes From Defensive Formation

Research Question 1:

How well can a defensive formation predict the outcome of a pass? 
Does short vs deep pass plays affect our ability to predict the outcome?

In [58]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split 
from sklearn.metrics import accuracy_score
from tabulate import tabulate
from sklearn import metrics
from sklearn.metrics import mean_squared_error, roc_auc_score

In [34]:
plays = pd.read_csv("/Users/williamzwetolitz/Desktop/DS320/Data/plays.csv")

plays.head(10)

Unnamed: 0,gameId,playId,playDescription,quarter,down,yardsToGo,possessionTeam,playType,yardlineSide,yardlineNumber,...,preSnapHomeScore,gameClock,absoluteYardlineNumber,penaltyCodes,penaltyJerseyNumbers,passResult,offensePlayResult,playResult,epa,isDefensivePI
0,2018090600,75,(15:00) M.Ryan pass short right to J.Jones pus...,1,1,15,ATL,play_type_pass,ATL,20,...,0.0,15:00:00,90.0,,,C,10,10,0.261827,False
1,2018090600,146,(13:10) M.Ryan pass incomplete short right to ...,1,1,10,ATL,play_type_pass,PHI,39,...,0.0,13:10:00,49.0,,,I,0,0,-0.37236,False
2,2018090600,168,(13:05) (Shotgun) M.Ryan pass incomplete short...,1,2,10,ATL,play_type_pass,PHI,39,...,0.0,13:05:00,49.0,,,I,0,0,-0.702779,False
3,2018090600,190,(13:01) (Shotgun) M.Ryan pass deep left to J.J...,1,3,10,ATL,play_type_pass,PHI,39,...,0.0,13:01:00,49.0,,,C,33,33,3.04753,False
4,2018090600,256,(10:59) (Shotgun) M.Ryan pass incomplete short...,1,3,1,ATL,play_type_pass,PHI,1,...,0.0,10:59:00,11.0,,,I,0,0,-0.842272,False
5,2018090600,320,(10:10) (Shotgun) N.Foles pass short left to N...,1,2,8,PHI,play_type_pass,PHI,4,...,0.0,10:10:00,14.0,,,C,4,4,-0.344096,False
6,2018090600,344,(9:24) (Shotgun) N.Foles pass incomplete short...,1,3,4,PHI,play_type_pass,PHI,8,...,0.0,09:24:00,18.0,,,I,0,0,-1.192208,False
7,2018090600,402,(9:08) M.Ryan pass incomplete deep left to M.S...,1,1,10,ATL,play_type_pass,PHI,44,...,0.0,09:08:00,54.0,,,I,0,0,-0.429863,False
8,2018090600,492,(7:01) M.Ryan pass short left to T.Coleman pus...,1,2,13,ATL,play_type_pass,PHI,36,...,0.0,07:01:00,46.0,,,C,26,26,1.879804,False
9,2018090600,521,(6:19) M.Ryan pass short left to A.Hooper to P...,1,1,10,ATL,play_type_pass,PHI,10,...,0.0,06:19:00,20.0,,,C,3,3,0.045665,False


In [52]:
# Here we do some preprocessing:
#  - Taking out plays that end in interceptions, penalties, and fumbles
#  - Then taking only the passing plays since we are not focusing on runs
#  - We can see that we are using a total of 16,197 passing plays


df = plays.copy()
passes = df[df['playDescription'].str.contains(" pass ", case=False, na=False)]
passes.shape[0]

17851

In [36]:
# Here we created data tables from the play table based on the location and result (complete or incomplete) of a pass

# The "playDescription" column was a text string describing the events of the play. From this we extracted what happened
# during the play and filtered the rows into their respective table

incomplete_short = passes[passes['playDescription'].str.contains("incomplete short", case=False, na=False)]
incomplete_deep = passes[passes['playDescription'].str.contains("incomplete deep", case=False, na=False)]
complete_short = passes[passes['playDescription'].str.contains("pass short", case=False, na=False)]
complete_deep = passes[passes['playDescription'].str.contains("pass deep", case=False, na=False)]

#complete_short.to_csv("/Users/williamzwetolitz/Desktop/DS320/Data/complete_short.csv")
#complete_deep.to_csv("/Users/williamzwetolitz/Desktop/DS320/Data/complete_deep.csv")
#incomplete_short.to_csv("/Users/williamzwetolitz/Desktop/DS320/Data/incomplete_short.csv")
#incomplete_deep.to_csv("/Users/williamzwetolitz/Desktop/DS320/Data/incomplete_deep.csv")

In [60]:
# Need to deal with special plays, those including sacks, interceptions and fumbles. Since there was no
# no catch on the play we are labeling this as an incompletion
words = "INTERCEPTED|PENALTY|FUMBLES|SACKED|INCOMPLETE"
passes["binaryPassResult"] = np.where(passes["playDescription"].str.contains(words, case=False, na=False), 0, 1)
passes.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  passes["binaryPassResult"] = np.where(passes["playDescription"].str.contains(words, case=False, na=False), 0, 1)


Unnamed: 0,gameId,playId,playDescription,quarter,down,yardsToGo,possessionTeam,playType,yardlineSide,yardlineNumber,...,gameClock,absoluteYardlineNumber,penaltyCodes,penaltyJerseyNumbers,passResult,offensePlayResult,playResult,epa,isDefensivePI,binaryPassResult
0,2018090600,75,(15:00) M.Ryan pass short right to J.Jones pus...,1,1,15,ATL,play_type_pass,ATL,20,...,15:00:00,90.0,,,C,10,10,0.261827,False,1
1,2018090600,146,(13:10) M.Ryan pass incomplete short right to ...,1,1,10,ATL,play_type_pass,PHI,39,...,13:10:00,49.0,,,I,0,0,-0.37236,False,0
2,2018090600,168,(13:05) (Shotgun) M.Ryan pass incomplete short...,1,2,10,ATL,play_type_pass,PHI,39,...,13:05:00,49.0,,,I,0,0,-0.702779,False,0
3,2018090600,190,(13:01) (Shotgun) M.Ryan pass deep left to J.J...,1,3,10,ATL,play_type_pass,PHI,39,...,13:01:00,49.0,,,C,33,33,3.04753,False,1
4,2018090600,256,(10:59) (Shotgun) M.Ryan pass incomplete short...,1,3,1,ATL,play_type_pass,PHI,1,...,10:59:00,11.0,,,I,0,0,-0.842272,False,0


In [68]:
# Similar to above, but this time splitting pass plays, regardless of outcome, into short and long passes

short = passes[passes['playDescription'].str.contains("short", case=False, na=False)]
deep = passes[passes['playDescription'].str.contains("deep", case=False, na=False)]


short.head()


Unnamed: 0,gameId,playId,playDescription,quarter,down,yardsToGo,possessionTeam,playType,yardlineSide,yardlineNumber,...,gameClock,absoluteYardlineNumber,penaltyCodes,penaltyJerseyNumbers,passResult,offensePlayResult,playResult,epa,isDefensivePI,binaryPassResult
0,2018090600,75,(15:00) M.Ryan pass short right to J.Jones pus...,1,1,15,ATL,play_type_pass,ATL,20,...,15:00:00,90.0,,,C,10,10,0.261827,False,1
1,2018090600,146,(13:10) M.Ryan pass incomplete short right to ...,1,1,10,ATL,play_type_pass,PHI,39,...,13:10:00,49.0,,,I,0,0,-0.37236,False,0
2,2018090600,168,(13:05) (Shotgun) M.Ryan pass incomplete short...,1,2,10,ATL,play_type_pass,PHI,39,...,13:05:00,49.0,,,I,0,0,-0.702779,False,0
4,2018090600,256,(10:59) (Shotgun) M.Ryan pass incomplete short...,1,3,1,ATL,play_type_pass,PHI,1,...,10:59:00,11.0,,,I,0,0,-0.842272,False,0
5,2018090600,320,(10:10) (Shotgun) N.Foles pass short left to N...,1,2,8,PHI,play_type_pass,PHI,4,...,10:10:00,14.0,,,C,4,4,-0.344096,False,1


In [70]:
print(short.shape[0])
print(deep.shape[0])


14449
3402


In [75]:
# Removed plays where the number of defenders in the box or number of pass rushers was unknown

short = short[pd.notnull(short['defendersInTheBox'])]
deep = deep[pd.notnull(deep['defendersInTheBox'])]
print(short['defendersInTheBox'].isna().sum())
print(deep['defendersInTheBox'].isna().sum())
short = short[pd.notnull(short['numberOfPassRushers'])]
deep = deep[pd.notnull(deep['numberOfPassRushers'])]
print(short['numberOfPassRushers'].isna().sum())
print(deep['numberOfPassRushers'].isna().sum())

0
0
0
0


In [121]:
# The variables used to predict are:
# yardsToGo - Distance needed for the offense to gain a first down
#       May affect how aggressive a QB is, how they throw it, and type of pass they throw
# defendersInTheBox - Number of defenders in close proximity to line-of-scrimmage
#       Relates to how many players drop into coverage to protect the pass, the less players, the harder it is to pass
# numberOfPassRushers - Number of players rushing the quarterback
#       This affects how much time the QB as to throw

# The prediction variable is passResult, either complete or incomplete

# The data was split into training and testing, with 30% reserved for the latter

X = short[['yardsToGo', 'defendersInTheBox', 'numberOfPassRushers']]
y = short['binaryPassResult']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

In [122]:
# Here we train a random forest classifier using 100 estimators. RFC is a good choice here as it performs well in 
# classification problems like this one, effective at avoiding overfitting and can provide relative feature importance

short_clf=RandomForestClassifier(n_estimators=100)

short_clf.fit(X_train,y_train)
y_pred=short_clf.predict(X_test)

In [123]:
# Here we see the accuracy of our model

print("Model accuracy on short passes:",metrics.accuracy_score(y_test, y_pred))

rfc_short_acc = metrics.accuracy_score(y_test, y_pred)

print(rfc_short_acc)


Model accuracy on short passes: 0.6746332229058211
0.6746332229058211


In [124]:
print(roc_auc_score(y_test, y_pred))

0.5101073783087142


In [125]:
# Now, we will repeat the same process for deep passes

deep = deep[pd.notnull(deep['defendersInTheBox'])]

X = deep[['yardsToGo', 'defendersInTheBox', 'numberOfPassRushers']]
y = deep['passResult']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)


In [126]:
clf2=RandomForestClassifier(n_estimators=100)

clf2.fit(X_train,y_train)
y_pred=clf2.predict(X_test)

In [127]:
print("Model accuracy on deep passes:",metrics.accuracy_score(y_test, y_pred))

rfc_deep_acc = metrics.accuracy_score(y_test, y_pred)

print(rfc_deep_acc)

Model accuracy on deep passes: 0.5220588235294118
0.5220588235294118


We can see that the prediction accuracy was much higher for short passes than long passes. This is expected because deep passes are much less consistant than short passes in the NFL. 

In [128]:
# Using XGB for the same question looking at short passes
from xgboost import XGBClassifier

params_xgd = {
    'max_depth': 7,
    'objective': 'reg:logistic',
    'learning_rate': 0.05,
    'n_estimators': 10000
    }

X = short[['yardsToGo', 'defendersInTheBox', 'numberOfPassRushers']]
y = short['binaryPassResult']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state = 7)



In [138]:
model1 = XGBClassifier()
model1.fit(X_train, y_train)

In [139]:
y_pred = model1.predict(X_test)
preds = [round(value) for value in y_pred]

accuracy_short = accuracy_score(y_test, preds)
print("Accuracy: %.2f%%" % (accuracy_short * 100.0))


Accuracy: 60.16%


In [140]:
# Now doing XGBoost on deep passes

from xgboost import XGBClassifier

params_xgd = {
    'max_depth': 7,
    'objective': 'reg:logistic',
    'learning_rate': 0.05,
    'n_estimators': 10000
    }

X = deep[['yardsToGo', 'defendersInTheBox', 'numberOfPassRushers']]
y = deep['binaryPassResult']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state = 7)


In [142]:
model2 = XGBClassifier()
model2.fit(X_train, y_train)

In [143]:
y_pred = model2.predict(X_test)
preds = [round(value) for value in y_pred]

accuracy_deep = accuracy_score(y_test, preds)
print("Accuracy: %.2f%%" % (accuracy_deep * 100.0))

Accuracy: 60.16%


In [134]:
accuracy_data = [["Random Forest Classifier", rfc_short_acc, rfc_deep_acc], ["XGBoost", accuracy_short, accuracy_deep]]

col_names = ["Model", "Short Passes", "Long Passes"]

print(tabulate(accuracy_data, headers=col_names))

Model                       Short Passes    Long Passes
------------------------  --------------  -------------
Random Forest Classifier        0.674633       0.522059
XGBoost                         0.669664       0.601575


In [135]:
print(roc_auc_score(y_test, y_pred))

0.49040600667408235


Importance of Features

In [136]:
#Short Passes RFC
short_clf.feature_importances_

#features: 'yardsToGo', 'defendersInTheBox', 'numberOfPassRushers'

array([0.52531279, 0.22565956, 0.24902765])

In [137]:
#Deep Passes RFC
clf2.feature_importances_

#features: 'yardsToGo', 'defendersInTheBox', 'numberOfPassRushers'

array([0.57941718, 0.21794149, 0.20264133])

In [144]:
#Short Passes XGB
model1.feature_importances_

#features: 'yardsToGo', 'defendersInTheBox', 'numberOfPassRushers'

array([0.334362  , 0.34388238, 0.32175562], dtype=float32)

In [146]:
#Deep Passes XGB
model2.feature_importances_

#features: 'yardsToGo', 'defendersInTheBox', 'numberOfPassRushers'

array([0.334362  , 0.34388238, 0.32175562], dtype=float32)