In [1]:
%matplotlib inline
import warnings
warnings.filterwarnings(action='ignore')

# data
import pandas as pd
import numpy as np
import random as rnd
import sklearn.preprocessing
from sklearn.model_selection import KFold
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline
from collections import Counter

# visualization
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

# machine learning
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import ExtraTreeClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.neighbors import RadiusNeighborsClassifier

from sklearn.linear_model import LinearRegression
from sklearn.svm import LinearSVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor

from sklearn import metrics


from sklearn.model_selection import GridSearchCV
from sklearn.metrics import explained_variance_score
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score



In [2]:
pd.set_option('display.max_columns', 200)
pd.set_option('display.max_rows', 200)
pd.set_option('precision', 0)

In [3]:
origin_df = pd.read_csv('bdb.csv', low_memory = False)
# origin_df.head(60)

In [4]:
origin_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 682154 entries, 0 to 682153
Data columns (total 49 columns):
GameId                    682154 non-null int64
PlayId                    682154 non-null int64
Team                      682154 non-null object
X                         682154 non-null float64
Y                         682154 non-null float64
S                         682154 non-null float64
A                         682154 non-null float64
Dis                       682154 non-null float64
Orientation               682131 non-null float64
Dir                       682126 non-null float64
NflId                     682154 non-null int64
DisplayName               682154 non-null object
JerseyNumber              682154 non-null int64
Season                    682154 non-null int64
YardLine                  682154 non-null int64
Quarter                   682154 non-null int64
GameClock                 682154 non-null object
PossessionTeam            682154 non-null object
Down   

In [5]:
working_df = origin_df.copy()

In [6]:
working_df.columns

Index(['GameId', 'PlayId', 'Team', 'X', 'Y', 'S', 'A', 'Dis', 'Orientation',
       'Dir', 'NflId', 'DisplayName', 'JerseyNumber', 'Season', 'YardLine',
       'Quarter', 'GameClock', 'PossessionTeam', 'Down', 'Distance',
       'FieldPosition', 'HomeScoreBeforePlay', 'VisitorScoreBeforePlay',
       'NflIdRusher', 'OffenseFormation', 'OffensePersonnel',
       'DefendersInTheBox', 'DefensePersonnel', 'PlayDirection', 'TimeHandoff',
       'TimeSnap', 'Yards', 'PlayerHeight', 'PlayerWeight', 'PlayerBirthDate',
       'PlayerCollegeName', 'Position', 'HomeTeamAbbr', 'VisitorTeamAbbr',
       'Week', 'Stadium', 'Location', 'StadiumType', 'Turf', 'GameWeather',
       'Temperature', 'Humidity', 'WindSpeed', 'WindDirection'],
      dtype='object')

### Setup

In [7]:
working_df.drop(columns=['GameId','Dir', 'X', 'Y', 'S', 'A', 'Dis', 'Orientation','DisplayName', 
                         'JerseyNumber', 'PlayerHeight', 'PlayerWeight', 'NflId', 'TimeHandoff',
                         'PlayerBirthDate', 'PlayerCollegeName', 'Position', 'Team', 'TimeSnap',
                         'Stadium', 'Location', 'WindSpeed', 'WindDirection', 'NflIdRusher', 'PlayDirection']
                , axis= 1, inplace= True)

In [8]:
working_df.drop_duplicates(subset= "PlayId", inplace= True)

In [9]:
working_df.set_index('PlayId', inplace=True)

### Season

In [10]:
le = sklearn.preprocessing.LabelEncoder()

In [11]:
working_df['Season'] = le.fit_transform(working_df.Season.values)

### YardLine

In [12]:
working_df['YardLine'].isnull().sum()
# leave as is

0

### Quarter

In [13]:
working_df['Quarter'].value_counts()
# leave as is

1    7942
4    7779
3    7648
2    7441
5     197
Name: Quarter, dtype: int64

### GameClock

In [14]:
# Because Quarter is a feature, we can just make convert this column into seconds that are left in the quarter

def GameClockSeconds(GameClock):
    time_split = GameClock.split(':')
    seconds = int(time_split[0])*60 + int(time_split[1]) 
    return seconds

working_df['GameClock'] = working_df['GameClock'].apply(GameClockSeconds)

### PossessionTeam

In [15]:
working_df.rename(columns={'PossessionTeam':'Team'}, inplace=True)

In [16]:
# The below code fixes incorrect names and is included in the data
# cleaning module:

working_df['Team'].replace(to_replace= {'ARZ':'ARI', 'BLT':'BAL',
                                                  'CLV':'CLE', 'HST':'HOU'}, 
                                    inplace= True)

working_df['FieldPosition'].replace(to_replace= {'ARZ':'ARI', 'BLT':'BAL',
                                                  'CLV':'CLE', 'HST':'HOU'}, 
                                    inplace= True)

In [17]:
# create isHome column
working_df['isHome'] = 0

working_df.loc[working_df['Team'] == working_df['HomeTeamAbbr'], 'isHome'] = 1 
working_df.loc[working_df['Team'] == working_df['VisitorTeamAbbr'], 'isHome'] = 0 

In [18]:
# establish categorical Opponent column

working_df.loc[working_df['Team'] == working_df['HomeTeamAbbr'], 'Opponent'] = working_df['VisitorTeamAbbr']
working_df.loc[working_df['Team'] == working_df['VisitorTeamAbbr'], 'Opponent'] = working_df['HomeTeamAbbr'] 

In [19]:
working_df.drop(columns=['HomeTeamAbbr', 'VisitorTeamAbbr'], axis=1, inplace=True)

### Down

In [20]:
working_df['Down'].value_counts()
# leave as is

1    17925
2    10393
3     2421
4      268
Name: Down, dtype: int64

### Distance

In [21]:
working_df.rename(columns={'Distance':'YardsToFirst'}, inplace=True)

### FieldPosition

In [22]:
# if FieldPosition = Team, ball's in their territory.

working_df['FieldPosition'].isnull().sum()

391

In [23]:
working_df = working_df[working_df['FieldPosition'].notna()]

In [24]:
working_df.loc[working_df['FieldPosition'] == working_df['Team'], 'YardsToTD'] = ((50 - working_df['YardLine']) + 50).astype(int)
working_df.loc[working_df['FieldPosition'] == working_df['Opponent'], 'YardsToTD'] = working_df['YardLine'].astype(int)

In [25]:
working_df.loc[working_df['FieldPosition'] == working_df['Team'], 'YardsToFGRange'] = ((50 - working_df['YardLine']) + 15).astype(int)

working_df.loc[working_df['FieldPosition'] == working_df['Opponent'], 'YardsToFGRange'] = (working_df['YardLine'] - 35).astype(int)
working_df.loc[working_df['YardsToFGRange'] < 0, 'YardsToFGRange'] = 0

In [26]:
working_df.drop(columns='FieldPosition', axis=1, inplace=True)

### HomeScoreBeforePlay + VisitorScoreBeforePlay

In [27]:
working_df.loc[working_df['isHome'] == 1, 'ScoreBeforePlay'] = working_df['HomeScoreBeforePlay']
working_df.loc[working_df['isHome'] == 0, 'ScoreBeforePlay'] = working_df['VisitorScoreBeforePlay']

working_df.loc[working_df['isHome'] == 1, 'OppScoreBeforePlay'] = working_df['VisitorScoreBeforePlay']
working_df.loc[working_df['isHome'] == 0, 'OppScoreBeforePlay'] = working_df['HomeScoreBeforePlay']

In [28]:
working_df.drop(columns=['HomeScoreBeforePlay', 'VisitorScoreBeforePlay'], axis=1, inplace=True)

In [29]:
working_df['ScoreDifferential'] = (working_df['ScoreBeforePlay'] - working_df['OppScoreBeforePlay']).abs()

### NflIdRusher

### OffenseFormation

In [30]:
working_df['OffenseFormation'].value_counts()

SINGLEBACK    13435
SHOTGUN        9270
I_FORM         6156
PISTOL          966
JUMBO           676
WILDCAT          77
EMPTY            31
ACE               1
Name: OffenseFormation, dtype: int64

In [31]:
working_df['OffenseFormation'] = working_df['OffenseFormation'].astype(str)

In [32]:
working_df['OffenseFormation'] = le.fit_transform(working_df.OffenseFormation)

### OffensePersonnel

In [33]:
# FIX GROUPS

### these we can make assumption of 1 QB and 5 OL, add both:

# 1 RB, 1 TE, 3 WR                310288 -- 1 QB, 5 OL, 1 RB, 1 TE, 3 WR
# 1 RB, 2 TE, 2 WR                155232 -- 1 QB, 5 OL, 1 RB, 2 TE, 2 WR 
# 2 RB, 1 TE, 2 WR                 71764 -- 1 QB, 5 OL, 2 RB, 1 TE, 2 WR 
# 1 RB, 3 TE, 1 WR                 39556 -- 1 QB, 5 OL, 1 RB, 3 TE, 1 WR 
# 2 RB, 2 TE, 1 WR                 34958 -- 1 QB, 5 OL, 2 RB, 2 TE, 1 WR
# 2 RB, 0 TE, 3 WR                  4752 -- 1 QB, 5 OL, 2 RB, 0 TE, 3 WR 
# 1 RB, 0 TE, 4 WR                  3828 -- 1 QB, 5 OL, 1 RB, 0 TE, 4 WR
# 2 RB, 3 TE, 0 WR                  2134 -- 1 QB, 5 OL, 2 RB, 3 TE, 0 WR
# 3 RB, 1 TE, 1 WR                   726 -- 1 QB, 5 OL, 3 RB, 1 TE, 1 WR 
# 3 RB, 0 TE, 2 WR                   308 -- 1 QB, 5 OL, 3 RB, 0 TE, 2 WR 
# 0 RB, 2 TE, 3 WR                   220 -- 1 QB, 5 OL, 0 RB, 2 TE, 3 WR 
# 1 RB, 4 TE, 0 WR                   154 -- 1 QB, 5 OL, 1 RB, 4 TE, 0 WR 
# 3 RB, 2 TE, 0 WR                   132 -- 1 QB, 5 OL, 3 RB, 2 TE, 0 WR 
# 0 RB, 3 TE, 2 WR                    66 -- 1 QB, 5 OL, 0 RB, 3 TE, 2 WR 
# 0 RB, 0 TE, 5 WR                    44 -- 1 QB, 5 OL, 0 RB, 0 TE, 5 WR 
# 0 RB, 1 TE, 4 WR                  1738 -- 1 QB, 5 OL, 0 RB, 1 TE, 4 WR 

working_df['OffensePersonnel'].replace(to_replace={'1 RB, 1 TE, 3 WR' : '1 QB, 5 OL, 1 RB, 1 TE, 3 WR', 
                                                   '1 RB, 2 TE, 2 WR' : '1 QB, 5 OL, 1 RB, 2 TE, 2 WR',  
                                                   '2 RB, 1 TE, 2 WR' : '1 QB, 5 OL, 2 RB, 1 TE, 2 WR',  
                                                   '1 RB, 3 TE, 1 WR' : '1 QB, 5 OL, 1 RB, 3 TE, 1 WR',  
                                                   '2 RB, 2 TE, 1 WR' : '1 QB, 5 OL, 2 RB, 2 TE, 1 WR', 
                                                   '2 RB, 0 TE, 3 WR' : '1 QB, 5 OL, 2 RB, 0 TE, 3 WR',  
                                                   '1 RB, 0 TE, 4 WR' : '1 QB, 5 OL, 1 RB, 0 TE, 4 WR', 
                                                   '2 RB, 3 TE, 0 WR' : '1 QB, 5 OL, 2 RB, 3 TE, 0 WR', 
                                                   '3 RB, 1 TE, 1 WR' : '1 QB, 5 OL, 3 RB, 1 TE, 1 WR',  
                                                   '3 RB, 0 TE, 2 WR' : '1 QB, 5 OL, 3 RB, 0 TE, 2 WR',  
                                                   '0 RB, 2 TE, 3 WR' : '1 QB, 5 OL, 0 RB, 2 TE, 3 WR',  
                                                   '1 RB, 4 TE, 0 WR' : '1 QB, 5 OL, 1 RB, 4 TE, 0 WR',  
                                                   '3 RB, 2 TE, 0 WR' : '1 QB, 5 OL, 3 RB, 2 TE, 0 WR',  
                                                   '0 RB, 3 TE, 2 WR' : '1 QB, 5 OL, 0 RB, 3 TE, 2 WR',  
                                                   '0 RB, 0 TE, 5 WR' : '1 QB, 5 OL, 0 RB, 0 TE, 5 WR',
                                                   '0 RB, 1 TE, 4 WR' : '1 QB, 5 OL, 0 RB, 1 TE, 4 WR'
                                                  }, inplace = True)
                                       

### these we can make assumption of 1 QB and 6 OL, add QB:

# 6 OL, 3 RB, 0 TE, 1 WR              22 -- 1 QB, 6 OL, 3 RB, 0 TE, 1 WR 
# 6 OL, 0 RB, 2 TE, 2 WR              22 -- 1 QB, 6 OL, 0 RB, 2 TE, 2 WR 
# 6 OL, 2 RB, 0 TE, 2 WR            1452 -- 1 QB, 6 OL, 2 RB, 0 TE, 2 WR
# 6 OL, 1 RB, 3 TE, 0 WR            1606 -- 1 QB, 6 OL, 1 RB, 3 TE, 0 WR
# 6 OL, 1 RB, 0 TE, 3 WR            2948 -- 1 QB, 6 OL, 1 RB, 0 TE, 3 WR
# 6 OL, 2 RB, 2 TE, 0 WR            4444 -- 1 QB, 6 OL, 2 RB, 2 TE, 0 WR

# 6 OL, 2 RB, 1 TE, 1 WR            7392 -- 1 QB, 6 OL, 2 RB, 1 TE, 1 WR
# 6 OL, 1 RB, 1 TE, 2 WR           12936 -- 1 QB, 6 OL, 1 RB, 1 TE, 2 WR
# 6 OL, 1 RB, 2 TE, 1 WR           14190 -- 1 QB, 6 OL, 1 RB, 2 TE, 1 WR

working_df['OffensePersonnel'].replace(to_replace={'6 OL, 3 RB, 0 TE, 1 WR' : '1 QB, 6 OL, 3 RB, 0 TE, 1 WR', 
                                                   '6 OL, 0 RB, 2 TE, 2 WR' : '1 QB, 6 OL, 0 RB, 2 TE, 2 WR', 
                                                   '6 OL, 2 RB, 0 TE, 2 WR' : '1 QB, 6 OL, 2 RB, 0 TE, 2 WR', 
                                                   '6 OL, 1 RB, 3 TE, 0 WR' : '1 QB, 6 OL, 1 RB, 3 TE, 0 WR', 
                                                   '6 OL, 1 RB, 0 TE, 3 WR' : '1 QB, 6 OL, 1 RB, 0 TE, 3 WR',
                                                   '6 OL, 2 RB, 2 TE, 0 WR' : '1 QB, 6 OL, 1 RB, 0 TE, 3 WR', 
                                                   '6 OL, 2 RB, 1 TE, 1 WR' : '1 QB, 6 OL, 2 RB, 1 TE, 1 WR', 
                                                   '6 OL, 1 RB, 1 TE, 2 WR' : '1 QB, 6 OL, 1 RB, 1 TE, 2 WR', 
                                                   '6 OL, 1 RB, 2 TE, 1 WR' : '1 QB, 6 OL, 1 RB, 2 TE, 1 WR'
                                                  }, inplace= True)


### due to high count of plays, it is unlikely 7 ol were used. RB was probs seen as OL. 
### subtract one OL, add on RB, add QB

# 7 OL, 1 RB, 0 TE, 2 WR             682 -- 1 QB, 6 OL, 2 RB, 0 TE, 2 WR
# 7 OL, 1 RB, 2 TE, 0 WR             308 -- 1 QB, 6 OL, 2 RB, 2 TE, 0 WR
# 7 OL, 1 RB, 1 TE, 1 WR              44 -- 1 QB, 6 OL, 1 RB, 2 TE, 1 WR
# 7 OL, 2 RB, 1 TE, 0 WR              88 -- 1 QB, 6 OL, 3 RB, 1 TE, 0 WR
# 7 OL, 2 RB, 0 TE, 1 WR             220 -- 1 QB, 6 OL, 3 RB, 0 TE, 1 WR
# 7 OL, 1 RB, 2 TE, 0 WR             308 -- 1 QB, 6 OL, 2 RB, 2 TE, 0 WR

working_df['OffensePersonnel'].replace(to_replace={'7 OL, 1 RB, 0 TE, 2 WR' : '1 QB, 6 OL, 2 RB, 0 TE, 2 WR', 
                                                   '7 OL, 1 RB, 2 TE, 0 WR' : '1 QB, 6 OL, 2 RB, 2 TE, 0 WR', 
                                                   '7 OL, 1 RB, 1 TE, 1 WR' : '1 QB, 6 OL, 1 RB, 2 TE, 1 WR', 
                                                   '7 OL, 2 RB, 1 TE, 0 WR' : '1 QB, 6 OL, 3 RB, 1 TE, 0 WR',
                                                   '7 OL, 2 RB, 0 TE, 1 WR' : '1 QB, 6 OL, 3 RB, 0 TE, 1 WR',
                                                   '7 OL, 1 RB, 2 TE, 0 WR' : '1 QB, 6 OL, 2 RB, 2 TE, 0 WR'
                                                  }, inplace= True)

### the following were corrected by verifying actual game film for each play (NFL GamePass)
### play numbers are included followed by correct grouping and ind. replacement code:

# 1 RB, 1 TE, 2 WR,1 LB               66 -- 1 QB, 5 OL, 2 RB, 1 TE, 2 WR -- 20171113000112 / 20171113001385 / 20191117101942
# 1 RB, 0 TE, 3 WR,1 DB               66 -- 1 QB, 5 OL, 1 RB, 0 TE, 4 WR -- 20171105060104 / 20171112061099 / 20171112063586
# 2 QB, 2 RB, 0 TE, 2 WR              66 -- 1 QB, 5 OL, 3 RB, 0 TE, 2 WR -- 20181021012259 / 20181209042583 / 20191117063033
# 1 RB, 2 TE, 1 WR,1 DB               66 -- 1 QB, 5 OL, 1 RB, 2 TE, 2 WR -- 20181111110273 / 20181118002325 / 20181129000675
# 2 QB, 1 RB, 3 TE, 0 WR              44 -- 1 QB, 5 OL, 2 RB, 3 TE, 0 WR -- 20181028013051 / 20181028112146
# 6 OL, 1 RB, 0 TE, 2 WR,1 LB         44 -- 1 QB, 6 OL, 2 RB, 0 TE, 2 WR -- 20191117100236 / 20191117100497
# 2 RB, 2 TE, 0 WR,1 DL               44 -- 1 QB, 6 OL, 2 RB, 2 TE, 0 WR -- 20180930100710 / 20181101000953
# 7 OL, 1 RB, 1 TE, 0 WR,1 LB         44 -- 1 QB, 6 OL, 3 RB, 1 TE, 0 WR -- 20190915090924 / 20190915090949
# 2 QB, 2 RB, 2 TE, 0 WR              22 -- 1 QB, 5 OL, 3 RB, 2 TE, 0 WR -- 20181008003713
# 2 RB, 3 TE, 1 WR                    22 -- 1 QB, 5 OL, 1 RB, 3 TE, 1 WR -- 20170924102908
# 1 RB, 3 TE, 0 WR,1 DB               22 -- 1 QB, 5 OL, 1 RB, 3 TE, 1 WR -- 20171116000514
# 6 OL, 1 RB, 1 TE, 1 WR,1 LB         22 -- 1 QB, 6 OL, 2 RB, 1 TE, 1 WR -- 20191124090534
# 1 RB, 2 TE, 3 WR                    22 -- 1 QB, 5 OL, 1 RB, 2 TE, 2 WR -- 20171112080136
# 2 RB, 1 TE, 1 WR,1 DB               22 -- 1 QB, 5 OL, 2 RB, 1 TE, 2 WR -- 20181104013118
# 6 OL, 1 RB, 1 TE, 0 WR,2 DL         22 -- 1 QB, 6 OL, 3 RB, 1 TE, 0 WR -- 20181202071749
# 2 QB, 3 RB, 1 TE, 0 WR              22 -- 1 QB, 6 OL, 3 RB, 1 TE, 0 WR -- 20181021012713

working_df['OffensePersonnel'].replace(to_replace={'1 RB, 1 TE, 2 WR,1 LB' : '1 QB, 5 OL, 2 RB, 1 TE, 2 WR', 
                                                   '1 RB, 0 TE, 3 WR,1 DB' : '1 QB, 5 OL, 1 RB, 0 TE, 4 WR', 
                                                   '2 QB, 2 RB, 0 TE, 2 WR' : '1 QB, 5 OL, 3 RB, 0 TE, 2 WR', 
                                                   '1 RB, 2 TE, 1 WR,1 DB' : '1 QB, 5 OL, 1 RB, 2 TE, 2 WR', 
                                                   '2 QB, 1 RB, 3 TE, 0 WR' : '1 QB, 5 OL, 2 RB, 3 TE, 0 WR', 
                                                   '6 OL, 1 RB, 0 TE, 2 WR,1 LB' : '1 QB, 6 OL, 2 RB, 0 TE, 2 WR', 
                                                   '2 RB, 2 TE, 0 WR,1 DL' : '1 QB, 6 OL, 2 RB, 2 TE, 0 WR', 
                                                   '7 OL, 1 RB, 1 TE, 0 WR,1 LB' : '1 QB, 6 OL, 3 RB, 1 TE, 0 WR', 
                                                   '2 QB, 2 RB, 2 TE, 0 WR' : '1 QB, 5 OL, 3 RB, 2 TE, 0 WR', 
                                                   '2 RB, 3 TE, 1 WR' : '1 QB, 5 OL, 1 RB, 3 TE, 1 WR', 
                                                   '1 RB, 3 TE, 0 WR,1 DB' : '1 QB, 5 OL, 1 RB, 3 TE, 1 WR', 
                                                   '6 OL, 1 RB, 1 TE, 1 WR,1 LB' : '1 QB, 6 OL, 2 RB, 1 TE, 1 WR', 
                                                   '1 RB, 2 TE, 3 WR' : '1 QB, 5 OL, 1 RB, 2 TE, 2 WR', 
                                                   '2 RB, 1 TE, 1 WR,1 DB' : '1 QB, 5 OL, 2 RB, 1 TE, 2 WR', 
                                                   '6 OL, 1 RB, 1 TE, 0 WR,2 DL' : '1 QB, 6 OL, 3 RB, 1 TE, 0 WR', 
                                                   '2 QB, 3 RB, 1 TE, 0 WR' : '1 QB, 6 OL, 3 RB, 1 TE, 0 WR', 
                                                  }, inplace=True)


### finally, the following were adjusted using football logic
### when in doubt, an out of place player is counted as RB since this df is rushing plays
### whether Machine Vision or manual jersey number, DL are probs OL

# 1 RB, 2 TE, 1 WR,1 DL             3102 -- 1 QB, 6 OL, 1 RB, 2 TE, 1 WR
# 2 QB, 1 RB, 1 TE, 2 WR            1716 -- 1 QB, 5 OL, 2 RB, 1 TE, 2 WR
# 1 RB, 1 TE, 2 WR,1 DL              748 -- 1 QB, 6 OL, 1 RB, 1 TE, 2 WR
# 2 QB, 2 RB, 1 TE, 1 WR             550 -- 1 QB, 5 OL, 3 RB, 1 TE, 1 WR
# 1 RB, 3 TE, 0 WR,1 DL              506 -- 1 QB, 6 OL, 1 RB, 3 TE, 0 WR
# 2 QB, 1 RB, 2 TE, 1 WR             462 -- 1 QB, 5 OL, 2 RB, 2 TE, 1 WR
# 6 OL, 1 RB, 2 TE, 0 WR,1 LB        440 -- 1 QB, 6 OL, 2 RB, 2 TE, 0 WR
# 6 OL, 1 RB, 2 TE, 0 WR,1 DL        374 -- 1 QB, 6 OL, 2 RB, 2 TE, 0 WR
# 1 RB, 2 TE, 1 WR,1 LB              264 -- 1 QB, 5 OL, 2 RB, 2 TE, 1 WR
# 1 RB, 1 TE, 2 WR,1 DB              242 -- 1 QB, 5 OL, 1 RB, 1 TE, 3 WR
# 6 OL, 2 RB, 1 TE, 0 WR,1 DL        198 -- 1 QB, 6 OL, 3 RB, 1 TE, 0 WR
# 2 QB, 1 RB, 0 TE, 3 WR             198 -- 1 QB, 5 OL, 2 RB, 0 TE, 3 WR
# 2 QB, 6 OL, 1 RB, 1 TE, 1 WR       176 -- 1 QB, 6 OL, 2 RB, 1 TE, 1 WR
# 6 OL, 1 RB, 1 TE, 1 WR,1 DL        154 -- 1 QB, 6 OL, 2 RB, 1 TE, 1 WR
# 1 RB, 3 TE, 0 WR,1 LB              154 -- 1 QB, 5 OL, 2 RB, 3 TE, 0 WR

working_df['OffensePersonnel'].replace(to_replace={'1 RB, 2 TE, 1 WR,1 DL' : '1 QB, 6 OL, 1 RB, 2 TE, 1 WR', 
                                                   '2 QB, 1 RB, 1 TE, 2 WR' : '1 QB, 5 OL, 2 RB, 1 TE, 2 WR',
                                                   '1 RB, 1 TE, 2 WR,1 DL' : '1 QB, 6 OL, 1 RB, 1 TE, 2 WR', 
                                                   '2 QB, 2 RB, 1 TE, 1 WR' : '1 QB, 5 OL, 3 RB, 1 TE, 1 WR', 
                                                   '1 RB, 3 TE, 0 WR,1 DL' : '1 QB, 6 OL, 1 RB, 3 TE, 0 WR',
                                                   '2 QB, 1 RB, 2 TE, 1 WR' : '1 QB, 5 OL, 2 RB, 2 TE, 1 WR', 
                                                   '6 OL, 1 RB, 2 TE, 0 WR,1 LB' : '1 QB, 6 OL, 2 RB, 2 TE, 0 WR', 
                                                   '6 OL, 1 RB, 2 TE, 0 WR,1 DL' : '1 QB, 6 OL, 2 RB, 2 TE, 0 WR', 
                                                   '1 RB, 2 TE, 1 WR,1 LB' : '1 QB, 5 OL, 2 RB, 2 TE, 1 WR', 
                                                   '1 RB, 1 TE, 2 WR,1 DB' : '1 QB, 5 OL, 1 RB, 1 TE, 3 WR', 
                                                   '6 OL, 2 RB, 1 TE, 0 WR,1 DL' : '1 QB, 6 OL, 3 RB, 1 TE, 0 WR', 
                                                   '2 QB, 1 RB, 0 TE, 3 WR' : '1 QB, 5 OL, 2 RB, 0 TE, 3 WR', 
                                                   '2 QB, 6 OL, 1 RB, 1 TE, 1 WR' : '1 QB, 6 OL, 2 RB, 1 TE, 1 WR', 
                                                   '6 OL, 1 RB, 1 TE, 1 WR,1 DL' : '1 QB, 6 OL, 2 RB, 1 TE, 1 WR',
                                                   '1 RB, 3 TE, 0 WR,1 LB' : '1 QB, 5 OL, 2 RB, 3 TE, 0 WR'
                                                  }, inplace= True)

In [34]:
# function needs to go to lh, removed from here 

def labelRB(OffensePersonnel):
    groups = OffensePersonnel.split(',')
    for group in groups:
        if "RB" in group:
            backs = [int(x) for x in group if x.isdigit()]
            backs = backs[0]
            return backs

working_df['RB'] = working_df['OffensePersonnel'].apply(labelRB)

In [35]:
# function needs to go to lh, removed from here 

def labelTE(OffensePersonnel):
    groups = OffensePersonnel.split(',')
    for group in groups:
        if "TE" in group:
            ends = [int(x) for x in group if x.isdigit()]
            ends = ends[0]
            return ends
        
working_df['TE'] = working_df['OffensePersonnel'].apply(labelTE)

In [36]:
# function needs to go to lh, removed from here 

def labelWR(OffensePersonnel):
    groups = OffensePersonnel.split(',')
    for group in groups:
        if "WR" in group:
            receivers = [int(x) for x in group if x.isdigit()]
            receivers = receivers[0]
            return receivers

working_df['WR'] = working_df['OffensePersonnel'].apply(labelWR)

In [37]:
# function needs to go to lh, removed from here 

def labelQB(OffensePersonnel):
    groups = OffensePersonnel.split(',')
    for group in groups:
        if "QB" in group:
            passers = [int(x) for x in group if x.isdigit()]
            passers = passers[0]
            return passers

working_df['QB'] = working_df['OffensePersonnel'].apply(labelQB)

In [38]:
# function needs to go to lh, removed from here 

def labelOL(OffensePersonnel):
    groups = OffensePersonnel.split(',')
    for group in groups:
        if "OL" in group:
            linemen = [int(x) for x in group if x.isdigit()]
            linemen = linemen[0]
            return linemen

working_df['OL'] = working_df['OffensePersonnel'].apply(labelOL)

In [39]:
working_df.drop(columns=['OffensePersonnel'], axis=1, inplace=True)

### DefendersInTheBox

In [40]:
# one null value, filled with 7: mean of column is (6.9), median is 7
defendersMedian = working_df['DefendersInTheBox'].median()

working_df['DefendersInTheBox'].fillna(value=defendersMedian, inplace=True)

### DefensePersonnel

In [42]:
# working_df['DefensePersonnel'].value_counts()

In [43]:
# all values that contained proper personnel total were left as is, 
# the few outliers were corrected below with the assistance of film

working_df['DefensePersonnel'].replace(to_replace={'2 DL, 4 LB, 4 DB, 1 RB' : '3 DL, 4 LB, 4 DB',
                                                   '1 DL, 4 LB, 5 DB, 1 RB' : '2 DL, 4 LB, 5 DB',
                                                   '5 DL, 3 LB, 2 DB, 1 OL' : '6 OL, 3 DB, 2 DB',
                                                   '0 DL, 5 LB, 6 DB' : '5 DL, 0 LB, 6 DB',      
                                                   '0 DL, 4 LB, 7 DB' : '3 DL, 1 LB, 7 DB',      
                                                   '1 DL, 3 LB, 6 DB, 1 RB' : '3 DL, 2 LB, 6 DB',     
                                                   '5 DL, 4 LB, 1 DB, 1 OL' : '6 DL, 4 LB, 1 DB',
                                                   '2 DL, 4 LB, 4 DB, 1 OL' : '3 OL, 4 LB, 4 DB',     
                                                   '2 DL, 3 LB, 5 DB, 1 RB' : '3 DL, 3 LB, 5 DB',
                                                   '3 DL, 4 LB, 3 DB, 1 RB' : '4 DL, 4 LB, 3 DB',
                                                   '0 DL, 6 LB, 5 DB' : '6 DL, 0 LB, 5 DB',      
                                                   '4 DL, 5 LB, 1 DB, 1 OL' : '5 DL, 5 LB, 1 DB',     
                                                   '3 DL, 4 LB, 3 DB, 1 OL' : '4 DL, 4 LB, 3 DB',     
                                                   '0 DL, 4 LB, 6 DB, 1 RB' : '5 DL, 0 LB, 6 DB'
                                                  }, inplace=True)

In [44]:
def labelDL(DefensePersonnel):
    groups = DefensePersonnel.split(',')
    for group in groups:
        if "DL" in group:
            linemen = [int(x) for x in group if x.isdigit()]
            linemen = linemen[0]
            return linemen

working_df['DL'] = working_df['DefensePersonnel'].apply(labelDL)

In [45]:
def labelLB(DefensePersonnel):
    groups = DefensePersonnel.split(',')
    for group in groups:
        if "LB" in group:
            backers = [int(x) for x in group if x.isdigit()]
            backers = backers[0]
            return backers

working_df['LB'] = working_df['DefensePersonnel'].apply(labelLB)

In [46]:
def labelDB(DefensePersonnel):
    groups = DefensePersonnel.split(',')
    for group in groups:
        if "DB" in group:
            backs = [int(x) for x in group if x.isdigit()]
            backs = backs[0]
            return backs

working_df['DB'] = working_df['DefensePersonnel'].apply(labelDB)

In [47]:
working_df.drop(columns='DefensePersonnel', axis=1, inplace=True)

### TimeHandoff

### TimeSnap

### Yards

In [48]:
working_df['Yards'].isnull().sum()

0

### HomeTeamAbbr + VisitorTeamAbbr

In [49]:
# see PossessionTeam section above

### Week

In [50]:
working_df['Week'].isnull().sum()
# leave as is

0

### StadiumType

In [51]:
# working_df['StadiumType'].value_counts()

In [52]:
working_df['EnclosedStadium'] = 0
working_df['OpenStadium'] = 0

indoor = ['Indoors', 'Dome', 'Indoor', 'Retr. Roof-Closed', 'Domed, closed', 'Retr. Roof - Closed',
         'Retractable Roof - Closed', 'Closed Dome', 'Dome, closed', 'Domed', 'indoor',
         'Indoor, Roof Closed', 'Retr. Roof Closed']
openAir = ['Outdoor', 'Outdoors', 'Retractable Roof', 'Open', 'Domed, open', 'Retr. Roof-Open',
          'Domed, Open', 'OUTDOOR', 'Outdoor', 'Outddors', 'Bowl', 'Retr. Roof - Open', 'Outdoor Retr Roof-Open',
          'Outdor', 'Ourdoor', 'Indoor, Open Roof', 'Outside', 'Cloudy', 'Indoor, roof open', 'Heinz Field']

working_df.loc[working_df['StadiumType'].isin(indoor), 'EnclosedStadium'] = 1
working_df.loc[working_df['StadiumType'].isin(openAir), 'EnclosedStadium'] = 0

working_df.loc[working_df['StadiumType'].isin(openAir), 'OpenStadium'] = 1
working_df.loc[working_df['StadiumType'].isin(indoor), 'OpenStadium'] = 0

In [57]:
working_df.drop(columns='StadiumType', axis=1, inplace=True)

### Turf

In [58]:
# simplify Turf classes
# this was done by manually verifying each playing surface brand name
turf_list = ['Field Turf', 'Artificial', 'FieldTurf', 'UBU Speed Series-S5-M',
            'A-Turf Titan', 'UBU Sports Speed S5-M', 'FieldTurf360', 'Twenty-Four/Seven Turf',
            'FieldTurf 360', 'Twenty Four/Seven Turf', 'Turf', 'Field turf', 
             'UBU-Speed Series-S5-M', 'Artifical']
grass_list = ['Grass', 'Natural Grass', 'Natural', 'Naturall Grass', 'natural grass', 'grass',
             'Natural grass']
hybrid_list = ['SISGrass', 'DD GrassMaster']

working_df.loc[working_df['Turf'].isin(turf_list), 'Turf'] = "TurfField"
working_df.loc[working_df['Turf'].isin(grass_list), 'Turf'] = "GrassField"
working_df.loc[working_df['Turf'].isin(hybrid_list), 'Turf'] = "HybridField"

In [59]:
turf_dummies = pd.get_dummies(data=working_df['Turf'])

In [60]:
working_df = pd.merge(working_df, turf_dummies, on='PlayId')

In [61]:
working_df.drop(columns='Turf', axis=1, inplace=True)

### GameWeather

In [62]:
# working_df['GameWeather'].value_counts()

In [63]:
working_df['RainOrSnow'] = 0

rainorsnow = ['Rain', 'Light Rain', 'Rain shower', 'Cloudy, Rain', 'Rain and Wind', 'Scattered Showers',
       'Rain likely, temps in low 40s.', 'Cloudy, 50% change of rain', 'Cloudy with showers and wind',
       'Raining', 'Cloudy with periods of rain, thunder possible. Winds shifting to WNW, 10-20 mph.',
       'Showers', 'Light rain', 'Rainy', 'Snow', 'Heavy lake effect snow', 'Cloudy, light snow accumulating 1-3"']

working_df.loc[working_df['GameWeather'].isin(rainorsnow), 'RainOrSnow'] = 1

In [64]:
working_df.drop(columns='GameWeather', axis=1, inplace=True)

In [65]:
working_df['RainOrSnow'].value_counts()

0    28530
1     2086
Name: RainOrSnow, dtype: int64

### Temp

In [66]:
# temp had 2862 null values
# fill NA with median of temp based on StadiumType and GameWeather

working_df['Temperature'].isnull().sum()

2862

In [68]:
tempMedian = working_df['Temperature'].median()

working_df['Temperature'].fillna(value=tempMedian, inplace=True)

### Humidity

In [69]:
working_df['Humidity'].isnull().sum()

277

In [70]:
humidityMedian = working_df['Humidity'].median()

working_df['Humidity'].fillna(value=humidityMedian, inplace=True)

### Final Adjustments

In [67]:
working_df['Team'] = le.fit_transform(working_df.Team.values)
working_df['Opponent'] = le.fit_transform(working_df.Opponent.values)

In [71]:
working_df

Unnamed: 0_level_0,Season,YardLine,Quarter,GameClock,Team,Down,YardsToFirst,OffenseFormation,DefendersInTheBox,Yards,Week,Temperature,Humidity,isHome,Opponent,YardsToTD,YardsToFGRange,ScoreBeforePlay,OppScoreBeforePlay,ScoreDifferential,RB,TE,WR,QB,OL,DL,LB,DB,EnclosedStadium,OpenStadium,GrassField,HybridField,TurfField,RainOrSnow
PlayId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1
20170907000118,0,35,1,854,20,3,2,5,6,8,1,63,77,1,15,65,30,0,0,0,1,1,3,1,5,2,3,6,0,1,0,0,1,0
20170907000139,0,43,1,832,20,1,10,5,6,3,1,63,77,1,15,57,22,0,0,0,1,1,3,1,5,2,3,6,0,1,0,0,1,0
20170907000189,0,35,1,782,20,1,10,6,7,5,1,63,77,1,15,35,0,0,0,0,1,1,3,1,5,2,3,6,0,1,0,0,1,0
20170907000345,0,2,1,732,20,2,2,3,9,2,1,63,77,1,15,2,0,0,0,0,1,0,3,1,6,4,4,3,0,1,0,0,1,0
20170907000395,0,25,1,728,15,1,10,5,7,7,1,63,77,0,20,75,40,0,7,7,1,3,1,1,5,3,2,6,0,1,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20191125003419,2,46,4,587,2,1,10,4,8,1,12,62,64,0,16,54,19,42,6,36,2,2,1,1,5,4,3,4,0,1,1,0,0,0
20191125003440,2,47,4,543,2,2,9,4,7,1,12,62,64,0,16,53,18,42,6,36,1,1,3,1,5,3,3,5,0,1,1,0,0,0
20191125003496,2,13,4,484,2,1,10,6,8,1,12,62,64,0,16,13,0,42,6,36,1,1,3,1,5,3,3,5,0,1,1,0,0,0
20191125003768,2,37,4,295,2,1,10,4,7,1,12,62,64,0,16,63,28,45,6,39,2,1,2,1,5,4,3,4,0,1,1,0,0,0


In [90]:
working_df['Yards'].isnull().sum()

0

# Train Test Split

In [73]:
working_df.columns

Index(['Season', 'YardLine', 'Quarter', 'GameClock', 'Team', 'Down',
       'YardsToFirst', 'OffenseFormation', 'DefendersInTheBox', 'Yards',
       'Week', 'Temperature', 'Humidity', 'isHome', 'Opponent', 'YardsToTD',
       'YardsToFGRange', 'ScoreBeforePlay', 'OppScoreBeforePlay',
       'ScoreDifferential', 'RB', 'TE', 'WR', 'QB', 'OL', 'DL', 'LB', 'DB',
       'EnclosedStadium', 'OpenStadium', 'GrassField', 'HybridField',
       'TurfField', 'RainOrSnow'],
      dtype='object')

In [95]:
X = working_df[['Season', 'YardLine', 'Quarter', 'GameClock', 'Team', 'Down',
       'YardsToFirst', 'OffenseFormation', 'DefendersInTheBox',
       'Week', 'Temperature', 'Humidity', 'isHome', 'Opponent', 'YardsToTD',
       'YardsToFGRange', 'ScoreBeforePlay', 'OppScoreBeforePlay',
       'ScoreDifferential', 'RB', 'TE', 'WR', 'QB', 'OL', 'DL', 'LB', 'DB',
       'EnclosedStadium', 'OpenStadium', 'GrassField', 'HybridField',
       'TurfField', 'RainOrSnow']]
y = working_df[['Yards']]

In [96]:
xTrain, xTest, yTrain, yTest = train_test_split(x, y, test_size = 0.2, random_state = 0)

In [97]:
Y = yTrain

kfd = KFold(10)

def kfold(model, score_type) :
    kfold_scores = []
    
    for train_index, test_index in kfd.split(X, Y):
        # print("TRAIN:", train_index, "TEST:", test_index)
        X_train, X_test = X[train_index], X[test_index]
        Y_train, Y_test = Y[train_index], Y[test_index]
        
        clf = model()
        clf.fit(X_train, Y_train)
        Y_pred = clf.predict(X_test)
        score = score_type(Y_test, Y_pred)
        kfold_scores.append(score)
        
    
    return(sum(kfold_scores)/len(kfold_scores))

# Modeling

In [98]:
lr_maScore = kfold(LinearRegression, mean_absolute_error)
lr_msScore = kfold(LinearRegression, mean_squared_error)
lr_vaScore = kfold(LinearRegression, explained_variance_score)
lr_r2Score = kfold(LinearRegression, r2_score)

ValueError: Found input variables with inconsistent numbers of samples: [30616, 24492]

In [91]:
sv_maScore = kfold(LinearSVR, mean_absolute_error)
sv_msScore = kfold(LinearSVR, mean_squared_error)
sv_vaScore = kfold(LinearSVR, explained_variance_score)
sv_r2Score = kfold(LinearSVR, r2_score)

ValueError: Found input variables with inconsistent numbers of samples: [30616, 24492]

In [92]:
rf_maScore = kfold(RandomForestRegressor, mean_absolute_error)
rf_msScore = kfold(RandomForestRegressor, mean_squared_error)
rf_vaScore = kfold(RandomForestRegressor, explained_variance_score)
rf_r2Score = kfold(RandomForestRegressor, r2_score)

ValueError: Found input variables with inconsistent numbers of samples: [30616, 24492]

In [93]:
dt_maScore = kfold(DecisionTreeRegressor, mean_absolute_error)
dt_msScore = kfold(DecisionTreeRegressor, mean_squared_error)
dt_vaScore = kfold(DecisionTreeRegressor, explained_variance_score)
dt_r2Score = kfold(DecisionTreeRegressor, r2_score)

ValueError: Found input variables with inconsistent numbers of samples: [30616, 24492]

In [94]:
modelEvaluation = pd.DataFrame({'Model': ['Linear Regression', 'LinearSVR','Decision Tree', 'Random Forest'], 
                       'Mean Absolute Error' : [lr_maScore, sv_maScore, rf_maScore, dt_maScore],
                       'Mean Squared Error' : [lr_msScore, sv_msScore, rf_msScore, dt_msScore],
                       'Explained Variance' : [lr_vaScore, sv_vaScore, rf_vaScore, dt_vaScore],
                       'R2 Score' : [lr_r2Score, sv_r2Score, rf_r2Score, dt_r2Score]})

modelEvaluation

NameError: name 'lr_maScore' is not defined