In [1]:
%matplotlib inline
import warnings
warnings.filterwarnings(action='ignore')

# data
import pandas as pd
import numpy as np
import random as rnd
import sklearn.preprocessing
from sklearn.model_selection import KFold
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from collections import Counter

# visualization
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

# machine learning
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import ExtraTreeClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.neighbors import RadiusNeighborsClassifier

from sklearn.linear_model import LinearRegression
from sklearn.svm import LinearSVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor


from sklearn.model_selection import GridSearchCV
from sklearn.metrics import explained_variance_score
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score

In [2]:
pd.set_option('display.max_columns', 200)
pd.set_option('display.max_rows', 200)
pd.set_option('precision', 0)

In [3]:
origin_df = pd.read_csv('bdb.csv', low_memory = False)
# origin_df.head(60)

In [4]:
origin_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 682154 entries, 0 to 682153
Data columns (total 49 columns):
GameId                    682154 non-null int64
PlayId                    682154 non-null int64
Team                      682154 non-null object
X                         682154 non-null float64
Y                         682154 non-null float64
S                         682154 non-null float64
A                         682154 non-null float64
Dis                       682154 non-null float64
Orientation               682131 non-null float64
Dir                       682126 non-null float64
NflId                     682154 non-null int64
DisplayName               682154 non-null object
JerseyNumber              682154 non-null int64
Season                    682154 non-null int64
YardLine                  682154 non-null int64
Quarter                   682154 non-null int64
GameClock                 682154 non-null object
PossessionTeam            682154 non-null object
Down   

In [5]:
working_df = origin_df.copy()

In [6]:
working_df.columns

Index(['GameId', 'PlayId', 'Team', 'X', 'Y', 'S', 'A', 'Dis', 'Orientation',
       'Dir', 'NflId', 'DisplayName', 'JerseyNumber', 'Season', 'YardLine',
       'Quarter', 'GameClock', 'PossessionTeam', 'Down', 'Distance',
       'FieldPosition', 'HomeScoreBeforePlay', 'VisitorScoreBeforePlay',
       'NflIdRusher', 'OffenseFormation', 'OffensePersonnel',
       'DefendersInTheBox', 'DefensePersonnel', 'PlayDirection', 'TimeHandoff',
       'TimeSnap', 'Yards', 'PlayerHeight', 'PlayerWeight', 'PlayerBirthDate',
       'PlayerCollegeName', 'Position', 'HomeTeamAbbr', 'VisitorTeamAbbr',
       'Week', 'Stadium', 'Location', 'StadiumType', 'Turf', 'GameWeather',
       'Temperature', 'Humidity', 'WindSpeed', 'WindDirection'],
      dtype='object')

### Setup

In [7]:
working_df.drop(columns=['GameId','Dir', 'X', 'Y', 'Dis', 'Orientation','DisplayName', 
                         'JerseyNumber', 'PlayerHeight', 'PlayerWeight', 'TimeHandoff',
                         'PlayerBirthDate', 'PlayerCollegeName', 'Position', 'Team', 'TimeSnap',
                         'Stadium', 'Location', 'WindSpeed', 'WindDirection', 'PlayDirection']
                , axis= 1, inplace= True)

In [8]:
#working_df.drop_duplicates(subset= "PlayId", inplace= True)

In [9]:
#working_df.set_index('PlayId', inplace=True)

### Season

In [10]:
le = sklearn.preprocessing.LabelEncoder()

In [11]:
working_df['Season'] = le.fit_transform(working_df.Season.values)

### YardLine

In [12]:
working_df['YardLine'].isnull().sum()
# leave as is

0

### Quarter

In [13]:
working_df['Quarter'].value_counts()
# leave as is

1    174724
4    171138
3    168256
2    163702
5      4334
Name: Quarter, dtype: int64

### GameClock

In [14]:
# Because Quarter is a feature, we can just make convert this column into seconds that are left in the quarter

def GameClockSeconds(GameClock):
    time_split = GameClock.split(':')
    seconds = int(time_split[0])*60 + int(time_split[1]) 
    return seconds

working_df['GameClock'] = working_df['GameClock'].apply(GameClockSeconds)

### PossessionTeam

In [15]:
working_df.rename(columns={'PossessionTeam':'Team'}, inplace=True)

In [16]:
# The below code fixes incorrect names and is included in the data
# cleaning module:

working_df['Team'].replace(to_replace= {'ARZ':'ARI', 'BLT':'BAL',
                                                  'CLV':'CLE', 'HST':'HOU'}, 
                                    inplace= True)

working_df['FieldPosition'].replace(to_replace= {'ARZ':'ARI', 'BLT':'BAL',
                                                  'CLV':'CLE', 'HST':'HOU'}, 
                                    inplace= True)

In [17]:
# create isHome column
working_df['isHome'] = 0

working_df.loc[working_df['Team'] == working_df['HomeTeamAbbr'], 'isHome'] = 1 
working_df.loc[working_df['Team'] == working_df['VisitorTeamAbbr'], 'isHome'] = 0 

In [18]:
# establish categorical Opponent column

working_df.loc[working_df['Team'] == working_df['HomeTeamAbbr'], 'Opponent'] = working_df['VisitorTeamAbbr']
working_df.loc[working_df['Team'] == working_df['VisitorTeamAbbr'], 'Opponent'] = working_df['HomeTeamAbbr'] 

In [19]:
working_df.drop(columns=['HomeTeamAbbr', 'VisitorTeamAbbr'], axis=1, inplace=True)

### Down

In [20]:
working_df['Down'].value_counts()
# leave as is

1    394350
2    228646
3     53262
4      5896
Name: Down, dtype: int64

### Distance

In [21]:
working_df.rename(columns={'Distance':'YardsToFirst'}, inplace=True)

### FieldPosition

In [22]:
# if FieldPosition = Team, ball's in their territory.

working_df['FieldPosition'].isnull().sum()

8602

In [23]:
working_df = working_df[working_df['FieldPosition'].notna()]

In [24]:
working_df.loc[working_df['FieldPosition'] == working_df['Team'], 'YardsToTD'] = ((50 - working_df['YardLine']) + 50).astype(int)
working_df.loc[working_df['FieldPosition'] == working_df['Opponent'], 'YardsToTD'] = working_df['YardLine'].astype(int)

In [25]:
working_df.loc[working_df['FieldPosition'] == working_df['Team'], 'YardsToFGRange'] = ((50 - working_df['YardLine']) + 15).astype(int)

working_df.loc[working_df['FieldPosition'] == working_df['Opponent'], 'YardsToFGRange'] = (working_df['YardLine'] - 35).astype(int)
working_df.loc[working_df['YardsToFGRange'] < 0, 'YardsToFGRange'] = 0

In [26]:
working_df.drop(columns='FieldPosition', axis=1, inplace=True)

### HomeScoreBeforePlay + VisitorScoreBeforePlay

In [27]:
working_df.loc[working_df['isHome'] == 1, 'ScoreBeforePlay'] = working_df['HomeScoreBeforePlay'].astype(int)
working_df.loc[working_df['isHome'] == 0, 'ScoreBeforePlay'] = working_df['VisitorScoreBeforePlay'].astype(int)

working_df.loc[working_df['isHome'] == 1, 'OppScoreBeforePlay'] = working_df['VisitorScoreBeforePlay'].astype(int)
working_df.loc[working_df['isHome'] == 0, 'OppScoreBeforePlay'] = working_df['HomeScoreBeforePlay'].astype(int)

In [28]:
working_df.drop(columns=['HomeScoreBeforePlay', 'VisitorScoreBeforePlay'], axis=1, inplace=True)

In [29]:
working_df['ScoreDifferential'] = (working_df['ScoreBeforePlay'] - working_df['OppScoreBeforePlay']).abs().astype(int)

### NflIdRusher

In [30]:
working_df.drop(working_df[working_df['NflId'] != working_df['NflIdRusher']].index, inplace = True) 

In [31]:
working_df.rename(columns={'NflIdRusher':'ballCarrier'}, inplace=True)
working_df.drop(columns=['NflId'], axis=1, inplace=True)

### OffenseFormation

In [32]:
working_df['OffenseFormation'].value_counts()

SINGLEBACK    13435
SHOTGUN        9270
I_FORM         6156
PISTOL          966
JUMBO           676
WILDCAT          77
EMPTY            31
ACE               1
Name: OffenseFormation, dtype: int64

In [33]:
working_df['OffenseFormation'] = working_df['OffenseFormation'].astype(str)

In [34]:
working_df['OffenseFormation'] = le.fit_transform(working_df.OffenseFormation)

### OffensePersonnel

In [35]:
# FIX GROUPS

### these we can make assumption of 1 QB and 5 OL, add both:

# 1 RB, 1 TE, 3 WR                310288 -- 1 QB, 5 OL, 1 RB, 1 TE, 3 WR
# 1 RB, 2 TE, 2 WR                155232 -- 1 QB, 5 OL, 1 RB, 2 TE, 2 WR 
# 2 RB, 1 TE, 2 WR                 71764 -- 1 QB, 5 OL, 2 RB, 1 TE, 2 WR 
# 1 RB, 3 TE, 1 WR                 39556 -- 1 QB, 5 OL, 1 RB, 3 TE, 1 WR 
# 2 RB, 2 TE, 1 WR                 34958 -- 1 QB, 5 OL, 2 RB, 2 TE, 1 WR
# 2 RB, 0 TE, 3 WR                  4752 -- 1 QB, 5 OL, 2 RB, 0 TE, 3 WR 
# 1 RB, 0 TE, 4 WR                  3828 -- 1 QB, 5 OL, 1 RB, 0 TE, 4 WR
# 2 RB, 3 TE, 0 WR                  2134 -- 1 QB, 5 OL, 2 RB, 3 TE, 0 WR
# 3 RB, 1 TE, 1 WR                   726 -- 1 QB, 5 OL, 3 RB, 1 TE, 1 WR 
# 3 RB, 0 TE, 2 WR                   308 -- 1 QB, 5 OL, 3 RB, 0 TE, 2 WR 
# 0 RB, 2 TE, 3 WR                   220 -- 1 QB, 5 OL, 0 RB, 2 TE, 3 WR 
# 1 RB, 4 TE, 0 WR                   154 -- 1 QB, 5 OL, 1 RB, 4 TE, 0 WR 
# 3 RB, 2 TE, 0 WR                   132 -- 1 QB, 5 OL, 3 RB, 2 TE, 0 WR 
# 0 RB, 3 TE, 2 WR                    66 -- 1 QB, 5 OL, 0 RB, 3 TE, 2 WR 
# 0 RB, 0 TE, 5 WR                    44 -- 1 QB, 5 OL, 0 RB, 0 TE, 5 WR 
# 0 RB, 1 TE, 4 WR                  1738 -- 1 QB, 5 OL, 0 RB, 1 TE, 4 WR 

working_df['OffensePersonnel'].replace(to_replace={'1 RB, 1 TE, 3 WR' : '1 QB, 5 OL, 1 RB, 1 TE, 3 WR', 
                                                   '1 RB, 2 TE, 2 WR' : '1 QB, 5 OL, 1 RB, 2 TE, 2 WR',  
                                                   '2 RB, 1 TE, 2 WR' : '1 QB, 5 OL, 2 RB, 1 TE, 2 WR',  
                                                   '1 RB, 3 TE, 1 WR' : '1 QB, 5 OL, 1 RB, 3 TE, 1 WR',  
                                                   '2 RB, 2 TE, 1 WR' : '1 QB, 5 OL, 2 RB, 2 TE, 1 WR', 
                                                   '2 RB, 0 TE, 3 WR' : '1 QB, 5 OL, 2 RB, 0 TE, 3 WR',  
                                                   '1 RB, 0 TE, 4 WR' : '1 QB, 5 OL, 1 RB, 0 TE, 4 WR', 
                                                   '2 RB, 3 TE, 0 WR' : '1 QB, 5 OL, 2 RB, 3 TE, 0 WR', 
                                                   '3 RB, 1 TE, 1 WR' : '1 QB, 5 OL, 3 RB, 1 TE, 1 WR',  
                                                   '3 RB, 0 TE, 2 WR' : '1 QB, 5 OL, 3 RB, 0 TE, 2 WR',  
                                                   '0 RB, 2 TE, 3 WR' : '1 QB, 5 OL, 0 RB, 2 TE, 3 WR',  
                                                   '1 RB, 4 TE, 0 WR' : '1 QB, 5 OL, 1 RB, 4 TE, 0 WR',  
                                                   '3 RB, 2 TE, 0 WR' : '1 QB, 5 OL, 3 RB, 2 TE, 0 WR',  
                                                   '0 RB, 3 TE, 2 WR' : '1 QB, 5 OL, 0 RB, 3 TE, 2 WR',  
                                                   '0 RB, 0 TE, 5 WR' : '1 QB, 5 OL, 0 RB, 0 TE, 5 WR',
                                                   '0 RB, 1 TE, 4 WR' : '1 QB, 5 OL, 0 RB, 1 TE, 4 WR'
                                                  }, inplace = True)
                                       

### these we can make assumption of 1 QB and 6 OL, add QB:

# 6 OL, 3 RB, 0 TE, 1 WR              22 -- 1 QB, 6 OL, 3 RB, 0 TE, 1 WR 
# 6 OL, 0 RB, 2 TE, 2 WR              22 -- 1 QB, 6 OL, 0 RB, 2 TE, 2 WR 
# 6 OL, 2 RB, 0 TE, 2 WR            1452 -- 1 QB, 6 OL, 2 RB, 0 TE, 2 WR
# 6 OL, 1 RB, 3 TE, 0 WR            1606 -- 1 QB, 6 OL, 1 RB, 3 TE, 0 WR
# 6 OL, 1 RB, 0 TE, 3 WR            2948 -- 1 QB, 6 OL, 1 RB, 0 TE, 3 WR
# 6 OL, 2 RB, 2 TE, 0 WR            4444 -- 1 QB, 6 OL, 2 RB, 2 TE, 0 WR

# 6 OL, 2 RB, 1 TE, 1 WR            7392 -- 1 QB, 6 OL, 2 RB, 1 TE, 1 WR
# 6 OL, 1 RB, 1 TE, 2 WR           12936 -- 1 QB, 6 OL, 1 RB, 1 TE, 2 WR
# 6 OL, 1 RB, 2 TE, 1 WR           14190 -- 1 QB, 6 OL, 1 RB, 2 TE, 1 WR

working_df['OffensePersonnel'].replace(to_replace={'6 OL, 3 RB, 0 TE, 1 WR' : '1 QB, 6 OL, 3 RB, 0 TE, 1 WR', 
                                                   '6 OL, 0 RB, 2 TE, 2 WR' : '1 QB, 6 OL, 0 RB, 2 TE, 2 WR', 
                                                   '6 OL, 2 RB, 0 TE, 2 WR' : '1 QB, 6 OL, 2 RB, 0 TE, 2 WR', 
                                                   '6 OL, 1 RB, 3 TE, 0 WR' : '1 QB, 6 OL, 1 RB, 3 TE, 0 WR', 
                                                   '6 OL, 1 RB, 0 TE, 3 WR' : '1 QB, 6 OL, 1 RB, 0 TE, 3 WR',
                                                   '6 OL, 2 RB, 2 TE, 0 WR' : '1 QB, 6 OL, 1 RB, 0 TE, 3 WR', 
                                                   '6 OL, 2 RB, 1 TE, 1 WR' : '1 QB, 6 OL, 2 RB, 1 TE, 1 WR', 
                                                   '6 OL, 1 RB, 1 TE, 2 WR' : '1 QB, 6 OL, 1 RB, 1 TE, 2 WR', 
                                                   '6 OL, 1 RB, 2 TE, 1 WR' : '1 QB, 6 OL, 1 RB, 2 TE, 1 WR'
                                                  }, inplace= True)


### due to high count of plays, it is unlikely 7 ol were used. RB was probs seen as OL. 
### subtract one OL, add on RB, add QB

# 7 OL, 1 RB, 0 TE, 2 WR             682 -- 1 QB, 6 OL, 2 RB, 0 TE, 2 WR
# 7 OL, 1 RB, 2 TE, 0 WR             308 -- 1 QB, 6 OL, 2 RB, 2 TE, 0 WR
# 7 OL, 1 RB, 1 TE, 1 WR              44 -- 1 QB, 6 OL, 1 RB, 2 TE, 1 WR
# 7 OL, 2 RB, 1 TE, 0 WR              88 -- 1 QB, 6 OL, 3 RB, 1 TE, 0 WR
# 7 OL, 2 RB, 0 TE, 1 WR             220 -- 1 QB, 6 OL, 3 RB, 0 TE, 1 WR
# 7 OL, 1 RB, 2 TE, 0 WR             308 -- 1 QB, 6 OL, 2 RB, 2 TE, 0 WR

working_df['OffensePersonnel'].replace(to_replace={'7 OL, 1 RB, 0 TE, 2 WR' : '1 QB, 6 OL, 2 RB, 0 TE, 2 WR', 
                                                   '7 OL, 1 RB, 2 TE, 0 WR' : '1 QB, 6 OL, 2 RB, 2 TE, 0 WR', 
                                                   '7 OL, 1 RB, 1 TE, 1 WR' : '1 QB, 6 OL, 1 RB, 2 TE, 1 WR', 
                                                   '7 OL, 2 RB, 1 TE, 0 WR' : '1 QB, 6 OL, 3 RB, 1 TE, 0 WR',
                                                   '7 OL, 2 RB, 0 TE, 1 WR' : '1 QB, 6 OL, 3 RB, 0 TE, 1 WR',
                                                   '7 OL, 1 RB, 2 TE, 0 WR' : '1 QB, 6 OL, 2 RB, 2 TE, 0 WR'
                                                  }, inplace= True)

### the following were corrected by verifying actual game film for each play (NFL GamePass)
### play numbers are included followed by correct grouping and ind. replacement code:

# 1 RB, 1 TE, 2 WR,1 LB               66 -- 1 QB, 5 OL, 2 RB, 1 TE, 2 WR -- 20171113000112 / 20171113001385 / 20191117101942
# 1 RB, 0 TE, 3 WR,1 DB               66 -- 1 QB, 5 OL, 1 RB, 0 TE, 4 WR -- 20171105060104 / 20171112061099 / 20171112063586
# 2 QB, 2 RB, 0 TE, 2 WR              66 -- 1 QB, 5 OL, 3 RB, 0 TE, 2 WR -- 20181021012259 / 20181209042583 / 20191117063033
# 1 RB, 2 TE, 1 WR,1 DB               66 -- 1 QB, 5 OL, 1 RB, 2 TE, 2 WR -- 20181111110273 / 20181118002325 / 20181129000675
# 2 QB, 1 RB, 3 TE, 0 WR              44 -- 1 QB, 5 OL, 2 RB, 3 TE, 0 WR -- 20181028013051 / 20181028112146
# 6 OL, 1 RB, 0 TE, 2 WR,1 LB         44 -- 1 QB, 6 OL, 2 RB, 0 TE, 2 WR -- 20191117100236 / 20191117100497
# 2 RB, 2 TE, 0 WR,1 DL               44 -- 1 QB, 6 OL, 2 RB, 2 TE, 0 WR -- 20180930100710 / 20181101000953
# 7 OL, 1 RB, 1 TE, 0 WR,1 LB         44 -- 1 QB, 6 OL, 3 RB, 1 TE, 0 WR -- 20190915090924 / 20190915090949
# 2 QB, 2 RB, 2 TE, 0 WR              22 -- 1 QB, 5 OL, 3 RB, 2 TE, 0 WR -- 20181008003713
# 2 RB, 3 TE, 1 WR                    22 -- 1 QB, 5 OL, 1 RB, 3 TE, 1 WR -- 20170924102908
# 1 RB, 3 TE, 0 WR,1 DB               22 -- 1 QB, 5 OL, 1 RB, 3 TE, 1 WR -- 20171116000514
# 6 OL, 1 RB, 1 TE, 1 WR,1 LB         22 -- 1 QB, 6 OL, 2 RB, 1 TE, 1 WR -- 20191124090534
# 1 RB, 2 TE, 3 WR                    22 -- 1 QB, 5 OL, 1 RB, 2 TE, 2 WR -- 20171112080136
# 2 RB, 1 TE, 1 WR,1 DB               22 -- 1 QB, 5 OL, 2 RB, 1 TE, 2 WR -- 20181104013118
# 6 OL, 1 RB, 1 TE, 0 WR,2 DL         22 -- 1 QB, 6 OL, 3 RB, 1 TE, 0 WR -- 20181202071749
# 2 QB, 3 RB, 1 TE, 0 WR              22 -- 1 QB, 6 OL, 3 RB, 1 TE, 0 WR -- 20181021012713

working_df['OffensePersonnel'].replace(to_replace={'1 RB, 1 TE, 2 WR,1 LB' : '1 QB, 5 OL, 2 RB, 1 TE, 2 WR', 
                                                   '1 RB, 0 TE, 3 WR,1 DB' : '1 QB, 5 OL, 1 RB, 0 TE, 4 WR', 
                                                   '2 QB, 2 RB, 0 TE, 2 WR' : '1 QB, 5 OL, 3 RB, 0 TE, 2 WR', 
                                                   '1 RB, 2 TE, 1 WR,1 DB' : '1 QB, 5 OL, 1 RB, 2 TE, 2 WR', 
                                                   '2 QB, 1 RB, 3 TE, 0 WR' : '1 QB, 5 OL, 2 RB, 3 TE, 0 WR', 
                                                   '6 OL, 1 RB, 0 TE, 2 WR,1 LB' : '1 QB, 6 OL, 2 RB, 0 TE, 2 WR', 
                                                   '2 RB, 2 TE, 0 WR,1 DL' : '1 QB, 6 OL, 2 RB, 2 TE, 0 WR', 
                                                   '7 OL, 1 RB, 1 TE, 0 WR,1 LB' : '1 QB, 6 OL, 3 RB, 1 TE, 0 WR', 
                                                   '2 QB, 2 RB, 2 TE, 0 WR' : '1 QB, 5 OL, 3 RB, 2 TE, 0 WR', 
                                                   '2 RB, 3 TE, 1 WR' : '1 QB, 5 OL, 1 RB, 3 TE, 1 WR', 
                                                   '1 RB, 3 TE, 0 WR,1 DB' : '1 QB, 5 OL, 1 RB, 3 TE, 1 WR', 
                                                   '6 OL, 1 RB, 1 TE, 1 WR,1 LB' : '1 QB, 6 OL, 2 RB, 1 TE, 1 WR', 
                                                   '1 RB, 2 TE, 3 WR' : '1 QB, 5 OL, 1 RB, 2 TE, 2 WR', 
                                                   '2 RB, 1 TE, 1 WR,1 DB' : '1 QB, 5 OL, 2 RB, 1 TE, 2 WR', 
                                                   '6 OL, 1 RB, 1 TE, 0 WR,2 DL' : '1 QB, 6 OL, 3 RB, 1 TE, 0 WR', 
                                                   '2 QB, 3 RB, 1 TE, 0 WR' : '1 QB, 6 OL, 3 RB, 1 TE, 0 WR', 
                                                  }, inplace=True)


### finally, the following were adjusted using football logic
### when in doubt, an out of place player is counted as RB since this df is rushing plays
### whether Machine Vision or manual jersey number, DL are probs OL

# 1 RB, 2 TE, 1 WR,1 DL             3102 -- 1 QB, 6 OL, 1 RB, 2 TE, 1 WR
# 2 QB, 1 RB, 1 TE, 2 WR            1716 -- 1 QB, 5 OL, 2 RB, 1 TE, 2 WR
# 1 RB, 1 TE, 2 WR,1 DL              748 -- 1 QB, 6 OL, 1 RB, 1 TE, 2 WR
# 2 QB, 2 RB, 1 TE, 1 WR             550 -- 1 QB, 5 OL, 3 RB, 1 TE, 1 WR
# 1 RB, 3 TE, 0 WR,1 DL              506 -- 1 QB, 6 OL, 1 RB, 3 TE, 0 WR
# 2 QB, 1 RB, 2 TE, 1 WR             462 -- 1 QB, 5 OL, 2 RB, 2 TE, 1 WR
# 6 OL, 1 RB, 2 TE, 0 WR,1 LB        440 -- 1 QB, 6 OL, 2 RB, 2 TE, 0 WR
# 6 OL, 1 RB, 2 TE, 0 WR,1 DL        374 -- 1 QB, 6 OL, 2 RB, 2 TE, 0 WR
# 1 RB, 2 TE, 1 WR,1 LB              264 -- 1 QB, 5 OL, 2 RB, 2 TE, 1 WR
# 1 RB, 1 TE, 2 WR,1 DB              242 -- 1 QB, 5 OL, 1 RB, 1 TE, 3 WR
# 6 OL, 2 RB, 1 TE, 0 WR,1 DL        198 -- 1 QB, 6 OL, 3 RB, 1 TE, 0 WR
# 2 QB, 1 RB, 0 TE, 3 WR             198 -- 1 QB, 5 OL, 2 RB, 0 TE, 3 WR
# 2 QB, 6 OL, 1 RB, 1 TE, 1 WR       176 -- 1 QB, 6 OL, 2 RB, 1 TE, 1 WR
# 6 OL, 1 RB, 1 TE, 1 WR,1 DL        154 -- 1 QB, 6 OL, 2 RB, 1 TE, 1 WR
# 1 RB, 3 TE, 0 WR,1 LB              154 -- 1 QB, 5 OL, 2 RB, 3 TE, 0 WR

working_df['OffensePersonnel'].replace(to_replace={'1 RB, 2 TE, 1 WR,1 DL' : '1 QB, 6 OL, 1 RB, 2 TE, 1 WR', 
                                                   '2 QB, 1 RB, 1 TE, 2 WR' : '1 QB, 5 OL, 2 RB, 1 TE, 2 WR',
                                                   '1 RB, 1 TE, 2 WR,1 DL' : '1 QB, 6 OL, 1 RB, 1 TE, 2 WR', 
                                                   '2 QB, 2 RB, 1 TE, 1 WR' : '1 QB, 5 OL, 3 RB, 1 TE, 1 WR', 
                                                   '1 RB, 3 TE, 0 WR,1 DL' : '1 QB, 6 OL, 1 RB, 3 TE, 0 WR',
                                                   '2 QB, 1 RB, 2 TE, 1 WR' : '1 QB, 5 OL, 2 RB, 2 TE, 1 WR', 
                                                   '6 OL, 1 RB, 2 TE, 0 WR,1 LB' : '1 QB, 6 OL, 2 RB, 2 TE, 0 WR', 
                                                   '6 OL, 1 RB, 2 TE, 0 WR,1 DL' : '1 QB, 6 OL, 2 RB, 2 TE, 0 WR', 
                                                   '1 RB, 2 TE, 1 WR,1 LB' : '1 QB, 5 OL, 2 RB, 2 TE, 1 WR', 
                                                   '1 RB, 1 TE, 2 WR,1 DB' : '1 QB, 5 OL, 1 RB, 1 TE, 3 WR', 
                                                   '6 OL, 2 RB, 1 TE, 0 WR,1 DL' : '1 QB, 6 OL, 3 RB, 1 TE, 0 WR', 
                                                   '2 QB, 1 RB, 0 TE, 3 WR' : '1 QB, 5 OL, 2 RB, 0 TE, 3 WR', 
                                                   '2 QB, 6 OL, 1 RB, 1 TE, 1 WR' : '1 QB, 6 OL, 2 RB, 1 TE, 1 WR', 
                                                   '6 OL, 1 RB, 1 TE, 1 WR,1 DL' : '1 QB, 6 OL, 2 RB, 1 TE, 1 WR',
                                                   '1 RB, 3 TE, 0 WR,1 LB' : '1 QB, 5 OL, 2 RB, 3 TE, 0 WR'
                                                  }, inplace= True)

In [36]:
# function needs to go to lh, removed from here 

def labelRB(OffensePersonnel):
    groups = OffensePersonnel.split(',')
    for group in groups:
        if "RB" in group:
            backs = [int(x) for x in group if x.isdigit()]
            backs = backs[0]
            return backs

working_df['RB'] = working_df['OffensePersonnel'].apply(labelRB)

In [37]:
# function needs to go to lh, removed from here 

def labelTE(OffensePersonnel):
    groups = OffensePersonnel.split(',')
    for group in groups:
        if "TE" in group:
            ends = [int(x) for x in group if x.isdigit()]
            ends = ends[0]
            return ends
        
working_df['TE'] = working_df['OffensePersonnel'].apply(labelTE)

In [38]:
# function needs to go to lh, removed from here 

def labelWR(OffensePersonnel):
    groups = OffensePersonnel.split(',')
    for group in groups:
        if "WR" in group:
            receivers = [int(x) for x in group if x.isdigit()]
            receivers = receivers[0]
            return receivers

working_df['WR'] = working_df['OffensePersonnel'].apply(labelWR)

In [39]:
# function needs to go to lh, removed from here 

def labelOL(OffensePersonnel):
    groups = OffensePersonnel.split(',')
    for group in groups:
        if "OL" in group:
            linemen = [int(x) for x in group if x.isdigit()]
            linemen = linemen[0]
            return linemen

working_df['OL'] = working_df['OffensePersonnel'].apply(labelOL)

In [40]:
working_df.drop(columns=['OffensePersonnel'], axis=1, inplace=True)

### DefendersInTheBox

In [41]:
# one null value, filled with 7: mean of column is (6.9), median is 7
defendersMedian = working_df['DefendersInTheBox'].median()

working_df['DefendersInTheBox'].fillna(value=defendersMedian, inplace=True)

In [42]:
working_df['DefendersInTheBox'] = working_df['DefendersInTheBox'].astype(int)

### DefensePersonnel

In [43]:
# working_df['DefensePersonnel'].value_counts()

In [44]:
# all values that contained proper personnel total were left as is, 
# the few outliers were corrected below with the assistance of film

working_df['DefensePersonnel'].replace(to_replace={'2 DL, 4 LB, 4 DB, 1 RB' : '3 DL, 4 LB, 4 DB',
                                                   '1 DL, 4 LB, 5 DB, 1 RB' : '2 DL, 4 LB, 5 DB',
                                                   '5 DL, 3 LB, 2 DB, 1 OL' : '6 DL, 3 DB, 2 DB',
                                                   '0 DL, 5 LB, 6 DB' : '5 DL, 0 LB, 6 DB',      
                                                   '0 DL, 4 LB, 7 DB' : '3 DL, 1 LB, 7 DB',      
                                                   '1 DL, 3 LB, 6 DB, 1 RB' : '3 DL, 2 LB, 6 DB',     
                                                   '5 DL, 4 LB, 1 DB, 1 OL' : '6 DL, 4 LB, 1 DB',
                                                   '2 DL, 4 LB, 4 DB, 1 OL' : '3 DL, 4 LB, 4 DB',     
                                                   '2 DL, 3 LB, 5 DB, 1 RB' : '3 DL, 3 LB, 5 DB',
                                                   '3 DL, 4 LB, 3 DB, 1 RB' : '4 DL, 4 LB, 3 DB',
                                                   '0 DL, 6 LB, 5 DB' : '6 DL, 0 LB, 5 DB',      
                                                   '4 DL, 5 LB, 1 DB, 1 OL' : '5 DL, 5 LB, 1 DB',     
                                                   '3 DL, 4 LB, 3 DB, 1 OL' : '4 DL, 4 LB, 3 DB',     
                                                   '0 DL, 4 LB, 6 DB, 1 RB' : '5 DL, 0 LB, 6 DB'
                                                  }, inplace=True)

In [45]:
def labelDL(DefensePersonnel):
    groups = DefensePersonnel.split(',')
    for group in groups:
        if "DL" in group:
            linemen = [int(x) for x in group if x.isdigit()]
            linemen = linemen[0]
            return linemen

working_df['DL'] = working_df['DefensePersonnel'].apply(labelDL)
working_df['DL'].fillna(value=0, inplace=True)

In [46]:
def labelLB(DefensePersonnel):
    groups = DefensePersonnel.split(',')
    for group in groups:
        if "LB" in group:
            backers = [int(x) for x in group if x.isdigit()]
            backers = backers[0]
            return backers

working_df['LB'] = working_df['DefensePersonnel'].apply(labelLB)
working_df['LB'].fillna(value=0, inplace=True)

In [47]:
def labelDB(DefensePersonnel):
    groups = DefensePersonnel.split(',')
    for group in groups:
        if "DB" in group:
            backs = [int(x) for x in group if x.isdigit()]
            backs = backs[0]
            return backs

working_df['DB'] = working_df['DefensePersonnel'].apply(labelDB)
working_df['LB'].fillna(value=0, inplace=True)

In [48]:
working_df.drop(columns='DefensePersonnel', axis=1, inplace=True)

### Yards

In [49]:
working_df['Yards'].isnull().sum()

0

### HomeTeamAbbr + VisitorTeamAbbr

In [50]:
# see PossessionTeam section above

### Week

In [51]:
working_df['Week'].isnull().sum()
# leave as is

0

### StadiumType

In [52]:
# working_df['StadiumType'].value_counts()

In [53]:
working_df['EnclosedStadium'] = 0
working_df['OpenStadium'] = 0

indoor = ['Indoors', 'Dome', 'Indoor', 'Retr. Roof-Closed', 'Domed, closed', 'Retr. Roof - Closed',
         'Retractable Roof - Closed', 'Closed Dome', 'Dome, closed', 'Domed', 'indoor',
         'Indoor, Roof Closed', 'Retr. Roof Closed']
openAir = ['Outdoor', 'Outdoors', 'Retractable Roof', 'Open', 'Domed, open', 'Retr. Roof-Open',
          'Domed, Open', 'OUTDOOR', 'Outdoor', 'Outddors', 'Bowl', 'Retr. Roof - Open', 'Outdoor Retr Roof-Open',
          'Outdor', 'Ourdoor', 'Indoor, Open Roof', 'Outside', 'Cloudy', 'Indoor, roof open', 'Heinz Field']

working_df.loc[working_df['StadiumType'].isin(indoor), 'EnclosedStadium'] = 1
working_df.loc[working_df['StadiumType'].isin(openAir), 'EnclosedStadium'] = 0

working_df.loc[working_df['StadiumType'].isin(openAir), 'OpenStadium'] = 1
working_df.loc[working_df['StadiumType'].isin(indoor), 'OpenStadium'] = 0

In [54]:
working_df.drop(columns='StadiumType', axis=1, inplace=True)

### Turf

In [55]:
# simplify Turf classes
# this was done by manually verifying each playing surface brand name
turf_list = ['Field Turf', 'Artificial', 'FieldTurf', 'UBU Speed Series-S5-M',
            'A-Turf Titan', 'UBU Sports Speed S5-M', 'FieldTurf360', 'Twenty-Four/Seven Turf',
            'FieldTurf 360', 'Twenty Four/Seven Turf', 'Turf', 'Field turf', 
             'UBU-Speed Series-S5-M', 'Artifical']
grass_list = ['Grass', 'Natural Grass', 'Natural', 'Naturall Grass', 'natural grass', 'grass',
             'Natural grass']
hybrid_list = ['SISGrass', 'DD GrassMaster']

working_df.loc[working_df['Turf'].isin(turf_list), 'Turf'] = "TurfField"
working_df.loc[working_df['Turf'].isin(grass_list), 'Turf'] = "GrassField"
working_df.loc[working_df['Turf'].isin(hybrid_list), 'Turf'] = "HybridField"

In [56]:
turf_dummies = pd.get_dummies(data=working_df['Turf'])

In [57]:
#working_df = pd.merge(working_df, turf_dummies, how='left')

In [58]:
working_df.drop(columns='Turf', axis=1, inplace=True)

### GameWeather

In [59]:
# working_df['GameWeather'].value_counts()

In [60]:
working_df['RainOrSnow'] = 0

rainorsnow = ['Rain', 'Light Rain', 'Rain shower', 'Cloudy, Rain', 'Rain and Wind', 'Scattered Showers',
       'Rain likely, temps in low 40s.', 'Cloudy, 50% change of rain', 'Cloudy with showers and wind',
       'Raining', 'Cloudy with periods of rain, thunder possible. Winds shifting to WNW, 10-20 mph.',
       'Showers', 'Light rain', 'Rainy', 'Snow', 'Heavy lake effect snow', 'Cloudy, light snow accumulating 1-3"']

working_df.loc[working_df['GameWeather'].isin(rainorsnow), 'RainOrSnow'] = 1

In [61]:
working_df.drop(columns='GameWeather', axis=1, inplace=True)

In [62]:
working_df['RainOrSnow'].value_counts()

0    28530
1     2086
Name: RainOrSnow, dtype: int64

### Temp

In [63]:
# temp had 2862 null values
# fill NA with median of temp based on StadiumType and GameWeather

working_df['Temperature'].isnull().sum()

2862

In [64]:
tempMedian = working_df['Temperature'].median()

working_df['Temperature'].fillna(value=tempMedian, inplace=True)

In [65]:
working_df['Temperature'] = working_df['Temperature'].astype(int)

### Humidity

In [66]:
working_df['Humidity'].isnull().sum()

277

In [67]:
humidityMedian = working_df['Humidity'].median()

working_df['Humidity'].fillna(value=humidityMedian, inplace=True)

In [68]:
working_df['Humidity'] = working_df['Humidity'].astype(int)

### Success

In [69]:
# will turn this into a function at some point, this is rudimentary
# YardsToFirst, YardsToTD, YardsToFGRange

working_df['Successful'] = 0

working_df.loc[(working_df['Down'] == 1) & (working_df['Yards'] >= working_df['YardsToTD']), 'Successful'] = 1
working_df.loc[(working_df['Down'] == 1) & (working_df['Yards'] >= working_df['YardsToFirst'] * .45), 'Successful'] = 1
working_df.loc[(working_df['Down'] == 1) & (working_df['Yards'] >= working_df['YardsToFGRange']), 'Successful'] = 1
                                             
working_df.loc[(working_df['Down'] == 2) & (working_df['Yards'] >= working_df['YardsToTD']), 'Successful'] = 1
working_df.loc[(working_df['Down'] == 2) & (working_df['Yards'] >= working_df['YardsToFirst'] * .60), 'Successful'] = 1
working_df.loc[(working_df['Down'] == 2) & (working_df['Yards'] >= working_df['YardsToFGRange']), 'Successful'] = 1

working_df.loc[(working_df['Down'] == 3) & (working_df['Yards'] >= working_df['YardsToTD']), 'Successful'] = 1
working_df.loc[(working_df['Down'] == 3) & (working_df['Yards'] >= working_df['YardsToFirst']), 'Successful'] = 1
working_df.loc[(working_df['Down'] == 3) & (working_df['Yards'] >= working_df['YardsToFGRange']), 'Successful'] = 1

working_df.loc[(working_df['Down'] == 4) & (working_df['Yards'] >= working_df['YardsToTD']), 'Successful'] = 1
working_df.loc[(working_df['Down'] == 4) & (working_df['Yards'] >= working_df['YardsToFirst']), 'Successful'] = 1
working_df.loc[(working_df['Down'] == 4) & (working_df['Yards'] >= working_df['YardsToFGRange']), 'Successful'] = 1



# On first down, a play is considered a success if it gains 45 percent of needed yards; 
# on second down, a play needs to gain 60 percent of needed yards; on third or fourth 
# down, only gaining a new first down is considered success.

In [70]:
working_df['Successful'].value_counts()

1    16600
0    14016
Name: Successful, dtype: int64

### Final Adjustments

In [71]:
working_df['Team'] = le.fit_transform(working_df.Team.values)
working_df['Opponent'] = le.fit_transform(working_df.Opponent.values)

In [72]:
working_df['S'] = working_df['S'].astype(int)
working_df['A'] = working_df['A'].astype(int)
working_df['YardsToTD'] = working_df['YardsToTD'].astype(int)
working_df['YardsToFGRange'] = working_df['YardsToFGRange'].astype(int)
working_df['ScoreBeforePlay'] = working_df['ScoreBeforePlay'].astype(int)
working_df['OppScoreBeforePlay'] = working_df['OppScoreBeforePlay'].astype(int)
working_df['ScoreDifferential'] = working_df['ScoreDifferential'].astype(int)
working_df['LB'] = working_df['LB'].astype(int)

In [73]:
working_df

Unnamed: 0,PlayId,S,A,Season,YardLine,Quarter,GameClock,Team,Down,YardsToFirst,ballCarrier,OffenseFormation,DefendersInTheBox,Yards,Week,Temperature,Humidity,isHome,Opponent,YardsToTD,YardsToFGRange,ScoreBeforePlay,OppScoreBeforePlay,ScoreDifferential,RB,TE,WR,OL,DL,LB,DB,EnclosedStadium,OpenStadium,RainOrSnow,Successful
18,20170907000118,3,3,0,35,1,854,20,3,2,2543773,5,6,8,1,63,77,1,15,65,30,0,0,0,1,1,3,5,2,3,6,0,1,0,1
40,20170907000139,3,2,0,43,1,832,20,1,10,2543773,5,6,3,1,63,77,1,15,57,22,0,0,0,1,1,3,5,2,3,6,0,1,0,0
62,20170907000189,5,2,0,35,1,782,20,1,10,2543773,6,7,5,1,63,77,1,15,35,0,0,0,0,1,1,3,5,2,3,6,0,1,0,1
84,20170907000345,4,3,0,2,1,732,20,2,2,2539663,3,9,2,1,63,77,1,15,2,0,0,0,0,1,0,3,6,4,4,3,0,1,0,1
98,20170907000395,3,2,0,25,1,728,15,1,10,2557917,5,7,7,1,63,77,0,20,75,40,0,7,7,1,3,1,5,3,2,6,0,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
682052,20191125003419,4,2,2,46,4,587,2,1,10,2562407,4,8,1,12,62,64,0,16,54,19,42,6,36,2,2,1,5,4,3,4,0,1,0,0
682074,20191125003440,4,3,2,47,4,543,2,2,9,2562407,4,7,1,12,62,64,0,16,53,18,42,6,36,1,1,3,5,3,3,5,0,1,0,0
682096,20191125003496,4,1,2,13,4,484,2,1,10,2561324,6,8,1,12,62,64,0,16,13,0,42,6,36,1,1,3,5,3,3,5,0,1,0,1
682118,20191125003768,4,3,2,37,4,295,2,1,10,2562407,4,7,1,12,62,64,0,16,63,28,45,6,39,2,1,2,5,4,3,4,0,1,0,0


In [74]:
working_df.dropna(axis=1, inplace=True)
working_df

Unnamed: 0,PlayId,S,A,Season,YardLine,Quarter,GameClock,Team,Down,YardsToFirst,ballCarrier,OffenseFormation,DefendersInTheBox,Yards,Week,Temperature,Humidity,isHome,Opponent,YardsToTD,YardsToFGRange,ScoreBeforePlay,OppScoreBeforePlay,ScoreDifferential,RB,TE,WR,OL,DL,LB,DB,EnclosedStadium,OpenStadium,RainOrSnow,Successful
18,20170907000118,3,3,0,35,1,854,20,3,2,2543773,5,6,8,1,63,77,1,15,65,30,0,0,0,1,1,3,5,2,3,6,0,1,0,1
40,20170907000139,3,2,0,43,1,832,20,1,10,2543773,5,6,3,1,63,77,1,15,57,22,0,0,0,1,1,3,5,2,3,6,0,1,0,0
62,20170907000189,5,2,0,35,1,782,20,1,10,2543773,6,7,5,1,63,77,1,15,35,0,0,0,0,1,1,3,5,2,3,6,0,1,0,1
84,20170907000345,4,3,0,2,1,732,20,2,2,2539663,3,9,2,1,63,77,1,15,2,0,0,0,0,1,0,3,6,4,4,3,0,1,0,1
98,20170907000395,3,2,0,25,1,728,15,1,10,2557917,5,7,7,1,63,77,0,20,75,40,0,7,7,1,3,1,5,3,2,6,0,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
682052,20191125003419,4,2,2,46,4,587,2,1,10,2562407,4,8,1,12,62,64,0,16,54,19,42,6,36,2,2,1,5,4,3,4,0,1,0,0
682074,20191125003440,4,3,2,47,4,543,2,2,9,2562407,4,7,1,12,62,64,0,16,53,18,42,6,36,1,1,3,5,3,3,5,0,1,0,0
682096,20191125003496,4,1,2,13,4,484,2,1,10,2561324,6,8,1,12,62,64,0,16,13,0,42,6,36,1,1,3,5,3,3,5,0,1,0,1
682118,20191125003768,4,3,2,37,4,295,2,1,10,2562407,4,7,1,12,62,64,0,16,63,28,45,6,39,2,1,2,5,4,3,4,0,1,0,0


In [75]:
working_df.reset_index(drop=True, inplace=True)

In [76]:
working_df

Unnamed: 0,PlayId,S,A,Season,YardLine,Quarter,GameClock,Team,Down,YardsToFirst,ballCarrier,OffenseFormation,DefendersInTheBox,Yards,Week,Temperature,Humidity,isHome,Opponent,YardsToTD,YardsToFGRange,ScoreBeforePlay,OppScoreBeforePlay,ScoreDifferential,RB,TE,WR,OL,DL,LB,DB,EnclosedStadium,OpenStadium,RainOrSnow,Successful
0,20170907000118,3,3,0,35,1,854,20,3,2,2543773,5,6,8,1,63,77,1,15,65,30,0,0,0,1,1,3,5,2,3,6,0,1,0,1
1,20170907000139,3,2,0,43,1,832,20,1,10,2543773,5,6,3,1,63,77,1,15,57,22,0,0,0,1,1,3,5,2,3,6,0,1,0,0
2,20170907000189,5,2,0,35,1,782,20,1,10,2543773,6,7,5,1,63,77,1,15,35,0,0,0,0,1,1,3,5,2,3,6,0,1,0,1
3,20170907000345,4,3,0,2,1,732,20,2,2,2539663,3,9,2,1,63,77,1,15,2,0,0,0,0,1,0,3,6,4,4,3,0,1,0,1
4,20170907000395,3,2,0,25,1,728,15,1,10,2557917,5,7,7,1,63,77,0,20,75,40,0,7,7,1,3,1,5,3,2,6,0,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30611,20191125003419,4,2,2,46,4,587,2,1,10,2562407,4,8,1,12,62,64,0,16,54,19,42,6,36,2,2,1,5,4,3,4,0,1,0,0
30612,20191125003440,4,3,2,47,4,543,2,2,9,2562407,4,7,1,12,62,64,0,16,53,18,42,6,36,1,1,3,5,3,3,5,0,1,0,0
30613,20191125003496,4,1,2,13,4,484,2,1,10,2561324,6,8,1,12,62,64,0,16,13,0,42,6,36,1,1,3,5,3,3,5,0,1,0,1
30614,20191125003768,4,3,2,37,4,295,2,1,10,2562407,4,7,1,12,62,64,0,16,63,28,45,6,39,2,1,2,5,4,3,4,0,1,0,0


# Train Test Split

In [77]:
working_df.columns

Index(['PlayId', 'S', 'A', 'Season', 'YardLine', 'Quarter', 'GameClock',
       'Team', 'Down', 'YardsToFirst', 'ballCarrier', 'OffenseFormation',
       'DefendersInTheBox', 'Yards', 'Week', 'Temperature', 'Humidity',
       'isHome', 'Opponent', 'YardsToTD', 'YardsToFGRange', 'ScoreBeforePlay',
       'OppScoreBeforePlay', 'ScoreDifferential', 'RB', 'TE', 'WR', 'OL', 'DL',
       'LB', 'DB', 'EnclosedStadium', 'OpenStadium', 'RainOrSnow',
       'Successful'],
      dtype='object')

In [78]:
x = working_df[['PlayId', 'S', 'A', 'Season', 'YardLine', 'Quarter', 'GameClock',
       'Team', 'Down', 'YardsToFirst', 'ballCarrier', 'OffenseFormation',
       'DefendersInTheBox', 'Yards', 'Week', 'Temperature', 'Humidity',
       'isHome', 'Opponent', 'YardsToTD', 'YardsToFGRange', 'ScoreBeforePlay',
       'OppScoreBeforePlay', 'ScoreDifferential', 'RB', 'TE', 'WR', 'OL', 'DL',
       'LB', 'DB', 'EnclosedStadium', 'OpenStadium', 'RainOrSnow']]
y = working_df['Successful']
               

In [79]:
xTrain, xTest, yTrain, yTest = train_test_split(x, y, test_size = 0.2, random_state = 0)

In [80]:
#scaler = MinMaxScaler()
#X = scaler.fit_transform(xTrain)

In [86]:
X = xTrain

In [87]:
Y = yTrain

kfd = KFold(10)

def kfold(model, score_type) :
    kfold_scores = []
    
    for train_index, test_index in kfd.split(X, Y):
        
        clf = model()
        clf.fit(xTrain, yTrain)
        yPred = clf.predict(xTest)
        score = score_type(yTest, yPred)
        kfold_scores.append(score)
        
    
    return(sum(kfold_scores)/len(kfold_scores))

In [88]:
# KNN

knn_rcScore = kfold(KNeighborsClassifier, recall_score)
knn_pcScore = kfold(KNeighborsClassifier, precision_score)
knn_acScore = kfold(KNeighborsClassifier, accuracy_score)
knn_f1Score = kfold(KNeighborsClassifier, f1_score)

In [89]:
# Decision Tree

dtc_rcScore = kfold(DecisionTreeClassifier, recall_score)
dtc_pcScore = kfold(DecisionTreeClassifier, precision_score)
dtc_acScore = kfold(DecisionTreeClassifier, accuracy_score)
dtc_f1Score = kfold(DecisionTreeClassifier, f1_score)

In [90]:
# Random Forest

rfc_rcScore = kfold(RandomForestClassifier, recall_score)
rfc_pcScore = kfold(RandomForestClassifier, precision_score)
rfc_acScore = kfold(RandomForestClassifier, accuracy_score)
rfc_f1Score = kfold(RandomForestClassifier, f1_score)

In [91]:
knn_pcScore

0.5919523944460188

In [92]:
model_eval = pd.DataFrame({'Model': ['KNN', 'Decision Tree', 'Random Forest'], 
                       'Recall_Score' : [{knn_rcScore}, {dtc_rcScore}, {rfc_rcScore}],
                       'Precision_Score' : [{knn_pcScore}, {dtc_pcScore}, {rfc_pcScore}],
                       'Accuracy_Score' : [{knn_acScore}, {dtc_acScore}, {rfc_acScore}],
                       'F1_Score' : [{knn_f1Score}, {dtc_f1Score}, {rfc_f1Score}]})

model_eval

Unnamed: 0,Model,Recall_Score,Precision_Score,Accuracy_Score,F1_Score
0,KNN,{0.618413262285376},{0.5919523944460188},{0.5543762246897453},{0.6048935862168815}
1,Decision Tree,{0.9979869745411488},{0.9977800577379263},{0.9975506205094711},{0.9976171586606585}
2,Random Forest,{0.9818827708703374},{0.9852304645403851},{0.981009144350098},{0.9836056082239317}


In [None]:
yTest

In [None]:
#lr = LinearRegression()
#lr.fit(X_train, y_train)
#y_hat_test = lr.predict(X_test)

#print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_hat_test))  
#print('Mean Squared Error:', metrics.mean_squared_error(y_test, y_hat_test))  
#print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, y_hat_test)))

In [None]:

#print('Pred mean:', y_pred.mean())
#print('Actual mean:', y_test.mean())

In [None]:
#correlation_matrix = _df.corr().round(2)
#plt.figure(figsize=(14, 12))  
#sns.heatmap(data=correlation_matrix, annot=True)

In [None]:
#correlation_matrix = origin_df.corr().round(2)
#plt.figure(figsize=(14, 12))  
#sns.heatmap(data=correlation_matrix, annot=True)

In [None]:
#from sklearn.preprocessing import StandardScaler
#scaler = StandardScaler() 
#data_scaled = scaler.fit_transform(working_df)

In [None]:
#X = data_scaled[['Season', 'YardLine', 'Quarter', 'GameClock', 'Team', 'Down',
#       'YardsToFirst', 'OffenseFormation', 'DefendersInTheBox',
#       'Week', 'Temperature', 'Humidity', 'isHome', 'Opponent', 'YardsToTD',
#       'YardsToFGRange', 'ScoreBeforePlay', 'OppScoreBeforePlay',
#       'ScoreDifferential', 'RB', 'TE', 'WR', 'QB', 'OL', 'DL', 'LB', 'DB',
#       'EnclosedStadium', 'OpenStadium', 'GrassField', 'HybridField',
#       'TurfField', 'RainOrSnow']]
#y = data_scaled[['Yards']]

In [None]:
#xTrain, xTest, yTrain, yTest = train_test_split(X, y, test_size = 0.2, random_state = 0)

In [None]:
#Y = yTrain
#x = xTrain

#kfd = KFold(10)

#def kfold(model, score_type) :
 #   kfold_scores = []
    
  #  for train_index, test_index in kfd.split(x, Y):
   #     X_train = X.iloc[train_index]
    #    X_test = X.iloc[test_index]
     #   Y_train = Y.iloc[train_index]
      #  Y_test = Y.iloc[test_index]
        
       # clf = model()
        #clf.fit(X_train, Y_train)
        #Y_pred = clf.predict(X_test)
        #score = score_type(Y_test, Y_pred)
        #kfold_scores.append(score)
        
    #return(sum(kfold_scores)/len(kfold_scores))

# Regression Modeling

In [None]:
#lr = RandomForestRegressor()

In [None]:
#lr.fit(x, Y)

In [None]:
#metrics.r2_score(yTest, lr.predict(xTest))

In [None]:
#y_pred = lr.predict(xTest)

In [None]:
#y_pred

In [None]:
#df = pd.DataFrame({'Actual': yTest, 'Predicted': y_pred})
#df

In [None]:
#kfold(LinearRegression, explained_variance_score)

In [None]:
#kfold(LinearRegression, r2_score)

In [None]:
#lr_maScore = kfold(LinearRegression, mean_absolute_error)
#lr_msScore = kfold(LinearRegression, mean_squared_error)
#lr_vaScore = kfold(LinearRegression, explained_variance_score)
#lr_r2Score = kfold(LinearRegression, r2_score)

In [None]:
#sv_maScore = kfold(LinearSVR, mean_absolute_error)
#sv_msScore = kfold(LinearSVR, mean_squared_error)
#sv_vaScore = kfold(LinearSVR, explained_variance_score)
#sv_r2Score = kfold(LinearSVR, r2_score)

In [None]:
#rf_maScore = kfold(RandomForestRegressor, mean_absolute_error)
#rf_msScore = kfold(RandomForestRegressor, mean_squared_error)
#rf_vaScore = kfold(RandomForestRegressor, explained_variance_score)
#rf_r2Score = kfold(RandomForestRegressor, r2_score)

In [None]:
#dt_maScore = kfold(DecisionTreeRegressor, mean_absolute_error)
#dt_msScore = kfold(DecisionTreeRegressor, mean_squared_error)
#dt_vaScore = kfold(DecisionTreeRegressor, explained_variance_score)
#dt_r2Score = kfold(DecisionTreeRegressor, r2_score)

In [None]:
#modelEvaluation = pd.DataFrame({'Model': ['Linear Regression', 'LinearSVR','Decision Tree', 'Random Forest'], 
#                       'Mean Absolute Error' : [lr_maScore, sv_maScore, rf_maScore, dt_maScore],
#                       'Mean Squared Error' : [lr_msScore, sv_msScore, rf_msScore, dt_msScore],
#                       'Explained Variance' : [lr_vaScore, sv_vaScore, rf_vaScore, dt_vaScore],
#                       'R2 Score' : [lr_r2Score, sv_r2Score, rf_r2Score, dt_r2Score]})

#modelEvaluation