In [24]:
%matplotlib inline
import warnings
warnings.filterwarnings(action='ignore')

# data
import pandas as pd
import numpy as np
import random as rnd
from sklearn.model_selection import KFold
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from collections import Counter

# visualization
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

# machine learning
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn import metrics

from sklearn.model_selection import GridSearchCV
from sklearn.metrics import explained_variance_score
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

In [2]:
pd.set_option('display.max_columns', 200)
pd.set_option('display.max_rows', 200)

In [3]:
origin_df = pd.read_csv('bdb.csv', low_memory = False)

In [4]:
messy = origin_df.copy()

In [5]:
messy.drop(columns=['DisplayName', 'JerseyNumber', 'PlayerBirthDate', 'PlayerCollegeName',
                        'Stadium', 'Location', 'WindSpeed', 'WindDirection', 'GameId', 'Season', 'GameWeather'
                        ], axis= 1, inplace= True)

In [6]:
messy.columns

Index(['PlayId', 'Team', 'X', 'Y', 'S', 'A', 'Dis', 'Orientation', 'Dir',
       'NflId', 'YardLine', 'Quarter', 'GameClock', 'PossessionTeam', 'Down',
       'Distance', 'FieldPosition', 'HomeScoreBeforePlay',
       'VisitorScoreBeforePlay', 'NflIdRusher', 'OffenseFormation',
       'OffensePersonnel', 'DefendersInTheBox', 'DefensePersonnel',
       'PlayDirection', 'TimeHandoff', 'TimeSnap', 'Yards', 'PlayerHeight',
       'PlayerWeight', 'Position', 'HomeTeamAbbr', 'VisitorTeamAbbr', 'Week',
       'StadiumType', 'Turf', 'Temperature', 'Humidity'],
      dtype='object')

In [7]:
messy.drop(columns=['PlayId', 'Team', 'X', 'Y', 'S', 'A', 'Dis', 'Orientation', 'Dir',
       'NflId', 'PossessionTeam', 'FieldPosition', 'NflIdRusher', 'OffenseFormation',
       'OffensePersonnel', 'DefensePersonnel', 'PlayDirection', 'PlayerHeight',
       'PlayerWeight', 'Position', 'HomeTeamAbbr', 'VisitorTeamAbbr',
       'StadiumType', 'Turf'], axis= 1, inplace= True)

In [8]:
messy.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 682154 entries, 0 to 682153
Data columns (total 14 columns):
YardLine                  682154 non-null int64
Quarter                   682154 non-null int64
GameClock                 682154 non-null object
Down                      682154 non-null int64
Distance                  682154 non-null int64
HomeScoreBeforePlay       682154 non-null int64
VisitorScoreBeforePlay    682154 non-null int64
DefendersInTheBox         682132 non-null float64
TimeHandoff               682154 non-null object
TimeSnap                  682154 non-null object
Yards                     682154 non-null int64
Week                      682154 non-null int64
Temperature               618508 non-null float64
Humidity                  675994 non-null float64
dtypes: float64(3), int64(8), object(3)
memory usage: 72.9+ MB


In [9]:
messy.drop(columns=['GameClock', 'TimeHandoff', 'TimeSnap'], axis=1, inplace=True)

In [10]:
messy.dropna()

Unnamed: 0,YardLine,Quarter,Down,Distance,HomeScoreBeforePlay,VisitorScoreBeforePlay,DefendersInTheBox,Yards,Week,Temperature,Humidity
0,35,1,3,2,0,0,6.0,8,1,63.0,77.0
1,35,1,3,2,0,0,6.0,8,1,63.0,77.0
2,35,1,3,2,0,0,6.0,8,1,63.0,77.0
3,35,1,3,2,0,0,6.0,8,1,63.0,77.0
4,35,1,3,2,0,0,6.0,8,1,63.0,77.0
...,...,...,...,...,...,...,...,...,...,...,...
682149,38,4,2,9,6,45,6.0,4,12,62.0,64.0
682150,38,4,2,9,6,45,6.0,4,12,62.0,64.0
682151,38,4,2,9,6,45,6.0,4,12,62.0,64.0
682152,38,4,2,9,6,45,6.0,4,12,62.0,64.0


In [11]:
messy.columns

Index(['YardLine', 'Quarter', 'Down', 'Distance', 'HomeScoreBeforePlay',
       'VisitorScoreBeforePlay', 'DefendersInTheBox', 'Yards', 'Week',
       'Temperature', 'Humidity'],
      dtype='object')

In [12]:
x = messy[['YardLine', 'Quarter', 'Down', 'Distance', 'HomeScoreBeforePlay',
       'VisitorScoreBeforePlay', 'DefendersInTheBox', 'Week', 'Temperature', 'Humidity']]
y = messy['Yards']

In [13]:
xTrain, xTest, yTrain, yTest = train_test_split(x, y, test_size = 0.2, random_state = 0)

In [14]:
scaler = MinMaxScaler()
X = scaler.fit_transform(xTrain)

In [15]:
Y = yTrain

kfd = KFold(10)

def kfold(model, score_type) :
    kfold_scores = []
    
    for train_index, test_index in kfd.split(X, Y):
        # print("TRAIN:", train_index, "TEST:", test_index)
        X_train, X_test = X[train_index], X[test_index]
        Y_train, Y_test = Y[train_index], Y[test_index]
        
        clf = model()
        clf.fit(X_train, Y_train)
        Y_pred = clf.predict(X_test)
        score = score_type(Y_test, Y_pred)
        kfold_scores.append(score)
        
    
    return(sum(kfold_scores)/len(kfold_scores))

In [34]:
np.nan_to_num(x)

array([[35.,  1.,  3., ...,  1., 63., 77.],
       [35.,  1.,  3., ...,  1., 63., 77.],
       [35.,  1.,  3., ...,  1., 63., 77.],
       ...,
       [38.,  4.,  2., ..., 12., 62., 64.],
       [38.,  4.,  2., ..., 12., 62., 64.],
       [38.,  4.,  2., ..., 12., 62., 64.]])

In [35]:
np.nan_to_num(y)

array([8, 8, 8, ..., 4, 4, 4])

In [36]:
lr_maScore = kfold(LogisticRegression, mean_absolute_error)
lr_msScore = kfold(LogisticRegression, mean_squared_error)
lr_vaScore = kfold(LogisticRegression, explained_variance_score)
lr_r2Score = kfold(LogisticRegression, r2_score)

ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

In [25]:
sv_maScore = kfold(LinearSVR, mean_absolute_error)
sv_msScore = kfold(LinearSVR, mean_squared_error)
sv_vaScore = kfold(LinearSVR, explained_variance_score)
sv_r2Score = kfold(LinearSVR, r2_score)

ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

In [26]:
rf_maScore = kfold(RandomForestRegressor, mean_absolute_error)
rf_msScore = kfold(RandomForestRegressor, mean_squared_error)
rf_vaScore = kfold(RandomForestRegressor, explained_variance_score)
rf_r2Score = kfold(RandomForestRegressor, r2_score)

ValueError: Input contains NaN, infinity or a value too large for dtype('float32').

In [27]:
dt_maScore = kfold(DecisionTreeRegressor, mean_absolute_error)
dt_msScore = kfold(DecisionTreeRegressor, mean_squared_error)
dt_vaScore = kfold(DecisionTreeRegressor, explained_variance_score)
dt_r2Score = kfold(DecisionTreeRegressor, r2_score)

ValueError: Input contains NaN, infinity or a value too large for dtype('float32').