Resources
- https://towardsdatascience.com/regression-prediction-intervals-with-xgboost-428e0a018b

In [27]:
import pandas as pd
import numpy as np
import xgboost
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from scipy.sparse import csr_matrix

In [2]:
df = pd.read_csv('data/train.csv')

  interactivity=interactivity, compiler=compiler, result=result)


In [3]:
df.columns

Index(['GameId', 'PlayId', 'Team', 'X', 'Y', 'S', 'A', 'Dis', 'Orientation',
       'Dir', 'NflId', 'DisplayName', 'JerseyNumber', 'Season', 'YardLine',
       'Quarter', 'GameClock', 'PossessionTeam', 'Down', 'Distance',
       'FieldPosition', 'HomeScoreBeforePlay', 'VisitorScoreBeforePlay',
       'NflIdRusher', 'OffenseFormation', 'OffensePersonnel',
       'DefendersInTheBox', 'DefensePersonnel', 'PlayDirection', 'TimeHandoff',
       'TimeSnap', 'Yards', 'PlayerHeight', 'PlayerWeight', 'PlayerBirthDate',
       'PlayerCollegeName', 'Position', 'HomeTeamAbbr', 'VisitorTeamAbbr',
       'Week', 'Stadium', 'Location', 'StadiumType', 'Turf', 'GameWeather',
       'Temperature', 'Humidity', 'WindSpeed', 'WindDirection'],
      dtype='object')

In [4]:
df.head()

Unnamed: 0,GameId,PlayId,Team,X,Y,S,A,Dis,Orientation,Dir,...,Week,Stadium,Location,StadiumType,Turf,GameWeather,Temperature,Humidity,WindSpeed,WindDirection
0,2017090700,20170907000118,away,73.91,34.84,1.69,1.13,0.4,81.99,177.18,...,1,Gillette Stadium,"Foxborough, MA",Outdoor,Field Turf,Clear and warm,63.0,77.0,8,SW
1,2017090700,20170907000118,away,74.67,32.64,0.42,1.35,0.01,27.61,198.7,...,1,Gillette Stadium,"Foxborough, MA",Outdoor,Field Turf,Clear and warm,63.0,77.0,8,SW
2,2017090700,20170907000118,away,74.0,33.2,1.22,0.59,0.31,3.01,202.73,...,1,Gillette Stadium,"Foxborough, MA",Outdoor,Field Turf,Clear and warm,63.0,77.0,8,SW
3,2017090700,20170907000118,away,71.46,27.7,0.42,0.54,0.02,359.77,105.64,...,1,Gillette Stadium,"Foxborough, MA",Outdoor,Field Turf,Clear and warm,63.0,77.0,8,SW
4,2017090700,20170907000118,away,69.32,35.42,1.82,2.43,0.16,12.63,164.31,...,1,Gillette Stadium,"Foxborough, MA",Outdoor,Field Turf,Clear and warm,63.0,77.0,8,SW


In [5]:
df.shape

(509762, 49)

In [6]:
df.isnull().sum()

GameId                        0
PlayId                        0
Team                          0
X                             0
Y                             0
S                             0
A                             0
Dis                           0
Orientation                  18
Dir                          14
NflId                         0
DisplayName                   0
JerseyNumber                  0
Season                        0
YardLine                      0
Quarter                       0
GameClock                     0
PossessionTeam                0
Down                          0
Distance                      0
FieldPosition              6424
HomeScoreBeforePlay           0
VisitorScoreBeforePlay        0
NflIdRusher                   0
OffenseFormation            110
OffensePersonnel              0
DefendersInTheBox            66
DefensePersonnel              0
PlayDirection                 0
TimeHandoff                   0
TimeSnap                      0
Yards   

## new vars

In [7]:
#home team
df['home_team'] = pd.get_dummies(df['Team'])['home']

In [8]:
#stadium
df = pd.concat(
    [df, pd.get_dummies(
        df['Stadium']
    )],
    axis = 1)

In [9]:
df['WindSpeed'] = df.WindSpeed.str.extract('(\d+)', expand=False)

In [None]:
df['score_diff'] = (df.HomeScoreBeforePlay * df.home_team + df.VisitorScoreBeforePlay * !df.home_team) - \
(df.HomeScoreBeforePlay * !df.home_team + df.VisitorScoreBeforePlay * df.home_team)

## Change data types

In [78]:
X = df[[
    'WindSpeed',
    'home_team',
    'Temperature',
    'X','Y','Week', 'S', 'A'
] + df.Stadium.unique().tolist()]

In [79]:
X.head()

Unnamed: 0,WindSpeed,home_team,Temperature,X,Y,Week,S,A,Gillette Stadium,New Era Field,...,State Farm Stadium,Broncos Stadium At Mile High,Los Angeles Memorial Coliesum,Broncos Stadium at Mile High,TIAA Bank Field,CenturyField,FirstEnergyStadium,Paul Brown Stdium,Lambeau field,Metlife Stadium
0,,0,63.0,73.91,34.84,1,1.69,1.13,1,0,...,0,0,0,0,0,0,0,0,0,0
1,,0,63.0,74.67,32.64,1,0.42,1.35,1,0,...,0,0,0,0,0,0,0,0,0,0
2,,0,63.0,74.0,33.2,1,1.22,0.59,1,0,...,0,0,0,0,0,0,0,0,0,0
3,,0,63.0,71.46,27.7,1,0.42,0.54,1,0,...,0,0,0,0,0,0,0,0,0,0
4,,0,63.0,69.32,35.42,1,1.82,2.43,1,0,...,0,0,0,0,0,0,0,0,0,0


0          0
1          0
2          0
3          0
4          0
          ..
509757    24
509758    24
509759    24
509760    24
509761    24
Name: HomeScoreBeforePlay, Length: 509762, dtype: int64

## TTS and X and Y

In [80]:
df_sample = df.sample(frac=0.01)

In [81]:
X = df_sample[[
    'WindSpeed',
    'Temperature',
    'home_team',
    'X',
    'Y',
    'Week',
    'Distance',
    'University of Phoenix Stadium',
    'Down',
]].astype(np.float16)

y = df_sample.Yards

In [82]:
X.head()

Unnamed: 0,WindSpeed,Temperature,home_team,X,Y,Week,Distance,University of Phoenix Stadium,Down
193041,,58.0,1.0,54.3125,32.46875,13.0,10.0,0.0,2.0
402012,1.0,75.0,0.0,96.625,23.9375,10.0,3.0,0.0,2.0
84646,,84.0,1.0,95.8125,20.5625,6.0,15.0,0.0,2.0
440758,8.0,57.0,0.0,45.0,35.96875,13.0,10.0,0.0,1.0
53386,,87.0,1.0,48.9375,11.960938,4.0,3.0,0.0,3.0


In [83]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [84]:
X_csr_train = csr_matrix(X_train)
X_csr_test = csr_matrix(X_test)

In [90]:
xgbr = xgboost.XGBRegressor(
    objective='reg:squarederror',
    max_depth=4,
    verbosity=1
)

In [91]:
xgbr.fit(X_train,y_train)

  if getattr(data, 'base', None) is not None and \


XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0,
             importance_type='gain', learning_rate=0.1, max_delta_step=0,
             max_depth=4, min_child_weight=1, missing=None, n_estimators=100,
             n_jobs=1, nthread=None, objective='reg:squarederror',
             random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
             seed=None, silent=None, subsample=1, verbosity=1)

In [89]:
help(xgboost.XGBRegressor)

Help on class XGBRegressor in module xgboost.sklearn:

class XGBRegressor(XGBModel, sklearn.base.RegressorMixin)
 |  XGBRegressor(max_depth=3, learning_rate=0.1, n_estimators=100, verbosity=1, silent=None, objective='reg:linear', booster='gbtree', n_jobs=1, nthread=None, gamma=0, min_child_weight=1, max_delta_step=0, subsample=1, colsample_bytree=1, colsample_bylevel=1, colsample_bynode=1, reg_alpha=0, reg_lambda=1, scale_pos_weight=1, base_score=0.5, random_state=0, seed=None, missing=None, importance_type='gain', **kwargs)
 |  
 |  Implementation of the scikit-learn API for XGBoost regression.
 |  
 |  Parameters
 |  ----------
 |  max_depth : int
 |      Maximum tree depth for base learners.
 |  learning_rate : float
 |      Boosting learning rate (xgb's "eta")
 |  n_estimators : int
 |      Number of trees to fit.
 |  verbosity : int
 |      The degree of verbosity. Valid values are 0 (silent) - 3 (debug).
 |  silent : boolean
 |      Whether to print messages while running boostin

In [87]:
predictions = xgbr.predict(X_test)

In [88]:
print('Error:',mean_squared_error(predictions,y_test))

Error: 36.69872629632328
