<div style="text-align: right"> Tommy Evans-Barton </div>
<div style="text-align: right"> WR Year 2 Jumps </div>

# Analysis and Modeling Notebook

The purpose of this notebook is to develop the model used to predict second year production for receivers based on their statistics in their first year. This notebook will also serve as a preliminary 'final notebook' before the final presentation of this project's findings.

In [20]:
import os
import sys
import pandas as pd
import numpy as np
import json
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline

In [21]:
# Enable IPython's autoreload extension; mode 2 re-imports modules before each
# cell executes, so edits to imported .py files are picked up without a kernel
# restart.
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [22]:
# Root directory that the data paths below are built from. The PWD environment
# variable is still preferred (original behaviour); when it is missing or
# empty, fall back to the kernel's current working directory instead of
# raising KeyError.
TOP_PATH = os.environ.get('PWD') or os.getcwd()

## Reading in Data

In [23]:
# Load the final modeling table. os.path.join builds the path from components
# so it does not depend on a hard-coded separator or on whether TOP_PATH ends
# with a slash.
df = pd.read_csv(os.path.join(TOP_PATH, 'data', 'final', 'FINAL_DATA.csv'))
df

Unnamed: 0,Rnd,Pick,Team,Player,First Year,Age Draft,G,GS,Tgt,Rec,...,YAR,DVOA,VOA,EYds,DPI Pens,DPI Yds,Rec Pts First Season,Rec Pts/G First Season,Rec Pts Second Season,Rec Pts/G Second Season
0,1,3,CLE,B.Edwards,2005,22,10.0,7.0,59.0,32.0,...,80.0,3.3,4.9,474.0,0.0,0.0,69.2,6.920000,124.4,7.775000
1,1,7,MIN,T.Williamson,2005,22,14.0,3.0,52.0,24.0,...,14.0,-5.4,-9.3,370.0,0.0,0.0,49.2,3.514286,45.5,3.250000
2,1,10,DET,M.Williams,2005,21,14.0,4.0,57.0,29.0,...,-19.0,-16.3,-16.9,337.0,1.0,23.0,41.0,2.928571,15.9,1.987500
3,1,21,JAX,M.Jones,2005,22,16.0,1.0,69.0,36.0,...,42.0,-6.4,-4.7,483.0,0.0,0.0,73.2,4.575000,88.3,6.307143
4,1,22,BAL,M.Clayton,2005,23,14.0,10.0,87.0,44.0,...,-63.0,-23.0,-22.1,446.0,1.0,21.0,59.1,4.221429,123.9,7.743750
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
142,2,62,ARI,A.Isabella,2019,22,15.0,1.0,13.0,9.0,...,41.0,33.4,29.9,144.0,0.0,0.0,24.9,1.660000,,
143,2,64,SEA,D.Metcalf,2019,21,16.0,15.0,100.0,58.0,...,85.0,0.6,-2.0,801.0,1.0,4.0,132.0,8.250000,,
144,3,66,PIT,D.Johnson,2019,23,16.0,12.0,92.0,59.0,...,7.0,-8.9,-11.6,601.0,2.0,43.0,98.0,6.125000,,
145,3,76,WAS,T.McLaurin,2019,23,14.0,14.0,93.0,58.0,...,244.0,18.9,19.8,961.0,3.0,49.0,133.9,9.564286,,


In [24]:
# Rookie classes before PREDICTION_YEAR are used to build the model; the
# PREDICTION_YEAR class (whose second-season columns are empty in the data
# shown above) is set aside for prediction.
PREDICTION_YEAR = 2019
rec_model = df.loc[df['First Year'].lt(PREDICTION_YEAR)].reset_index(drop=True)
rec_prediction = df.loc[df['First Year'].eq(PREDICTION_YEAR)].reset_index(drop=True)

In [25]:
# Preview the modeling subset (rows whose 'First Year' is before 2019).
rec_model

Unnamed: 0,Rnd,Pick,Team,Player,First Year,Age Draft,G,GS,Tgt,Rec,...,YAR,DVOA,VOA,EYds,DPI Pens,DPI Yds,Rec Pts First Season,Rec Pts/G First Season,Rec Pts Second Season,Rec Pts/G Second Season
0,1,3,CLE,B.Edwards,2005,22,10.0,7.0,59.0,32.0,...,80.0,3.3,4.9,474.0,0.0,0.0,69.2,6.920000,124.4,7.775000
1,1,7,MIN,T.Williamson,2005,22,14.0,3.0,52.0,24.0,...,14.0,-5.4,-9.3,370.0,0.0,0.0,49.2,3.514286,45.5,3.250000
2,1,10,DET,M.Williams,2005,21,14.0,4.0,57.0,29.0,...,-19.0,-16.3,-16.9,337.0,1.0,23.0,41.0,2.928571,15.9,1.987500
3,1,21,JAX,M.Jones,2005,22,16.0,1.0,69.0,36.0,...,42.0,-6.4,-4.7,483.0,0.0,0.0,73.2,4.575000,88.3,6.307143
4,1,22,BAL,M.Clayton,2005,23,14.0,10.0,87.0,44.0,...,-63.0,-23.0,-22.1,446.0,1.0,21.0,59.1,4.221429,123.9,7.743750
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
130,2,51,CHI,A.Miller,2018,23,15.0,4.0,54.0,33.0,...,87.0,3.9,7.5,455.0,1.0,43.0,84.3,5.620000,77.6,4.850000
131,2,60,PIT,J.Washington,2018,22,14.0,6.0,38.0,16.0,...,-51.0,-25.1,-29.5,190.0,0.0,0.0,27.7,1.978571,91.5,6.100000
132,2,61,JAX,D.Chark,2018,21,11.0,0.0,32.0,14.0,...,-95.0,-47.4,-49.0,78.0,0.0,0.0,17.4,1.581818,148.8,9.920000
133,3,81,DAL,M.Gallup,2018,22,16.0,8.0,68.0,33.0,...,15.0,-14.3,-9.7,415.0,2.0,33.0,62.7,3.918750,146.7,10.478571


In [26]:
# Preview the 2019 rows held out for prediction.
rec_prediction

Unnamed: 0,Rnd,Pick,Team,Player,First Year,Age Draft,G,GS,Tgt,Rec,...,YAR,DVOA,VOA,EYds,DPI Pens,DPI Yds,Rec Pts First Season,Rec Pts/G First Season,Rec Pts Second Season,Rec Pts/G Second Season
0,1,25,BAL,M.Brown,2019,22,14.0,11.0,71.0,46.0,...,98.0,4.2,4.0,621.0,2.0,73.0,100.4,7.171429,,
1,1,32,NWE,N.Harry,2019,21,7.0,5.0,24.0,12.0,...,-18.0,-26.1,-22.1,116.0,1.0,10.0,22.5,3.214286,,
2,2,36,SFO,D.Samuel,2019,23,15.0,11.0,81.0,57.0,...,114.0,7.3,6.1,673.0,0.0,0.0,98.2,6.546667,,
3,2,51,TEN,A.Brown,2019,22,16.0,11.0,84.0,52.0,...,253.0,26.2,26.4,897.0,0.0,0.0,153.1,9.56875,,
4,2,56,KAN,M.Hardman,2019,21,16.0,5.0,41.0,26.0,...,177.0,44.1,42.6,528.0,2.0,28.0,89.8,5.6125,,
5,2,57,PHI,J.Arcega-Whiteside,2019,22,16.0,5.0,22.0,10.0,...,5.0,-12.3,-9.3,130.0,0.0,0.0,22.9,1.43125,,
6,2,59,IND,P.Campbell,2019,22,7.0,3.0,24.0,18.0,...,-88.0,-73.4,-64.4,-14.0,0.0,0.0,18.7,2.671429,,
7,2,62,ARI,A.Isabella,2019,22,15.0,1.0,13.0,9.0,...,41.0,33.4,29.9,144.0,0.0,0.0,24.9,1.66,,
8,2,64,SEA,D.Metcalf,2019,21,16.0,15.0,100.0,58.0,...,85.0,0.6,-2.0,801.0,1.0,4.0,132.0,8.25,,
9,3,66,PIT,D.Johnson,2019,23,16.0,12.0,92.0,59.0,...,7.0,-8.9,-11.6,601.0,2.0,43.0,98.0,6.125,,


In [8]:
# List the columns available for modeling.
# NOTE(review): the saved output of this cell comes from an earlier execution
# (count 8) and lists 'Rec Pts Jump' but none of the per-game columns shown in
# the frames above -- re-run the notebook top to bottom to refresh it.
rec_model.columns

Index(['Rnd', 'Pick', 'Team', 'Player', 'First Year', 'Age Draft', 'G', 'GS',
       'Tgt', 'Rec', 'Ctch%', 'Yds', 'Y/R', 'TD', '1D', 'Lng', 'Y/Tgt', 'R/G',
       'Y/G', 'DYAR', 'YAR', 'DVOA', 'VOA', 'EYds', 'DPI Pens', 'DPI Yds',
       'Rec Pts First Season', 'Rec Pts Second Season', 'Rec Pts Jump'],
      dtype='object')

In [76]:
# Hold out 20% of the modeling rows as the test set. Both second-season columns
# are removed from the features; the per-game second-season value is the target.
second_season_cols = ['Rec Pts Second Season', 'Rec Pts/G Second Season']
X_temp, X_test, y_temp, y_test = train_test_split(
    rec_model.drop(columns=second_season_cols),
    rec_model['Rec Pts/G Second Season'],
    test_size=0.2,
    random_state=1,
)

In [77]:
# Split the non-test rows again: 20% of them become the validation set, the
# rest is used for training.
X_train, X_valid, y_train, y_valid = train_test_split(
    X_temp,
    y_temp,
    test_size=0.2,
    random_state=1,
)

In [78]:
# Renumber every split from 0 so that features and targets share the same
# positional index after splitting.
X_train, X_valid, X_test = (
    frame.reset_index(drop=True) for frame in (X_train, X_valid, X_test)
)
y_train, y_valid, y_test = (
    target.reset_index(drop=True) for target in (y_train, y_valid, y_test)
)

In [79]:
# --- Feature groups ----------------------------------------------------------
# Draft round is the only column treated as categorical.
cat_feat = ['Rnd']

# Full numeric column lists. Only `small_selection` is wired into the
# preprocessor in this cell; `num_feat` is defined but not used here.
reg_num_feat = [
    'Pick', 'First Year', 'Age Draft', 'G', 'GS',
    'Tgt', 'Rec', 'Ctch%', 'Yds', 'Y/R', 'TD', '1D', 'Lng', 'Y/Tgt', 'R/G',
    'Y/G', 'Rec Pts First Season', 'Rec Pts/G First Season',
]
adv_num_feat = ['DYAR', 'YAR', 'DVOA', 'VOA', 'EYds', 'DPI Pens', 'DPI Yds']
num_feat = reg_num_feat + adv_num_feat

# Reduced numeric subset that the model is actually fitted on.
small_selection = ['Tgt', 'Rec', 'TD', '1D', 'Y/G', 'EYds', 'Rec Pts First Season']

# --- Per-type preprocessing --------------------------------------------------
# Categorical columns are one-hot encoded; numeric columns are z-scaled.
cat_transformer = Pipeline(steps=[('onehot', OneHotEncoder(categories='auto'))])
num_transformer = Pipeline(steps=[('scaler', StandardScaler())])

# Numeric block first, then the one-hot block.
preproc = ColumnTransformer(
    transformers=[
        ('num', num_transformer, small_selection),
        ('cat', cat_transformer, cat_feat),
    ]
)

# --- Model -------------------------------------------------------------------
# NOTE(review): every one-hot round column is kept while LinearRegression is
# left at its defaults; if that includes an intercept, the round dummies are
# collinear with it -- confirm this is intended before interpreting the
# round coefficients.
pl = Pipeline(steps=[('preprocessor', preproc), ('regressor', LinearRegression())])

In [80]:
# Fit the pipeline on the training split, passing only the columns the
# preprocessor was configured with.
train_inputs = X_train[small_selection + cat_feat]
pl.fit(train_inputs, y_train)

Pipeline(memory=None,
         steps=[('preprocessor',
                 ColumnTransformer(n_jobs=None, remainder='drop',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('num',
                                                  Pipeline(memory=None,
                                                           steps=[('scaler',
                                                                   StandardScaler(copy=True,
                                                                                  with_mean=True,
                                                                                  with_std=True))],
                                                           verbose=False),
                                                  ['Tgt', 'Rec', 'TD', '1D',
                                                   'Y/G', 'EYds',
                                              

In [81]:
# Pair every fitted coefficient with the feature it belongs to.
# The one-hot labels are read from the fitted encoder instead of being
# hard-coded: if the training split ever contains a different set of rounds,
# a fixed ['Rnd1', 'Rnd2', 'Rnd3'] list would be mislabelled or silently
# truncated by zip(). Column order follows the ColumnTransformer: the numeric
# block first, then the one-hot block.
onehot = pl.named_steps['preprocessor'].named_transformers_['cat'].named_steps['onehot']
rnd_labels = ['Rnd{}'.format(rnd) for rnd in onehot.categories_[0]]
pd.DataFrame(zip(small_selection + rnd_labels, pl.named_steps['regressor'].coef_))

Unnamed: 0,0,1
0,Tgt,0.160605
1,Rec,-1.995555
2,TD,-0.82585
3,1D,0.001799
4,Y/G,1.200101
5,EYds,0.422311
6,Rec Pts First Season,3.113839
7,Rnd1,0.371373
8,Rnd2,0.061633
9,Rnd3,-0.433006


In [82]:
# Predict on the validation split with exactly the column selection used for
# fitting (small_selection + cat_feat), so the two calls cannot drift apart if
# the categorical feature list changes.
valid_pred = pl.predict(X_valid[small_selection + cat_feat])

In [83]:
# Inspect the raw validation-set predictions.
valid_pred

array([ 8.57464086,  6.05969207,  7.10704278,  2.12696628,  3.61489259,
        8.19348376,  4.95700196,  3.36855409,  6.62750965,  5.50879326,
       11.79225317,  4.50823371,  4.3319437 ,  2.40064563,  2.93272957,
        3.31719971,  2.18832352,  6.56082131,  3.57117278,  8.46258389,
        8.07313058,  7.71542518])

In [84]:
# Build a review table: the validation features plus the true target and the
# model's prediction side by side. assign() returns a new frame, so X_valid
# itself is left untouched.
temp = X_valid.assign(Actual=y_valid, Prediction=valid_pred)

In [93]:
# Absolute prediction error per row. It is computed from temp's own
# 'Prediction' and 'Actual' columns rather than from valid_pred / y_valid, so
# the cell stays correct when re-run after temp has been re-sorted and
# re-indexed (the external arrays would then no longer line up row-for-row).
temp['Error'] = (temp['Prediction'] - temp['Actual']).abs()

In [98]:
# Order rows from the largest to the smallest absolute error and renumber them.
# A single descending sort is enough; the ascending sort that used to precede
# it was immediately overridden.
temp = temp.sort_values('Error', ascending=False).reset_index(drop=True)

In [99]:
# Show the validation rows with their actual value, prediction and error.
temp

Unnamed: 0,Rnd,Pick,Team,Player,First Year,Age Draft,G,GS,Tgt,Rec,...,DVOA,VOA,EYds,DPI Pens,DPI Yds,Rec Pts First Season,Rec Pts/G First Season,Actual,Prediction,Error
0,2,52,GNB,G.Jennings,2006,22,14.0,11.0,104.0,45.0,...,-24.2,-22.1,543.0,2.0,27.0,81.2,5.8,12.615385,7.107043,5.508342
1,3,81,DAL,M.Gallup,2018,22,16.0,8.0,68.0,33.0,...,-14.3,-9.7,415.0,2.0,33.0,62.7,3.91875,10.478571,5.508793,4.969778
2,2,37,NYJ,D.Smith,2015,23,10.0,3.0,28.0,9.0,...,-46.1,-41.8,68.0,0.0,0.0,17.5,1.75,0.5,3.3172,2.8172
3,1,30,TEN,K.Britt,2009,20,16.0,6.0,75.0,42.0,...,11.2,11.2,700.0,1.0,20.0,88.1,5.50625,10.958333,8.193484,2.76485
4,3,79,PIT,M.Wheaton,2013,22,12.0,1.0,13.0,6.0,...,-30.5,-26.8,56.0,0.0,0.0,6.4,0.533333,4.775,2.126966,2.648034
5,1,7,TAM,M.Evans,2014,21,15.0,15.0,122.0,68.0,...,11.4,12.7,1078.0,2.0,48.0,177.1,11.806667,9.24,11.792253,2.552253
6,3,96,DET,K.Golladay,2017,23,11.0,5.0,48.0,28.0,...,21.9,20.7,498.0,1.0,34.0,65.7,5.972727,9.086667,6.62751,2.459157
7,1,13,ARI,M.Floyd,2012,22,16.0,3.0,86.0,45.0,...,-10.3,-15.7,546.0,0.0,0.0,68.2,4.2625,8.38125,6.059692,2.321558
8,3,69,SEA,T.Lockett,2015,22,16.0,8.0,69.0,51.0,...,35.1,33.9,790.0,0.0,0.0,102.4,6.4,4.38,6.560821,2.180821
9,1,30,LAC,C.Davis,2007,21,13.0,1.0,34.0,20.0,...,-1.5,-1.9,273.0,2.0,45.0,24.8,1.907692,1.475,3.614893,2.139893


Thoughts:

- One hot encode round
- use pick straight up
- use year straight up (?)
- use age draft straight up
- use each stat straight up
- MIGHT NEED MORE DATA

Possible more features:

- Target share
- Yard share
- Number of receivers ahead of them for their team
- Number of available targets: targets that left the team minus targets that came into the team (?)
- Receivers drafted in first 2 days by team