<div style="text-align: right"> Tommy Evans-Barton </div>
<div style="text-align: right"> WR Year 2 Jumps </div>

# Data Cleaning Notebook

In [129]:
import os
import sys
import pandas as pd
import numpy as np
import json
import matplotlib.pyplot as plt

In [130]:
# Enable IPython's autoreload extension so edits to the project modules imported
# by this notebook are picked up without restarting the kernel.
# Mode 2 asks autoreload to reload all modules before each execution (see IPython docs).
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [131]:
# Project root used to build every data/source path in this notebook.
# Prefer the shell's PWD (the directory the Jupyter server was launched from),
# but fall back to the kernel's working directory when PWD is not exported
# (e.g. on Windows), instead of failing with a KeyError.
TOP_PATH = os.environ.get('PWD') or os.getcwd()

In [132]:
# Make the project's source directories importable from this notebook.
# The membership check keeps the cell idempotent: re-running it no longer
# appends duplicate entries to sys.path.
for module_dir in (TOP_PATH + '/src', TOP_PATH + '/src/viz'):
    if module_dir not in sys.path:
        sys.path.append(module_dir)

In [133]:
import processing

In [134]:
# Load the three raw CSV exports (drafted receivers, basic receiving stats,
# advanced receiving stats) from the project's raw data directory.
receivers, rec_stats, adv_stats = (
    pd.read_csv(TOP_PATH + raw_csv)
    for raw_csv in (
        '/data/raw/RECEIVERS.csv',
        '/data/raw/REC_STATS.csv',
        '/data/raw/ADV_REC_STATS.csv',
    )
)

## Cleaning Receiver Data
- **Removed Position Column**: This column was not necessary as all players were wide receivers
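- **Altered Player Column**: Reformatted player names from full names (e.g., `Demaryius Thomas`) to the abbreviated `D.Thomas` form used by the advanced stats name column. The abbreviated form is the less granular of the two, so it is the only format both datasets can share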

In [135]:
# Preview the raw drafted-receiver table loaded from RECEIVERS.csv, before any cleaning.
receivers

Unnamed: 0,Rnd,Pick,Tm,Player,Pos,Age,YEAR
0,1,22,DEN,Demaryius Thomas,WR,22,2010
1,1,24,DAL,Dez Bryant,WR,21,2010
2,2,36,KAN,Dexter McCluster,WR,21,2010
3,2,39,TAM,Arrelious Benn,WR,21,2010
4,2,60,SEA,Golden Tate,WR,22,2010
...,...,...,...,...,...,...,...
118,2,64,SEA,D.K. Metcalf,WR,21,2019
119,3,66,PIT,Diontae Johnson,WR,23,2019
120,3,67,SFO,Jalen Hurd,WR,23,2019
121,3,76,WAS,Terry McLaurin,WR,23,2019


In [136]:
# Display the cleaned receiver table returned by processing.clean_receivers().
# The result is only shown here, not assigned to a variable.
# NOTE(review): called with no arguments, so it presumably reads the raw file
# itself rather than using the `receivers` frame above — confirm in src/processing.py.
processing.clean_receivers()

Unnamed: 0,Rnd,Pick,Tm,Player,Age,First Year,Second Year
0,1,22,DEN,D.Thomas,22,2010,2011
1,1,24,DAL,D.Bryant,21,2010,2011
2,2,36,KAN,D.McCluster,21,2010,2011
3,2,39,TAM,A.Benn,21,2010,2011
4,2,60,SEA,G.Tate,22,2010,2011
...,...,...,...,...,...,...,...
118,2,64,SEA,D.Metcalf,21,2019,2020
119,3,66,PIT,D.Johnson,23,2019,2020
120,3,67,SFO,J.Hurd,23,2019,2020
121,3,76,WAS,T.McLaurin,23,2019,2020


## Cleaning Basic Statistics Data

- **Removed Position Column**: This column was not necessary as all players were wide receivers
- **Removed Fumbles Column**: As found in the EDA, fumbles were especially common among players who also play special teams, so this column was removed in order to isolate receiving talent.
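- **Altered Player Column**: Reformatted player names (e.g., `Roddy White*+`) to the abbreviated `R.White` form used by the advanced stats name column. The abbreviated form is the less granular of the two, so it is the only format both datasets can share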
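- **Altered Catch Rate Column**: Stripped the trailing `%` and converted this column to a numeric column for analysis
- **Added Rec Pts Column**: Added a receiving points column; in the rows shown below it equals `Yds / 10 + 6 * TD` (e.g., 1389 yards and 10 TDs give 198.9)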

***Note***: Minimum target requirements were considered for certain 'per target' stats, but were not applied: the low-target entries such a threshold would remove are disproportionately the first and second year players being investigated, so it would greatly affect the data.

In [137]:
# Preview the raw basic receiving stats loaded from REC_STATS.csv, before any cleaning.
rec_stats

Unnamed: 0,Player,Tm,Age,Pos,G,GS,Tgt,Rec,Ctch%,Yds,Y/R,TD,1D,Lng,Y/Tgt,R/G,Y/G,Fmb,YEAR
0,Roddy White*+,ATL,29,WR,16,16,179.0,115,64.2%,1389,12.1,10,73,46,7.8,7.2,86.8,1,2010
1,Reggie Wayne*+,IND,32,WR,16,16,175.0,111,63.4%,1355,12.2,6,72,50,7.7,6.9,84.7,1,2010
2,Santana Moss,WAS,31,WR,16,16,145.0,93,64.1%,1115,12.0,6,61,56,7.7,5.8,69.7,3,2010
3,Larry Fitzgerald*,ARI,27,WR,16,15,173.0,90,52.0%,1137,12.6,6,58,41,6.6,5.6,71.1,0,2010
4,Andre Johnson *,HOU,29,WR,13,13,138.0,86,62.3%,1216,14.1,8,59,60,8.8,6.6,93.5,1,2010
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3369,Jordan Thomas,HOU,23,,5,2,3.0,1,33.3%,8,8.0,0,0,8,2.7,0.2,1.6,0,2019
3370,Eric Tomlinson,3TM,27,,8,3,1.0,1,100.0%,1,1.0,0,0,1,1.0,0.1,0.1,0,2019
3371,John Ursua,SEA,25,,3,0,1.0,1,100.0%,11,11.0,0,1,11,11.0,0.3,3.7,0,2019
3372,Dwayne Washington,NOR,25,,16,0,1.0,1,100.0%,6,6.0,0,0,6,6.0,0.1,0.4,0,2019


In [138]:
# Display the cleaned basic stats table returned by processing.clean_stats().
# The result is only shown here, not assigned to a variable.
# NOTE(review): called with no arguments, so it presumably reads the raw file
# itself rather than using the `rec_stats` frame above — confirm in src/processing.py.
processing.clean_stats()

Unnamed: 0,Player,Tm,Age,G,GS,Tgt,Rec,Ctch%,Yds,Y/R,TD,1D,Lng,Y/Tgt,R/G,Y/G,YEAR,Rec Pts
0,R.White,ATL,29,16,16,179.0,115,64.2,1389,12.1,10,73,46,7.8,7.2,86.8,2010,198.9
1,R.Wayne,IND,32,16,16,175.0,111,63.4,1355,12.2,6,72,50,7.7,6.9,84.7,2010,171.5
2,S.Moss,WAS,31,16,16,145.0,93,64.1,1115,12.0,6,61,56,7.7,5.8,69.7,2010,147.5
3,L.Fitzgerald,ARI,27,16,15,173.0,90,52.0,1137,12.6,6,58,41,6.6,5.6,71.1,2010,149.7
4,A.Johnson,HOU,29,13,13,138.0,86,62.3,1216,14.1,8,59,60,8.8,6.6,93.5,2010,169.6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3369,J.Thomas,HOU,23,5,2,3.0,1,33.3,8,8.0,0,0,8,2.7,0.2,1.6,2019,0.8
3370,E.Tomlinson,3TM,27,8,3,1.0,1,100.0,1,1.0,0,0,1,1.0,0.1,0.1,2019,0.1
3371,J.Ursua,SEA,25,3,0,1.0,1,100.0,11,11.0,0,1,11,11.0,0.3,3.7,2019,1.1
3372,D.Washington,NOR,25,16,0,1.0,1,100.0,6,6.0,0,0,6,6.0,0.1,0.4,2019,0.6


## Cleaning Advanced Statistics Data
- **Altered Player Column**: Made the format consistent with the first two datasets
- **Altered Team Column**: Mapped some alternative team codes to their more traditional versions (e.g., `GB` to `GNB`), and replaced `2TM` designations with missing values for simpler analysis
- **Altered DVOA and VOA Columns**: Reformatted these entries into numeric values
- **Split DPI Column**: Split the DPI column into DPI Penalties and DPI Yards in order to have this information in numeric form

In [139]:
# Preview the raw advanced receiving stats loaded from ADV_REC_STATS.csv, before any cleaning.
adv_stats

Unnamed: 0,Player,Team,DYAR,YAR,DVOA,VOA,EYds,DPI,YEAR
0,M.Wallace,PIT,458,445,49.0%,47.2%,1280,1/38,2010
1,B.Lloyd,DEN,415,402,20.1%,19.1%,1642,8/139,2010
2,G.Jennings,GB,328,327,19.4%,19.4%,1315,2/16,2010
3,R.White,ATL,303,299,8.9%,8.7%,1590,2/27,2010
4,A.Johnson,HOU,281,242,12.9%,9.3%,1309,3/32,2010
...,...,...,...,...,...,...,...,...,...
1564,T.Benjamin,LAC,-84,-87,-78.9%,-81.5%,-21,0/0,2019
1565,Z.Jones,2TM,-91,-92,-38.6%,-39.0%,151,1/9,2019
1566,P.Campbell,IND,-104,-88,-73.4%,-64.4%,-14,0/0,2019
1567,K.Johnson,ARI,-105,-103,-45.6%,-44.8%,105,0/0,2019


In [140]:
# Display the cleaned advanced stats table returned by processing.clean_adv_stats().
# The result is only shown here, not assigned to a variable.
# NOTE(review): called with no arguments, so it presumably reads the raw file
# itself rather than using the `adv_stats` frame above — confirm in src/processing.py.
processing.clean_adv_stats()

Unnamed: 0,Player,Team,DYAR,YAR,DVOA,VOA,EYds,YEAR,DPI Pens,DPI Yds
0,M.Wallace,PIT,458,445,49.0,47.2,1280,2010,1,38
1,B.Lloyd,DEN,415,402,20.1,19.1,1642,2010,8,139
2,G.Jennings,GNB,328,327,19.4,19.4,1315,2010,2,16
3,R.White,ATL,303,299,8.9,8.7,1590,2010,2,27
4,A.Johnson,HOU,281,242,12.9,9.3,1309,2010,3,32
...,...,...,...,...,...,...,...,...,...,...
1564,T.Benjamin,LAC,-84,-87,-78.9,-81.5,-21,2019,0,0
1565,Z.Jones,,-91,-92,-38.6,-39.0,151,2019,1,9
1566,P.Campbell,IND,-104,-88,-73.4,-64.4,-14,2019,0,0
1567,K.Johnson,ARI,-105,-103,-45.6,-44.8,105,2019,0,0


## Combining the Data

In [187]:
# Build the combined dataset and keep it as `df` for the cells that follow.
# NOTE(review): merge_data() takes no arguments here, so it presumably performs
# the cleaning steps internally — confirm in src/processing.py.
df = processing.merge_data()
# Show only the first few rows rather than the whole merged frame.
df.head()

Unnamed: 0,Rnd,Pick,Tm,Player,Age Draft,First Year,Second Year,Age First Season,G,GS,...,1D,Lng,Y/Tgt,R/G,Y/G,YEAR First Season,Rec Pts First Season,YEAR Second Season,Rec Pts Second Season,Rec Pts Jump
0,1,22,DEN,D.Thomas,22,2010,2011,23.0,10.0,2.0,...,15.0,31.0,7.3,2.2,28.3,2010.0,40.3,2011.0,79.1,38.8
1,1,24,DAL,D.Bryant,21,2010,2011,22.0,12.0,2.0,...,23.0,46.0,7.7,3.8,46.8,2010.0,92.1,2011.0,146.8,54.7
2,2,36,KAN,D.McCluster,21,2010,2011,21.0,11.0,7.0,...,11.0,31.0,5.4,1.9,19.0,2010.0,26.9,2011.0,38.8,11.9
3,2,39,TAM,A.Benn,21,2010,2011,22.0,15.0,9.0,...,17.0,64.0,10.4,1.7,26.3,2010.0,51.5,2011.0,62.1,10.6
4,2,60,SEA,G.Tate,22,2010,2011,22.0,11.0,0.0,...,8.0,52.0,5.8,1.9,20.6,2010.0,22.7,2011.0,56.2,33.5


In [188]:
# Transposed view: each column of the merged frame becomes a row, which makes the
# fields elided ('...') in the wide head() display easier to inspect.
df.T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,99,100,101,102,103,104,105,106,107,108
Rnd,1,1,2,2,2,3,3,3,3,3,...,2,2,2,2,2,2,2,3,3,3
Pick,22,24,36,39,60,77,78,82,84,87,...,36,51,56,57,59,62,64,66,76,93
Tm,DEN,DAL,KAN,TAM,SEA,TEN,CAR,PIT,CIN,DEN,...,SFO,TEN,KAN,PHI,IND,ARI,SEA,PIT,WAS,BAL
Player,D.Thomas,D.Bryant,D.McCluster,A.Benn,G.Tate,D.Williams,B.LaFell,E.Sanders,J.Shipley,E.Decker,...,D.Samuel,A.Brown,M.Hardman,J.Arcega-Whiteside,P.Campbell,A.Isabella,D.Metcalf,D.Johnson,T.McLaurin,M.Boykin
Age Draft,22,21,21,21,22,22,23,23,24,23,...,23,22,21,22,22,22,21,23,23,22
First Year,2010,2010,2010,2010,2010,2010,2010,2010,2010,2010,...,2019,2019,2019,2019,2019,2019,2019,2019,2019,2019
Second Year,2011,2011,2011,2011,2011,2011,2011,2011,2011,2011,...,2020,2020,2020,2020,2020,2020,2020,2020,2020,2020
Age First Season,23,22,21,22,22,22,24,23,25,23,...,23,22,21,23,22,23,22,23,24,23
G,10,12,11,15,11,16,14,13,15,14,...,15,16,16,16,7,15,16,16,14,16
GS,2,2,7,9,0,1,2,1,4,0,...,11,11,5,5,3,1,15,12,14,11
