# Scraping and cleaning NBA Rookies data from 2019

## Initial scraping and cleaning

In [1]:
# important imports for data processing & visualization
import numpy as np
import pandas as pd

# imports for Web Scraping
from bs4 import BeautifulSoup  # HTML data structure
from urllib.request import urlopen  # Web client

In [2]:
page_url = 'https://www.basketball-reference.com/draft/NBA_2019.html'

# Download the draft page. The context manager closes the connection even if
# reading the response fails part-way through.
with urlopen(page_url) as uClient:
    page_html = uClient.read()

# Parse the HTML into a soup tree that can be searched by tag. The parser is
# named explicitly so the result does not depend on which optional parsers are
# installed (and bs4 does not warn about having to guess one).
page_soup = BeautifulSoup(page_html, 'html.parser')

# First three <tr> rows, kept for inspecting where the column headers live.
headerRowHTML = page_soup.findAll('tr', limit=3)

Based on the rows collected in `headerRowHTML` above, each 'tr' tag contains the 'th' tags where the name of each column is stored. We take index [1] (the second 'tr'), and we have to pick it explicitly because the 'tr' tag applies to every row, not just the header row.

In [3]:
# The second <tr> on the page carries the column names in its <th> cells.
header_cells = page_soup.findAll('tr', limit=2)[1].findAll('th')

# Collect the text of each header cell, skipping the first one: the ranking
# order from Basketball Reference is not needed for the analysis.
headers = [cell.getText() for cell in header_cells][1:]

Now we're going to run the same extraction for every remaining row of the data table (excluding the first two 'tr' rows, which we have already handled as the header).

In [4]:
# Every <tr> after the first two is treated as one rookie; the individual
# stats sit in that row's <td> cells (html syntax).
# Result: a list of lists, one inner list of cell texts per table row.
# A row without any <td> cell yields an empty inner list.
playerRowHTML = page_soup.findAll('tr')[2:]
playerRows = [[cell.getText() for cell in row.findAll('td')] for row in playerRowHTML]

In [5]:
# Show every column when a frame is displayed.
pd.options.display.max_columns = None

# Build the raw dataframe from the scraped rows and preview it.
rookiesRaw = pd.DataFrame(data=playerRows, columns=headers)
rookiesRaw.head()

Unnamed: 0,Pk,Tm,Player,College,Yrs,G,MP,PTS,TRB,AST,FG%,3P%,FT%,MP.1,PTS.1,TRB.1,AST.1,WS,WS/48,BPM,VORP
0,1,NOP,Zion Williamson,Duke,1,19,565,448,129,41,0.589,0.462,0.645,29.7,23.6,6.8,2.2,1.8,0.149,2.4,0.6
1,2,MEM,Ja Morant,Murray State,1,59,1771,1041,208,409,0.491,0.367,0.77,30.0,17.6,3.5,6.9,3.4,0.093,0.4,1.1
2,3,NYK,RJ Barrett,Duke,1,56,1704,803,279,143,0.402,0.32,0.614,30.4,14.3,5.0,2.6,-0.5,-0.015,-4.3,-1.0
3,4,LAL,De'Andre Hunter,Virginia,1,63,2018,778,286,112,0.41,0.355,0.764,32.0,12.3,4.5,1.8,0.1,0.001,-4.7,-1.4
4,5,CLE,Darius Garland,Vanderbilt,1,59,1824,728,111,229,0.401,0.355,0.875,30.9,12.3,1.9,3.9,-1.3,-0.035,-5.6,-1.7


In [6]:
# Player names exactly as scraped, before any cleaning
# (will be useful for duplicate removal later).
namesBefore = rookiesRaw['Player'].tolist()

# Work on an independent copy. A plain `rookies = rookiesRaw` only creates a
# second name for the same object, so the in-place cleaning steps that follow
# would silently rewrite the raw frame as well.
rookies = rookiesRaw.copy()
rookies.describe()  # summary of the (still string-typed) columns

Unnamed: 0,Pk,Tm,Player,College,Yrs,G,MP,PTS,TRB,AST,FG%,3P%,FT%,MP.1,PTS.1,TRB.1,AST.1,WS,WS/48,BPM,VORP
count,60,60,60,60.0,60,60.0,60.0,60.0,60.0,60.0,60.0,60.0,60.0,60.0,60.0,60.0,60.0,60.0,60.0,60.0,60.0
unique,60,28,60,38.0,2,33.0,50.0,47.0,47.0,39.0,44.0,42.0,42.0,50.0,42.0,34.0,28.0,26.0,53.0,44.0,19.0
top,39,PHI,Justin Wright-Foreman,,1,,,,,,,,,,,,0.5,0.0,,,0.0
freq,1,5,1,8.0,53,7.0,7.0,7.0,7.0,7.0,9.0,11.0,17.0,7.0,7.0,7.0,7.0,8.0,7.0,7.0,13.0


## Clean Dataset (Missing Values, Column Names, etc.)

Rename the columns so that the season totals and the per-game statistics no longer share the same names

In [7]:
# New labels, assigned by position. Aggregated (season-total) statistics get a
# leading 'T' (games become 'GP'); total rebounds keep the label 'TRB', and the
# second rebounds column becomes 'RB'.
headers = [
    'Pk', 'Tm', 'Player', 'College', 'Yrs',
    'GP', 'TMP', 'TPTS', 'TRB', 'TAST',
    'FG%', '3P%', 'FT%',
    'MP', 'PTS', 'RB', 'AST',
    'WS', 'WS/48', 'BPM', 'VORP',
]
rookies.columns = headers

In [8]:
# Preview the first five rows to check the renamed columns.
rookies.head(n=5)

Unnamed: 0,Pk,Tm,Player,College,Yrs,GP,TMP,TPTS,TRB,TAST,FG%,3P%,FT%,MP,PTS,RB,AST,WS,WS/48,BPM,VORP
0,1,NOP,Zion Williamson,Duke,1,19,565,448,129,41,0.589,0.462,0.645,29.7,23.6,6.8,2.2,1.8,0.149,2.4,0.6
1,2,MEM,Ja Morant,Murray State,1,59,1771,1041,208,409,0.491,0.367,0.77,30.0,17.6,3.5,6.9,3.4,0.093,0.4,1.1
2,3,NYK,RJ Barrett,Duke,1,56,1704,803,279,143,0.402,0.32,0.614,30.4,14.3,5.0,2.6,-0.5,-0.015,-4.3,-1.0
3,4,LAL,De'Andre Hunter,Virginia,1,63,2018,778,286,112,0.41,0.355,0.764,32.0,12.3,4.5,1.8,0.1,0.001,-4.7,-1.4
4,5,CLE,Darius Garland,Vanderbilt,1,59,1824,728,111,229,0.401,0.355,0.875,30.9,12.3,1.9,3.9,-1.3,-0.035,-5.6,-1.7


Set Draft Pick as Index and Drop 'Yrs' Column


In [9]:
# Use the draft pick number as the row index (modifies `rookies` in place;
# the 'Pk' column is moved into the index).
rookies.set_index(keys='Pk', drop=True, inplace=True)

In [10]:
# Remove the 'Yrs' column (modifies `rookies` in place).
rookies.drop(columns='Yrs', inplace=True)

Cast to Numbers

In [11]:
# Text columns that stay as strings; every other column is a statistic.
categoricalVals = ['Tm', 'Player', 'College']
allVals = rookies.columns

# Keep the frame's own column order. A set difference would give a
# hash-dependent order, so the printed list would change between kernel runs.
numericalVals = [col for col in allVals if col not in categoricalVals]
print(numericalVals)

# The scraped cell values are text; convert each statistic column to a
# (downcast) float column.
for val in numericalVals:
    rookies[val] = pd.to_numeric(rookies[val], downcast='float')

['VORP', 'FG%', 'AST', 'TMP', 'BPM', 'TPTS', 'FT%', 'RB', 'PTS', 'WS/48', 'TRB', 'MP', 'WS', 'GP', '3P%', 'TAST']


Check for Missing Values

In [12]:
# given the null values present, we must get rid of them (remember earlier function from module 1)

def assess_NA(data):
    """Summarise the missing values of a dataframe, column by column.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.DataFrame
        One row per column of `data`, with 'Number of NA' (count of null
        values) and 'Percent NA' (that count as a percentage of the number of
        rows, rounded to 2 decimals). Both are sorted in descending order
        before being combined.
    """
    na_counts = data.isnull().sum()
    na_percent = (na_counts / len(data.index) * 100).round(2)

    # Sort each measure from most to least missing, then place them side by side.
    summary_parts = {
        'Number of NA': na_counts.sort_values(ascending=False),
        'Percent NA': na_percent.sort_values(ascending=False),
    }
    return pd.concat(
        list(summary_parts.values()),
        axis=1,
        keys=list(summary_parts.keys()),
    )
# Missing-value summary of the rookies frame before any rows are removed.
print('Before Dropping any Rows:')
assess = assess_NA(data=rookies)
assess

Before Dropping any Rows:


Unnamed: 0,Number of NA,Percent NA
FT%,19,30.65
3P%,13,20.97
FG%,11,17.74
BPM,9,14.52
GP,9,14.52
TMP,9,14.52
TPTS,9,14.52
TRB,9,14.52
TAST,9,14.52
VORP,9,14.52


In [13]:
# Drop every row that has at least one missing value (modifies `rookies` in
# place). `thresh` is deliberately not passed: recent pandas versions raise
# TypeError when `how` and `thresh` are both given, even as `thresh=None`.
rookies.dropna(axis=0, how='any', inplace=True)
rookies.describe()

Unnamed: 0,GP,TMP,TPTS,TRB,TAST,FG%,3P%,FT%,MP,PTS,RB,AST,WS,WS/48,BPM,VORP
count,42.0,42.0,42.0,42.0,42.0,42.0,42.0,42.0,42.0,42.0,42.0,42.0,42.0,42.0,42.0,42.0
mean,39.452381,801.214294,329.976196,125.071426,66.690475,0.431619,0.281738,0.73231,17.388094,7.192857,2.816667,1.407143,0.669048,0.046333,-3.178571,-0.164286
std,19.5201,616.678406,292.992432,96.251038,75.844666,0.090713,0.107272,0.132521,8.776637,5.112844,1.591428,1.200689,1.178359,0.064483,3.913425,0.606397
min,4.0,11.0,4.0,5.0,0.0,0.2,0.0,0.462,2.8,1.0,0.6,0.0,-1.3,-0.109,-21.299999,-1.7
25%,21.0,276.5,72.5,33.25,16.0,0.3855,0.23475,0.64525,10.125,3.5,1.5,0.525,0.025,0.0015,-4.525,-0.3
50%,47.5,733.5,238.0,114.5,46.0,0.4155,0.2985,0.7385,17.1,4.85,2.8,1.15,0.25,0.0505,-3.15,-0.1
75%,56.75,1259.75,572.0,205.5,94.0,0.47675,0.35475,0.8015,25.324999,11.625,3.7,1.875,0.975,0.08025,-1.075,0.0
max,65.0,2018.0,1041.0,316.0,409.0,0.66,0.462,1.0,32.0,23.6,6.8,6.9,4.4,0.192,3.5,1.5


In [14]:
# Re-run the missing-value summary to confirm no nulls remain.
assess = assess_NA(data=rookies)
assess

Unnamed: 0,Number of NA,Percent NA
VORP,0,0.0
FG%,0,0.0
Player,0,0.0
College,0,0.0
GP,0,0.0
TMP,0,0.0
TPTS,0,0.0
TRB,0,0.0
TAST,0,0.0
3P%,0,0.0


Join Position Data to the Rookies Dataset

In [15]:
# Load the 2020 player totals file; its 'Pos' column is the source of each
# rookie's position below. pandas is already imported as `pd` in the
# notebook's import cell, so it is not re-imported here.
allplayers = pd.read_csv('./data/NBAplayerTotals2020.csv')
allplayers.head()

Unnamed: 0,Player,Pos,Age,Tm,G,GS,MP,FG,FGA,FG%,3P,3PA,3P%,2P,2PA,2P%,eFG%,FT,FTA,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS,TS%,PER,VA,EWA
0,Steven Adams,C,26.0,OKC,58.0,58.0,1564.0,262.0,443.0,0.591,1.0,3.0,0.333,261.0,440.0,0.593,0.593,108.0,183.0,0.59,196.0,347.0,543.0,141.0,50.0,65.0,86.0,111.0,633.0,0.6,19.4,205.42,6.8
1,Bam Adebayo,PF,22.0,MIA,65.0,65.0,2235.0,408.0,719.0,0.567,1.0,13.0,0.077,407.0,706.0,0.576,0.568,236.0,342.0,0.69,165.0,518.0,683.0,333.0,78.0,85.0,185.0,164.0,1053.0,0.61,18.42,230.84,7.7
2,LaMarcus Aldridge,C,34.0,SAS,53.0,53.0,1754.0,391.0,793.0,0.493,61.0,157.0,0.389,330.0,636.0,0.519,0.532,158.0,191.0,0.827,103.0,289.0,392.0,129.0,36.0,87.0,74.0,128.0,1001.0,0.57,18.39,203.94,6.8
3,Nickeil Alexander-Walker,SG,21.0,NOP,41.0,0.0,501.0,77.0,227.0,0.339,40.0,117.0,0.342,37.0,110.0,0.336,0.427,17.0,28.0,0.607,8.0,72.0,80.0,74.0,11.0,7.0,40.0,46.0,211.0,0.44,8.87,-12.19,-0.4
4,Grayson Allen,SG,24.0,MEM,30.0,0.0,498.0,79.0,176.0,0.449,33.0,91.0,0.363,46.0,85.0,0.541,0.543,30.0,35.0,0.857,5.0,61.0,66.0,43.0,6.0,1.0,23.0,36.0,221.0,0.58,11.41,6.76,0.2


In [16]:
# Keep only the join key ('Player') and the two columns to carry over.
position_cols = ['Player', 'Pos', 'eFG%']
positions = allplayers[position_cols]
positions.head()

Unnamed: 0,Player,Pos,eFG%
0,Steven Adams,C,0.593
1,Bam Adebayo,PF,0.568
2,LaMarcus Aldridge,C,0.532
3,Nickeil Alexander-Walker,SG,0.427
4,Grayson Allen,SG,0.543


In [17]:
# Attach position and eFG% to each rookie by player name. This is an inner
# join (the pd.merge default): rookies without a match in `positions` are
# dropped, and the row index of `rookies` is not carried over.
# Columns that come from `positions` are removed from `rookies` first, so
# re-running this cell repeats the same join instead of creating
# suffixed duplicates (Pos_x / Pos_y).
joined_cols = [col for col in positions.columns if col != 'Player']
rookies = pd.merge(
    rookies.drop(columns=joined_cols, errors='ignore'),
    positions,
    on='Player',
)

We were able to perform some initial EDA with the metrics above, but wanted to add more variables to our analysis of rookies. 

## Adding More Features (Scraping)

In [18]:
page_url = 'https://www.basketball-reference.com/leagues/NBA_2020_rookies-season-stats.html'

# Download the rookies season-stats page. The context manager closes the
# connection even if reading the response fails part-way through.
with urlopen(page_url) as uClient:
    page_html = uClient.read()

# Parse the HTML into a soup tree that can be searched by tag. The parser is
# named explicitly so the result does not depend on which optional parsers are
# installed (and bs4 does not warn about having to guess one).
page_soup = BeautifulSoup(page_html, 'html.parser')

# First three <tr> rows, kept for inspecting where the column headers live.
headerRowHTML = page_soup.findAll('tr', limit=3)


In [19]:
# The second <tr> on the page carries the column names in its <th> cells.
header_cells = page_soup.findAll('tr', limit=2)[1].findAll('th')

# Collect the text of each header cell, skipping the first one: the ranking
# order from Basketball Reference is not needed for the analysis.
headers = [cell.getText() for cell in header_cells][1:]

In [20]:
# Every <tr> after the first two is treated as one rookie; the individual
# stats sit in that row's <td> cells (html syntax).
# Result: a list of lists, one inner list of cell texts per table row.
# A row without any <td> cell yields an empty inner list.
playerRowHTML = page_soup.findAll('tr')[2:]
playerRows = [[cell.getText() for cell in row.findAll('td')] for row in playerRowHTML]

In [21]:
# Show every column when a frame is displayed.
pd.options.display.max_columns = None

# Build the raw dataframe of additional rookie stats and preview it.
updateRaw = pd.DataFrame(data=playerRows, columns=headers)
updateRaw.head()

Unnamed: 0,Player,Debut,Age,Yrs,G,MP,FG,FGA,3P,3PA,FT,FTA,ORB,TRB,AST,STL,BLK,TOV,PF,PTS,FG%,3P%,FT%,MP.1,PTS.1,TRB.1,AST.1
0,Nickeil Alexander-Walker,"Oct 22, '19, NOP @ TOR",21,1,41,501,77,227,40,117,17,28,8,80,74,11,7,40,46,211,0.339,0.342,0.607,12.2,5.1,2.0,1.8
1,RJ Barrett,"Oct 23, '19, NYK @ SAS",19,1,56,1704,292,727,63,197,156,254,50,279,143,55,17,124,122,803,0.402,0.32,0.614,30.4,14.3,5.0,2.6
2,Darius Bazley,"Oct 23, '19, OKC @ UTA",19,1,53,909,88,230,30,100,32,47,21,196,29,20,35,40,45,238,0.383,0.3,0.681,17.2,4.5,3.7,0.5
3,Goga Bitadze,"Oct 26, '19, IND @ CLE",20,1,49,410,61,129,6,36,24,33,22,96,21,6,32,25,57,152,0.473,0.167,0.727,8.4,3.1,2.0,0.4
4,Marques Bolden,"Feb 1, '20, CLE vs. GSW",21,1,1,3,0,0,0,0,0,0,0,2,0,1,0,0,1,0,,,,3.0,0.0,2.0,0.0


In [22]:
# List the columns currently in the rookies frame, as a reference for which
# columns of the new table are not needed (they are dropped in the next cell).
rookies.columns.tolist()


['Tm',
 'Player',
 'College',
 'GP',
 'TMP',
 'TPTS',
 'TRB',
 'TAST',
 'FG%',
 '3P%',
 'FT%',
 'MP',
 'PTS',
 'RB',
 'AST',
 'WS',
 'WS/48',
 'BPM',
 'VORP',
 'Pos',
 'eFG%']

In [23]:
# Remove the columns we do not need from the new table (modifies updateRaw in
# place). A label that occurs more than once in the header is removed
# everywhere it occurs.
unneeded_cols = ['G', 'MP', 'FG%', '3P%', 'PTS', 'TRB', 'AST', 'FT%', 'Yrs']
updateRaw.drop(columns=unneeded_cols, inplace=True)
updateRaw.head()

Unnamed: 0,Player,Debut,Age,FG,FGA,3P,3PA,FT,FTA,ORB,STL,BLK,TOV,PF
0,Nickeil Alexander-Walker,"Oct 22, '19, NOP @ TOR",21,77,227,40,117,17,28,8,11,7,40,46
1,RJ Barrett,"Oct 23, '19, NYK @ SAS",19,292,727,63,197,156,254,50,55,17,124,122
2,Darius Bazley,"Oct 23, '19, OKC @ UTA",19,88,230,30,100,32,47,21,20,35,40,45
3,Goga Bitadze,"Oct 26, '19, IND @ CLE",20,61,129,6,36,24,33,22,6,32,25,57
4,Marques Bolden,"Feb 1, '20, CLE vs. GSW",21,0,0,0,0,0,0,0,1,0,0,1


In [24]:
# List of player names (will be useful for duplicate removal later).
# NOTE(review): this is rebuilt from rookiesRaw, not from updateRaw — possibly
# updateRaw['Player'] was intended; confirm before relying on namesBefore.
namesBefore = rookiesRaw['Player'].tolist()

# Work on an independent copy. A plain `update = updateRaw` only creates a
# second name for the same object, so the in-place cleaning steps that follow
# would silently rewrite the raw frame as well.
update = updateRaw.copy()
update.describe()  # summary of the (still string-typed) columns

Unnamed: 0,Player,Debut,Age,FG,FGA,3P,3PA,FT,FTA,ORB,STL,BLK,TOV,PF
count,115,115,115,115,115,115,115,115,115,115,115,115,115,115
unique,115,90,10,66,81,43,59,47,54,42,37,32,47,63
top,Nickeil Alexander-Walker,"Oct 23, '19, WAS @ DAL",22,0,5,0,1,0,0,0,0,0,0,0
freq,1,4,29,14,8,39,12,27,24,28,26,33,19,12


In [25]:
# Text columns that stay as strings; every other column is a statistic.
categoricalVals = ['Player', 'Debut']
allVals = update.columns

# Keep the frame's own column order; a set difference would give a
# hash-dependent order that changes between kernel runs.
numericalVals = [col for col in allVals if col not in categoricalVals]

# The scraped cell values are text; convert each statistic column to a
# (downcast) float column.
for val in numericalVals:
    update[val] = pd.to_numeric(update[val], downcast='float')

In [26]:
# Missing-value summary of the update frame before any rows are removed.
print('Before Dropping any Rows:')
assess = assess_NA(data=update)
assess

Before Dropping any Rows:


Unnamed: 0,Number of NA,Percent NA
PF,10,8.0
TOV,10,8.0
BLK,10,8.0
STL,10,8.0
ORB,10,8.0
FTA,10,8.0
FT,10,8.0
3PA,10,8.0
3P,10,8.0
FGA,10,8.0


In [27]:
# Drop every row that has at least one missing value (modifies `update` in
# place). `thresh` is deliberately not passed: recent pandas versions raise
# TypeError when `how` and `thresh` are both given, even as `thresh=None`.
update.dropna(axis=0, how='any', inplace=True)
update.describe()

Unnamed: 0,Age,FG,FGA,3P,3PA,FT,FTA,ORB,STL,BLK,TOV,PF
count,115.0,115.0,115.0,115.0,115.0,115.0,115.0,115.0,115.0,115.0,115.0,115.0
mean,22.060869,62.878262,142.834778,18.121738,54.330433,24.0,33.347828,15.66087,12.513043,7.652174,23.173914,38.626087
std,2.005637,92.156822,208.787979,30.150543,84.853821,37.605289,52.183891,20.979893,17.271498,11.816613,35.216438,45.897572
min,19.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,20.5,2.5,7.5,0.0,2.0,1.0,2.0,1.0,1.0,0.0,1.0,2.5
50%,22.0,19.0,36.0,3.0,11.0,7.0,10.0,6.0,4.0,2.0,7.0,16.0
75%,23.0,87.5,195.5,21.5,75.5,31.5,45.0,21.5,19.5,9.0,33.0,72.0
max,29.0,393.0,850.0,133.0,376.0,204.0,265.0,82.0,80.0,56.0,191.0,182.0


In [28]:
# Re-run the missing-value summary to confirm no nulls remain.
assess = assess_NA(data=update)
assess

Unnamed: 0,Number of NA,Percent NA
PF,0,0.0
TOV,0,0.0
BLK,0,0.0
STL,0,0.0
ORB,0,0.0
FTA,0,0.0
FT,0,0.0
3PA,0,0.0
3P,0,0.0
FGA,0,0.0


## Join the New Dataset to the Original

In [29]:
# Add the extra statistics to each rookie by player name. This is an inner
# join (the pd.merge default): rookies without a match in `update` are dropped.
# Columns that come from `update` are removed from `rookies` first, so
# re-running this cell repeats the same join instead of creating
# suffixed duplicates (FG_x / FG_y, ...).
joined_cols = [col for col in update.columns if col != 'Player']
rookies = pd.merge(
    rookies.drop(columns=joined_cols, errors='ignore'),
    update,
    on='Player',
)
rookies.head()

Unnamed: 0,Tm,Player,College,GP,TMP,TPTS,TRB,TAST,FG%,3P%,FT%,MP,PTS,RB,AST,WS,WS/48,BPM,VORP,Pos,eFG%,Debut,Age,FG,FGA,3P,3PA,FT,FTA,ORB,STL,BLK,TOV,PF
0,NOP,Zion Williamson,Duke,19.0,565.0,448.0,129.0,41.0,0.589,0.462,0.645,29.700001,23.6,6.8,2.2,1.8,0.149,2.4,0.6,PF,0.599,"Jan 22, '20, NOP vs. SAS",19.0,172.0,292.0,6.0,13.0,98.0,152.0,55.0,16.0,9.0,51.0,32.0
1,MEM,Ja Morant,Murray State,59.0,1771.0,1041.0,208.0,409.0,0.491,0.367,0.77,30.0,17.6,3.5,6.9,3.4,0.093,0.4,1.1,PG,0.523,"Oct 23, '19, MEM @ MIA",20.0,393.0,800.0,51.0,139.0,204.0,265.0,44.0,54.0,15.0,191.0,90.0
2,NYK,RJ Barrett,Duke,56.0,1704.0,803.0,279.0,143.0,0.402,0.32,0.614,30.4,14.3,5.0,2.6,-0.5,-0.015,-4.3,-1.0,SG,0.445,"Oct 23, '19, NYK @ SAS",19.0,292.0,727.0,63.0,197.0,156.0,254.0,50.0,55.0,17.0,124.0,122.0
3,LAL,De'Andre Hunter,Virginia,63.0,2018.0,778.0,286.0,112.0,0.41,0.355,0.764,32.0,12.3,4.5,1.8,0.1,0.001,-4.7,-1.4,SF,0.489,"Oct 24, '19, ATL @ DET",22.0,280.0,683.0,108.0,304.0,110.0,144.0,44.0,44.0,18.0,103.0,182.0
4,CLE,Darius Garland,Vanderbilt,59.0,1824.0,728.0,111.0,229.0,0.401,0.355,0.875,30.9,12.3,1.9,3.9,-1.3,-0.035,-5.6,-1.7,PG,0.476,"Oct 23, '19, CLE @ ORL",20.0,280.0,699.0,105.0,296.0,63.0,72.0,27.0,42.0,4.0,151.0,92.0


In [30]:
# Columns kept for the final data set, in display order
# ('eFG%', 'Debut' and 'Age' are not carried forward).
final_cols = [
    'Player', 'College', 'Pos', 'Tm',
    'GP', 'TMP', 'TPTS', 'TRB', 'ORB', 'TAST',
    'FG', 'FGA', 'FG%', '3P', '3PA', '3P%', 'FT', 'FTA', 'FT%',
    'STL', 'BLK', 'TOV', 'PF',
    'MP', 'PTS', 'RB', 'AST',
    'WS', 'WS/48', 'BPM', 'VORP',
]

# .copy() makes the selection an independent frame, so the derived columns
# assigned to `rookies` afterwards do not trigger SettingWithCopyWarning.
rookies = rookies[final_cols].copy()
rookies.head()

Unnamed: 0,Player,College,Pos,Tm,GP,TMP,TPTS,TRB,ORB,TAST,FG,FGA,FG%,3P,3PA,3P%,FT,FTA,FT%,STL,BLK,TOV,PF,MP,PTS,RB,AST,WS,WS/48,BPM,VORP
0,Zion Williamson,Duke,PF,NOP,19.0,565.0,448.0,129.0,55.0,41.0,172.0,292.0,0.589,6.0,13.0,0.462,98.0,152.0,0.645,16.0,9.0,51.0,32.0,29.700001,23.6,6.8,2.2,1.8,0.149,2.4,0.6
1,Ja Morant,Murray State,PG,MEM,59.0,1771.0,1041.0,208.0,44.0,409.0,393.0,800.0,0.491,51.0,139.0,0.367,204.0,265.0,0.77,54.0,15.0,191.0,90.0,30.0,17.6,3.5,6.9,3.4,0.093,0.4,1.1
2,RJ Barrett,Duke,SG,NYK,56.0,1704.0,803.0,279.0,50.0,143.0,292.0,727.0,0.402,63.0,197.0,0.32,156.0,254.0,0.614,55.0,17.0,124.0,122.0,30.4,14.3,5.0,2.6,-0.5,-0.015,-4.3,-1.0
3,De'Andre Hunter,Virginia,SF,LAL,63.0,2018.0,778.0,286.0,44.0,112.0,280.0,683.0,0.41,108.0,304.0,0.355,110.0,144.0,0.764,44.0,18.0,103.0,182.0,32.0,12.3,4.5,1.8,0.1,0.001,-4.7,-1.4
4,Darius Garland,Vanderbilt,PG,CLE,59.0,1824.0,728.0,111.0,27.0,229.0,280.0,699.0,0.401,105.0,296.0,0.355,63.0,72.0,0.875,42.0,4.0,151.0,92.0,30.9,12.3,1.9,3.9,-1.3,-0.035,-5.6,-1.7


### Adding True Shooting %, Player Efficiency Rating, Value Added, and Estimated Wins Added

In [31]:
# True shooting % = points / (2 * (FGA + 0.44 * FTA)).
# FGA and FTA are season totals, so the numerator must be total points
# ('TPTS'); in this frame 'PTS' is the per-game average, which made the
# ratio far too small.
playersTS = rookies['TPTS'] / (2 * (rookies['FGA'] + 0.44 * rookies['FTA']))
rookies['TS%'] = playersTS.round(decimals=2)

In [32]:
# Weights of the linear PER formula, keyed by box-score statistic.
PER_coeff = {
    "FG": 85.910,
    "STL": 53.897,
    "3P": 51.757,
    "FT": 46.845,
    "BLK": 39.190,
    "ORB": 39.190,
    "AST": 34.677,
    "DRB": 14.707,
    "PF": -17.174,
    "FTmiss": -20.091,
    "FGmiss": -39.190,
    "TOV": -53.897
}

# adding defensive rebounds column (total rebounds minus offensive rebounds)
rookies["DRB"] = rookies["TRB"] - rookies["ORB"]

# One season-total input per weight. Every input has to be a total because the
# weighted sum is divided by total minutes below; in this frame total assists
# are 'TAST' ('AST' is the per-game average).
per_inputs = {
    "FG": rookies["FG"],
    "STL": rookies["STL"],
    "3P": rookies["3P"],
    "FT": rookies["FT"],
    "BLK": rookies["BLK"],
    "ORB": rookies["ORB"],
    "AST": rookies["TAST"],
    "DRB": rookies["DRB"],
    "PF": rookies["PF"],
    "FTmiss": rookies["FTA"] - rookies["FT"],
    "FGmiss": rookies["FGA"] - rookies["FG"],
    "TOV": rookies["TOV"],
}

# Guard against a weight being left out of the sum (the made-free-throw term
# 'FT' had a weight defined but was missing from the original sum).
missing_terms = set(PER_coeff) - set(per_inputs)
if missing_terms:
    raise KeyError("PER weights without an input column: {}".format(sorted(missing_terms)))

playerER = sum(per_inputs[stat] * weight for stat, weight in PER_coeff.items())

# Scale by total minutes played ('TMP'); 'MP' is minutes per game.
playerER = playerER / rookies["TMP"]

playerER = playerER.round(decimals=2)

rookies['PER'] = playerER

In [33]:
# Replacement-level PER by position, used in the Value Added formula.
VA_coeff = {
    'PF':11.5,
    'PG':11.0,
    'C':10.6,
    'SF':10.5,
    'SG':10.5
}

# Look up each rookie's replacement level from the position. A position with
# no entry (or a missing position) is reported explicitly, with the same
# exception type a plain dictionary lookup would raise.
PRL = rookies['Pos'].map(VA_coeff)
unknown_pos = rookies.loc[PRL.isna(), 'Pos'].unique().tolist()
if unknown_pos:
    raise KeyError("No replacement level defined for position(s): {}".format(unknown_pos))

# Value Added = minutes * (PER - replacement level) / 67.
# Minutes must be the season total ('TMP'); 'MP' is minutes per game.
playerVA = (rookies['TMP'] * (rookies['PER'] - PRL)) / 67
playerVA = playerVA.round(decimals=2)

# Estimated Wins Added = Value Added / 30.
playerEWA = playerVA/30
playerEWA = playerEWA.round(decimals=1)

rookies['VA'] = playerVA
rookies['EWA'] = playerEWA

Now that the data set is complete, we can export it as a CSV to be used in our model file

In [112]:
# Write the finished data set to CSV: header row included, row index omitted.
out_filename = "NBArookies2019.csv"
rookies.to_csv(path_or_buf=out_filename, header=True, index=False)