# Web Scraping Additional Data

#### Getting number of players that were drafted to the NBA per team

In [1]:
# Importing required libraries
from urllib.request import urlopen
from bs4 import BeautifulSoup
import pandas as pd

In [658]:
# NBA season we will be analyzing
year = 2018
# URL page we will scraping (see image above)
url = "https://www.basketball-reference.com/draft/NBA_{}.html".format(year)
# this is the HTML from the given URL
html = urlopen(url)
soup = BeautifulSoup(html)

In [659]:
# use findALL() to get the column headers
soup.findAll('tr', limit=2)
# use getText()to extract the text we need into a list
headers = [th.getText() for th in soup.findAll('tr', limit=2)[1].findAll('th')]
# exclude the first column as we will not need the ranking order from Basketball Reference for the analysis
headers = headers[1:]
headers

['Pk',
 'Tm',
 'Player',
 'College',
 'Yrs',
 'G',
 'MP',
 'PTS',
 'TRB',
 'AST',
 'FG%',
 '3P%',
 'FT%',
 'MP',
 'PTS',
 'TRB',
 'AST',
 'WS',
 'WS/48',
 'BPM',
 'VORP']

In [660]:
# avoid the first header row
rows = soup.findAll('tr')[1:]
draft_info = [[td.getText() for td in rows[i].findAll('td')]
            for i in range(len(rows))]

In [661]:
draft = pd.DataFrame(draft_info, columns = headers)
draft = draft[draft['Yrs'].apply(lambda x: str(x).isdigit())]
draft.head()

Unnamed: 0,Pk,Tm,Player,College,Yrs,G,MP,PTS,TRB,AST,...,3P%,FT%,MP.1,PTS.1,TRB.1,AST.1,WS,WS/48,BPM,VORP
1,1,PHO,Deandre Ayton,Arizona,1,71,2183,1159,729,125,...,0.0,0.746,30.7,16.3,10.3,1.8,5.8,0.128,0.2,1.2
2,2,SAC,Marvin Bagley,Duke,1,62,1567,923,471,62,...,0.313,0.691,25.3,14.9,7.6,1.0,3.6,0.11,-1.8,0.1
3,3,ATL,Luka Dončić,,1,72,2318,1526,563,429,...,0.327,0.713,32.2,21.2,7.8,6.0,4.9,0.101,4.1,3.6
4,4,MEM,Jaren Jackson,Michigan State,1,58,1515,798,272,64,...,0.359,0.766,26.1,13.8,4.7,1.1,3.3,0.105,0.1,0.8
5,5,DAL,Trae Young,Oklahoma,1,81,2503,1549,301,653,...,0.324,0.829,30.9,19.1,3.7,8.1,3.3,0.062,-1.1,0.6


In [662]:
draft.drop(['Tm', 'Player','Yrs','G','MP','PTS','TRB','AST','FG%','3P%','FT%','MP','PTS','TRB',
 'AST','WS','WS/48','BPM','VORP'], inplace = True, axis = 1)
draft.head()

Unnamed: 0,Pk,College
1,1,Arizona
2,2,Duke
3,3,
4,4,Michigan State
5,5,Oklahoma


In [663]:
draft['Season'] = 2018
draft.head()

Unnamed: 0,Pk,College,Season
1,1,Arizona,2018
2,2,Duke,2018
3,3,,2018
4,4,Michigan State,2018
5,5,Oklahoma,2018


In [664]:
# Get number of players that were drafted from each team
draft['num_drafted'] = draft.groupby('College')['College'].transform('count')

In [665]:
draft.head()

Unnamed: 0,Pk,College,Season,num_drafted
1,1,Arizona,2018,1
2,2,Duke,2018,4
3,3,,2018,7
4,4,Michigan State,2018,2
5,5,Oklahoma,2018,1


In [666]:
draft['best_college_player'] = 0
draft.head()

Unnamed: 0,Pk,College,Season,num_drafted,best_college_player
1,1,Arizona,2018,1,0
2,2,Duke,2018,4,0
3,3,,2018,7,0
4,4,Michigan State,2018,2,0
5,5,Oklahoma,2018,1,0


In [667]:
# Remove any rows where players did not play college basketball (internation/highschool players)
import numpy as np
draft['College'].replace('', np.nan, inplace=True)
draft.dropna(subset=['College'], inplace=True)
draft.head()

Unnamed: 0,Pk,College,Season,num_drafted,best_college_player
1,1,Arizona,2018,1,0
2,2,Duke,2018,4,0
4,4,Michigan State,2018,2,0
5,5,Oklahoma,2018,1,0
6,6,Texas,2018,1,0


In [668]:
draft.loc[1,'best_college_player']=1
draft.head()

Unnamed: 0,Pk,College,Season,num_drafted,best_college_player
1,1,Arizona,2018,1,1
2,2,Duke,2018,4,0
4,4,Michigan State,2018,2,0
5,5,Oklahoma,2018,1,0
6,6,Texas,2018,1,0


In [669]:
draft.to_pickle('draft2018.pkl')