# Scraping and cleaning NBA Team data for 2020

In [12]:
# important imports for data processing
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import plotly.express as px # library for interactive plots

# imports for Web Scraping
from bs4 import BeautifulSoup  # HTML data structure
from urllib.request import urlopen  # Web client

from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager

pd.set_option('display.max_columns', None)

In [13]:
wd = webdriver.Chrome(ChromeDriverManager().install())

[WDM] - Current google-chrome version is 83.0.4103
[WDM] - Get LATEST driver version for 83.0.4103
[WDM] - Driver [/Users/vishaldubey/.wdm/drivers/chromedriver/mac64/83.0.4103.39/chromedriver] found in cache


 


In [14]:
wd.get("https://stats.nba.com/teams/traditional/?sort=W_PCT&dir=-1")
wd.implicitly_wait(100)

page_soup = BeautifulSoup(wd.page_source, 'html.parser').find('table')

In [15]:
# use findALL() to get the column headers
headerRowHTML = page_soup.findAll('tr', limit=2)
# use getText()to extract the text we need into a list
headers = [th.getText() for th in page_soup.findAll('tr', limit=2)[0].findAll('th')]

# exclude the first column as we will not need the ranking order from Basketball Reference for the analysis
headers = headers[1:28]

In [16]:
teamRowHTML = page_soup.findAll('tr')[1:]

teamRows = [ [val.getText() for val in teamRowHTML[i].findAll('td')] for i in range(len(teamRowHTML)) ]

for i in range(len(teamRows)):
    teamRows[i] = [x.replace('\n', '') for x in teamRows[i][1:]]
    teamRows[i] = [teamRows[i][0]] + [teamRows[i][j].replace(' ', '') for j in range(len(teamRows[i])) if j!=0]

teamData = pd.DataFrame(teamRows, columns=headers)
teamData.head()

Unnamed: 0,TEAM,GP,W,L,WIN%,MIN,PTS,FGM,FGA,FG%,3PM,3PA,3P%,FTM,FTA,FT%,OREB,DREB,REB,AST,TOV,STL,BLK,BLKA,PF,PFD,+/-
0,Milwaukee Bucks,65,53,12,0.815,48.2,118.6,43.5,91.2,47.7,13.7,38.6,35.6,17.8,24.0,74.2,9.5,42.2,51.7,25.9,14.9,7.4,6.0,4.6,19.2,21.3,11.3
1,Los Angeles Lakers,63,49,14,0.778,48.2,114.3,42.9,88.6,48.5,11.2,31.4,35.5,17.3,23.7,73.0,10.6,35.5,46.1,25.9,15.1,8.6,6.8,3.7,20.6,21.4,7.4
2,Toronto Raptors,64,46,18,0.719,48.3,113.0,40.6,88.5,45.8,13.8,37.0,37.1,18.1,22.6,80.0,9.7,35.5,45.2,25.4,14.4,8.8,4.9,5.3,21.5,20.0,6.5
3,LA Clippers,64,44,20,0.688,48.2,116.2,41.6,89.7,46.4,12.2,33.2,36.6,20.8,26.2,79.2,11.0,37.0,48.0,23.8,14.8,7.1,5.0,4.9,22.0,22.8,6.5
4,Boston Celtics,64,43,21,0.672,48.4,113.0,41.2,89.6,45.9,12.4,34.2,36.3,18.3,22.8,80.1,10.7,35.3,46.0,22.8,13.6,8.3,5.6,5.6,21.4,20.6,6.2


In [17]:
categoricalVals = ['TEAM']
allVals = headers
numericalVals = list(set(allVals) - set(categoricalVals))

print(numericalVals)

for val in numericalVals:
    teamData[val] = np.round(pd.to_numeric(teamData[val], downcast='float'), 2)

['DREB', 'FTA', 'PFD', 'PTS', 'STL', 'FGM', 'FG%', 'TOV', 'AST', '3P%', 'REB', 'OREB', 'WIN%', '+/-', 'PF', '3PA', 'FGA', 'W', 'FT%', 'L', 'BLK', '3PM', 'GP', 'FTM', 'BLKA', 'MIN']


In [18]:
def assess_NA(data):
    # pandas series denoting features and the sum of their null values
    null_sum = data.isnull().sum()# instantiate columns for missing data
    total = null_sum.sort_values(ascending=False)
    percent = ( ((null_sum / len(data.index))*100).round(2) ).sort_values(ascending=False)

    #concatenate along the columns to create the complete dataframe
    df_NA = pd.concat([total, percent], axis=1, keys=['Number of NA', 'Percent NA'])
    
    return df_NA

print(assess_NA(teamData))

      Number of NA  Percent NA
+/-              0         0.0
3P%              0         0.0
GP               0         0.0
W                0         0.0
L                0         0.0
WIN%             0         0.0
MIN              0         0.0
PTS              0         0.0
FGM              0         0.0
FGA              0         0.0
FG%              0         0.0
3PM              0         0.0
3PA              0         0.0
FTM              0         0.0
PFD              0         0.0
FTA              0         0.0
FT%              0         0.0
OREB             0         0.0
DREB             0         0.0
REB              0         0.0
AST              0         0.0
TOV              0         0.0
STL              0         0.0
BLK              0         0.0
BLKA             0         0.0
PF               0         0.0
TEAM             0         0.0


In [19]:
out_filename = "NBAteamTotals2020.csv"

teamData.to_csv(out_filename, index=False, header=True)