In [113]:
import pandas as pd
from urllib.request import urlopen
from bs4 import BeautifulSoup

In [114]:
#Scrapping data for the NBA 2019-2020 season
#Player data - will be brought in from NBA Reference.com
year = 2020

In [115]:
#Target URL to be scrapped...calling on the year variable to create a more specific search
url = "https://www.basketball-reference.com/leagues/NBA_2020_totals.html".format(year)

In [116]:
#Enabling the url to establish connection
html = urlopen(url)

In [117]:
#Passing the Beuatiful Suop object through the url linked website
soup = BeautifulSoup(html)

In [118]:
#Accessing the first 2 rows as indicated by the html element 'tr'
soup.findAll('tr', limit=2)

[<tr>
 <th aria-label="Rank" class="ranker poptip sort_default_asc show_partial_when_sorting center" data-stat="ranker" data-tip="Rank" scope="col">Rk</th>
 <th aria-label="Player" class="poptip sort_default_asc center" data-stat="player" scope="col">Player</th>
 <th aria-label="Position" class="poptip sort_default_asc center" data-stat="pos" data-tip="Position" scope="col">Pos</th>
 <th aria-label="Player's age on February 1 of the season" class="poptip sort_default_asc center" data-stat="age" data-tip="Player's age on February 1 of the season" scope="col">Age</th>
 <th aria-label="Team" class="poptip sort_default_asc center" data-stat="team_id" data-tip="Team" scope="col">Tm</th>
 <th aria-label="Games" class="poptip center" data-stat="g" data-tip="Games" scope="col">G</th>
 <th aria-label="Games Started" class="poptip center" data-stat="gs" data-tip="Games Started" scope="col">GS</th>
 <th aria-label="Minutes Played" class="poptip center" data-stat="mp" data-tip="Minutes Played" sco

In [119]:
#Accessing the Table headers to establish column names and finding all of the table rows hat belong to the columns
#Extracting the text we need to list
headers = [th.getText() for th in soup.findAll('tr', limit=2)[0].findAll('th')]

In [120]:
#This is extracting all of the columns execpt for the first one because we do not need basketball references ranking column
headers = headers[1:]
headers

['Player',
 'Pos',
 'Age',
 'Tm',
 'G',
 'GS',
 'MP',
 'FG',
 'FGA',
 'FG%',
 '3P',
 '3PA',
 '3P%',
 '2P',
 '2PA',
 '2P%',
 'eFG%',
 'FT',
 'FTA',
 'FT%',
 'ORB',
 'DRB',
 'TRB',
 'AST',
 'STL',
 'BLK',
 'TOV',
 'PF',
 'PTS']

In [121]:
#Brining in all of the rows except for the first is this is basic website row not needed for the dataframe
rows = soup.findAll('tr')[1:]
player_stats = [[td.getText() for td in rows[i].findAll('td')]
            for i in range(len(rows))]

In [122]:
stats = pd.DataFrame(player_stats, columns = headers)
stats.head(40)

Unnamed: 0,Player,Pos,Age,Tm,G,GS,MP,FG,FGA,FG%,...,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS
0,Steven Adams,C,26.0,OKC,26.0,26.0,716.0,131.0,210.0,0.624,...,0.486,89.0,168.0,257.0,72.0,13.0,32.0,42.0,50.0,298.0
1,Bam Adebayo,C,22.0,MIA,30.0,30.0,1023.0,173.0,306.0,0.565,...,0.685,76.0,242.0,318.0,139.0,45.0,39.0,87.0,81.0,471.0
2,LaMarcus Aldridge,C,34.0,SAS,27.0,27.0,893.0,213.0,415.0,0.513,...,0.825,57.0,142.0,199.0,68.0,17.0,51.0,40.0,59.0,523.0
3,Nickeil Alexander-Walker,SG,21.0,NOP,26.0,0.0,337.0,53.0,160.0,0.331,...,0.688,4.0,50.0,54.0,46.0,9.0,4.0,26.0,30.0,145.0
4,Grayson Allen,SG,24.0,MEM,16.0,0.0,295.0,49.0,111.0,0.441,...,0.842,2.0,39.0,41.0,24.0,5.0,1.0,16.0,25.0,136.0
5,Jarrett Allen,C,21.0,BRK,29.0,26.0,770.0,141.0,213.0,0.662,...,0.615,105.0,201.0,306.0,39.0,21.0,40.0,34.0,73.0,357.0
6,Kadeem Allen,SG,27.0,NYK,1.0,0.0,16.0,3.0,6.0,0.5,...,,0.0,0.0,0.0,2.0,1.0,0.0,1.0,1.0,9.0
7,Al-Farouq Aminu,PF,29.0,ORL,18.0,2.0,380.0,25.0,86.0,0.291,...,0.655,24.0,63.0,87.0,21.0,18.0,8.0,17.0,27.0,78.0
8,Kyle Anderson,PF,26.0,MEM,25.0,1.0,464.0,57.0,119.0,0.479,...,0.629,24.0,79.0,103.0,56.0,17.0,13.0,24.0,40.0,141.0
9,Ryan Anderson,PF,31.0,HOU,2.0,0.0,14.0,2.0,7.0,0.286,...,,0.0,7.0,7.0,2.0,1.0,0.0,1.0,1.0,5.0


In [123]:
stats.to_csv("nba_stats_2019.csv", index=False)

In [124]:
stats.Tm.unique()

array(['OKC', 'MIA', 'SAS', 'NOP', 'MEM', 'BRK', 'NYK', 'ORL', 'HOU',
       'MIL', 'LAL', 'POR', 'TOR', 'CHI', 'SAC', 'PHO', 'CHO', None,
       'DAL', 'DEN', 'MIN', 'WAS', 'ATL', 'LAC', 'IND', 'UTA', 'PHI',
       'DET', 'GSW', 'BOS', 'CLE'], dtype=object)

In [125]:
#Next workbook to look at is the NBA_PRODUCTION_2019 workbook