In [16]:
from urllib.request import urlopen
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np

In [19]:
year = np.arange(1980, 2021, 1)
year

array([1980, 1981, 1982, 1983, 1984, 1985, 1986, 1987, 1988, 1989, 1990,
       1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001,
       2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012,
       2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020])

In [21]:
# NBA season we will be analyzing
def scrape_nba_basketball_reference(year):
    # URL page we will scraping (see image above)
    url = "https://www.basketball-reference.com/leagues/NBA_{}_per_game.html".format(year)
    # this is the HTML from the given URL
    html = urlopen(url)
    soup = BeautifulSoup(html)
    # use findALL() to get the column headers
    soup.findAll('tr', limit=2)
    # use getText()to extract the text we need into a list
    headers = [th.getText() for th in soup.findAll('tr', limit=2)[0].findAll('th')]
    # exclude the first column as we will not need the ranking order from Basketball Reference for the analysis
    headers = headers[1:]
    # avoid the first header row
    rows = soup.findAll('tr')[1:]
    player_stats = [[td.getText() for td in rows[i].findAll('td')]
            for i in range(len(rows))]
    stats = pd.DataFrame(player_stats, columns = headers)
    stats['year'] = year
    return(stats)  

In [71]:
nba_stats = pd.DataFrame()
for i in year:
    tmp = scrape_nba_basketball_reference(i)
    nba_stats = nba_stats.append(tmp,ignore_index=True)

In [72]:
player_stats = nba_stats
player_stats.shape

(21786, 30)

In [73]:
nba_stats.head()

Unnamed: 0,Player,Pos,Age,Tm,G,GS,MP,FG,FGA,FG%,...,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS,year
0,Kareem Abdul-Jabbar*,C,32,LAL,82,,38.3,10.2,16.9,0.604,...,2.3,8.5,10.8,4.5,1.0,3.4,3.6,2.6,24.8,1980
1,Tom Abernethy,PF,25,GSW,67,,18.2,2.3,4.7,0.481,...,0.9,1.9,2.9,1.3,0.5,0.2,0.6,1.8,5.4,1980
2,Alvan Adams,C,25,PHO,75,,28.9,6.2,11.7,0.531,...,2.1,6.0,8.1,4.3,1.4,0.7,2.9,3.2,14.9,1980
3,Tiny Archibald*,PG,31,BOS,80,80.0,35.8,4.8,9.9,0.482,...,0.7,1.7,2.5,8.4,1.3,0.1,3.0,2.7,14.1,1980
4,Dennis Awtrey,C,31,CHI,26,,21.5,1.0,2.3,0.45,...,1.1,3.3,4.4,1.5,0.5,0.6,1.0,2.5,3.3,1980


In [74]:
player_stats.tail()

Unnamed: 0,Player,Pos,Age,Tm,G,GS,MP,FG,FGA,FG%,...,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS,year
21781,Trae Young,PG,21,ATL,60,60,35.3,9.1,20.8,0.437,...,0.5,3.7,4.3,9.3,1.1,0.1,4.8,1.7,29.6,2020
21782,Cody Zeller,C,27,CHO,58,39,23.1,4.3,8.3,0.524,...,2.8,4.3,7.1,1.5,0.7,0.4,1.3,2.4,11.1,2020
21783,Tyler Zeller,C,30,SAS,2,0,2.0,0.5,2.0,0.25,...,1.5,0.5,2.0,0.0,0.0,0.0,0.0,0.0,1.0,2020
21784,Ante Žižić,C,23,CLE,22,0,10.0,1.9,3.3,0.569,...,0.8,2.2,3.0,0.3,0.3,0.2,0.5,1.2,4.4,2020
21785,Ivica Zubac,C,22,LAC,72,70,18.4,3.3,5.3,0.613,...,2.7,4.8,7.5,1.1,0.2,0.9,0.8,2.3,8.3,2020


In [75]:
player_stats['year'].unique()

array([1980, 1981, 1982, 1983, 1984, 1985, 1986, 1987, 1988, 1989, 1990,
       1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001,
       2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012,
       2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020])

In [76]:
# some players are duplicated during a single given year
print('Total number of players since 1980')
print(len(player_stats['Player']))
print('Unique players since 1980')
print(len(player_stats['Player'].unique()))
print('Number of rows in year 2015')
print(len(player_stats[player_stats['year']==2015]))
print('Unique players in 2015')
print(len(player_stats[player_stats['year']==2015]['Player'].unique()))

Total number of players since 1980
21786
Unique players since 1980
3184
Number of rows in year 2015
675
Unique players in 2015
493


In [77]:
player_stats[player_stats['year']==2015].head(150)

Unnamed: 0,Player,Pos,Age,Tm,G,GS,MP,FG,FGA,FG%,...,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS,year
17790,Quincy Acy,PF,24,NYK,68,22,18.9,2.2,4.9,.459,...,1.2,3.3,4.4,1.0,0.4,0.3,0.9,2.2,5.9,2015
17791,Jordan Adams,SG,20,MEM,30,0,8.3,1.2,2.9,.407,...,0.3,0.6,0.9,0.5,0.5,0.2,0.5,0.8,3.1,2015
17792,Steven Adams,C,21,OKC,70,67,25.3,3.1,5.7,.544,...,2.8,4.6,7.5,0.9,0.5,1.2,1.4,3.2,7.7,2015
17793,Jeff Adrien,PF,28,MIN,17,0,12.6,1.1,2.6,.432,...,1.4,3.2,4.5,0.9,0.2,0.5,0.5,1.8,3.5,2015
17794,Arron Afflalo,SG,29,TOT,78,72,32.1,4.8,11.3,.424,...,0.3,2.8,3.2,1.7,0.5,0.1,1.5,2.1,13.3,2015
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17935,Troy Daniels,SG,23,CHO,11,0,12.3,2.5,5.4,.458,...,0.2,0.5,0.7,0.5,0.3,0.1,0.6,1.1,7.0,2015
17936,Gigi Datome,SF,27,TOT,21,1,10.0,2.0,4.2,.483,...,0.1,1.2,1.4,0.4,0.1,0.3,0.4,0.7,5.0,2015
17937,Gigi Datome,SF,27,DET,3,0,5.7,1.7,4.0,.417,...,0.3,1.0,1.3,0.7,0.3,0.0,0.7,0.3,3.7,2015
17938,Gigi Datome,SF,27,BOS,18,1,10.7,2.1,4.3,.494,...,0.1,1.3,1.4,0.4,0.1,0.4,0.4,0.8,5.2,2015


In [78]:
year_2015 = player_stats[player_stats['year']==2015]
year_2015[year_2015['Player']=='Thaddeus Young']

Unnamed: 0,Player,Pos,Age,Tm,G,GS,MP,FG,FGA,FG%,...,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS,year
18460,Thaddeus Young,PF,26,TOT,76,68,32.0,5.9,12.7,0.466,...,1.7,3.7,5.4,2.3,1.6,0.3,1.5,2.3,14.1,2015
18461,Thaddeus Young,PF,26,MIN,48,48,33.4,6.0,13.4,0.451,...,1.6,3.5,5.1,2.8,1.8,0.4,1.6,2.4,14.3,2015
18462,Thaddeus Young,PF,26,BRK,28,20,29.6,5.8,11.7,0.495,...,1.9,4.1,5.9,1.4,1.4,0.3,1.5,2.0,13.8,2015


In [79]:
year_2015[year_2015['Player']=='Jorge Gutiérrez']

Unnamed: 0,Player,Pos,Age,Tm,G,GS,MP,FG,FGA,FG%,...,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS,year
18038,Jorge Gutiérrez,PG,26,TOT,20,1,8.8,1.1,2.1,0.537,...,0.3,1.0,1.3,1.1,0.3,0.0,0.6,1.3,2.7,2015
18039,Jorge Gutiérrez,PG,26,BRK,10,0,4.4,0.7,1.4,0.5,...,0.0,0.7,0.7,0.7,0.1,0.0,0.3,0.9,1.6,2015
18040,Jorge Gutiérrez,PG,26,MIL,10,1,13.1,1.5,2.7,0.556,...,0.5,1.3,1.8,1.5,0.5,0.0,0.9,1.6,3.7,2015


In [82]:
player_stats[player_stats['year']==2015].head(100)

Unnamed: 0,Player,Pos,Age,Tm,G,GS,MP,FG,FGA,FG%,...,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS,year
17790,Quincy Acy,PF,24,NYK,68,22,18.9,2.2,4.9,.459,...,1.2,3.3,4.4,1.0,0.4,0.3,0.9,2.2,5.9,2015
17791,Jordan Adams,SG,20,MEM,30,0,8.3,1.2,2.9,.407,...,0.3,0.6,0.9,0.5,0.5,0.2,0.5,0.8,3.1,2015
17792,Steven Adams,C,21,OKC,70,67,25.3,3.1,5.7,.544,...,2.8,4.6,7.5,0.9,0.5,1.2,1.4,3.2,7.7,2015
17793,Jeff Adrien,PF,28,MIN,17,0,12.6,1.1,2.6,.432,...,1.4,3.2,4.5,0.9,0.2,0.5,0.5,1.8,3.5,2015
17794,Arron Afflalo,SG,29,TOT,78,72,32.1,4.8,11.3,.424,...,0.3,2.8,3.2,1.7,0.5,0.1,1.5,2.1,13.3,2015
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17885,Nick Calathes,SG,25,MEM,58,0,14.4,1.8,4.3,.421,...,0.3,1.5,1.8,2.5,1.1,0.1,1.1,1.7,4.2,2015
17886,José Calderón,PG,33,NYK,42,42,30.2,3.5,8.4,.415,...,0.5,2.5,3.0,4.7,0.7,0.0,1.8,1.8,9.1,2015
17887,,,,,,,,,,,...,,,,,,,,,,2015
17888,Kentavious Caldwell-Pope,SG,21,DET,82,82,31.5,4.8,11.9,.401,...,0.6,2.5,3.1,1.3,1.1,0.2,1.1,2.0,12.7,2015


In [87]:
# remove Player == 'none' and remove_duplicates keep first
player_stats = player_stats[player_stats['Player']!='None']
print(player_stats.shape)
player_stats = player_stats.drop_duplicates(subset=['Player', 'year'], keep='first')
print(player_stats.shape)

(20991, 30)
(17160, 30)


In [88]:
player_stats[player_stats['Player']=='Jorge Gutiérrez']

Unnamed: 0,Player,Pos,Age,Tm,G,GS,MP,FG,FGA,FG%,...,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS,year
17406,Jorge Gutiérrez,PG,25,BRK,15,2,16.3,1.7,3.6,0.463,...,0.2,1.3,1.5,2.0,0.7,0.1,0.9,2.5,4.1,2014
18038,Jorge Gutiérrez,PG,26,TOT,20,1,8.8,1.1,2.1,0.537,...,0.3,1.0,1.3,1.1,0.3,0.0,0.6,1.3,2.7,2015
18671,Jorge Gutiérrez,PG,27,CHO,12,0,5.3,0.5,0.9,0.545,...,0.0,0.6,0.6,1.4,0.3,0.0,0.5,0.3,1.8,2016


In [89]:
player_stats.to_csv('data/player_stats.csv', index=False)