In [4]:
import pandas as pd
import numpy as np
from urllib.request import Request, urlopen
from bs4 import BeautifulSoup

import requests
import json

## Crawl data from NBA Reference

In [36]:
def craw_data(year):
    url = "https://www.basketball-reference.com/leagues/NBA_{}_per_game.html".format(year)# this is the HTML from the given URL
    html = urlopen(url)
    soup = BeautifulSoup(html)
    
    #get headers column name
    col_name = [th.string for th in soup.find_all('tr', limit=1)[0].find_all('th')][1:]
    col_name.append('year')
    
    #get players data
    rows = soup.find_all('tr')[1:]
    player_data = []
    for row in rows:
        if "thead" in row['class']:  # skip elements having emptyItem class
            continue
        
        data = []
        for td in row.find_all('td'):
            if (len(td.contents) > 1):
                data.append(td.contents[0].string)
            else:
                data.append(td.string)
            
        data.append(year)
        player_data.append(data)
        
    stats = pd.DataFrame(data = player_data, columns = col_name)
    
    return stats

In [37]:
data = craw_data('1997')

data[60:70]

Unnamed: 0,Player,Pos,Age,Tm,G,GS,MP,FG,FGA,FG%,...,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS,year
60,Randy Brown,PG,28,CHI,72,3,14.7,1.9,4.6,0.42,...,0.5,1.1,1.5,1.8,1.1,0.2,0.8,1.6,4.7,1997
61,Kobe Bryant,SF,18,LAL,71,6,15.5,2.5,5.9,0.417,...,0.7,1.2,1.9,1.3,0.7,0.3,1.6,1.4,7.6,1997
62,Mark Bryant,PF,31,PHO,41,18,24.8,3.7,6.7,0.553,...,1.6,3.5,5.2,1.1,0.5,0.1,1.1,3.3,9.3,1997
63,Jud Buechler,SF,28,CHI,76,0,9.3,0.8,2.1,0.367,...,0.6,1.1,1.7,0.8,0.3,0.3,0.4,0.7,1.8,1997
64,Matt Bullard,PF,29,HOU,71,12,14.4,1.6,4.0,0.401,...,0.2,1.5,1.6,0.9,0.3,0.3,0.5,1.0,4.5,1997
65,Scott Burrell,SF,26,TOT,57,2,16.5,1.7,4.8,0.362,...,0.9,1.9,2.8,1.3,0.5,0.3,0.9,2.1,5.2,1997
66,Scott Burrell,SF,26,CHH,28,2,17.2,1.6,4.7,0.344,...,0.9,2.0,2.8,1.4,0.5,0.4,0.9,2.1,5.4,1997
67,Scott Burrell,SF,26,GSW,29,0,15.8,1.8,4.8,0.379,...,0.9,1.9,2.7,1.2,0.5,0.3,1.0,2.1,4.9,1997
68,Willie Burton,SF,28,ATL,24,2,15.8,1.6,4.8,0.336,...,0.5,1.3,1.7,0.5,0.3,0.1,1.1,2.3,6.2,1997
69,Mitchell Butler,SG,26,POR,49,1,9.5,1.1,2.6,0.416,...,0.4,0.7,1.1,0.6,0.3,0.0,0.6,1.1,3.0,1997


In [38]:
years = [year for year in range(1997, 2020)]

for year in years:
    data = craw_data(year)
    data.to_csv('nba_stats_{}.csv'.format(year), index=True)

In [26]:
years

[1997,
 1998,
 1999,
 2000,
 2001,
 2002,
 2003,
 2004,
 2005,
 2006,
 2007,
 2008,
 2009,
 2010,
 2011,
 2012,
 2013,
 2014,
 2015,
 2016,
 2017,
 2018,
 2019]

## Take data from NBA official website

In [10]:
from nba_api.stats.endpoints import leaguedashplayerstats

def craw_data_nba(year):
    print (type(str(year)))
    print(year)
    my_header = {
        'Host': 'stats.nba.com',
        'Connection': 'keep-alive',
        'Cache-Control': 'max-age=0',
        'DNT': '1',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:81.0) Gecko/20100101 Firefox/81.0',
        'Accept': 'application/json, text/plain, */*',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'en-US,en;q=0.9',
        'x-nba-stats-origin': 'stats',
        'x-nba-stats-token': 'true',
    }
    
    player_stats = leaguedashplayerstats.LeagueDashPlayerStats(headers = my_header, season = year)
    
    return player_stats.league_dash_player_stats.get_data_frame()

In [11]:
x = [str(xs) for xs in range(1996, 2020)]

y = [str(ys) for ys in range(97, 100)]
for i in range(0,21):
    if i < 10:
        y.append(str(i).zfill(2))
    else:
        y.append(str(i))
        
adv_years = []
for i in range(len(x)):
    string = x[i] + '-' + y[i]
    adv_years.append(string)
    
adv_years

['1996-97',
 '1997-98',
 '1998-99',
 '1999-00',
 '2000-01',
 '2001-02',
 '2002-03',
 '2003-04',
 '2004-05',
 '2005-06',
 '2006-07',
 '2007-08',
 '2008-09',
 '2009-10',
 '2010-11',
 '2011-12',
 '2012-13',
 '2013-14',
 '2014-15',
 '2015-16',
 '2016-17',
 '2017-18',
 '2018-19',
 '2019-20']

In [13]:
from time import sleep

for adv_year in adv_years:
    data_nba = craw_data_nba(adv_year)
    data_nba.to_csv('nba_adv_stats_{}.csv'.format(adv_year[:4]), index=True)
    # be careful with frequency. Sleep to avoid being tagged as DoS attack
    sleep(3)
    break

<class 'str'>
1996-97
