In [1]:
import pandas as pd
import numpy as np
from urllib.request import Request, urlopen
from bs4 import BeautifulSoup

import requests
import json

## Crawl data from NBA Reference

In [24]:
def craw_data(year):
    url = "https://www.basketball-reference.com/leagues/NBA_{}_per_game.html".format(year)# this is the HTML from the given URL
    html = urlopen(url)
    soup = BeautifulSoup(html)
    
    #get headers column name
    col_name = [th.string for th in soup.find_all('tr', limit=1)[0].find_all('th')][1:]
    col_name.append('year')
    
    #get players data
    rows = soup.find_all('tr')[1:]
    player_data = []
    for row in rows:
        data = []
        for td in row.find_all('td'):
            data.append(td.string)
            
        data.append(year)
        player_data.append(data)
        
    stats = pd.DataFrame(data = player_data, columns = col_name)
    
    return stats

In [25]:
years = [year for year in range(1997, 2020)]

for year in years:
    data = craw_data(year)
    data.to_csv('nba_stats_{}.csv'.format(year), index=True)

In [26]:
years

[1997,
 1998,
 1999,
 2000,
 2001,
 2002,
 2003,
 2004,
 2005,
 2006,
 2007,
 2008,
 2009,
 2010,
 2011,
 2012,
 2013,
 2014,
 2015,
 2016,
 2017,
 2018,
 2019]

## Take data from NBA official website

In [10]:
from nba_api.stats.endpoints import leaguedashplayerstats

def craw_data_nba(year):
    print (type(str(year)))
    print(year)
    my_header = {
        'Host': 'stats.nba.com',
        'Connection': 'keep-alive',
        'Cache-Control': 'max-age=0',
        'DNT': '1',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:81.0) Gecko/20100101 Firefox/81.0',
        'Accept': 'application/json, text/plain, */*',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'en-US,en;q=0.9',
        'x-nba-stats-origin': 'stats',
        'x-nba-stats-token': 'true',
    }
    
    player_stats = leaguedashplayerstats.LeagueDashPlayerStats(headers = my_header, season = year)
    
    return player_stats.league_dash_player_stats.get_data_frame()

In [11]:
x = [str(xs) for xs in range(1996, 2020)]

y = [str(ys) for ys in range(97, 100)]
for i in range(0,21):
    if i < 10:
        y.append(str(i).zfill(2))
    else:
        y.append(str(i))
        
adv_years = []
for i in range(len(x)):
    string = x[i] + '-' + y[i]
    adv_years.append(string)
    
adv_years

['1996-97',
 '1997-98',
 '1998-99',
 '1999-00',
 '2000-01',
 '2001-02',
 '2002-03',
 '2003-04',
 '2004-05',
 '2005-06',
 '2006-07',
 '2007-08',
 '2008-09',
 '2009-10',
 '2010-11',
 '2011-12',
 '2012-13',
 '2013-14',
 '2014-15',
 '2015-16',
 '2016-17',
 '2017-18',
 '2018-19',
 '2019-20']

In [13]:
from time import sleep

for adv_year in adv_years:
    data_nba = craw_data_nba(adv_year)
    data_nba.to_csv('nba_adv_stats_{}.csv'.format(adv_year[:4]), index=True)
    # be careful with frequency. Sleep to avoid being tagged as DoS attack
    sleep(3)
    break

<class 'str'>
1996-97
