In [1]:
import pandas as pd
import numpy as np
from urllib.request import Request, urlopen
from bs4 import BeautifulSoup

import requests
import json

## Crawl per-game player data from Basketball Reference

In [24]:
def craw_data(year):
    """Scrape the per-game player statistics table for one NBA season.

    Downloads ``NBA_<year>_per_game.html`` from basketball-reference.com and
    converts the table rows found on the page into a DataFrame.

    Parameters
    ----------
    year : int or str
        Season identifier substituted into the page URL. The same value is
        stored in the trailing ``year`` column of every returned row.

    Returns
    -------
    pandas.DataFrame
        One record per table row that contains ``<td>`` cells. Columns are
        the labels of the first table row (minus its first ``<th>``) followed
        by ``year``. Cell values are the raw page strings; empty cells are
        ``None``.
    """
    url = "https://www.basketball-reference.com/leagues/NBA_{}_per_game.html".format(year)

    # Close the HTTP response deterministically instead of leaking the socket.
    with urlopen(url) as response:
        # Name the parser explicitly: without it BeautifulSoup warns and the
        # parse tree depends on which optional parsers happen to be installed.
        soup = BeautifulSoup(response, "html.parser")

    all_rows = soup.find_all('tr')

    # Column names come from the first row. Its first <th> is skipped,
    # presumably because data rows carry that value in a <th> rather than a
    # <td> -- TODO confirm against the page markup.
    col_name = [th.get_text() for th in all_rows[0].find_all('th')][1:]
    col_name.append('year')

    player_data = []
    for row in all_rows[1:]:
        cells = row.find_all('td')
        if not cells:
            # Rows without <td> cells (e.g. header rows repeated inside the
            # table) hold no player data; keeping them would create records
            # that contain nothing but the year in the first column.
            continue

        # get_text() instead of .string: .string is None whenever a cell has
        # more than one child node (such as a link followed by extra text),
        # which silently dropped those values. Empty cells stay None.
        data = [td.get_text() or None for td in cells]
        data.append(year)
        player_data.append(data)

    stats = pd.DataFrame(data=player_data, columns=col_name)

    return stats

In [25]:
# Seasons to download: 1997 through 2019 inclusive.
years = list(range(1997, 2020))

# Scrape every season and write it to its own CSV file (row index included).
for year in years:
    data = craw_data(year)
    csv_path = 'nba_stats_{}.csv'.format(year)
    data.to_csv(csv_path, index=True)

In [26]:
# Display the list of seasons that the download loop iterated over.
years

[1997,
 1998,
 1999,
 2000,
 2001,
 2002,
 2003,
 2004,
 2005,
 2006,
 2007,
 2008,
 2009,
 2010,
 2011,
 2012,
 2013,
 2014,
 2015,
 2016,
 2017,
 2018,
 2019]

## Fetch data from the official NBA stats website (stats.nba.com)

In [10]:
from nba_api.stats.endpoints import leaguedashplayerstats

def craw_data_nba(year):
    """Request league-wide player statistics from stats.nba.com via ``nba_api``.

    Parameters
    ----------
    year : str
        Season identifier forwarded to the endpoint as its ``season``
        argument; this notebook calls the function with ``'2019-20'``.

    Returns
    -------
    leaguedashplayerstats.LeagueDashPlayerStats
        The endpoint object. Callers in this notebook read the table through
        ``.league_dash_player_stats.get_data_frame()``.
    """
    # Browser-like request headers sent along with the stats.nba.com request.
    my_header = {
        'Host': 'stats.nba.com',
        'Connection': 'keep-alive',
        'Cache-Control': 'max-age=0',
        'DNT': '1',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:81.0) Gecko/20100101 Firefox/81.0',
        'Accept': 'application/json, text/plain, */*',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'en-US,en;q=0.9',
        'x-nba-stats-origin': 'stats',
        'x-nba-stats-token': 'true',
    }

    # Pass the requested season through: previously ``year`` was only printed,
    # so every call fetched the library's default season regardless of input.
    player_stats = leaguedashplayerstats.LeagueDashPlayerStats(
        headers=my_header,
        season=str(year),
    )

    return player_stats

In [18]:
# Call the fetch helper with the season string '2019-20' and keep the returned endpoint object.
data_nba = craw_data_nba('2019-20')

# Last expression of the cell, so its value is rendered as the cell output
# (presumably a pandas DataFrame, judging by the displayed table).
data_nba.league_dash_player_stats.get_data_frame()

<class 'str'>
2019-20


Unnamed: 0,PLAYER_ID,PLAYER_NAME,TEAM_ID,TEAM_ABBREVIATION,AGE,GP,W,L,W_PCT,MIN,...,BLKA_RANK,PF_RANK,PFD_RANK,PTS_RANK,PLUS_MINUS_RANK,NBA_FANTASY_PTS_RANK,DD2_RANK,TD3_RANK,CFID,CFPARAMS
0,203932,Aaron Gordon,1610612753,ORL,24.0,62,30,32,0.484,2017.143333,...,456,387,50,81,398,49,29,17,5,2039321610612753
1,1628988,Aaron Holiday,1610612754,IND,23.0,66,42,24,0.636,1617.331667,...,377,374,133,151,87,161,115,29,5,16289881610612754
2,1627846,Abdel Nader,1610612760,OKC,26.0,55,37,18,0.673,866.951667,...,214,250,293,261,420,292,239,29,5,16278461610612760
3,1629690,Adam Mokoka,1610612741,CHI,21.0,11,3,8,0.273,111.666667,...,121,101,460,451,128,456,239,29,5,16296901610612741
4,1629678,Admiral Schofield,1610612764,WAS,23.0,33,9,24,0.273,368.381667,...,121,194,390,388,387,405,239,29,5,16296781610612764
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
524,203897,Zach LaVine,1610612741,CHI,25.0,60,20,40,0.333,2085.410000,...,525,404,34,15,490,30,142,29,5,2038971610612741
525,1629668,Zach Norvell Jr.,1610612744,GSW,22.0,5,2,3,0.400,40.801667,...,1,37,495,496,321,496,239,29,5,16296681610612744
526,1629015,Zhaire Smith,1610612755,PHI,21.0,7,3,4,0.429,32.331667,...,52,37,486,501,272,500,239,29,5,16290151610612755
527,1629627,Zion Williamson,1610612740,NOP,19.0,24,11,13,0.458,668.141667,...,441,174,107,181,114,250,142,29,5,16296271610612740
