In [9]:
import requests
from bs4 import BeautifulSoup
from bs4 import Comment
import pandas as pd
import re
import warnings; warnings.simplefilter('ignore')

In [10]:
# Season summary page on Basketball Reference used as the example input below.
url = 'https://www.basketball-reference.com/leagues/NBA_2019.html'

In [20]:
def _split_team_name(full_name):
    """Split a 'Location Nickname' string into ``(location, nickname)``.

    The playoff asterisk is removed before splitting, so the two-word
    nickname 'Trail Blazers' is handled the same way whether or not the
    name carries the asterisk.
    """
    clean = full_name.strip('*').strip()
    if clean.endswith(' Trail Blazers'):
        # Split on the FIRST space so the nickname keeps both words.
        location, nickname = clean.split(' ', 1)
    else:
        # Everything before the LAST space is the location ("Golden State").
        location, _, nickname = clean.rpartition(' ')
    return location, nickname


def _find_stats_table(soup, table_id):
    """Return the ``<table>`` whose id is ``table_id``, or ``None``.

    HTML comments are searched first (each matching comment is re-parsed as
    HTML); the live document is used as a fallback.
    """
    comments = soup.find_all(string=lambda text: isinstance(text, Comment))
    for comment in comments:
        if table_id not in comment:
            continue
        table = BeautifulSoup(comment, 'html.parser').find('table', id=table_id)
        if table is not None:
            return table
    return soup.find('table', id=table_id)


def NBA_reference_team_stats(url):
    """Scrape the per-game team stats table from a season page into a DataFrame.

    Parameters
    ----------
    url : str
        Page address. The first run of digits in it is used as the season
        year (e.g. ``.../NBA_2019.html`` -> 2019).

    Returns
    -------
    pandas.DataFrame
        One row per table row, with the table's stat columns plus:
        ``Playoff`` (True when the team name carried an asterisk),
        ``Location`` / ``Team`` (the name split into place and nickname),
        ``Year`` (int) and ``Conference`` ('Eastern' / 'Western').
        Rows whose location is not in either conference list below are kept
        with a missing ``Conference`` instead of being dropped.

    Raises
    ------
    ValueError
        If the url contains no digits, or the stats table cannot be found
        or has no data rows.
    requests.HTTPError
        If the page request returns an error status.
    """
    table_id = 'team-stats-per_game'

    numbers = re.findall(r'\d+', url)
    if not numbers:
        raise ValueError(f'No season year found in url: {url!r}')
    year = numbers[0]

    # Fetch the page; fail early on HTTP errors rather than on a missing table.
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')

    team_stats = _find_stats_table(soup, table_id)
    if team_stats is None:
        raise ValueError(f'Table {table_id!r} not found at {url}')

    table_body = team_stats.find('tbody')
    if table_body is None:
        table_body = team_stats
    parsed_rows = [
        [cell.text.strip() for cell in row.findAll('td')]
        for row in table_body.findAll('tr')
    ]
    # Rows without any <td> cells carry no data; skip them.
    parsed_rows = [cells for cells in parsed_rows if cells]
    if not parsed_rows:
        raise ValueError(f'Table {table_id!r} at {url} has no data rows')
    header = [th.text for th in team_stats.findAll('th')]

    # Build the frame; the first <th> is skipped because the rows above are
    # built from <td> cells only, and the slice is sized to the row width.
    df = pd.DataFrame(parsed_rows)
    df.columns = header[1:1 + df.shape[1]]

    # Playoff identifier based on the asterisk in the team name.
    df['Playoff'] = df['Team'].str.contains('*', regex=False)

    # Split the team name into location and nickname.
    split_names = [_split_team_name(name) for name in df['Team']]
    df['Location'] = [location for location, _ in split_names]
    df['Team'] = [nickname for _, nickname in split_names]
    df['Year'] = year

    # Setting datatypes
    float_cols = ['MP', 'FG', 'FGA', 'FG%', '3P', '3PA', '3P%', '2P', '2PA', '2P%',
                  'FT', 'FTA', 'FT%', 'ORB', 'DRB', 'TRB', 'AST', 'STL', 'BLK',
                  'TOV', 'PF', 'PTS']
    int_cols = ['G', 'Year']
    df[float_cols] = df[float_cols].astype(float)
    df[int_cols] = df[int_cols].astype(int)

    # Conference labels, keyed by location.
    Eastern = [
        'Atlanta', 'Boston', 'Chicago', 'Cleveland', 'Detroit', 'Indiana', 'Miami',
        'Milwaukee', 'Brooklyn', 'New York', 'Orlando', 'Philadelphia', 'Washington',
        'Toronto', 'Charlotte']
    Western = [
        'Houston', 'Los Angeles', 'Minnesota', 'Phoenix', 'Portland', 'Sacramento',
        'San Antonio', 'Oklahoma City', 'Utah', 'Memphis', 'New Orleans', 'Dallas',
        'Denver', 'Golden State']
    conference_by_location = {location: 'Eastern' for location in Eastern}
    conference_by_location.update({location: 'Western' for location in Western})
    # .map leaves unknown locations as NaN, so no row is silently discarded.
    df['Conference'] = df['Location'].map(conference_by_location)

    return df

In [21]:
# Run the scraper on the 2019 season page and preview the first rows.
test = NBA_reference_team_stats('https://www.basketball-reference.com/leagues/NBA_2019.html')
test.head()

Unnamed: 0,Team,G,MP,FG,FGA,FG%,3P,3PA,3P%,2P,...,AST,STL,BLK,TOV,PF,PTS,Playoff,Location,Year,Conference
0,Bucks,82,241.2,43.4,91.1,0.476,13.5,38.2,0.353,29.9,...,26.0,7.5,5.9,13.9,19.6,118.1,True,Milwaukee,2019,Eastern
1,Warriors,82,241.5,44.0,89.8,0.491,13.3,34.4,0.385,30.8,...,29.4,7.6,6.4,14.3,21.4,117.7,True,Golden State,2019,Western
2,Pelicans,82,240.9,43.7,92.2,0.473,10.3,29.9,0.344,33.4,...,27.0,7.4,5.4,14.8,21.1,115.4,False,New Orleans,2019,Western
3,76ers,82,241.5,41.5,88.2,0.471,10.8,30.2,0.359,30.7,...,26.9,7.4,5.3,14.9,21.3,115.2,True,Philadelphia,2019,Eastern
4,Clippers,82,241.8,41.3,87.5,0.471,10.0,25.8,0.388,31.3,...,24.0,6.8,4.7,14.5,23.3,115.1,True,Los Angeles,2019,Western


### Looking at the URL Syntax of Basketball Reference

https://www.basketball-reference.com/leagues/NBA_2019.html
https://www.basketball-reference.com/leagues/NBA_2020.html
https://www.basketball-reference.com/leagues/NBA_2018.html




In [22]:
def NBA_reference_team_stats_year(start_year, end_year):
    """Scrape team stats for every season from start_year to end_year inclusive.

    Parameters
    ----------
    start_year, end_year : int
        First and last season to fetch; each is substituted into the
        ``.../leagues/NBA_{year}.html`` address.

    Returns
    -------
    pandas.DataFrame
        The per-season frames stacked in season order. Each season keeps its
        own row index, so index labels repeat across seasons.

    Raises
    ------
    ValueError
        If ``end_year`` is earlier than ``start_year``.
    """
    if end_year < start_year:
        raise ValueError(
            f'end_year ({end_year}) must not be earlier than start_year ({start_year})')

    season_frames = []
    for season in range(start_year, end_year + 1):
        url = f'https://www.basketball-reference.com/leagues/NBA_{season}.html'
        season_frames.append(NBA_reference_team_stats(url))
        print(season)  # progress indicator, printed once the season is fetched

    # Concatenate once at the end: DataFrame.append was removed in pandas 2.0
    # and growing a frame inside the loop copies it on every iteration.
    return pd.concat(season_frames)
  

In [24]:
# Scrape the 2018 and 2019 seasons into a single frame.
df2 = NBA_reference_team_stats_year(2018, 2019)

2018
2019


In [44]:
# Write the combined frame to a CSV file.
# NOTE(review): the visible call that builds df2 requests seasons 2018-2019, but
# this filename says 2015to2019 — confirm which range is intended.
df2.to_csv('NBA_team_stats_2015to2019.csv')

In [7]:
# Load results from a CSV on disk rather than scraping again.
# The commented-out call below presumably produced this file — TODO confirm.
# df = NBA_reference_team_stats_year(1980,2019)
df = pd.read_csv('NBA_team_stats_1980to2019.csv')

In [9]:
# Number of rows in the frame loaded from the CSV.
len(df)
# df.head()

1015