In [1]:
import pandas as pd
import requests
import re
import time
from bs4 import BeautifulSoup
from unidecode import unidecode

In [2]:
import warnings
warnings.filterwarnings('ignore')

## Functions

In [3]:
def fetch(url, timeout=30):
    """
    Download a web page and parse it into a BeautifulSoup tree.

    The literal comment delimiters ``<!--`` and ``-->`` are removed from the
    response body before parsing, so markup that sat inside HTML comments is
    parsed as ordinary elements. (Presumably some of the tables scraped by
    this notebook are shipped inside comments -- confirm against the pages.)

    Parameters
    ----------
    url : str
        Address of the page to download.
    timeout : float, optional
        Seconds to wait for the server before giving up. Defaults to 30.

    Returns
    -------
    bs4.BeautifulSoup
        The document parsed with the "html.parser" backend.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status code.
    requests.Timeout
        If the server does not respond within ``timeout`` seconds.
    """
    # Without a timeout a stalled connection would block the notebook forever.
    r = requests.get(url, timeout=timeout)
    # Fail here, with the status code in the message, instead of handing an
    # error page to the parser and failing later on a missing element.
    r.raise_for_status()
    comm = re.compile("<!--|-->")
    soup = BeautifulSoup(comm.sub("", r.text), "html.parser")
    return soup

## Scrape

Steps:
- Get each active team's ID (its URL path) from the teams index page
- Visit each team's season pages and scrape the tables of interest

In [5]:
# Build a {franchise name: site-relative team path} lookup from the
# "teams_active" table on the teams index page.
bbr_url = 'https://www.basketball-reference.com'
bbr_teams_soup = fetch(f'{bbr_url}/teams/')
active_teams_table = bbr_teams_soup.find_all('table', {'id': 'teams_active'})[0]

bbr_teams_dict = {}
for name_cell in active_teams_table.find_all('th', {'data-stat': 'franch_name'}):
    # The column's own header cell carries the same data-stat; skip it.
    if name_cell.text == 'Franchise':
        continue
    bbr_teams_dict[name_cell.text] = name_cell.a['href']

In [6]:
# For these franchises, swap the abbreviation found in the scraped path
# (first entry) for the one to use when building season URLs (second entry).
replace_dict = {
    'Brooklyn Nets': ['NJN', 'BRK'],
    'Charlotte Hornets': ['CHA', 'CHO'],
    'New Orleans Pelicans': ['NOH', 'NOP'],
}
for team, (old, new) in replace_dict.items():
    bbr_teams_dict[team] = bbr_teams_dict[team].replace(old, new)
bbr_teams_dict

{'Atlanta Hawks': '/teams/ATL/',
 'Boston Celtics': '/teams/BOS/',
 'Brooklyn Nets': '/teams/BRK/',
 'Charlotte Hornets': '/teams/CHO/',
 'Chicago Bulls': '/teams/CHI/',
 'Cleveland Cavaliers': '/teams/CLE/',
 'Dallas Mavericks': '/teams/DAL/',
 'Denver Nuggets': '/teams/DEN/',
 'Detroit Pistons': '/teams/DET/',
 'Golden State Warriors': '/teams/GSW/',
 'Houston Rockets': '/teams/HOU/',
 'Indiana Pacers': '/teams/IND/',
 'Los Angeles Clippers': '/teams/LAC/',
 'Los Angeles Lakers': '/teams/LAL/',
 'Memphis Grizzlies': '/teams/MEM/',
 'Miami Heat': '/teams/MIA/',
 'Milwaukee Bucks': '/teams/MIL/',
 'Minnesota Timberwolves': '/teams/MIN/',
 'New Orleans Pelicans': '/teams/NOP/',
 'New York Knicks': '/teams/NYK/',
 'Oklahoma City Thunder': '/teams/OKC/',
 'Orlando Magic': '/teams/ORL/',
 'Philadelphia 76ers': '/teams/PHI/',
 'Phoenix Suns': '/teams/PHO/',
 'Portland Trail Blazers': '/teams/POR/',
 'Sacramento Kings': '/teams/SAC/',
 'San Antonio Spurs': '/teams/SAS/',
 'Toronto Raptors': 

In [7]:
def scrape_team_season_table(team, season, table_id,
                             bbr_teams_dict=bbr_teams_dict, bbr_url=bbr_url,
                             delay=10):
    """
    Scrape one HTML table from a team's season page into a DataFrame.

    Parameters
    ----------
    team : str
        Key into ``bbr_teams_dict`` (e.g. 'Atlanta Hawks').
    season : int
        Value written to the 'season' column. The page requested is
        ``{season + 1}.html``, so this is presumably the year in which the
        season starts -- confirm.
    table_id : str
        HTML ``id`` of the table to extract (e.g. 'advanced', 'roster').
    bbr_teams_dict : dict, optional
        Maps team name to its site-relative path such as '/teams/ATL/'.
        The default is the object bound to the global name when this
        function is defined; rebinding the global later does not change it.
    bbr_url : str, optional
        Site root that the team path is appended to.
    delay : float, optional
        Seconds to sleep before issuing the request. Defaults to 10.

    Returns
    -------
    pandas.DataFrame
        One row per ``<tr>`` in the table body and one column per header
        cell ``data-stat`` (cells marked 'DUMMY' are skipped), plus
        'season' and 'team' columns. All scraped values are strings.

    Raises
    ------
    KeyError
        If ``team`` is not a key of ``bbr_teams_dict``.
    IndexError
        If the page has no table with the requested id, or a body row lacks
        one of the expected cells.
    """
    team_season_url = f'{bbr_url}{bbr_teams_dict[team]}{season+1}.html'
    # Pause before every request to keep the request rate low.
    time.sleep(delay)
    team_season_soup = fetch(team_season_url)

    table = team_season_soup.find('table', {'id': table_id})
    if table is None:
        # Same exception type the former `find_all(...)[0]` lookup raised,
        # but naming the table and page so the failure can be diagnosed.
        raise IndexError(
            f'no table with id {table_id!r} found at {team_season_url}')

    # Column names come from the header cells' data-stat attributes.
    cols = [th['data-stat'] for th in table.find_all('thead')[0].find_all('th')
            if th['data-stat'] != 'DUMMY']

    data = []
    for row in table.find_all('tbody')[0].find_all('tr'):
        # The first column is looked up as a <th> cell, the rest as <td>.
        first_cell = row.find_all('th', {'data-stat': cols[0]})[0].text
        other_cells = [row.find_all('td', {'data-stat': col})[0].text
                       for col in cols[1:]]
        data.append([first_cell] + other_cells)

    df = pd.DataFrame(data, columns=cols)
    df['season'] = season
    df['team'] = team
    return df

### Advanced

In [8]:
# Scrape the 'advanced' table for every team: one request per season,
# concatenated into a single frame per team.
print('Start Scraping...')
team_dfs = {}
for team in bbr_teams_dict:
    print(team)
    season_frames = []
    for season in range(2017, 2023):
        season_frames.append(scrape_team_season_table(team, season, 'advanced'))
    team_dfs[team] = pd.concat(season_frames)
    # Additional pause between teams.
    time.sleep(30)
print('Done.')

Start Scraping...
Atlanta Hawks
Boston Celtics
Brooklyn Nets
Charlotte Hornets
Chicago Bulls
Cleveland Cavaliers
Dallas Mavericks
Denver Nuggets
Detroit Pistons
Golden State Warriors
Houston Rockets
Indiana Pacers
Los Angeles Clippers
Los Angeles Lakers
Memphis Grizzlies
Miami Heat
Milwaukee Bucks
Minnesota Timberwolves
New Orleans Pelicans
New York Knicks
Oklahoma City Thunder
Orlando Magic
Philadelphia 76ers
Phoenix Suns
Portland Trail Blazers
Sacramento Kings
San Antonio Spurs
Toronto Raptors
Utah Jazz
Washington Wizards
Done.


In [9]:
# Stack the per-team frames into one table with a fresh 0..n-1 row index.
# ignore_index=True replaces the reset_index()/drop('index') round-trip,
# which would have dropped a genuine data column if one were named 'index'.
# NOTE(review): `df` and `team_dfs` are reassigned by the roster section
# further down; re-running this cell after that section uses roster data.
df = pd.concat(list(team_dfs.values()), ignore_index=True)
# ASCII transliteration of each name (e.g. 'Dennis Schröder' -> 'Dennis Schroder').
df['player_alt'] = df['player'].apply(unidecode)

In [10]:
# Write the combined table to disk; index=False leaves the row index out of the file.
df.to_csv('bbr_nba_advanced_stats.csv', index=False)

In [11]:
# Distinct (original, transliterated) name pairs where transliteration
# changed the spelling.
df.loc[
    df['player'].ne(df['player_alt']),
    ['player', 'player_alt'],
].drop_duplicates()

Unnamed: 0,player,player_alt
1,Dennis Schröder,Dennis Schroder
6,Ersan İlyasova,Ersan Ilyasova
20,Nicolás Brussino,Nicolas Brussino
70,Bogdan Bogdanović,Bogdan Bogdanovic
94,Timothé Luwawu-Cabarrot,Timothe Luwawu-Cabarrot
218,Juancho Hernangómez,Juancho Hernangomez
285,Džanan Musa,Dzanan Musa
356,Goran Dragić,Goran Dragic
402,Willy Hernangómez,Willy Hernangomez
485,Théo Maledon,Theo Maledon


### Roster

In [12]:
# Scrape the 'roster' table for every team: one request per season,
# concatenated into a single frame per team.
print('Start Scraping...')
team_dfs = {}
for team in bbr_teams_dict:
    print(team)
    season_frames = []
    for season in range(2017, 2023):
        season_frames.append(scrape_team_season_table(team, season, 'roster'))
    team_dfs[team] = pd.concat(season_frames)
    # Additional pause between teams.
    time.sleep(30)
print('Done.')

Start Scraping...
Atlanta Hawks
Boston Celtics
Brooklyn Nets
Charlotte Hornets
Chicago Bulls
Cleveland Cavaliers
Dallas Mavericks
Denver Nuggets
Detroit Pistons
Golden State Warriors
Houston Rockets
Indiana Pacers
Los Angeles Clippers
Los Angeles Lakers
Memphis Grizzlies
Miami Heat
Milwaukee Bucks
Minnesota Timberwolves
New Orleans Pelicans
New York Knicks
Oklahoma City Thunder
Orlando Magic
Philadelphia 76ers
Phoenix Suns
Portland Trail Blazers
Sacramento Kings
San Antonio Spurs
Toronto Raptors
Utah Jazz
Washington Wizards
Done.


In [13]:
# Stack the per-team frames into one table with a fresh 0..n-1 row index.
# ignore_index=True replaces the reset_index()/drop('index') round-trip,
# which would have dropped a genuine data column if one were named 'index'.
df = pd.concat(list(team_dfs.values()), ignore_index=True)
# ASCII transliteration of each name (e.g. 'Nicolás Brussino' -> 'Nicolas Brussino').
df['player_alt'] = df['player'].apply(unidecode)

In [14]:
# Write the combined table to disk; index=False leaves the row index out of the file.
df.to_csv('bbr_nba_roster_stats.csv', index=False)

In [15]:
# Distinct (original, transliterated) name pairs where transliteration
# changed the string.
# NOTE(review): the recorded output includes rows such as
# 'Johnny Juzang (TW)' whose two columns look identical -- presumably the
# original contains a non-ASCII space before '(TW)'; confirm.
df.loc[
    df['player'].ne(df['player_alt']),
    ['player', 'player_alt'],
].drop_duplicates()

Unnamed: 0,player,player_alt
4,Nicolás Brussino,Nicolas Brussino
12,Ersan İlyasova,Ersan Ilyasova
19,Dennis Schröder,Dennis Schroder
65,Bogdan Bogdanović,Bogdan Bogdanovic
98,Timothé Luwawu-Cabarrot,Timothe Luwawu-Cabarrot
...,...,...
3538,Johnny Juzang (TW),Johnny Juzang (TW)
3540,Micah Potter (TW),Micah Potter (TW)
3596,Anžejs Pasečņiks,Anzejs Pasecniks
3670,Quenton Jackson (TW),Quenton Jackson (TW)
