In [1]:
from bs4 import BeautifulSoup
import requests
from urllib.parse import urljoin
import re
import numpy as np
import pandas as pd
import json

## Dataset Generation

In [70]:
# Page that create_champion_dict is run against to build the season -> champion lookup.
nba_champion_url = 'https://www.basketball-reference.com/playoffs/'

# Team-season finder query: advanced stats, NBA only, seasons from 1980 on,
# ordered by wins, starting at offset 0. Split across lines for readability;
# adjacent literals concatenate to the exact same URL.
nba_team_stats_url = (
    'https://www.basketball-reference.com/play-index/tsl_finder.cgi'
    '?request=1&match=single&type=advanced&year_min=1980&year_max=&lg_id=NBA&franch_id='
    '&c1stat=&c1comp=&c1val=&c2stat=&c2comp=&c2val='
    '&c3stat=&c3comp=&c3val=&c4stat=&c4comp=&c4val='
    '&order_by=wins&order_by_asc=&offset=0'
)

# URL reported by the response to the finder query (this issues a real HTTP GET
# when the cell runs). create_team_dataset resolves 'Next page' hrefs against it.
base_url = requests.get(nba_team_stats_url).url
def get_soup_from_url(url, timeout=30):
    """Download ``url`` and parse the response body into a BeautifulSoup tree.

    Parameters
    ----------
    url : str
        Address of the page to fetch with an HTTP GET.
    timeout : float or tuple, optional
        Seconds to wait for the server, passed straight to ``requests.get``.
        Defaults to 30 so a stalled connection cannot hang the notebook.

    Returns
    -------
    bs4.BeautifulSoup
        The response text parsed with the ``'html.parser'`` backend.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status code.
    requests.RequestException
        For connection problems or when ``timeout`` expires.
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly on error responses (rate limiting, missing pages) instead of
    # parsing the error page and silently producing a tree with no data rows.
    response.raise_for_status()
    return BeautifulSoup(response.text, 'html.parser')
def create_champion_dict(soup, min_year=1980):
    """Build a ``{season: champion}`` lookup from a parsed playoffs table.

    A ``<tr>`` is treated as a season row when its first ``<th>`` carries
    ``data-stat="year_id"`` and the row also contains a ``<td>`` and an
    ``<a>``. The text of the row's first ``<a>`` is read as the year; the
    champion is the text of the ``<td>`` with ``data-stat="champion"``.

    Parameters
    ----------
    soup : bs4.BeautifulSoup
        Parsed page whose ``<tr>`` elements are scanned in document order.
    min_year : int, optional
        Scanning stops at the first season row whose year is below this
        value (default 1980). This assumes rows are listed newest first.

    Returns
    -------
    dict[str, str]
        Keys are season strings such as ``'1999-00'`` (previous year, a dash,
        then the two-digit year read from the row); values are champion names.

    Raises
    ------
    ValueError
        If the first link of a season row does not contain an integer.
    """
    champions = {}
    for row in soup.find_all('tr'):
        headers = row.find_all('th')
        year_link = row.find('a')
        is_season_row = (
            headers
            and headers[0].get('data-stat') == 'year_id'
            and row.find('td')
            and year_link
        )
        if not is_season_row:
            continue

        year = int(year_link.text)
        if year < min_year:
            break

        champ = next(
            (cell.text for cell in row.find_all('td')
             if cell.get('data-stat') == 'champion'),
            None,
        )
        if champ is None:
            # Season row without a champion cell: nothing to record.
            continue

        # Zero-pad the two-digit year so 2000 gives '1999-00' and 2001 gives
        # '2000-01'; unpadded keys ('1999-0') never match the season strings
        # that create_team_dataset looks up.
        champions[f'{year - 1}-{year % 100:02d}'] = champ
    return champions
def create_team_dataset(soup, champions):
    """Collect one record per team-season from a paginated stats table.

    Every ``<tr>`` whose first ``<th>`` carries ``data-stat="ranker"`` and
    that also contains a ``<td>`` and an ``<a>`` becomes a record. After each
    page is processed, the first link whose text is ``'Next page'`` is
    fetched with ``get_soup_from_url`` (its href resolved against the
    module-level ``base_url``); scraping stops when no such link exists.

    Parameters
    ----------
    soup : bs4.BeautifulSoup
        Parsed first page of results.
    champions : dict[str, str]
        Season string -> champion name, compared against each row's team
        title to fill the ``Champion`` column.

    Returns
    -------
    pandas.DataFrame
        Columns appear in the order cells occur in a row: ``Team`` (the title
        of the row's first link followed by a space and the season),
        ``Champion`` (bool) and the float columns ``win_loss_pct``,
        ``efg_pct``, ``off_rtg`` and ``def_rtg``. ``Team`` is an ordinary
        column and the index is the default integer range. The frame is
        empty if no rows matched.
    """
    float_stats = ('win_loss_pct', 'efg_pct', 'off_rtg', 'def_rtg')
    rows_list = []

    while soup is not None:
        for row in soup.find_all('tr'):
            headers = row.find_all('th')
            team_link = row.find('a')
            # Rows without any <th> are skipped instead of raising IndexError.
            is_data_row = (
                headers
                and headers[0].get('data-stat') == 'ranker'
                and row.find('td')
                and team_link
            )
            if not is_data_row:
                continue

            current_row = {'Team': team_link.get('title')}
            for stat in row.find_all('td'):
                stat_name = stat.get('data-stat')
                if stat_name == 'season':
                    season = stat.text
                    # Compare before the season is appended to the team name.
                    current_row['Champion'] = current_row['Team'] == champions.get(season)
                    current_row['Team'] += ' ' + season
                elif stat_name in float_stats:
                    raw_value = stat.text.strip()
                    # A blank cell becomes NaN rather than a ValueError.
                    current_row[stat_name] = float(raw_value) if raw_value else float('nan')
            rows_list.append(current_row)

        next_soup = None
        for link in soup.find_all('a'):
            if link.text == 'Next page':
                next_soup = get_soup_from_url(urljoin(base_url, link.get('href')))
                # Stop at the first match so a page that repeats the link
                # does not trigger a second download of the same URL.
                break
        soup = next_soup

    # No set_index call here: DataFrame.set_index returns a new frame, so
    # calling it without assignment changes nothing and raises KeyError on
    # an empty scrape.
    return pd.DataFrame(rows_list)

In [71]:
# Build the champion lookup first; create_team_dataset needs it to fill the
# 'Champion' column while it walks the paginated team table.
champion_page = get_soup_from_url(nba_champion_url)
champions = create_champion_dict(champion_page)

team_stats_page = get_soup_from_url(nba_team_stats_url)
dataset = create_team_dataset(team_stats_page, champions)

In [72]:
# Write the scraped table to a CSV file; the relative path resolves against
# the current working directory.
dataset.to_csv(path_or_buf='nba_team_data.csv')