In [40]:
from bs4 import BeautifulSoup
import requests
from urllib.parse import urljoin
import re
import numpy as np
import pandas as pd
import json

## Dataset Generation Functions
Stats scraped from [Basketball Reference](https://www.basketball-reference.com/) and [StatMuse](https://www.statmuse.com/nba)

In [41]:
team_abbrevs = {'ATL': {'name': 'atlanta-hawks', 'conference': 'East'},'BRK': {'name': 'brooklyn-nets', 'conference': 'East'},'BOS': {'name': 'boston-celtics', 'conference': 'East'},'CHA': {'name': 'charlotte-bobcats', 'conference': 'East'},'CHO': {'name': 'charlotte-hornets', 'conference': 'East'},'CHI': {'name': 'chicago-bulls', 'conference': 'East'},'CHH': {'name': 'charlotte-hornets', 'conference': 'East'},'CLE': {'name': 'cleveland-cavaliers', 'conference': 'East'},'DAL': {'name': 'dallas-mavericks', 'conference': 'West'},'DEN': {'name': 'denver-nuggets', 'conference': 'West'},'DET': {'name': 'detroit-pistons', 'conference': 'East'},'GSW': {'name': 'golden-state-warriors', 'conference': 'West'},'HOU': {'name': 'houston-rockets', 'conference': 'West'},'IND': {'name': 'indiana-pacers', 'conference': 'East'},'KCK': {'name': 'kansas-city-kings', 'conference': 'West'},'LAC': {'name': 'los-angeles-clippers', 'conference': 'West'},'LAL': {'name': 'los-angeles-lakers', 'conference': 'West'},'MEM': {'name': 'memphis-grizzlies', 'conference': 'West'},'MIA': {'name': 'miami-heat', 'conference': 'East'},'MIL': {'name': 'milwaukee-bucks', 'conference': 'East'},'MIN': {'name': 'minnesota-timberwolves', 'conference': 'West'},'NJN': {'name': 'new-jersey-nets', 'conference': 'East'},'NOH': {'name': 'new-orleans-hornets', 'conference': 'West'},'NOP': {'name': 'new-orleans-pelicans', 'conference': 'West'},'NOK':{'name': 'new-orleans-hornets', 'conference': 'West'},'NYK': {'name': 'new-york-knicks', 'conference': 'East'},'OKC': {'name': 'oklahoma-city-thunder', 'conference': 'West'},'ORL': {'name': 'orlando-magic', 'conference': 'East'},'PHI': {'name': 'philadelphia-76ers', 'conference': 'East'},'PHO': {'name': 'phoenix-suns', 'conference': 'West'},'POR': {'name': 'portland-trail-blazers', 'conference': 'West'},'SEA': {'name': 'seattle-supersonics', 'conference': 'West'},'SAC': {'name': 'sacramento-kings', 'conference': 'West'},'SAS': {'name': 'san-antonio-spurs', 'conference': 
'West'},'SDC': {'name': 'san-diego-clippers', 'conference': 'West'},'TOR': {'name': 'toronto-raptors', 'conference': 'East'},'UTA': {'name': 'utah-jazz', 'conference': 'West'},'VAN': {'name': 'vancouver-grizzlies', 'conference': 'West'},'WAS': {'name': 'washington-wizards', 'conference': 'East'},'WSB': {'name': 'washington-bullets', 'conference': 'East'}}
def get_soup_from_url(url, timeout=30):
    """Fetch `url` with an HTTP GET and parse the response text with the lxml parser.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before `requests` gives up.
            Without a timeout a single stalled connection would block the
            whole scrape indefinitely.

    Returns:
        A BeautifulSoup tree built from the response body. The HTTP status is
        not checked, so error pages are parsed like any other page.
    """
    response = requests.get(url, timeout=timeout)
    return BeautifulSoup(response.text, 'lxml')
def _statmuse_answer_payloads(soup):
    """Return the `answer` attribute of the first <visual-answer> under each <div> that has one."""
    tags = (div.find('visual-answer') for div in soup.find_all('div'))
    return [tag.get('answer') for tag in tags if tag is not None]


def _statmuse_first_row(answer_json):
    """Decode an answer payload and return the first row of its first detail grid."""
    payload = json.loads(answer_json)
    return payload.get('visual').get('detail')[0].get('grids')[0].get('rows')[0]


def _statmuse_stat_value(row, key):
    """Return row[key]['value'] as a float, or 0 when the stat is missing or empty."""
    cell = row.get(key)
    if cell and cell.get('value'):
        return float(cell.get('value'))
    return 0


def statmuse_get_stats_for_player(name, season, team, player_dict):
    """Fill `player_dict` with a player's October-to-January stats from StatMuse.

    Two StatMuse "ask" pages are downloaded: one for the player's stats and one
    for the team's winning percentage, both covering October of the season's
    first year through January of the following year.

    Args:
        name: Player name; its whitespace-separated words are joined with '-'
            to form the query (so single-word and three-word names work too).
        season: Season label whose text before the first '-' is the starting
            year, e.g. '1999-00'.
        team: Team abbreviation; must be a key of `team_abbrevs`.
        player_dict: Dict updated in place with the keys 'PPG', 'RPG', 'APG',
            'BPG', 'SPG', 'GP' and 'W%'. A stat that is absent from the answer
            is stored as 0.

    Returns:
        True when both pages contained an answer and `player_dict` was filled,
        False when either page had no <visual-answer> element (in which case
        `player_dict` is left untouched).
    """
    season_start = season.split('-')[0]
    # The window always ends in January of the next calendar year; computing it
    # numerically also covers the 1999 -> 2000 century rollover.
    season_end = str(int(season_start) + 1)
    player_slug = '-'.join(name.split())
    search_query = f'https://www.statmuse.com/nba/ask/{player_slug}-nba-stats-from-october-{season_start}-to-january-{season_end}'
    team_name = team_abbrevs.get(team).get('name')
    team_search_query = f'https://www.statmuse.com/nba/ask/{team_name}-winning-percentage-from-october-{season_start}-to-january-{season_end}'
    soup = get_soup_from_url(search_query)
    team_soup = get_soup_from_url(team_search_query)
    try:
        data_list = _statmuse_answer_payloads(soup)
        team_data_list = _statmuse_answer_payloads(team_soup)
        if len(data_list) == 0 or len(team_data_list) == 0:
            return False
        item = _statmuse_first_row(data_list[0])
        team_item = _statmuse_first_row(team_data_list[0])
        # Each stat is looked up under its own key (RPG was previously guarded
        # by a check on 'PPG', which crashed when only RPG was missing).
        for stat in ('PPG', 'RPG', 'APG', 'BPG', 'SPG', 'GP'):
            player_dict[stat] = _statmuse_stat_value(item, stat)
        player_dict['W%'] = _statmuse_stat_value(team_item, 'W%')
        return True
    finally:
        # Release both parse trees on every exit path, including the early
        # "no answer" return.
        soup.decompose()
        team_soup.decompose()
def add_players(soup, rows_list, is_all_star, position):
    """Scrape every player-season row from a paginated results table into `rows_list`.

    For each data row (a <tr> whose first <th> has data-stat 'ranker' and that
    has at least one <td>), a dict is built with the keys 'All-Star',
    'Position', 'Player' ("<name> <season>"), 'Age' and 'Conference', and is
    then enriched by `statmuse_get_stats_for_player`. The dict is appended to
    `rows_list` only when that call reports success.

    Rows whose team cell has no link (recorded as 'TOT') and rows that lack a
    player, season or team cell are skipped.

    After a page is processed the first link whose text is 'Next page' is
    followed; its href is resolved against the module-level `base_url`, which
    must be defined before this function is called.

    Args:
        soup: Parsed first results page. It is decomposed once a following
            page is loaded, so the caller must not reuse it afterwards.
        rows_list: List that receives one dict per accepted player-season.
        is_all_star: Value stored under 'All-Star' for every row.
        position: Value stored under 'Position' for every row.

    Returns:
        None; results are delivered through `rows_list`.
    """
    count = 0
    while True:
        for row in soup.find_all('tr'):
            header = row.find('th')
            if header is None or header.get('data-stat') != 'ranker' or not row.find('td'):
                continue
            # 'Player' is reserved up front so the column order stays
            # All-Star, Position, Player, ... once the name and season are known.
            current_row = {'All-Star': is_all_star, 'Position': position, 'Player': None}
            # Reset per row so a row with a missing cell can never inherit the
            # previous row's values.
            name = season = team = None
            for stat in row.find_all('td'):
                kind = stat.get('data-stat')
                if kind == 'player':
                    name = stat.text
                elif kind == 'season':
                    season = stat.text
                elif kind == 'age':
                    current_row['Age'] = stat.text
                elif kind == 'team_id':
                    team_link = stat.find('a')
                    if team_link:
                        team = team_link.text
                        current_row['Conference'] = team_abbrevs.get(team).get('conference')
                    else:
                        team = 'TOT'
            if name is None or season is None or team is None or team == 'TOT':
                continue
            current_row['Player'] = name + ' ' + season
            if statmuse_get_stats_for_player(name, season, team, current_row):
                print(count)
                count += 1
                rows_list.append(current_row)
        # Read the href BEFORE decomposing: the link belongs to the tree that
        # decompose() destroys. Only the first 'Next page' link is followed so a
        # page with several such links is not fetched more than once.
        next_href = next(
            (link.get('href') for link in soup.find_all('a') if link.text == 'Next page'),
            None,
        )
        if next_href is None:
            break
        soup.decompose()
        soup = get_soup_from_url(urljoin(base_url, next_href))

In [42]:
# Player-season finder queries (seasons from 1980 on, ordered by season).
# `is_as=Y` / `is_as=N` selects all-star vs. non-all-star seasons, and the
# `pos_is_*` flags select the position group: g/gf for guards, f/fg/fc/c/cf for bigs.
all_star_guards_url = "https://www.basketball-reference.com/play-index/psl_finder.cgi?request=1&match=single&type=totals&per_minute_base=36&per_poss_base=100&season_start=1&season_end=-1&lg_id=NBA&age_min=0&age_max=99&is_playoffs=N&height_min=0&height_max=99&year_min=1980&birth_country_is=Y&is_as=Y&as_comp=gt&as_val=0&pos_is_g=Y&pos_is_gf=Y&order_by=season"
all_star_bigs_url = "https://www.basketball-reference.com/play-index/psl_finder.cgi?request=1&match=single&type=totals&per_minute_base=36&per_poss_base=100&season_start=1&season_end=-1&lg_id=NBA&age_min=0&age_max=99&is_playoffs=N&height_min=0&height_max=99&year_min=1980&birth_country_is=Y&is_as=Y&as_comp=gt&as_val=0&pos_is_f=Y&pos_is_fg=Y&pos_is_fc=Y&pos_is_c=Y&pos_is_cf=Y&order_by=season"
# The guard query must use the guard position flags; it previously repeated the
# forward/center flags, so rows labelled 'G' were really a second copy of the bigs.
# NOTE(review): this is the only query with `year_max=2019` - confirm whether the
# other three should be capped the same way.
non_all_star_guards_url = "https://www.basketball-reference.com/play-index/psl_finder.cgi?request=1&match=single&type=totals&per_minute_base=36&per_poss_base=100&season_start=1&season_end=-1&lg_id=NBA&age_min=0&age_max=99&is_playoffs=N&height_min=0&height_max=99&year_min=1980&year_max=2019&birth_country_is=Y&is_as=N&as_comp=gt&as_val=0&pos_is_g=Y&pos_is_gf=Y&order_by=season"
non_all_star_bigs_url = "https://www.basketball-reference.com/play-index/psl_finder.cgi?request=1&match=single&type=totals&per_minute_base=36&per_poss_base=100&season_start=1&season_end=-1&lg_id=NBA&age_min=0&age_max=99&is_playoffs=N&height_min=0&height_max=99&year_min=1980&birth_country_is=Y&is_as=N&as_comp=gt&as_val=0&pos_is_f=Y&pos_is_fg=Y&pos_is_fc=Y&pos_is_c=Y&pos_is_cf=Y&order_by=season"
# URL reported by the response to the first query; add_players resolves the
# relative 'Next page' links against it.
base_url = requests.get(all_star_guards_url).url
# First results page of each query, parsed and ready for add_players.
all_star_guards_soup = get_soup_from_url(all_star_guards_url)
non_all_star_guards_soup = get_soup_from_url(non_all_star_guards_url)
all_star_bigs_soup = get_soup_from_url(all_star_bigs_url)
non_all_star_bigs_soup = get_soup_from_url(non_all_star_bigs_url)

In [None]:
# Accumulates one dict per scraped player-season; add_players appends in place.
rows_list = []
# All-star seasons first (guards, then bigs), followed by the non-all-star seasons.
add_players(all_star_guards_soup, rows_list, is_all_star=True, position='G')
add_players(all_star_bigs_soup, rows_list, is_all_star=True, position='F/C')
add_players(non_all_star_guards_soup, rows_list, is_all_star=False, position='G')
add_players(non_all_star_bigs_soup, rows_list, is_all_star=False, position='F/C')

In [45]:
# Build the table from the scraped rows and index it by the 'Player' label.
# set_index returns a new frame, so its result has to be kept; previously it was
# discarded and the CSV was written with the default integer index instead.
dataset = pd.DataFrame(rows_list).set_index('Player')
dataset.to_csv('nba_player_data_through_jan.csv')