In [2]:
from urllib.parse import urljoin

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
import seaborn as sns
from bs4 import BeautifulSoup

In [27]:
def format_player_url(url_ln, name):
    """Guess a player's page URL from the letter-index URL and the display name.

    The guess is ``<url_ln>/<first 5 letters of the last token><first 2 letters
    of the first token>01.html``, built from a normalised copy of ``name``
    (lower-cased, with " III", " jr.", apostrophes and periods removed).

    Parameters
    ----------
    url_ln : str
        Base URL the file name is appended to (no trailing slash expected).
    name : str
        Player display name, e.g. ``"Ray Allen"``.

    Returns
    -------
    str or None
        The guessed URL, or ``None`` when ``name`` has fewer than two
        whitespace-separated tokens after normalisation.

    Notes
    -----
    The numeric suffix is always ``"01"``, so two names that share the same
    5+2 letter stem produce the same URL.
    """
    # " III" must be stripped *before* lower-casing; once the string is
    # lower-cased the upper-case pattern can never match.
    cleaned = (
        name.replace(" III", '')
        .lower()
        .replace(" jr.", '')
        .replace("'", '')
        .replace('.', '')
    )
    tokens = cleaned.split()

    if len(tokens) < 2:
        return None

    # Slicing already copes with tokens shorter than the cut-off, so the
    # last name is always truncated from the *last* token (not the second
    # one, which differs for names with three or more tokens).
    return url_ln + "/" + tokens[-1][:5] + tokens[0][:2] + "01.html"


In [28]:
# (output column, ``data-tip`` tooltip text of the label <span>) for every
# per-player statistic that is scraped from a player page.
_STAT_TOOLTIPS = (
    ('GP', 'Games'),
    ('PPG', 'Points'),
    ('APG', 'Assists'),
    ('RPG', 'Total Rebounds'),
    ('FG%', 'Field Goal Percentage'),
    ('3P%', '3-Point Field Goal Percentage'),
    ('FT%', 'Free Throw Percentage'),
    ('PER', '<b>Player Efficiency Rating</b><br>A measure of per-minute production standardized such that the league average is 15.'),
    ('Winshares', '<b>Win Shares</b><br>An estimate of the number of wins contributed by a player.'),
)

# Column order of the returned frame.
_BASIC_COLUMNS = ['Player', 'Career Length', 'Start Year', 'End Year'] + [
    column for column, _ in _STAT_TOOLTIPS
]


def _read_stat(player_soup, tip):
    """Return the number shown for the stat whose label span has ``data-tip == tip``.

    The value is taken from the second sibling element that follows the label
    span. ``np.nan`` is returned when the label is missing, when it has fewer
    than two following siblings, or when the value is empty or ``'-'``.
    """
    label = player_soup.find('span', {'data-tip': tip})
    if label is None:
        return np.nan

    siblings = label.findNextSiblings()
    if len(siblings) < 2:
        return np.nan

    text = siblings[1].text.strip()
    if text in ('', '-'):
        return np.nan
    return float(text)


def scrape_basic_stats(timeout=30):
    """Scrape basic statistics for every player listed on the a-z index pages.

    For each letter, the index page ``<base>/<letter>`` is fetched and the rows
    of the element with ``id="players"`` are walked (the first row is skipped).
    Players whose last season is before 1947 are ignored. For every remaining
    player the player page is fetched and the statistics named in
    ``_STAT_TOOLTIPS`` are read from it; each player's name is printed as a
    progress indicator once their stats have been read.

    Parameters
    ----------
    timeout : float, optional
        Per-request timeout in seconds passed to ``requests``. Without a
        timeout a stalled connection would block the scrape indefinitely.

    Returns
    -------
    pandas.DataFrame
        One row per scraped player with the columns ``Player``,
        ``Career Length``, ``Start Year``, ``End Year``, ``GP``, ``PPG``,
        ``APG``, ``RPG``, ``FG%``, ``3P%``, ``FT%``, ``PER``, ``Winshares``.
        Statistics that are missing on the player page are ``NaN``.

    Raises
    ------
    RuntimeError
        If an index page does not contain the ``players`` table (for example
        because the request was rejected).
    requests.RequestException
        On connection errors or timeouts.
    """
    url = "https://www.basketball-reference.com/players/"
    alphabet = "abcdefghijklmnopqrstuvwxyz"
    records = []

    # One session for the whole run so the connection is reused across the
    # many requests issued below.
    with requests.Session() as session:
        for letter in alphabet:
            url_ln = url + letter
            r = session.get(url_ln, timeout=timeout)
            soup = BeautifulSoup(r.text, 'html.parser')

            table = soup.find(id='players')
            if table is None:
                raise RuntimeError(
                    "No 'players' table found at %s (HTTP %s)" % (url_ln, r.status_code)
                )

            # The first <tr> is skipped, as before.
            for row in table.findAll('tr')[1:]:
                links = row.findAll('a')
                year_min = row.findAll('td', {'data-stat': 'year_min'})
                year_max = row.findAll('td', {'data-stat': 'year_max'})
                if not links or not year_min or not year_max:
                    # Not a player row (no name link or no career years).
                    continue

                name = links[0].text
                start_yr = int(year_min[0].text)
                end_yr = int(year_max[0].text)

                if end_yr < 1947:
                    continue

                # Prefer the link that carries the player's name over a URL
                # guessed from the name: the guess always ends in "01", so
                # players sharing a name stem would all resolve to one page.
                # NOTE(review): assumes the first link in the row points to the
                # player's own page - confirm against the index markup.
                href = links[0].get('href')
                if href:
                    url_player = urljoin(url_ln, href)
                else:
                    url_player = format_player_url(url_ln, name)

                if url_player is None:
                    continue

                player_r = session.get(url_player, timeout=timeout)
                player_soup = BeautifulSoup(player_r.text, 'html.parser')

                record = {
                    'Player': name,
                    'Career Length': end_yr - start_yr + 1,
                    'Start Year': start_yr,
                    'End Year': end_yr,
                }
                for column, tip in _STAT_TOOLTIPS:
                    record[column] = _read_stat(player_soup, tip)

                print(name)
                records.append(record)

    return pd.DataFrame(records, columns=_BASIC_COLUMNS)

In [29]:
# Run the full scrape and keep the resulting DataFrame in `basic_data`.
# This issues one HTTP request per letter index page plus one per player,
# and prints each player's name as it goes, so the cell is slow and its
# output is long.
basic_data = scrape_basic_stats()

Alaa Abdelnaby
Zaid Abdul-Aziz
Kareem Abdul-Jabbar
Mahmoud Abdul-Rauf
Tariq Abdul-Wahad
Shareef Abdur-Rahim
Tom Abernethy
Forest Able
John Abramovic
Álex Abrines
Precious Achiuwa
Alex Acker
Don Ackerman
Mark Acres
Bud Acton
Quincy Acy
Alvan Adams
Don Adams
George Adams
Hassan Adams
Jaylen Adams
Jordan Adams
Michael Adams
Steven Adams
Rafael Addison
Bam Adebayo
Deng Adel
Rick Adelman
Jeff Adrien
Arron Afflalo
Maurice Ager
Mark Aguirre
Blake Ahearn
Danny Ainge
Matthew Aitch
Alexis Ajinça
Henry Akin
Josh Akognon
DeVaughn Akoon-Purcell
Solomon Alabi
Mark Alarie
Gary Alcorn
Santi Aldama
Furkan Aldemir
Cole Aldrich
LaMarcus Aldridge
Chuck Aleksinas
Cliff Alexander
Cory Alexander
Courtney Alexander
Gary Alexander
Joe Alexander
Kyle Alexander
Ty-Shon Alexander
Victor Alexander
Nickeil Alexander-Walker
Steve Alford
Rawle Alkins
Bill Allen
Bob Allen
Grayson Allen
Jarrett Allen
Jerome Allen
Kadeem Allen
Lavoy Allen
Lucius Allen
Malik Allen
Randy Allen
Ray Allen
Tony Allen
Willie Allen
Odis Alliso

In [26]:
# Persist the scraped frame as CSV under the relative path "data/basic-data"
# (note: no ".csv" extension). With pandas' defaults the row index is written
# as the first column.
# NOTE(review): this cell's execution count (26) is lower than that of the
# scrape cell above (29), so the file on disk may come from an earlier run of
# `basic_data` - re-run top to bottom before relying on it.
# NOTE(review): presumably the "data" directory must already exist - confirm.
basic_data.to_csv("data/basic-data")

In [None]:
# Scratch fragment (never executed): collects per-player award counts from the
# <ul id="bling"> element of a player page.
# It relies on names that are not defined in this cell: `player_soup` and the
# lists `hof`, `rings`, `all_star_appearances`, `mvps`, `all_nba`, `all_def`.
# NOTE(review): everything after the first statement is indented as if it sat
# inside a nested loop, so this cell is not valid Python at top level as
# written; it looks like it is meant to be pasted into a per-player loop.
blings = player_soup.find("ul", {"id": "bling"})

                if(blings):
                    li = blings.findChildren()
                    # One flag per award so that a default 0 can be appended
                    # below for every award that was not seen; this keeps all
                    # six lists the same length (one entry per player).
                    hof_found = False
                    rings_found = False
                    all_star_found = False
                    all_nba_found = False
                    all_def_found = False
                    mvp_found = False
        
                    # Only odd positions of `li` are inspected.
                    # NOTE(review): presumably findChildren() yields each list
                    # item followed by its nested element - confirm.
                    # NOTE(review): `i` would shadow an outer loop variable of
                    # the same name if this is pasted into a loop that uses `i`.
                    for i in range(1, len(li), 2):
                        l = li[i].text
                        # Hall of Fame is checked independently of the
                        # if/elif chain that follows.
                        if "Hall of Fame" in l:
                            hof.append(1)
                            hof_found = True
                        # For the remaining awards the text before the first
                        # "x" is stored. That value is a *string*, while the
                        # defaults appended further down are the integer 0, so
                        # the lists end up with mixed types; if the text has no
                        # "x", the whole text is stored.
                        if "NBA Champ" in l:
                            rings.append(l.split("x")[0])
                            rings_found = True
                        elif "All Star" in l:
                            all_star_appearances.append(l.split("x")[0])
                            all_star_found = True
                        # Texts containing "Finals" or "AS" are excluded from
                        # the MVP count.
                        elif "MVP" in l and "Finals" not in l and "AS" not in l:
                            mvps.append(l.split("x")[0])
                            mvp_found = True
                        elif "All-NBA" in l:
                            all_nba.append(l.split("x")[0])
                            all_nba_found = True
                        elif "All-Defensive" in l:
                            all_def.append(l.split("x")[0])
                            all_def_found = True
                    
                    # Fill in 0 for every award that did not appear.
                    if hof_found == False:
                        hof.append(0)
                    if mvp_found == False:
                        mvps.append(0)
                    if all_star_found == False:
                        all_star_appearances.append(0)
                    if all_nba_found == False:
                        all_nba.append(0)
                    if rings_found == False:
                        rings.append(0)
                    if all_def_found == False:
                        all_def.append(0)
                
                # No "bling" list on the page: record 0 for every award.
                else:
                    hof.append(0)
                    rings.append(0)
                    all_star_appearances.append(0)
                    all_nba.append(0)
                    mvps.append(0)
                    all_def.append(0)