In [2]:
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup, SoupStrainer
import sys
import string
import requests
import datetime
import time

Baksetball Reference is accessed through this url: http://www.basketball-reference.com


### Basic player information

By adding /players/'letter', we can view basic info for all active/retired NBA & ABA players with last names starting with that letter. For example, https://www.basketball-reference.com/players/a/ would give the following information for each player with last name starting with 'A':
- From: int variable, career start year
- To: int variable, career end year
- Pos: string variable, basketball position abbreviation
- Ht: string variable, height in feet-inches
- Wt: string variable, weight in pounds
- Brith Date: string variable
- Colleges: string variable, blank if international or did not play in college

In [3]:
def player_info():
    
    players = []
    player_base_url = 'http://www.basketball-reference.com/players/'

    for letter in string.ascii_lowercase[0]: # get player tables from alphabetical list pages
        page_request = requests.get(player_base_url + letter)
        soup = BeautifulSoup(page_request.text,"lxml")
        table = soup.find('table') # find table in soup

        if table:
            table_body = table.find('tbody')
            for row in table_body.findAll('tr'):  # loop over list of players in the table
                player_url = row.find('a') 
                player_pages = player_url['href']  # player page url
                player_names = player_url.text  #p player name

                # get additional player info from table
                cells = row.findAll('td')
                active_from = int(cells[0].text) # 'From' column
                active_to = int(cells[1].text) # 'To' column
                position = cells[2].text # 'Pos' column
                height = cells[3].text # 'Ht' column (feet-inches)
                weight = cells[4].text # 'Wt' column (lbs)
                birth_date = cells[5].text # 'Birth Date' column
                college = cells[6].text # 'Colleges' column (blank is either no college or intl)

                # create entry
                player_entry = {'url': player_pages,
                                'name': player_names,
                                'active_from': active_from,
                                'active_to': active_to,
                                'position': position,
                                'college': college,
                                'height': height,
                                'weight': weight,
                                'birth_date': birth_date}

                # append player dictionary
                players.append(player_entry)
                
    return pd.DataFrame(players)

In [22]:
players_general_df = player_info() # call function that scrapes general info

# convert height to inches
height_inches = players_general_df['height'].str.split('-',expand=True)
players_general_df['height_inches'] = 12.0*pd.to_numeric(height_inches[0], errors='coerce')+pd.to_numeric(height_inches[1], errors='coerce')


players_general_df.head(10) # preview

Unnamed: 0,active_from,active_to,birth_date,college,height,name,position,url,weight,height_inches
0,1991,1995,"June 24, 1968",Duke University,6-10,Alaa Abdelnaby,F-C,/players/a/abdelal01.html,240,82.0
1,1969,1978,"April 7, 1946",Iowa State University,6-9,Zaid Abdul-Aziz,C-F,/players/a/abdulza01.html,235,81.0
2,1970,1989,"April 16, 1947","University of California, Los Angeles",7-2,Kareem Abdul-Jabbar,C,/players/a/abdulka01.html,225,86.0
3,1991,2001,"March 9, 1969",Louisiana State University,6-1,Mahmoud Abdul-Rauf,G,/players/a/abdulma02.html,162,73.0
4,1998,2003,"November 3, 1974","University of Michigan, San Jose State University",6-6,Tariq Abdul-Wahad,F,/players/a/abdulta01.html,223,78.0
5,1997,2008,"December 11, 1976",University of California,6-9,Shareef Abdur-Rahim,F,/players/a/abdursh01.html,225,81.0
6,1977,1981,"May 6, 1954",Indiana University,6-7,Tom Abernethy,F,/players/a/abernto01.html,220,79.0
7,1957,1957,"July 27, 1932",Western Kentucky University,6-3,Forest Able,G,/players/a/ablefo01.html,180,75.0
8,1947,1948,"February 9, 1919",Salem International University,6-3,John Abramovic,F,/players/a/abramjo01.html,195,75.0
9,2017,2018,"August 1, 1993",,6-6,Alex Abrines,G-F,/players/a/abrinal01.html,190,78.0


In [25]:
# save in csv format
players_general_df.to_csv('Tables/players_general_df.csv', index=False)

### Detailed player information

The url of each player follows the below format:

* /players/(first letter of the last name)/(first 5 letters of last name)(first 2 letters of first name)(01 unless there's another player that fits the prior name setup, else it's 02, 03, etc).html *

Example, the end of the url for John Wall is https://www.basketball-reference.com/players/w/walljo01.html

Each player's url includes much more information (season, awards, salary, etc.), but we will focus on a couple of statistics, such as career averages.

In [9]:
def player_detail_info(url):
    '''
    scrape player's personal page. Input is players url (without  www.basketball-reference.com)
    '''
    # we do not need to parse the whole page since the information we are interested in is only a small part
    personal = SoupStrainer('p')
    page_request = requests.get('http://www.basketball-reference.com' + url)
    soup = BeautifulSoup(page_request.text,"lxml",parse_only=personal) # parse only part we are interested in
    p = soup.findAll('p') 

    # initialize some values in case they are unavailable
    shooting_hand = None
    high_school = None
    draft = None
    ppg = None
    trb = None
    ast = None

    # loop over personal info to get certain information
    for prow in p:
        # look for shoots field
        if 'Shoots:' in prow.text:
            s = prow.text.replace('\n','').split(u'\u25aa') # clean text
            if len(s)>1:
                shoots = s[1].split(':')[1].lstrip().rstrip()
        elif 'High School:' in prow.text:
            s = prow.text.replace('\n','').split(':') 
            if len(s)>1:
                high_school = s[1].lstrip()
        elif 'Draft:' in prow.text:
            s = prow.text.replace('\n','').split(':')
            if len(s)>1:
                draft = s[1].lstrip()
        elif 'Career' in prow.text:
            s = prow.find_next_siblings('p')[3]
            ppg = float(str(s).strip('"<p>""</p>"'))
            s = prow.find_next_siblings('p')[5]
            trb = float(str(s).strip('"<p>""</p>"'))            
            s = prow.find_next_siblings('p')[7]
            ast = float(str(s).strip('"<p>""</p>"'))  

    # create dictionary with all of the info            
    player_entry = {'url': url,
                    'shooting_hand': shoots,
                    'high_school': high_school,
                    'draft': draft,
                    'ppg' : ppg,
                    'trb' : trb,
                    'ast' : ast}

    return player_entry

In [11]:
players_details_info_list = []
for i,url in enumerate(players_general_info.url):
    try:
        players_details_info_list.append(player_detail_info(url))
    except:
        print('cannot load: %s; location %d' %(url,i)) 

cannot load: /players/a/abramjo01.html; location 8
cannot load: /players/a/aubucch01.html; location 144


In [19]:
players_detail_df = pd.DataFrame(players_details_info_list) # convert to dateframe 
players_detail_df.head() # preview

Unnamed: 0,ast,draft,high_school,ppg,shooting_hand,trb,url
0,0.3,"Portland Trail Blazers, 1st round (25th pick, ...","Bloomfield in Bloomfield, New Jersey",5.7,Right,3.3,/players/a/abdelal01.html
1,1.2,"Cincinnati Royals, 1st round (5th pick, 5th ov...","John Jay in Brooklyn, New York",9.0,Right,8.0,/players/a/abdulza01.html
2,3.6,"Milwaukee Bucks, 1st round (1st pick, 1st over...","Power Memorial in New York, New York",24.6,Right,11.2,/players/a/abdulka01.html
3,3.5,"Denver Nuggets, 1st round (3rd pick, 3rd overa...","Gulfport in Gulfport, Mississippi",14.6,Right,1.9,/players/a/abdulma02.html
4,1.1,"Sacramento Kings, 1st round (11th pick, 11th o...","Lycee Aristide Briand in Evreux, France",7.8,Right,3.3,/players/a/abdulta01.html


In [15]:
# save in csv format
players_detail_df.to_csv('Tables/players_detail_df.csv', index=False)

### Merge two dataframes

In [26]:
# merge two dataframes: players_general_info and players_detail_df
players_merged_df = players_general_df.merge(players_detail_df,how='outer',on='url')

# reorganize columns
players_merged_df = players_merged_df[['name', 'active_from', 'active_to', 'birth_date',
                                        'position', 'ppg', 'trb', 'ast', 'height_inches', 'weight', 
                                        'shooting_hand', 'draft', 'college', 'high_school', 'url']]

#previes
players_merged_df.head()

Unnamed: 0,name,active_from,active_to,birth_date,position,ppg,trb,ast,height_inches,weight,shooting_hand,draft,college,high_school,url
0,Alaa Abdelnaby,1991,1995,"June 24, 1968",F-C,5.7,3.3,0.3,82.0,240,Right,"Portland Trail Blazers, 1st round (25th pick, ...",Duke University,"Bloomfield in Bloomfield, New Jersey",/players/a/abdelal01.html
1,Zaid Abdul-Aziz,1969,1978,"April 7, 1946",C-F,9.0,8.0,1.2,81.0,235,Right,"Cincinnati Royals, 1st round (5th pick, 5th ov...",Iowa State University,"John Jay in Brooklyn, New York",/players/a/abdulza01.html
2,Kareem Abdul-Jabbar,1970,1989,"April 16, 1947",C,24.6,11.2,3.6,86.0,225,Right,"Milwaukee Bucks, 1st round (1st pick, 1st over...","University of California, Los Angeles","Power Memorial in New York, New York",/players/a/abdulka01.html
3,Mahmoud Abdul-Rauf,1991,2001,"March 9, 1969",G,14.6,1.9,3.5,73.0,162,Right,"Denver Nuggets, 1st round (3rd pick, 3rd overa...",Louisiana State University,"Gulfport in Gulfport, Mississippi",/players/a/abdulma02.html
4,Tariq Abdul-Wahad,1998,2003,"November 3, 1974",F,7.8,3.3,1.1,78.0,223,Right,"Sacramento Kings, 1st round (11th pick, 11th o...","University of Michigan, San Jose State University","Lycee Aristide Briand in Evreux, France",/players/a/abdulta01.html


In [27]:
# save in csv format
players_merged_df.to_csv('Tables/players_merged_df.csv', index=False)