In [1]:
import pandas as pd
import numpy as np
import string
import matplotlib.pyplot as plt

from pymongo import MongoClient

from bs4 import BeautifulSoup
import requests

## Request the webpage's raw HTML and store into MongoDB

In order to get each player's career "per game" stats, we would need to iterate through the alphabet and individually click on each player's name. This would be messy, given that at the end of each letter we must go back to the player directory landing page and click the next letter. Instead, let's iterate through the alphabet (the directory links are consistent, with only the letter changing, e.g. https://www.basketball-reference.com/players/a/), grab the list of player hyperlinks for each letter, and then iterate through those individual player links to get the HTML.

In [2]:
# One directory landing page per lowercase letter; only the trailing letter differs.
player_directory_url = [
    'https://www.basketball-reference.com/players/' + letter
    for letter in string.ascii_lowercase
]

In [3]:
def get_player_url(directory, timeout=30):
    """Collect the absolute URL of every player linked from the directory pages.

    Parameters
    ----------
    directory : iterable of str
        URLs of the player-directory pages to scan.
    timeout : float, optional
        Seconds to wait for each HTTP response before giving up. Without a
        timeout ``requests.get`` can block indefinitely on a stalled server.

    Returns
    -------
    list of str
        ``'https://www.basketball-reference.com' + href`` for the first link
        found in each table row, in the order the rows appear.

    Raises
    ------
    requests.exceptions.RequestException
        If a request times out, fails, or returns an HTTP error status.
        Raising here avoids parsing an error page as if it were a directory
        and silently returning an incomplete list.
    """
    base_url = 'https://www.basketball-reference.com'
    individual_player_url = []

    for url in directory:
        page = requests.get(url, timeout=timeout)
        page.raise_for_status()
        soup = BeautifulSoup(page.text, 'html.parser')

        for row in soup.find_all('tr'):
            # ``row.a`` is the first <a> inside the row; rows without any
            # link (such as a header row) have nothing to collect.
            link = row.a
            if link is None:
                continue
            href = link.get('href')
            # An anchor without an href cannot be turned into a URL.
            if href:
                individual_player_url.append(base_url + href)

    return individual_player_url

In [4]:
# Request every per-letter directory page and gather the URL of each player page found on it.
a_z_all_player_urls = get_player_url(player_directory_url)

Iterate through all of the individual player links, fetch the HTML of each page, and store it in MongoDB.

In [5]:
def html_to_mongodb(urls, timeout=30):
    """Download each URL and store its raw HTML in MongoDB.

    Documents are written to the ``players`` collection of the
    ``capstone3_bball`` database as ``{'link': url, 'html': <page text>}``.
    The write is an upsert keyed on ``link``, so running this function again
    refreshes the stored page instead of adding a duplicate document.

    Parameters
    ----------
    urls : iterable of str
        Page URLs to download.
    timeout : float, optional
        Seconds to wait for each HTTP response before giving up.

    Returns
    -------
    str
        The literal ``'Done!'`` once every URL has been processed.

    Raises
    ------
    requests.exceptions.RequestException
        If a request times out, fails, or returns an HTTP error status.
        Pages stored before the failure remain in the collection.
    """
    # The context manager closes the client's connections when the loop ends
    # or an exception propagates.
    with MongoClient() as client:
        players = client.capstone3_bball.players

        for url in urls:
            page = requests.get(url, timeout=timeout)
            # Never persist an error page as if it were a player page.
            page.raise_for_status()
            players.replace_one(
                {'link': url},
                {'link': url, 'html': page.text},
                upsert=True,
            )

    return 'Done!'
    

In [6]:
# commenting out - do not want to rerun this action each time I open the notebook
# html_to_mongodb(a_z_all_player_urls)

If a player played in both the ABA and the NBA, their NBA career stats are in the second row of the table footer; otherwise they are in the first row. We cannot simply append every stat in order, since not all player stat tables have the same number of stats: some columns are missing for some players, so we fill those in with NaNs as placeholders to keep a consistent structure when building the DataFrame. Each stat is matched to its column by name, one at a time - this could probably be streamlined further.

## Parse HTML and Create DataFrame

In [200]:
# Column order of the resulting DataFrame; a player record has one slot per column.
_PLAYER_COLUMNS = ['Player', 'Pos', 'G', 'GS', 'MP', 'FG', 'FGA', 'FG%', '3P', '3PA', '3P%',
                   '2P', '2PA', '2P%', 'eFG%', 'FT', 'FTA', 'FT%', 'ORB', 'DRB', 'TRB',
                   'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS']

# ``data-stat`` attribute of a career-row cell -> slot in the player record.
# Slots 0 and 1 hold the player's name and position.
_STAT_SLOTS = {
    'g': 2,
    'gs': 3,
    'mp_per_g': 4,
    'fg_per_g': 5,
    'fga_per_g': 6,
    'fg_pct': 7,
    'fg3_per_g': 8,
    'fg3a_per_g': 9,
    'fg3_pct': 10,
    'fg2_per_g': 11,
    'fg2a_per_g': 12,
    'fg2_pct': 13,
    'efg_pct': 14,
    'ft_per_g': 15,
    'fta_per_g': 16,
    'ft_pct': 17,
    'orb_per_g': 18,
    'drb_per_g': 19,
    'trb_per_g': 20,
    'ast_per_g': 21,
    'stl_per_g': 22,
    'blk_per_g': 23,
    'tov_per_g': 24,
    'pf_per_g': 25,
    'pts_per_g': 26,
}


def _parse_position(soup):
    """Return the position text of the page's "Position" paragraph, or NaN if there is none."""
    position = np.nan
    for paragraph in soup.find_all('p'):
        if "Position" in paragraph.text:
            # NOTE(review): this slicing depends on the exact whitespace layout
            # of the paragraph text in the stored HTML - confirm if pages change.
            # When several paragraphs match, the last one wins.
            position = paragraph.text.split('\n  ')[3][:-2]
    return position


def _career_stat_cells(soup):
    """Return the <td> cells of the career row to use, or None if no row qualifies."""
    table = soup.find_all(id='all_per_game')[0]
    footer_rows = table.find_all('tfoot')[0].find_all('tr')
    first_row_text = footer_rows[0].text

    # Heuristic on the footer layout: when the first footer row mentions
    # 'TOT' or 'ABA' and the rendered rows do not contain exactly two 'tr>'
    # occurrences, the stats are taken from the second row; otherwise the
    # first row is used, provided it mentions 'NBA'.
    # NOTE(review): the 'tr>' count is presumably a proxy for "the footer has
    # more than one row" - confirm against the stored HTML.
    mentions_tot_or_aba = 'TOT' in first_row_text or 'ABA' in first_row_text
    if mentions_tot_or_aba and str(footer_rows).count('tr>') != 2:
        career_row = footer_rows[1]
    elif 'NBA' in first_row_text:
        career_row = footer_rows[0]
    else:
        return None

    # Skip the first four cells, which precede the games-played column.
    # NOTE(review): presumably age/team/league/position - confirm.
    return career_row.find_all('td')[4:]


def parse_to_df(urls):
    """Parse the stored HTML of each player page into a DataFrame of career per-game stats.

    For every URL the page HTML is read from the ``players`` collection of the
    ``capstone3_bball`` MongoDB database. The player's name, position and the
    stats of the selected career footer row become one DataFrame row. Stats
    that are absent from a player's table are left as NaN, and cells whose
    ``data-stat`` is not one of the known stats are ignored so they can never
    overwrite another column.

    Parameters
    ----------
    urls : iterable of str
        Player-page URLs, matching the ``link`` field of the stored documents.

    Returns
    -------
    pandas.DataFrame
        One row per parsed player with the columns Player, Pos, G, GS, MP, FG,
        FGA, FG%, 3P, 3PA, 3P%, 2P, 2PA, 2P%, eFG%, FT, FTA, FT%, ORB, DRB,
        TRB, AST, STL, BLK, TOV, PF, PTS. Stat values are the cell text
        (strings). Players for whom no career row qualifies are omitted.

    Raises
    ------
    TypeError
        If a URL has no stored document (``find_one`` returns ``None``).
    IndexError
        If a page has no ``<h1>``, no ``all_per_game`` element, or no
        ``<tfoot>`` / footer rows where the parser expects them.
    """
    urls = list(urls)
    all_players_info = []

    # No database connection is opened when there is nothing to parse.
    if urls:
        # The context manager closes the client's connections when parsing
        # ends or an exception propagates.
        with MongoClient() as client:
            players = client.capstone3_bball.players

            for url in urls:
                html = players.find_one({'link': url})['html']
                soup = BeautifulSoup(html, 'html.parser')

                player_info = [np.nan] * len(_PLAYER_COLUMNS)
                # Drop the first and last character of the heading text
                # (presumably surrounding newlines - TODO confirm).
                player_info[0] = soup.find_all('h1')[0].text[1:-1]
                # Resolved per player, so a page without a "Position"
                # paragraph gets NaN rather than the previous player's value.
                player_info[1] = _parse_position(soup)

                career_cells = _career_stat_cells(soup)
                if career_cells is None:
                    continue

                for cell in career_cells:
                    slot = _STAT_SLOTS.get(cell.get('data-stat'))
                    if slot is not None:
                        player_info[slot] = cell.text

                all_players_info.append(player_info)

    return pd.DataFrame(columns=_PLAYER_COLUMNS, data=all_players_info)
        
        

Some players are officially listed at more than one position. For the purpose of classification, we will take the position they were categorized as most often to be our target, e.g. Connie Hawkins (SF).

In [208]:
# Build the career per-game DataFrame from the HTML stored in MongoDB for each player URL.
df = parse_to_df(a_z_all_player_urls)

In [209]:
# Display the parsed DataFrame as the cell output.
df

Unnamed: 0,Player,Pos,G,GS,MP,FG,FGA,FG%,3P,3PA,...,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS
0,Alaa Abdelnaby,Power Forward,256,53,12.5,2.4,4.8,.502,0.0,0.0,...,.701,1.1,2.2,3.3,0.3,0.3,0.3,1.0,1.9,5.7
1,Zaid Abdul-Aziz,Center and Power Forward,505,,21.8,3.5,8.2,.428,,,...,.728,2.4,5.4,8.0,1.2,0.6,1.0,0.9,2.2,9.0
2,Kareem Abdul-Jabbar,Center,1560,625,36.8,10.2,18.1,.559,0.0,0.0,...,.721,2.4,7.6,11.2,3.6,0.9,2.6,2.7,3.0,24.6
3,Mahmoud Abdul-Rauf,Point Guard,586,336,26.7,6.0,13.6,.442,0.8,2.3,...,.905,0.4,1.5,1.9,3.5,0.8,0.1,1.6,1.9,14.6
4,Tariq Abdul-Wahad,Shooting Guard,236,145,20.4,3.1,7.3,.417,0.1,0.3,...,.703,1.2,2.1,3.3,1.1,0.8,0.4,1.3,2.1,7.8
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4499,Paul Zipser,Small Forward,98,30,17.0,1.7,4.6,.371,0.7,2.1,...,.769,0.3,2.3,2.6,0.8,0.4,0.3,0.8,1.7,4.7
4500,Ante Žižić,Center,113,27,13.4,2.4,4.2,.581,0.0,0.0,...,.711,1.3,2.6,3.9,0.6,0.2,0.4,0.7,1.5,6.0
4501,Jim Zoet,Center,7,0,4.3,0.1,0.7,.200,0.0,0.0,...,,0.4,0.7,1.1,0.1,0.1,0.4,0.6,1.3,0.3
4502,Bill Zopf,Point Guard,53,,7.5,0.9,2.5,.363,,,...,.556,,,0.9,1.4,,,,0.6,2.2


In [210]:
# df.to_csv('player_stats.csv')