In [1]:
import pandas as pd
import numpy as np
import pprint
import string
import re
import matplotlib.pyplot as plt

from pymongo import MongoClient

from bs4 import BeautifulSoup
import requests

import json

## Request the webpage's raw HTML and store into MongoDB

To get each player's career "per game" stats, we need the HTML of every individual player page. Clicking through the site would be messy: at the end of each letter we would have to go back to the player directory landing page and click the next letter. Instead, let's iterate through the alphabet (the directory links are consistent, with only the trailing letter changing, e.g. https://www.basketball-reference.com/players/a/), grab the list of player hyperlinks for each letter, and then iterate through those individual player links to get the HTML.

In [2]:
# One directory page per lowercase letter: .../players/a through .../players/z
player_directory_url = [
    'https://www.basketball-reference.com/players/' + letter
    for letter in string.ascii_lowercase
]

In [3]:
def get_player_url(directory, timeout=30):
    """Collect the individual player page URLs linked from each directory page.

    Parameters
    ----------
    directory : iterable of str
        URLs of the player directory pages to scan.
    timeout : float, optional
        Seconds to wait for each HTTP response before giving up (default 30).

    Returns
    -------
    list of str
        Absolute player URLs, in the order the links appear across the pages.

    Raises
    ------
    requests.HTTPError
        If a directory page answers with a 4xx/5xx status. Without this check
        an error page would be parsed like a real one and the result would be
        silently incomplete.
    """
    base_url = 'https://www.basketball-reference.com'
    individual_player_url = []

    for url in directory:
        page = requests.get(url, timeout=timeout)
        page.raise_for_status()
        soup = BeautifulSoup(page.text, 'html.parser')

        for row in soup.find_all('tr'):
            # `row.a` is the first <a> in the row; rows without a link
            # (e.g. the header row of the table) are skipped.
            anchor = row.a
            if anchor is None:
                continue
            href = anchor.get('href')
            # An <a> without an href attribute yields None; skip it rather
            # than fail on str + None.
            if href:
                individual_player_url.append(base_url + href)
    return individual_player_url

In [4]:
# Scrape every directory page (one HTTP request per entry in
# player_directory_url) and keep the resulting list of player page URLs.
a_z_all_player_urls = get_player_url(player_directory_url)

Iterate through all the individual player links, fetch each page's HTML, and store it in MongoDB.

In [276]:
def html_to_mongodb(urls, timeout=30):
    """Fetch each URL and store its HTML in the `capstone3_bball.players` collection.

    One document of the form ``{'link': url, 'html': <page text>}`` is kept
    per URL. The write is an upsert keyed on ``link``, so re-running the cell
    refreshes the stored HTML instead of inserting duplicate documents.

    Parameters
    ----------
    urls : iterable of str
        Page URLs to download.
    timeout : float, optional
        Seconds to wait for each HTTP response before giving up (default 30).

    Returns
    -------
    str
        The literal ``'Done!'`` once every URL has been stored.

    Raises
    ------
    requests.HTTPError
        If a page answers with a 4xx/5xx status; an error page is never
        stored as if it were player HTML.
    """
    # The context manager closes the client even if a request fails midway.
    with MongoClient() as client:
        players = client.capstone3_bball.players

        for url in urls:
            page = requests.get(url, timeout=timeout)
            page.raise_for_status()
            players.update_one(
                {'link': url},
                {'$set': {'html': page.text}},
                upsert=True,
            )

    return 'Done!'
    

In [278]:
# Download every player page and store it in MongoDB (one HTTP request per
# URL in a_z_all_player_urls, so this cell is slow to re-run).
html_to_mongodb(a_z_all_player_urls)

'Done!'

In [5]:
# Peek at the first collected player URL as a sanity check.
a_z_all_player_urls[0]

'https://www.basketball-reference.com/players/a/abdelal01.html'

If a player played in both the ABA and the NBA, their NBA career stats are in the second row of the table footer.

In [233]:
# Fetch the first player's page live and parse it, to explore the page
# structure on a single example.
c = requests.get(a_z_all_player_urls[0])
soup = BeautifulSoup(c.text,'html.parser')

In [234]:
# First element whose id is 'all_per_game' -- presumably the wrapper around
# the per-game stats table (TODO confirm). Raises IndexError if the page has none.
test = soup.find_all(id = 'all_per_game')[0]

In [235]:
# First <tfoot> inside that element; the cells below read its <tr> rows to
# pull out the player's stats.
x = test.find_all('tfoot')[0]

In [270]:
#initialize column lists
# Each name must get its OWN list. `([], ) * 25` repeats a reference to one
# list object 25 times, so appending to any column would append to all of them.
(G, GS, MP, FG, FGA, FGPercentage, Three_P, Three_PA, Three_P_Percentage,
 Two_P, Two_PA, Two_P_Percentage, EFG_Percentage, FT, FTA, FT_Percentage,
 ORB, DRB, TRB, AST, STL, BLK, TOV, PF, PTS) = ([] for _ in range(25))

#list of columns
columns = [G, GS, MP, FG, FGA, FGPercentage, Three_P, Three_PA, Three_P_Percentage,
           Two_P, Two_PA, Two_P_Percentage, EFG_Percentage, FT, FTA, FT_Percentage,
           ORB, DRB, TRB, AST, STL, BLK, TOV, PF, PTS]

In [245]:
# Pick the footer row to read: when the first row mentions 'ABA' the second
# row is used instead, otherwise the first row is used.
footer_rows = x.find_all('tr')
stat_row = footer_rows[1] if 'ABA' in footer_rows[0].text else footer_rows[0]

# Print each stat cell. The first four <td> cells are skipped so the values
# start at the first stat column. Previously the non-ABA branch ignored its
# loop variable and printed the whole `columns` list once per cell.
for val in stat_row.find_all('td')[4:]:
    print(val)

In [272]:
all_stats = []

# Pick the footer row to read: when the first row mentions 'ABA' the second
# row is used instead, otherwise the first row is used. Previously the ABA
# case only printed the cells, so such a player never reached `all_stats`.
footer_rows = x.find_all('tr')
stat_row = footer_rows[1] if 'ABA' in footer_rows[0].text else footer_rows[0]

# Skip the first four <td> cells so the values start at the first stat column;
# the cell text is kept as strings, exactly as it appears on the page.
player_stats = [stat.text for stat in stat_row.find_all('td')[4:]]
all_stats.append(player_stats)
        

In [273]:
# Show the collected stats: a list holding one list of stat strings per player.
all_stats

[['256',
  '53',
  '12.5',
  '2.4',
  '4.8',
  '.502',
  '0.0',
  '0.0',
  '.000',
  '2.4',
  '4.8',
  '.503',
  '.502',
  '0.9',
  '1.3',
  '.701',
  '1.1',
  '2.2',
  '3.3',
  '0.3',
  '0.3',
  '0.3',
  '1.0',
  '1.9',
  '5.7']]