In [307]:
import pandas as pd
import numpy as np
import pprint
import string
import re
import matplotlib.pyplot as plt
import time

from pymongo import MongoClient

from bs4 import BeautifulSoup
import requests

import json

## Request the webpage's raw HTML and store into MongoDB

In order to get each player's career "per game" stats, we need to iterate through the alphabet and individually click on each player's name. This may be messy to do given that at the end of each alphabet, we must go back to the player directory landing page and click the next letter. Instead - lets iterate through the alphabet (links are consistent with the /'letter' changing i.e. https://www.basketball-reference.com/players/a/) and grab the list of hyperlinks per alphabet and then iterate through those individual player links to get the html. 

In [2]:
player_directory_url = []
for letter in string.ascii_lowercase:
    player_directory_url.append('https://www.basketball-reference.com/players/'+letter)

In [308]:
def get_player_url(directory):
    
    individual_player_url = []
    
    for url in directory:
        page = requests.get(url)
        soup = BeautifulSoup(page.text,'html.parser')
        
        for row in soup.find_all('tr'):
            #skip first row of chart 
            if row.a != None:
                href = row.a.get('href')
                individual_player_url.append('https://www.basketball-reference.com'+href)
    return individual_player_url

In [4]:
a_z_all_player_urls = get_player_url(player_directory_url)

In [318]:
a_z_all_player_urls

['https://www.basketball-reference.com/players/a/abdelal01.html',
 'https://www.basketball-reference.com/players/a/abdulza01.html',
 'https://www.basketball-reference.com/players/a/abdulka01.html',
 'https://www.basketball-reference.com/players/a/abdulma02.html',
 'https://www.basketball-reference.com/players/a/abdulta01.html',
 'https://www.basketball-reference.com/players/a/abdursh01.html',
 'https://www.basketball-reference.com/players/a/abernto01.html',
 'https://www.basketball-reference.com/players/a/ablefo01.html',
 'https://www.basketball-reference.com/players/a/abramjo01.html',
 'https://www.basketball-reference.com/players/a/abrinal01.html',
 'https://www.basketball-reference.com/players/a/achiupr01.html',
 'https://www.basketball-reference.com/players/a/ackeral01.html',
 'https://www.basketball-reference.com/players/a/ackerdo01.html',
 'https://www.basketball-reference.com/players/a/acresma01.html',
 'https://www.basketball-reference.com/players/a/actonbu01.html',
 'https://w

Iterating through all individual basketball player links, getting the html from the page and throwing it into mongodb

In [313]:
def html_to_mongodb(urls):
    client = MongoClient()
    db = client.capstone3_bball
    players = db.players
    
    for url in urls:
        page = requests.get(url)
        players.insert_one({'link':url, 'html':page.text})
        
    return 'Done!'
    

In [314]:
# commenting out - do not want to rerun this action each time I open the notebook
html_to_mongodb(a_z_all_player_urls)

'Done!'

If they played in both ABA and NBA - then their NBA score is the second row, else first row of footer

In [449]:
def parse_to_df(urls):
    
    all_players_info = []
    
    client = MongoClient()
    db = client.capstone3_bball
    players = db.players
    
    for url in urls:
        
        player_info = []
        
        one_player = players.find_one({'link':url})['html']
        soup = BeautifulSoup(one_player,'html.parser')
        
        player_name = soup.find_all('h1')[0].text[1:-1]
        player_info.append(player_name)
        position = soup.find_all('p')[2].text.split('\n  ')[3][:-2]
        player_info.append(position)
        
        
        table = soup.find_all(id = 'all_per_game')[0]
        all_scores = table.find_all('tfoot')[0]
        
        if 'TOT' in all_scores.find_all('tr')[0].text or 'ABA'in all_scores.find_all('tr')[0].text:
            career_scores = all_scores.find_all('tr')[1].find_all('td')[4:]
        else:
            career_scores = all_scores.find_all('tr')[0].find_all('td')[4:]
        
        for stat in career_scores:
            player_info.append(stat.text)
        
        all_players_info.append(player_info)
        
    return all_players_info
        
        

Some players are officially categorized as more than one position - for the purpose of classification, we will take the position that it was categorized the most and take that to be our target i.e. Connie Hawkins(SF)

In [453]:
parse_to_df(['https://www.basketball-reference.com/players/h/hawkico01.html'])

[['Connie Hawkins',
  'Small Forward, Power Forward, and Center',
  '499',
  '',
  '34.5',
  '6.0',
  '12.9',
  '.467',
  '',
  '',
  '',
  '6.0',
  '12.9',
  '.467',
  '.467',
  '4.4',
  '5.6',
  '.785',
  '1.7',
  '4.5',
  '8.0',
  '4.1',
  '1.2',
  '0.8',
  '',
  '2.9',
  '16.5']]