In [4]:
from HTMLParser import HTMLParser

class HTMLtoJSONParser(HTMLParser):
    def __init__(self, raise_exception = True) :
        HTMLParser.__init__(self)
        self.doc  = { }
        self.path = []
        self.cur  = self.doc
        self.line = 0
        self.raise_exception = raise_exception
         
    @property
    def json(self):
        return self.doc
         
    @staticmethod
    def to_json(content, raise_exception = True):
        parser = HTMLtoJSONParser(raise_exception = raise_exception)
        parser.feed(content)
        return parser.json
         
    def handle_starttag(self, tag, attrs):
        self.path.append(tag)
        attrs = { k:v for k,v in attrs }
        if tag in self.cur :
            if isinstance(self.cur[tag],list) :
                self.cur[tag].append(  { "__parent__": self.cur } )
                self.cur = self.cur[tag][-1]
            else :
                self.cur[tag] = [ self.cur[tag] ]
                self.cur[tag].append(  { "__parent__": self.cur } )
                self.cur = self.cur[tag][-1]
        else :
            self.cur[tag] = { "__parent__": self.cur }
            self.cur = self.cur[tag]
             
        for a,v in attrs.items():
            self.cur["#" + a] = v
        self.cur[""] = ""
                 
    def handle_endtag(self, tag):
        if tag != self.path[-1] and self.raise_exception :
            raise Exception("html is malformed around line: {0} (it might be because of a tag <br>, <hr>, <img .. > not closed)".format(self.line))
        del self.path[-1]
        memo = self.cur
        self.cur = self.cur["__parent__"]
        self.clean(memo)
                 
    def handle_data(self, data):
        self.line += data.count("\n")
        if "" in self.cur :
            self.cur[""] += data
             
    def clean(self, values):
        keys = list(values.keys())
        for k in keys:
            v = values[k]
            if isinstance(v, str) :
                #print ("clean", k,[v])
                c = v.strip(" \n\r\t")
                if c != v : 
                    if len(c) > 0 : 
                        values[k] = c
                    else : 
                        del values[k]
        del values["__parent__"]

In [5]:
import urllib2

In [6]:
nba_dom = ""

In [7]:
nba_href = "/leagues/NBA_2017.html"

In [8]:
nba_url = nba_dom + nba_href

In [43]:
from bs4 import BeautifulSoup, Comment
import requests, re

In [12]:
page = requests.get(nba_url)

In [13]:
soup = BeautifulSoup(page.text, "html5lib")

# get the comments
comments = soup.findAll(text=lambda text:isinstance(text, Comment))
# look for table with the id "team_stats"
def get_table(table_name, comments):
    rx = re.compile(r'<table.+?id="%s".+?>[\s\S]+?</table>' % table_name)
    for comment in comments:
        try:
            table = rx.search(comment.string).group(0)
            # break the loop if found
            return table
        except:
            pass
        

In [434]:
table = get_table("team-stats-base", comments)

In [56]:
def get_col_dicts(table):
    import shlex
    rx = re.compile(r'<th aria-label.+?>[\s\S]+?</th>')
    labels = rx.findall(table)
    col_dicts = []
    for label in labels:
        try:
            col_dict = dict(map(lambda x: tuple(x.split("=")), shlex.split(label.split("<th ")[1].split(">")[0])))
            if 'scope' in col_dict: del col_dict['scope']
            if 'class' in col_dict: del col_dict['class']
            if 'aria-label' in col_dict: del col_dict['aria-label']
            col_dicts.append(col_dict)
        except Exception, e:
            print label + " has errored."
            print(str(e))
    return (col_dicts, labels)

In [436]:
(col_dicts, labels) = get_col_dicts(table)

In [437]:
def get_teams(col_dicts, table):
    rx = re.compile(r'<tr.+?>[\s\S]+?</tr>')
    labels = rx.findall(table)
    
    team_dicts = []
    for label in labels[:-1]:
        team_dict = {}
        label_json = HTMLtoJSONParser.to_json(label, True)['tr']
        team_dict[col_dicts[0]["data-stat"]] = label_json['th']['']
        labels_json_td = label_json['td']
        for label_json_td in labels_json_td:
            if label_json_td["#data-stat"] == "team_name":
                team_dict["team_name"] = label_json_td['a']['']
                team_dict["href"] = label_json_td['a']['#href']
                team_dict["po"] = label_json_td['']
            else:
                team_dict[label_json_td["#data-stat"]] = label_json_td['']
        team_dicts.append(team_dict)
    return team_dicts

In [438]:
team_dicts = get_teams(col_dicts, table)

In [439]:
stats_team = ["team_and_opponent", "salaries2"]#"team_misc", 
stats_player = ["totals", "per_game", "per_poss", "per_minute", "advanced", "advanced_pbp", "shooting"]

In [71]:
def extract_stats_table(col_dicts, table, skip_top=0, skip_bot=0):
    '''
    "team_and_opponent", "salaries2", "totals", "per_game", "per_poss", "per_minute", "advanced", "advanced_pbp", "shooting"
    '''
    rx = re.compile(r'<tr.+?>[\s\S]+?</tr>')
    labels = rx.findall(table)
    
    some_dicts = []
    len_labels = len(labels)
    for label in labels[skip_top:(len_labels - skip_bot)]:
        if "colspan" not in label:
            some_dict = {}
            label_json = HTMLtoJSONParser.to_json(label, True)
            label_json = label_json['tr'] if 'tr' in label_json else label_json
            some_dict[col_dicts[0]["data-stat"]] = label_json['th']['']
            labels_json_td = label_json['td'] if 'td' in label_json else label_json
            some_dict["metric"] = label_json['th']['']
            for label_json_td in labels_json_td:
                if label_json_td["#data-stat"] in ["player", "g", "season", "team_id", 
                                                   "lg_id", "date_game", "visitor_team_name", 
                                                   "home_team_name", "box_score_text"] and "a" in label_json_td:
                    some_dict[label_json_td["#data-stat"]] = label_json_td['a']['']
                    some_dict[label_json_td["#data-stat"] + "_href"] = label_json_td['a']['#href']
                elif some_dict["metric"] == "Year/Year" and label_json_td["#data-stat"] != 'g':
                    some_dict[label_json_td["#data-stat"]] = label_json_td['span']['']
                else:
                    some_dict[label_json_td["#data-stat"]] = label_json_td['']

            some_dicts.append(some_dict)
    return some_dicts

In [441]:
import json

In [451]:
all_team_stats = {}
for team_dict in team_dicts:
    print("Processing " + team_dict["team_name"])
    team_url = nba_dom + team_dict['href']
    page = urllib2.urlopen(team_url)
    soup = BeautifulSoup(page, "html5lib")
    comments = soup.findAll(text=lambda text:isinstance(text, Comment))

    all_stats = {}
    
    for stat_team in stats_team:
        table = get_table(stat_team, comments)
        (col_dicts, labels) = get_col_dicts(table)
        if stat_team in ["salaries2"]:
            stats = extract_stats_table(col_dicts, table, 1, 0)
        else:
            stats = extract_stats_table(col_dicts, table)
            
        all_stats[stat_team] = stats
    
    for stat_player in stats_player:
        table = get_table(stat_player, comments)
        (col_dicts, labels) = get_col_dicts(table)
        
        stats = None
        
        if stat_player == "totals":
            stats = extract_stats_table(col_dicts, table, 0, 1)
        else:
            if stat_player in ["advanced_pbp", "shooting"]:
                col_dicts = filter(lambda col_dict: "colspan" not in col_dict, col_dicts)
            stats = extract_stats_table(col_dicts, table)
        
        all_stats[stat_player] = stats    
    
    if team_dict['po'] == "*":
        for stat_player in stats_player:
            table = get_table("playoffs_" + stat_player, comments)
            (col_dicts, labels) = get_col_dicts(table)

            stats = None

            if stat_player == "totals":
                stats = extract_stats_table(col_dicts, table, 0, 1)
            else:
                if stat_player in ["advanced_pbp", "shooting"]:
                    col_dicts = filter(lambda col_dict: "colspan" not in col_dict, col_dicts)
                stats = extract_stats_table(col_dicts, table)

            all_stats["playoffs_" + stat_player] = stats

    all_team_stats[team_dict["team_name"]] = all_stats
with open('all_team_stats.json', 'w') as outfile:
    json.dump(all_team_stats, outfile)

print("Done")

Processing Golden State Warriors
Processing Houston Rockets
Processing Denver Nuggets
Processing Cleveland Cavaliers
Processing Washington Wizards
Processing Los Angeles Clippers
Processing Boston Celtics
Processing Portland Trail Blazers
Processing Phoenix Suns
Processing Toronto Raptors
Processing Oklahoma City Thunder
Processing Brooklyn Nets
Processing Minnesota Timberwolves
Processing San Antonio Spurs
Processing Indiana Pacers
Processing Charlotte Hornets
Processing Los Angeles Lakers
Processing New Orleans Pelicans
Processing New York Knicks
Processing Milwaukee Bucks
Processing Miami Heat
Processing Atlanta Hawks
Processing Chicago Bulls
Processing Sacramento Kings
Processing Philadelphia 76ers
Processing Detroit Pistons
Processing Orlando Magic
Processing Utah Jazz
Processing Memphis Grizzlies
Processing Dallas Mavericks
Done


In [462]:
import time

In [463]:
import random

In [479]:
stats_player = ["totals", "per_poss", "per_minute", "advanced", "advanced_pbp", "shooting"]
all_team_players = {}

for team_dict in team_dicts:
    print("Processing " + team_dict["team_name"])

    team_players = {}
    
    for player in all_team_stats[team_dict["team_name"]]["totals"]:
        print("...Processing " + player["player"])
        player_href = player["player_href"]
        player_url = nba_dom + player_href
        page = urllib2.urlopen(player_url)
        soup = BeautifulSoup(page, "html5lib")
        comments = soup.findAll(text=lambda text:isinstance(text, Comment))
        
        all_stats = {}
        
        

        for stat_player in stats_player:
            table = get_table(stat_player, comments)
            (col_dicts, labels) = get_col_dicts(table)

            stats = None

            if stat_player == "totals":
                stats = extract_stats_table(col_dicts, table, 0, 1)
            else:
                if stat_player in ["advanced_pbp", "shooting"]:
                    col_dicts = filter(lambda col_dict: "colspan" not in col_dict, col_dicts)
                stats = extract_stats_table(col_dicts, table)

            all_stats[stat_player] = stats    

        if team_dict['po'] == "*":
            for stat_player in stats_player:
                table = get_table("playoffs_" + stat_player, comments)
                if table is not None:
                    (col_dicts, labels) = get_col_dicts(table)

                    stats = None

                    if stat_player == "totals":
                        stats = extract_stats_table(col_dicts, table, 0, 1)
                    else:
                        if stat_player in ["advanced_pbp", "shooting"]:
                            col_dicts = filter(lambda col_dict: "colspan" not in col_dict, col_dicts)
                        stats = extract_stats_table(col_dicts, table)

                    all_stats["playoffs_" + stat_player] = stats
                
        team_players[player["player"]] = all_stats
        
        time.sleep(random.uniform(1,3))
    
#     all_team_players[team_dict["team_name"]] = team_players
    
    with open(team_dict["team_name"].replace(" ", "") + '_players_stats.json', 'w') as outfile:
        json.dump(team_players, outfile)
    
# with open('all_players_stats.json', 'w') as outfile:
#     json.dump(all_team_players, outfile)

print("Done")

Processing Golden State Warriors
...Processing Klay Thompson
...Processing Stephen Curry
...Processing Draymond Green
...Processing Kevin Durant
...Processing Andre Iguodala
...Processing Shaun Livingston
...Processing Zaza Pachulia
...Processing Ian Clark
...Processing Patrick McCaw
...Processing David West
...Processing JaVale McGee
...Processing James Michael McAdoo
...Processing Kevon Looney
...Processing Matt Barnes
...Processing Anderson Varejao
...Processing Damian Jones
...Processing Briante Weber
Processing Houston Rockets
...Processing James Harden
...Processing Trevor Ariza
...Processing Eric Gordon
...Processing Ryan Anderson
...Processing Patrick Beverley
...Processing Clint Capela
...Processing Sam Dekker
...Processing Nene Hilario
...Processing Montrezl Harrell
...Processing Corey Brewer
...Processing Lou Williams
...Processing K.J. McDaniels
...Processing Tyler Ennis
...Processing Troy Williams
...Processing Bobby Brown
...Processing Isaiah Taylor
...Processing Chinanu 

...Processing Johnny O'Bryant
...Processing Mike Tobey
...Processing Aaron Harrison
Processing Los Angeles Lakers
...Processing Jordan Clarkson
...Processing Brandon Ingram
...Processing Julius Randle
...Processing D'Angelo Russell
...Processing Nick Young
...Processing Luol Deng
...Processing Larry Nance
...Processing Lou Williams
...Processing Timofey Mozgov
...Processing Tarik Black
...Processing Ivica Zubac
...Processing Thomas Robinson
...Processing David Nwaba
...Processing Tyler Ennis
...Processing Corey Brewer
...Processing Jose Calderon
...Processing Marcelo Huertas
...Processing Metta World Peace
Processing New Orleans Pelicans
...Processing Anthony Davis
...Processing Solomon Hill
...Processing Jrue Holiday
...Processing E'Twaun Moore
...Processing Dante Cunningham
...Processing Tim Frazier
...Processing Terrence Jones
...Processing Buddy Hield
...Processing Langston Galloway
...Processing Alexis Ajinca
...Processing DeMarcus Cousins
...Processing Omer Asik
...Processing Don

In [74]:
from pandas.io.html import read_html
from selenium import webdriver

nba_href = "/leagues/NBA_%s_games-%s.html"
nba_url = nba_dom + nba_href
game_years = ['2014', '2015', '2016', '2017']
game_months = ["october", "november", "december", "january", "february", "march", "april", "may", "june"]

driver = webdriver.Chrome("/home/vladislav/Projects/Data_Engineering/Web_Scraper/chromedriver")

In [75]:
game_year_stats = {}
for game_year in game_years:
    game_month_stats = {}
    print("Processing year " + game_year)
    for game_month in game_months:
        print("...Processing month " + game_month)
        +
        driver.get(nba_url % (game_year, game_month))
        table = driver.find_element_by_xpath('//*[@id="schedule"]')
        table_html = table.find_element_by_xpath('//*[@id="schedule"]/thead/tr')
        cols_table = table_html.get_attribute('innerHTML')
        (col_dicts, labels) = get_col_dicts(cols_table)
        
        table_html = table.find_element_by_xpath('//*[@id="schedule"]/tbody')

        data_table = table_html.get_attribute('innerHTML')
        stats = extract_stats_table(col_dicts, data_table)
        
        game_month_stats[game_month] = stats
        
    game_year_stats[game_year] = game_month_stats
    
with open('game_year_stats.json', 'w') as outfile:
    json.dump(game_year_stats, outfile)

NameError: name 'json' is not defined