In [2]:
import sys
sys.path.insert(0,"../../")

from library import *
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import requests
from bs4 import *
from bs4.element import PageElement


# Get Page Content

In [3]:
def new_session(headers=None):
    """Build a fresh `requests.Session`.

    Parameters
    ----------
    headers : mapping, optional
        Extra default headers merged into the session's headers. A falsy
        value (``None``, ``{}``) leaves the session's defaults untouched.

    Returns
    -------
    requests.Session
    """
    sess = requests.Session()
    if not headers:
        return sess
    sess.headers.update(headers)
    return sess

def get_page(url, timeout=None):
    """Fetch `url` with a one-shot session and return the response body as text.

    Parameters
    ----------
    url : str
        Address to GET.
    timeout : float or tuple, optional
        Forwarded to ``Session.get``. The default ``None`` keeps the previous
        behaviour of waiting indefinitely.

    Returns
    -------
    str
        ``response.text``. The HTTP status code is not checked, so the body
        of an error response is returned like any other page.
    """
    # The session is used for a single request; the context manager closes it
    # afterwards so its pooled connections are released instead of leaking.
    with new_session() as session:
        return session.get(url, timeout=timeout).text

def strip_html_comment(string):
    """Remove HTML comment delimiters from `string`.

    Only the ``<!--`` and ``-->`` markers themselves are deleted; whatever
    text sat between them is kept in place. The input is scanned once, so
    delimiters that only appear after a removal are left untouched.
    """
    import re
    comment_delimiters = re.compile("<!--|-->")
    return comment_delimiters.sub("", string)

def get_box_scores_html(match_code):
    """Download one box-score page and return its comment-stripped HTML.

    Parameters
    ----------
    match_code : str
        Page identifier inserted into the basketball-reference box-score URL.

    Returns
    -------
    str
        The page after comment delimiters are removed and the markup has been
        round-tripped through BeautifulSoup.
    """
    url = f'https://www.basketball-reference.com/boxscores/{match_code}.html'
    # Comment delimiters are stripped before parsing, presumably so that
    # markup shipped inside HTML comments becomes part of the tree - TODO confirm.
    uncommented = strip_html_comment(get_page(url))
    # No parser is named, so BeautifulSoup picks whichever one is installed.
    return str(BeautifulSoup(uncommented))

In [4]:
# Fetch a single box-score page, drop its comment delimiters and parse it;
# `soup` is reused by the parsing cells further down.
page_content = get_page('https://www.basketball-reference.com/boxscores/202110190LAL.html')
page_content = strip_html_comment(page_content)
soup = BeautifulSoup(page_content)
# Preview only the first 2000 characters of the parsed markup.
str(soup)[:2000]

'\n<!DOCTYPE html>\n\n<html class="no-js" data-root="/home/bbr/build" data-version="klecko-" itemscope="" itemtype="https://schema.org/WebSite" lang="en">\n<head>\n<meta charset="utf-8"/>\n<meta content="ie=edge" http-equiv="x-ua-compatible"/>\n<meta content="width=device-width, initial-scale=1.0, maximum-scale=2.0" name="viewport">\n<link href="https://d2p3bygnnzw9w3.cloudfront.net/req/202204281" rel="dns-prefetch"/>\n     Quantcast Choice. Consent Manager Tag v2.0 (for TCF 2.0) \n<script async="true" type="text/javascript">\n    (function() {\n\tvar host = window.location.hostname;\n\tvar element = document.createElement(\'script\');\n\tvar firstScript = document.getElementsByTagName(\'script\')[0];\n\tvar url = \'https://quantcast.mgr.consensu.org\'\n\t    .concat(\'/choice/\', \'XwNYEpNeFfhfr\', \'/\', host, \'/choice.js\')\n\tvar uspTries = 0;\n\tvar uspTriesLimit = 3;\n\telement.async = true;\n\telement.type = \'text/javascript\';\n\telement.src = url;\n\t\n\tfirstScript.parentNo

# Box Scores

In [5]:
def parse_table(source, trow_selector):
    """Turn the rows matched by a CSS selector into a DataFrame.

    Parameters
    ----------
    source : BeautifulSoup, PageElement or str
        Already-parsed markup, or raw markup that is parsed here.
    trow_selector : str
        CSS selector matching the row elements to extract.

    Returns
    -------
    pandas.DataFrame
        One row per matched element, built from `parse_trow`; empty when the
        selector matches nothing.
    """
    if isinstance(source,(BeautifulSoup,PageElement)):
        root = source
    else:
        root = BeautifulSoup(source)
    records = list(map(parse_trow, root.select(trow_selector)))
    return pd.DataFrame(records)

def parse_trow(source):
    """Convert one table row into a ``{data-stat: text}`` dict.

    Parameters
    ----------
    source : BeautifulSoup, PageElement or str
        The row element, or raw markup for it that is parsed here.

    Returns
    -------
    dict
        Maps each ``th``/``td`` cell's ``data-stat`` attribute to the cell's
        text. When two cells share a ``data-stat`` value the later one wins.
    """
    if isinstance(source,(BeautifulSoup,PageElement)):
        row = source
    else:
        row = BeautifulSoup(source)
    record = {}
    for cell in row.select('th,td'):
        # Every selected cell is expected to carry a `data-stat` attribute.
        record[cell["data-stat"]] = cell.text
    return record

def parse_box_scores(source,teams,periods=('q1','q2','q3','q4','h1','h2')):
    """Extract every box-score table of one game page into DataFrames.

    Parameters
    ----------
    source : BeautifulSoup, PageElement or str
        Parsed page, or raw HTML. Raw HTML is parsed once here and the parsed
        tree is shared by all table lookups.
    teams : tuple
        ``(away, home)`` team codes as they appear in the page's table ids
        (``#box-{team}-game-basic`` and so on).
    periods : iterable of str, optional
        Period labels for which a ``#box-{team}-{period}-basic`` table is
        looked up for each team. Defaults to the four quarters and two halves.

    Returns
    -------
    dict
        Table name -> DataFrame, in this order: ``four_factors``,
        ``line_score``, the away/home ``basic`` and ``advanced`` game tables,
        then the per-period tables for the away team followed by the home team.
        A table whose selector matches nothing yields an empty DataFrame.
    """
    away,home = teams
    # Parse raw markup a single time. Passing the string straight through would
    # make parse_table rebuild the whole document for every selector below.
    soup = source if isinstance(source,(BeautifulSoup,PageElement)) else BeautifulSoup(source)

    # Body rows of a box-score table, skipping rows marked with class `thead`.
    body_rows = '> tbody > tr:not(.thead)'
    box_scores_meta = {
        'four_factors'  : '#four_factors > tbody > tr',
        'line_score'    : '#line_score > tbody > tr',
        'away-basic'    : f'#box-{away}-game-basic {body_rows}',
        'away-advanced' : f'#box-{away}-game-advanced {body_rows}',
        'home-basic'    : f'#box-{home}-game-basic {body_rows}',
        'home-advanced' : f'#box-{home}-game-advanced {body_rows}',
    }
    # Rows under <tfoot> are not collected (a previously commented-out selector
    # targeted `#box-{away}-game-basic > tfoot > tr:not(.thead)`).
    for side,team in (('away',away),('home',home)):
        for period in periods:
            box_scores_meta[f'{side}-{period}-basic'] = f'#box-{team}-{period}-basic {body_rows}'

    return {
        name : parse_table(soup,selector) for name,selector in box_scores_meta.items()
    }

# Parse every table of the sample page held in `soup`; GSW is passed as the
# away team and LAL as the home team.
tables = parse_box_scores(soup, teams=('GSW','LAL'))
# Iterator over (name, DataFrame) pairs, consumed one step at a time by the next cell.
iter_k = iter(tables.items())
tables['home-basic']

Unnamed: 0,player,mp,fg,fga,fg_pct,fg3,fg3a,fg3_pct,ft,fta,...,drb,trb,ast,stl,blk,tov,pf,pts,plus_minus,reason
0,Anthony Davis,38:55,15.0,26.0,0.577,1.0,5.0,0.2,2.0,7.0,...,9.0,11.0,2.0,1.0,2.0,0.0,0.0,33.0,-2.0,
1,LeBron James,36:44,13.0,23.0,0.565,5.0,11.0,0.455,3.0,6.0,...,10.0,11.0,5.0,1.0,1.0,4.0,5.0,34.0,-2.0,
2,Russell Westbrook,35:08,4.0,13.0,0.308,0.0,4.0,0.0,0.0,0.0,...,4.0,5.0,4.0,0.0,0.0,4.0,4.0,8.0,-23.0,
3,Kent Bazemore,30:37,3.0,9.0,0.333,2.0,8.0,0.25,0.0,0.0,...,2.0,2.0,0.0,0.0,1.0,1.0,4.0,8.0,10.0,
4,DeAndre Jordan,12:55,1.0,1.0,1.0,0.0,0.0,,0.0,0.0,...,2.0,2.0,1.0,1.0,0.0,0.0,3.0,2.0,2.0,
5,Carmelo Anthony,26:04,3.0,9.0,0.333,2.0,4.0,0.5,1.0,2.0,...,4.0,4.0,2.0,0.0,0.0,1.0,2.0,9.0,-5.0,
6,Rajon Rondo,19:49,1.0,4.0,0.25,1.0,3.0,0.333,0.0,0.0,...,0.0,0.0,5.0,2.0,0.0,3.0,2.0,3.0,1.0,
7,Malik Monk,18:44,2.0,5.0,0.4,2.0,4.0,0.5,0.0,0.0,...,3.0,3.0,1.0,2.0,0.0,1.0,1.0,6.0,-10.0,
8,Dwight Howard,12:49,1.0,2.0,0.5,0.0,0.0,,3.0,4.0,...,6.0,6.0,0.0,0.0,0.0,2.0,2.0,5.0,-7.0,
9,Avery Bradley,8:14,2.0,3.0,0.667,2.0,3.0,0.667,0.0,0.0,...,0.0,1.0,1.0,0.0,0.0,1.0,2.0,6.0,1.0,


In [6]:
# Advance `iter_k` by one (name, DataFrame) pair, print the table name and
# display that table. Re-running this cell steps through the tables in order;
# once the iterator is exhausted, `next` raises StopIteration.
k,v = next(iter_k)
print(k)
tables[k]

four_factors


Unnamed: 0,team_id,pace,efg_pct,tov_pct,orb_pct,ft_rate,off_rtg
0,GSW,112.8,0.516,13.8,18.4,0.269,107.3
1,LAL,112.8,0.553,14.1,10.9,0.095,101.1


# Fetch Box Scores

In [7]:
# Load the 2021-22 results table. Its `Code`, `Visitor` and `Home` columns are
# what the cells below use to build box-score URLs and table selectors.
# NOTE(review): `read_dataframe` is not defined in this notebook; presumably it
# comes from `from library import *` - confirm.
season = read_dataframe("../../data/21-22/results.csv")
season

Unnamed: 0,Date,Start,Visitor,PTS-V,Home,PTS-H,Matchup,OT,Attend.,Winner,Outcome_HV,Outcome_MU,Score_MU,Code
0,2021-10-19,19.5,BKN,104,MIL,127,BKN-MIL,0,17341.0,MIL,Home,1,"(104, 127)",202110190MIL
1,2021-10-19,22.0,GSW,121,LAC,114,GSW-LAC,0,18997.0,GSW,Visitor,0,"(121, 114)",202110190LAC
2,2021-10-20,19.0,IND,122,CHA,123,CHA-IND,0,15521.0,CHA,Home,0,"(123, 122)",202110200CHA
3,2021-10-20,19.0,CHI,94,DET,88,CHI-DET,0,20088.0,CHI,Visitor,0,"(94, 88)",202110200DET
4,2021-10-20,19.5,BOS,134,NYK,138,BOS-NYK,2,19812.0,NYK,Home,1,"(134, 138)",202110200NYK
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1225,2022-04-10,21.5,LAC,146,DEN,141,DEN-LAC,1,19520.0,LAC,Visitor,1,"(141, 146)",202204100DEN
1226,2022-04-10,21.5,OKC,88,LAL,138,LAL-OKC,0,18210.0,LAL,Home,0,"(138, 88)",202204100LAL
1227,2022-04-10,21.5,GSW,128,NOP,107,GSW-NOP,0,16595.0,GSW,Visitor,0,"(128, 107)",202204100NOP
1228,2022-04-10,21.5,SAC,116,PHX,109,PHX-SAC,0,17071.0,SAC,Visitor,1,"(109, 116)",202204100PHX


In [19]:
# Try the download-and-parse pipeline on a single game: the row at
# position 2 of the season table.
i,d = list(season.iterrows())[2]
match_code = d["Code"]
raw_html = get_box_scores_html(match_code)
tables = parse_box_scores(raw_html, teams=(d['Visitor'],d['Home']))
# Iterator over (name, DataFrame) pairs, stepped through by a later cell.
iter_k = iter(tables.items())



In [27]:
# Show only the head of the downloaded page; echoing the whole document
# floods the notebook output. Same 2000-character preview as the sample page.
raw_html[:2000]



In [26]:
# Advance `iter_k` by one (name, DataFrame) pair, print the table name and
# display that table. Re-running this cell steps through the tables in order;
# once the iterator is exhausted, `next` raises StopIteration.
k,v = next(iter_k)
print(k)
tables[k]

away-q1-basic


In [11]:

# Download, parse and save the box-score tables of every game in `season`,
# one CSV per table under a directory named after the game's code.
# NOTE(review): `time` and `write_dataframe` are not imported explicitly in this
# notebook; presumably they come from `from library import *` - confirm.
for i,d in season.iterrows():
    match_code = d['Code']
    raw_html = get_box_scores_html(match_code)
    # Parse the page that was just downloaded for THIS game. Parsing the global
    # `soup` here would re-use the earlier sample page for every game.
    tables = parse_box_scores(raw_html,(d['Visitor'],d['Home']))
    for name,table in tables.items():
        write_dataframe(table,f"../../data/21-22/box_scores/{match_code}/{name}.csv")
    # write_file(raw_html,f"../../data/21-22/box_scores/{match_code}/raw_html.html")
    print(match_code)
    # Pause between requests so the site is not hit back-to-back.
    time.sleep(0.5)



202110190MIL
202110190LAC
202110200CHA
202110200DET
202110200NYK
202110200TOR
202110200MEM
202110200MIN
202110200NOP
202110200SAS
202110200UTA
202110200POR
202110200PHX
202110210ATL
202110210MIA
202110210GSW
202110220ORL
202110220WAS
202110220CLE
202110220BOS
202110220PHI
202110220HOU
202110220CHI
202110220DEN
202110220LAC
202110220SAC
202110230CLE
202110230IND
202110230TOR
202110230CHI
202110230MIN
202110230SAS
202110230POR
202110230LAL
202110240BKN
202110240HOU
202110240NYK
202110240OKC
202110240SAC
202110240LAC
202110250CHA
202110250IND
202110250TOR
202110250MIA
202110250BKN
202110250ATL
202110250MIN
202110250DEN
202110250LAL
202110260NYK
202110260OKC
202110260SAS
202110260DAL
202110260UTA
202110270ORL
202110270BOS
202110270BKN
202110270TOR
202110270NOP
202110270MIL
202110270OKC
202110270PHX
202110270POR
202110270LAL
202110280WAS
202110280PHI
202110280HOU
202110280CHI
202110280DAL
202110280GSW
202110290TOR
202110290MIA
202110290BKN
202110290NOP
202110290DEN
202110290POR
202110290LAC

KeyboardInterrupt: 