In [1]:
from file_tools import *
from request_tools import *
import time
from tqdm import tqdm
import argparse
from parse_tools import *
from IPython.display import clear_output,HTML
from bs4 import BeautifulSoup, Comment

In [3]:
# Seconds to pause between consecutive HTTP requests; the scraping loops below
# pass this value to time.sleep().
sleep = 4
# Root directory under which the scraped HTML pages are saved.
TGT_DIR = './01-data-html'

def request_html(url,content_only=False):
    """Download a page and parse it with BeautifulSoup.

    Parameters
    ----------
    url : str
        Address passed to ``requests.get``.
    content_only : bool, optional
        When true, narrow the result to the ``<div id="content">`` element and
        return that element's prettified markup as the text.

    Returns
    -------
    tuple
        ``(html_text, html_soup)``. Both are ``None`` when the request fails
        (connection error or HTTP error status). When ``content_only`` is set
        but the page has no content div, ``html_soup`` is ``None`` and
        ``html_text`` keeps the full page text.

    Notes
    -----
    This function is best-effort: errors are printed and swallowed, never
    raised, so callers must check for ``None``. On HTTP 429 it sleeps for the
    number of seconds given in the ``Retry-After`` header (when that header is
    an integer) but does not retry the request itself.
    """
    html_text, html_soup = None, None
    # Bound before the try so the handler can inspect it even when
    # requests.get() itself raises and no response object exists.
    response = None
    try:
        response = requests.get(url)
        response.raise_for_status()
        response.encoding = response.apparent_encoding
        html_text = response.text
        html_soup = BeautifulSoup(html_text, 'html.parser')
        if content_only:
            html_soup = html_soup.find('div',{'id':'content'})
            # find() yields None when the div is absent; keep the full text then.
            if html_soup is not None:
                html_text = html_soup.prettify()
    except Exception as exc:
        # `Exception` (not a bare except) lets KeyboardInterrupt stop a long scrape.
        status_code = getattr(response, 'status_code', None)
        if status_code == 429:
            retry_after = response.headers.get('Retry-After')
            try:
                wait_seconds = int(retry_after)
            except (TypeError, ValueError):
                # Header missing, or given as an HTTP-date rather than seconds.
                wait_seconds = None
            if wait_seconds is None:
                print(f"Too many requests for {url}, but no usable Retry-After header ({retry_after!r})")
            else:
                print(f"Too many requests, sleeping for {retry_after} seconds, starting at {time.ctime()}")
                time.sleep(max(wait_seconds, 0))
        else:
            print(f'Request for {url} failed: {exc}')
    return html_text, html_soup

def load_html(url,content_only=False):
    """Read previously saved HTML via ``load_file`` and parse it.

    ``url`` is handed to ``load_file`` unchanged. With ``content_only`` set,
    the result is narrowed to the ``<div id="content">`` element and the text
    becomes that element's prettified markup.

    Returns a ``(html_text, html_soup)`` pair.
    """
    raw_markup = load_file(url)
    parsed = BeautifulSoup(raw_markup, 'html.parser')
    if not content_only:
        return raw_markup, parsed
    # Restrict both outputs to the main content container.
    content_div = parsed.find('div',{'id':'content'})
    return content_div.prettify(), content_div

def fetch_all_players_hrefs() -> list:
    """Collect player hrefs from the per-letter index pages.

    Requests one page per letter ``a``-``z`` under ``PLAYERS_PAGE``, gathers the
    ``href`` of every anchor found inside ``<th data-stat="player">`` cells,
    and writes the accumulated list (one href per line) to
    ``./00-data-facts/players_hrefs.txt`` after each successful letter and once
    more at the end. Pauses ``sleep`` seconds after every letter.

    Returns the list of hrefs collected.
    """
    collected = []
    progress = tqdm('abcdefghijklmnopqrstuvwxyz',ncols=150)
    for letter in progress:
        progress.set_description(letter)
        _, letter_soup = request_html('/'.join([PLAYERS_PAGE,letter]))
        if letter_soup:
            collected.extend(
                anchor['href']
                for header_cell in letter_soup.find_all('th', {'data-stat': 'player'})
                for anchor in header_cell.find_all('a')
            )
            # Checkpoint what has been gathered so far.
            save_file('./00-data-facts/players_hrefs.txt','\n'.join(collected))
        time.sleep(sleep)
    # Final write, covering the case where the last letters yielded nothing.
    save_file('./00-data-facts/players_hrefs.txt','\n'.join(collected))
    return collected


def scrape_all_player_htmls(TGT_DIR) -> None:
    """Download every player page listed in the hrefs file and save it.

    Reads ``./00-data-facts/players_hrefs.txt`` (one href per line), requests
    each href under ``HOME_PAGE`` with ``content_only=True`` and saves the
    returned markup to ``TGT_DIR/<href>`` via ``save_file``.

    Parameters
    ----------
    TGT_DIR : str
        Directory prefix for the saved files.

    Notes
    -----
    ``request_html`` signals a failed download by returning ``None`` for the
    text instead of raising, so that case is turned into a recorded failure
    here. The pause between requests happens after failures as well as
    successes. A summary of failed hrefs is printed at the end.
    """
    failed_hrefs = []
    # Drop empty lines (e.g. an empty file or trailing newline) so they are not
    # requested as if they were hrefs.
    players_hrefs = [
        href
        for href in load_file('./00-data-facts/players_hrefs.txt').split('\n')
        if href
    ]
    progress = tqdm(players_hrefs,ncols=150)
    for player_href in progress:
        progress.set_description(player_href)
        try:
            html_text, _ = request_html('/'.join([HOME_PAGE,player_href]),content_only=True)
            if html_text is None:
                raise ValueError('request_html returned no HTML')
            save_file('/'.join([TGT_DIR,player_href]), html_text)
        except Exception as e:
            print(f'Error getting {player_href}: {e}')
            failed_hrefs.append(player_href)
        # Outside the try so the delay is kept even after a failure.
        time.sleep(sleep)
    if failed_hrefs:
        fails = '\n'.join(failed_hrefs)
        print(f"Failed to fetch {len(failed_hrefs)} player pages: {fails}")
    else:
        print('All player pages fetched')


def extract_all_players_gamelog_hrefs(SRC_DIR):
    """Build a player-href -> gamelog-hrefs mapping from saved player pages.

    Reads the player hrefs from ``./00-data-facts/players_hrefs.txt``, loads
    each player's saved page from ``SRC_DIR/<href>`` and extracts its gamelog
    links. The mapping is written to
    ``./00-data-facts/players_gamelog_hrefs.json`` via ``save_json``.

    Parameters
    ----------
    SRC_DIR : str
        Directory prefix under which the player pages were saved.

    Notes
    -----
    The JSON file is written once, after the loop, rather than once per player;
    the write sits in a ``finally`` so partial results are still saved if the
    loop is interrupted. Nothing is written when no player was extracted.
    Players whose page cannot be loaded or parsed are collected and listed in
    the summary printed at the end.
    """
    def extract_player_gamelog_hrefs(html_soup):
        """Return the gamelog hrefs found in the page's ``bottom_nav`` div.

        For every matching link both the href itself and a copy with
        ``gamelog`` replaced by ``gamelog-advanced`` are returned.
        """
        gamelog_hrefs = []
        for a in html_soup.select('div[id="bottom_nav"] a[href*="gamelog"]'):
            # The selector matches any href containing "gamelog"; keep only
            # those with a "/gamelog/" path segment.
            if '/gamelog/' in a['href']:
                gamelog_hrefs.append(a['href'])
                gamelog_hrefs.append(re.sub('gamelog','gamelog-advanced',a['href']))
        return gamelog_hrefs

    ALL_PLAYERS_HREFS_LIST = load_file('./00-data-facts/players_hrefs.txt').split('\n')
    _FAILS_ = []
    all_players_gamelog_hrefs_dict = {}
    TQDM_PLAYER_HREFS = tqdm(ALL_PLAYERS_HREFS_LIST,ncols=150)
    try:
        for player_href in TQDM_PLAYER_HREFS:
            TQDM_PLAYER_HREFS.set_description(player_href)
            try:
                html_text,html_soup = load_html('/'.join([SRC_DIR,player_href]))
                all_players_gamelog_hrefs_dict[player_href] = extract_player_gamelog_hrefs(html_soup)
            except Exception:
                _FAILS_.append(player_href)
    finally:
        # Single write instead of re-serialising the growing dict per player.
        if all_players_gamelog_hrefs_dict:
            save_json('./00-data-facts/players_gamelog_hrefs.json', all_players_gamelog_hrefs_dict)
    if _FAILS_:
        fails = '\n'.join(_FAILS_)
        print(f"Failed to fetch {len(_FAILS_)} cases: {fails}")
    else:
        print('All cases extracted')


def scrape_all_players_gamelog_html(TGT_DIR):
    """Download every gamelog page listed in the gamelog-hrefs JSON file.

    Loads ``./00-data-facts/players_gamelog_hrefs.json`` (a mapping of player
    href to a list of gamelog hrefs), requests each gamelog href under
    ``HOME_PAGE`` with ``content_only=True`` and saves the markup to
    ``TGT_DIR/<gamelog href>.html`` via ``save_file``. Pauses ``sleep`` seconds
    after every request.

    Parameters
    ----------
    TGT_DIR : str
        Directory prefix for the saved files.

    Notes
    -----
    ``request_html`` returns ``None`` for the text when the download fails;
    that is treated as a failure here instead of being passed on to
    ``save_file``. Failed hrefs are printed as a summary at the end.
    """
    SRC_DIR = './00-data-facts/players_gamelog_hrefs.json'
    players_gamelog_hrefs_dict = load_json(SRC_DIR)
    _FAILS_ = []
    TQDM_PLAYER_HREFS = tqdm(players_gamelog_hrefs_dict.items(),ncols=150)
    for player_href,player_gamelog_href_list in TQDM_PLAYER_HREFS:
        # The description shows the player and how many gamelog pages they have.
        TQDM_PLAYER_HREFS.set_description(f'{player_href} ({len(player_gamelog_href_list)})')
        for player_gamelog_href in player_gamelog_href_list:
            try:
                html_text, html_soup = request_html('/'.join([HOME_PAGE,player_gamelog_href]),content_only=True)
                if html_text is None:
                    raise ValueError('request_html returned no HTML')
                save_file('/'.join([TGT_DIR,player_gamelog_href+'.html']), html_text)
            except Exception as e:
                print(f'Error getting {player_gamelog_href}: {e}')
                _FAILS_.append(player_gamelog_href)
            time.sleep(sleep)
    if _FAILS_:
        fails = '\n'.join(_FAILS_)
        print(f"Failed to fetch {len(_FAILS_)} cases: {fails}")
    else:
        print('All cases fetched')




    

In [None]:
# Root directory for the scraped HTML (same value as in the definitions cell).
TGT_DIR = './01-data-html'

# Pipeline stages in order. The first three are commented out, so only the last
# stage runs when this cell is executed:
#   1. fetch_all_players_hrefs           writes ./00-data-facts/players_hrefs.txt
#   2. scrape_all_player_htmls           reads that list, saves player pages under TGT_DIR
#   3. extract_all_players_gamelog_hrefs reads the saved pages, writes players_gamelog_hrefs.json
#   4. scrape_all_players_gamelog_html   reads that JSON, saves gamelog pages under TGT_DIR
# fetch_all_players_hrefs()
# scrape_all_player_htmls(TGT_DIR)
# extract_all_players_gamelog_hrefs('./01-data-html')
scrape_all_players_gamelog_html(TGT_DIR)

After scraping: retry the gamelog pages that failed

In [6]:
# Gamelog hrefs to fetch again -- presumably copied from the failure summary
# printed by the bulk gamelog scrape (TODO confirm the source).
FAILS = [
    '/players/b/battlke01/gamelog-advanced/1992',
    '/players/b/baumjo01/gamelog-advanced/1972/aba/',
    '/players/b/baumjo01/gamelog/1973/aba/',
    '/players/b/bayloel01/gamelog/1969',
    '/players/f/frahmri01/gamelog/2006',
    '/players/f/frahmri01/gamelog-advanced/2006',
    '/players/f/francst01/gamelog/2002',
    '/players/f/frankte01/gamelog/1990',
    '/players/f/frankja01/gamelog-advanced/2014',
    '/players/g/gazean01/gamelog/1994',
    '/players/l/lewisra02/gamelog/2001',
    '/players/l/liberma01/gamelog/1992',
    '/players/l/lichtto01/gamelog/1991',
    '/players/l/liggide01/gamelog/2014',
    '/players/l/ligongo01/gamelog-advanced/1968/aba/',
    '/players/l/ligongo01/gamelog-advanced/1971/aba/',
    '/players/n/nelsodo01/gamelog-advanced/1972',
    '/players/r/raycl01/gamelog-advanced/1978',
    '/players/t/tollian01/gamelog/2009',
    '/players/t/tomjaru01/gamelog/1981',
    '/players/t/toneyan01/gamelog/1988',
    '/players/t/toomaja01/gamelog-advanced/1950',
    '/players/w/williho01/gamelog-advanced/1996',
    '/players/y/youngni01/gamelog/2019',
    '/players/y/youngsa01/gamelog/2010',
]

FAILS

['/players/b/battlke01/gamelog-advanced/1992',
 '/players/b/baumjo01/gamelog-advanced/1972/aba/',
 '/players/b/baumjo01/gamelog/1973/aba/',
 '/players/b/bayloel01/gamelog/1969',
 '/players/f/frahmri01/gamelog/2006',
 '/players/f/frahmri01/gamelog-advanced/2006',
 '/players/f/francst01/gamelog/2002',
 '/players/f/frankte01/gamelog/1990',
 '/players/f/frankja01/gamelog-advanced/2014',
 '/players/g/gazean01/gamelog/1994',
 '/players/l/lewisra02/gamelog/2001',
 '/players/l/liberma01/gamelog/1992',
 '/players/l/lichtto01/gamelog/1991',
 '/players/l/liggide01/gamelog/2014',
 '/players/l/ligongo01/gamelog-advanced/1968/aba/',
 '/players/l/ligongo01/gamelog-advanced/1971/aba/',
 '/players/n/nelsodo01/gamelog-advanced/1972',
 '/players/r/raycl01/gamelog-advanced/1978',
 '/players/t/tollian01/gamelog/2009',
 '/players/t/tomjaru01/gamelog/1981',
 '/players/t/toneyan01/gamelog/1988',
 '/players/t/toomaja01/gamelog-advanced/1950',
 '/players/w/williho01/gamelog-advanced/1996',
 '/players/y/youngni0

In [10]:
# Spot-check: request a single entry from FAILS on its own.
player_gamelog_href = FAILS[2]

html_text, html_soup = request_html('/'.join([HOME_PAGE,player_gamelog_href]),content_only=True)
# Uncomment to render the fetched markup inline in the notebook.
# HTML(html_text)

In [11]:
# Retry only the gamelog hrefs listed in FAILS, saving each page to
# TGT_DIR/<href>.html and collecting any href that fails again in _FAILS_.
TGT_DIR = './01-data-html'
_FAILS_ = []
TQDM_PLAYER_HREFS = tqdm(FAILS,ncols=150)
for player_gamelog_href in TQDM_PLAYER_HREFS:
    TQDM_PLAYER_HREFS.set_description(f'{player_gamelog_href}')
    try:
        html_text, html_soup = request_html('/'.join([HOME_PAGE,player_gamelog_href]),content_only=True)
        # request_html returns None for the text when the download fails; treat
        # that as a failure instead of handing None to save_file.
        if html_text is None:
            raise ValueError('request_html returned no HTML')
        save_file('/'.join([TGT_DIR,player_gamelog_href+'.html']), html_text)
    except Exception as e:
        print(f'Error getting {player_gamelog_href}: {e}')
        _FAILS_.append(player_gamelog_href)
    # Pause between requests regardless of the outcome.
    time.sleep(sleep)
if _FAILS_:
    fails = '\n'.join(_FAILS_)
    print(f"Failed to fetch {len(_FAILS_)} cases: {fails}")
else:
    print('All cases fetched')

/players/y/youngsa01/gamelog/2010: 100%|██████████████████████████████████████████████████████████████████████████████| 25/25 [02:28<00:00,  5.92s/it]

All cases fetched



