In [1]:
import sys
sys.path.insert(0,"../../")

import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import requests
from bs4 import *
from bs4.element import PageElement
import time
from base import *

Fetch hrefs

In [20]:
def fetch_hrefs(page,allowed_domains=None,fetch_subpages=False,sleep=3.5,verbose=True):
    if not isinstance(allowed_domains,(tuple,list)):
        allowed_domains = (allowed_domains,)
    page_html = fetch_html(page)
    soup = BeautifulSoup(page_html)
    hrefs = [a['href'] for a in soup.find_all("a",href=True) if a['href'].startswith(tuple(allowed_domains))]
    hrefs = list(set(hrefs))
    if verbose:
        print(f" . Fetched {len(hrefs)} hrefs @ {page}")
    time.sleep(sleep)

    if fetch_subpages:
        next_hrefs = hrefs
        prev_hrefs = set()
        for href_i in next_hrefs:
            href_i_hrefs = fetch_hrefs(f"{page}{href_i}",allowed_domains,False,sleep,verbose)
            next_hrefs += list(set(href_i_hrefs) - set(prev_hrefs))
            prev_hrefs.add(href_i)
        hrefs += list(prev_hrefs)
    return hrefs

fetch_hrefs(page="https://basketball-reference.com",allowed_domains="/teams/GSW/2023",fetch_subpages=True)

 . Fetched 1 hrefs @ https://basketball-reference.com
 . Fetched 10 hrefs @ https://basketball-reference.com/teams/GSW/2023.html
 . Fetched 10 hrefs @ https://basketball-reference.com/teams/GSW/2023/on-off/
 . Fetched 10 hrefs @ https://basketball-reference.com/teams/GSW/2023_start.html
 . Fetched 11 hrefs @ https://basketball-reference.com/teams/GSW/2023/gamelog/
 . Fetched 10 hrefs @ https://basketball-reference.com/teams/GSW/2023_depth.html
 . Fetched 10 hrefs @ https://basketball-reference.com/teams/GSW/2023.html
 . Fetched 10 hrefs @ https://basketball-reference.com/teams/GSW/2023_games.html
 . Fetched 10 hrefs @ https://basketball-reference.com/teams/GSW/2023_referees.html
 . Fetched 10 hrefs @ https://basketball-reference.com/teams/GSW/2023/splits/
 . Fetched 10 hrefs @ https://basketball-reference.com/teams/GSW/2023/lineups/
 . Fetched 10 hrefs @ https://basketball-reference.com/teams/GSW/2023_transactions.html
 . Fetched 10 hrefs @ https://basketball-reference.com/teams/GSW/20

KeyboardInterrupt: 

In [18]:
set([1,2,3,4]) - (set([1,3,5]))

{2, 4}

In [4]:
fetch_html("https://basketball-reference.com")

'<!DOCTYPE html>\n\n[if lt IE 7]> <html class="no-js ie6 oldie" lang="en-US"> <![endif]\n\n\n[if IE 7]>    <html class="no-js ie7 oldie" lang="en-US"> <![endif]\n\n\n[if IE 8]>    <html class="no-js ie8 oldie" lang="en-US"> <![endif]\n\n\n[if gt IE 8]>\n> <html class="no-js" lang="en-US"> \n<![endif]\n\n<head>\n<title>Access denied | www.basketball-reference.com used Cloudflare to restrict access</title>\n<meta charset="UTF-8" />\n<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />\n<meta http-equiv="X-UA-Compatible" content="IE=Edge" />\n<meta name="robots" content="noindex, nofollow" />\n<meta name="viewport" content="width=device-width,initial-scale=1" />\n<link rel="stylesheet" id="cf_styles-css" href="/cdn-cgi/styles/main.css" />\n\n\n<script>\n(function(){if(document.addEventListener&&window.XMLHttpRequest&&JSON&&JSON.stringify){var e=function(a){var c=document.getElementById("error-feedback-survey"),d=document.getElementById("error-feedback-success"),b=new XMLH

In [29]:
def filedir_name(filepath):
    loc = filepath[::-1].find("/")
    if loc == -1:
        return "./",filepath
    else:
        return filepath[:-loc],filepath[-loc:]

def fetch_hrefs_list(source,allowed_hrefs=('/players','/teams','/leagues','/boxscores')):
    soup = BeautifulSoup(fetch_html(source))
    hrefs = [a['href'] for a in soup.find_all("a",href=True) if a['href'].startswith(allowed_hrefs)]
    return hrefs

def update_hrefs_index(hrefs_index=None,hrefs=[],host="https://basketball-reference.com",output=None):
    if isinstance(hrefs,str):
        hrefs = [hrefs]
    for href in hrefs:
        if href not in pd.unique(hrefs_index['host_href']):
            local_href = f"./{host.strip('https://')}{href}" if href.endswith('.html') else f"./{host.strip('https://')}{href}/index.html"
            hrefs_index.loc[len(hrefs_index)] = {'host_href':href,'local_href':local_href,'last_updated':np.nan}
    if output is not None:
        hrefs_index.to_csv(output)
    return hrefs_index

def fetch_update_hrefs_index(hrefs_index,host = "https://basketball-reference.com",page="/",fetch_subpages=False,output=None,sleep=3.5):
    hrefs = fetch_hrefs_list(f"{host}{page}")
    hrefs_index = update_hrefs_index(hrefs_index,hrefs,host=host,output=output)
    if fetch_subpages:
        next_hrefs = hrefs
        prev_hrefs = [page]
        for href_i in next_hrefs:
            print(f"{len(prev_hrefs)}/{len(next_hrefs)}  {href_i}")
            if href_i not in prev_hrefs:
                all_hrefs_ij = fetch_hrefs_list(f"{host}{href_i}")
                hrefs_index = update_hrefs_index(hrefs_index,all_hrefs_ij,host=host,output=output)
                next_hrefs = (next_hrefs + all_hrefs_ij)[1:]
                prev_hrefs.append(href_i)
                time.sleep(sleep)
    return hrefs_index

def fetch_update_hrefs_html(hrefs_index,host="https://basketball-reference.com",page="/",last_updated=1,sleep=3.5,output=None):
    update_rows = (hrefs_index['host_href'].str.startswith(page)
                    & ((pd.to_datetime(hrefs_index['last_updated']) <= pd.Timestamp.today() - 1 * pd.Timedelta(1,'D')) | hrefs_index['last_updated'].isna()))
    for i,href_row in hrefs_index[update_rows].iterrows():
        hrefs_index.loc[i,'last_updated'] = pd.Timestamp.today()
        html_url = f"{host}{href_row['host_href']}"
        local_url = f"{href_row['local_href']}"
        write(fetch_html(html_url),local_url)
        hrefs_index.to_csv(output)
        time.sleep(sleep)
    return hrefs_index    

hrefs_index_filepath = './basketball-reference.com/hrefs_index.csv'
hrefs_index = pd.read_csv(hrefs_index_filepath).drop(columns='Unnamed: 0')
fetch_update_hrefs_index(hrefs_index,page="/",fetch_subpages=True,output=hrefs_index_filepath)

0/298 (total=1) /players/
1/570 (total=2) /teams/
2/696 (total=3) /leagues/
3/1643 (total=4) /boxscores/
4/1797 (total=5) /players/
5/1797 (total=5) /players/e/eliema01.html
6/3143 (total=6) /players/b/brookaa01.html
7/4467 (total=7) /players/b/brownja02.html
8/5326 (total=8) /players/f/fosteje01.html
9/6611 (total=9) /players/r/reevebr01.html
10/7091 (total=10) /players/k/keefead01.html
11/8079 (total=11) /players/h/hilljo01.html
12/9051 (total=12) /leagues/NBA_2023_rookies.html
13/9322 (total=13) /players/w/willial06.html
14/9497 (total=14) /players/m/minotjo01.html
15/9678 (total=15) /players/b/brownke03.html
16/9864 (total=16) /players/c/champju02.html
17/10041 (total=17) /players/f/fostemi02.html
18/10190 (total=18) /players/r/robinor01.html
19/10366 (total=19) /players/
20/10366 (total=19) /players/o/orrlo01.html
21/11014 (total=20) /players/s/silaspa01.html
22/12230 (total=21) /players/m/mooreje01.html
23/12312 (total=22) /players/w/wrighji01.html


KeyboardInterrupt: 

In [27]:
hrefs_index = hrefs_index.sort_values('host_href')
hrefs_index

Unnamed: 0,host_href,local_href,last_updated
0,/boxscores/,basketball-reference.com/boxscores/index.html,2022-12-26 13:16:49.702286
10927,/boxscores/194911030SYR.html,./basketball-reference.com/boxscores/194911030...,
10963,/boxscores/194911170SYR.html,./basketball-reference.com/boxscores/194911170...,
10962,/boxscores/194912070TRI.html,./basketball-reference.com/boxscores/194912070...,
11014,/boxscores/195003260SYR.html,./basketball-reference.com/boxscores/195003260...,
...,...,...,...
4887,/teams/WSB/1994.html,./basketball-reference.com/teams/WSB/1994.html,
4885,/teams/WSB/1995.html,./basketball-reference.com/teams/WSB/1995.html,
4883,/teams/WSB/1996.html,./basketball-reference.com/teams/WSB/1996.html,
4882,/teams/WSB/1997.html,./basketball-reference.com/teams/WSB/1997.html,


In [28]:
fetch_update_hrefs_html(hrefs_index,page="/",last_updated=0,output=hrefs_index_filepath)

KeyboardInterrupt: 

We're only interested in Players, Teams, Seasons and Scores

Link Tree

Players

https://www.basketball-reference.com/players/

#div_alphabet > ul > li > a href

Teams

https://www.basketball-reference.com/teams/

Leagues https://www.basketball-reference.com/leagues/
- Schedule

BoxScores

https://www.basketball-reference.com/boxscores/