In [1]:
from bs4 import *
from bs4.element import PageElement
import time
import requests
import re
import numpy as np
import pandas as pd
from IPython.display import clear_output, HTML
from tqdm import tqdm, trange
import os
from file_tools import *
from request_tools import *
from parse_tools import *

# Base URL of the site. The page URLs below are derived from it so the host
# name is spelled in exactly one place; the resulting values are unchanged.
HOME_PAGE = 'https://www.basketball-reference.com'
SEASONS_PAGE = f'{HOME_PAGE}/leagues'
TEAMS_PAGE = f'{HOME_PAGE}/teams'
BOXSCORES_PAGE = f'{HOME_PAGE}/boxscores'

In [6]:
# Directory that the scraped lists are written under.
SAVE_TO = './data'

# Fetch the teams index page. request_html_text / parse_html_soup /
# parse_html_table come from the star-imported project helper modules; their
# exact behaviour is not visible in this notebook.
html_text = request_html_text(TEAMS_PAGE)
# Remove the HTML comment delimiters themselves so that any markup wrapped in
# <!-- ... --> stays in the text as ordinary markup.
html_text = re.sub(r'<!--|-->', '', html_text)
html_soup = parse_html_soup(html_text)

# Look up the table with id="teams_active".
# NOTE(review): if html_soup is a BeautifulSoup object, find() returns None when
# the table is missing and the .prettify() call below raises — confirm/guard.
teams_active = html_soup.find('table', {'id': 'teams_active'})

# NOTE(review): presumably extract_links='body' makes every body cell a
# (text, href) pair — confirm against parse_html_table in parse_tools.
teams_active_df = parse_html_table(teams_active.prettify(),extract_links='body')
# Keep the second item of each 'Franchise' pair, skipping entries without a link.
teams_active_hrefs_list = [href for _,href in teams_active_df['Franchise'] if href is not None]

# Same extraction, but over the whole page rather than only the active table.
teams_all_df = parse_html_table(html_soup.prettify(),extract_links='body')
teams_all_hrefs_list = [href for _,href in teams_all_df['Franchise'] if href is not None]

# Persist both href lists, one href per line.
save_file(f'{SAVE_TO}/teams.txt', '\n'.join(teams_active_hrefs_list))
save_file(f'{SAVE_TO}/teams_all.txt', '\n'.join(teams_all_hrefs_list))

In [7]:
# Load teams list
# Blank lines are dropped so an empty entry never turns into a request for
# HOME_PAGE itself.
teams_list = [href for href in load_file(f'{SAVE_TO}/teams_all.txt').split('\n') if href.strip()]
tqdm_team_list = tqdm(teams_list)
all_team_seasons_hrefs_list = []
fails = []
for team in tqdm_team_list:
    tqdm_team_list.set_description(team)
    try:
        # The download is inside the try so that one failed request is recorded
        # in `fails` instead of aborting the whole crawl.
        html_text = request_html_text(HOME_PAGE + team)
        html_text = clean_html_text(html_text)
        team_seasons_df = parse_html_table(html_text,extract_links='body')
        # Skip cells that carry no link (same filter as the franchise lists);
        # '\n'.join below raises TypeError if a None slips through.
        team_seasons_hrefs_list = [href for _,href in team_seasons_df['Season'] if href is not None]
        all_team_seasons_hrefs_list += team_seasons_hrefs_list
    except Exception as e:
        print(f'Error parsing {team}: {e}')
        fails.append(team)
    # Pause between requests to stay polite to the server.
    time.sleep(3)

save_file(f'{SAVE_TO}/team_seasons.txt', '\n'.join(all_team_seasons_hrefs_list))

# Surface the collected failures instead of leaving them only in `fails`.
if fails:
    print(f'Failed to parse {len(fails)} teams: {fails}')

/teams/WAT/: 100%|██████████| 53/53 [03:01<00:00,  3.42s/it]


In [16]:
# Scrape all game logs ever
# Load team seasons list (blank lines are dropped so they do not trigger requests)
team_seasons_list = [href for href in load_file(f'{SAVE_TO}/team_seasons.txt').split('\n') if href.strip()]
tqdm_team_seasons_list = tqdm(team_seasons_list)
all_game_logs_hrefs_list = []
fails = []
# For each team season, scrape the basic and advanced game logs
for team_season in tqdm_team_seasons_list:
    # Drop the literal '.html' suffix. str.strip('.html') is NOT a suffix
    # remover: it strips any run of the characters '.', 'h', 't', 'm', 'l'
    # from both ends, which can eat part of the path itself.
    if team_season.endswith('.html'):
        team_season_base = team_season[:-len('.html')]
    else:
        team_season_base = team_season
    for gl in ['/gamelog','/gamelog-advanced']:
        try:
            html_soup = request_html_soup(HOME_PAGE + team_season_base + gl)
            html_soup = content_div_only(html_soup)
            html_text = html_soup.prettify()
            html_text = clean_html_text(html_text)
            save_file(f'{SAVE_TO}{team_season_base}{gl}.html', html_text)
        except Exception as e:
            print(f'Error parsing {team_season}: {e}')
            fails.append(team_season)
        # Pause after every request, successful or not.
        time.sleep(3)
if fails:
    print(f'Failed to parse {len(fails)} team seasons: {fails}')

# html_basic_regular = parse_html_soup(html_text).find('table', {'id': 'tgl_basic'})
# html_basic_playoffs = parse_html_soup(html_text).find('table', {'id': 'tgl_basic_playoffs'})
# html_advanced_regular = parse_html_soup(html_text).find('table', {'id': 'tgl_advanced'})
# html_advanced_playoffs = parse_html_soup(html_text).find('table', {'id': 'tgl_advanced_playoffs'})

# gamelog_basic_regular = parse_html_table(html_basic_regular.prettify())
# gamelog_basic_regular
# ['gamelog','gamelog-advanced']


  0%|          | 0/1758 [02:51<?, ?it/s]
  1%|▏         | 25/1758 [03:35<4:04:15,  8.46s/it]

Error parsing /teams/ATL/1999.html: 'NoneType' object has no attribute 'prettify'


  2%|▏         | 27/1758 [03:51<3:59:50,  8.31s/it]

In [None]:
# NOTE(review): unfinished cell — the inner subscript `[]` is empty, so this
# line is a SyntaxError as written. Fill in the intended row mask or delete it.
gamelog_basic_regular[gamelog_basic_regular[]]

In [14]:
# Flatten/merge the columns from multi-index to single index.
# Guarded so the cell can be re-run safely: once the columns are plain strings,
# '_'.join would be applied to each name's characters
# (e.g. 'Team_FG' -> 'T_e_a_m___F_G').
if isinstance(gamelog_basic_regular.columns, pd.MultiIndex):
    gamelog_basic_regular.columns = gamelog_basic_regular.columns.map('_'.join)
gamelog_basic_regular

Unnamed: 0,Unnamed: 0_level_0_Rk,Unnamed: 1_level_0_G,Unnamed: 2_level_0_Date,Unnamed: 3_level_0_Unnamed: 3_level_1,Unnamed: 4_level_0_Opp,Unnamed: 5_level_0_W/L,Unnamed: 6_level_0_Tm,Unnamed: 7_level_0_Opp,Team_FG,Team_FGA,...,Opponent_FT,Opponent_FTA,Opponent_FT%,Opponent_ORB,Opponent_TRB,Opponent_AST,Opponent_STL,Opponent_BLK,Opponent_TOV,Opponent_PF
0,1,1,2022-10-19,,HOU,W,117,107,45,90,...,14,15,.933,15,54,25,4,3,15,20
1,2,2,2022-10-21,,ORL,W,108,98,40,89,...,24,30,.800,10,44,16,9,7,19,19
2,3,3,2022-10-23,,CHO,L,109,126,39,95,...,21,30,.700,14,52,28,7,6,13,24
3,4,4,2022-10-26,@,DET,W,118,113,45,91,...,15,22,.682,10,40,23,4,6,13,25
4,5,5,2022-10-28,@,DET,W,136,112,55,97,...,31,38,.816,14,42,18,2,4,10,22
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
85,80,80,2023-04-05,,WAS,W,134,116,43,84,...,20,26,.769,12,39,23,10,9,14,26
86,,,,,,,,,Team,Team,...,Opponent,Opponent,Opponent,Opponent,Opponent,Opponent,Opponent,Opponent,Opponent,Opponent
87,Rk,G,Date,,Opp,W/L,Tm,Opp,FG,FGA,...,FT,FTA,FT%,ORB,TRB,AST,STL,BLK,TOV,PF
88,81,81,2023-04-07,,PHI,L,131,136,51,92,...,23,29,.793,13,52,33,10,2,20,22


Drafts

Scrape and save data from basketball-reference.com

In [None]:
# function to get the html of a page
def get_html(url, timeout=30):
    """Download ``url`` and return the response body as text.

    Args:
        url: Absolute URL to fetch.
        timeout: Seconds passed to ``requests.get`` as its timeout, so a
            stalled connection cannot hang the scraping loop indefinitely.

    Returns:
        The decoded response text, or the sentinel string ``"Error"`` when the
        request fails or the server answers with an HTTP error status.
    """
    try:
        r = requests.get(url, timeout=timeout)
        r.raise_for_status()
        # Decode using the encoding detected from the body content.
        r.encoding = r.apparent_encoding
        return r.text
    except requests.RequestException as e:
        # Only request/HTTP failures are converted into the sentinel; a bare
        # `except:` would also swallow KeyboardInterrupt and make a long
        # scraping run impossible to stop from the keyboard.
        print(f'Error requesting {url}: {e}')
        return "Error"

# Fetch a page and hand back its parsed BeautifulSoup tree
def get_html_soup(url):
    """Download ``url`` with get_html and parse the text using 'html.parser'."""
    return BeautifulSoup(get_html(url), 'html.parser')

def content_only(soup):
    """Return the first <div id="content"> found in ``soup`` (whatever ``soup.find`` yields)."""
    content_attrs = {'id': 'content'}
    return soup.find('div', content_attrs)

# function to save a file
def save_file(file_name, content):
    """Write ``content`` to ``file_name``, creating parent directories as needed.

    Args:
        file_name: Destination path; may be a bare file name or include
            directories that do not exist yet.
        content: Text to write. The file is opened in text write mode, so any
            existing file at that path is overwritten.
    """
    directory = os.path.dirname(file_name)
    # dirname is '' for a bare file name and os.makedirs('') raises, so only
    # create directories when the path actually has a directory component.
    if directory:
        # exist_ok=True replaces the separate exists() check and does not fail
        # if the directory is already there.
        os.makedirs(directory, exist_ok=True)
    # Save the file
    with open(file_name, 'w') as f:
        f.write(content)

# Read a whole text file back into memory
def load_file(file_name):
    """Return the full contents of ``file_name``, opened in text read mode."""
    with open(file_name, 'r') as handle:
        contents = handle.read()
    return contents
    

# Fetch and parse the seasons index page, then show the parsed soup as the
# cell output (this renders the entire document).
html = get_html_soup(SEASONS_PAGE)
html

In [None]:
# Collect the href of every season link on the seasons index page
def req_all_seasons_hrefs() -> list:
    """Return the hrefs of all links found in ``<th data-stat="season">`` cells.

    The page at SEASONS_PAGE is fetched with get_html_soup. If anything raises
    while walking the cells, the error is printed and the hrefs gathered up to
    that point are returned.
    """
    soup = get_html_soup(SEASONS_PAGE)
    hrefs = []
    try:
        season_headers = soup.find_all('th', {'data-stat': 'season'})
        for header in season_headers:
            anchors = header.find_all('a')
            for anchor in anchors:
                hrefs.append(anchor['href'])
    except Exception as err:
        print(f'Error getting seasons: {err}')
    return hrefs

def _collect_box_score_hrefs(container) -> list:
    """Return the href of every link inside ``td[data-stat=box_score_text]`` cells of ``container``."""
    return [
        a['href']
        for td in container.find_all('td', {'data-stat': 'box_score_text'})
        for a in td.find_all('a')
    ]


def req_season_games_hrefs(season_href,sleep=3) -> list:
    """Return the box-score hrefs listed on a season's schedule page(s).

    Args:
        season_href: Site-relative href of the season page, e.g.
            '/leagues/NBA_2020.html'. The '.html' suffix is replaced by
            '_games.html' to build the schedule URL.
        sleep: Seconds to pause after each per-month request.

    Returns:
        A list of hrefs. When the schedule page has a ``div.filter`` element,
        every link in it is fetched and scanned; otherwise the schedule page
        itself is scanned. If anything raises while scanning, the error is
        printed and the hrefs gathered so far are returned.
    """
    games = []
    # Remove the literal '.html' suffix. str.strip('.html') strips any run of
    # the characters '.', 'h', 't', 'm', 'l' from both ends instead, which can
    # remove part of the page name.
    if season_href.endswith('.html'):
        season_base = season_href[:-len('.html')]
    else:
        season_base = season_href
    html_soup = get_html_soup(HOME_PAGE + season_base + '_games.html')
    filter_div = html_soup.find('div',{'class':'filter'})
    try:
        if filter_div is None:
            games.extend(_collect_box_score_hrefs(html_soup))
        else:
            month_hrefs = [a['href'] for a in filter_div.select('a')]
            for month_href in month_hrefs:
                month_soup = get_html_soup(HOME_PAGE + month_href)
                schedule_table = month_soup.find('table', {'id': 'schedule'})
                games.extend(_collect_box_score_hrefs(schedule_table))
                time.sleep(sleep)
    except Exception as e:
        print(f'Error getting boxscores for season {season_href}: {e}')
    return games

def req_game_boxscores_hrefs(game_href) -> list:
    """Return the box-score page hrefs associated with one game.

    Args:
        game_href: Site-relative href of the game page.

    Returns:
        A list (the previous ``-> dict`` annotation was wrong). If the game
        page has a ``div.filter`` element, the hrefs of all links in it are
        returned; otherwise the list holds only ``game_href``. On any error
        the message is printed and an empty list is returned.
    """
    boxscores = []
    try:
        html_soup = get_html_soup(HOME_PAGE + game_href)
        filter_div = html_soup.find('div',{'class':'filter'})
        if filter_div is not None:
            boxscores.extend([a['href'] for a in filter_div.select('a')])
        else:
            boxscores.append(game_href)
    except Exception as e:
        print(f'Error getting boxscores hrefs for game {game_href}: {e}')

    return boxscores


# Example calls for the other helpers defined in this cell (left disabled):
# req_all_seasons_hrefs()
# req_season_games_hrefs('/leagues/NBA_2020.html')
req_game_boxscores_hrefs('/boxscores/201910220TOR.html')


In [None]:
# Output directory for the cells below. Paths are built by plain string
# concatenation (SAVE_TO + name), hence the trailing slash.
SAVE_TO = './data/'

# Get all files within a directory
def get_all_files(directory, followlinks=False):
    """Return the names of the files directly inside ``directory``.

    Only the first entry produced by ``os.walk`` is used, so sub-directories
    are not descended into.

    Args:
        directory: Directory to list.
        followlinks: Forwarded to ``os.walk``.

    Returns:
        A list of file names (not full paths); an empty list when ``os.walk``
        yields nothing, e.g. because ``directory`` does not exist. Previously
        that case fell through the loop and returned None.
    """
    for _root, _dirs, file_names in os.walk(directory, followlinks=followlinks):
        return file_names
    return []
    
# Get all folders within a directory
def get_all_folders(directory):
    """Return the names of the sub-directories directly inside ``directory``.

    Only the first entry produced by ``os.walk`` is used, so nested folders
    are not included.

    Returns:
        A list of folder names (not full paths); an empty list when
        ``os.walk`` yields nothing, e.g. because ``directory`` does not exist.
        Previously that case fell through the loop and returned None.
    """
    for _root, dir_names, _files in os.walk(directory):
        return dir_names
    return []
    
# Report whether a path points at an existing regular file
def file_exists(file_name):
    """Return True when ``file_name`` is an existing file, as judged by os.path.isfile."""
    is_regular_file = os.path.isfile(file_name)
    return is_regular_file

In [None]:
# Seconds to pause between requests.
sleep = 3

# Create our game hrefs list
SAVE_TO = './data/'
game_hrefs = []
# Only the first 25 entries returned by req_all_seasons_hrefs() are processed.
seasons_hrefs = req_all_seasons_hrefs()[:25]
for season_href in tqdm(seasons_hrefs,position=0, leave=True):
    # fetch season boxscores list
    season_games_hrefs = req_season_games_hrefs(season_href,sleep)
    # Append this season's hrefs in descending string order.
    game_hrefs += sorted(season_games_hrefs,reverse=True)
    # Rewrite the whole list after every season so the progress made so far
    # is already on disk if a later season fails.
    save_file(SAVE_TO + 'boxscores.txt', '\n'.join(game_hrefs))
    time.sleep(sleep)



In [None]:
# Load our game hrefs list
sleep = 3
SAVE_TO = './data/'
# Blank lines are dropped so an empty entry never turns into a request for
# HOME_PAGE itself.
game_hrefs = [href for href in load_file(SAVE_TO + 'boxscores.txt').split('\n') if href.strip()]
game_hrefs_tqdm = tqdm(game_hrefs,position=0, leave=True)
for game_href in game_hrefs_tqdm:
    game_hrefs_tqdm.set_description(f'{game_href}')
    # Check if we already have the boxscores
    if file_exists(SAVE_TO + game_href):
        continue
    # Else fetch the game html
    try:
        html_soup = get_html_soup(HOME_PAGE + game_href)
        html_text = content_only(html_soup).prettify()
        save_file(SAVE_TO + game_href, html_text)
    except Exception as e:
        print(f'Error getting boxscores for game {game_href}: {e}')
    finally:
        # Pause after every request, including failed ones. With the sleep
        # inside the try, a run of failures (e.g. while being rate limited)
        # was retried back-to-back with no delay at all.
        time.sleep(sleep)


In [None]:
sleep = 3

# Get all boxscores hrefs, save them according to season
SAVE_TO = './data/'
seasons_hrefs = req_all_seasons_hrefs()[:25]
for season_href in tqdm(seasons_hrefs,position=0, leave=True):
    # fetch season boxscores list
    season_games_hrefs = req_season_games_hrefs(season_href,sleep)
    for game_href in tqdm(season_games_hrefs,position=1, leave=True):
        # fetch game boxscores
        try:
            html_text = content_only(get_html_soup(HOME_PAGE + game_href)).prettify()
            save_file(SAVE_TO + game_href, html_text)
        except Exception as e:
            # A page without a content div makes content_only return None and
            # .prettify() raise; report it and carry on instead of losing the
            # rest of the run.
            print(f'Error getting boxscores for game {game_href}: {e}')
        finally:
            # Pause after every request, successful or not.
            time.sleep(sleep)

    # save season boxscores list
    # save_file(save_to + season_href.strip('.html')+'_boxscores_hrefs.txt', '\n'.join(season_boxscores_hrefs))
