In [78]:
import re
import requests
from bs4 import BeautifulSoup
import pandas as pd
from io import StringIO

def scrape_espn_bracket(url):
    """
    Scrape an ESPN tournament bracket page and extract matchup information.
    
    Args:
        url (str): The URL of the ESPN bracket page
        
    Returns:
        dict: Dictionary containing matchups organized by region
    """
    print(f"Scraping bracket from: {url}")
    
    # Send request with headers to mimic a browser
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }
    
    response = requests.get(url, headers=headers)
    
    if response.status_code != 200:
        print(f"Failed to retrieve page: {response.status_code}")
        return None
    
    soup = BeautifulSoup(response.text, 'html.parser')
    
    # Find all items with the class 'BracketCell'
    bracket_cells = soup.find_all(class_='BracketCell')
    champ_cell = soup.find(class_='BracketCellChamp')
    print(f"Found {len(bracket_cells)} bracket cells")
    print(f"Found champion cell: {champ_cell is not None}")

    # Some hacky checking to remove first four games
    if len(bracket_cells) > 62:
        print("Removing first four games")
        bracket_cells = bracket_cells[:-4]
        print(f"Remaining bracket cells: {len(bracket_cells)}")

    bracket_cells.append(champ_cell)  # Add the champion cell to the list of bracket cells if it exists

    # Grab teams from the bracket cells
    # for cell in bracket_cells:
    #     team_cells = cell.find_all(class_='BracketCell__Name')
    #     team_names = [ind_team_cell.get_text(strip=True) for ind_team_cell in team_cells]

    #     seed_cells = cell.find_all(class_='BracketCell__Rank')
    #     seed_names = [ind_seed_cell.get_text(strip=True) for ind_seed_cell in seed_cells]

    # get a list of game urls (hrefs attributes of the bracket cells)
    game_urls = [matchup.get('href') for matchup in bracket_cells if matchup.get('href')]
    print(f"Found {len(game_urls)} game URLs")
    return game_urls

def translate_espn_game_url_to_boxscore_url(game_url):
    """
    Translate an ESPN game URL to its corresponding box score URL.
    
    Args:
        game_url (str): The URL of the ESPN game page
        
    Returns:
        str: The corresponding box score URL
    """
    # Example game URL: https://www.espn.com/mens-college-basketball/game/_/gameId/401360000
    match = re.search(r'/game/_/gameId/(\d+)', game_url)
    if match:
        game_id = match.group(1)
        boxscore_url = f'https://www.espn.com/mens-college-basketball/boxscore/_/gameId/{game_id}'
        return boxscore_url
    return None

def clean_espn_box_score_df(df_espn):
    """
    Clean the DataFrame containing box score data by renaming columns and dropping unnecessary rows.
    
    Args:
        df (pd.DataFrame): The DataFrame to clean
        
    Returns:
        pd.DataFrame: The cleaned DataFrame
    """
    # set the column names to be the first row
    df_espn.columns = df_espn.iloc[0]
    
    # if only bench players are present, use that as the column to select
    if ('bench' in df_espn.columns) and ('starters' not in df_espn.columns):
        df_espn = df_espn[['bench', 'PTS']]
        # rename the columns to be 'starters' and 'PTS'
        df_espn.columns = ['starters', 'PTS']
    else:
        df_espn = df_espn[['starters', 'PTS']]

    # drop the rows that are nan or repeat the column names
    df_espn = df_espn.dropna()
    df_espn = df_espn[df_espn['starters'] != 'starters']
    df_espn = df_espn[df_espn['starters'] != 'bench']
    df_espn = df_espn[df_espn['starters'] != 'team']

    # reset the index
    df_espn = df_espn.reset_index(drop=True)
    return df_espn

def insert_espn_box_score_df_to_player_bookkeeping(df, name, seed, bk_dict):
    """
    Insert the cleaned DataFrame into a player bookkeeping dictionary.
    
    Args:
        df (pd.DataFrame): The DataFrame containing player stats
        bk_dict (dict): The player bookkeeping dictionary
        
    Returns:
        dict: Updated player bookkeeping dictionary
    """
    for index, row in df.iterrows():
        player_name = row['starters']
        player_pts_scored = int(row['PTS'])
        if player_name not in bk_dict:
            bk_dict[player_name] = {'team': name, 'seed': seed, 'pts': player_pts_scored}
        else:
            bk_dict[player_name]['pts'] += player_pts_scored  # accumulate points if player already exists
    
    return bk_dict


def scrape_espn_box_score(url, bookkeeping_dict=None):
    """
    Scrape an ESPN box score page and extract game information.
    
    Args:
        url (str): The URL of the ESPN box score page
    """
    print(f"Scraping box score from: {url}")
    
    # Send request with headers to mimic a browser
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }
    
    response = requests.get(url, headers=headers)
    
    if response.status_code != 200:
        print(f"Failed to retrieve page: {response.status_code}")
        return None
    
    soup = BeautifulSoup(response.text, 'html.parser')

    # First get teams and seeds from the page
    # look for the class 'Gamestrip__InfoWrapper'
    team_info = soup.find_all(class_='Gamestrip__InfoWrapper')
    if not team_info:
        print(f"No team info found on page")
        return None
    if len(team_info) != 2:
        print(f"Expected 2 teams, found {len(team_info)}")
        return None
    
    team1 = team_info[0].get_text(strip=True)
    team1_seed = re.search(r'\d{1,2}', team1).group(0)
    team1_name = re.sub(r'\d+', '', team1).strip()
    
    team2 = team_info[1].get_text(strip=True)
    team2_seed = re.search(r'\d{1,2}', team2).group(0)
    team2_name = re.sub(r'\d+', '', team2).strip()

    class_to_find = 'Boxscore Boxscore__ResponsiveWrapper'
    boxscore = soup.find(class_=class_to_find)
    if not boxscore:
        print(f"No box score found on page")
        return None

    # Extract tables
    tables = boxscore.find_all('table')
    if not tables:
        print(f"No tables found in box score")
        return None
    if len(tables) != 4:
        print(f"Expected 4 tables, found {len(tables)}")
        return None
    
    team1_players_df = pd.read_html(StringIO(str(tables[0])))[0]
    team1_stats_df = pd.read_html(StringIO(str(tables[1])))[0]
    
    team2_players_df = pd.read_html(StringIO(str(tables[2])))[0]
    team2_stats_df = pd.read_html(StringIO(str(tables[3])))[0]
    

    # Combine player stats with team stats
    team1_stats_df = pd.concat([team1_players_df, team1_stats_df], axis=1)
    team2_stats_df = pd.concat([team2_players_df, team2_stats_df], axis=1)

    # Clean up dataframes
    team1_stats_df = clean_espn_box_score_df(team1_stats_df)
    team2_stats_df = clean_espn_box_score_df(team2_stats_df)

    # Insert data into bookkeeping dictionaries
    insert_espn_box_score_df_to_player_bookkeeping(team1_stats_df, team1_name, team1_seed, bookkeeping_dict)
    insert_espn_box_score_df_to_player_bookkeeping(team2_stats_df, team2_name, team2_seed, bookkeeping_dict)

    print(f'Done scraping box score for {url}')

In [65]:
def get_multiplier(seed):
    if seed < 6:
        return 1
    elif seed < 13:
        return 2
    else:
        return 3

In [66]:
scrape_espn_box_score('https://www.espn.com/mens-college-basketball/boxscore/_/gameId/401638599', {})

Scraping box score from: https://www.espn.com/mens-college-basketball/boxscore/_/gameId/401638599
Done scraping box score for https://www.espn.com/mens-college-basketball/boxscore/_/gameId/401638599


In [79]:
def full_espn_pipeline(year):
    """
    Full pipeline to scrape an ESPN tournament bracket and box scores for a given year.
    
    Args:
        year (int): The year of the tournament to scrape
    """
    espn_bracket_base = 'https://www.espn.com/mens-college-basketball/bracket/_/season/{}'
    espn_bracket_url = espn_bracket_base.format(year)
    bracket_data = scrape_espn_bracket(espn_bracket_url)
    
    if not bracket_data:
        print(f"Failed to scrape bracket data for {year}")
        return
    
    # Initialize bookkeeping dictionary
    bookkeeping_dict = {}
    
    # Iterate through matchups and scrape box scores
    for game_url in bracket_data:
        boxscore_url = translate_espn_game_url_to_boxscore_url(game_url)
        if not boxscore_url:
            print(f"Failed to translate game URL to box score URL: {game_url}")
            continue
        
        # print(f"Scraping box score for matchup: {boxscore_url}")
        scrape_espn_box_score(boxscore_url, bookkeeping_dict)

    # Multiply the points by the multiplier
    for player, data in bookkeeping_dict.items():
        seed = int(data['seed'])
        multiplier = get_multiplier(seed)
        data['pts_mult'] = data['pts'] * multiplier
        data['year'] = year  # Add the year to the bookkeeping dictionary
    
    return bookkeeping_dict

In [69]:
bk_dicts = {}

for year in range(2008, 2025):
    print(f"Scraping bracket for year: {year}")
    bookkeeping_dict = full_espn_pipeline(year)
    if bookkeeping_dict:
        bk_dicts[year] = bookkeeping_dict
    else:
        print(f"No data found for year: {year}")

Scraping bracket for year: 2008
Scraping bracket from: https://www.espn.com/mens-college-basketball/bracket/_/season/2008
Found 62 bracket cells
Found champion cell: True
Found 63 game URLs
Scraping box score from: https://www.espn.com/mens-college-basketball/boxscore/_/gameId/284000047
Done scraping box score for https://www.espn.com/mens-college-basketball/boxscore/_/gameId/284000047
Scraping box score from: https://www.espn.com/mens-college-basketball/boxscore/_/gameId/284000048
Done scraping box score for https://www.espn.com/mens-college-basketball/boxscore/_/gameId/284000048
Scraping box score from: https://www.espn.com/mens-college-basketball/boxscore/_/gameId/284000050
Done scraping box score for https://www.espn.com/mens-college-basketball/boxscore/_/gameId/284000050
Scraping box score from: https://www.espn.com/mens-college-basketball/boxscore/_/gameId/284000051
Done scraping box score for https://www.espn.com/mens-college-basketball/boxscore/_/gameId/284000051
Scraping box s

In [71]:
# add 2024 tournament data to the bookkeeping dict
bk_dicts[2024] = full_espn_pipeline(2024)

Scraping bracket from: https://www.espn.com/mens-college-basketball/bracket/_/season/2024
Found 66 bracket cells
Found champion cell: True
Removing first four games
Remaining bracket cells: 62
Found 63 game URLs
Scraping box score from: https://www.espn.com/mens-college-basketball/boxscore/_/gameId/401638599
Done scraping box score for https://www.espn.com/mens-college-basketball/boxscore/_/gameId/401638599
Scraping box score from: https://www.espn.com/mens-college-basketball/boxscore/_/gameId/401638614
Done scraping box score for https://www.espn.com/mens-college-basketball/boxscore/_/gameId/401638614
Scraping box score from: https://www.espn.com/mens-college-basketball/boxscore/_/gameId/401638613
Done scraping box score for https://www.espn.com/mens-college-basketball/boxscore/_/gameId/401638613
Scraping box score from: https://www.espn.com/mens-college-basketball/boxscore/_/gameId/401638612
Done scraping box score for https://www.espn.com/mens-college-basketball/boxscore/_/gameId/40

In [80]:
# add 2005-2007 tournament data to the bookkeeping dict
for year in range(2005, 2008):
    print(f"Scraping bracket for year: {year}")
    bookkeeping_dict = full_espn_pipeline(year)
    if bookkeeping_dict:
        bk_dicts[year] = bookkeeping_dict
    else:
        print(f"No data found for year: {year}")

Scraping bracket for year: 2005
Scraping bracket from: https://www.espn.com/mens-college-basketball/bracket/_/season/2005
Found 62 bracket cells
Found champion cell: True
Found 63 game URLs
Scraping box score from: https://www.espn.com/mens-college-basketball/boxscore/_/gameId/254000047
Done scraping box score for https://www.espn.com/mens-college-basketball/boxscore/_/gameId/254000047
Scraping box score from: https://www.espn.com/mens-college-basketball/boxscore/_/gameId/254000048
Done scraping box score for https://www.espn.com/mens-college-basketball/boxscore/_/gameId/254000048
Scraping box score from: https://www.espn.com/mens-college-basketball/boxscore/_/gameId/254000050
Done scraping box score for https://www.espn.com/mens-college-basketball/boxscore/_/gameId/254000050
Scraping box score from: https://www.espn.com/mens-college-basketball/boxscore/_/gameId/254000051
Done scraping box score for https://www.espn.com/mens-college-basketball/boxscore/_/gameId/254000051
Scraping box s

KeyError: "None of [Index(['starters', 'PTS'], dtype='object', name=0)] are in the [columns]"

In [72]:
# combine all the dictionaries into one dataframe
bk_df = pd.DataFrame.from_dict({(i, j): bk_dicts[i][j] 
                                  for i in bk_dicts.keys() 
                                  for j in bk_dicts[i].keys()},
                                 orient='index').reset_index()
bk_df.head()

Unnamed: 0,level_0,level_1,team,seed,pts,pts_mult,year
0,2008,Sam AtupemS. Atupem,Mount St. Mary's Mountaineers,16,4,12,2008
1,2008,Markus MitchellM. Mitchell,Mount St. Mary's Mountaineers,16,7,21,2008
2,2008,Will HollandW. Holland,Mount St. Mary's Mountaineers,16,3,9,2008
3,2008,Jeremy GoodeJ. Goode,Mount St. Mary's Mountaineers,16,15,45,2008
4,2008,Chris VannC. Vann,Mount St. Mary's Mountaineers,16,16,48,2008


In [27]:
# Example usage
bracket_data = scrape_espn_bracket(espn_bracket_2024)
if bracket_data:
    print(f"Found {len(bracket_data)} regions")
    for region, matchups in bracket_data.items():
        print(f"{region}: {len(matchups)} matchups")

res = scrape_espn_box_score(translate_espn_game_url_to_boxscore_url('https://www.espn.com/mens-college-basketball/game/_/gameId/401638599/stetson-uconn'))

Scraping bracket from: https://www.espn.com/mens-college-basketball/bracket/_/season/2024
Found 66 bracket cells
Found champion cell: True
Found 66 game URLs
Found 66 regions


AttributeError: 'list' object has no attribute 'items'

In [6]:
df_espn = res['team1']
# set the column names to be the first row
df_espn.columns = df_espn.iloc[0]
df_espn = df_espn[['starters', 'PTS']]

# drop the rows that are nan or repeat the column names
df_espn = df_espn.dropna()
df_espn = df_espn[df_espn['starters'] != 'starters']
df_espn = df_espn[df_espn['starters'] != 'bench']
df_espn = df_espn[df_espn['starters'] != 'team']
df_espn

NameError: name 'res' is not defined