In [1]:
import re
import requests
from bs4 import BeautifulSoup
import pandas as pd
from io import StringIO

def scrape_espn_bracket(url):
    """
    Scrape an ESPN tournament bracket page and collect the game links on it.

    Args:
        url (str): The URL of the ESPN bracket page

    Returns:
        list[str] | None: The ``href`` of every ``BracketCell`` element that has
        one, in the order ``find_all`` returned them, followed by the href of the
        ``BracketCellChamp`` element when that element exists and has one.
        ``None`` if the page could not be retrieved.
    """
    print(f"Scraping bracket from: {url}")

    # Send request with headers to mimic a browser
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }

    # A timeout keeps a stalled connection from hanging the notebook forever;
    # network failures are reported the same way as a bad status code.
    try:
        response = requests.get(url, headers=headers, timeout=30)
    except requests.RequestException as exc:
        print(f"Failed to retrieve page: {exc}")
        return None

    if response.status_code != 200:
        print(f"Failed to retrieve page: {response.status_code}")
        return None

    soup = BeautifulSoup(response.text, 'html.parser')

    # Find all items with the class 'BracketCell'
    bracket_cells = soup.find_all(class_='BracketCell')
    champ_cell = soup.find(class_='BracketCellChamp')
    print(f"Found {len(bracket_cells)} bracket cells")
    print(f"Found champion cell: {champ_cell is not None}")

    # Some hacky checking to remove first four games.
    # NOTE: the actual trimming (bracket_cells[:-4]) is currently disabled, so
    # this branch only logs and every cell is kept.
    if len(bracket_cells) > 62:
        print("Removing first four games")
        print(f"Remaining bracket cells: {len(bracket_cells)}")

    # Only add the champion cell when the page actually has one; appending None
    # would make the href lookup below fail with AttributeError.
    if champ_cell is not None:
        bracket_cells.append(champ_cell)

    # get a list of game urls (href attributes of the bracket cells)
    hrefs = (cell.get('href') for cell in bracket_cells)
    game_urls = [href for href in hrefs if href]
    print(f"Found {len(game_urls)} game URLs")
    return game_urls

def _split_team_name_and_seed(raw_text):
    """Split one team's Gamestrip text into (name, seed); seed is None if no digits are present."""
    # The name is the text with every digit run removed.
    name = re.sub(r'\d+', '', raw_text).strip()
    # The seed is the first one- or two-digit run found in the text.
    seed_match = re.search(r'\d{1,2}', raw_text)
    seed = seed_match.group(0) if seed_match else None
    return name, seed


def scrape_teams_from_espn_game_page(url):
    """
    Scrape team names and seeds from an ESPN game page.

    Args:
        url (str): The URL of the ESPN game page

    Returns:
        dict | None: ``{'team1', 'team1_seed', 'team2', 'team2_seed'}`` where the
        seeds are digit strings (or ``None`` when a team's text has no digits).
        ``None`` if the page could not be retrieved, if it does not contain
        exactly two ``Gamestrip__InfoWrapper`` elements, or if either team name
        is ``TBD``.
    """
    print(f"Scraping teams from: {url}")

    # Send request with headers to mimic a browser
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }

    # A timeout keeps a stalled connection from hanging the notebook forever;
    # network failures are reported the same way as a bad status code.
    try:
        response = requests.get(url, headers=headers, timeout=30)
    except requests.RequestException as exc:
        print(f"Failed to retrieve page: {exc}")
        return None

    if response.status_code != 200:
        print(f"Failed to retrieve page: {response.status_code}")
        return None

    soup = BeautifulSoup(response.text, 'html.parser')

    # Each team's name and seed live in an element with class 'Gamestrip__InfoWrapper'
    team_info = soup.find_all(class_='Gamestrip__InfoWrapper')
    if not team_info:
        print(f"No team info found on page")
        return None
    if len(team_info) != 2:
        print(f"Expected 2 teams, found {len(team_info)}")
        return None

    result = {}
    for position, wrapper in enumerate(team_info, start=1):
        name, seed = _split_team_name_and_seed(wrapper.get_text(strip=True))
        # Games whose participants are not decided yet are not usable matchups.
        if name == 'TBD':
            print(f"Team {position} name is TBD, skipping")
            return None
        result[f'team{position}'] = name
        result[f'team{position}_seed'] = seed

    return result


In [2]:
# Try the game-page scraper on a single game URL; the returned dict is shown as the cell output.
scrape_teams_from_espn_game_page("https://www.espn.com/mens-college-basketball/game/_/gameId/401745909")

Scraping teams from: https://www.espn.com/mens-college-basketball/game/_/gameId/401745909


{'team1': 'Xavier Musketeers',
 'team1_seed': '11',
 'team2': 'Texas Longhorns',
 'team2_seed': '11'}

In [3]:
from constants import TEAMS_ALIVE_MASK, ESPN_TO_PP_MAP

def matchups_for_round(round_num):
    """
    Build the per-region matchups for a tournament round from the live ESPN bracket.

    Every game linked from the ESPN bracket page is scraped; a game becomes a
    matchup only when both of its teams are flagged truthy in TEAMS_ALIVE_MASK.
    The surviving matchups are then dealt out, in scrape order, to the regions
    'south', 'east', 'west', 'midwest'.
    NOTE(review): this assumes the scrape order lines up with that region order
    -- confirm against the ESPN page layout.

    Args:
        round_num (int): The round number to get matchups for. The round is
            expected to contain ``64 / 2**round_num`` matchups, a quarter of them
            per region; from round 5 on that is fewer than one per region, so
            every region list comes back empty.

    Returns:
        dict: Maps each region name to a list of matchup dicts of the form
        ``{'team_1': {...}, 'team_2': {...}, 'location': None}``. All lists are
        empty when the bracket page could not be scraped.

    Raises:
        KeyError: If a scraped team name is missing from ESPN_TO_PP_MAP, or a
            mapped name is missing from TEAMS_ALIVE_MASK.
    """
    espn_base_url = 'https://www.espn.com'
    espn_bracket_url = 'https://www.espn.com/mens-college-basketball/bracket'
    regions = ['south', 'east', 'west', 'midwest']

    bracket_data = scrape_espn_bracket(espn_bracket_url)

    if not bracket_data:
        # Nothing to iterate over; hand back the usual structure with no matchups
        # instead of failing on the loop below.
        print(f"Failed to scrape bracket data")
        return {region: [] for region in regions}

    # Iterate through the games and scrape the participating teams
    matchups_list = []
    for game_url in bracket_data:
        # Bracket hrefs are site-relative; leave already-absolute links untouched.
        if game_url.startswith(('http://', 'https://')):
            full_url = game_url
        else:
            full_url = espn_base_url + game_url
        teams = scrape_teams_from_espn_game_page(full_url)
        if not teams:
            print(f"Failed to scrape teams from game page: {game_url}")
            continue

        # map team names to the names in TEAMS_ALIVE_MASK
        team1 = ESPN_TO_PP_MAP[teams['team1']]
        team2 = ESPN_TO_PP_MAP[teams['team2']]

        # Only games between two still-alive teams become matchups
        if TEAMS_ALIVE_MASK[team1] and TEAMS_ALIVE_MASK[team2]:
            print(f"Creating matchup for {team1} vs {team2}")
            matchups_list.append({
                'team_1': {'seed': teams['team1_seed'], 'name': team1, 'link': None},
                'team_2': {'seed': teams['team2_seed'], 'name': team2, 'link': None},
                'location': None
            })

    # Integer arithmetic: a fractional per-region count truncates to zero.
    num_matchups_for_round = int(64 // (2 ** round_num))
    num_matchups_per_region = num_matchups_for_round // 4
    if num_matchups_per_region == 0:
        print(f"Round {round_num} has fewer than one matchup per region; returning empty regions")

    matchups_dict = {}
    for region_index, region in enumerate(regions):
        start = region_index * num_matchups_per_region
        # Slicing clamps to the list length, so a short matchups_list simply
        # leaves later regions with fewer (or no) matchups.
        matchups_dict[region] = matchups_list[start:start + num_matchups_per_region]

    return matchups_dict

In [4]:
# Scrape the matchups for round 5 and print each one next to the region key it was filed under.
mu = matchups_for_round(5)
for region, matchups in mu.items():
    for matchup in matchups:
        print(f"Region: {region}, Matchup: {matchup['team_1']['name']} vs {matchup['team_2']['name']}")

Scraping bracket from: https://www.espn.com/mens-college-basketball/bracket
Found 66 bracket cells
Found champion cell: True
Removing first four games
Remaining bracket cells: 66
Found 66 game URLs
Scraping teams from: https://www.espn.com/mens-college-basketball/game/_/gameId/401745957/alabama-st-auburn
Scraping teams from: https://www.espn.com/mens-college-basketball/game/_/gameId/401745959/creighton-louisville
Scraping teams from: https://www.espn.com/mens-college-basketball/game/_/gameId/401745963/uc-san-diego-michigan
Scraping teams from: https://www.espn.com/mens-college-basketball/game/_/gameId/401745966/yale-texas-am
Scraping teams from: https://www.espn.com/mens-college-basketball/game/_/gameId/401745995/north-carolina-ole-miss
Scraping teams from: https://www.espn.com/mens-college-basketball/game/_/gameId/401745997/lipscomb-iowa-state
Scraping teams from: https://www.espn.com/mens-college-basketball/game/_/gameId/401745999/new-mexico-marquette
Scraping teams from: https://www

In [5]:
from load_team_data import parse_silver_ratings
from simulate_tournament import simulate_n_tournaments

# Inputs for the simulation run
year = 2025
silver_path = '../data/silver.csv'

# NOTE(review): with round_num=5, matchups_for_round computes 64 / 2**5 / 4 = 0.5
# matchups per region, which truncates to 0 -- confirm the intended round number.
matchups_dict = matchups_for_round(5)
silver_df = parse_silver_ratings(silver_path)
print(silver_df)

# Run the simulations; `probs` is turned into a table below and `sims` is kept
# for the per-team analysis in later cells.
probs, sims = simulate_n_tournaments(
    matchups_dict,
    {},
    silver_df,
    method='silver',
    N=10000,
    player_bk_used=False,
)

# Team/Probability table, highest probability first, with a fresh 0..n-1 index
df = (
    pd.DataFrame(probs.items(), columns=['Team', 'Probability'])
    .sort_values(by='Probability', ascending=False)
    .reset_index(drop=True)
)

Scraping bracket from: https://www.espn.com/mens-college-basketball/bracket
Found 66 bracket cells
Found champion cell: True
Removing first four games
Remaining bracket cells: 66
Found 66 game URLs
Scraping teams from: https://www.espn.com/mens-college-basketball/game/_/gameId/401745957/alabama-st-auburn
Scraping teams from: https://www.espn.com/mens-college-basketball/game/_/gameId/401745959/creighton-louisville
Scraping teams from: https://www.espn.com/mens-college-basketball/game/_/gameId/401745963/uc-san-diego-michigan
Scraping teams from: https://www.espn.com/mens-college-basketball/game/_/gameId/401745966/yale-texas-am
Scraping teams from: https://www.espn.com/mens-college-basketball/game/_/gameId/401745995/north-carolina-ole-miss
Scraping teams from: https://www.espn.com/mens-college-basketball/game/_/gameId/401745997/lipscomb-iowa-state
Scraping teams from: https://www.espn.com/mens-college-basketball/game/_/gameId/401745999/new-mexico-marquette
Scraping teams from: https://www

  0%|          | 0/10000 [00:00<?, ?it/s]


Exception: Error: Region simulation did not end with a single championship game.

In [6]:
# Number of entries in `sims`, the second value returned by simulate_n_tournaments.
len(sims)

10000

In [7]:
# Display the Team/Probability table (sorted by descending probability).
df

Unnamed: 0,Team,Probability
0,Duke (1),0.2107
1,Florida (1),0.1989
2,Houston (1),0.1922
3,Auburn (1),0.1565
4,Alabama (2),0.0707
5,Tennessee (2),0.0641
6,Mich. St. (2),0.0632
7,Texas Tech (3),0.0437


In [8]:
# Average number of games each team plays per simulation, taken over every entry in `sims`
team_to_games_sums = {}
for sim in sims:
    # games_played per team name, as seen in this one simulation
    team_to_games = {}
    for region_ref in [sim.south, sim.east, sim.west, sim.midwest]:
        # Walk the region's game tree from the regional final back through the
        # left/right child games, recording both teams of every game visited.
        backtrack_stack = [region_ref.championship]
        while backtrack_stack:
            current_node = backtrack_stack.pop()
            for team in (current_node.matchup.team1, current_node.matchup.team2):
                team_to_games[team.team_name] = team.games_played

            if current_node.left:
                backtrack_stack.append(current_node.left)
            if current_node.right:
                backtrack_stack.append(current_node.right)

    # Accumulate this simulation's counts into the running totals
    for team_name, games_played in team_to_games.items():
        team_to_games_sums[team_name] = team_to_games_sums.get(team_name, 0) + games_played

# convert to a DataFrame for better readability
team_games_df = pd.DataFrame(list(team_to_games_sums.items()), columns=['Team', 'Games Played'])
# Divide by the actual number of simulations rather than a hard-coded 10000, so
# the average stays correct if N changes in the simulation cell.
team_games_df['Games Played'] = team_games_df['Games Played'].astype(int) / len(sims)

In [9]:
# Order teams by average games played (highest first) and renumber the rows
team_games_df = (
    team_games_df
    .sort_values(by='Games Played', ascending=False)
    .reset_index(drop=True)
)
team_games_df

Unnamed: 0,Team,Games Played
0,Florida,2.0563
1,Duke,1.9809
2,Houston,1.9731
3,Auburn,1.9375
4,Mich. St.,1.5534
5,Alabama,1.537
6,Tennessee,1.509
7,Texas Tech,1.4528


In [10]:
# Write the Team / Games Played table to CSV, without the row index column.
team_games_df.to_csv('../data/team_games_played_r8_projection.csv', index=False)

In [10]:
# Print the `east` region of the first entry in `sims` via its print_region() method.
sims[0].east.print_region()

Duke (1)
├──[92mDuke (1)[0m
│   ├──[92mDuke (1)[0m
│   │   ├──Baylor (9)
│   │   └──[92mDuke (1)[0m
│   └──Oregon (5)
│       ├──[92mOregon (5)[0m
│       └──Arizona (4)
└──BYU (6)
    ├──[92mBYU (6)[0m
    │   ├──[92mBYU (6)[0m
    │   └──Wisconsin (3)
    └──Alabama (2)
        ├──Saint Mary's (7)
        └──[92mAlabama (2)[0m
