# In [2]:
from bs4 import BeautifulSoup
import requests
import os
import csv
from math import floor

# Make sure the output directory exists before any per-year CSV is written.
# os.makedirs also creates a missing parent "data" directory, does nothing when
# the directory is already there, and avoids shelling out to a
# platform-specific `mkdir` whose failure would go unnoticed.
os.makedirs("data/records", exist_ok=True)

# Build one pre-tournament records CSV per season from the saved bracket files.
for year in range(2003, 2025):
    # Skip 2020 and 2021, the two seasons most heavily impacted by COVID-19, for
    # stat reliability. 2005 is skipped because its data is missing from the
    # website (the GitHub repo has the CSV).
    if year in (2020, 2021, 2005):
        continue

    # Input and output paths for this season.
    html_file_path = f"data/brackets/{year}_bracket.html"
    games_path = f"data/brackets/{year}_bracket.csv"
    csv_file_path = f"data/records/{year}_records.csv"

    with open(html_file_path) as f:
        soup = BeautifulSoup(f, "html.parser")

    # The relevant values live in the `rel` attribute of the <td> cells; cells
    # without that attribute are dropped.
    rel_values = [cell.get("rel") for cell in soup.find_all(name="td")]
    rel_values = [value for value in rel_values if value is not None]

    # Every team contributes ten consecutive `rel` values. Position 2 is the
    # team name; positions 0, 3, 4, 6 and 7 are the seed, season wins, season
    # losses, conference wins and conference losses. The remaining positions
    # (1, 5, 8, 9) are ignored.
    fields_per_team = 10
    numeric_positions = (0, 3, 4, 6, 7)
    if len(rel_values) % fields_per_team:
        raise ValueError(
            f"{html_file_path}: expected a multiple of {fields_per_team} "
            f"'rel' values, found {len(rel_values)}"
        )

    teams = []
    for start in range(0, len(rel_values), fields_per_team):
        row = rel_values[start:start + fields_per_team]
        # Row layout: year, team, seed, wins, losses, cwins, closses.
        team = [year, row[2]]
        # Values are parsed as floats first and then floored to integers.
        team.extend(floor(float(row[pos])) for pos in numeric_positions)
        teams.append(team)

    # Tally the tournament wins and losses so they can be removed from the
    # season totals to prevent data leakage. newline="" is the csv module's
    # documented way to open files it reads.
    with open(games_path, newline="") as f:
        games = list(csv.reader(f))

    # The championship game is not recorded in the teams' records for some
    # reason, so the final row is left out of the tally (this assumes the
    # championship is the last row of the file).
    games = games[:-1]

    # Map team name -> [tournament wins, tournament losses]; column 1 of a game
    # row is the winner and column 2 the loser.
    tournament_record = {}
    for game in games:
        winner, loser = game[1], game[2]
        tournament_record.setdefault(winner, [0, 0])[0] += 1
        tournament_record.setdefault(loser, [0, 0])[1] += 1

    # Remove the tournament results from the season totals to get accurate
    # pre-tournament values.
    for team in teams:
        name = team[1]
        if name not in tournament_record:
            # Still a KeyError, as before, but one that says which team and
            # file disagree (e.g. a spelling mismatch between HTML and CSV).
            raise KeyError(
                f"{year}: team {name!r} from {html_file_path} "
                f"has no games in {games_path}"
            )
        tournament_wins, tournament_losses = tournament_record[name]
        team[3] -= tournament_wins
        team[4] -= tournament_losses

    # Save everything to a CSV file. newline="" stops the csv writer's own
    # "\r\n" row endings from being translated a second time on Windows.
    with open(csv_file_path, "w", newline="") as f:
        writer = csv.writer(f)
        # Column headers.
        writer.writerow(["year", "team", "seed", "wins", "losses", "cwins", "closses"])
        writer.writerows(teams)