Data for both season standings and NCAA Tournament results exist from 1939 onwards. However, the 2019-20 season should be ignored because the 2020 tournament was cancelled.

I intend to use data from 1985 onwards because that's when the modern 64-team format began.

In [13]:
import os
import time

import requests
from bs4 import BeautifulSoup as bs

In [20]:
import pandas as pd

### Get HTML Data of Tournament Results 

In [134]:
# Tournament years to process: 1985 through 2023, leaving out 2020.
years = [season for season in range(1985, 2024) if season != 2020]

In [132]:
url_tournament = "https://www.sports-reference.com/cbb/postseason/men/{}-ncaa.html"

# Pause between requests. NOTE(review): Sports Reference's bot-traffic policy
# is believed to cap clients at roughly 20 requests per minute -- confirm the
# current limit before lowering this.
REQUEST_DELAY_SECONDS = 4

# open() does not create missing directories.
os.makedirs("tournament", exist_ok=True)

for year in years:
    path = f"tournament/{year}.html"

    # Pages fetched on an earlier run are reused, so re-running the notebook
    # does not hit the site again. Delete a file to force a fresh download.
    if os.path.exists(path):
        continue

    url = url_tournament.format(year)

    response = requests.get(url, timeout=30)
    # Stop here on 4xx/5xx instead of saving an error page as if it were a
    # bracket; such a file would only fail later, during parsing.
    response.raise_for_status()

    with open(path, "w") as f:
        f.write(response.text)

    time.sleep(REQUEST_DELAY_SECONDS)

### Parse HTML Data of Tournament Results

In [119]:
def parse_tournament_html(year):
    """Load the saved page for ``year`` and locate its bracket container.

    Reads ``tournament/{year}.html`` and searches it for the
    ``<div id="brackets">`` element.

    Returns:
        A ``(year, ids, container)`` tuple, where ``ids`` holds the ``id``
        attribute of every tag nested inside the container that has one
        (the id ``"bracket"`` is excluded) and ``container`` is the parsed
        element itself. Returns ``None`` when the page has no such container.
    """
    with open(f"tournament/{year}.html") as html_file:
        markup = html_file.read()

    container = bs(markup, 'html.parser').find("div", {"id": "brackets"})
    if not container:
        return None

    # Walk every nested tag and keep the ids that name a bracket section.
    section_ids = [
        tag.get("id")
        for tag in container.find_all()
        if tag.get("id") is not None and tag.get("id") != "bracket"
    ]
    return year, section_ids, container

In [208]:
def _clean_location(caption):
    """Return the venue from a game caption such as ``"at Seattle, WA"``."""
    # Drop only a leading "at" word. Removing every "at" substring would also
    # mangle venues that contain it (Seattle -> Setle, Cincinnati -> Cincinni).
    words = caption.split(None, 1)
    if words and words[0] == "at":
        return words[1].strip() if len(words) > 1 else ""
    return caption.strip()


def parse_brackets(year):
    """Extract one record per played game from the saved bracket page of ``year``.

    Args:
        year: Tournament year; forwarded to ``parse_tournament_html``.

    Returns:
        A list of dicts with the keys ``year``, ``bracket``, ``round``,
        ``location``, ``team_one_rank``, ``team_one``, ``team_one_score``,
        ``team_two_rank``, ``team_two``, ``team_two_score`` and
        ``team_one_wins``. Ranks, names and scores are the text scraped from
        the page (strings); ``team_one_wins`` is a bool.

    Raises:
        ValueError: if ``parse_tournament_html`` found no bracket markup.
        IndexError: if a bracket has more round columns than known round names.
    """
    parsed = parse_tournament_html(year)
    if parsed is None:
        # Without this check the failure shows up as an opaque
        # "'NoneType' object is not subscriptable".
        raise ValueError(f"no bracket markup found for {year}")
    _, bracket_ids, bracket_root = parsed

    data = []

    for b in bracket_ids:
        rounds = bracket_root.find("div", {"id": b}).find_all("div", {"class": "round"})

        if b != "national":
            round_names = ["First Round", "Second Round", "Sweet Sixteen", "Elite Eight"]
        else:
            round_names = ["Final Four", "National Championship"]

        # The last "round" column is skipped -- presumably it only shows the
        # team that advanced rather than a game (TODO confirm against the page).
        for round_index, r in enumerate(rounds[:-1]):
            round_name = round_names[round_index]

            for g in r.find_all("div", recursive=False):
                g_items = g.find_all(recursive=False)

                i1 = g_items[0].find_all(recursive=False)
                if len(i1) < 3:
                    # Oregon vs VCU in 2021 was cancelled, so no score is listed
                    continue

                i2 = g_items[1].find_all(recursive=False)
                if len(i2) < 3:
                    # Same situation, but with the score missing on team two.
                    continue

                # Membership test so an extra CSS class next to "winner" does
                # not turn a win into a loss.
                team_one_wins = "winner" in (g_items[0].get("class") or [])

                data.append({
                    'year': year,
                    'bracket': b.title(),
                    'round': round_name,
                    'location': _clean_location(g_items[-1].getText()),
                    'team_one_rank': i1[0].getText(),
                    'team_one': i1[1].getText(),
                    'team_one_score': i1[2].getText(),
                    'team_two_rank': i2[0].getText(),
                    'team_two': i2[1].getText(),
                    'team_two_score': i2[2].getText(),
                    'team_one_wins': team_one_wins
                })
    return data

In [210]:
# Gather the game records of every year; a year that cannot be parsed is
# reported together with the reason and then skipped.
data = []
failed_years = []
for year in years:
    try:
        data.extend(parse_brackets(year))
    except Exception as exc:
        # `Exception` rather than a bare `except:` so Ctrl-C / kernel interrupt
        # still stops the loop, and the error is shown instead of hidden.
        failed_years.append(year)
        print(f"{year}: {type(exc).__name__}: {exc}")

In [211]:
# Turn the list of per-game dicts into a table (one row per game).
df = pd.DataFrame(data=data)
# Final expression of the cell, so the notebook renders the frame.
df

Unnamed: 0,year,bracket,round,location,team_one_rank,team_one,team_one_score,team_two_rank,team_two,team_two_score,team_one_wins
0,1985,East,First Round,"Hartford, CT",1,Georgetown,68,16,Lehigh,43,True
1,1985,East,First Round,"Hartford, CT",8,Temple,60,9,Virginia Tech,57,True
2,1985,East,First Round,"Hartford, CT",5,SMU,85,12,Old Dominion,68,True
3,1985,East,First Round,"Hartford, CT",4,Loyola (IL),59,13,Iona,58,True
4,1985,East,First Round,"Atlanta, GA",6,Georgia,67,11,Wichita State,59,True
...,...,...,...,...,...,...,...,...,...,...,...
2388,2023,West,Sweet Sixteen,"Las Vegas, NV",3,Gonzaga,79,2,UCLA,76,True
2389,2023,West,Elite Eight,"Las Vegas, NV",4,UConn,82,3,Gonzaga,54,True
2390,2023,National,Final Four,"Houston, TX",9,Florida Atlantic,71,5,San Diego State,72,False
2391,2023,National,Final Four,"Houston, TX",5,Miami (FL),59,4,UConn,72,False


In [171]:
# to_csv does not create missing directories, so make sure the folder exists.
os.makedirs('data', exist_ok=True)
# index=False keeps the row index out of the file.
df.to_csv('data/tournaments.csv', sep=',', index=False, encoding='utf-8')