Data for both season standings and the NCAA Tournament exist from 1939 onwards. However, the 2019-20 season should be ignored because the 2020 tournament was cancelled.

I intend to use data from 1985 onwards, because that is when the modern 64-team format began.

In [277]:
import requests
import os
from bs4 import BeautifulSoup as bs
from bs4 import Comment

In [20]:
import pandas as pd

In [293]:
import re

### Get HTML Data of Tournament Results 

In [134]:
# Tournament seasons to process: 1985 through 2023, skipping 2020.
years = [season for season in range(1985, 2024) if season != 2020]

In [132]:
import time

# URL template for one year's NCAA men's tournament bracket page.
url_tournament = "https://www.sports-reference.com/cbb/postseason/men/{}-ncaa.html"

# Make sure the output directory exists before the first write.
os.makedirs("tournament", exist_ok=True)

for year in years:
    url = url_tournament.format(year)

    data = requests.get(url, timeout=30)
    # Fail loudly on 429/404/5xx rather than saving an error page under
    # "{year}.html" that the parsing step would later choke on.
    data.raise_for_status()

    with open(f"tournament/{year}.html", "w+") as f:
        f.write(data.text)

    # Pause between requests so the scrape stays under the site's rate limit.
    # NOTE(review): 4 s assumes a limit of roughly 20 requests/minute -
    # confirm against the site's current bot-traffic policy.
    time.sleep(4)

### Parse HTML Data of Tournament Results

In [453]:
def parse_tournament_html(year):
    """Load the saved bracket page for ``year`` and locate its brackets.

    Reads ``tournament/{year}.html`` and looks up the ``<div id="brackets">``
    container.

    Returns:
        A ``(year, bracket_ids, brackets_div)`` tuple, where ``bracket_ids``
        lists the ``id`` of every descendant element that has one (except the
        id ``"bracket"``), in document order. Returns ``None`` when the page
        has no ``brackets`` container.
    """
    with open(f"tournament/{year}.html") as html_file:
        soup = bs(html_file.read(), 'html.parser')

    brackets = soup.find("div", {"id": "brackets"})
    if not brackets:
        return None

    # Collect ids from all descendants, dropping elements without an id and
    # the inner "bracket" wrappers.
    descendant_ids = (node.get("id") for node in brackets.find_all())
    children = [
        node_id
        for node_id in descendant_ids
        if node_id is not None and node_id != "bracket"
    ]
    return year, children, brackets

In [454]:
def parse_brackets(year):
    """Extract one row per played game from the saved bracket page of ``year``.

    Args:
        year: Tournament year; forwarded to ``parse_tournament_html``.

    Returns:
        A list of dicts with the keys ``year``, ``bracket``, ``round``,
        ``location``, ``team_one_rank``, ``team_one``, ``team_one_score``,
        ``team_two_rank``, ``team_two``, ``team_two_score`` (all scraped text,
        left as strings) and the boolean ``team_one_wins``.

    Raises:
        ValueError: If the page for ``year`` has no brackets container.
    """
    parsed = parse_tournament_html(year)
    if parsed is None:
        # Surface the real problem instead of a "NoneType is not
        # subscriptable" TypeError further down.
        raise ValueError(f"no bracket data found for {year}")
    _, bracket_ids, brackets_div = parsed

    data = []

    for b in bracket_ids:
        rounds = brackets_div.find("div", {"id": b}).find_all("div", {"class": "round"})

        if b != "national":
            round_names = ["First Round", "Second Round", "Sweet Sixteen", "Elite Eight"]
        else:
            round_names = ["Final Four", "National Championship"]

        # The last "round" column is skipped.
        # NOTE(review): presumably it only shows the advancing team rather
        # than a game - confirm against the page markup.
        for round_index, r in enumerate(rounds[:-1]):
            games = r.find_all("div", recursive=False)
            for g in games:
                g_items = g.find_all(recursive=False)

                t1 = g_items[0]
                i1 = t1.find_all(recursive=False)

                if len(i1) == 2:
                    # Only two child elements instead of three: the game was
                    # not played (Oregon vs VCU in 2021 was cancelled).
                    continue

                t2 = g_items[1]
                i2 = t2.find_all(recursive=False)

                # Strip only the leading "at " marker. A blanket
                # replace('at', '') would also eat the letters inside names
                # such as "Seattle" or "Baton Rouge".
                location = re.sub(r"^\s*at\s+", "", g_items[-1].getText()).strip()

                data.append({
                    'year': year,
                    'bracket': b.title(),
                    'round': round_names[round_index],
                    'location': location,
                    'team_one_rank': i1[0].getText(),
                    'team_one': i1[1].getText(),
                    'team_one_score': i1[2].getText(),
                    'team_two_rank': i2[0].getText(),
                    'team_two': i2[1].getText(),
                    'team_two_score': i2[2].getText(),
                    'team_one_wins': t1.get("class") == ["winner"],
                })
    return data

In [None]:
# Parse every season's bracket page into one flat list of game rows.
bracket_data = []
for year in years:
    try:
        bracket_data.extend(parse_brackets(year))
    except Exception as exc:
        # Best-effort: report the failing year together with the reason and
        # keep going. Catching Exception (not a bare except) lets Ctrl-C /
        # kernel interrupts still stop the loop.
        print(year, repr(exc))

In [None]:
# Build the tournament table from the parsed game rows. `data` would be the
# last HTTP response left over from a download cell, not the parsed games.
df = pd.DataFrame(bracket_data)
df

In [212]:
# Persist the tournament games as a comma-separated UTF-8 file without the
# DataFrame index column.
df.to_csv(
    'data/tournaments.csv',
    encoding='utf-8',
    index=False,
    sep=',',
)

### Get Data of Season Standings

In [230]:
# Display the list of seasons that will be processed.
print(years)

[1985, 1986, 1987, 1988, 1989, 1990, 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019, 2021, 2022, 2023]


In [235]:
import time

# URL template for one season's conference standings page.
url_season = "https://www.sports-reference.com/cbb/seasons/men/{}-standings.html"

# Make sure the output directory exists before the first write.
os.makedirs("seasons", exist_ok=True)

# Download exactly the seasons that the parsing step iterates over (`years`).
# A hard-coded range(2015, 2024) would leave 1985-2014 without files and
# fetch the unused 2020 season.
for year in years:
    url = url_season.format(year)

    data = requests.get(url, timeout=30)
    # Fail loudly on 429/404/5xx rather than saving an error page under
    # "{year}.html" that the parsing step would later choke on.
    data.raise_for_status()

    with open(f"seasons/{year}.html", "w+") as f:
        f.write(data.text)

    # Pause between requests so the scrape stays under the site's rate limit.
    # NOTE(review): 4 s assumes a limit of roughly 20 requests/minute -
    # confirm against the site's current bot-traffic policy.
    time.sleep(4)

### Parse HTML Data of Season Standings

In [424]:
def _parse_optional(text, cast):
    """Return ``cast(text)``, or ``None`` when the table cell is empty."""
    return cast(text) if text else None


def parse_standings(year):
    """Extract one row per team from the saved standings page of ``year``.

    Reads ``seasons/{year}.html``, walks every direct child ``<div>`` of the
    ``content`` container whose id contains ``all_standings``, and parses the
    tables that are embedded in HTML comments inside those divs.

    Args:
        year: Season year used to locate the file and tag every row.

    Returns:
        A list of dicts, one per table row, holding the conference, rank,
        team, overall and conference records, points per game, SOS, SRS,
        AP poll ranks and a ``conference_tournament_champion`` flag. Empty
        cells become ``None``.
    """
    data = []
    with open(f"seasons/{year}.html") as f:
        page = f.read()
    soup = bs(page, 'html.parser')
    conferences = soup.find("div", {"id": "content"})

    for c in conferences.find_all("div", recursive=False):
        div_id = c.get('id')
        if not div_id or 'all_standings' not in div_id:
            continue

        for child in c.descendants:
            if not isinstance(child, Comment):
                continue

            # The table markup sits inside an HTML comment, so the comment
            # text has to be parsed as a document of its own. The file
            # imports BeautifulSoup under the alias `bs`.
            parsed_child = bs(child.string, "html")

            # Skip the first two rows of the comment's markup.
            # NOTE(review): presumably the two header rows - confirm.
            for t in parsed_child.find_all("tr")[2:]:
                info = [h.getText() for h in t.find_all(recursive=False)]

                # Column 21 holds free-text notes separated by ";". The
                # marker contains no ";", so a substring test on the whole
                # cell matches exactly what a per-note scan would.
                conference_tournament_champion = 'Conf. Tournament Champion' in info[21]

                # Indexes 6, 10, 13, 16 and 20 are not read.
                # NOTE(review): presumably spacer/unused columns - confirm.
                data.append({
                    'year': year,
                    'conference': info[2],
                    'rank': _parse_optional(info[0], int),
                    'team': info[1],
                    'overall_wins': _parse_optional(info[3], int),
                    'overall_losses': _parse_optional(info[4], int),
                    'overall_win_loss_percentage': _parse_optional(info[5], float),
                    'conference_wins': _parse_optional(info[7], int),
                    'conference_losses': _parse_optional(info[8], int),
                    'conference_win_loss_percentage': _parse_optional(info[9], float),
                    'own_points_per_game': _parse_optional(info[11], float),
                    'opp_points_per_game': _parse_optional(info[12], float),
                    'strength_of_schedule': _parse_optional(info[14], float),
                    'simple_rating_system': _parse_optional(info[15], float),
                    'ap_pre': _parse_optional(info[17], int),
                    'ap_high': _parse_optional(info[18], int),
                    'ap_final': _parse_optional(info[19], int),
                    'conference_tournament_champion': conference_tournament_champion,
                })
    return data

In [425]:
# Gather the standings rows of every season into one flat list.
standings_data = []
for year in years:
    standings_data += parse_standings(year)

# One row per team-season; the bare name displays the frame in the notebook.
df2 = pd.DataFrame(data=standings_data)
df2

Unnamed: 0,year,conference,rank,team,overall_wins,overall_losses,overall_win_loss_percentage,conference_wins,conference_losses,conference_win_loss_percentage,own_points_per_game,opp_points_per_game,strength_of_schedule,simple_rating_system,ap_pre,ap_high,ap_final,conference_tournament_champion
0,1985,ACC,1.0,Georgia Tech,27.0,8.0,0.771,9.0,5.0,0.643,69.9,60.9,16.45,7.42,20.0,6.0,6.0,True
1,1985,ACC,2.0,North Carolina,27.0,9.0,0.750,9.0,5.0,0.643,73.1,65.9,15.33,8.13,,5.0,7.0,False
2,1985,ACC,3.0,NC State,23.0,10.0,0.697,9.0,5.0,0.643,73.3,65.0,15.02,6.78,13.0,9.0,16.0,False
3,1985,ACC,4.0,Duke,23.0,8.0,0.742,8.0,6.0,0.571,78.9,67.9,18.38,7.38,6.0,2.0,10.0,False
4,1985,ACC,5.0,Maryland,25.0,12.0,0.676,8.0,6.0,0.571,70.8,65.3,13.92,8.47,,17.0,,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12453,2023,WCC,6.0,Brigham Young,19.0,15.0,0.559,7.0,9.0,0.438,74.1,70.1,9.43,5.40,,,,False
12454,2023,WCC,7.0,Pacific,15.0,18.0,0.455,7.0,9.0,0.438,75.8,78.4,-0.56,2.05,,,,False
12455,2023,WCC,8.0,Portland,14.0,19.0,0.424,5.0,11.0,0.313,76.7,79.1,1.68,4.13,,,,False
12456,2023,WCC,9.0,San Diego,11.0,20.0,0.355,4.0,12.0,0.250,76.3,81.6,-2.06,3.28,,,,False


In [452]:
# Persist the season standings as a comma-separated UTF-8 file without the
# DataFrame index column.
df2.to_csv(
    'data/standings.csv',
    encoding='utf-8',
    index=False,
    sep=',',
)