In [89]:
import re
import os
from bs4 import BeautifulSoup as bs
from bs4 import Comment
import pandas as pd

In [3]:
# Seasons 1985 through 2025 inclusive, with 2020 left out
# (presumably because no tournament data exists for that year -- confirm).
years = [season for season in range(1985, 2026) if season != 2020]

In [45]:
def soup_file(filename):
    """Read a saved HTML page from disk and parse it with BeautifulSoup.

    Args:
        filename: Path of the HTML file to read.

    Returns:
        The document parsed with the built-in ``html.parser`` backend.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    # Decode explicitly as UTF-8: without an encoding argument open() uses the
    # platform locale, so school names with non-ASCII characters could be
    # garbled (or fail to decode) depending on the machine.
    with open(filename, encoding='utf-8') as f:
        page = f.read()
    return bs(page, 'html.parser')

## Tournament Results

In [158]:
def parse_tournament(year):
    """Collect the first-round field of one saved tournament bracket page.

    Reads ``tournament/{year}.html``. The ``div#brackets`` element must have
    exactly five direct child divs, taken in order as the east, midwest,
    south, west and national brackets (anything else raises ``ValueError``
    on unpacking).

    Args:
        year: Season used to build the file name; copied into every record.

    Returns:
        A list of dicts with keys ``rank`` (text of the team's first
        ``<span>``, kept as a string), ``school`` (text of the team's first
        link), ``year`` and ``champion`` (True when the school name equals
        the champion's name). Regions appear in the order east, midwest,
        south, west.
    """
    page = soup_file(f"tournament/{year}.html")
    bracket_divs = page.find("div", {"id": "brackets"}).find_all("div", recursive=False)
    east, midwest, south, west, national = bracket_divs

    # The champion is the first link inside the last round of the national bracket.
    final_round = national.find_all("div", {"class": "round"})[-1]
    champ = final_round.find("a").get_text()

    def team_entry(team):
        # One record per team div: seed text, school name, champion flag.
        seed = team.find("span").get_text()
        name = team.find_all("a")[0].get_text()
        return {
            'rank': seed,
            'school': name,
            'year': year,
            'champion': name == champ
        }

    def parse_division(bracket):
        # Only the first round lists every team of the region.
        opening_round = bracket.find_all("div", {"class": "round"})[0]
        # find_all("div") is recursive; any div that has no nested divs simply
        # contributes no entries to the inner loop.
        return [
            team_entry(team)
            for game in opening_round.find_all("div")
            for team in game.find_all("div")
        ]

    return [
        entry
        for region in (east, midwest, south, west)
        for entry in parse_division(region)
    ]

In [163]:
# Parse every tournament page on a best-effort basis: a year whose page fails
# to parse is reported and skipped so the remaining years are still collected.
schools = []
for year in years:
    try:
        schools.extend(parse_tournament(year))
    except Exception as err:
        # Catch Exception rather than using a bare except so that
        # KeyboardInterrupt / SystemExit still stop the cell, and show the
        # reason next to the year instead of hiding it.
        print(year, repr(err))

df = pd.DataFrame(schools)
df.head()

Unnamed: 0,rank,school,year,champion
0,1,Georgetown,1985,False
1,16,Lehigh,1985,False
2,8,Temple,1985,False
3,9,Virginia Tech,1985,False
4,5,SMU,1985,False


In [166]:
# `df` is reassigned by later cells, so run this right after the cell that
# builds the tournament frame. Comma is pandas' default separator.
df.to_csv('data/tournaments.csv', index=False, encoding='utf-8')

## Season Results

In [105]:
def _int_or_none(text):
    """Return ``int(text)``, or None when the cell text is empty."""
    return int(text) if text else None


def _float_or_none(text):
    """Return ``float(text)``, or None when the cell text is empty."""
    return float(text) if text else None


def parse_season(year):
    """Extract per-school standings rows from one saved season page.

    Reads ``season/{year}.html`` and looks at the direct child divs of
    ``div#content`` whose id contains ``all_standings``. The table markup is
    taken from the HTML comments inside those divs; each comment is parsed
    separately and its ``<tr>`` rows, minus the first two, become records.

    Args:
        year: Season used to build the file name; copied into every record.

    Returns:
        A list of dicts, one per table row. Numeric cells are converted to
        ``int``/``float`` and empty cells become None.
        ``conference_tournament_champion`` is True when cell 21 contains the
        text ``'Conf. Tournament Champion'``.

    Raises:
        IndexError: If a row has fewer than 22 cells.
        ValueError: If a numeric cell holds non-numeric text.
    """
    schools = []
    soup = soup_file(f"season/{year}.html")
    content = soup.find("div", {"id": "content"})
    for section in content.find_all("div", recursive=False):
        section_id = section.get('id')
        if not section_id or 'all_standings' not in section_id:
            continue

        for child in section.descendants:
            if not isinstance(child, Comment):
                continue
            # Name the parser explicitly: the generic "html" feature lets
            # BeautifulSoup pick whichever parser happens to be installed,
            # which makes results environment-dependent and inconsistent
            # with soup_file().
            table = bs(child.string, 'html.parser')
            for tr in table.find_all("tr")[2:]:
                info = [cell.getText() for cell in tr.find_all(recursive=False)]

                # Cell indices 6, 10, 13, 16 and 20 are never read
                # (presumably spacer columns -- confirm against the page).
                schools.append({
                    'year': year,
                    'conference': info[2],
                    'rank': _int_or_none(info[0]),
                    'school': info[1],
                    'overall_wins': _int_or_none(info[3]),
                    'overall_losses': _int_or_none(info[4]),
                    'overall_win_loss_percentage': _float_or_none(info[5]),
                    'conference_wins': _int_or_none(info[7]),
                    'conference_losses': _int_or_none(info[8]),
                    'conference_win_loss_percentage': _float_or_none(info[9]),
                    'own_points_per_game': _float_or_none(info[11]),
                    'opp_points_per_game': _float_or_none(info[12]),
                    'strength_of_schedule': _float_or_none(info[14]),
                    'simple_rating_system': _float_or_none(info[15]),
                    'ap_pre': _int_or_none(info[17]),
                    'ap_high': _int_or_none(info[18]),
                    'ap_final': _int_or_none(info[19]),
                    # Cell 21 is a ';'-separated notes field; the phrase has
                    # no ';' in it, so a plain substring test is equivalent
                    # to checking each note.
                    'conference_tournament_champion': 'Conf. Tournament Champion' in info[21]
                })
    return schools

In [107]:
# Concatenate the standings records of every season in `years`, in order,
# then preview the combined frame.
seasons = []
for year in years:
    seasons += parse_season(year)

df = pd.DataFrame(data=seasons)
df.head(5)

Unnamed: 0,year,conference,rank,school,overall_wins,overall_losses,overall_win_loss_percentage,conference_wins,conference_losses,conference_win_loss_percentage,own_points_per_game,opp_points_per_game,strength_of_schedule,simple_rating_system,ap_pre,ap_high,ap_final,conference_tournament_champion
0,1985,ACC,1.0,Georgia Tech,27.0,8.0,0.771,9.0,5.0,0.643,69.9,60.9,16.45,7.42,20.0,6.0,6.0,True
1,1985,ACC,2.0,North Carolina,27.0,9.0,0.75,9.0,5.0,0.643,73.1,65.9,15.33,8.13,,5.0,7.0,False
2,1985,ACC,3.0,NC State,23.0,10.0,0.697,9.0,5.0,0.643,73.3,65.0,15.02,6.78,13.0,9.0,16.0,False
3,1985,ACC,4.0,Duke,23.0,8.0,0.742,8.0,6.0,0.571,78.9,67.9,18.38,7.38,6.0,2.0,10.0,False
4,1985,ACC,5.0,Maryland,25.0,12.0,0.676,8.0,6.0,0.571,70.8,65.3,13.92,8.47,,17.0,,False


In [114]:
# `df` is reassigned by other cells, so run this right after the cell that
# builds the seasons frame. Comma is pandas' default separator.
df.to_csv('data/seasons.csv', index=False, encoding='utf-8')

## Season Leaders

In [168]:
# The 2025 leaders are entered by hand (see `leaders_2025` in `parse_leaders`);
# source: https://www.sports-reference.com/cbb/seasons/men/2025-leaders.html

In [220]:
def _season_end_year(season):
    """Convert a season label such as '1999-00' to its ending year, '2000'.

    A label without a hyphen (e.g. '2025') is returned unchanged.
    """
    season = season.strip()
    if '-' not in season:
        return season
    # Derive the ending year from the 4-digit start year. Gluing the century
    # onto the 2-digit suffix breaks at century boundaries
    # ('1999-00' would become '1900').
    return str(int(season.split('-')[0]) + 1)


def parse_leaders(lt):
    """Build ``data/{lt}.csv`` with the leading school for each season.

    Reads ``leaders/{lt}.html``. In every table row the last two ``<td>``
    cells are used: the link text of the first is the season label and the
    link text of the second is the school. Rows with fewer than two ``<td>``
    cells are skipped. A hand-entered 2025 row is appended unless the page
    already provided one, and the rows are written sorted by year.

    Args:
        lt: Leader type; one of the keys of ``leaders_2025`` below. It names
            both the input page and the output CSV.

    Returns:
        None. The result is written to ``data/{lt}.csv``.

    Raises:
        KeyError: If ``lt`` has no hand-entered 2025 leader and the page
            contains no 2025 row.
    """
    leaders = []
    soup = soup_file(f'leaders/{lt}.html')
    for row in soup.find_all('tr'):
        info = row.find_all('td')[-2:]
        if len(info) < 2:
            # Rows without both a season cell and a school cell carry no data.
            continue
        season = info[0].find('a').get_text()
        school = info[1].find('a').get_text()
        leaders.append({
            'year': _season_end_year(season),
            'school': school
        })

    # insert 2025 data
    leaders_2025 = {
        'pts': 'Villanova',
        'pts-per-g': 'Villanova',
        'trb': 'Northern Arizona',
        'trb-per-g': 'Northern Arizona',
        'ast': 'Gonzaga',
        'ast-per-g': 'Gonzaga',
        'stl': 'UC-San Diego',
        'stl-per-g': 'Maine',
        'blk': 'Queens (NC)',
        'blk-per-g': 'Queens (NC)'
    }

    # Only add the manual row when the page has no 2025 entry, so a refreshed
    # page does not produce a duplicate year.
    if not any(entry['year'] == '2025' for entry in leaders):
        leaders.append({
            'year': '2025',
            'school': leaders_2025[lt]
        })

    # Years are kept as 4-digit strings, so string sorting is chronological.
    df = pd.DataFrame(leaders).sort_values(by='year').reset_index(drop=True)
    df.to_csv(f'data/{lt}.csv', sep=',', index=False, encoding='utf-8')

In [221]:
# Season-total categories:
# points, total rebounds, assists, steals, blocks.
leader_types = ['pts', 'trb', 'ast', 'stl', 'blk']
# Followed by the per-game variant of each category, in the same order.
leader_types += [f'{stat}-per-g' for stat in leader_types]

In [222]:
# Write one data/<leader type>.csv per category.
for leader_type in leader_types:
    parse_leaders(leader_type)

## Player of the Year

In [115]:
# Build one (year, player, school) row per table row of the saved
# Player of the Year page.
soup = soup_file('other/poy.html')
poy = {
    'year': [],
    'player': [],
    'school': []
}
for row in soup.find('tbody').find_all('tr'):
    # get the year; rows without a linked season header are skipped
    season_part = row.find('th', {'data-stat': 'season'})
    if not season_part:
        continue
    school_year_part = season_part.find('a')
    if not school_year_part:
        continue
    school_year = school_year_part.get_text().strip()
    # The season label looks like '2024-25'. Take the ending year from the
    # 4-digit start year; gluing the century onto the 2-digit suffix would
    # turn '1999-00' into '1900'. A label without a hyphen is kept unchanged.
    if '-' in school_year:
        year = str(int(school_year.split('-')[0]) + 1)
    else:
        year = school_year

    # get the player
    player_part = row.find('td', {'data-stat': 'player'})
    player = player_part.find('a').getText()

    # get the school
    school_part = row.find('td', {'data-stat': 'school_name'})
    school = school_part.find('a').getText()

    # Append only once all three values are known so the lists stay aligned.
    poy['year'].append(year)
    poy['player'].append(player)
    poy['school'].append(school)

df = pd.DataFrame(poy)
df.head()

Unnamed: 0,year,player,school
0,2025,Cooper Flagg,Duke
1,2024,Zach Edey,Purdue
2,2023,Zach Edey,Purdue
3,2022,Oscar Tshiebwe,Kentucky
4,2021,Luka Garza,Iowa


In [116]:
# `df` is reassigned by other cells, so run this right after the cell that
# builds the Player of the Year frame. Comma is pandas' default separator.
df.to_csv('data/poy.csv', index=False, encoding='utf-8')