In [None]:
from bs4 import BeautifulSoup # type: ignore
import os # type: ignore
import pandas as pd # type: ignore
from playwright.async_api import async_playwright, TimeoutError as PlaywrightTimeout # type: ignore
import time # type: ignore

SEASONS = list(range(2010, 2026))
DATA_DIR = 'downloads'

async def get_html(url, selector, sleep=5, retries=5):
    """Load ``url`` in a headless WebKit browser and return the inner HTML of ``selector``.

    Args:
        url: Page to open.
        selector: CSS selector whose inner HTML is returned.
        sleep: Base delay in seconds; attempt ``i`` waits ``sleep * i`` seconds
            before the request is made (the first attempt waits too).
        retries: Maximum number of attempts when Playwright reports a timeout.

    Returns:
        The inner HTML string, or ``None`` when the page title does not look
        like a valid Sports-Reference college basketball page, or when every
        attempt timed out.
    """
    import asyncio  # local import so this function does not depend on the file's import block

    html = None
    for attempt in range(1, retries + 1):
        # Linearly growing pause before each request (presumably to stay under
        # the site's rate limit). Awaited rather than time.sleep() so the event
        # loop is not blocked while waiting.
        await asyncio.sleep(sleep * attempt)
        try:
            async with async_playwright() as p:
                browser = await p.webkit.launch()
                try:
                    page = await browser.new_page()
                    await page.goto(url)
                    # Read the title once instead of once per check.
                    title = str(await page.title())
                    if 'College Basketball at Sports-Reference.com' in title and '404 error' not in title:
                        print(title)
                    else:
                        # Not retried: a wrong or missing page will not fix itself.
                        print('INVALID PAGE')
                        return None
                    html = await page.inner_html(selector)
                finally:
                    # Release the browser process on every exit path.
                    await browser.close()
        except PlaywrightTimeout:
            print(f"Timeout error on {url}")
            continue
        else:
            break
    return html

async def scrape_teams():
    """Return the school slugs linked from the Sports-Reference schools index.

    Each slug is the part of a link's ``href`` between ``schools/`` and
    ``/men``. Links without an ``href`` or without ``schools`` in it are
    ignored.

    Returns:
        A list of slug strings, or an empty list when the index page could
        not be loaded.
    """
    url = 'https://www.sports-reference.com/cbb/schools/'
    html = await get_html(url, '#all_NCAAM_schools')
    if html is None:
        # get_html returns None for invalid pages and exhausted retries;
        # parsing None would fail with an unrelated error inside BeautifulSoup.
        print(f"Could not load team index from {url}")
        return []
    # Name the parser explicitly so the result does not depend on which
    # optional parsers happen to be installed.
    soup = BeautifulSoup(html, 'html.parser')
    hrefs = (link.get('href') for link in soup.find_all('a'))
    return [
        href.split('schools/')[-1].split('/men')[0]
        for href in hrefs
        if href is not None and 'schools' in href
    ]

async def scrape_stats(season, team):
    """Download one team-season page into ``DATA_DIR`` unless it is already saved.

    The file name is the URL path after ``schools/`` with ``/`` replaced by
    ``_``. Nothing is written when ``get_html`` returns ``None``.

    Args:
        season: Season value interpolated into the URL.
        team: School slug interpolated into the URL.
    """
    url = f"https://www.sports-reference.com/cbb/schools/{team}/men/{season}.html"
    save_path = os.path.join(DATA_DIR, url.split('schools/')[-1].replace('/', '_'))
    if os.path.exists(save_path):
        # Already downloaded on an earlier run.
        return
    html = await get_html(url, '#content')
    if html is None:
        return
    # Create the target directory on first use; without this the write fails
    # with FileNotFoundError on a fresh checkout.
    os.makedirs(DATA_DIR, exist_ok=True)
    # NOTE(review): written with the platform default encoding, which is also
    # how parse_html reads these files back.
    with open(save_path, 'w') as f:
        f.write(html)

# Collect the school slugs, then fetch the page for every (season, team) pair.
# scrape_stats returns without fetching when the target file already exists,
# so re-running this section only downloads pages that are still missing.
# NOTE(review): top-level `await` presumably relies on a notebook/async REPL host.
TEAMS = await scrape_teams()
for season in SEASONS:
    for team in TEAMS:
        await scrape_stats(season, team)

WINNERS = []
YEAR = 2024
ncaa_page = await get_html('https://www.sports-reference.com/cbb/postseason/', '#content')
soup = BeautifulSoup(ncaa_page)
table = soup.find('table', {'id':'ncaa-tournament-history_NCAAM'}).find('tbody')
rows = table.find_all('tr')
for row in rows[:-1]:
    columns = row.find_all('td')
    if len(columns) > 0:
        winner = columns[0].find('a').get('href').split('schools/')[-1].split('/men')[0]
        if YEAR == 2020: YEAR -= 1
        WINNERS.append(str(YEAR) + ' ' + winner)
        YEAR -= 1

####################################################
# Let pandas show up to 10000 columns when a frame is displayed.
pd.set_option('display.max_columns', 10000)
# Paths of every saved .html page under DATA_DIR. Sorted because os.listdir
# returns entries in arbitrary order, which would make the row order of the
# resulting dataset differ between runs and machines.
team_pages = sorted(
    os.path.join(DATA_DIR, name)
    for name in os.listdir(DATA_DIR)
    if name.endswith('.html')
)

def parse_html(box_score):
    """Read a saved page from disk and return its soup without extra header rows.

    Every ``tr.over_header`` and ``tr.thead`` row is removed from the tree
    (presumably grouped/repeated header rows that would otherwise end up in
    the parsed tables).

    Args:
        box_score: Path of the saved HTML file.

    Returns:
        The modified ``BeautifulSoup`` object.
    """
    with open(box_score) as page_file:
        markup = page_file.read()
    soup = BeautifulSoup(markup)
    for css in ('tr.over_header', 'tr.thead'):
        for header_row in soup.select(css):
            header_row.decompose()
    return soup

from io import StringIO  # pd.read_html expects a file-like object rather than literal HTML text

# Ids of the tables read from every team page.
STATS = ['season-total_per_game','conf_per_game','players_per_game','players_per_game_conf','players_advanced','players_advanced_conf']
# Tables whose first four rows are taken as-is; the remaining tables are
# reduced to per-column mean and max.
TEAM_LEVEL_STATS = ('season-total_per_game', 'conf_per_game')
# Pages for this year get target 2 instead of 0/1 (presumably the season whose
# champion is not known yet; verify). Kept as a string because the year is
# parsed from the file name as text.
UNDECIDED_YEAR = '2025'

final_df = pd.DataFrame()
for team_page in team_pages:
    # Best-effort: any failure on one page is reported and that page is
    # skipped, so a single malformed file does not stop the whole run.
    try:
        soup = parse_html(team_page)
        page_html = str(soup)  # serialise once instead of once per table

        df = pd.DataFrame()
        for STAT in STATS:
            table = pd.read_html(StringIO(page_html), attrs={'id':STAT}, index_col=0)[0]
            if STAT in TEAM_LEVEL_STATS:
                table = table.apply(pd.to_numeric, errors='coerce')
                df = pd.concat([df, table.iloc[0], table.iloc[1], table.iloc[2], table.iloc[3]])
            else:
                # Drop the last row before aggregating.
                table = table.iloc[:-1]
                table = table.apply(pd.to_numeric, errors='coerce')
                df = pd.concat([df, table.mean(), table.max()])
        df = df.T

        # File names look like "<team>_..._<year>.html"; use the base name so
        # the parsing does not depend on the directory name or path separator.
        file_name = os.path.basename(team_page)
        TEAM, YEAR = file_name.split('_')[0], file_name.split('_')[-1].split('.html')[0]
        df['Team'] = TEAM
        df['Year'] = YEAR
        # YEAR is a string, so it must be compared with a string; comparing the
        # column with the integer 2025 never matched and target 2 was never set.
        if YEAR == UNDECIDED_YEAR:
            df['Winner/Target'] = 2
        else:
            df['Winner/Target'] = 1 if YEAR + ' ' + TEAM in WINNERS else 0

        # Appended inside the try on purpose: a page whose columns cannot be
        # aligned with the accumulated frame is skipped like any other failure.
        final_df = pd.concat([final_df, df], ignore_index=True)
        print(f"{len(final_df)} / {len(team_pages)}")
    except Exception as e:
        print(f"SKIPPED {team_page} because {e}")
display(final_df)
final_df.to_csv('ncaa_stats.csv')

List of all the College Basketball Schools | College Basketball at Sports-Reference.com
INVALID PAGE
2009-10 Air Force Falcons Men's Roster and Stats | College Basketball at Sports-Reference.com
2009-10 Akron Zips Men's Roster and Stats | College Basketball at Sports-Reference.com
2009-10 Alabama Crimson Tide Men's Roster and Stats | College Basketball at Sports-Reference.com
2009-10 Alabama A&M Bulldogs Men's Roster and Stats | College Basketball at Sports-Reference.com
2009-10 Alabama State Hornets Men's Roster and Stats | College Basketball at Sports-Reference.com
2009-10 Albany (NY) Great Danes Men's Roster and Stats | College Basketball at Sports-Reference.com
2009-10 Alcorn State Braves Men's Roster and Stats | College Basketball at Sports-Reference.com
INVALID PAGE
2009-10 American Eagles Men's Roster and Stats | College Basketball at Sports-Reference.com
INVALID PAGE
2009-10 Appalachian State Mountaineers Men's Roster and Stats | College Basketball at Sports-Reference.com
2009-