# Data Collection

In [1]:
import requests
import os
from bs4 import BeautifulSoup as bs
from bs4 import Comment

In [2]:
import pandas as pd
import numpy as np

In [3]:
import re

## Get HTML Data of Tournament

In [4]:
# Download the NCAA men's tournament page for one season and cache it on disk
# so the parsing cells can work from the local copy.
url_tournament = "https://www.sports-reference.com/cbb/postseason/men/{}-ncaa.html"

url = url_tournament.format(2024)

# A timeout keeps the cell from hanging indefinitely on a stalled connection.
data = requests.get(url, timeout=30)
# Stop here on a 4xx/5xx response instead of caching an error page as if it
# were the real bracket HTML.
data.raise_for_status()

# Write-only access is sufficient; the file is read back by a separate cell.
with open(f"{2024}.html", "w") as f:
    f.write(data.text)

## Parse HTML Data of Tournament

In [112]:
def parse_tournament_html(year):
    """Load the cached tournament page for ``year`` and locate its brackets.

    Reads ``{year}.html`` from the current working directory, parses it with
    BeautifulSoup and looks up the ``<div id="brackets">`` container.

    Args:
        year: Value used to build the cached file name ``{year}.html``.

    Returns:
        A tuple ``(year, children, brackets)`` where ``children`` is the list
        of ``id`` attribute values found on the container's descendant tags
        (the id ``"bracket"`` is excluded) and ``brackets`` is the container
        tag itself. Returns ``None`` when the page has no such container.

    Raises:
        FileNotFoundError: If ``{year}.html`` has not been downloaded yet.
    """
    with open(f"{year}.html") as f:
        page = f.read()
    soup = bs(page, 'html.parser')
    brackets = soup.find("div", {"id": "brackets"})
    if brackets is None:
        # Made explicit so callers can see that a missing container is a
        # possible outcome they need to handle.
        return None

    # Collect the ids of every descendant that has one; the id "bracket" is
    # skipped so that it is not reported alongside the other ids.
    children = [
        child_id
        for child_id in (child.get("id") for child in brackets.find_all())
        if child_id is not None and child_id != "bracket"
    ]
    return year, children, brackets

In [113]:
def parse_brackets(year):
    """Extract the opening-round games of every bracket section for ``year``.

    Uses ``parse_tournament_html(year)`` to obtain the bracket ids and the
    bracket container, then reads the first ``div.round`` of each section.

    Args:
        year: Season whose cached page should be parsed.

    Returns:
        A list with one dict per game containing the keys ``year``,
        ``bracket``, ``round`` (always ``'First Round'``), ``location``,
        ``team_one_rank``, ``team_one``, ``team_two_rank`` and ``team_two``.
        Ranks are kept as the strings found in the page. The winner of each
        game is not recorded.

    Raises:
        ValueError: If the cached page does not contain a bracket container.
    """
    parsed = parse_tournament_html(year)
    if parsed is None:
        # Without this guard the failure would surface as an opaque
        # "'NoneType' object is not subscriptable" TypeError.
        raise ValueError(f"No bracket container found in {year}.html")
    _, bracket_ids, brackets_div = parsed

    data = []
    for b in bracket_ids:
        rounds = brackets_div.find("div", {"id": b}).find_all("div", {"class": "round"})
        # Only the first round of each section is collected.
        games = rounds[0].find_all("div", recursive=False)
        for g in games:
            g_items = g.find_all(recursive=False)

            # The first two children are the teams; each team holds its rank
            # followed by its name.
            i1 = g_items[0].find_all(recursive=False)
            i2 = g_items[1].find_all(recursive=False)

            # The last child holds the venue text, e.g. "at Brooklyn, NY".
            # Remove only the leading "at": replacing every occurrence would
            # also eat letters inside city names ("Seattle" -> "Setle").
            location = g_items[-1].getText().strip()
            if location.startswith('at'):
                location = location[len('at'):].strip()

            data.append({
                'year': year,
                'bracket': b.title(),
                'round': 'First Round',
                'location': location,
                'team_one_rank': i1[0].getText(),
                'team_one': i1[1].getText(),
                'team_two_rank': i2[0].getText(),
                'team_two': i2[1].getText()
            })
    return data

In [114]:
# Parse the cached 2024 tournament page into one record per first-round game.
bracket_data = parse_brackets(2024)

In [115]:
# Preview the first few records rather than dumping the entire list.
bracket_data[:5]

[{'year': 2024,
  'bracket': 'East',
  'round': 'First Round',
  'location': 'Brooklyn, NY',
  'team_one_rank': '1',
  'team_one': 'Connecticut',
  'team_two_rank': '16',
  'team_two': 'Stetson'},
 {'year': 2024,
  'bracket': 'East',
  'round': 'First Round',
  'location': 'Brooklyn, NY',
  'team_one_rank': '8',
  'team_one': 'Florida Atlantic',
  'team_two_rank': '9',
  'team_two': 'Northwestern'},
 {'year': 2024,
  'bracket': 'East',
  'round': 'First Round',
  'location': 'Spokane, WA',
  'team_one_rank': '5',
  'team_one': 'San Diego State',
  'team_two_rank': '12',
  'team_two': 'UAB'},
 {'year': 2024,
  'bracket': 'East',
  'round': 'First Round',
  'location': 'Spokane, WA',
  'team_one_rank': '4',
  'team_one': 'Auburn',
  'team_two_rank': '13',
  'team_two': 'Yale'},
 {'year': 2024,
  'bracket': 'East',
  'round': 'First Round',
  'location': 'Omaha, NE',
  'team_one_rank': '6',
  'team_one': 'Brigham Young',
  'team_two_rank': '11',
  'team_two': 'Duquesne'},
 {'year': 2024,


In [116]:
# Turn the list of game records into a DataFrame (one row per game) and show it.
df = pd.DataFrame(bracket_data)
df

Unnamed: 0,year,bracket,round,location,team_one_rank,team_one,team_two_rank,team_two
0,2024,East,First Round,"Brooklyn, NY",1,Connecticut,16,Stetson
1,2024,East,First Round,"Brooklyn, NY",8,Florida Atlantic,9,Northwestern
2,2024,East,First Round,"Spokane, WA",5,San Diego State,12,UAB
3,2024,East,First Round,"Spokane, WA",4,Auburn,13,Yale
4,2024,East,First Round,"Omaha, NE",6,Brigham Young,11,Duquesne
5,2024,East,First Round,"Omaha, NE",3,Illinois,14,Morehead State
6,2024,East,First Round,"Omaha, NE",7,Washington State,10,Drake
7,2024,East,First Round,"Omaha, NE",2,Iowa State,15,South Dakota State
8,2024,Midwest,First Round,"Indianapolis, IN",1,Purdue,16,Grambling
9,2024,Midwest,First Round,"Indianapolis, IN",8,Utah State,9,TCU


In [117]:
# to_csv does not create missing parent directories, so make sure "data/"
# exists before writing the tournament table.
os.makedirs('data', exist_ok=True)
df.to_csv('data/2024_tournament.csv', sep=',', index=False, encoding='utf-8')

## Get HTML Data of Season Standings

In [4]:
# Download the season standings page for one season and cache it on disk so
# the parsing cells can work from the local copy.
url_season = "https://www.sports-reference.com/cbb/seasons/men/{}-standings.html"

url = url_season.format(2024)

# A timeout keeps the cell from hanging indefinitely on a stalled connection.
data = requests.get(url, timeout=30)
# Stop here on a 4xx/5xx response instead of caching an error page as if it
# were the real standings HTML.
data.raise_for_status()

# Write-only access is sufficient; the file is read back by a separate cell.
with open(f"{2024}-season.html", "w") as f:
    f.write(data.text)

## Parse HTML Data of Season Standings

In [5]:
def _int_or_none(text):
    """Convert a table cell's text to ``int``; empty text becomes ``None``."""
    return int(text) if text else None


def _float_or_none(text):
    """Convert a table cell's text to ``float``; empty text becomes ``None``."""
    return float(text) if text else None


def parse_standings(year):
    """Parse the cached season standings page for ``year`` into team records.

    Reads ``{year}-season.html`` from the current working directory. Within
    ``<div id="content">`` every direct child ``div`` whose id contains
    ``all_standings`` is scanned for HTML comments; each comment is parsed as
    HTML and every table row after the first two is turned into one record.

    Args:
        year: Season used to build the file name and stored in each record.

    Returns:
        A list of dicts, one per table row, with the keys ``year``,
        ``conference``, ``rank``, ``team``, the overall and conference
        win/loss figures, points per game, ``strength_of_schedule``,
        ``simple_rating_system``, the three AP poll columns and
        ``conference_tournament_champion``. Numeric cells that are empty in
        the page are stored as ``None``.

    Raises:
        FileNotFoundError: If ``{year}-season.html`` has not been downloaded.
        AttributeError: If the page has no ``<div id="content">``.
        IndexError: If a data row has fewer than 22 cells.
        ValueError: If a numeric cell holds non-numeric text.
    """
    with open(f"{year}-season.html") as f:
        page = f.read()
    soup = bs(page, 'html.parser')
    content = soup.find("div", {"id": "content"})

    data = []
    for section in content.find_all("div", recursive=False):
        section_id = section.get('id')
        if not section_id or 'all_standings' not in section_id:
            continue

        for child in section.descendants:
            # The rows are read from markup stored inside HTML comments, so
            # each comment has to be parsed as its own document.
            if not isinstance(child, Comment):
                continue

            # Name the parser explicitly: the generic "html" feature lets
            # BeautifulSoup pick whichever parser happens to be installed.
            table_soup = bs(child.string, 'html.parser')
            # The first two rows are skipped (presumably header rows - confirm).
            for row in table_soup.find_all("tr")[2:]:
                info = [cell.getText() for cell in row.find_all(recursive=False)]

                data.append({
                    'year': year,
                    'conference': info[2],
                    'rank': _int_or_none(info[0]),
                    'team': info[1],
                    'overall_wins': _int_or_none(info[3]),
                    'overall_losses': _int_or_none(info[4]),
                    'overall_win_loss_percentage': _float_or_none(info[5]),
                    'conference_wins': _int_or_none(info[7]),
                    'conference_losses': _int_or_none(info[8]),
                    'conference_win_loss_percentage': _float_or_none(info[9]),
                    'own_points_per_game': _float_or_none(info[11]),
                    'opp_points_per_game': _float_or_none(info[12]),
                    'strength_of_schedule': _float_or_none(info[14]),
                    'simple_rating_system': _float_or_none(info[15]),
                    'ap_pre': _int_or_none(info[17]),
                    'ap_high': _int_or_none(info[18]),
                    'ap_final': _int_or_none(info[19]),
                    # Cell 21 is a ";"-separated notes field; a plain substring
                    # test matches the same rows as checking each part.
                    'conference_tournament_champion': 'Conf. Tournament Champion' in info[21]
                })
    return data

In [6]:
# Collect the 2024 standings records into a fresh list, then load them into
# a DataFrame (one row per team) and display it.
standings_data = list(parse_standings(2024))

df2 = pd.DataFrame(standings_data)
df2

Unnamed: 0,year,conference,rank,team,overall_wins,overall_losses,overall_win_loss_percentage,conference_wins,conference_losses,conference_win_loss_percentage,own_points_per_game,opp_points_per_game,strength_of_schedule,simple_rating_system,ap_pre,ap_high,ap_final,conference_tournament_champion
0,2024,AAC,1,South Florida,24,7,0.774,16.0,2.0,0.889,75.8,69.1,7.38,0.60,,24.0,,False
1,2024,AAC,2,Florida Atlantic,25,8,0.758,14.0,4.0,0.778,82.5,73.3,13.89,4.61,10.0,7.0,,False
2,2024,AAC,3,Charlotte,19,12,0.613,13.0,5.0,0.722,68.0,66.3,4.37,2.70,,,,False
3,2024,AAC,4,UAB,23,11,0.676,12.0,6.0,0.667,77.4,75.8,5.20,3.62,,,,True
4,2024,AAC,5,Memphis,22,10,0.688,11.0,7.0,0.611,80.5,75.8,9.23,4.48,,10.0,,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
357,2024,WCC,5,San Diego,18,15,0.545,7.0,9.0,0.438,72.5,78.1,-5.09,0.55,,,,False
358,2024,WCC,6,Pepperdine,13,20,0.394,5.0,11.0,0.313,72.4,75.2,-1.94,0.90,,,,False
359,2024,WCC,7,Loyola Marymount,12,19,0.387,5.0,11.0,0.313,71.2,71.5,0.07,0.44,,,,False
360,2024,WCC,8,Portland,12,21,0.364,5.0,11.0,0.313,69.8,79.5,-8.02,1.72,,,,False


In [16]:
# Count missing values per column.
# NOTE(review): this cell's execution count (16) is later than the fill/drop
# cells that follow it, so the recorded all-zero output presumably reflects the
# already-cleaned frame; a fresh top-to-bottom run would report the raw counts.
df2.isna().sum()

year                              0
conference                        0
rank                              0
team                              0
overall_wins                      0
overall_losses                    0
overall_win_loss_percentage       0
conference_wins                   0
conference_losses                 0
conference_win_loss_percentage    0
own_points_per_game               0
opp_points_per_game               0
strength_of_schedule              0
simple_rating_system              0
ap_pre                            0
ap_high                           0
ap_final                          0
conference_tournament_champion    0
dtype: int64

In [10]:
# Replace missing preseason AP values with 26.0.
# NOTE(review): 26 presumably acts as an "unranked" sentinel - confirm.
df2['ap_pre'] = df2['ap_pre'].fillna(26.0)

In [12]:
# Replace missing highest-AP values with 26.0.
# NOTE(review): 26 presumably acts as an "unranked" sentinel - confirm.
df2['ap_high'] = df2['ap_high'].fillna(26.0)

In [13]:
# Replace missing final AP values with 26.0.
# NOTE(review): 26 presumably acts as an "unranked" sentinel - confirm.
df2['ap_final'] = df2['ap_final'].fillna(26.0)

In [15]:
# Keep only the rows that have a conference win count; the surviving rows
# keep their original index labels.
df2 = df2.loc[df2['conference_wins'].notna()]

In [21]:
# Display the standings table after the fill and drop steps.
df2

Unnamed: 0,year,conference,rank,team,overall_wins,overall_losses,overall_win_loss_percentage,conference_wins,conference_losses,conference_win_loss_percentage,own_points_per_game,opp_points_per_game,strength_of_schedule,simple_rating_system,ap_pre,ap_high,ap_final,conference_tournament_champion
0,2024,AAC,1,South Florida,24,7,0.774,16.0,2.0,0.889,75.8,69.1,7.38,0.60,26.0,24.0,26.0,False
1,2024,AAC,2,Florida Atlantic,25,8,0.758,14.0,4.0,0.778,82.5,73.3,13.89,4.61,10.0,7.0,26.0,False
2,2024,AAC,3,Charlotte,19,12,0.613,13.0,5.0,0.722,68.0,66.3,4.37,2.70,26.0,26.0,26.0,False
3,2024,AAC,4,UAB,23,11,0.676,12.0,6.0,0.667,77.4,75.8,5.20,3.62,26.0,26.0,26.0,True
4,2024,AAC,5,Memphis,22,10,0.688,11.0,7.0,0.611,80.5,75.8,9.23,4.48,26.0,10.0,26.0,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
357,2024,WCC,5,San Diego,18,15,0.545,7.0,9.0,0.438,72.5,78.1,-5.09,0.55,26.0,26.0,26.0,False
358,2024,WCC,6,Pepperdine,13,20,0.394,5.0,11.0,0.313,72.4,75.2,-1.94,0.90,26.0,26.0,26.0,False
359,2024,WCC,7,Loyola Marymount,12,19,0.387,5.0,11.0,0.313,71.2,71.5,0.07,0.44,26.0,26.0,26.0,False
360,2024,WCC,8,Portland,12,21,0.364,5.0,11.0,0.313,69.8,79.5,-8.02,1.72,26.0,26.0,26.0,False


In [22]:
# to_csv does not create missing parent directories, so make sure "data/"
# exists before writing the standings table.
os.makedirs('data', exist_ok=True)
df2.to_csv('data/2024_standings.csv', sep=',', index=False, encoding='utf-8')