In [None]:
# Import packages
import pandas as pd
import numpy as np
from selenium import webdriver
from selenium.webdriver.common.by import By
import time
from datetime import datetime, timedelta
import os

In [None]:
# Set up the scraping state: load (or create) the history file, work out the
# date to resume from, then start a headless Chrome session.
#
# Names defined here and used by later cells: `df`, `date`, `browser`.
columns = ["date", "home_team", "home_team_ranking", "home_team_score", "away_team", "away_team_ranking", "away_team_score", "gender"]
first_date = pd.to_datetime("1986-1-2").date() # First date that should end up in the data set

if os.path.exists("webscraped_ncaa_games_history.parquet"):
    df = pd.read_parquet("webscraped_ncaa_games_history.parquet")
else:
    # No history yet: create an empty file with the expected columns
    df = pd.DataFrame(columns=columns)
    df.to_parquet("webscraped_ncaa_games_history.parquet")

if df.empty:
    # The scraping loop advances `date` by one day BEFORE fetching, so start one
    # day early to make sure `first_date` itself is scraped. This branch also
    # covers a history file that exists but has no rows yet (where max() would
    # yield NaN/NaT and break the URL below).
    date = first_date - timedelta(days=1)
else:
    date = pd.to_datetime(df["date"].max()).date() # Resume after the last stored date

# Set up webdriver configuration
options = webdriver.ChromeOptions()
options.add_argument("--start-maximized")
options.add_argument("--no-sandbox")
options.add_argument("--headless")
options.add_argument("--disable-gpu")
browser = webdriver.Chrome(options=options)

browser.get(f"https://www.sports-reference.com/cbb/boxscores/index.cgi?month={date.month}&day={date.day}&year={date.year}") # Load last fetched date

# NOTE(review): disabled click on an absolute-XPath button; its purpose is not visible from this notebook -- confirm before re-enabling or deleting.
#browser.find_element(By.XPATH, "/html/body/div[1]/div/div/div/div[2]/div/button[3]").click()

In [12]:
# Preview the games table; for a large frame pandas elides the middle rows in the rendered output.
df

Unnamed: 0,date,home_team,home_team_ranking,home_team_score,away_team,away_team_ranking,away_team_score,gender
0,1986-01-02,Holy Cross,,77,Yale,,86,m
1,1986-01-02,Central Michigan,,63,Western Michigan,,74,m
2,1986-01-02,Hartford,,54,Virginia,,73,m
3,1986-01-02,Cornell,,78,Utica,,88,m
4,1986-01-02,Oregon,,65,UCLA,,71,m
...,...,...,...,...,...,...,...,...
314686,2024-11-17,Sacred Heart,,70,Brown,,89,m
314687,2024-11-17,Cal Poly,,82,Eastern Washington,,78,m
314688,2024-11-17,California,,71,USC,,66,m
314689,2024-11-17,Weber State,,68,Hawaii,,73,m


In [None]:
# Scrape every day after `date` up to (but not including) today and append the
# games found to the parquet history file.
#
# Relies on `date` and `browser` from the setup cell. `date` is only advanced
# once a day has been scraped completely, so re-running this cell after a
# failure retries the failed day instead of skipping it.

def _parse_ranking(cell_text):
    """Return the ranking from team-cell text shaped like "Name (12)", or NaN.

    The last parenthesised group is used and the closing bracket is stripped
    whether or not trailing whitespace follows it. Anything that is not an
    integer (including a missing group) yields NaN.
    """
    if " (" not in cell_text:
        return np.nan
    candidate = cell_text.rsplit(" (", 1)[1].split(")")[0].strip()
    try:
        return int(candidate)
    except ValueError:
        return np.nan


def _append_rows(frame, rows):
    """Return `frame` with `rows` (a list of row lists) appended in a single concat."""
    if not rows:
        return frame
    fresh = pd.DataFrame(rows, columns=frame.columns)
    if frame.empty:
        return fresh
    return pd.concat([frame, fresh], ignore_index=True)


finished_scraping = False # Set up while loop

df = pd.read_parquet("webscraped_ncaa_games_history.parquet") # Load already fetched database
pending_rows = [] # Rows of fully scraped days that are not yet merged into df

try:
    while not finished_scraping:
        next_date = date + timedelta(days=1) # Fetch next date

        # If the date is May 1st, jump to November 1st of the same year
        if next_date.month == 5 and next_date.day == 1:
            next_date = next_date.replace(month=11, day=1)

        # Stop before today: today's games may be unfinished, and because a later
        # run resumes from the newest stored date, a partially scraped day would
        # never be completed.
        if next_date >= datetime.now().date():
            finished_scraping = True
            continue

        browser.get(f"https://www.sports-reference.com/cbb/boxscores/index.cgi?month={next_date.month}&day={next_date.day}&year={next_date.year}")
        time.sleep(1)

        elements = browser.find_elements(By.CLASS_NAME, "game_summary") # Get all match result elements

        if next_date.day in (1, 11, 21): # Periodically save data (everything up to the previous day)
            df = _append_rows(df, pending_rows)
            pending_rows = []
            df.to_parquet("webscraped_ncaa_games_history.parquet")
            print(f"Saved data on {next_date}.")

        day_rows = []
        for element in elements:
            try:
                if "hidden" in element.get_attribute("class").split():
                    continue
                cells = element.find_elements(By.CSS_SELECTOR, "td")
                home_text = cells[0].text
                away_text = cells[3].text
                home_team_score = int(cells[1].text)
                away_team_score = int(cells[4].text)
                gender = element.get_attribute("class")[-1:] # Last character of the class string
            except (IndexError, ValueError):
                # Expected: summary elements that exist but hold no scores
                # (empty strings / missing cells) on days without results.
                continue
            except Exception as exc:
                # Unexpected failure: keep the best-effort behaviour but make it visible.
                print(f"Skipped a game on {next_date}: {exc!r}")
                continue

            # Each ranking is parsed on its own so an unranked opponent no longer
            # erases the ranking of the ranked team.
            # NOTE(review): the name is cut at the first " (", so a name that itself
            # contains a bracketed suffix would be truncated -- kept as-is to stay
            # consistent with the rows already stored.
            day_rows.append([
                next_date,
                home_text.split(" (")[0],
                _parse_ranking(home_text),
                home_team_score,
                away_text.split(" (")[0],
                _parse_ranking(away_text),
                away_team_score,
                gender,
            ])

        # Commit the day only once it has been scraped completely
        pending_rows.extend(day_rows)
        date = next_date

        if len(elements) > 0: # If there were any match result elements on the page
            print(f"Finished collecting data for {next_date}!")
finally:
    # Runs on normal completion, errors and interrupts alike: persist every
    # completed day and release the browser.
    df = _append_rows(df, pending_rows)
    df.to_parquet("webscraped_ncaa_games_history.parquet") # Save data to parquet file once finished
    print(f"Saved data on {date}.")
    browser.quit()

Saved data on 2023-11-11.
Finished collecting data for 2023-11-11!
Finished collecting data for 2023-11-12!
Finished collecting data for 2023-11-13!
Finished collecting data for 2023-11-14!
Finished collecting data for 2023-11-15!
Finished collecting data for 2023-11-16!
Finished collecting data for 2023-11-17!
Finished collecting data for 2023-11-18!
Finished collecting data for 2023-11-19!
Finished collecting data for 2023-11-20!
Saved data on 2023-11-21.
Finished collecting data for 2023-11-21!
Finished collecting data for 2023-11-22!
Finished collecting data for 2023-11-23!
Finished collecting data for 2023-11-24!
Finished collecting data for 2023-11-25!
Finished collecting data for 2023-11-26!
Finished collecting data for 2023-11-27!
Finished collecting data for 2023-11-28!
Finished collecting data for 2023-11-29!
Finished collecting data for 2023-11-30!
Saved data on 2023-12-01.
Finished collecting data for 2023-12-01!
Finished collecting data for 2023-12-02!
Finished collecting 