# Fetching Advanced School Stats 
➡️ We will use Selenium with the Chrome WebDriver to scrape advanced college basketball stats from the [Sports Reference](https://www.sports-reference.com/cbb/seasons/men/2025-advanced-school-stats.html) website.


In [3]:
# Attempt to activate the project's virtual environment from a shell command.
# NOTE(review): `!` commands presumably run in a temporary subshell, so this
# activation likely does not carry over to the notebook kernel or to later
# cells -- confirm, and prefer selecting the .venv kernel in Jupyter instead.
!source .venv/bin/activate

In [4]:
!pip install selenium requests pandas numpy scrapy



In [5]:
import requests
import pandas as pd
import numpy as np
import os
import warnings
warnings.filterwarnings('ignore', category=UserWarning)
from scrapy.selector import Selector
from selenium import webdriver
from selenium.webdriver.chrome.service import Service as ChromeService
from selenium.webdriver.chrome.options import Options as ChromeOptions
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.support.ui import Select

In [6]:
import time
import numpy as np
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# Scrape the advanced school stats page of every season and write one
# uncleaned CSV per season into the Yearly_Stats_Uncleaned folder.
from pathlib import Path

# Create the output folder up front so `to_csv` does not fail when the folder
# is missing (e.g. on a fresh checkout).
output_dir = Path("Yearly_Stats_Uncleaned")
output_dir.mkdir(parents=True, exist_ok=True)

# Define years to scrape
years = range(2015, 2026)  # 2014-15 to 2024-25

# `data-stat` attribute values of the <td> cells to collect; each one becomes
# a column of the output CSV. The list is the same for every season.
columns = [
    "school_name", "win_loss_pct", "srs", "pace", "off_rtg", "fta_per_fga_pct", "fg3a_per_fga_pct",
    "ts_pct", "trb_pct", "ast_pct", "stl_pct", "blk_pct", "efg_pct", "tov_pct", "orb_pct", "ft_rate"
]

# Initialize WebDriver
driver = webdriver.Chrome()

try:
    # One explicit-wait helper (10 second timeout) is enough for all pages.
    wait = WebDriverWait(driver, 10)

    for year in years:
        url = f"https://www.sports-reference.com/cbb/seasons/men/{year}-advanced-school-stats.html"
        driver.get(url)
        # Block until the element with this id is present in the DOM; raises
        # a timeout error if it does not appear within the wait limit.
        wait.until(EC.presence_of_element_located((By.ID, "adv_school_stats_sh")))

        # Scrape data: for each stat, collect the text of every matching <td>
        # on the page, in document order.
        data = {
            col: [e.text for e in driver.find_elements(By.XPATH, f'//td[@data-stat="{col}"]')]
            for col in columns
        }

        # Convert to DataFrame. pandas raises ValueError if the per-column
        # lists differ in length (e.g. a stat is absent on this page).
        df = pd.DataFrame(data)

        # Save to CSV in the Yearly_Stats_Uncleaned folder
        filename = f"ncaa_advanced_stats_uncleaned_{year}.csv"
        df.to_csv(output_dir / filename, index=False)

        time.sleep(2)  # Pause between requests to avoid overwhelming the server
finally:
    # Close WebDriver even when a page load, wait, or CSV write fails, so no
    # browser process is left running.
    driver.quit()

# Fetching School Rating Data
➡️ We can find the rating data for each team on [Sports Reference](https://www.sports-reference.com/cbb/seasons/men/2025-ratings.html).

In [8]:
# Scrape the ratings page of every season and write one uncleaned CSV per
# season into the Yearly_ratings_Uncleaned folder.
from pathlib import Path

# Create the output folder up front so `to_csv` does not fail when the folder
# is missing (e.g. on a fresh checkout).
output_dir = Path("Yearly_ratings_Uncleaned")
output_dir.mkdir(parents=True, exist_ok=True)

# Define years to scrape
years = range(2015, 2026)  # 2014-15 to 2024-25

# `data-stat` attribute values of the <td> cells to collect; each one becomes
# a column of the output CSV. The list is the same for every season.
columns = [
    "school_name", "conf_abbr", "srs_off", "srs_def", "off_rtg", "def_rtg"
]

# Initialize WebDriver
driver = webdriver.Chrome()

try:
    # One explicit-wait helper (10 second timeout) is enough for all pages.
    wait = WebDriverWait(driver, 10)

    for year in years:
        url = f"https://www.sports-reference.com/cbb/seasons/men/{year}-ratings.html"
        driver.get(url)
        # Block until the element with id "info" is present in the DOM; raises
        # a timeout error if it does not appear within the wait limit.
        wait.until(EC.presence_of_element_located((By.XPATH, "//*[@id='info']")))

        # Scrape data: for each stat, collect the text of every matching <td>
        # on the page, in document order.
        data = {
            col: [e.text for e in driver.find_elements(By.XPATH, f'//td[@data-stat="{col}"]')]
            for col in columns
        }

        # Convert to DataFrame. pandas raises ValueError if the per-column
        # lists differ in length (e.g. a stat is absent on this page).
        df = pd.DataFrame(data)

        # Save to CSV in the Yearly_ratings_Uncleaned folder
        filename = f"ncaa_ratings_uncleaned_{year}.csv"
        df.to_csv(output_dir / filename, index=False)

        time.sleep(2)  # Pause between requests to avoid overwhelming the server
finally:
    # Close WebDriver even when a page load, wait, or CSV write fails, so no
    # browser process is left running.
    driver.quit()