## Fetching College Basketball Stats 
➡️ This notebook will guide you through the process of fetching college basketball stats from the [Sports Reference](https://www.sports-reference.com/cbb/) website.\
➡️ You will need to create a venv by clicking `Select Kernel` -> `Select Another Kernel` -> `Python Environments...` -> `Create Python Environment` -> `Venv` -> `Python 3.13.2` (or the most recent version).



In [35]:
# NOTE: this cell used to run `!source .venv/bin/activate`. A `!` command runs
# in a short-lived subshell, so the activation never reached the notebook
# kernel (and `source` is not available in every default shell). The
# environment is determined by the kernel selected for the notebook, so print
# the interpreter path to confirm it points inside `.venv`.
import sys
print(sys.executable)

In [43]:
!pip install selenium requests pandas numpy scrapy



In [44]:
import requests
import pandas as pd
import numpy as np
import os
import warnings
warnings.filterwarnings('ignore', category=UserWarning)
from scrapy.selector import Selector
from selenium import webdriver
from selenium.webdriver.chrome.service import Service as ChromeService
from selenium.webdriver.chrome.options import Options as ChromeOptions
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.support.ui import Select

In [79]:
def _column_text(browser, xpath):
    """Return the text of every element matching ``xpath`` as a NumPy array.

    Elements are returned in the order Selenium finds them, so arrays built
    from different columns of the same table line up row by row.
    """
    return np.array([element.text for element in browser.find_elements(By.XPATH, xpath)])


driver = webdriver.Chrome()
try:
    driver.get("https://www.sports-reference.com/cbb/seasons/men/2025-advanced-school-stats.html")
    # Block (up to 10 s) until the stats section is present before reading any cells.
    wait = WebDriverWait(driver, 10)
    wait.until(EC.presence_of_element_located((By.XPATH, '//*[@id="adv_school_stats_sh"]')))

    # School name (taken from the link inside the cell)
    school = _column_text(driver, '//td[@data-stat="school_name"]//a')
    # Win-loss percentage
    win_loss_pct = _column_text(driver, '//td[@data-stat="win_loss_pct"]')
    # Simple Rating System
    srs = _column_text(driver, '//td[@data-stat="srs"]')
    # SOS: not included because March Madness is an entirely new schedule
    # Pace
    pace = _column_text(driver, '//td[@data-stat="pace"]')
    # ORtg
    ortg = _column_text(driver, '//td[@data-stat="off_rtg"]')
    # FTr
    ftr = _column_text(driver, '//td[@data-stat="fta_per_fga_pct"]')
    # 3PAr
    threepar = _column_text(driver, '//td[@data-stat="fg3a_per_fga_pct"]')
    # TS%
    ts = _column_text(driver, '//td[@data-stat="ts_pct"]')
    # TRB%
    trb = _column_text(driver, '//td[@data-stat="trb_pct"]')
    # AST%
    ast = _column_text(driver, '//td[@data-stat="ast_pct"]')
    # STL%
    stl = _column_text(driver, '//td[@data-stat="stl_pct"]')
    # BLK%
    blk = _column_text(driver, '//td[@data-stat="blk_pct"]')
    # eFG%
    efg = _column_text(driver, '//td[@data-stat="efg_pct"]')
    # TOV%
    tov = _column_text(driver, '//td[@data-stat="tov_pct"]')
    # ORB%
    orb = _column_text(driver, '//td[@data-stat="orb_pct"]')
    # FT/FGA
    ftfga = _column_text(driver, '//td[@data-stat="ft_rate"]')
finally:
    # Always shut the browser down, even if the page load or the wait fails;
    # otherwise every run of this cell leaves a Chrome process behind.
    driver.quit()

# The school column is matched through a nested link while the others match
# the cell itself, so confirm that every column yielded the same row count
# before the arrays are combined into one table.
_column_lengths = {
    len(column)
    for column in (school, win_loss_pct, srs, pace, ortg, ftr, threepar, ts,
                   trb, ast, stl, blk, efg, tov, orb, ftfga)
}
if len(_column_lengths) > 1:
    raise ValueError(
        f"Scraped columns have differing lengths: {sorted(_column_lengths)}"
    )

In [80]:
# Combine the scraped arrays into a single table, one column per statistic.
# The pairs are listed in the order the columns should appear.
column_data = [
    ('School', school),
    ('Win Loss Percentage', win_loss_pct),
    ('Simple Rating System', srs),
    ('Pace', pace),
    ('ORtg', ortg),
    ('FTr', ftr),
    ('3PAr', threepar),
    ('TS%', ts),
    ('TRB%', trb),
    ('AST%', ast),
    ('STL%', stl),
    ('BLK%', blk),
    ('eFG%', efg),
    ('TOV%', tov),
    ('ORB%', orb),
    ('FT/FGA', ftfga),
]
df = pd.DataFrame(dict(column_data))


In [81]:
# Evaluate the DataFrame as the cell's last expression so the notebook renders it.
df

Unnamed: 0,School,Win Loss Percentage,Simple Rating System,Pace,ORtg,FTr,3PAr,TS%,TRB%,AST%,STL%,BLK%,eFG%,TOV%,ORB%,FT/FGA
0,Abilene Christian,.500,-5.46,71.1,98.6,.361,.262,.523,50.1,51.4,14.4,8.5,.483,18.0,31.8,.258
1,Air Force,.125,-7.74,64.7,95.0,.354,.485,.525,46.6,62.9,8.7,8.1,.501,18.7,22.7,.225
2,Akron,.824,3.55,72.4,116.3,.259,.461,.583,53.0,59.2,10.6,9.8,.557,14.2,33.5,.195
3,Alabama,.758,25.97,76.2,118.7,.401,.462,.594,54.0,54.1,7.9,10.1,.563,14.3,34.7,.287
4,Alabama A&M,.313,-20.34,73.0,98.4,.375,.410,.504,49.5,54.3,11.2,11.5,.470,18.1,35.1,.249
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
359,Wright State,.455,-4.90,68.1,112.2,.270,.377,.586,51.4,52.7,8.3,8.2,.566,15.6,29.0,.191
360,Wyoming,.375,-0.28,65.7,101.8,.300,.399,.529,51.5,47.9,6.7,8.2,.506,16.9,30.7,.196
361,Xavier,.656,15.57,69.4,111.5,.372,.374,.584,51.0,63.9,11.0,7.2,.539,14.4,25.2,.295
362,Yale,.750,6.78,69.0,118.0,.315,.320,.579,55.3,54.8,8.4,12.4,.551,12.7,33.9,.230


In [82]:
# Write the table to disk; index=False leaves the row index out of the file.
df.to_csv(path_or_buf='march_madness_stats.csv', index=False)