### 1. Get NBA Draft Data

In [3]:
import pandas as pd
from pathlib import Path

# Draft CSVs are expected in ../data/drafts relative to the working directory.
current_dir = Path.cwd()
draft_dir = current_dir.parent / "data" / "drafts"

# sorted() makes the load order reproducible; glob order depends on the filesystem.
files = sorted(draft_dir.glob("draft_20*.csv"))
if not files:
    # Fail here with a clear message instead of a KeyError from sort_values below.
    raise FileNotFoundError(f"No draft_20*.csv files found in {draft_dir}")

# Read every file, then concatenate once: calling concat inside the loop
# re-copies the growing frame on every iteration.
draft_df = pd.concat([pd.read_csv(file) for file in files])

draft_df = draft_df.sort_values(by=["SEASON", "OVERALL_PICK"])
print(draft_df.shape)
draft_df.head()

(1542, 14)


Unnamed: 0,PERSON_ID,PLAYER_NAME,SEASON,ROUND_NUMBER,ROUND_PICK,OVERALL_PICK,DRAFT_TYPE,TEAM_ID,TEAM_CITY,TEAM_NAME,TEAM_ABBREVIATION,ORGANIZATION,ORGANIZATION_TYPE,PLAYER_PROFILE_FLAG
0,2030,Kenyon Martin,2000,1,1,1,Draft,1610612751,New Jersey,Nets,NJN,Cincinnati,College/University,1
1,2031,Stromile Swift,2000,1,2,2,Draft,1610612763,Vancouver,Grizzlies,VAN,Louisiana State,College/University,1
2,2032,Darius Miles,2000,1,3,3,Draft,1610612746,Los Angeles,Clippers,LAC,East St. Louis,High School,1
3,2033,Marcus Fizer,2000,1,4,4,Draft,1610612741,Chicago,Bulls,CHI,Iowa State,College/University,1
4,2034,Mike Miller,2000,1,5,5,Draft,1610612753,Orlando,Magic,ORL,Florida,College/University,1


### 2. Get NBA Combine Data

In [4]:
import pandas as pd
from pathlib import Path

# Combine CSVs are expected in ../data/draft_combine relative to the working directory.
current_dir = Path.cwd()
draft_combine_dir = current_dir.parent / "data" / "draft_combine"

# sorted() makes the load order reproducible; glob order depends on the filesystem.
files = sorted(draft_combine_dir.glob("draft_combine_20*.csv"))
if not files:
    # Fail here with a clear message instead of a KeyError from sort_values below.
    raise FileNotFoundError(
        f"No draft_combine_20*.csv files found in {draft_combine_dir}"
    )

# Read every file, then concatenate once: calling concat inside the loop
# re-copies the growing frame on every iteration.
draft_combine_df = pd.concat([pd.read_csv(file) for file in files])

draft_combine_df = draft_combine_df.sort_values(by=["SEASON"])
print(draft_combine_df.shape)
draft_combine_df.head()

(1795, 47)


Unnamed: 0,SEASON,PLAYER_ID,FIRST_NAME,LAST_NAME,PLAYER_NAME,POSITION,HEIGHT_WO_SHOES,HEIGHT_WO_SHOES_FT_IN,HEIGHT_W_SHOES,HEIGHT_W_SHOES_FT_IN,...,SPOT_NBA_BREAK_RIGHT,SPOT_NBA_CORNER_RIGHT,OFF_DRIB_FIFTEEN_BREAK_LEFT,OFF_DRIB_FIFTEEN_TOP_KEY,OFF_DRIB_FIFTEEN_BREAK_RIGHT,OFF_DRIB_COLLEGE_BREAK_LEFT,OFF_DRIB_COLLEGE_TOP_KEY,OFF_DRIB_COLLEGE_BREAK_RIGHT,ON_MOVE_FIFTEEN,ON_MOVE_COLLEGE
10,2000,2037,Jamal,Crawford,Jamal Crawford,PG-SG,76.5,6' 4.5'',,,...,,,,,,,,,,
37,2000,2058,Mark,Madsen,Mark Madsen,PF,80.0,6' 8'',,,...,,,,,,,,,,
36,2000,2090,Justin,Love,Justin Love,PG-SG,73.5,6' 1.5'',,,...,,,,,,,,,,
35,2000,2061,Dan,Langhi,Dan Langhi,SF,81.0,6' 9'',,,...,,,,,,,,,,
34,2000,12144,Brandon,Kurtz,Brandon Kurtz,PF-C,81.5,6' 9.5'',,,...,,,,,,,,,,


### 3. Get Undrafted Players Data

In [5]:
# Cast both ID columns to str so the two frames compare IDs of the same type.
# The columns are overwritten in place, so later cells see string IDs.
draft_df["PERSON_ID"] = draft_df["PERSON_ID"].astype(str)
draft_combine_df["PLAYER_ID"] = draft_combine_df["PLAYER_ID"].astype(str)

# set() already de-duplicates, so no separate .unique() pass is needed.
drafted_ids = set(draft_df["PERSON_ID"])
combine_ids = set(draft_combine_df["PLAYER_ID"])

# Players who attended the combine but were *not* drafted.
undrafted_combine_ids = combine_ids - drafted_ids

# Keep the full combine rows for those players; .copy() detaches the result
# from draft_combine_df.
undrafted_combine_players = draft_combine_df[
    draft_combine_df["PLAYER_ID"].isin(undrafted_combine_ids)
].copy()

# This counts distinct player IDs. The frame itself can hold more rows than
# this when one PLAYER_ID appears on several combine rows.
print(f"Number of Undrafted players: {len(undrafted_combine_ids)}")
undrafted_combine_players.head()

Numer of Undrafted players:  678


Unnamed: 0,SEASON,PLAYER_ID,FIRST_NAME,LAST_NAME,PLAYER_NAME,POSITION,HEIGHT_WO_SHOES,HEIGHT_WO_SHOES_FT_IN,HEIGHT_W_SHOES,HEIGHT_W_SHOES_FT_IN,...,SPOT_NBA_BREAK_RIGHT,SPOT_NBA_CORNER_RIGHT,OFF_DRIB_FIFTEEN_BREAK_LEFT,OFF_DRIB_FIFTEEN_TOP_KEY,OFF_DRIB_FIFTEEN_BREAK_RIGHT,OFF_DRIB_COLLEGE_BREAK_LEFT,OFF_DRIB_COLLEGE_TOP_KEY,OFF_DRIB_COLLEGE_BREAK_RIGHT,ON_MOVE_FIFTEEN,ON_MOVE_COLLEGE
36,2000,2090,Justin,Love,Justin Love,PG-SG,73.5,6' 1.5'',,,...,,,,,,,,,,
34,2000,12144,Brandon,Kurtz,Brandon Kurtz,PF-C,81.5,6' 9.5'',,,...,,,,,,,,,,
32,2000,12143,Kenyon,Jones,Kenyon Jones,PF-C,81.0,6' 9'',,,...,,,,,,,,,,
31,2000,12142,Nate,Johnson,Nate Johnson,SF,78.5,6' 6.5'',,,...,,,,,,,,,,
29,2000,12141,Jacob,Jaacks,Jacob Jaacks,PF,81.25,6' 9.25'',,,...,,,,,,,,,,


### 4. Scrape College Stats from Sports Reference (CBB)

In [6]:
import os
import re
import time
import random
import requests
import pandas as pd
from typing import Tuple
from urllib.parse import urlparse

# Site root that player-page URLs are built from.
CBB_BASE = "https://www.sports-reference.com"

# Browser-style User-Agent, assembled from its space-separated product tokens.
_USER_AGENT = " ".join(
    [
        "Mozilla/5.0 (X11; Linux x86_64)",
        "AppleWebKit/537.36",
        "(KHTML, like Gecko)",
        "Chrome/119.0.0.0",
        "Safari/537.36",
    ]
)

# Headers sent with every request made by safe_get.
HEADERS = {"User-Agent": _USER_AGENT}

def safe_get(session, url, *, retries=6, backoff=2.0, timeout=30):
    """GET ``url`` through ``session``, retrying only failures that may succeed later.

    Retried, up to ``retries`` attempts in total:
      * HTTP 429 - sleeps for the ``Retry-After`` header when it is a whole
        number of seconds, otherwise for the exponential-backoff delay;
      * HTTP 5xx and transport errors - sleeps for the exponential-backoff delay.

    Any other error status (for example 404) is raised on the first attempt,
    because repeating the same request cannot change the outcome.

    Args:
        session: Object with a requests-style ``get(url, headers=..., timeout=...)``.
        url: Address to fetch.
        retries: Maximum number of attempts.
        backoff: Base delay in seconds; attempt ``i`` waits ``backoff * 2**i``
            plus 0.5-1.5 s of random jitter.
        timeout: Per-request timeout in seconds, passed to ``session.get``.

    Returns:
        The response object, once ``raise_for_status()`` accepts it.

    Raises:
        RuntimeError: Every attempt was answered with 429, or ``retries`` < 1.
        Exception: Whatever ``raise_for_status()`` raises for a non-retryable
            status, or the last 5xx / transport error once attempts run out.
    """
    last_exc = None

    for i in range(retries):
        # Default wait for this attempt; a numeric Retry-After overrides it below.
        delay = backoff * (2 ** i) + random.uniform(0.5, 1.5)

        try:
            resp = session.get(url, headers=HEADERS, timeout=timeout)
        except OSError as e:
            # requests' exception classes derive from OSError (IOError), so this
            # covers connection failures and timeouts without hiding plain bugs.
            last_exc = e
            print(f"⚠ Error {e}, sleeping {delay:.1f}s")
            time.sleep(delay)
            continue

        status = resp.status_code

        if status == 429:
            retry_after = resp.headers.get("Retry-After")
            try:
                # Clamp at 0: time.sleep rejects negative values.
                delay = max(0, int(retry_after))
            except (TypeError, ValueError):
                # Header missing or in HTTP-date form: keep the backoff delay.
                pass

            # Remember the failure so running out of attempts raises a real error.
            last_exc = RuntimeError(
                f"Still rate-limited (429) at {url} after {i + 1} attempt(s)"
            )
            print(f"⚠ 429 at {url}, sleeping {delay:.1f}s")
            time.sleep(delay)
            continue

        try:
            resp.raise_for_status()
        except Exception as e:
            if not 500 <= status < 600:
                # 4xx other than 429: retrying the same URL will not help.
                raise
            last_exc = e
            print(f"⚠ Error {e}, sleeping {delay:.1f}s")
            time.sleep(delay)
            continue

        return resp

    if last_exc is None:
        # Only reachable when the loop never ran (retries < 1).
        raise RuntimeError(f"No request attempted for {url} (retries={retries})")
    raise last_exc


def build_cbb_url(first, last):
    """Build the guessed Sports-Reference CBB player URL (``fn-ln-1.html`` pattern).

    Each name is lower-cased, accented letters are reduced to their base letter
    ("José" -> "jose"), every character other than a-z, whitespace and "-" is
    dropped ("O'Neal" -> "oneal"), and runs of whitespace become a single "-".
    The "-1" suffix is always used, so for players who share a name this may
    point at a different person.

    Args:
        first: Player's first name (str).
        last: Player's last name (str).

    Returns:
        The URL string under ``CBB_BASE``.
    """
    import unicodedata

    def _slug(name):
        # Decompose accented letters, then drop the combining marks so the base
        # letter survives the a-z filter instead of being deleted with it.
        # NOTE(review): assumes the site's slugs use the unaccented letter - confirm.
        decomposed = unicodedata.normalize("NFKD", name)
        base_letters = "".join(
            ch for ch in decomposed if not unicodedata.combining(ch)
        )
        cleaned = re.sub(r"[^a-z\s-]", "", base_letters.lower())
        # split() trims the ends and collapses repeated whitespace, so padded or
        # double-spaced names cannot produce leading, trailing or doubled hyphens.
        return "-".join(cleaned.split())

    return f"{CBB_BASE}/cbb/players/{_slug(first)}-{_slug(last)}-1.html"


def _extract_last_season_row(html):
    """Pick the last per-season row out of a player page.

    Returns ``(row, problem)``: a one-row DataFrame and ``None`` on success,
    otherwise ``None`` and the warning text to print.
    """
    from io import StringIO

    try:
        # Wrapped in StringIO because passing raw HTML text to read_html is
        # deprecated in recent pandas releases.
        tables = pd.read_html(StringIO(html), flavor="lxml")
    except ValueError:
        # read_html raises ValueError when the page contains no tables at all.
        tables = []

    # The first table that has a "Season" column is taken as the stats table.
    season_table = next((t for t in tables if "Season" in t.columns), None)
    if season_table is None:
        return None, "⚠ No season table found"

    # Drop repeated header rows, blank rows and the "Career" summary row.
    seasons = season_table["Season"]
    season_table = season_table[seasons.notna() & ~seasons.isin(["Season", "Career"])]
    if season_table.empty:
        return None, "⚠ No usable rows"

    return season_table.tail(1).copy(), None


def scrape_undrafted_players(df, output_dir="cbb_undrafted") -> Tuple[pd.DataFrame, pd.DataFrame]:
    """Fetch each player's last college season row from Sports-Reference.

    For every row of ``df`` the player page URL is guessed with
    ``build_cbb_url``, fetched with ``safe_get``, and the last remaining row of
    the first table that has a "Season" column is kept.

    Two CSVs are written to ``output_dir``: ``undrafted_college_stats.csv`` and
    ``undrafted_summary.csv``. They are written even when the loop is interrupted
    or fails part-way, so results gathered so far are not lost.

    Args:
        df: DataFrame with ``FIRST_NAME``, ``LAST_NAME`` and ``PLAYER_ID`` columns.
        output_dir: Directory for the CSVs; created if missing.

    Returns:
        ``(stats_df, summary_df)``. ``stats_df`` has one row per player whose
        page yielded a season row, prefixed with PLAYER_ID, FIRST_NAME, LAST_NAME
        and cbb_url. ``summary_df`` has one row per input row with PLAYER_ID,
        found and url (url is None when the request itself failed).
    """
    os.makedirs(output_dir, exist_ok=True)

    all_rows, summary = [], []
    total = len(df)

    try:
        # The context manager closes the session's connections on any exit path.
        with requests.Session() as session:
            for i, row in enumerate(df.itertuples(index=False), start=1):
                first = row.FIRST_NAME
                last = row.LAST_NAME
                pid = row.PLAYER_ID

                print(f"[{i}/{total}] {first} {last}")

                url = build_cbb_url(first, last)

                # === Safe GET with 429 protection ===
                try:
                    resp = safe_get(session, url)
                except Exception as e:
                    print(f"❌ Failed for {first} {last}: {e}")
                    summary.append({"PLAYER_ID": pid, "found": False, "url": None})
                else:
                    last_row, problem = _extract_last_season_row(resp.text)
                    if last_row is None:
                        print(problem)
                        summary.append({"PLAYER_ID": pid, "found": False, "url": url})
                    else:
                        last_row.insert(0, "PLAYER_ID", pid)
                        last_row.insert(1, "FIRST_NAME", first)
                        last_row.insert(2, "LAST_NAME", last)
                        last_row.insert(3, "cbb_url", url)

                        all_rows.append(last_row)
                        summary.append({"PLAYER_ID": pid, "found": True, "url": url})

                if i == total:
                    # Nothing left to fetch, so no pause is needed.
                    break

                # === Long polite delay ===
                # Applied after every request, including failed ones, so a run
                # of missing pages is not sent to the server back-to-back.
                sleep_time = 2.5 + random.uniform(1.0, 2.0)
                print(f"Sleeping {sleep_time:.1f}s...\n")
                time.sleep(sleep_time)

                if i % 12 == 0:
                    print("Long break to avoid 429...")
                    time.sleep(10)
    finally:
        # Runs on normal completion, on errors and on KeyboardInterrupt alike,
        # so partial results always reach disk.
        stats_df = pd.concat(all_rows, ignore_index=True) if all_rows else pd.DataFrame()
        summary_df = pd.DataFrame(summary)

        stats_df.to_csv(os.path.join(output_dir, "undrafted_college_stats.csv"), index=False)
        summary_df.to_csv(os.path.join(output_dir, "undrafted_summary.csv"), index=False)

    return stats_df, summary_df


In [7]:
# Scrape every undrafted combine row; the call returns the per-player stats
# frame and the found / not-found summary frame.
scrape_result = scrape_undrafted_players(undrafted_combine_players, output_dir="cbb_undrafted")
college_stats_undrafted, summary = scrape_result


[1/705] Justin Love
⚠ 429 at https://www.sports-reference.com/cbb/players/justin-love-1.html, sleeping 2100.0s


KeyboardInterrupt: 