### 1. Get NBA Draft Data

In [13]:
import pandas as pd
from pathlib import Path

current_dir = Path.cwd()
draft_dir = current_dir.parent /"data"/ "drafts"

files = list(draft_dir.glob("draft_20*.csv"))

draft_df = pd.DataFrame()
for file in files:
    curr_df = pd.read_csv(file)
    draft_df = pd.concat([draft_df, curr_df])

draft_df = draft_df.sort_values(by=["SEASON", "OVERALL_PICK"]) 
print(draft_df.shape)
draft_df.head()

(1542, 14)


Unnamed: 0,PERSON_ID,PLAYER_NAME,SEASON,ROUND_NUMBER,ROUND_PICK,OVERALL_PICK,DRAFT_TYPE,TEAM_ID,TEAM_CITY,TEAM_NAME,TEAM_ABBREVIATION,ORGANIZATION,ORGANIZATION_TYPE,PLAYER_PROFILE_FLAG
0,2030,Kenyon Martin,2000,1,1,1,Draft,1610612751,New Jersey,Nets,NJN,Cincinnati,College/University,1
1,2031,Stromile Swift,2000,1,2,2,Draft,1610612763,Vancouver,Grizzlies,VAN,Louisiana State,College/University,1
2,2032,Darius Miles,2000,1,3,3,Draft,1610612746,Los Angeles,Clippers,LAC,East St. Louis,High School,1
3,2033,Marcus Fizer,2000,1,4,4,Draft,1610612741,Chicago,Bulls,CHI,Iowa State,College/University,1
4,2034,Mike Miller,2000,1,5,5,Draft,1610612753,Orlando,Magic,ORL,Florida,College/University,1


### 2. Get NBA Combine Data

In [14]:
import pandas as pd
from pathlib import Path

current_dir = Path.cwd()
draft_combine_dir = current_dir.parent /"data"/ "draft_combine"

files = list(draft_combine_dir.glob("draft_combine_20*.csv"))

draft_combine_df = pd.DataFrame()
for file in files:
    curr_df = pd.read_csv(file)
    draft_combine_df = pd.concat([draft_combine_df, curr_df])

draft_combine_df = draft_combine_df.sort_values(by=["SEASON"])
print(draft_combine_df.shape)
draft_combine_df.head()

(1795, 47)


Unnamed: 0,SEASON,PLAYER_ID,FIRST_NAME,LAST_NAME,PLAYER_NAME,POSITION,HEIGHT_WO_SHOES,HEIGHT_WO_SHOES_FT_IN,HEIGHT_W_SHOES,HEIGHT_W_SHOES_FT_IN,...,SPOT_NBA_BREAK_RIGHT,SPOT_NBA_CORNER_RIGHT,OFF_DRIB_FIFTEEN_BREAK_LEFT,OFF_DRIB_FIFTEEN_TOP_KEY,OFF_DRIB_FIFTEEN_BREAK_RIGHT,OFF_DRIB_COLLEGE_BREAK_LEFT,OFF_DRIB_COLLEGE_TOP_KEY,OFF_DRIB_COLLEGE_BREAK_RIGHT,ON_MOVE_FIFTEEN,ON_MOVE_COLLEGE
10,2000,2037,Jamal,Crawford,Jamal Crawford,PG-SG,76.5,6' 4.5'',,,...,,,,,,,,,,
37,2000,2058,Mark,Madsen,Mark Madsen,PF,80.0,6' 8'',,,...,,,,,,,,,,
36,2000,2090,Justin,Love,Justin Love,PG-SG,73.5,6' 1.5'',,,...,,,,,,,,,,
35,2000,2061,Dan,Langhi,Dan Langhi,SF,81.0,6' 9'',,,...,,,,,,,,,,
34,2000,12144,Brandon,Kurtz,Brandon Kurtz,PF-C,81.5,6' 9.5'',,,...,,,,,,,,,,


### 3. Get Undrafted Players Data

In [15]:
# Make sure column types match
draft_df["PERSON_ID"] = draft_df["PERSON_ID"].astype(str)
draft_combine_df["PLAYER_ID"] = draft_combine_df["PLAYER_ID"].astype(str)

# Get the sets
drafted_ids = set(draft_df["PERSON_ID"].unique())
combine_ids = set(draft_combine_df["PLAYER_ID"].unique())

# Players who attended combine but were *not* drafted
undrafted_combine_ids = combine_ids - drafted_ids

# Filter full player rows from combine dataframe
undrafted_combine_players = draft_combine_df[
    draft_combine_df["PLAYER_ID"].isin(undrafted_combine_ids)
].copy()


print(f"Numer of Undrafted players: ", len(undrafted_combine_ids))
undrafted_combine_players.head()

Numer of Undrafted players:  678


Unnamed: 0,SEASON,PLAYER_ID,FIRST_NAME,LAST_NAME,PLAYER_NAME,POSITION,HEIGHT_WO_SHOES,HEIGHT_WO_SHOES_FT_IN,HEIGHT_W_SHOES,HEIGHT_W_SHOES_FT_IN,...,SPOT_NBA_BREAK_RIGHT,SPOT_NBA_CORNER_RIGHT,OFF_DRIB_FIFTEEN_BREAK_LEFT,OFF_DRIB_FIFTEEN_TOP_KEY,OFF_DRIB_FIFTEEN_BREAK_RIGHT,OFF_DRIB_COLLEGE_BREAK_LEFT,OFF_DRIB_COLLEGE_TOP_KEY,OFF_DRIB_COLLEGE_BREAK_RIGHT,ON_MOVE_FIFTEEN,ON_MOVE_COLLEGE
36,2000,2090,Justin,Love,Justin Love,PG-SG,73.5,6' 1.5'',,,...,,,,,,,,,,
34,2000,12144,Brandon,Kurtz,Brandon Kurtz,PF-C,81.5,6' 9.5'',,,...,,,,,,,,,,
32,2000,12143,Kenyon,Jones,Kenyon Jones,PF-C,81.0,6' 9'',,,...,,,,,,,,,,
31,2000,12142,Nate,Johnson,Nate Johnson,SF,78.5,6' 6.5'',,,...,,,,,,,,,,
29,2000,12141,Jacob,Jaacks,Jacob Jaacks,PF,81.25,6' 9.25'',,,...,,,,,,,,,,


### 4.Scrap from basketball-reference

In [16]:
import os
import re
import time
import random
import requests
import pandas as pd
from typing import Tuple
from urllib.parse import urlparse

CBB_BASE = "https://www.sports-reference.com"

HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36"
    )
}

def safe_get(session, url, *, retries=6, backoff=2.0, timeout=30):
    """Fully 429-compliant GET with exponential backoff + retry-after."""
    last_exc = None

    for i in range(retries):
        try:
            resp = session.get(url, headers=HEADERS, timeout=timeout)

            if resp.status_code == 429:
                retry_after = resp.headers.get("Retry-After")
                try:
                    delay = int(retry_after)
                except:
                    delay = backoff * (2 ** i) + random.uniform(0.5, 1.5)

                print(f"⚠ 429 at {url}, sleeping {delay:.1f}s")
                time.sleep(delay)
                continue

            resp.raise_for_status()
            return resp

        except Exception as e:
            last_exc = e
            delay = backoff * (2 ** i) + random.uniform(0.5, 1.5)
            print(f"⚠ Error {e}, sleeping {delay:.1f}s")
            time.sleep(delay)

    raise last_exc


def build_cbb_url(first, last):
    """Sports-Reference CBB uses fn-ln-1.html pattern."""
    fn = re.sub(r"[^a-z\s-]", "", first.lower()).replace(" ", "-")
    ln = re.sub(r"[^a-z\s-]", "", last.lower()).replace(" ", "-")
    return f"{CBB_BASE}/cbb/players/{fn}-{ln}-1.html"


def scrape_undrafted_players(df, output_dir="cbb_undrafted") -> Tuple[pd.DataFrame, pd.DataFrame]:

    os.makedirs(output_dir, exist_ok=True)

    session = requests.Session()
    all_rows, summary = [], []

    for i, row in enumerate(df.itertuples(index=False), start=1):
        first = row.FIRST_NAME
        last = row.LAST_NAME
        pid = row.PLAYER_ID

        print(f"[{i}/{len(df)}] {first} {last}")

        url = build_cbb_url(first, last)

        # === Safe GET with 429 protection ===
        try:
            resp = safe_get(session, url)
        except Exception as e:
            print(f"❌ Failed for {first} {last}: {e}")
            summary.append({"PLAYER_ID": pid, "found": False, "url": None})
            continue

        tables = pd.read_html(resp.text, flavor="lxml")

        # find season table
        season_table = None
        for t in tables:
            if "Season" in t.columns:
                season_table = t
                break

        if season_table is None:
            print("⚠ No season table found")
            summary.append({"PLAYER_ID": pid, "found": False, "url": url})
            continue

        # clean
        season_table = season_table[season_table["Season"] != "Season"]
        season_table = season_table[season_table["Season"].notna()]
        season_table = season_table[season_table["Season"] != "Career"]

        if season_table.empty:
            print("⚠ No usable rows")
            summary.append({"PLAYER_ID": pid, "found": False, "url": url})
            continue

        last_row = season_table.tail(1).copy()
        last_row.insert(0, "PLAYER_ID", pid)
        last_row.insert(1, "FIRST_NAME", first)
        last_row.insert(2, "LAST_NAME", last)
        last_row.insert(3, "cbb_url", url)

        all_rows.append(last_row)
        summary.append({"PLAYER_ID": pid, "found": True, "url": url})

        # === Long polite delay ===
        sleep_time = 2.5 + random.uniform(1.0, 2.0)
        print(f"Sleeping {sleep_time:.1f}s...\n")
        time.sleep(sleep_time)

        if i % 12 == 0:
            print("Long break to avoid 429...")
            time.sleep(10)

    stats_df = pd.concat(all_rows, ignore_index=True) if all_rows else pd.DataFrame()
    summary_df = pd.DataFrame(summary)

    stats_df.to_csv(os.path.join(output_dir, "undrafted_college_stats.csv"), index=False)
    summary_df.to_csv(os.path.join(output_dir, "undrafted_summary.csv"), index=False)

    return stats_df, summary_df


In [18]:
import os
import re
import time
import random
import logging
from typing import Tuple, Optional

import requests
import pandas as pd

# ==========================
# Config
# ==========================

CBB_BASE = "https://www.sports-reference.com"

HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36"
    )
}

# Configure logging (adjust level to DEBUG if you want more detail)
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(message)s",
)


# ==========================
# HTTP Helpers
# ==========================

def safe_get(
    session: requests.Session,
    url: str,
    *,
    retries: int = 6,
    backoff: float = 2.0,
    timeout: int = 30,
) -> Optional[requests.Response]:
    """
    GET with exponential backoff, 429 handling, and 404 skip.

    Returns
    -------
    Response or None
        - Response: if 2xx or other status (that didn't trigger retry) succeeds
        - None: if 404 Not Found
    Raises
    ------
    last exception after `retries` attempts (for non-404 failures).
    """
    last_exc = None

    for i in range(retries):
        try:
            resp = session.get(url, headers=HEADERS, timeout=timeout)

            # Permanent: page does not exist → skip
            if resp.status_code == 404:
                logging.warning(f"404 Not Found: {url} — skipping this player.")
                return None

            # Rate limiting: 429 Too Many Requests → obey Retry-After or backoff
            if resp.status_code == 429:
                retry_after = resp.headers.get("Retry-After")
                try:
                    delay = int(retry_after)
                except Exception:
                    delay = backoff * (2 ** i) + random.uniform(0.5, 1.5)

                logging.warning(f"429 Too Many Requests at {url}, sleeping {delay:.1f}s")
                time.sleep(delay)
                continue

            # Raise for other HTTP errors (5xx, 4xx excluding 404/429)
            resp.raise_for_status()
            return resp

        except Exception as e:
            last_exc = e
            delay = backoff * (2 ** i) + random.uniform(0.5, 1.5)
            logging.warning(f"Error {e} while fetching {url}, sleeping {delay:.1f}s")
            time.sleep(delay)

    # Exhausted retries
    if last_exc is not None:
        raise last_exc
    return None


# ==========================
# URL Builder
# ==========================

def build_cbb_url(first: str, last: str) -> str:
    """
    Sports-Reference CBB uses `fn-ln-1.html` pattern.

    Example: LeBron James → lebron-james-1.html
    """
    fn = re.sub(r"[^a-z\s-]", "", first.lower()).replace(" ", "-")
    ln = re.sub(r"[^a-z\s-]", "", last.lower()).replace(" ", "-")
    return f"{CBB_BASE}/cbb/players/{fn}-{ln}-1.html"


# ==========================
# Resume Helpers
# ==========================

def _load_existing_data(output_dir: str):
    """
    If files already exist, load them so we can resume.
    Returns (stats_df, summary_df), where either can be empty DataFrame.
    """
    stats_path = os.path.join(output_dir, "undrafted_college_stats.csv")
    summary_path = os.path.join(output_dir, "undrafted_summary.csv")

    if os.path.exists(stats_path):
        stats_df = pd.read_csv(stats_path)
        logging.info(f"Loaded existing stats from {stats_path} (rows={len(stats_df)})")
    else:
        stats_df = pd.DataFrame()

    if os.path.exists(summary_path):
        summary_df = pd.read_csv(summary_path)
        logging.info(f"Loaded existing summary from {summary_path} (rows={len(summary_df)})")
    else:
        summary_df = pd.DataFrame()

    return stats_df, summary_df


def _save_data(
    stats_df: pd.DataFrame,
    summary_df: pd.DataFrame,
    output_dir: str,
) -> None:
    """Save both stats and summary to disk."""
    os.makedirs(output_dir, exist_ok=True)
    stats_path = os.path.join(output_dir, "undrafted_college_stats.csv")
    summary_path = os.path.join(output_dir, "undrafted_summary.csv")

    stats_df.to_csv(stats_path, index=False)
    summary_df.to_csv(summary_path, index=False)

    logging.info(
        f"Saved stats (rows={len(stats_df)}) to {stats_path} "
        f"and summary (rows={len(summary_df)}) to {summary_path}"
    )


# ==========================
# Main Scraper
# ==========================

def scrape_undrafted_players(
    df: pd.DataFrame,
    output_dir: str = "cbb_undrafted",
    *,
    save_every: int = 50,
    resume: bool = True,
    long_break_every: int = 12,
    long_break_seconds: int = 10,
) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """
    Scrape last-season college stats for undrafted players from Sports-Reference CBB.

    Parameters
    ----------
    df : DataFrame
        Must contain columns: PLAYER_ID, FIRST_NAME, LAST_NAME
    output_dir : str
        Directory to save CSVs.
    save_every : int
        Autosave after this many processed players (including skips).
    resume : bool
        If True, will load existing CSVs and skip players already marked found=True.
    long_break_every : int
        After this many players, take a longer sleep to avoid 429.
    long_break_seconds : int
        Duration of the long sleep.

    Returns
    -------
    (stats_df, summary_df) : Tuple[DataFrame, DataFrame]
        Final in-memory copies of stats and summary.
    """
    os.makedirs(output_dir, exist_ok=True)
    session = requests.Session()

    # Load existing data (for resume)
    existing_stats_df, existing_summary_df = _load_existing_data(output_dir)

    # Tracks new rows collected in this run
    new_stats_rows = []
    new_summary_rows = []

    # Determine which player_ids are already successfully scraped
    if resume and not existing_summary_df.empty and "PLAYER_ID" in existing_summary_df.columns:
        completed_ids = set(
            existing_summary_df.loc[
                existing_summary_df.get("found", False) == True,  # noqa: E712
                "PLAYER_ID",
            ].tolist()
        )
        logging.info(f"Resuming. Already completed players: {len(completed_ids)}")
    else:
        completed_ids = set()

    total_players = len(df)
    processed_since_save = 0

    for i, row in enumerate(df.itertuples(index=False), start=1):
        pid = row.PLAYER_ID
        first = row.FIRST_NAME
        last = row.LAST_NAME

        # Resume: skip if already scraped successfully
        if resume and pid in completed_ids:
            logging.info(f"[{i}/{total_players}] {first} {last} (PLAYER_ID={pid}) already scraped — skipping.")
            continue

        logging.info(f"[{i}/{total_players}] Scraping {first} {last} (PLAYER_ID={pid})")

        url = build_cbb_url(first, last)

        try:
            resp = safe_get(session, url)
        except Exception as e:
            logging.error(f"Failed for {first} {last} ({pid}) at {url}: {e}")
            new_summary_rows.append(
                {
                    "PLAYER_ID": pid,
                    "FIRST_NAME": first,
                    "LAST_NAME": last,
                    "found": False,
                    "url": url,
                    "error": str(e),
                }
            )
            processed_since_save += 1
            # small polite delay even on error
            time.sleep(1.0 + random.uniform(0.5, 1.0))
            # periodic save
            if processed_since_save >= save_every:
                logging.info("Autosaving due to processed_since_save >= save_every (error path).")
                existing_stats_df, existing_summary_df, processed_since_save = _merge_and_save(
                    existing_stats_df,
                    existing_summary_df,
                    new_stats_rows,
                    new_summary_rows,
                    output_dir,
                )
                new_stats_rows, new_summary_rows = [], []
            continue

        # 404 (None) → skip
        if resp is None:
            new_summary_rows.append(
                {
                    "PLAYER_ID": pid,
                    "FIRST_NAME": first,
                    "LAST_NAME": last,
                    "found": False,
                    "url": None,
                    "error": "404 Not Found",
                }
            )
            logging.info(f"No page for {first} {last} ({pid}), recorded as not found.")
            processed_since_save += 1
            time.sleep(1.0 + random.uniform(0.5, 1.0))
            if processed_since_save >= save_every:
                logging.info("Autosaving due to processed_since_save >= save_every (404 path).")
                existing_stats_df, existing_summary_df, processed_since_save = _merge_and_save(
                    existing_stats_df,
                    existing_summary_df,
                    new_stats_rows,
                    new_summary_rows,
                    output_dir,
                )
                new_stats_rows, new_summary_rows = [], []
            continue

        # Parse HTML tables
        try:
            tables = pd.read_html(resp.text, flavor="lxml")
        except ValueError as e:
            logging.warning(f"No tables found for {first} {last} ({pid}) at {url}: {e}")
            new_summary_rows.append(
                {
                    "PLAYER_ID": pid,
                    "FIRST_NAME": first,
                    "LAST_NAME": last,
                    "found": False,
                    "url": url,
                    "error": "no HTML tables",
                }
            )
            processed_since_save += 1
            time.sleep(1.0 + random.uniform(0.5, 1.0))
            if processed_since_save >= save_every:
                logging.info("Autosaving due to processed_since_save >= save_every (no-table path).")
                existing_stats_df, existing_summary_df, processed_since_save = _merge_and_save(
                    existing_stats_df,
                    existing_summary_df,
                    new_stats_rows,
                    new_summary_rows,
                    output_dir,
                )
                new_stats_rows, new_summary_rows = [], []
            continue

        # Find season table (table containing "Season" column)
        season_table = None
        for t in tables:
            if "Season" in t.columns:
                season_table = t
                break

        if season_table is None:
            logging.warning(f"No season table found for {first} {last} ({pid}) at {url}")
            new_summary_rows.append(
                {
                    "PLAYER_ID": pid,
                    "FIRST_NAME": first,
                    "LAST_NAME": last,
                    "found": False,
                    "url": url,
                    "error": "no Season table",
                }
            )
            processed_since_save += 1
            time.sleep(1.0 + random.uniform(0.5, 1.0))
            if processed_since_save >= save_every:
                logging.info("Autosaving due to processed_since_save >= save_every (no-season-table path).")
                existing_stats_df, existing_summary_df, processed_since_save = _merge_and_save(
                    existing_stats_df,
                    existing_summary_df,
                    new_stats_rows,
                    new_summary_rows,
                    output_dir,
                )
                new_stats_rows, new_summary_rows = [], []
            continue

        # Clean season table: drop header repeat rows, 'Career', NaNs
        season_table = season_table.copy()
        season_table = season_table[season_table["Season"].notna()]
        season_table = season_table[~season_table["Season"].isin(["Season", "Career"])]

        if season_table.empty:
            logging.warning(f"No usable rows in season table for {first} {last} ({pid}) at {url}")
            new_summary_rows.append(
                {
                    "PLAYER_ID": pid,
                    "FIRST_NAME": first,
                    "LAST_NAME": last,
                    "found": False,
                    "url": url,
                    "error": "empty season table",
                }
            )
            processed_since_save += 1
            time.sleep(1.0 + random.uniform(0.5, 1.0))
            if processed_since_save >= save_every:
                logging.info("Autosaving due to processed_since_save >= save_every (empty-season path).")
                existing_stats_df, existing_summary_df, processed_since_save = _merge_and_save(
                    existing_stats_df,
                    existing_summary_df,
                    new_stats_rows,
                    new_summary_rows,
                    output_dir,
                )
                new_stats_rows, new_summary_rows = [], []
            continue

        # Take last row (last season)
        last_row = season_table.tail(1).copy()
        last_row.insert(0, "PLAYER_ID", pid)
        last_row.insert(1, "FIRST_NAME", first)
        last_row.insert(2, "LAST_NAME", last)
        last_row.insert(3, "cbb_url", url)

        new_stats_rows.append(last_row)
        new_summary_rows.append(
            {
                "PLAYER_ID": pid,
                "FIRST_NAME": first,
                "LAST_NAME": last,
                "found": True,
                "url": url,
                "error": "",
            }
        )

        # Polite delay between requests
        sleep_time = 2.5 + random.uniform(1.0, 2.0)
        logging.info(f"Sleeping {sleep_time:.1f}s after {first} {last}")
        time.sleep(sleep_time)

        # Longer break every N players
        if long_break_every > 0 and i % long_break_every == 0:
            logging.info(f"Long break after {i} players to avoid 429...")
            time.sleep(long_break_seconds)

        processed_since_save += 1

        # Autosave periodically
        if processed_since_save >= save_every:
            logging.info("Autosaving due to processed_since_save >= save_every (normal path).")
            existing_stats_df, existing_summary_df, processed_since_save = _merge_and_save(
                existing_stats_df,
                existing_summary_df,
                new_stats_rows,
                new_summary_rows,
                output_dir,
            )
            new_stats_rows, new_summary_rows = [], []

    # Final save for any remaining rows
    if new_stats_rows or new_summary_rows:
        logging.info("Final save of remaining rows at end of run.")
        existing_stats_df, existing_summary_df, processed_since_save = _merge_and_save(
            existing_stats_df,
            existing_summary_df,
            new_stats_rows,
            new_summary_rows,
            output_dir,
        )

    return existing_stats_df, existing_summary_df


def _merge_and_save(
    existing_stats_df: pd.DataFrame,
    existing_summary_df: pd.DataFrame,
    new_stats_rows: list,
    new_summary_rows: list,
    output_dir: str,
):
    """
    Merge new rows into existing DataFrames and save to disk.
    Returns updated stats_df, summary_df, and resets processed_since_save.
    """
    if new_stats_rows:
        new_stats_df = pd.concat(new_stats_rows, ignore_index=True)
        if existing_stats_df.empty:
            combined_stats_df = new_stats_df
        else:
            combined_stats_df = pd.concat(
                [existing_stats_df, new_stats_df],
                ignore_index=True,
            ).drop_duplicates(subset=["PLAYER_ID", "Season"], keep="last")
    else:
        combined_stats_df = existing_stats_df

    if new_summary_rows:
        new_summary_df = pd.DataFrame(new_summary_rows)
        if existing_summary_df.empty:
            combined_summary_df = new_summary_df
        else:
            combined_summary_df = pd.concat(
                [existing_summary_df, new_summary_df],
                ignore_index=True,
            ).drop_duplicates(subset=["PLAYER_ID"], keep="last")
    else:
        combined_summary_df = existing_summary_df

    _save_data(combined_stats_df, combined_summary_df, output_dir)

    # Reset processed counter
    processed_since_save = 0
    return combined_stats_df, combined_summary_df, processed_since_save


# ==========================
# Optional CLI Example
# ==========================

if __name__ == "__main__":
    """
    Example usage:

    Suppose you already have a DataFrame `df_players` with columns:
        - PLAYER_ID
        - FIRST_NAME
        - LAST_NAME

    You might load it from CSV like this:
        df_players = pd.read_csv("undrafted_players.csv")

    Then run:
        scrape_undrafted_players(df_players, output_dir="cbb_undrafted")
    """
    print("This module is intended to be imported and used from another script.")


This module is intended to be imported and used from another script.


In [None]:
college_stats_undrafted, summary = scrape_undrafted_players(
    undrafted_combine_players,
    output_dir="ndrafted"
)


2025-12-09 15:09:25,044 [INFO] [1/705] Scraping Justin Love (PLAYER_ID=2090)
  tables = pd.read_html(resp.text, flavor="lxml")
2025-12-09 15:09:25,126 [INFO] Sleeping 4.2s after Justin Love
2025-12-09 15:09:29,292 [INFO] [2/705] Scraping Brandon Kurtz (PLAYER_ID=12144)
  tables = pd.read_html(resp.text, flavor="lxml")
2025-12-09 15:09:29,342 [INFO] Sleeping 4.0s after Brandon Kurtz
2025-12-09 15:09:33,305 [INFO] [3/705] Scraping Kenyon Jones (PLAYER_ID=12143)
  tables = pd.read_html(resp.text, flavor="lxml")
2025-12-09 15:09:33,355 [INFO] Sleeping 3.6s after Kenyon Jones
2025-12-09 15:09:36,932 [INFO] [4/705] Scraping Nate Johnson (PLAYER_ID=12142)
  tables = pd.read_html(resp.text, flavor="lxml")
2025-12-09 15:09:36,993 [INFO] Sleeping 3.7s after Nate Johnson
2025-12-09 15:09:40,686 [INFO] [5/705] Scraping Jacob Jaacks (PLAYER_ID=12141)
  tables = pd.read_html(resp.text, flavor="lxml")
2025-12-09 15:09:40,737 [INFO] Sleeping 4.0s after Jacob Jaacks
2025-12-09 15:09:44,767 [INFO] [6/70