In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from tqdm import tqdm

def extract_advanced_table(soup):
    table = soup.find("table", {"id": "advanced"})
    if table:
        return pd.read_html(str(table))[0]
    return None

def scrape_season(season_end_year):
    url = f"https://www.basketball-reference.com/leagues/NBA_{season_end_year}_advanced.html"
    print(f"Fetching {url}")
    res = requests.get(url)
    if res.status_code != 200:
        print(f"Error fetching {url} — Status code {res.status_code}")
        return None, None

    soup = BeautifulSoup(res.content, "html.parser")
    df = extract_advanced_table(soup)
    if df is None:
        print(f"Advanced stats table not found for {season_end_year}")
        return None, None

    season_label = f"{season_end_year - 1}-{str(season_end_year)[-2:]}"
    df["Season"] = season_label

    if "Player" in df.columns:
        league_avg = df[df["Player"] == "League Average"].copy()
        players = df[df["Player"] != "League Average"].copy()
    else:
        league_avg = pd.DataFrame()
        players = df

    return players, league_avg

def scrape_all_seasons(start=2001, end=2025):
    all_players = []
    all_avgs = []

    for year in tqdm(range(start, end + 1)):
        players_df, avg_df = scrape_season(year)
        if players_df is not None:
            all_players.append(players_df)
        if avg_df is not None and not avg_df.empty:
            all_avgs.append(avg_df)

    return pd.concat(all_players, ignore_index=True), pd.concat(all_avgs, ignore_index=True)

player_df, league_avg_df = scrape_all_seasons(2001, 2025)
player_df.to_csv("nba_advanced_stats_2001_2025_players.csv", index=False)
league_avg_df.to_csv("nba_advanced_stats_2001_2025_league_averages.csv", index=False)
print("Saved:")
print(" - Player data → nba_advanced_stats_2001_2025_players.csv")
print(" - League averages → nba_advanced_stats_2001_2025_league_averages.csv")


  0%|                                                                                           | 0/25 [00:00<?, ?it/s]

Fetching https://www.basketball-reference.com/leagues/NBA_2001_advanced.html


  4%|███▎                                                                               | 1/25 [00:06<02:44,  6.86s/it]

Fetching https://www.basketball-reference.com/leagues/NBA_2002_advanced.html


  8%|██████▋                                                                            | 2/25 [00:12<02:25,  6.32s/it]

Fetching https://www.basketball-reference.com/leagues/NBA_2003_advanced.html


  8%|██████▋                                                                            | 2/25 [00:16<03:04,  8.03s/it]


KeyboardInterrupt: 

In [None]:
def extract_rookie_table(soup):
    table = soup.find("table", {"id": "rookies"})
    if table:
        return pd.read_html(str(table))[0]
    return None

def scrape_rookie_season(season_end_year):
    url = f"https://www.basketball-reference.com/leagues/NBA_{season_end_year}_rookies.html"
    print(f"Fetching {url}")
    res = requests.get(url)
    if res.status_code != 200:
        print(f"Error fetching {url} — Status code {res.status_code}")
        return None

    soup = BeautifulSoup(res.content, "html.parser")
    df = extract_rookie_table(soup)
    if df is not None:
        df["Season"] = f"{season_end_year - 1}-{str(season_end_year)[-2:]}"
        return df
    else:
        print(f"Rookie table not found for {season_end_year}")
        return None

def scrape_all_rookie_seasons(start=2001, end=2025):
    all_dfs = []
    for year in tqdm(range(start, end + 1)):
        df = scrape_rookie_season(year)
        if df is not None:
            all_dfs.append(df)
    return pd.concat(all_dfs, ignore_index=True)

df_all = scrape_all_rookie_seasons(2001, 2025)
df_all.to_csv("nba_rookie_stats_2001_2025.csv", index=False)
print("Saved to nba_rookie_stats_2001_2025.csv")


In [5]:
df_raw = pd.read_csv("nba_rookie_stats_2001_2025.csv", header=None)

df_raw.columns = df_raw.iloc[1]
df = df_raw.iloc[2:].copy()

df = df[(df["Rk"] != "Rk") & (df["G"] != "Totals")].copy()

last_col = df.columns[-1]
df = df.rename(columns={last_col: "Season"})

df_minimal = df[["Player", "Season"]].copy()

df_minimal.to_csv("nba_rookie_names_and_seasons.csv", index=False)

print("Cleaned file saved as nba_rookie_names_and_seasons.csv")

✅ Cleaned file saved as nba_rookie_names_and_seasons.csv
