In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import os

import time  # imported here because only this script section needs it (request pacing)

# Season end years: 2021 stands for the 2020-21 season, ..., 2025 for 2024-25.
years = list(range(2021, 2026))

# Seconds to wait between page downloads. Back-to-back requests get throttled by
# the site: a later cell in this notebook recorded "HTTP Error 429: Too Many
# Requests" when fetching league pages from it without pausing.
REQUEST_DELAY_SECONDS = 4

dfs = []

for position, year in enumerate(years):
    if position:
        time.sleep(REQUEST_DELAY_SECONDS)  # no wait needed before the first request

    url = f"https://www.basketball-reference.com/leagues/NBA_{year}_per_game.html"
    df = pd.read_html(url)[0]  # first table found on the page
    # Drop rows whose "Player" cell is the literal text "Player"
    # (header text repeated inside the table body).
    df = df[df["Player"] != "Player"]
    lebron = df[df["Player"] == "LeBron James"].copy()
    lebron["Season"] = f"{year-1}-{str(year)[-2:]}"  # e.g. 2021 -> "2020-21"
    dfs.append(lebron)

# Combine all seasons into one frame with a fresh 0..n-1 index
all_lebron = pd.concat(dfs, ignore_index=True)

# Convert the PTS column to numeric; values that cannot be parsed become NaN
all_lebron["PTS"] = pd.to_numeric(all_lebron["PTS"], errors="coerce")

# Create output folder
os.makedirs("output", exist_ok=True)

# Plot: points per game over seasons
fig, ax = plt.subplots(figsize=(8, 5))
ax.plot(all_lebron["Season"], all_lebron["PTS"], marker="o", linewidth=2)
ax.set_title("LeBron James Points Per Game by Season (2020–2024)")
ax.set_xlabel("Season")
ax.set_ylabel("Points Per Game (PPG)")
ax.grid(True)
fig.tight_layout()
fig.savefig("output/lebron_ppg.png")
plt.show()

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import os

import time  # imported here because only this script section needs it (request pacing)

# Player names to compare
players = ["LeBron James","Kevin Durant","Stephen Curry"]
# Season end years: 2021 stands for the 2020-21 season, ..., 2025 for 2024-25.
years = list(range(2021, 2026))

# Seconds to wait between page downloads. Back-to-back requests get throttled by
# the site: a later cell in this notebook recorded "HTTP Error 429: Too Many
# Requests" when fetching league pages from it without pausing.
REQUEST_DELAY_SECONDS = 4

all_data = []

for position, year in enumerate(years):
    if position:
        time.sleep(REQUEST_DELAY_SECONDS)  # no wait needed before the first request

    url = f"https://www.basketball-reference.com/leagues/NBA_{year}_per_game.html"
    df = pd.read_html(url)[0]
    df = df[df["Player"] != "Player"]  # drop repeated header rows
    season = f"{year-1}-{str(year)[-2:]}"  # e.g. 2021 -> "2020-21"

    for player in players:
        player_rows = df[df["Player"] == player]
        if player_rows.empty:
            continue  # player has no row in this season's table

        # Prefer the row whose Team is "2TM" (presumably the combined line for
        # a player with two teams in one season); otherwise keep only the
        # first row.
        two_team_rows = player_rows[player_rows["Team"] == "2TM"]
        chosen = two_team_rows if not two_team_rows.empty else player_rows.iloc[[0]]

        pdata = chosen.copy()
        pdata["Season"] = season
        pdata["Player"] = player
        all_data.append(pdata)


# Combine every player-season into one frame with a fresh 0..n-1 index
df_all = pd.concat(all_data, ignore_index=True)

# Convert PTS to numeric; values that cannot be parsed become NaN
df_all["PTS"] = pd.to_numeric(df_all["PTS"], errors="coerce")

# Create the output directory
os.makedirs("output", exist_ok=True)

# Plot one line per player
fig, ax = plt.subplots(figsize=(9, 6))

for player in players:
    sub = df_all[df_all["Player"] == player]
    ax.plot(sub["Season"], sub["PTS"], marker="o", label=player)

ax.set_title("Points Per Game (2021–2025)")
ax.set_xlabel("Season")
ax.set_ylabel("Points Per Game (PPG)")
ax.legend()
ax.grid(True)
fig.tight_layout()
fig.savefig("output/ppg_comparison.png")
plt.show()

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import os

import time  # imported here because only this script section needs it (request pacing)

# Player names to compare
players = ["LeBron James","Kevin Durant","Stephen Curry","Nikola Jokić"]
# Season end years: 2021 stands for the 2020-21 season, ..., 2025 for 2024-25.
years = list(range(2021, 2026))

# Seconds to wait between page downloads. Back-to-back requests get throttled by
# the site: a later cell in this notebook recorded "HTTP Error 429: Too Many
# Requests" when fetching league pages from it without pausing.
REQUEST_DELAY_SECONDS = 4

all_data = []

for position, year in enumerate(years):
    if position:
        time.sleep(REQUEST_DELAY_SECONDS)  # no wait needed before the first request

    url = f"https://www.basketball-reference.com/leagues/NBA_{year}_advanced.html"
    df = pd.read_html(url)[0]
    df = df[df["Player"] != "Player"]  # drop repeated header rows
    season = f"{year-1}-{str(year)[-2:]}"  # e.g. 2021 -> "2020-21"

    for player in players:
        player_rows = df[df["Player"] == player]
        if player_rows.empty:
            continue  # player has no row in this season's table

        # Prefer the row whose Team is "2TM" (presumably the combined line for
        # a player with two teams in one season); otherwise keep only the
        # first row.
        two_team_rows = player_rows[player_rows["Team"] == "2TM"]
        chosen = two_team_rows if not two_team_rows.empty else player_rows.iloc[[0]]

        pdata = chosen.copy()
        pdata["Season"] = season
        pdata["Player"] = player
        all_data.append(pdata)


# Combine every player-season into one frame with a fresh 0..n-1 index
df_all = pd.concat(all_data, ignore_index=True)

# Convert BPM to numeric; values that cannot be parsed become NaN
df_all["BPM"] = pd.to_numeric(df_all["BPM"], errors="coerce")

# Create the output directory
os.makedirs("output", exist_ok=True)

# Plot one line per player
fig, ax = plt.subplots(figsize=(9, 6))

for player in players:
    sub = df_all[df_all["Player"] == player]
    ax.plot(sub["Season"], sub["BPM"], marker="o", label=player)

ax.set_title("Advanced Stats (2021–2025)")
ax.set_xlabel("Season")
ax.set_ylabel("BPM")
ax.legend()
ax.grid(True)
fig.tight_layout()
fig.savefig("output/BPM_comparison.png")
plt.show()

In [1]:
import pandas as pd
import os

import time  # imported here because only this script section needs it (request pacing)

# Player list and range of season end years (2016 stands for the 2015-16 season)
players = ["LeBron James", "Kevin Durant", "Stephen Curry"]
years = list(range(2016, 2026))

# Seconds to wait before each page download. This cell issues two requests per
# season; fired back-to-back they were rejected by the site with
# "HTTP Error 429: Too Many Requests" (see the error recorded in this
# notebook's output).
REQUEST_DELAY_SECONDS = 4

# Stat columns copied from each table, in the order they appear in the CSV
PER_GAME_COLS = ["PTS", "TRB", "AST", "STL", "BLK", "TOV"]
ADVANCED_COLS = ["BPM", "PER", "WS"]


def _fetch_league_table(year, kind):
    """Download the league-wide ``kind`` table ("per_game" or "advanced") for ``year``.

    Waits REQUEST_DELAY_SECONDS first, then returns the first table on the page
    with the repeated header rows (Player == "Player") removed.
    """
    time.sleep(REQUEST_DELAY_SECONDS)
    url = f"https://www.basketball-reference.com/leagues/NBA_{year}_{kind}.html"
    table = pd.read_html(url)[0]
    return table[table["Player"] != "Player"]


def _pick_season_row(rows):
    """Return the "2TM" row of ``rows`` if there is one, otherwise the first row.

    ``rows`` must be non-empty. The "2TM" row is presumably the combined line
    for a player with two teams in one season.
    """
    combined = rows[rows["Team"] == "2TM"]
    return (combined if not combined.empty else rows).iloc[0]


# One plain dict per player-season; turned into a DataFrame once at the end
# instead of concatenating many one-row frames.
records = []

for year in years:
    print(f"Processing {year}...")

    # --- 1. per_game table ---
    df_pg = _fetch_league_table(year, "per_game")

    # --- 2. advanced table ---
    df_adv = _fetch_league_table(year, "advanced")

    season = f"{year-1}-{str(year)[-2:]}"  # e.g. 2016 -> "2015-16"

    for player in players:
        pg_rows = df_pg[df_pg["Player"] == player]
        adv_rows = df_adv[df_adv["Player"] == player]
        if pg_rows.empty or adv_rows.empty:
            continue  # need the player in both tables to build a full record

        pg = _pick_season_row(pg_rows)
        adv = _pick_season_row(adv_rows)

        # Merge the two sources into one record
        record = {"Player": player, "Season": season, "Team": pg["Team"]}
        for col in PER_GAME_COLS:
            record[col] = pd.to_numeric(pg[col], errors="coerce")
        for col in ADVANCED_COLS:
            record[col] = pd.to_numeric(adv[col], errors="coerce")
        records.append(record)

# Combine everything and export
os.makedirs("output", exist_ok=True)
if records:
    df_all = pd.DataFrame(records)
    df_all.to_csv("output/nba_selected_stats.csv", index=False)
    print("✅ Saved to output/nba_selected_stats.csv")
else:
    # Nothing matched: report it instead of failing on an empty concat/export
    print("❌ No data collected.")


Processing 2010...
Processing 2011...
Processing 2012...
Processing 2013...
Processing 2014...
Processing 2015...
Processing 2016...
Processing 2017...
Processing 2018...
Processing 2019...
Processing 2020...
Processing 2021...
Processing 2022...
Processing 2023...
Processing 2024...
Processing 2025...


HTTPError: HTTP Error 429: Too Many Requests

In [3]:
import pandas as pd
import os
import time
import random
import requests
from bs4 import BeautifulSoup

# Player list (assembled tier by tier) and the range of season end years
players = []

# Superstar
players += [
    "LeBron James", "Kevin Durant", "Stephen Curry", "Nikola Jokic",
    "Giannis Antetokounmpo", "Joel Embiid", "Kawhi Leonard", "James Harden",
    "Russell Westbrook",
]

# All-Star
players += [
    "Jayson Tatum", "Luka Doncic", "Anthony Davis", "Devin Booker",
    "Damian Lillard", "Jimmy Butler", "Kyrie Irving", "Paul George", "Chris Paul",
]

# Great players
players += [
    "Shai Gilgeous-Alexander", "Ja Morant", "Zion Williamson", "LaMelo Ball",
    "Anthony Edwards", "Tyrese Haliburton", "Cade Cunningham", "Jalen Green",
    "Scottie Barnes", "Victor Wembanyama", "Chet Holmgren", "Klay Thompson",
    "Bradley Beal", "DeMar DeRozan", "Mike Conley", "Julius Randle",
]
players += [
    "Jaylen Brown", "Jrue Holiday", "Bam Adebayo", "Andrew Wiggins",
    "Kristaps Porzingis", "Deandre Ayton", "CJ McCollum", "Michael Porter Jr.",
    "Desmond Bane", "RJ Barrett", "Austin Reaves", "Derrick White", "Donovan Mitchell",
]

# Good players
players += [
    "De'Aaron Fox", "Domantas Sabonis", "Karl-Anthony Towns", "Brandon Ingram",
    "Pascal Siakam", "Fred VanVleet", "Draymond Green", "Brook Lopez",
    "Mikal Bridges", "Aaron Gordon", "Tyrese Maxey", "Darius Garland",
]

# Progress Player
players += [
    "Jalen Brunson", "Franz Wagner", "Alperen Sengun", "Josh Giddey",
    "Bennedict Mathurin", "Keegan Murray", "Immanuel Quickley", "Cam Thomas",
]

# Season end years 2016..2025 inclusive
years = [*range(2016, 2026)]

# Make sure the output directory exists
os.makedirs("output", exist_ok=True)

# Custom request headers (browser-like User-Agent)
HEADERS = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"}

# Helper: download a page and return its first HTML table
def get_table(url, timeout=30):
    """Fetch ``url`` and return the first HTML table on the page as a DataFrame.

    Args:
        url: Address of the page to download (sent with the module-level HEADERS).
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the notebook indefinitely.

    Returns:
        The first table parsed by ``pd.read_html``, or ``None`` if the download
        or parsing failed for any reason (the error is printed, not raised, so
        the caller can skip this page and carry on).
    """
    from io import StringIO  # local import so this helper is self-contained

    try:
        response = requests.get(url, headers=HEADERS, timeout=timeout)
        response.raise_for_status()
        # Passing literal HTML text to read_html is deprecated in recent
        # pandas releases; a file-like wrapper is the supported form.
        return pd.read_html(StringIO(response.text))[0]
    except Exception as e:
        # Deliberately best-effort: report and let the caller decide.
        print(f"⚠️ Error fetching {url}: {e}")
        return None

import unicodedata  # imported here: only this script section needs it (name matching)

# Stat columns copied from each table, in the order they appear in the CSV
PER_GAME_COLS = ["PTS", "TRB", "AST", "STL", "BLK", "TOV", "FT",
                 "FG%", "3P%", "2P%", "FT%"]
ADVANCED_COLS = ["TS%", "BPM", "OBPM", "DBPM", "PER", "WS"]


def _fold_accents(text):
    """Return ``text`` with combining accent marks removed (e.g. "Jokić" -> "Jokic")."""
    decomposed = unicodedata.normalize("NFD", text)
    return "".join(ch for ch in decomposed if unicodedata.category(ch) != "Mn")


def _pick_season_row(rows):
    """Return the "2TM" row of ``rows`` if there is one, otherwise the first row.

    ``rows`` must be non-empty. The "2TM" row is presumably the combined line
    for a player with two teams in one season.
    """
    combined = rows[rows["Team"] == "2TM"]
    return (combined if not combined.empty else rows).iloc[0]


# Names are compared accent-insensitively. The player list spells some names
# without diacritics (e.g. "Nikola Jokic") while other cells of this notebook
# use "Nikola Jokić"; an exact string comparison silently drops whichever
# spelling the downloaded table does not use.
player_keys = {player: _fold_accents(player) for player in players}

# Start scraping: one plain dict per player-season
all_data = []

for year in years:
    print(f"Processing {year}...")

    # Random pause before each season's requests
    time.sleep(random.uniform(3, 6))

    # 1. per_game
    url_pg = f"https://www.basketball-reference.com/leagues/NBA_{year}_per_game.html"
    df_pg = get_table(url_pg)
    if df_pg is None:
        continue
    df_pg = df_pg[df_pg["Player"] != "Player"]  # drop repeated header rows

    # 2. advanced
    url_adv = f"https://www.basketball-reference.com/leagues/NBA_{year}_advanced.html"
    df_adv = get_table(url_adv)
    if df_adv is None:
        continue
    df_adv = df_adv[df_adv["Player"] != "Player"]  # drop repeated header rows

    # Accent-free name keys, computed once per table rather than once per player.
    # astype(str) keeps a non-string cell (e.g. NaN) from breaking the folding.
    pg_keys = df_pg["Player"].astype(str).map(_fold_accents)
    adv_keys = df_adv["Player"].astype(str).map(_fold_accents)
    season = f"{year-1}-{str(year)[-2:]}"  # e.g. 2016 -> "2015-16"

    for player in players:
        key = player_keys[player]
        pg_rows = df_pg[pg_keys == key]
        adv_rows = df_adv[adv_keys == key]
        if pg_rows.empty or adv_rows.empty:
            continue  # need the player in both tables to build a full record

        pg = _pick_season_row(pg_rows)
        adv = _pick_season_row(adv_rows)

        # Merge into a single record: identity, then per-game stats
        # (basic + shooting percentages), then advanced stats.
        record = {"Player": player, "Season": season, "Team": pg["Team"]}
        for col in PER_GAME_COLS:
            record[col] = pd.to_numeric(pg[col], errors="coerce")
        for col in ADVANCED_COLS:
            record[col] = pd.to_numeric(adv[col], errors="coerce")
        all_data.append(record)

# Combine everything and export
if all_data:
    df_all = pd.DataFrame(all_data)
    df_all.to_csv("output/nba_selected_stats.csv", index=False)
    print("✅ Saved to output/nba_selected_stats.csv")
else:
    print("❌ No data collected.")

Processing 2016...
Processing 2017...
Processing 2018...
Processing 2019...
Processing 2020...
Processing 2021...
Processing 2022...
Processing 2023...
Processing 2024...
Processing 2025...
✅ Saved to output/nba_selected_stats.csv


In [5]:
import pandas as pd
import os
import time
import random
import requests
from bs4 import BeautifulSoup
import unicodedata

def strip_accents(text):
    """Remove accent marks from ``text`` so that name comparisons do not fail.

    The string is decomposed (Unicode NFD) and every nonspacing mark
    (category "Mn") is dropped, e.g. "Jokić" -> "Jokic".

    Args:
        text: The string to normalise. Non-string values (such as the NaN that
            pandas uses for an empty cell) are returned unchanged instead of
            raising ``TypeError``, so the function can be applied to a whole
            DataFrame column.

    Returns:
        The accent-free string, or ``text`` itself when it is not a string.
    """
    if not isinstance(text, str):
        return text
    decomposed = unicodedata.normalize('NFD', text)
    return ''.join(ch for ch in decomposed if unicodedata.category(ch) != 'Mn')

# Player list (assembled tier by tier) and the range of season end years
players = []

# Superstar
players += [
    "LeBron James", "Kevin Durant", "Stephen Curry", "Nikola Jokić",
    "Giannis Antetokounmpo", "Joel Embiid", "Kawhi Leonard", "James Harden",
    "Russell Westbrook",
]

# All-Star
players += [
    "Jayson Tatum", "Luka Doncic", "Anthony Davis", "Devin Booker",
    "Damian Lillard", "Jimmy Butler", "Kyrie Irving", "Paul George", "Chris Paul",
]

# Great players
players += [
    "Shai Gilgeous-Alexander", "Ja Morant", "Zion Williamson", "LaMelo Ball",
    "Anthony Edwards", "Tyrese Haliburton", "Cade Cunningham", "Jalen Green",
    "Scottie Barnes", "Victor Wembanyama", "Chet Holmgren", "Klay Thompson",
    "Bradley Beal", "DeMar DeRozan", "Mike Conley", "Julius Randle",
]
players += [
    "Jaylen Brown", "Jrue Holiday", "Bam Adebayo", "Andrew Wiggins",
    "Kristaps Porzingis", "Deandre Ayton", "CJ McCollum", "Michael Porter Jr.",
    "Desmond Bane", "RJ Barrett", "Austin Reaves", "Derrick White", "Donovan Mitchell",
]

# Good players
players += [
    "De'Aaron Fox", "Domantas Sabonis", "Karl-Anthony Towns", "Brandon Ingram",
    "Pascal Siakam", "Fred VanVleet", "Draymond Green", "Brook Lopez",
    "Mikal Bridges", "Aaron Gordon", "Tyrese Maxey", "Darius Garland",
]

# Progress Player
players += [
    "Jalen Brunson", "Franz Wagner", "Alperen Sengun", "Josh Giddey",
    "Bennedict Mathurin", "Keegan Murray", "Immanuel Quickley", "Cam Thomas",
]

# Season end years 2016..2025 inclusive
years = [*range(2016, 2026)]

# Make sure the output directory exists
os.makedirs("output", exist_ok=True)

# Custom request headers (browser-like User-Agent)
HEADERS = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"}

# Helper: download a page and return its first HTML table
def get_table(url, timeout=30):
    """Fetch ``url`` and return the first HTML table on the page as a DataFrame.

    Args:
        url: Address of the page to download (sent with the module-level HEADERS).
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the notebook indefinitely.

    Returns:
        The first table parsed by ``pd.read_html``, or ``None`` if the download
        or parsing failed for any reason (the error is printed, not raised, so
        the caller can skip this page and carry on).
    """
    from io import StringIO  # local import so this helper is self-contained

    try:
        response = requests.get(url, headers=HEADERS, timeout=timeout)
        response.raise_for_status()
        # Passing literal HTML text to read_html is deprecated in recent
        # pandas releases; a file-like wrapper is the supported form.
        return pd.read_html(StringIO(response.text))[0]
    except Exception as e:
        # Deliberately best-effort: report and let the caller decide.
        print(f"⚠️ Error fetching {url}: {e}")
        return None

# Stat columns copied from each table, in the order they appear in the CSV
PER_GAME_COLS = ["PTS", "TRB", "AST", "STL", "BLK", "TOV", "FT",
                 "FG%", "3P%", "2P%", "FT%"]
ADVANCED_COLS = ["TS%", "BPM", "OBPM", "DBPM", "PER", "WS"]


def _pick_season_row(rows):
    """Return the "2TM" row of ``rows`` if there is one, otherwise the first row.

    ``rows`` must be non-empty. The "2TM" row is presumably the combined line
    for a player with two teams in one season.
    """
    combined = rows[rows["Team"] == "2TM"]
    return (combined if not combined.empty else rows).iloc[0]


# Accent-free lookup key for every requested player, computed once up front
player_keys = {player: strip_accents(player) for player in players}

# Start scraping: one plain dict per player-season
all_data = []

for year in years:
    print(f"Processing {year}...")

    # Random pause before each season's requests
    time.sleep(random.uniform(3, 6))

    # 1. per_game
    url_pg = f"https://www.basketball-reference.com/leagues/NBA_{year}_per_game.html"
    df_pg = get_table(url_pg)
    if df_pg is None:
        continue
    df_pg = df_pg[df_pg["Player"] != "Player"]  # drop repeated header rows

    # 2. advanced
    url_adv = f"https://www.basketball-reference.com/leagues/NBA_{year}_advanced.html"
    df_adv = get_table(url_adv)
    if df_adv is None:
        continue
    df_adv = df_adv[df_adv["Player"] != "Player"]  # drop repeated header rows

    # Normalise each table's names once per season instead of re-running
    # strip_accents over the whole column for every player in the list.
    # astype(str) keeps a non-string cell (e.g. NaN) from raising inside
    # strip_accents.
    pg_keys = df_pg["Player"].astype(str).map(strip_accents)
    adv_keys = df_adv["Player"].astype(str).map(strip_accents)
    season = f"{year-1}-{str(year)[-2:]}"  # e.g. 2016 -> "2015-16"

    for player in players:
        key = player_keys[player]
        pg_rows = df_pg[pg_keys == key]
        adv_rows = df_adv[adv_keys == key]
        if pg_rows.empty or adv_rows.empty:
            continue  # need the player in both tables to build a full record

        pg = _pick_season_row(pg_rows)
        adv = _pick_season_row(adv_rows)

        # Merge into a single record: identity, then per-game stats
        # (basic + shooting percentages), then advanced stats.
        record = {"Player": player, "Season": season, "Team": pg["Team"]}
        for col in PER_GAME_COLS:
            record[col] = pd.to_numeric(pg[col], errors="coerce")
        for col in ADVANCED_COLS:
            record[col] = pd.to_numeric(adv[col], errors="coerce")
        all_data.append(record)

# Combine everything and export
if all_data:
    df_all = pd.DataFrame(all_data)
    df_all.to_csv("output/nba_selected_stats.csv", index=False)
    print("✅ Saved to output/nba_selected_stats.csv")
else:
    print("❌ No data collected.")

Processing 2016...
Processing 2017...
Processing 2018...
Processing 2019...
Processing 2020...
Processing 2021...
Processing 2022...
Processing 2023...
Processing 2024...
Processing 2025...
✅ Saved to output/nba_selected_stats.csv
