In [1]:
import requests
import pandas as pd
import time
import os
import json
from datetime import date, timedelta

In [2]:
def fetch_tournaments(start_date, end_date, game_format='standard'):
    """
    从AlwaysBeRunning API获取指定时间范围和赛制下的锦标赛数据。
    (V2.0 - 已修正日期格式以匹配API要求)
    """
    print(f"\n--- 正在获取锦标赛列表 ---")
    print(f"时间范围: {start_date} 到 {end_date}, 赛制: {game_format}")
    
    base_url = "https://alwaysberunning.net/api/tournaments" 
    
    # --- 【关键修正】在构建params字典前，转换日期格式 ---
    start_date_api_format = start_date.replace('-', '.')
    end_date_api_format = end_date.replace('-', '.')

    print(f"注意：已将日期转换为API要求的格式 -> {start_date_api_format} 到 {end_date_api_format}")
    
    # 使用转换后的日期格式来构建参数
    params = {
        'format': game_format,
        'concluded': 1,
        'start': start_date_api_format,  # <-- 使用转换后的格式
        'end': end_date_api_format       # <-- 使用转换后的格式
    }
    
    try:
        response = requests.get(base_url, params=params)
        print(f"实际请求的URL: {response.url}") # 打印URL方便调试
        response.raise_for_status()
        tournaments = response.json()
        print(f"✅ 成功获取了 {len(tournaments)} 场比赛的数据。")
        return tournaments
    except requests.exceptions.RequestException as e:
        print(f"❌ 请求失败: {e}")
        return []

def run_data_pipeline(start_date, end_date, game_format='standard', min_claim_count=4):
    """
    一个完整的端到端数据管道函数：
    1. 获取并筛选指定日期范围内的锦标赛。
    2. 批量获取所有符合条件的牌表链接。
    3. 批量请求NetrunnerDB，获取详细卡组构成。
    4. 清洗、合并数据，并以日期戳命名保存为两个CSV文件。
    """
    print(f"\n{'='*20} 启动数据管道 {'='*20}")

    # --- 步骤1: 获取并筛选高质量的锦标赛ID ---
    initial_tournaments = fetch_tournaments(start_date, end_date, game_format)
    if not initial_tournaments:
        print("未获取到任何锦标赛数据，管道终止。")
        return

    filtered_ids = []
    for t in initial_tournaments:
        if t.get('claim_count', 0) >= min_claim_count and t.get('claim_conflict') == False:
            filtered_ids.append(t['id'])
    print(f"筛选后得到 {len(filtered_ids)} 场高质量比赛。")
    if not filtered_ids:
        print("没有符合筛选条件的比赛，管道终止。")
        return

    # --- 步骤2: 批量获取所有牌表链接 ---
    all_deck_urls = []
    print("\n--- 正在获取所有牌表链接 ---")
    for i, tournament_id in enumerate(filtered_ids):
        if (i + 1) % 10 == 0:
            print(f"处理进度: {i + 1} / {len(filtered_ids)} ...")
        
        params = {'id': tournament_id}
        url = "https://alwaysberunning.net/api/entries"
        try:
            response = requests.get(url, params=params)
            response.raise_for_status()
            entries_data = response.json()
            if entries_data:
                for entry in entries_data:
                    all_deck_urls.append({
                        'tournament_id': tournament_id,
                        'player': entry.get('user_name', 'N/A'),
                        'corp_deck_url': entry.get('corp_deck_url'),
                        'runner_deck_url': entry.get('runner_deck_url')
                    })
        except requests.exceptions.RequestException as e:
            print(f"  -> 错误: 请求锦标赛 {tournament_id} 的参赛列表失败: {e}")
        time.sleep(1)
    print(f"✅ 成功收集到 {len(all_deck_urls)} 份包含URL的牌表信息。")

    # --- 步骤3: 批量获取详细卡组构成 ---
    corp_card_data = []
    runner_card_data = []
    print("\n--- 正在从NetrunnerDB获取详细卡组构成 ---")
    for i, deck_info in enumerate(all_deck_urls):
        if (i + 1) % 10 == 0:
            print(f"处理进度: {i + 1} / {len(all_deck_urls)} ...")

        # 处理公司方
        if deck_info.get('corp_deck_url'):
            try:
                decklist_id = deck_info['corp_deck_url'].split('/')[5]
                api_url = f"https://netrunnerdb.com/api/2.0/public/decklist/{decklist_id}"
                response = requests.get(api_url)
                response.raise_for_status()
                decklist_data = response.json()
                if decklist_data.get('success') and decklist_data.get('data'):
                    cards_dict = decklist_data['data'][0].get('cards')
                    if cards_dict:
                        for card_id, quantity in cards_dict.items():
                            corp_card_data.append({'decklist_id': decklist_id, 'card_id': card_id, 'quantity': quantity})
            except Exception as e:
                print(f"处理公司方URL {deck_info['corp_deck_url']} 时出错: {e}")
        
        # 处理潜袭者方
        if deck_info.get('runner_deck_url'):
            try:
                decklist_id = deck_info['runner_deck_url'].split('/')[5]
                api_url = f"https://netrunnerdb.com/api/2.0/public/decklist/{decklist_id}"
                response = requests.get(api_url)
                response.raise_for_status()
                decklist_data = response.json()
                if decklist_data.get('success') and decklist_data.get('data'):
                    cards_dict = decklist_data['data'][0].get('cards')
                    if cards_dict:
                        for card_id, quantity in cards_dict.items():
                            runner_card_data.append({'decklist_id': decklist_id, 'card_id': card_id, 'quantity': quantity})
            except Exception as e:
                print(f"处理潜袭者方URL {deck_info['runner_deck_url']} 时出错: {e}")
        time.sleep(1)

    print(f"✅ 成功获取了 {len(corp_card_data)} 条公司卡牌记录和 {len(runner_card_data)} 条潜袭者卡牌记录。")

    # --- 步骤4: 合并卡牌信息并保存到CSV ---
    print("\n--- 正在合并卡牌信息并保存文件 ---")
    try:
        # 获取全量卡牌信息
        all_cards_url = "https://netrunnerdb.com/api/2.0/public/cards"
        response = requests.get(all_cards_url)
        response.raise_for_status()
        cards_info_df = pd.DataFrame(response.json()['data'])
        # 只保留需要的列
        cards_info_df = cards_info_df[['code', 'title', 'type_code', 'faction_code', 'side_code', 'pack_code']]

        # 创建并合并公司方DataFrame
        corp_df = pd.DataFrame(corp_card_data)
        final_corp_df = pd.merge(corp_df, cards_info_df, left_on='card_id', right_on='code', how='left')
        
        # 创建并合并潜袭者方DataFrame
        runner_df = pd.DataFrame(runner_card_data)
        final_runner_df = pd.merge(runner_df, cards_info_df, left_on='card_id', right_on='code', how='left')

        # 动态生成文件名
        file_prefix = f"{start_date.replace('-', '')}_to_{end_date.replace('-', '')}"
        if not os.path.exists('data'):
            os.makedirs('data')
        
        corp_filename = f"data/{file_prefix}_corp_data.csv"
        runner_filename = f"data/{file_prefix}_runner_data.csv"
        
        final_corp_df.to_csv(corp_filename, index=False, encoding='utf-8-sig')
        final_runner_df.to_csv(runner_filename, index=False, encoding='utf-8-sig')

        print(f"✅ 数据管道执行完毕！文件已保存为:\n  - {corp_filename}\n  - {runner_filename}")
        print(f"{'='*20} 管道结束 {'='*20}\n")
    except Exception as e:
        print(f"❌ 在最后一步处理和保存文件时出错: {e}")


In [3]:
run_data_pipeline(
    start_date="2025-05-01", 
    end_date="2025-07-31"
)


run_data_pipeline(
    start_date="2025-08-01", 
    end_date="2025-10-05"
)



--- 正在获取锦标赛列表 ---
时间范围: 2025-05-01 到 2025-07-31, 赛制: standard
注意：已将日期转换为API要求的格式 -> 2025.05.01 到 2025.07.31
实际请求的URL: https://alwaysberunning.net/api/tournaments?format=standard&concluded=1&start=2025.05.01&end=2025.07.31
✅ 成功获取了 114 场比赛的数据。
筛选后得到 45 场高质量比赛。

--- 正在获取所有牌表链接 ---
处理进度: 10 / 45 ...
处理进度: 20 / 45 ...
处理进度: 30 / 45 ...
处理进度: 40 / 45 ...
✅ 成功收集到 1145 份包含URL的牌表信息。

--- 正在从NetrunnerDB获取详细卡组构成 ---
处理进度: 10 / 1145 ...
处理进度: 20 / 1145 ...
处理进度: 30 / 1145 ...
处理进度: 40 / 1145 ...
处理进度: 50 / 1145 ...
处理进度: 60 / 1145 ...
处理进度: 70 / 1145 ...
处理进度: 80 / 1145 ...
处理进度: 90 / 1145 ...
处理进度: 100 / 1145 ...
处理进度: 110 / 1145 ...
处理进度: 120 / 1145 ...
处理进度: 130 / 1145 ...
处理进度: 140 / 1145 ...
处理进度: 150 / 1145 ...
处理进度: 160 / 1145 ...
处理进度: 170 / 1145 ...
处理进度: 180 / 1145 ...
处理进度: 190 / 1145 ...
处理进度: 200 / 1145 ...
处理进度: 210 / 1145 ...
处理进度: 220 / 1145 ...
处理进度: 230 / 1145 ...
处理进度: 240 / 1145 ...
处理进度: 250 / 1145 ...
处理进度: 260 / 1145 ...
处理进度: 270 / 1145 ...
处理进度: 280 / 1145 ...
处理进度: 290 