In [1]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import os
import pandas as pd
import efinance as ef       # pip install efinance

def main():
    """Attach underlying-stock code and name to every bond row of the input workbook.

    Reads the quote workbook from the Desktop, looks each bond code up in the
    convertible-bond base info returned by ``ef.bond.get_all_base_info()``,
    adds the columns ``正股代码`` (with a ``.SH``/``.SZ`` suffix) and
    ``正股名称``, and writes the result to a second workbook on the Desktop.
    Bonds without a match get empty strings in both columns.
    """
    # 1. Locate the input and output workbooks.
    in_file = os.path.expanduser("~/Desktop/行情与分析20250605.xlsx")
    out_file = os.path.expanduser("~/Desktop/正股映射结果.xlsx")

    # 2. Read every column of the source workbook as text.
    df_raw = pd.read_excel(in_file, dtype=str)

    # 3. Base info for all convertible bonds; the columns used below are
    #    债券代码, 正股代码 and 正股名称.
    df_all = ef.bond.get_all_base_info()

    # 4. Normalise the lookup columns.  Missing values become "" instead of
    #    the literal text "nan" that a plain astype(str) would produce.
    def clean(column_name: str) -> pd.Series:
        values = df_all[column_name]
        return values.astype(str).str.strip().where(values.notna(), "")

    lookup = pd.DataFrame({
        "bond_code": clean("债券代码"),
        "stock_code": clean("正股代码"),
        "stock_name": clean("正股名称"),
    })
    # One row per bond code, otherwise the left merge below would duplicate
    # rows of the source table.
    lookup = lookup.drop_duplicates(subset="bond_code", keep="first")

    # 5. Bond code of the source table without its '.SH' / '.SZ' suffix.
    df_raw["bond_code"] = df_raw["代码"].str.split(".").str[0].str.strip()

    # 6. Left merge keeps every source row, in its original order.
    df_merged = df_raw.merge(lookup, on="bond_code", how="left")

    # 7. Final columns: suffixed stock code and stock name.
    def add_suffix(code: str) -> str:
        # Unmatched rows carry NaN here; anything non-numeric maps to "".
        if not isinstance(code, str) or not code.isdigit():
            return ""
        return code + (".SH" if code.startswith(("60", "68")) else ".SZ")

    df_merged["正股代码"] = df_merged["stock_code"].apply(add_suffix)
    df_merged["正股名称"] = df_merged["stock_name"].fillna("")

    # 8. Drop the helper columns and save.
    df_final = df_merged.drop(columns=["bond_code", "stock_code", "stock_name"])
    df_final.to_excel(out_file, index=False, engine="openpyxl")

    print("✅ 正股映射已完成，结果保存在：", out_file)

# Run the bond-to-stock mapping only when executed as a script.
if __name__ == "__main__":
    main()


  from .autonotebook import tqdm as notebook_tqdm


✅ 正股映射已完成，结果保存在： /Users/sam/Desktop/正股映射结果.xlsx


In [22]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import os
import re
import time
import requests
import openpyxl
import pandas as pd

# —— Configuration ——
YEAR = 2024                           # target report year
GZH  = f"【年报】"                     # tag appended to the output file name
exclude_keywords = ['英文', '已取消', '摘要']  # titles containing any of these are dropped
trade = ""                            # industry filter sent to the API; leave empty to disable
plate = ""                            # board filter sent to the API; leave empty to disable

# Output directory (created on import if missing)
output_dir = os.path.expanduser("~/Desktop/cninfo_output")
os.makedirs(output_dir, exist_ok=True)

# Path of the stock mapping workbook (produced by the first script)
mapping_file = os.path.expanduser("~/Desktop/正股映射结果.xlsx")

def get_report(page_num: int, date_range: str, timeout: float = 30.0) -> requests.Response:
    """POST one page of the cninfo historical-announcement query.

    Args:
        page_num: Page number sent as ``pageNum`` (callers start at 1).
        date_range: Publication window sent as ``seDate``; callers pass
            ``"YYYY-MM-DD~YYYY-MM-DD"``.
        timeout: Seconds to wait for the server before giving up.  Without a
            timeout a stalled connection would block the script forever.

    Returns:
        The raw response; the HTTP status is not checked here.

    Raises:
        requests.RequestException: On connection failures or timeouts.
    """
    url = "http://www.cninfo.com.cn/new/hisAnnouncement/query"
    headers = {
        "Accept": "*/*",
        "Accept-Encoding": "gzip, deflate",
        "Accept-Language": "zh-CN,zh;q=0.9",
        "Content-Type": "application/x-www-form-urlencoded; charset=UTF-8",
        "Origin": "http://www.cninfo.com.cn",
        "Referer": (
            "http://www.cninfo.com.cn/new/commonUrl/"
            "pageOfSearch?url=disclosure/list/search"
            "&checkedCategory=category_ndbg_szsh"
        ),
        "User-Agent": "Mozilla/5.0",
        "X-Requested-With": "XMLHttpRequest"
    }
    # `plate` and `trade` are the module-level filter settings.
    payload = {
        "pageNum": page_num,
        "pageSize": 30,
        "column": "szse",
        "tabName": "fulltext",
        "plate": plate,
        "searchkey": "",
        "secid": "",
        "category": "category_ndbg_szsh",
        "trade": trade,
        "seDate": date_range,
        "sortName": "code",
        "sortType": "asc",
        "isHLtitle": "false"
    }
    return requests.post(url, data=payload, headers=headers, timeout=timeout)

def download_report(date_range: str) -> list[dict]:
    """Download every announcement page for one publication window.

    Each page is requested up to three times.  Connection errors, HTTP error
    statuses and unparsable bodies all count as a failed attempt.  A page that
    still fails is reported and skipped, so the result may be incomplete but
    the run continues.

    Args:
        date_range: Window in the form ``"YYYY-MM-DD~YYYY-MM-DD"``.

    Returns:
        The announcement dicts of all pages that could be fetched; an empty
        list when the first page fails or the server reports no pages.
    """
    max_attempts = 3
    retry_delay = 5  # seconds between attempts

    def fetch_page(page_num: int):
        """Return the decoded JSON dict of one page, or None after all retries."""
        for attempt in range(1, max_attempts + 1):
            try:
                # The request itself is inside the try so that connection
                # errors are retried as well.
                resp = get_report(page_num, date_range)
                resp.raise_for_status()
                data = resp.json()
            except (requests.RequestException, ValueError):
                if attempt < max_attempts:
                    time.sleep(retry_delay)
                continue
            return data if isinstance(data, dict) else None
        return None

    first = fetch_page(1)
    if first is None:
        return []
    # The field may be absent or null; treat both as "no pages".
    total_pages = int(first.get("totalpages") or 0)

    results = []
    for page in range(1, total_pages + 1):
        # Page 1 was already fetched to learn the page count; reuse it.
        data = first if page == 1 else fetch_page(page)
        if data is None:
            print(f"⚠️ 第 {page} 页抓取失败，已跳过：{date_range}")
            continue
        results.extend(data.get("announcements") or [])
    return results

def filter_latest_versions(announcements: list[dict]) -> list[dict]:
    """Collapse announcements to one entry per (stock code, report year).

    The year is the first four-digit number followed by ``年`` in the title,
    or an empty string when the title has none.  The first announcement seen
    for a key is kept unless a later one carries a correction/revision
    keyword and the kept one does not.  Publication dates are not compared.

    Args:
        announcements: Dicts with ``secCode`` and ``announcementTitle`` keys.

    Returns:
        The surviving announcements, ordered by first appearance of each key.
    """
    revision_markers = ('更正', '更正后', '更正版', '修订后', '修订版', '更新后', '更新版')

    def is_revision(text: str) -> bool:
        return any(marker in text for marker in revision_markers)

    chosen = {}
    for entry in announcements:
        code = entry['secCode']
        heading = entry['announcementTitle']
        found = re.search(r"(\d{4})年", heading)
        slot = (code, found.group(1) if found else "")
        if slot not in chosen:
            chosen[slot] = entry
            continue
        # A revised filing displaces an unrevised one, never the reverse.
        if is_revision(heading) and not is_revision(chosen[slot]['announcementTitle']):
            chosen[slot] = entry
    return list(chosen.values())

def download_reports_for_year(year: int) -> list[dict]:
    """Collect annual-report announcements published in the year after ``year``.

    Only windows inside ``year + 1`` are queried; the calendar year ``year``
    itself is not requested.  Announcements whose title contains one of
    ``exclude_keywords`` are removed before duplicates are collapsed with
    ``filter_latest_versions``.

    Args:
        year: Report year; downloads cover ``year + 1``.

    Returns:
        The filtered, de-duplicated announcement dicts.
    """
    publish_year = year + 1
    # (start, end) month-day bounds of each query window.
    windows = (
        ("01-01", "04-01"), ("04-02", "04-15"),
        ("04-16", "04-22"), ("04-23", "04-26"),
        ("04-27", "04-28"), ("04-29", "04-30"),
        ("05-01", "07-31"), ("08-01", "10-31"),
        ("11-01", "11-30"), ("12-01", "12-31"),
    )
    date_ranges = [f"{publish_year}-{start}~{publish_year}-{end}" for start, end in windows]

    collected = []
    for date_range in date_ranges:
        collected += download_report(date_range)

    # Drop summaries, English versions and cancelled filings.
    kept = []
    for ann in collected:
        if any(word in ann['announcementTitle'] for word in exclude_keywords):
            continue
        kept.append(ann)
    return filter_latest_versions(kept)

def write_selected_excel(announcements: list[dict], year: int) -> None:
    """Write report links for the stocks listed in the mapping workbook.

    Announcements whose ``secCode`` is not among the (suffix-free) stock codes
    of ``mapping_file`` are ignored.  Rows follow the order of
    ``announcements``.

    Args:
        announcements: Announcement dicts with ``secCode``, ``secName``,
            ``announcementTitle`` and ``adjunctUrl`` keys.
        year: Used for the sheet title and the output file name.
    """
    # Stock codes without the exchange suffix.  A set gives constant-time
    # membership tests; empty cells (NaN) are dropped first.
    df_map = pd.read_excel(mapping_file, dtype=str)
    codes = set(df_map['正股代码'].dropna().str.split('.').str[0])

    wb = openpyxl.Workbook()
    ws = wb.active
    ws.title = f"{year}年报"
    ws.append(["可转债正股代码", "公司简称", "标题", "年份", "年报链接"])
    for it in announcements:
        sec = it["secCode"]
        if sec not in codes:
            continue
        name = it["secName"]
        # Strip HTML tags and full-width colons from the title.
        raw_title = re.sub(r"<.*?>", "", it["announcementTitle"]).replace("：", "")
        title = f"《{raw_title}》"
        ym = re.search(r"(\d{4})年", raw_title)
        yr = ym.group(1) if ym else ""
        url = f"http://static.cninfo.com.cn/{it['adjunctUrl']}"
        ws.append([sec, name, title, yr, url])

    dst = os.path.join(output_dir, f"年报链接_{year}_选取公司{GZH}.xlsx")
    wb.save(dst)
    print(f"已保存：{dst}，共 {ws.max_row-1} 条链接")

if __name__ == "__main__":
    # Fetch and filter the announcements, then write the selected links.
    all_ann = download_reports_for_year(YEAR)
    write_selected_excel(all_ann, YEAR)
    print(f"---- {YEAR} 年下载完成 ----")


已保存：/Users/sam/Desktop/cninfo_output/年报链接_2024_选取公司【年报】.xlsx，共 469 条链接
---- 2024 年下载完成 ----


In [23]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import os
import re
import time
import requests
import openpyxl
import pandas as pd

# —— Parameters ——
YEAR = 2024                           # target report year
GZH  = "【年报】"                      # tag appended to the output file name
exclude_keywords = ['英文', '已取消', '摘要']  # titles containing any of these are dropped
trade = ""                            # industry filter sent to the API; leave empty to disable
plate = ""                            # board filter sent to the API; leave empty to disable

# Output directory (created on import if missing)
output_dir = os.path.expanduser("~/Desktop/cninfo_output")
os.makedirs(output_dir, exist_ok=True)

# Path of the stock mapping workbook (produced by the first script)
mapping_file = os.path.expanduser("~/Desktop/正股映射结果.xlsx")


def get_report(page_num: int, date_range: str, timeout: float = 30.0) -> requests.Response:
    """POST one page of the cninfo historical-announcement query.

    Args:
        page_num: Page number sent as ``pageNum`` (callers start at 1).
        date_range: Publication window sent as ``seDate``; callers pass
            ``"YYYY-MM-DD~YYYY-MM-DD"``.
        timeout: Seconds to wait for the server.  Without a timeout a stalled
            connection would block the script forever.

    Returns:
        The raw response; the HTTP status is not checked here.

    Raises:
        requests.RequestException: On connection failures or timeouts.
    """
    url = "http://www.cninfo.com.cn/new/hisAnnouncement/query"
    headers = {
        "Accept": "*/*",
        "Accept-Encoding": "gzip, deflate",
        "Accept-Language": "zh-CN,zh;q=0.9",
        "Content-Type": "application/x-www-form-urlencoded; charset=UTF-8",
        "Origin": "http://www.cninfo.com.cn",
        "Referer": (
            "http://www.cninfo.com.cn/new/commonUrl/"
            "pageOfSearch?url=disclosure/list/search"
            "&checkedCategory=category_ndbg_szsh"
        ),
        "User-Agent": "Mozilla/5.0",
        "X-Requested-With": "XMLHttpRequest"
    }
    # `plate` and `trade` are the module-level filter settings.
    payload = {
        "pageNum": page_num,
        "pageSize": 30,
        "column": "szse",    # to cover both exchanges, loop over ["szse", "shse"]
        "tabName": "fulltext",
        "plate": plate,
        "searchkey": "",
        "secid": "",
        "category": "category_ndbg_szsh",
        "trade": trade,
        "seDate": date_range,
        "sortName": "code",
        "sortType": "asc",
        "isHLtitle": "false"
    }
    return requests.post(url, data=payload, headers=headers, timeout=timeout)


def download_report(date_range: str) -> list[dict]:
    """Download every announcement page for one publication window.

    Each page is requested up to three times.  Connection errors, HTTP error
    statuses and unparsable bodies all count as a failed attempt.  A page that
    still fails is reported and skipped.

    Args:
        date_range: Window in the form ``"YYYY-MM-DD~YYYY-MM-DD"``.

    Returns:
        The announcement dicts of all pages that could be fetched; an empty
        list when the first page fails or the server reports no pages.
    """
    max_attempts = 3
    retry_delay = 5  # seconds between attempts

    def fetch_page(page_num: int):
        """Return the decoded JSON dict of one page, or None after all retries."""
        for attempt in range(1, max_attempts + 1):
            try:
                # Keep the request inside the try so that connection errors
                # are retried too.
                resp = get_report(page_num, date_range)
                resp.raise_for_status()
                data = resp.json()
            except (requests.RequestException, ValueError):
                if attempt < max_attempts:
                    time.sleep(retry_delay)
                continue
            return data if isinstance(data, dict) else None
        return None

    first = fetch_page(1)
    if first is None:
        return []
    # The field may be absent or null; treat both as "no pages".
    total_pages = int(first.get("totalpages") or 0)

    results = []
    for page in range(1, total_pages + 1):
        # Page 1 was already fetched to learn the page count; reuse it.
        data = first if page == 1 else fetch_page(page)
        if data is None:
            print(f"⚠️ 第 {page} 页抓取失败，已跳过：{date_range}")
            continue
        results.extend(data.get("announcements") or [])
    return results


def filter_latest_versions(anns: list[dict]) -> list[dict]:
    """Keep one announcement per (stock code, year found in the title).

    The first entry for a key stays unless a later entry is a
    correction/revision and the stored one is not.  Titles without a
    ``NNNN年`` pattern share the empty-string year.
    """
    markers = ['更正', '更正后', '更正版', '修订后', '修订版', '更新后', '更新版']

    def revised(text):
        for marker in markers:
            if marker in text:
                return True
        return False

    best = {}
    for ann in anns:
        code = ann['secCode']
        heading = ann['announcementTitle']
        hit = re.search(r"(\d{4})年", heading)
        year_part = hit.group(1) if hit else ""
        # setdefault stores `ann` only when the key is new.
        stored = best.setdefault((code, year_part), ann)
        if stored is ann:
            continue
        if revised(heading) and not revised(stored['announcementTitle']):
            best[(code, year_part)] = ann
    return [*best.values()]


def download_reports_for_year(year: int) -> list[dict]:
    """Collect annual-report announcements published during ``year + 1``.

    The following year is queried in ten windows; the calendar year ``year``
    itself is not requested.  Titles containing any of ``exclude_keywords``
    are discarded, then ``filter_latest_versions`` collapses duplicates.
    """
    following = year + 1
    # (start, end) month-day bounds of each query window.
    bounds = [
        ("01-01", "04-01"), ("04-02", "04-15"),
        ("04-16", "04-22"), ("04-23", "04-26"),
        ("04-27", "04-28"), ("04-29", "04-30"),
        ("05-01", "07-31"), ("08-01", "10-31"),
        ("11-01", "11-30"), ("12-01", "12-31"),
    ]
    windows = [f"{following}-{lo}~{following}-{hi}" for lo, hi in bounds]

    downloaded = []
    for window in windows:
        downloaded += download_report(window)

    # Remove announcements with unwanted keywords in the title.
    usable = []
    for ann in downloaded:
        if any(word in ann['announcementTitle'] for word in exclude_keywords):
            continue
        usable.append(ann)
    return filter_latest_versions(usable)


def write_selected_excel(anns: list[dict], year: int) -> None:
    """Write bond-to-stock report links in the order of the mapping workbook.

    For each mapping row whose stock code has an announcement, one row is
    written with bond code, bond name, stock code, company name, title, year
    and link.  When a company has several announcements, the one whose title
    names ``year`` is preferred; otherwise the last one in ``anns`` is used.

    Args:
        anns: Announcement dicts with ``secCode``, ``secName``,
            ``announcementTitle`` and ``adjunctUrl`` keys.
        year: Target report year; also used in the sheet and file name.
    """
    year_re = re.compile(r"(\d{4})年")
    target = str(year)

    # Empty cells come back as NaN; blank them so no NaN reaches the sheet.
    df_map = pd.read_excel(mapping_file, dtype=str).fillna("")
    df_map['sec_no_suf'] = df_map['正股代码'].str.split('.').str[0]

    # secCode -> (announcement, title names the target year).  An entry for
    # the target year is never displaced by one for another year.
    ann_by_code = {}
    for it in anns:
        found = year_re.search(it['announcementTitle'])
        on_target = found is not None and found.group(1) == target
        held = ann_by_code.get(it['secCode'])
        if held is None or on_target or not held[1]:
            ann_by_code[it['secCode']] = (it, on_target)

    wb = openpyxl.Workbook()
    ws = wb.active
    ws.title = f"{year}年报"
    ws.append([
        "可转债代码", "可转债名称",
        "公司代码", "公司简称",
        "标题", "年份", "年报链接"
    ])

    # Emit rows in the original mapping order.
    count = 0
    for _, row in df_map.iterrows():
        bond_code = row['代码']
        bond_name = row.get('名称', "")
        sec = row['sec_no_suf']
        held = ann_by_code.get(sec)
        if held is None:
            continue
        it = held[0]
        # Strip HTML tags and full-width colons from the title.
        raw = re.sub(r"<.*?>", "", it['announcementTitle']).replace("：", "")
        title = f"《{raw}》"
        ym = year_re.search(raw)
        yr = ym.group(1) if ym else ""
        url = f"http://static.cninfo.com.cn/{it['adjunctUrl']}"

        ws.append([bond_code, bond_name, sec, it['secName'], title, yr, url])
        count += 1

    dst = os.path.join(output_dir, f"年报链接_{year}_选取公司{GZH}.xlsx")
    wb.save(dst)
    print(f"✅ 已保存：{dst}  （共 {count} 条链接）")


if __name__ == "__main__":
    # 1) Fetch and filter the announcements
    all_ann = download_reports_for_year(YEAR)
    # 2) Write the selected companies' report links in mapping order
    write_selected_excel(all_ann, YEAR)
    print(f"---- {YEAR} 年下载完成 ----")


✅ 已保存：/Users/sam/Desktop/cninfo_output/年报链接_2024_选取公司【年报】.xlsx  （共 472 条链接）
---- 2024 年下载完成 ----


In [24]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import os
import re
import time
import requests
import openpyxl
import pandas as pd

# —— Parameters ——
YEAR = 2024                           # target report year
GZH  = "【年报】"                      # tag appended to the output file name
exclude_keywords = ['英文', '已取消', '摘要']  # titles containing any of these are dropped
# Industry filter sent to the API; leave empty to disable
trade = ""
# Board filter sent to the API; leave empty to disable
plate = ""

# Output directory (created on import if missing)
output_dir = os.path.expanduser("~/Desktop/cninfo_output")
os.makedirs(output_dir, exist_ok=True)

# Path of the stock mapping workbook produced by the first script
mapping_file = os.path.expanduser("~/Desktop/正股映射结果.xlsx")

# Module-level market value read by get_report; the main flow switches it
# between szse and shse
column = "szse"


def get_report(page_num: int, date_range: str, timeout: float = 30.0) -> requests.Response:
    """POST one page of the cninfo historical-announcement query.

    The market is taken from the module-level ``column`` at call time.

    Args:
        page_num: Page number sent as ``pageNum`` (callers start at 1).
        date_range: Publication window sent as ``seDate``; callers pass
            ``"YYYY-MM-DD~YYYY-MM-DD"``.
        timeout: Seconds to wait for the server.  Without a timeout a stalled
            connection would block the script forever.

    Returns:
        The raw response; the HTTP status is not checked here.

    Raises:
        requests.RequestException: On connection failures or timeouts.
    """
    url = "http://www.cninfo.com.cn/new/hisAnnouncement/query"
    headers = {
        "Accept": "*/*",
        "Accept-Encoding": "gzip, deflate",
        "Accept-Language": "zh-CN,zh;q=0.9",
        "Content-Type": "application/x-www-form-urlencoded; charset=UTF-8",
        "Origin": "http://www.cninfo.com.cn",
        "Referer": (
            "http://www.cninfo.com.cn/new/commonUrl/"
            "pageOfSearch?url=disclosure/list/search"
            "&checkedCategory=category_ndbg_szsh"
        ),
        "User-Agent": "Mozilla/5.0",
        "X-Requested-With": "XMLHttpRequest"
    }
    # `column`, `plate` and `trade` are module-level settings.
    payload = {
        "pageNum": page_num,
        "pageSize": 30,
        "column": column,
        "tabName": "fulltext",
        "plate": plate,
        "searchkey": "",
        "secid": "",
        "category": "category_ndbg_szsh",
        "trade": trade,
        "seDate": date_range,
        "sortName": "code",
        "sortType": "asc",
        "isHLtitle": "false"
    }
    return requests.post(url, data=payload, headers=headers, timeout=timeout)


def download_report(date_range: str) -> list[dict]:
    """Download every announcement page for one publication window.

    Each page is requested up to three times.  Connection errors, HTTP error
    statuses and unparsable bodies all count as a failed attempt.  A page that
    still fails is reported and skipped.

    Args:
        date_range: Window in the form ``"YYYY-MM-DD~YYYY-MM-DD"``.

    Returns:
        The announcement dicts of all pages that could be fetched; an empty
        list when the first page fails or the server reports no pages.
    """
    max_attempts = 3
    retry_delay = 5  # seconds between attempts

    def fetch_page(page_num: int):
        """Return the decoded JSON dict of one page, or None after all retries."""
        for attempt in range(1, max_attempts + 1):
            try:
                # Keep the request inside the try so that connection errors
                # are retried too.
                resp = get_report(page_num, date_range)
                resp.raise_for_status()
                data = resp.json()
            except (requests.RequestException, ValueError):
                if attempt < max_attempts:
                    time.sleep(retry_delay)
                continue
            return data if isinstance(data, dict) else None
        return None

    first = fetch_page(1)
    if first is None:
        return []
    # The field may be absent or null; treat both as "no pages".
    total_pages = int(first.get("totalpages") or 0)

    results = []
    for page in range(1, total_pages + 1):
        # Page 1 was already fetched to learn the page count; reuse it.
        data = first if page == 1 else fetch_page(page)
        if data is None:
            print(f"⚠️ 第 {page} 页抓取失败，已跳过：{date_range}")
            continue
        results.extend(data.get("announcements") or [])
    return results


def filter_latest_versions(anns: list[dict]) -> list[dict]:
    """Reduce announcements to one per (stock code, title year).

    An existing entry is replaced only by a correction/revision when the
    existing entry is not one itself; otherwise the first entry wins.
    """
    revision_words = ('更正', '更正后', '更正版', '修订后', '修订版', '更新后', '更新版')
    picked = {}
    for record in anns:
        stock = record['secCode']
        caption = record['announcementTitle']
        year_hit = re.search(r"(\d{4})年", caption)
        ident = (stock, year_hit.group(1)) if year_hit else (stock, "")

        current = picked.get(ident)
        if current is None:
            picked[ident] = record
            continue

        new_is_revision = any(word in caption for word in revision_words)
        old_is_revision = any(word in current['announcementTitle'] for word in revision_words)
        if new_is_revision and not old_is_revision:
            picked[ident] = record
    return list(picked.values())


def download_reports_for_year(year: int) -> list[dict]:
    """Collect annual-report announcements published during ``year + 1``.

    The following year is queried in ten windows.  Titles containing any of
    ``exclude_keywords`` are discarded, then ``filter_latest_versions``
    collapses duplicates.
    """
    following = year + 1
    # (start, end) month-day bounds of each query window.
    bounds = (
        ("01-01", "04-01"), ("04-02", "04-15"),
        ("04-16", "04-22"), ("04-23", "04-26"),
        ("04-27", "04-28"), ("04-29", "04-30"),
        ("05-01", "07-31"), ("08-01", "10-31"),
        ("11-01", "11-30"), ("12-01", "12-31"),
    )
    windows = [f"{following}-{lo}~{following}-{hi}" for lo, hi in bounds]

    gathered = []
    for window in windows:
        gathered += download_report(window)

    def wanted(ann):
        # True when the title holds none of the excluded keywords.
        return not any(word in ann['announcementTitle'] for word in exclude_keywords)

    return filter_latest_versions(list(filter(wanted, gathered)))


def write_selected_excel(anns: list[dict], year: int) -> None:
    """Write bond-to-stock report links in the order of the mapping workbook.

    Columns: bond code, bond name, stock code, company name, title, year,
    link.  Mapping rows without an announcement are skipped.  When a company
    has several announcements, the one whose title names ``year`` is
    preferred; otherwise the last one in ``anns`` is used.

    Args:
        anns: Announcement dicts with ``secCode``, ``secName``,
            ``announcementTitle`` and ``adjunctUrl`` keys.
        year: Target report year; also used in the sheet and file name.
    """
    year_re = re.compile(r"(\d{4})年")
    target = str(year)

    # Empty cells come back as NaN; blank them so no NaN reaches the sheet.
    df_map = pd.read_excel(mapping_file, dtype=str).fillna("")
    df_map['sec_no_suf'] = df_map['正股代码'].str.split('.').str[0]

    # secCode -> (announcement, title names the target year).  An entry for
    # the target year is never displaced by one for another year.
    ann_by_code = {}
    for it in anns:
        found = year_re.search(it['announcementTitle'])
        on_target = found is not None and found.group(1) == target
        held = ann_by_code.get(it['secCode'])
        if held is None or on_target or not held[1]:
            ann_by_code[it['secCode']] = (it, on_target)

    wb = openpyxl.Workbook()
    ws = wb.active
    ws.title = f"{year}年报"
    ws.append([
        "可转债代码", "可转债名称",
        "公司代码", "公司简称",
        "标题", "年份", "年报链接"
    ])

    count = 0
    for _, row in df_map.iterrows():
        bond_code = row['代码']
        bond_name = row.get('名称', "")
        sec = row['sec_no_suf']
        held = ann_by_code.get(sec)
        if held is None:
            continue
        it = held[0]
        # Strip HTML tags and full-width colons from the title.
        raw = re.sub(r"<.*?>", "", it['announcementTitle']).replace("：", "")
        title = f"《{raw}》"
        m = year_re.search(raw)
        yr = m.group(1) if m else ""
        url = f"http://static.cninfo.com.cn/{it['adjunctUrl']}"

        ws.append([bond_code, bond_name, sec, it['secName'], title, yr, url])
        count += 1

    dst = os.path.join(output_dir, f"年报链接_{year}_选取公司{GZH}.xlsx")
    wb.save(dst)
    print(f"✅ 已保存：{dst} （共 {count} 条链接）")


if __name__ == "__main__":
    # 1) Fetch from the Shenzhen and Shanghai markets in turn
    all_ann = []
    for market in ("szse", "shse"):
        # Rebinds the module-level `column`, which get_report reads when it
        # builds its payload.
        column = market
        all_ann.extend(download_reports_for_year(YEAR))

    # 2) De-duplicate across markets, applying the revision-preference rule again
    all_ann = filter_latest_versions(all_ann)

    # 3) Write the final file in mapping order
    write_selected_excel(all_ann, YEAR)

    print(f"---- {YEAR} 年下载完成 ----")


✅ 已保存：/Users/sam/Desktop/cninfo_output/年报链接_2024_选取公司【年报】.xlsx （共 472 条链接）
---- 2024 年下载完成 ----


In [25]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import os
import re
import time
import requests
import openpyxl
import pandas as pd

# —— Parameters ——
YEAR             = 2024                           # target report year
GZH              = "【年报】"                      # tag appended to the output file name
exclude_keywords = ['英文', '已取消', '摘要']      # titles containing any of these are dropped
# Industry filter: leave empty when not needed
# plate is switched inside the loop
# market (column) is switched inside the loop

# Output directory (created on import if missing)
output_dir = os.path.expanduser("~/Desktop/cninfo_output")
os.makedirs(output_dir, exist_ok=True)

# Path of the stock mapping workbook (produced by the first script)
mapping_file = os.path.expanduser("~/Desktop/正股映射结果.xlsx")

# Market (column) and board (plate) values to iterate over
MARKETS = ["szse", "shse"]
PLATES  = [ "sz", "sh", "szmb", "shmb", "szcy", "shkcp", "bj"]


def get_report(page_num: int, date_range: str, column: str, plate: str,
               timeout: float = 30.0) -> requests.Response:
    """POST one page of the cninfo historical-announcement query.

    Args:
        page_num: Page number sent as ``pageNum`` (callers start at 1).
        date_range: Publication window sent as ``seDate``; callers pass
            ``"YYYY-MM-DD~YYYY-MM-DD"``.
        column: Market value sent as ``column``.
        plate: Board value sent as ``plate``.
        timeout: Seconds to wait for the server.  Without a timeout a stalled
            connection would block the script forever.

    Returns:
        The raw response; the HTTP status is not checked here.

    Raises:
        requests.RequestException: On connection failures or timeouts.
    """
    url = "http://www.cninfo.com.cn/new/hisAnnouncement/query"
    headers = {
        "Accept": "*/*",
        "Accept-Encoding": "gzip, deflate",
        "Accept-Language": "zh-CN,zh;q=0.9",
        "Content-Type": "application/x-www-form-urlencoded; charset=UTF-8",
        "Origin": "http://www.cninfo.com.cn",
        "Referer": (
            "http://www.cninfo.com.cn/new/commonUrl/"
            "pageOfSearch?url=disclosure/list/search"
            "&checkedCategory=category_ndbg_szsh"
        ),
        "User-Agent": "Mozilla/5.0",
        "X-Requested-With": "XMLHttpRequest"
    }
    payload = {
        "pageNum":   page_num,
        "pageSize":  30,
        "column":    column,
        "tabName":   "fulltext",
        "plate":     plate,
        "searchkey": "",
        "secid":     "",
        "category":  "category_ndbg_szsh",
        "trade":     "",
        "seDate":    date_range,
        "sortName":  "code",
        "sortType":  "asc",
        "isHLtitle": "false"
    }
    return requests.post(url, data=payload, headers=headers, timeout=timeout)


def download_reports_for_segments(segments: list[str], column: str, plate: str) -> list[dict]:
    """Download all announcement pages for one market/board over several windows.

    Each page is requested up to three times.  Connection errors, HTTP error
    statuses and unparsable bodies all count as a failed attempt.  A page or
    window that still fails is reported and skipped.

    Args:
        segments: Date windows in the form ``"YYYY-MM-DD~YYYY-MM-DD"``.
        column: Market value forwarded to ``get_report``.
        plate: Board value forwarded to ``get_report``.

    Returns:
        The fetched announcements whose title contains none of
        ``exclude_keywords``.
    """
    max_attempts = 3
    retry_delay = 2  # seconds between attempts

    def fetch_page(date_range: str, page_num: int):
        """Return the decoded JSON dict of one page, or None after all retries."""
        for attempt in range(1, max_attempts + 1):
            try:
                # Keep the request inside the try so that connection errors
                # are retried too.
                resp = get_report(page_num, date_range, column, plate)
                resp.raise_for_status()
                data = resp.json()
            except (requests.RequestException, ValueError):
                if attempt < max_attempts:
                    time.sleep(retry_delay)
                continue
            return data if isinstance(data, dict) else None
        return None

    all_ann = []
    for date_range in segments:
        # The first page also tells us how many pages there are.
        first = fetch_page(date_range, 1)
        if first is None:
            print(f"⚠️ 时间段抓取失败，已跳过：{date_range} ({column}/{plate})")
            continue
        # The field may be absent or null; treat both as "no pages".
        total = int(first.get("totalpages") or 0)

        for page in range(1, total + 1):
            data = first if page == 1 else fetch_page(date_range, page)
            if data is None:
                print(f"⚠️ 第 {page} 页抓取失败，已跳过：{date_range} ({column}/{plate})")
                continue
            all_ann.extend(data.get("announcements") or [])

    # Drop summaries and other unwanted records.
    return [
        x for x in all_ann
        if not any(kw in x['announcementTitle'] for kw in exclude_keywords)
    ]


def filter_latest_versions(anns: list[dict]) -> list[dict]:
    """Keep a single announcement for each (stock code, title year) pair.

    The earliest entry is retained unless a later entry is marked as a
    correction/revision while the retained one is not.
    """
    year_pattern = re.compile(r"(\d{4})年")
    revision_tags = ['更正', '更正后', '更正版', '修订后', '修订版', '更新后', '更新版']
    has_tag = lambda text: any(tag in text for tag in revision_tags)

    survivors = {}
    for candidate in anns:
        code = candidate['secCode']
        name = candidate['announcementTitle']
        match = year_pattern.search(name)
        pair = (code, match.group(1) if match else "")
        if pair in survivors:
            holder_title = survivors[pair]['announcementTitle']
            # Upgrade to a revised filing only when the holder is unrevised.
            if not has_tag(name) or has_tag(holder_title):
                continue
        survivors[pair] = candidate
    return list(survivors.values())


def collect_all_announcements(year: int) -> list[dict]:
    """Fetch announcements for every market/board pair over ``year + 1``.

    Every combination of ``MARKETS`` and ``PLATES`` is queried for the same
    ten date windows.  The combined list is collapsed with
    ``filter_latest_versions``, which also removes records returned by more
    than one combination.
    """
    following = year + 1
    # (start, end) month-day bounds of each query window.
    bounds = (
        ("01-01", "04-01"), ("04-02", "04-15"),
        ("04-16", "04-22"), ("04-23", "04-26"),
        ("04-27", "04-28"), ("04-29", "04-30"),
        ("05-01", "07-31"), ("08-01", "10-31"),
        ("11-01", "11-30"), ("12-01", "12-31"),
    )
    segments = [f"{following}-{lo}~{following}-{hi}" for lo, hi in bounds]

    combos = [(market, board) for market in MARKETS for board in PLATES]
    gathered = []
    for market, board in combos:
        gathered += download_reports_for_segments(segments, market, board)
    return filter_latest_versions(gathered)


def write_selected_excel(anns: list[dict], year: int) -> None:
    """Write one row per mapping entry, in mapping order.

    Each row holds bond code, bond name, stock code, stock name, title, year
    and link.  When no announcement for ``year`` exists for a stock, the
    title, year and link cells are left empty.

    Args:
        anns: Announcement dicts with ``secCode``, ``announcementTitle`` and
            ``adjunctUrl`` keys.
        year: Target report year; also used in the sheet and file name.
    """
    year_re = re.compile(r"(\d{4})年")
    target = str(year)

    # 1) Read the mapping in its original order.  Empty cells come back as
    #    NaN; blank them so no NaN is written to the sheet.
    df_map = pd.read_excel(mapping_file, dtype=str).fillna("")
    df_map['sec_no_suf'] = df_map['正股代码'].str.split('.').str[0]

    # 2) (secCode, title year) -> announcement; a later duplicate key wins.
    ann_dict = {}
    for it in anns:
        found = year_re.search(it['announcementTitle'])
        ann_dict[(it['secCode'], found.group(1) if found else "")] = it

    # 3) Create the workbook.
    wb = openpyxl.Workbook()
    ws = wb.active
    ws.title = f"{year}年报"
    ws.append([
        "可转债代码","可转债名称",
        "公司代码","公司简称",
        "标题","年份","年报链接"
    ])

    # 4) Fill rows in mapping order.
    for _, row in df_map.iterrows():
        bond_code = row['代码']
        bond_name = row.get('名称', "")
        sec = row['sec_no_suf']
        comp_name = row['正股名称']

        it = ann_dict.get((sec, target))
        if it:
            # Strip HTML tags and full-width colons from the title.
            raw = re.sub(r"<.*?>","", it['announcementTitle']).replace("：","")
            title = f"《{raw}》"
            url = f"http://static.cninfo.com.cn/{it['adjunctUrl']}"
            yr = target
        else:
            title = url = yr = ""

        ws.append([bond_code, bond_name, sec, comp_name, title, yr, url])

    dst = os.path.join(output_dir, f"年报链接_{year}_选取公司{GZH}.xlsx")
    wb.save(dst)
    print(f"✅ 已输出：{dst}")


if __name__ == "__main__":
    # 1) Fetch and filter the announcements
    all_ann = collect_all_announcements(YEAR)
    # 2) Write the final Excel file in mapping order
    write_selected_excel(all_ann, YEAR)
    print(f"---- {YEAR} 年下载完成 ----")


✅ 已输出：/Users/sam/Desktop/cninfo_output/年报链接_2024_选取公司【年报】.xlsx
---- 2024 年下载完成 ----


In [2]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import os
import re
import time
import requests
import openpyxl
import pandas as pd

# —— Parameters ——
YEAR             = 2024                           # target report year
GZH              = "【年报】"                      # tag appended to the output file name
# Exclusion list: entries such as '更正后' or '修订版' may be added to avoid
# duplicate or superseded annual reports
exclude_keywords = ['英文', '已取消', '摘要']

# Output directory (created on import if missing)
output_dir = os.path.expanduser("~/Desktop/cninfo_output")
os.makedirs(output_dir, exist_ok=True)

# Path of the stock mapping workbook (produced by the first script)
mapping_file = os.path.expanduser("~/Desktop/正股映射结果.xlsx")

# —— Market control (column) ——
# szse = Shenzhen Stock Exchange (main board, SME board, ChiNext)
# shse = Shanghai Stock Exchange (main board)
MARKETS = ["szse", "shse"]

# —— Board control (plate) ——
# ""      = default, no secondary filter; fetches every sub-board under category_ndbg_szsh
# sz      = Shenzhen market (same as column="szse" + plate="")
# szmb    = Shenzhen main board
# szcy    = ChiNext
# sh      = Shanghai market (same as column="shse" + plate="")
# shmb    = Shanghai main board
# shkcp   = STAR Market (needs category_ndbg_shkcp)
# bj      = Beijing Stock Exchange (needs category_ndbg_bj)
PLATES  = ["", "sz", "szmb", "szcy", "sh", "shmb", "shkcp", "bj"]


def get_report(page_num: int, date_range: str, column: str, plate: str,
               timeout: float = 30.0) -> requests.Response:
    """POST one page of the cninfo historical-announcement query.

    Args:
        page_num: Page number sent as ``pageNum`` (callers start at 1).
        date_range: Publication window sent as ``seDate``; callers pass
            ``"YYYY-MM-DD~YYYY-MM-DD"``.
        column: Market value, one of ``MARKETS``.
        plate: Board value, one of ``PLATES``.
        timeout: Seconds to wait for the server.  Without a timeout a stalled
            connection would block the script forever.

    Returns:
        The raw response; the HTTP status is not checked here.

    Raises:
        requests.RequestException: On connection failures or timeouts.
    """
    url = "http://www.cninfo.com.cn/new/hisAnnouncement/query"
    payload = {
        "pageNum":   page_num,
        "pageSize":  30,
        "column":    column,       # from MARKETS
        "tabName":   "fulltext",
        "category":  "category_ndbg_szsh",  # annual-report category
        "plate":     plate,        # sub-board, see the notes on PLATES
        "searchkey": "",
        "secid":     "",
        "trade":     "",
        "seDate":    date_range,
        "sortName":  "code",
        "sortType":  "asc",
        "isHLtitle": "false"
    }
    headers = {
        "Accept":             "*/*",
        "Accept-Encoding":    "gzip, deflate",
        "Accept-Language":    "zh-CN,zh;q=0.9",
        "Content-Type":       "application/x-www-form-urlencoded; charset=UTF-8",
        "Origin":             "http://www.cninfo.com.cn",
        "Referer": (
            "http://www.cninfo.com.cn/new/commonUrl/"
            "pageOfSearch?url=disclosure/list/search"
            "&checkedCategory=category_ndbg_szsh"
        ),
        "User-Agent":         "Mozilla/5.0",
        "X-Requested-With":   "XMLHttpRequest"
    }
    return requests.post(url, data=payload, headers=headers, timeout=timeout)


def download_reports_for_segments(segments: list[str], column: str, plate: str) -> list[dict]:
    """Download all announcement pages for one market/board over several windows.

    Each page is requested up to three times.  Connection errors, HTTP error
    statuses and unparsable bodies all count as a failed attempt.  A page or
    window that still fails is reported and skipped.

    Args:
        segments: Date windows in the form ``"YYYY-MM-DD~YYYY-MM-DD"``.
        column: Market value forwarded to ``get_report``.
        plate: Board value forwarded to ``get_report``.

    Returns:
        The fetched announcements whose title contains none of
        ``exclude_keywords``.
    """
    max_attempts = 3
    retry_delay = 2  # seconds between attempts

    def fetch_page(date_range: str, page_num: int):
        """Return the decoded JSON dict of one page, or None after all retries."""
        for attempt in range(1, max_attempts + 1):
            try:
                # Keep the request inside the try so that connection errors
                # are retried too.
                resp = get_report(page_num, date_range, column, plate)
                resp.raise_for_status()
                data = resp.json()
            except (requests.RequestException, ValueError):
                if attempt < max_attempts:
                    time.sleep(retry_delay)
                continue
            return data if isinstance(data, dict) else None
        return None

    all_ann = []
    for date_range in segments:
        # The first page also tells us how many pages there are.
        first = fetch_page(date_range, 1)
        if first is None:
            print(f"⚠️ 时间段抓取失败，已跳过：{date_range} ({column}/{plate})")
            continue
        # The field may be absent or null; treat both as "no pages".
        total = int(first.get("totalpages") or 0)

        for page in range(1, total + 1):
            data = first if page == 1 else fetch_page(date_range, page)
            if data is None:
                print(f"⚠️ 第 {page} 页抓取失败，已跳过：{date_range} ({column}/{plate})")
                continue
            all_ann.extend(data.get("announcements") or [])

    # Drop summaries and other unwanted records.
    return [
        x for x in all_ann
        if not any(kw in x['announcementTitle'] for kw in exclude_keywords)
    ]


def filter_latest_versions(anns: list[dict]) -> list[dict]:
    """Return one announcement per (stock code, year parsed from the title).

    A later entry replaces the stored one only when it is a
    correction/revision and the stored one is not.
    """
    revision_terms = ('更正', '更正后', '更正版', '修订后', '修订版', '更新后', '更新版')

    def flagged(text):
        return any(term in text for term in revision_terms)

    def key_of(record):
        code = record['secCode']
        hit = re.search(r"(\d{4})年", record['announcementTitle'])
        return code, (hit.group(1) if hit else "")

    result = {}
    for record in anns:
        key = key_of(record)
        if key not in result:
            result[key] = record
        elif flagged(record['announcementTitle']) and not flagged(result[key]['announcementTitle']):
            result[key] = record
    return list(result.values())


def collect_all_announcements(year: int) -> list[dict]:
    """Fetch announcements for every market/board pair over ``year + 1``.

    Every combination of ``MARKETS`` and ``PLATES`` is queried for the same
    ten date windows; the combined list is then collapsed with
    ``filter_latest_versions``.
    """
    following = year + 1
    # (start, end) month-day bounds of each query window.
    bounds = (
        ("01-01", "04-01"), ("04-02", "04-15"),
        ("04-16", "04-22"), ("04-23", "04-26"),
        ("04-27", "04-28"), ("04-29", "04-30"),
        ("05-01", "07-31"), ("08-01", "10-31"),
        ("11-01", "11-30"), ("12-01", "12-31"),
    )
    segments = [f"{following}-{lo}~{following}-{hi}" for lo, hi in bounds]

    gathered = []
    for market, board in [(m, p) for m in MARKETS for p in PLATES]:
        gathered += download_reports_for_segments(segments, market, board)
    return filter_latest_versions(gathered)


def write_selected_excel(anns: list[dict], year: int) -> None:
    """Write one row per mapping entry, in mapping order.

    Each row holds bond code, bond name, stock code, stock name, title, year
    and link.  When no announcement for ``year`` exists for a stock, the
    title, year and link cells are left empty.

    Args:
        anns: Announcement dicts with ``secCode``, ``announcementTitle`` and
            ``adjunctUrl`` keys.
        year: Target report year; also used in the sheet and file name.
    """
    year_re = re.compile(r"(\d{4})年")
    target = str(year)

    # Empty cells come back as NaN; blank them so no NaN is written to the sheet.
    df_map = pd.read_excel(mapping_file, dtype=str).fillna("")
    df_map['sec_no_suf'] = df_map['正股代码'].str.split('.').str[0]

    # (secCode, title year) -> announcement; a later duplicate key wins.
    ann_dict = {}
    for it in anns:
        found = year_re.search(it['announcementTitle'])
        ann_dict[(it['secCode'], found.group(1) if found else "")] = it

    wb = openpyxl.Workbook()
    ws = wb.active
    ws.title = f"{year}年报"
    ws.append([
        "可转债代码","可转债名称",
        "公司代码","公司简称",
        "标题","年份","年报链接"
    ])

    for _, row in df_map.iterrows():
        bond_code = row['代码']
        bond_name = row.get('名称', "")
        sec = row['sec_no_suf']
        comp_name = row['正股名称']

        it = ann_dict.get((sec, target))
        if it:
            # Strip HTML tags and full-width colons from the title.
            raw = re.sub(r"<.*?>","", it['announcementTitle']).replace("：","")
            title = f"《{raw}》"
            url = f"http://static.cninfo.com.cn/{it['adjunctUrl']}"
            yr = target
        else:
            title = url = yr = ""

        ws.append([bond_code, bond_name, sec, comp_name, title, yr, url])

    dst = os.path.join(output_dir, f"年报链接_{year}_选取公司{GZH}.xlsx")
    wb.save(dst)
    print(f"✅ 已输出：{dst}")


if __name__ == "__main__":
    # 1) Fetch and filter the announcements
    all_ann = collect_all_announcements(YEAR)
    # 2) Write the final Excel file in mapping order
    write_selected_excel(all_ann, YEAR)
    print(f"---- {YEAR} 年下载完成 ----")


✅ 已输出：/Users/sam/Desktop/cninfo_output/年报链接_2024_选取公司【年报】.xlsx
---- 2024 年下载完成 ----


In [27]:

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
按指定正股列表（公司代码或公司简称）精准抓取 2024 年度报告公告并输出链接
使用 searchkey 参数直接索引，不再依赖 secid
"""
import os
import re
import time
import requests
import openpyxl
import pandas as pd

# —— Parameters ——
YEAR             = 2024                          # target report year
GZH              = "【年报】"                     # tag string; presumably used in the output file name — confirm
exclude_keywords = ['英文', '已取消', '摘要']       # titles containing any of these are dropped

# Input / output paths (the output directory is created on import if missing)
mapping_file = os.path.expanduser("~/Desktop/正股映射结果.xlsx")
output_dir   = os.path.expanduser("~/Desktop/cninfo_output")
os.makedirs(output_dir, exist_ok=True)

# Optional filters: restrict the run to given stock codes or company names
sec_filter  = []  # e.g. ['600519','000001']
name_filter = []  # e.g. ['贵州茅台','平安银行']

# search_by: 'code' or 'name' — which value is sent as the search key
search_by = 'name'

# Date windows: the whole target year plus January–April of the next year
segments = [
    f"{YEAR}-01-01~{YEAR}-12-31",
    f"{YEAR+1}-01-01~{YEAR+1}-04-30"
]

# Query the API, narrowing the search with `searchkey`.
def get_report(page_num: int, date_range: str, searchkey: str,
               timeout: float = 30.0) -> requests.Response:
    """POST one page of the cninfo historical-announcement query.

    Args:
        page_num: Page number sent as ``pageNum`` (callers start at 1).
        date_range: Publication window sent as ``seDate``; callers pass
            ``"YYYY-MM-DD~YYYY-MM-DD"``.
        searchkey: Stock code or company name sent as ``searchkey``.
        timeout: Seconds to wait for the server.  Without a timeout a stalled
            connection would block the script forever.

    Returns:
        The raw response; the HTTP status is not checked here.

    Raises:
        requests.RequestException: On connection failures or timeouts.
    """
    url = "http://www.cninfo.com.cn/new/hisAnnouncement/query"
    payload = {
        "pageNum":  page_num,
        "pageSize": 30,
        "column":   "",             # empty: no market filter
        "plate":    "",             # empty: no board filter
        "tabName":  "fulltext",
        "category": "category_ndbg_szsh",
        "searchkey": searchkey,
        "seDate":   date_range,
        "sortName": "code",
        "sortType": "asc",
        "isHLtitle": "false"
    }
    headers = {
        "Accept":             "*/*",
        "Content-Type":       "application/x-www-form-urlencoded; charset=UTF-8",
        "Origin":             "http://www.cninfo.com.cn",
        "Referer":            "http://www.cninfo.com.cn/new/commonUrl/pageOfSearch?url=disclosure/list/search",
        "User-Agent":         "Mozilla/5.0",
        "X-Requested-With":   "XMLHttpRequest"
    }
    return requests.post(url, data=payload, headers=headers, timeout=timeout)

# Batch download: one search per company and date window.
def collect_announcements(codes_names: list[tuple]) -> list[dict]:
    """Download announcements for each (stock code, company name) pair.

    Pairs rejected by ``sec_filter`` / ``name_filter`` are skipped.  The
    search key is the code or the name, depending on ``search_by``.  Each
    page is requested up to three times.  Connection errors, HTTP error
    statuses and unparsable bodies all count as a failed attempt.  A page or
    window that still fails is reported and skipped.

    Args:
        codes_names: ``(sec, comp)`` tuples of stock code and company name.

    Returns:
        The fetched announcements whose title contains none of
        ``exclude_keywords``.
    """
    max_attempts = 3
    retry_delay = 2  # seconds between attempts

    def fetch_page(page_num: int, date_range: str, key: str):
        """Return the decoded JSON dict of one page, or None after all retries."""
        for attempt in range(1, max_attempts + 1):
            try:
                # Keep the request inside the try so that connection errors
                # are retried too.
                r = get_report(page_num, date_range, key)
                r.raise_for_status()
                data = r.json()
            except (requests.RequestException, ValueError):
                if attempt < max_attempts:
                    time.sleep(retry_delay)
                continue
            return data if isinstance(data, dict) else None
        return None

    all_ann = []
    for sec, comp in codes_names:
        # Optional allow-lists.
        if sec_filter and sec not in sec_filter:
            continue
        if name_filter and comp not in name_filter:
            continue
        key = sec if search_by == 'code' else comp
        for dr in segments:
            # The first page also tells us how many pages there are.
            first = fetch_page(1, dr, key)
            if first is None:
                print(f"⚠️ 抓取失败，已跳过：{key} {dr}")
                continue
            # The field may be absent or null; treat both as "no pages".
            total = int(first.get("totalpages") or 0)
            for page in range(1, total + 1):
                data = first if page == 1 else fetch_page(page, dr, key)
                if data is None:
                    print(f"⚠️ 第 {page} 页抓取失败，已跳过：{key} {dr}")
                    continue
                all_ann.extend(data.get("announcements") or [])
    # Drop unwanted records.
    return [a for a in all_ann if not any(kw in a.get('announcementTitle','') for kw in exclude_keywords)]

# Keep one announcement per (company, year), preferring a revised edition.
def filter_latest_versions(anns: list) -> list[dict]:
    """Reduce ``anns`` to one record per ``(secCode, year-in-title)``.

    The first record seen for a key is kept; it is replaced only by a later
    record whose title carries a revision keyword while the stored one does
    not. The year is the first ``NNNN年`` found in the title, or ``""``.
    """
    rev_kws = ['更正','更正后','更正版','修订后','修订版','更新后','更新版']

    def is_revision(text):
        return any(kw in text for kw in rev_kws)

    chosen = {}
    for ann in anns:
        heading = ann.get('announcementTitle','')
        year_match = re.search(r"(\d{4})年", heading)
        slot = (ann.get('secCode',''), year_match.group(1) if year_match else "")
        if slot not in chosen:
            chosen[slot] = ann
            continue
        if is_revision(heading) and not is_revision(chosen[slot].get('announcementTitle','')):
            chosen[slot] = ann
    return list(chosen.values())

# Write the results to an Excel workbook.
def write_selected_excel(anns: list, year: int) -> None:
    """Write one row per mapping-file entry with its matched report link.

    Reads ``mapping_file``, applies ``name_filter`` / ``sec_filter``, looks up
    each row's suffix-less stock code together with ``year`` among ``anns``
    and saves the sheet under ``output_dir``. Rows without a match keep empty
    title / year / link cells.
    """
    mapping = pd.read_excel(mapping_file, dtype=str).fillna('')
    mapping['sec_no_suf'] = mapping['正股代码'].str.split('.').str[0]
    # Optional filters
    if name_filter:
        mapping = mapping[mapping['正股名称'].isin(name_filter)]
    if sec_filter:
        mapping = mapping[mapping['sec_no_suf'].isin(sec_filter)]

    # Index announcements by (secCode, first "NNNN年" in the title); later wins.
    by_company_year = {}
    for ann in anns:
        found = re.search(r"(\d{4})年", ann.get('announcementTitle',''))
        by_company_year[(ann.get('secCode',''), found.group(1) if found else "")] = ann

    target_year = str(year)
    book = openpyxl.Workbook()
    sheet = book.active
    sheet.title = f"{year}年报"
    sheet.append(["可转债代码","可转债名称","公司代码","公司简称","标题","年份","年报链接"])
    for _, rec in mapping.iterrows():
        sec = rec.get('sec_no_suf','')
        hit = by_company_year.get((sec, target_year), {})
        title = url = yr = ""
        if hit:
            # Strip markup tags and full-width colons from the title.
            plain = re.sub(r"<.*?>","", hit.get('announcementTitle','')).replace("：","")
            title = f"《{plain}》"
            url   = f"http://static.cninfo.com.cn/{hit.get('adjunctUrl','')}"
            yr    = target_year
        sheet.append([rec.get('代码',''), rec.get('名称',''), sec, rec.get('正股名称',''), title, yr, url])
    dst = os.path.join(output_dir, f"年报链接_{year}_选取公司{GZH}.xlsx")
    book.save(dst)
    print(f"✅ 已输出：{dst}")

if __name__ == '__main__':
    # Load the mapping workbook; empty cells become '' so the string ops below are safe.
    df_map = pd.read_excel(mapping_file, dtype=str).fillna('')
    # Pair each stock code (text before the first '.') with its stock name.
    codes_names = list(zip(
        df_map['正股代码'].str.split('.').str[0],
        df_map['正股名称']
    ))
    # Crawl, keep one version per company/year, then write the workbook.
    raw_ann = collect_announcements(codes_names)
    all_ann = filter_latest_versions(raw_ann)
    write_selected_excel(all_ann, YEAR)
    print(f"---- {YEAR} 年年报链接抓取完成 ----")



✅ 已输出：/Users/sam/Desktop/cninfo_output/年报链接_2024_选取公司【年报】.xlsx
---- 2024 年年报链接抓取完成 ----


In [29]:

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
按指定正股列表（公司代码或简称）精准抓取 2024 年度报告公告并输出链接
使用 searchkey 参数直接索引，不再遍历市场/板块
"""
import os
import re
import time
import requests
import openpyxl
import pandas as pd

# —— Configuration ——
YEAR             = 2024                          # target year
GZH              = "【年报】"                      # tag embedded in the output file name
exclude_keywords = ['英文', '已取消', '摘要']       # titles containing any of these are dropped

# Input / output paths
mapping_file = os.path.expanduser("~/Desktop/正股映射结果.xlsx")
output_dir   = os.path.expanduser("~/Desktop/cninfo_output")
os.makedirs(output_dir, exist_ok=True)

# Optional filters: sec_filter restricts by code, name_filter by short name (empty list = no filter)
sec_filter  = []  # e.g. ['600519','000001']
name_filter = []  # e.g. ['贵州茅台','平安银行']

# search_by: 'code' or 'name' — which field is sent as the searchkey
search_by = 'code'

# Date windows: the full target year plus January–April of the following year
segments = [
    f"{YEAR}-01-01~{YEAR}-12-31",
    f"{YEAR+1}-01-01~{YEAR+1}-04-30"
]

# Call the query endpoint, searching directly by `searchkey`; column/plate are left empty.
def get_report(page_num: int, date_range: str, searchkey: str) -> requests.Response:
    """Fetch one page of announcement search results from cninfo.

    Args:
        page_num: Page index sent as ``pageNum`` (callers start at 1).
        date_range: Date window sent as ``seDate``; the callers in this
            script pass ``"YYYY-MM-DD~YYYY-MM-DD"``.
        searchkey: Search term sent as ``searchkey`` (company code or name).

    Returns:
        The raw ``requests.Response``; status checking and JSON decoding are
        left to the caller.

    Raises:
        requests.exceptions.RequestException: if the connection fails or the
            server does not answer within the timeout.
    """
    url = "http://www.cninfo.com.cn/new/hisAnnouncement/query"
    payload = {
        "pageNum":   page_num,
        "pageSize":  30,
        "column":    "",             # empty: no market filter
        "plate":     "",             # empty: no plate filter
        "tabName":   "fulltext",
        "category":  "category_ndbg_szsh",
        "searchkey": searchkey,
        "seDate":    date_range,
        "sortName":  "code",
        "sortType":  "asc",
        "isHLtitle": "false"
    }
    headers = {
        "Accept":             "*/*",
        "Content-Type":       "application/x-www-form-urlencoded; charset=UTF-8",
        "Origin":             "http://www.cninfo.com.cn",
        "Referer":            "http://www.cninfo.com.cn/new/commonUrl/pageOfSearch?url=disclosure/list/search",
        "User-Agent":         "Mozilla/5.0",
        "X-Requested-With":   "XMLHttpRequest"
    }
    # requests never times out by default; without this a single stalled
    # connection blocks the whole crawl indefinitely.
    return requests.post(url, data=payload, headers=headers, timeout=15)

# Batch crawl: one searchkey query per company in the list.
def collect_announcements(codes_names: list) -> list[dict]:
    """Collect announcement records for every ``(code, name)`` pair.

    Pairs whose code is not purely numeric are skipped. For each remaining
    pair that passes ``sec_filter`` / ``name_filter``, the search key (code or
    name, depending on ``search_by``) is queried over every date window in
    ``segments``; every result page is fetched with up to three attempts.

    Args:
        codes_names: Iterable of ``(stock_code, stock_name)`` pairs.

    Returns:
        All announcement dicts gathered, minus those whose title contains any
        of ``exclude_keywords``.
    """
    all_ann = []
    for sec, comp in codes_names:
        sec = str(sec).strip()
        if not sec.isdigit():
            continue
        if sec_filter and sec not in sec_filter:
            continue
        if name_filter and comp not in name_filter:
            continue
        # Decide the searchkey
        key = sec if search_by=='code' else comp
        # With search_by='name' a blank name would be sent as an empty
        # searchkey, which presumably places no company restriction on the
        # query — skip such rows.
        if not str(key).strip():
            continue
        for dr in segments:
            try:
                resp  = get_report(1, dr, key)
                # `or 0` covers a JSON null, which .get() hands back as None.
                total = int(resp.json().get("totalpages", 0) or 0)
            except Exception:
                # Deliberate best effort: an unreadable first page skips the window.
                continue
            for page in range(1, total + 1):
                for _ in range(3):
                    try:
                        # The request itself is inside the try so that network
                        # errors are retried too, not only bad responses.
                        r = get_report(page, dr, key)
                        r.raise_for_status()
                        all_ann.extend(r.json().get("announcements") or [])
                        break
                    except Exception:
                        time.sleep(2)
    # Drop unwanted records
    return [
        a for a in all_ann
        if not any(kw in (a.get('announcementTitle') or '') for kw in exclude_keywords)
    ]

# Keep, for each company and year, a single announcement (revised edition preferred).
def filter_latest_versions(anns: list) -> list[dict]:
    """Collapse ``anns`` to one record per ``(secCode, year-in-title)``.

    A stored record is only displaced by a later one whose title contains a
    revision keyword when the stored title contains none. The year is the
    first ``NNNN年`` in the title, or ``""`` when absent.
    """
    rev_kws = ['更正','更正后','更正版','修订后','修订版','更新后','更新版']
    picked = {}
    for entry in anns:
        caption = entry.get('announcementTitle','')
        hit = re.search(r"(\d{4})年", caption)
        ident = (entry.get('secCode',''), "" if not hit else hit.group(1))
        if ident in picked:
            # Only a revision may displace a stored non-revision.
            if not any(kw in caption for kw in rev_kws):
                continue
            held_caption = picked[ident].get('announcementTitle','')
            if any(kw in held_caption for kw in rev_kws):
                continue
        picked[ident] = entry
    return list(picked.values())

# Write the results to an Excel workbook.
def write_selected_excel(anns: list, year: int) -> None:
    """Write one row per mapping-file entry with its matched report link.

    Reads ``mapping_file``, applies ``name_filter`` / ``sec_filter``, looks up
    each row's suffix-less stock code together with ``year`` among ``anns``
    and saves the sheet under ``output_dir``. Rows without a match keep empty
    title / year / link cells.
    """
    def title_year(text):
        # First "NNNN年" occurrence in the title, or "" when there is none.
        found = re.search(r"(\d{4})年", text)
        return found.group(1) if found else ""

    table = pd.read_excel(mapping_file, dtype=str).fillna('')
    table['sec_no_suf'] = table['正股代码'].str.split('.').str[0]
    if name_filter:
        table = table[table['正股名称'].isin(name_filter)]
    if sec_filter:
        table = table[table['sec_no_suf'].isin(sec_filter)]

    # Later announcements overwrite earlier ones that share a key.
    index = {}
    for ann in anns:
        index[(ann.get('secCode',''), title_year(ann.get('announcementTitle','')))] = ann

    wanted = str(year)
    workbook = openpyxl.Workbook()
    sheet = workbook.active
    sheet.title = f"{year}年报"
    sheet.append(["可转债代码","可转债名称","公司代码","公司简称","标题","年份","年报链接"])
    for _, rec in table.iterrows():
        sec = rec.get('sec_no_suf','')
        match = index.get((sec, wanted), {})
        if match:
            # Strip markup tags and full-width colons from the title.
            cleaned = re.sub(r"<.*?>","", match.get('announcementTitle','')).replace("：","")
            tail = [f"《{cleaned}》", wanted, f"http://static.cninfo.com.cn/{match.get('adjunctUrl','')}"]
        else:
            tail = ["", "", ""]
        sheet.append([rec.get('代码',''), rec.get('名称',''), sec, rec.get('正股名称','')] + tail)
    dst = os.path.join(output_dir, f"年报链接_{year}_选取公司{GZH}.xlsx")
    workbook.save(dst)
    print(f"✅ 已输出：{dst}")

if __name__ == '__main__':
    # Load the mapping workbook; empty cells become '' so the string ops below are safe.
    df_map = pd.read_excel(mapping_file, dtype=str).fillna('')
    # Pair each stock code (text before the first '.') with its stock name.
    codes_names = list(zip(df_map['正股代码'].str.split('.').str[0], df_map['正股名称']))
    # Crawl, keep one version per company/year, then write the workbook.
    raw_ann = collect_announcements(codes_names)
    all_ann = filter_latest_versions(raw_ann)
    write_selected_excel(all_ann, YEAR)
    print(f"---- {YEAR} 年年报链接抓取完成 ----")



✅ 已输出：/Users/sam/Desktop/cninfo_output/年报链接_2024_选取公司【年报】.xlsx
---- 2024 年年报链接抓取完成 ----


In [7]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import os
import re
import time
import requests
import openpyxl
import pandas as pd
from typing import List, Dict, Tuple

# —— Configuration ——
YEAR = 2024
GZH  = "【年报】"

# Output directory & mapping workbook
output_dir   = os.path.expanduser("~/Desktop/cninfo_output")
os.makedirs(output_dir, exist_ok=True)
mapping_file = os.path.expanduser("~/Desktop/正股映射结果.xlsx")

# —— Market / plate selection (kept as in the earlier version) ——
MARKETS = ["szse", "shse"]
PLATES  = ["", "sz", "szmb", "szcy", "sh", "shmb", "shkcp", "bj"]

# —— Title filter: keep only full Chinese annual reports; drop English / summary / cancellation notices etc. ——
EXCLUDE_KWS = [
    '英文', '英文版', 'Annual', 'annual', 'Summary',
    '摘要', '摘要版', '年报摘要', '年度报告摘要',
    '取消', '已取消', '披露日期变更', '变更的公告', '提示性公告',
    'H股公告', 'B股', '年报报告'
]
# A title must contain "年度报告" or "年报" (whitespace between characters tolerated).
INCLUDE_AR_PAT = re.compile(r"(年\s*度\s*报\s*告|年\s*报)")

# Whitespace normalisation (no-break and full-width spaces included).
SPACE_RE = re.compile(r"[\s\u00A0\u3000]+")
def norm(s: str) -> str:
    """Return ``s`` as a string with every run of whitespace removed.

    Falsy inputs (``None``, ``""``, ``0``) yield ``""``.
    """
    return SPACE_RE.sub("", str(s or "").strip())

# Year detection: accepts "2024年年度报告", "2024年度报告", "2024年年报",
# "2024年报" and spaced variants. Up to two "年" may follow the digits,
# because the usual title form is "<year>年" + "年度报告" — allowing only one
# made that form return "".
YEAR_PATS = [
    re.compile(r"(20\d{2})\s*(?:年\s*){0,2}度?\s*报\s*告"),
    re.compile(r"(20\d{2})\s*(?:年\s*){0,2}报(?!告)"),
]
def extract_year(title: str) -> str:
    """Return the report year named in ``title``, or ``""`` if none is found.

    The title is whitespace-normalised first; the patterns in ``YEAR_PATS``
    are tried in order and the first match wins.
    """
    t = norm(title)
    for pat in YEAR_PATS:
        m = pat.search(t)
        if m:
            return m.group(1)
    return ""

def is_valid_cn_annual_2024(title: str) -> bool:
    """Tell whether ``title`` is an annual-report title for ``YEAR``.

    After whitespace normalisation the title must match ``INCLUDE_AR_PAT``,
    contain none of ``EXCLUDE_KWS``, and yield ``str(YEAR)`` from
    ``extract_year``.
    """
    cleaned = norm(title)
    if INCLUDE_AR_PAT.search(cleaned):
        for banned in EXCLUDE_KWS:
            if banned in cleaned:
                return False
        return extract_year(cleaned) == str(YEAR)
    return False

# Map each plate to the `category` value used in the query.
def category_for_plate(plate: str) -> str:
    """Return the announcement category string for ``plate``.

    ``"shkcp"`` and ``"bj"`` have dedicated categories; every other value
    falls back to the Shenzhen/Shanghai category.
    """
    special_cases = (
        ("shkcp", "category_ndbg_shkcp"),   # STAR Market
        ("bj", "category_ndbg_bj"),         # Beijing Stock Exchange
    )
    for known_plate, category in special_cases:
        if plate == known_plate:
            return category
    return "category_ndbg_szsh"             # Shenzhen/Shanghai general category

# —— Call the query endpoint ——
def get_report(page_num: int, date_range: str, column: str, plate: str) -> requests.Response:
    """POST one page of the announcement query and return the raw response.

    ``column`` and ``plate`` are forwarded as given; the ``category`` field is
    derived from ``plate`` via ``category_for_plate``. The request uses a
    15-second timeout.
    """
    form = {
        "pageNum":   page_num,
        "pageSize":  30,
        "column":    column,
        "tabName":   "fulltext",
        "category":  category_for_plate(plate),
        "plate":     plate,
        "searchkey": "",
        "secid":     "",
        "trade":     "",
        "seDate":    date_range,
        "sortName":  "code",
        "sortType":  "asc",
        "isHLtitle": "false",
    }
    request_headers = {
        "Accept":           "*/*",
        "Accept-Language":  "zh-CN,zh;q=0.9",
        "Content-Type":     "application/x-www-form-urlencoded; charset=UTF-8",
        "Origin":           "http://www.cninfo.com.cn",
        "Referer":          "http://www.cninfo.com.cn/new/commonUrl/pageOfSearch?url=disclosure/list/search",
        "User-Agent":       "Mozilla/5.0",
        "X-Requested-With": "XMLHttpRequest",
    }
    return requests.post(
        "http://www.cninfo.com.cn/new/hisAnnouncement/query",
        data=form,
        headers=request_headers,
        timeout=15,
    )

# —— Crawl window by window (coarse title filter first, then aggregate) ——
def download_reports_for_segments(segments: List[str], column: str, plate: str) -> List[Dict]:
    """Fetch every result page for each date window and keep matching records.

    Args:
        segments: Date windows passed to ``get_report`` one at a time.
        column: Market value forwarded to ``get_report``.
        plate: Plate value forwarded to ``get_report``.

    Returns:
        Announcement dicts whose title passes ``is_valid_cn_annual_2024``,
        de-duplicated on ``(secCode, adjunctUrl)`` within this call.
    """
    all_ann: List[Dict] = []
    seen = set()  # de-duplication key: (secCode, adjunctUrl)
    for date_range in segments:
        try:
            # `or 0` + int(): a JSON null (None) or other non-numeric page
            # count is treated as "nothing to fetch" instead of raising at
            # the comparison below.
            total = int(get_report(1, date_range, column, plate).json().get("totalpages", 0) or 0)
        except Exception:
            # Deliberate best effort: an unreadable first page skips the window.
            continue
        if total <= 0:
            continue

        page = 1
        while page <= total:
            for _ in range(3):  # up to three attempts per page
                try:
                    resp = get_report(page, date_range, column, plate)
                    resp.raise_for_status()
                    # .get(key, []) only covers a missing key; when the value
                    # is JSON null it returns None, and iterating None would
                    # raise and trigger pointless retries and sleeps.
                    anns = resp.json().get("announcements") or []
                    # Coarse title filter + de-duplication on the spot
                    for x in anns:
                        title = x.get("announcementTitle", "")
                        if not is_valid_cn_annual_2024(title):
                            continue
                        key = (str(x.get("secCode", "")), str(x.get("adjunctUrl", "")))
                        if key in seen:
                            continue
                        seen.add(key)
                        all_ann.append(x)
                    break
                except Exception:
                    time.sleep(1.5)
            page += 1
    return all_ann

# —— Latest version first (correction / revision / update outranks a newer timestamp) ——
REV_KWS = ["更正", "修订", "更新"]
def ann_weight(it: Dict) -> Tuple[int, int]:
    """Return a sortable ``(revision_bonus, timestamp)`` rank for ``it``.

    The bonus is 10 when the normalised title contains any of ``REV_KWS`` and
    0 otherwise; the timestamp is ``announcementTime`` as an int, or 0 when
    it is absent or cannot be converted.
    """
    heading = norm(it.get("announcementTitle", ""))
    revised = False
    for marker in REV_KWS:
        if marker in heading:
            revised = True
            break
    try:
        stamp = int(it.get("announcementTime", 0))
    except Exception:
        stamp = 0
    return (10 if revised else 0, stamp)

def filter_latest_versions(anns: List[Dict]) -> List[Dict]:
    """Keep the highest-ranked announcement per ``(secCode, year)``.

    Records with an empty ``secCode`` or whose title year differs from
    ``YEAR`` are ignored; among the rest, the one with the greatest
    ``ann_weight`` wins (the first seen is kept on ties).
    """
    wanted_year = str(YEAR)
    best: Dict[Tuple[str, str], Dict] = {}
    for ann in anns:
        code = str(ann.get("secCode", ""))
        title_year = extract_year(ann.get("announcementTitle", ""))
        if code and title_year == wanted_year:
            slot = (code, title_year)
            if slot not in best:
                best[slot] = ann
            elif ann_weight(ann) > ann_weight(best[slot]):
                best[slot] = ann
    return list(best.values())

# —— Single crawl entry point (two date windows, for speed) ——
def collect_all_announcements(year: int) -> List[Dict]:
    """Crawl every market/plate combination and return the de-duplicated result.

    Both date windows lie in the year after ``year``: January–April and
    May–December. Every pair from ``MARKETS`` x ``PLATES`` is crawled and the
    combined records are passed through ``filter_latest_versions``.
    """
    following = year + 1
    windows = [
        f"{following}-01-01~{following}-04-30",  # main disclosure period
        f"{following}-05-01~{following}-12-31",  # late / postponed filings
    ]
    gathered: List[Dict] = [
        ann
        for col in MARKETS
        for pl in PLATES
        for ann in download_reports_for_segments(windows, col, pl)
    ]
    return filter_latest_versions(gathered)

# —— Write the Excel workbook ——
def write_selected_excel(anns: List[Dict], year: int) -> None:
    """Write one row per mapping-file entry with its matched report link.

    Reads ``mapping_file``, looks up each row's suffix-less stock code
    together with ``year`` among ``anns`` (keyed by ``secCode`` and the year
    extracted from the title) and saves the sheet under ``output_dir``. Rows
    without a match keep empty title / year / link cells.
    """
    mapping = pd.read_excel(mapping_file, dtype=str).fillna("")
    mapping["sec_no_suf"] = mapping["正股代码"].str.split(".").str[0]

    # Later announcements overwrite earlier ones that share a key.
    lookup = {}
    for ann in anns:
        lookup[(str(ann.get("secCode", "")), extract_year(ann.get("announcementTitle", "")))] = ann

    year_text = str(year)
    book = openpyxl.Workbook()
    sheet = book.active
    sheet.title = f"{year}年报"
    sheet.append(["可转债代码","可转债名称","公司代码","公司简称","标题","年份","年报链接"])

    for _, rec in mapping.iterrows():
        sec = rec.get("sec_no_suf", "")
        match = lookup.get((sec, year_text))
        title = url = yr = ""
        if match:
            # Strip markup tags and full-width colons from the title.
            plain = re.sub(r"<.*?>", "", match.get("announcementTitle","")).replace("：","")
            title = f"《{plain}》"
            url   = f"http://static.cninfo.com.cn/{match.get('adjunctUrl','')}"
            yr    = year_text
        sheet.append([
            rec.get("代码", ""),
            rec.get("名称", ""),
            sec,
            rec.get("正股名称", ""),
            title,
            yr,
            url,
        ])

    dst = os.path.join(output_dir, f"年报链接_{year}_选取公司{GZH}.xlsx")
    book.save(dst)
    print(f"✅ 已输出：{dst}")

if __name__ == "__main__":
    # Crawl all configured markets/plates, then write the matched links to Excel.
    all_ann = collect_all_announcements(YEAR)
    write_selected_excel(all_ann, YEAR)
    print(f"---- {YEAR} 年下载完成 ----")


KeyboardInterrupt: 