In [1]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

"""
02_collect_offering_links.py
功能：按《正股映射结果.xlsx》的公司集合，从巨潮接口检索“募集说明书”公告，
     并导出 Excel（格式与年报版一致，只是标题/年份/链接改为募集说明书对应内容）。
"""

import os
import re
import time
import requests
import openpyxl
import pandas as pd
from typing import List, Dict, Tuple
from datetime import date

# ========= Tunable parameters =========
# Keyword: switched to "募集说明书" (offering prospectus)
KEYWORDS = ["募集说明书"]
# Optional extra keywords, e.g. "更新稿" (updated draft) or "摘要" (summary).
# NOTE(review): "摘要" is also listed in EXCLUDE_KWS below, and exclusions are
# checked after the keyword match, so adding it here would select nothing — confirm intent.
EXTRA_KWS: List[str] = []

# Date range (start / end)
DATE_BEGIN = "2000-02-01"
DATE_END   = date.today().strftime("%Y-%m-%d")

# The output directory is created at import time if it does not exist.
output_dir   = os.path.expanduser("~/Desktop/cninfo_output")
os.makedirs(output_dir, exist_ok=True)
mapping_file = os.path.expanduser("~/Desktop/正股映射结果.xlsx")

# Output file
OUT_XLSX = os.path.join(output_dir, "募集说明书链接_选取公司【募集】.xlsx")
SHEET    = "募集说明书"

# Markets / boards (every market is queried with every board value)
MARKETS = ["szse", "shse"]
PLATES  = ["", "sz", "szmb", "szcy", "sh", "shmb", "shkcp", "bj"]

# Noise terms: a title containing any of these is rejected
EXCLUDE_KWS = [
    "英文", "英文版", "Annual", "annual", "Summary", "summary",
    "摘要", "摘要版", "取消", "已取消", "提示性公告", "更名提示", "B股", "H股"
]

# ========= Utility functions =========
_SPACE_RE = re.compile(r"[\s\u00A0\u3000]+")
def norm(s: str) -> str:
    """Return ``s`` as text with every whitespace run removed.

    Falsy input (``None``, ``""``, ``0``) yields ``""``; any other value is
    converted with ``str()``. NBSP and the ideographic space are removed too.
    """
    text = str(s) if s else ""
    return _SPACE_RE.sub("", text.strip())

def any_kw_in(text: str, kws: List[str]) -> bool:
    """Report whether any keyword occurs in ``text``.

    Text and keywords are both passed through ``norm`` before the substring
    test, so whitespace differences never prevent a match. Falsy keywords
    are skipped.
    """
    haystack = norm(text)
    for kw in kws:
        if kw and norm(kw) in haystack:
            return True
    return False

# Patterns are tried in order. Titles are whitespace-free by the time they are
# matched (extract_year_from_title runs them through norm), so no \s* padding
# is needed. Digit lookarounds stop a "year" being cut out of a longer number.
_TITLE_YEAR_PATS = [
    # A 20xx year directly in front of "募集说明书", optionally followed by 年 / 度.
    re.compile(r"(?<!\d)(20\d{2})年?度?募集说明书"),
    # Fallback: the first standalone 19xx / 20xx anywhere in the title.
    re.compile(r"(?<!\d)(19\d{2}|20\d{2})(?!\d)"),
]
def extract_year_from_title(title: str) -> str:
    """Return the four-digit year mentioned in an announcement title, or "".

    A year attached to "募集说明书" is preferred over any other year in the
    title. A run of more than four digits (for example "120195") is not
    treated as containing a year.

    Args:
        title: Raw announcement title; whitespace is removed before matching.

    Returns:
        The year as a string, or "" when the title carries no standalone year.
    """
    t = norm(title)
    for pat in _TITLE_YEAR_PATS:
        m = pat.search(t)
        if m:
            return m.group(1)
    return ""

def year_from_timestamp_ms(ts: int) -> str:
    """Return the calendar year of a millisecond epoch timestamp, or "".

    The year is computed at a fixed UTC+8 offset so the result does not depend
    on the timezone of the machine running the script.
    NOTE(review): assumes announcements should be dated in China Standard
    Time (UTC+8) — confirm against the data source.

    Args:
        ts: Milliseconds since the Unix epoch (an int or a numeric string).

    Returns:
        A four-digit year string. "" is returned for missing, non-numeric or
        non-positive values, so an absent timestamp is not reported as "1970".
    """
    try:
        seconds = int(ts) / 1000
    except (TypeError, ValueError, OverflowError):
        return ""
    if seconds <= 0:
        return ""
    try:
        # gmtime plus a constant +8h shift is timezone-independent, unlike localtime.
        return time.strftime("%Y", time.gmtime(seconds + 8 * 3600))
    except (OverflowError, OSError, ValueError):
        return ""

def is_target_offering(title: str) -> bool:
    """Decide whether a title is a wanted prospectus announcement.

    A title qualifies when it contains at least one entry of ``KEYWORDS`` or
    ``EXTRA_KWS`` and none of the entries of ``EXCLUDE_KWS``.
    """
    text = norm(title)
    wanted = [*KEYWORDS, *EXTRA_KWS]
    return any_kw_in(text, wanted) and not any_kw_in(text, EXCLUDE_KWS)

def ann_weight(it: Dict) -> Tuple[int, int]:
    """Build the sort key used to pick the preferred announcement.

    Returns ``(bonus, timestamp)``: ``bonus`` is 10 when the title mentions
    更新 / 修订 / 更正 and 0 otherwise; ``timestamp`` is ``announcementTime``
    as an int, or 0 when it cannot be converted. Larger tuples win.
    """
    title = norm(it.get("announcementTitle", ""))
    bonus = 10 if any_kw_in(title, ["更新", "修订", "更正"]) else 0
    try:
        stamp = int(it.get("announcementTime", 0))
    except Exception:
        stamp = 0
    return (bonus, stamp)

# ========= API access =========
def get_report(page_num: int, date_range: str, column: str, plate: str, searchkey: str="") -> requests.Response:
    """Request one page of announcement search results.

    Sends a form-encoded POST to the cninfo ``hisAnnouncement/query`` endpoint
    with browser-like headers and a 15 second timeout.

    Args:
        page_num: Page to fetch (sent as ``pageNum``; page size is fixed at 30).
        date_range: Value for ``seDate``; callers pass ``"start~end"`` dates.
        column: Value for ``column``; callers pass entries of ``MARKETS``.
        plate: Value for ``plate``; callers pass entries of ``PLATES``.
        searchkey: Search keyword; empty by default.

    Returns:
        The ``requests.Response`` as received; the HTTP status is not checked here.
    """
    form_fields = {
        "pageNum": page_num,
        "pageSize": 30,
        "column": column,
        "tabName": "fulltext",
        "category": "",
        "plate": plate,
        "searchkey": searchkey,
        "secid": "",
        "trade": "",
        "seDate": date_range,
        "sortName": "code",
        "sortType": "asc",
        "isHLtitle": "false",
    }
    browser_headers = {
        "Accept": "*/*",
        "Accept-Language": "zh-CN,zh;q=0.9",
        "Content-Type": "application/x-www-form-urlencoded; charset=UTF-8",
        "Origin": "http://www.cninfo.com.cn",
        "Referer": "http://www.cninfo.com.cn/new/commonUrl/pageOfSearch?url=disclosure/list/search",
        "User-Agent": "Mozilla/5.0",
        "X-Requested-With": "XMLHttpRequest",
    }
    return requests.post(
        "http://www.cninfo.com.cn/new/hisAnnouncement/query",
        data=form_fields,
        headers=browser_headers,
        timeout=15,
    )

def download_for_segments(segments: List[str], column: str, plate: str, searchkey: str) -> List[Dict]:
    """Fetch every result page of each date segment and keep wanted announcements.

    For each segment the page count is read from the first response, then each
    page is requested with up to three attempts. Announcements are kept when
    ``is_target_offering`` accepts the title, de-duplicated on
    ``(secCode, adjunctUrl)`` within this call.

    Failures stay best-effort (nothing is raised), but a segment whose page
    count cannot be read, or a page that fails all attempts, is reported on
    stdout instead of being dropped silently.

    Args:
        segments: ``"start~end"`` date ranges to query one by one.
        column: Market value forwarded to ``get_report``.
        plate: Board value forwarded to ``get_report``.
        searchkey: Search keyword forwarded to ``get_report``.

    Returns:
        The accepted announcement dicts in the order they were received.
    """
    all_ann: List[Dict] = []
    seen = set()
    for date_range in segments:
        try:
            first = get_report(1, date_range, column, plate, searchkey)
            # "totalpages" may be absent or null; coerce so the comparison below cannot raise.
            total = int(first.json().get("totalpages") or 0)
        except Exception as exc:
            print(f"⚠️ 无法获取总页数，已跳过：{column}/{plate} {date_range}（{exc}）")
            continue
        if total <= 0:
            continue

        for page in range(1, total + 1):
            for _ in range(3):
                try:
                    resp = get_report(page, date_range, column, plate, searchkey)
                    resp.raise_for_status()
                    anns = resp.json().get("announcements", []) or []
                    for x in anns:
                        title = x.get("announcementTitle", "")
                        if not is_target_offering(title):
                            continue
                        key = (str(x.get("secCode", "")), str(x.get("adjunctUrl", "")))
                        if key in seen:
                            continue
                        seen.add(key)
                        all_ann.append(x)
                    break
                except Exception:
                    # Short pause before the next attempt.
                    time.sleep(1.2)
            else:
                # Every attempt failed: make the gap in the results visible.
                print(f"⚠️ 第 {page} 页重试 3 次仍失败，已跳过：{column}/{plate} {date_range}")
    return all_ann

def collect_all_offering() -> List[Dict]:
    """Download announcements for every market, board and search term.

    The configured date range is cut into one segment per calendar year
    (the first and last segments are clipped to ``DATE_BEGIN`` / ``DATE_END``).
    Results of all ``download_for_segments`` calls are concatenated as-is.
    """
    first_year = int(DATE_BEGIN[:4])
    last_year = int(DATE_END[:4])
    segments: List[str] = []
    for year in range(first_year, last_year + 1):
        start = DATE_BEGIN if year <= first_year else f"{year}-01-01"
        end = DATE_END if year >= last_year else f"{year}-12-31"
        segments.append(start + "~" + end)

    # Distinct, whitespace-free search terms; fall back to the default keyword.
    search_terms = list({norm(k) for k in (KEYWORDS + EXTRA_KWS) if k})
    terms = search_terms or ["募集说明书"]

    raw: List[Dict] = []
    for col in MARKETS:
        for pl in PLATES:
            for kw in terms:
                raw += download_for_segments(segments, col, pl, kw)
    return raw

# ========= De-duplication =========
def filter_latest_per_company(anns: List[Dict], valid_secs: set) -> List[Dict]:
    """Keep the highest-weighted announcement per (stock code, year).

    Announcements whose ``secCode`` is not in ``valid_secs`` are ignored. The
    year comes from the title, falling back to the announcement timestamp;
    entries with no year are dropped. Ties keep the entry seen first.
    """
    best: Dict[Tuple[str, str], Dict] = {}
    for ann in anns:
        code = str(ann.get("secCode", ""))
        if code not in valid_secs:
            continue
        year = extract_year_from_title(ann.get("announcementTitle", ""))
        if not year:
            year = year_from_timestamp_ms(ann.get("announcementTime", 0))
        if not year:
            continue
        slot = (code, year)
        incumbent = best.get(slot)
        if incumbent is None or ann_weight(ann) > ann_weight(incumbent):
            best[slot] = ann
    return list(best.values())

# ========= Excel export =========
def write_selected_excel(anns: List[Dict]) -> None:
    """Write the selected announcements to ``OUT_XLSX``, one row per bond and year.

    The mapping workbook is re-read; for each of its rows the announcements of
    the row's stock code are emitted in ascending year order. A row whose
    company has no announcement is still written, with empty title, year and
    link cells.

    Args:
        anns: Announcement dicts (as returned by the API) to export.
    """
    df_map = pd.read_excel(mapping_file, dtype=str).fillna("")
    df_map["sec_no_suf"] = df_map["正股代码"].str.split(".").str[0]

    # Group once as stock code -> {year: announcement}, so each mapping row is a
    # dict lookup rather than a scan over every announcement.
    by_sec: Dict[str, Dict[str, Dict]] = {}
    for it in anns:
        sec = str(it.get("secCode", ""))
        yr = extract_year_from_title(it.get("announcementTitle", "")) or year_from_timestamp_ms(it.get("announcementTime", 0))
        if sec and yr:
            by_sec.setdefault(sec, {})[yr] = it

    wb = openpyxl.Workbook()
    ws = wb.active
    ws.title = SHEET
    ws.append(["可转债代码","可转债名称","公司代码","公司简称","标题","年份","年报链接"])

    for _, row in df_map.iterrows():
        bond_code = row.get("代码", "")
        bond_name = row.get("名称", "")
        sec       = row.get("sec_no_suf", "")
        comp_name = row.get("正股名称", "")

        per_year = by_sec.get(sec)
        if per_year:
            for yr, it in sorted(per_year.items(), key=lambda kv: kv[0]):
                # Strip markup and full-width colons; tolerate a null title.
                raw_title = re.sub(r"<.*?>", "", it.get("announcementTitle") or "").replace("：", "")
                title = f"《{raw_title}》"
                url   = f"http://static.cninfo.com.cn/{it.get('adjunctUrl','')}"
                ws.append([bond_code, bond_name, sec, comp_name, title, yr, url])
        else:
            ws.append([bond_code, bond_name, sec, comp_name, "", "", ""])

    wb.save(OUT_XLSX)
    print(f"✅ 已输出：{OUT_XLSX}（工作表：{SHEET}）")

# ========= Main flow =========
def main():
    """Run the pipeline: load the company set, download, de-duplicate, export."""
    mapping = pd.read_excel(mapping_file, dtype=str).fillna("")
    # Stock codes without the exchange suffix (the part before the first ".").
    codes = mapping["正股代码"].str.split(".").str[0]
    valid_secs = set(codes.tolist())

    downloaded = collect_all_offering()
    write_selected_excel(filter_latest_per_company(downloaded, valid_secs))
    print("---- 募集说明书检索完成 ----")

if __name__ == "__main__":
    main()


✅ 已输出：/Users/sam/Desktop/cninfo_output/募集说明书链接_选取公司【募集】.xlsx（工作表：募集说明书）
---- 募集说明书检索完成 ----


In [2]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

"""
02_collect_offering_links.py  (仅可转债)
功能：
  - 按《正股映射结果.xlsx》的公司集合，从巨潮接口检索“募集说明书”，
  - 在结果中仅保留【可转债募集说明书】，显式排除摘要/附录/公告/意见/GDR/优先股/普通公司债/股票再融资等，
  - 公司维度仅保留一条最新版（修订/修正/更新优先，其次 announcementTime 更近），
  - 导出 Excel（与年报版列顺序一致），每家公司只一行。
"""

import os
import re
import time
import requests
import openpyxl
import pandas as pd
from typing import List, Dict, Tuple
from datetime import date

# ========= Tunable parameters =========
# 1) Search term: as requested, a single searchkey is used
# NOTE(review): the functions of this cell pass the literal "募集说明书" and do
# not read KEYWORDS / EXTRA_KWS — confirm whether these are still needed.
KEYWORDS = ["募集说明书"]          # the one fixed search term
EXTRA_KWS: List[str] = []         # no extra terms, to keep requests light

# 2) Date range (start / end)
DATE_BEGIN = "2000-01-01"
DATE_END   = date.today().strftime("%Y-%m-%d")

# 3) I/O (the output directory is created at import time if missing)
output_dir   = os.path.expanduser("~/Desktop/cninfo_output")
os.makedirs(output_dir, exist_ok=True)
mapping_file = os.path.expanduser("~/Desktop/正股映射结果.xlsx")

OUT_XLSX = os.path.join(output_dir, "募集说明书链接_选取公司【募集】.xlsx")
SHEET    = "募集说明书"

# 4) Markets / boards
MARKETS = ["szse", "shse"]
PLATES  = ["", "sz", "szmb", "szcy", "sh", "shmb", "shkcp", "bj"]

# ========= Rules =========
_SPACE_RE = re.compile(r"[\s\u00A0\u3000]+")
def norm(s: str) -> str:
    """Strip all whitespace (including NBSP and ideographic space) from ``s``.

    Falsy values give ``""``; other non-string values go through ``str()``.
    """
    if not s:
        return ""
    return _SPACE_RE.sub("", str(s).strip())

def any_kw_in(text: str, kws: List[str]) -> bool:
    """Return True when at least one keyword is a substring of ``text``.

    Comparison happens on ``norm``-ed strings; falsy keywords are ignored.
    """
    body = norm(text)
    needles = (norm(k) for k in kws if k)
    return any(needle in body for needle in needles)

# Explicit exclusions: summaries / appendices / notices / opinions / inquiries /
# GDR / preferred shares / share refinancing, etc.
#
# Plain corporate bonds are deliberately NOT listed here. "公司债券募集说明书"
# is a substring of "可转换公司债券募集说明书", so listing it would reject the
# standard convertible-bond title; is_plain_corporate_bond handles that case
# and only fires when the title carries no "可转" marker.
EXCLUDE_KWS = [
    # Document form
    "摘要", "摘要版", "附录", "概览", "封卷稿", "封卷版",
    "公告", "情况说明", "说明的公告", "更正公告", "更正后",
    "审核意见", "确认意见", "监事会", "全体监事", "承诺", "问询回复", "反馈意见",
    # Other security types
    "优先股", "境内优先股",
    # Share refinancing
    "向特定对象发行股票募集说明书", "以简易程序向特定对象发行股票募集说明书",
    "向特定对象发行A股股票募集说明书", "发行证券募集说明书", "并在主板上市募集说明书",
    # GDR / Swiss listing, etc.
    "GDR", "全球存托凭证", "瑞士交易所", "瑞士证券交易所", "价格区间确定", "批准的公告",
    # English versions / other noise
    "英文", "英文版", "Annual", "annual", "Summary", "summary",
    "取消", "已取消", "提示性公告", "更名提示", "B股", "H股"
]

# Helper for spotting plain corporate bonds: only a title that contains
# "公司债券募集说明书" and no "可转" / "可转换" marker counts as one.
def is_plain_corporate_bond(t: str) -> bool:
    """Return True for a corporate-bond prospectus title with no convertible marker."""
    text = norm(t)
    if "公司债券募集说明书" not in text:
        return False
    return not ("可转" in text or "可转换" in text)

# Inclusion rules: a title is a convertible-bond prospectus when any one matches.
CB_INCLUDE_RES = [
    # e.g. 可转债募集说明书 / 可转换公司债券募集说明书
    re.compile(r"可转(换)?(公司)?债(券)?募集说明书"),
    # 公开发行 … 可转债 … 募集说明书
    re.compile(r"公开发行.*可转(换)?(公司)?债(券)?.*募集说明书"),
    # 向不特定对象发行 … 可转债 … 募集说明书
    re.compile(r"向不特定对象发行.*可转(换)?(公司)?债(券)?.*募集说明书"),
]

def is_target_cb_offering(title: str) -> bool:
    """Accept only convertible-bond prospectus titles.

    A title is rejected when ``is_plain_corporate_bond`` flags it or when it
    contains any ``EXCLUDE_KWS`` entry; otherwise it is accepted if at least
    one ``CB_INCLUDE_RES`` pattern matches.
    """
    text = norm(title)
    rejected = is_plain_corporate_bond(text) or any_kw_in(text, EXCLUDE_KWS)
    if rejected:
        return False
    for pattern in CB_INCLUDE_RES:
        if pattern.search(text):
            return True
    return False

# Year extraction (for display only; de-duplication does not use the year).
# Patterns are tried in order. Titles are whitespace-free by the time they are
# matched (extract_year_from_title runs them through norm), so no \s* padding
# is needed. Digit lookarounds stop a "year" being cut out of a longer number.
_TITLE_YEAR_PATS = [
    # A 20xx year directly in front of "募集说明书", optionally followed by 年 / 度.
    re.compile(r"(?<!\d)(20\d{2})年?度?募集说明书"),
    # Fallback: the first standalone 19xx / 20xx anywhere in the title.
    re.compile(r"(?<!\d)(19\d{2}|20\d{2})(?!\d)"),
]
def extract_year_from_title(title: str) -> str:
    """Return the four-digit year mentioned in an announcement title, or "".

    A year attached to "募集说明书" is preferred over any other year in the
    title. A run of more than four digits (for example "120195") is not
    treated as containing a year.

    Args:
        title: Raw announcement title; whitespace is removed before matching.

    Returns:
        The year as a string, or "" when the title carries no standalone year.
    """
    t = norm(title)
    for pat in _TITLE_YEAR_PATS:
        m = pat.search(t)
        if m:
            return m.group(1)
    return ""

def year_from_timestamp_ms(ts: int) -> str:
    """Return the calendar year of a millisecond epoch timestamp, or "".

    The year is computed at a fixed UTC+8 offset so the result does not depend
    on the timezone of the machine running the script.
    NOTE(review): assumes announcements should be dated in China Standard
    Time (UTC+8) — confirm against the data source.

    Args:
        ts: Milliseconds since the Unix epoch (an int or a numeric string).

    Returns:
        A four-digit year string. "" is returned for missing, non-numeric or
        non-positive values, so an absent timestamp is not reported as "1970".
    """
    try:
        seconds = int(ts) / 1000
    except (TypeError, ValueError, OverflowError):
        return ""
    if seconds <= 0:
        return ""
    try:
        # gmtime plus a constant +8h shift is timezone-independent, unlike localtime.
        return time.strftime("%Y", time.gmtime(seconds + 8 * 3600))
    except (OverflowError, OSError, ValueError):
        return ""

# Weighting: revised / amended / updated versions first, then a small bonus for
# issue / registration / review / filing drafts; equal weights fall back to recency.
def ann_weight(it: Dict) -> Tuple[int, int]:
    """Build the ``(weight, timestamp)`` sort key for an announcement.

    The weight is the sum of the bonuses of every keyword group found in the
    title: +10 for 修订/修正/更正/更新, +3 for 发行稿, +1 for 注册稿/上会稿/申报稿.
    The timestamp is ``announcementTime`` as an int, or 0 if unconvertible.
    """
    title = norm(it.get("announcementTitle", ""))
    tiers = (
        (["修订", "修正", "更正", "更新"], 10),
        (["发行稿"], 3),
        (["注册稿", "上会稿", "申报稿"], 1),
    )
    weight = sum(points for words, points in tiers if any_kw_in(title, words))
    try:
        stamp = int(it.get("announcementTime", 0))
    except Exception:
        stamp = 0
    return (weight, stamp)

# ========= API access =========
def get_report(page_num: int, date_range: str, column: str, plate: str, searchkey: str="") -> requests.Response:
    """POST one page of an announcement search to the cninfo query endpoint.

    Args:
        page_num: Page to fetch (``pageNum``; the page size is fixed at 30).
        date_range: ``seDate`` value; callers pass ``"start~end"`` dates.
        column: ``column`` value; callers pass entries of ``MARKETS``.
        plate: ``plate`` value; callers pass entries of ``PLATES``.
        searchkey: Search keyword (callers in this cell pass "募集说明书").

    Returns:
        The ``requests.Response`` as received (15 s timeout, status unchecked).
    """
    endpoint = "http://www.cninfo.com.cn/new/hisAnnouncement/query"
    query = {
        "pageNum": page_num,
        "pageSize": 30,
        "column": column,
        "tabName": "fulltext",
        "category": "",
        "plate": plate,
        "searchkey": searchkey,
        "secid": "",
        "trade": "",
        "seDate": date_range,
        "sortName": "code",
        "sortType": "asc",
        "isHLtitle": "false",
    }
    request_headers = {
        "Accept": "*/*",
        "Accept-Language": "zh-CN,zh;q=0.9",
        "Content-Type": "application/x-www-form-urlencoded; charset=UTF-8",
        "Origin": "http://www.cninfo.com.cn",
        "Referer": "http://www.cninfo.com.cn/new/commonUrl/pageOfSearch?url=disclosure/list/search",
        "User-Agent": "Mozilla/5.0",
        "X-Requested-With": "XMLHttpRequest",
    }
    response = requests.post(endpoint, data=query, headers=request_headers, timeout=15)
    return response

def download_for_segments(segments: List[str], column: str, plate: str, searchkey: str) -> List[Dict]:
    """Fetch every result page of each date segment and keep wanted announcements.

    For each segment the page count is read from the first response, then each
    page is requested with up to three attempts. Announcements are kept when
    ``is_target_cb_offering`` accepts the title, de-duplicated on
    ``(secCode, adjunctUrl)`` within this call.

    Failures stay best-effort (nothing is raised), but a segment whose page
    count cannot be read, or a page that fails all attempts, is reported on
    stdout instead of being dropped silently.

    Args:
        segments: ``"start~end"`` date ranges to query one by one.
        column: Market value forwarded to ``get_report``.
        plate: Board value forwarded to ``get_report``.
        searchkey: Search keyword forwarded to ``get_report``.

    Returns:
        The accepted announcement dicts in the order they were received.
    """
    all_ann: List[Dict] = []
    seen = set()
    for date_range in segments:
        try:
            first = get_report(1, date_range, column, plate, searchkey)
            # "totalpages" may be absent or null; coerce so the comparison below cannot raise.
            total = int(first.json().get("totalpages") or 0)
        except Exception as exc:
            print(f"⚠️ 无法获取总页数，已跳过：{column}/{plate} {date_range}（{exc}）")
            continue
        if total <= 0:
            continue

        for page in range(1, total + 1):
            for _ in range(3):
                try:
                    resp = get_report(page, date_range, column, plate, searchkey)
                    resp.raise_for_status()
                    anns = resp.json().get("announcements", []) or []
                    for x in anns:
                        title = x.get("announcementTitle", "")
                        if not is_target_cb_offering(title):
                            continue
                        key = (str(x.get("secCode", "")), str(x.get("adjunctUrl", "")))
                        if key in seen:
                            continue
                        seen.add(key)
                        all_ann.append(x)
                    break
                except Exception:
                    # Short pause before the next attempt.
                    time.sleep(1.1)
            else:
                # Every attempt failed: make the gap in the results visible.
                print(f"⚠️ 第 {page} 页重试 3 次仍失败，已跳过：{column}/{plate} {date_range}")
    return all_ann

def collect_all_offering() -> List[Dict]:
    """Download announcements for every market/board pair with one search key.

    The date range is sliced per calendar year so no single query spans too
    much; the first and last slices are clipped to ``DATE_BEGIN`` / ``DATE_END``.
    Results of all ``download_for_segments`` calls are concatenated as-is.
    """
    first_year, last_year = int(DATE_BEGIN[:4]), int(DATE_END[:4])
    segments: List[str] = [
        "{}~{}".format(
            DATE_BEGIN if year <= first_year else f"{year}-01-01",
            DATE_END if year >= last_year else f"{year}-12-31",
        )
        for year in range(first_year, last_year + 1)
    ]

    # As requested: exactly one searchkey, "募集说明书".
    raw: List[Dict] = [
        ann
        for col in MARKETS
        for pl in PLATES
        for ann in download_for_segments(segments, col, pl, "募集说明书")
    ]
    return raw

# ========= De-duplication (one announcement per company, best version only) =========
def pick_latest_per_company(anns: List[Dict], valid_secs: set) -> List[Dict]:
    """Keep, per stock code in ``valid_secs``, the announcement with the largest ``ann_weight``.

    Ties keep the entry seen first; output order follows first appearance of
    each stock code.
    """
    winners: Dict[str, Dict] = {}
    for ann in anns:
        code = str(ann.get("secCode", ""))
        if code not in valid_secs:
            continue
        holder = winners.get(code)
        if holder is None or ann_weight(ann) > ann_weight(holder):
            winners[code] = ann
    return list(winners.values())

# ========= Excel export (one row per mapping entry) =========
def write_selected_excel(anns: List[Dict]) -> None:
    """Write one row per mapping-file entry to ``OUT_XLSX``.

    Each row carries the bond and company columns from the mapping workbook
    plus the title, year and link of the company's announcement; the last
    three cells are empty when the company has no announcement in ``anns``.
    """
    mapping = pd.read_excel(mapping_file, dtype=str).fillna("")
    mapping["sec_no_suf"] = mapping["正股代码"].str.split(".").str[0]

    # stock code -> announcement (later entries overwrite earlier ones)
    by_code: Dict[str, Dict] = {}
    for ann in anns:
        code = str(ann.get("secCode", ""))
        if code:
            by_code[code] = ann

    wb = openpyxl.Workbook()
    ws = wb.active
    ws.title = SHEET
    ws.append(["可转债代码","可转债名称","公司代码","公司简称","标题","年份","年报链接"])

    for _, row in mapping.iterrows():
        sec = row.get("sec_no_suf", "")
        lead = [row.get("代码", ""), row.get("名称", ""), sec, row.get("正股名称", "")]

        ann = by_code.get(sec)
        if not ann:
            ws.append(lead + ["", "", ""])
            continue

        # Drop markup and full-width colons before wrapping in 《》.
        raw_title = re.sub(r"<.*?>","", ann.get("announcementTitle","")).replace("：","")
        yr = extract_year_from_title(raw_title)
        if not yr:
            yr = year_from_timestamp_ms(ann.get("announcementTime", 0))
        url = f"http://static.cninfo.com.cn/{ann.get('adjunctUrl','')}"
        ws.append(lead + [f"《{raw_title}》", yr, url])

    wb.save(OUT_XLSX)
    print(f"✅ 已输出：{OUT_XLSX}（工作表：{SHEET}）")

# ========= Main flow =========
def main():
    """Run the pipeline: company set -> download and filter -> best per company -> export."""
    # 1) Company set: stock codes without the exchange suffix
    mapping = pd.read_excel(mapping_file, dtype=str).fillna("")
    codes = mapping["正股代码"].str.split(".").str[0]
    valid_secs = set(codes.tolist())

    # 2) Download with the single searchkey; only convertible-bond prospectuses survive
    downloaded = collect_all_offering()

    # 3) One announcement per company, then 4) export
    write_selected_excel(pick_latest_per_company(downloaded, valid_secs))

    print("---- 可转债募集说明书检索完成 ----")

if __name__ == "__main__":
    main()


✅ 已输出：/Users/sam/Desktop/cninfo_output/募集说明书链接_选取公司【募集】.xlsx（工作表：募集说明书）
---- 可转债募集说明书检索完成 ----


In [3]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

"""
02_collect_offering_links.py
功能：按《正股映射结果.xlsx》的公司集合，从巨潮接口检索“可转债募集说明书”公告，
     只保留与可转债相关的最新版（修订/更新/更正优先），并导出 Excel（格式与年报版一致）。
"""

import os
import re
import time
import requests
import openpyxl
import pandas as pd
from typing import List, Dict, Tuple
from datetime import date

# ========= Tunable parameters =========
# The only keyword sent to the API (a broader query risks truncated results and long runs)
SEARCH_KEY = "募集说明书"

# Date range
DATE_BEGIN = "2000-01-01"
DATE_END   = date.today().strftime("%Y-%m-%d")

# The output directory is created at import time if it does not exist.
output_dir   = os.path.expanduser("~/Desktop/cninfo_output")
os.makedirs(output_dir, exist_ok=True)
mapping_file = os.path.expanduser("~/Desktop/正股映射结果.xlsx")

# Output file
OUT_XLSX = os.path.join(output_dir, "募集说明书链接_选取公司【募集】.xlsx")
SHEET    = "募集说明书"

# Markets / boards
MARKETS = ["szse", "shse"]               # Shenzhen and Shanghai
PLATES  = ["", "sz", "szmb", "szcy", "sh", "shmb", "shkcp", "bj"]  # boards

# Hard exclusions (summaries / notices / confirmation opinions / supervisors /
# GDR / follow-on share offerings, etc.)
EXCLUDE_HARD = [
    "英文","英文版","annual","summary","摘要","概览","概要","备查","备查文件",
    "取消","已取消","提示性公告","公告","问询回复","反馈意见","修订情况说明",
    "核准情况","承诺的说明","说明的公告","确认意见","监事","全体监事",
    "控股股东","实际控制人","一致行动人",
    "GDR","瑞士","价格区间","批准",
    "公开增发","增发"
]

# Synonyms / near-synonyms of "可转债" (convertible bond), used to recognise
# convertible-bond prospectuses
CONV_ALIASES = [
    "可转债", "可转换公司债券", "可转换债券", "可转换公司债", "可转公司债", "可转换债"
]

# ========= Utility functions =========
_SPACE_RE = re.compile(r"[\s\u00A0\u3000]+")
def norm(s: str) -> str:
    """Return the text of ``s`` with all whitespace runs deleted.

    ``None`` and other falsy values become ``""``; the NBSP and ideographic
    space characters count as whitespace.
    """
    raw = s or ""
    trimmed = str(raw).strip()
    return _SPACE_RE.sub("", trimmed)

def any_kw_in(text: str, kws: List[str]) -> bool:
    """Report whether any keyword occurs in ``text``, ignoring whitespace and case.

    Both sides are normalised with ``norm`` and case-folded, so a lowercase
    entry such as "summary" also catches "Summary" or "SUMMARY". Keywords that
    are empty after normalisation are skipped; an empty needle would otherwise
    match every text.

    Args:
        text: The text to search (typically an announcement title).
        kws: Candidate keywords.

    Returns:
        True if at least one usable keyword is a substring of ``text``.
    """
    haystack = norm(text).casefold()
    for kw in kws:
        needle = norm(kw).casefold()
        if needle and needle in haystack:
            return True
    return False

def year_from_timestamp_ms(ts: int) -> str:
    """Return the calendar year of a millisecond epoch timestamp, or "".

    The year is computed at a fixed UTC+8 offset so the result does not depend
    on the timezone of the machine running the script.
    NOTE(review): assumes announcements should be dated in China Standard
    Time (UTC+8) — confirm against the data source.

    Args:
        ts: Milliseconds since the Unix epoch (an int or a numeric string).

    Returns:
        A four-digit year string. "" is returned for missing, non-numeric or
        non-positive values, so an absent timestamp is not reported as "1970".
    """
    try:
        seconds = int(ts) / 1000
    except (TypeError, ValueError, OverflowError):
        return ""
    if seconds <= 0:
        return ""
    try:
        # gmtime plus a constant +8h shift is timezone-independent, unlike localtime.
        return time.strftime("%Y", time.gmtime(seconds + 8 * 3600))
    except (OverflowError, OSError, ValueError):
        return ""

def extract_year_for_display(title: str, ts: int) -> str:
    """Return the year to show in the Excel sheet for an announcement.

    The first standalone 19xx / 20xx in the title wins; digit lookarounds keep
    a "year" from being cut out of a longer number (for example "120195").
    When the title has no such year, the year of the announcement timestamp
    is used instead.

    Args:
        title: Announcement title; whitespace is removed before matching.
        ts: Announcement time in epoch milliseconds (fallback only).

    Returns:
        A four-digit year string, or whatever ``year_from_timestamp_ms``
        returns for ``ts`` when the title has no year.
    """
    m = re.search(r"(?<!\d)(?:19|20)\d{2}(?!\d)", norm(title))
    return m.group(0) if m else year_from_timestamp_ms(ts)

def is_target_offering(title: str) -> bool:
    """Accept only convertible-bond prospectus titles.

    A title qualifies when it contains "募集说明书", contains no
    ``EXCLUDE_HARD`` entry, and contains at least one ``CONV_ALIASES`` entry.

    Requiring a convertible alias already rejects plain corporate-bond,
    preferred-share and share-issue prospectuses, so no further check is made.
    The earlier extra check tested only "可转换" / "可转债" and therefore
    rejected titles using the listed alias "可转公司债".

    Args:
        title: Raw announcement title.

    Returns:
        True when the title should be kept.
    """
    t = norm(title)

    # Must be a prospectus at all.
    if "募集说明书" not in t:
        return False

    # Drop noise document types.
    if any_kw_in(t, EXCLUDE_HARD):
        return False

    # Must mention a convertible bond under any of its aliases.
    return any_kw_in(t, CONV_ALIASES)

def ann_weight(it: Dict) -> Tuple[int, int]:
    """Build the ``(weight, timestamp)`` sort key for an announcement.

    The weight is additive over the markers found in the title:
    更新/修订/更正 +10, 发行稿 +8, 注册稿 +6, 上会稿 +4, 申报稿 +2.
    Announcements of equal weight are ordered by timestamp (newer wins).

    A timestamp that cannot be converted to int counts as 0 instead of
    aborting the run.

    Args:
        it: Announcement dict as returned by the API.

    Returns:
        ``(weight, announcementTime as int)``; larger tuples are preferred.
    """
    title = norm(it.get("announcementTitle", ""))
    try:
        ts = int(it.get("announcementTime", 0) or 0)
    except (TypeError, ValueError):
        ts = 0

    w = 0
    if any_kw_in(title, ["更新","修订","更正"]):
        w += 10
    if "发行稿" in title:
        w += 8
    if "注册稿" in title:
        w += 6
    if "上会稿" in title:
        w += 4
    if "申报稿" in title:
        w += 2

    return (w, ts)

# ========= API access =========
def get_report(page_num: int, date_range: str, column: str, plate: str) -> requests.Response:
    """POST one page of a ``SEARCH_KEY`` announcement search to the cninfo query endpoint.

    Args:
        page_num: Page to fetch (``pageNum``; the page size is fixed at 30).
        date_range: ``seDate`` value; callers pass ``"start~end"`` dates.
        column: ``column`` value; callers pass entries of ``MARKETS``.
        plate: ``plate`` value; callers pass entries of ``PLATES``.

    Returns:
        The ``requests.Response`` as received (15 s timeout, status unchecked).
    """
    form_fields = {
        "pageNum": page_num,
        "pageSize": 30,
        "column": column,
        "tabName": "fulltext",
        "category": "",
        "plate": plate,
        "searchkey": SEARCH_KEY,
        "secid": "",
        "trade": "",
        "seDate": date_range,
        "sortName": "code",
        "sortType": "asc",
        "isHLtitle": "false",
    }
    browser_headers = {
        "Accept": "*/*",
        "Accept-Language": "zh-CN,zh;q=0.9",
        "Content-Type": "application/x-www-form-urlencoded; charset=UTF-8",
        "Origin": "http://www.cninfo.com.cn",
        "Referer": "http://www.cninfo.com.cn/new/commonUrl/pageOfSearch?url=disclosure/list/search",
        "User-Agent": "Mozilla/5.0",
        "X-Requested-With": "XMLHttpRequest",
    }
    return requests.post(
        "http://www.cninfo.com.cn/new/hisAnnouncement/query",
        data=form_fields,
        headers=browser_headers,
        timeout=15,
    )

def download_for_segments(segments: List[str], column: str, plate: str) -> List[Dict]:
    """Fetch every result page of each date segment and keep wanted announcements.

    For each segment the page count is read from the first response, then each
    page is requested with up to three attempts. Announcements are filtered on
    the spot with ``is_target_offering`` and de-duplicated on
    ``(secCode, adjunctUrl)`` within this call.

    Failures stay best-effort (nothing is raised), but a segment whose page
    count cannot be read, or a page that fails all attempts, is reported on
    stdout instead of being dropped silently.

    Args:
        segments: ``"start~end"`` date ranges to query one by one.
        column: Market value forwarded to ``get_report``.
        plate: Board value forwarded to ``get_report``.

    Returns:
        The accepted announcement dicts in the order they were received.
    """
    all_ann: List[Dict] = []
    seen = set()
    for date_range in segments:
        try:
            first = get_report(1, date_range, column, plate)
            # "totalpages" may be absent or null; coerce so the comparison below cannot raise.
            total = int(first.json().get("totalpages") or 0)
        except Exception as exc:
            print(f"⚠️ 无法获取总页数，已跳过：{column}/{plate} {date_range}（{exc}）")
            continue
        if total <= 0:
            continue

        for page in range(1, total + 1):
            for _ in range(3):
                try:
                    resp = get_report(page, date_range, column, plate)
                    resp.raise_for_status()
                    anns = resp.json().get("announcements", []) or []
                    for x in anns:
                        title = x.get("announcementTitle", "") or ""
                        if not is_target_offering(title):
                            continue
                        key = (str(x.get("secCode", "")), str(x.get("adjunctUrl", "")))
                        if key in seen:
                            continue
                        seen.add(key)
                        all_ann.append(x)
                    break
                except Exception:
                    # Short pause before the next attempt.
                    time.sleep(1.0)
            else:
                # Every attempt failed: make the gap in the results visible.
                print(f"⚠️ 第 {page} 页重试 3 次仍失败，已跳过：{column}/{plate} {date_range}")
    return all_ann

def collect_all_offering() -> List[Dict]:
    """Download announcements for every market/board pair.

    The date range is sliced per calendar year to avoid truncated results;
    the first and last slices are clipped to ``DATE_BEGIN`` / ``DATE_END``.
    Results of all ``download_for_segments`` calls are concatenated as-is.
    """
    first_year = int(DATE_BEGIN[:4])
    last_year = int(DATE_END[:4])

    segments: List[str] = []
    for year in range(first_year, last_year + 1):
        start = DATE_BEGIN if year <= first_year else f"{year}-01-01"
        end = DATE_END if year >= last_year else f"{year}-12-31"
        segments.append(start + "~" + end)

    raw: List[Dict] = []
    for market in MARKETS:
        for board in PLATES:
            raw += download_for_segments(segments, market, board)
    return raw

# ========= Per-company de-duplication (best version only) =========
def filter_latest_per_company(anns: List[Dict], valid_secs: set) -> List[Dict]:
    """Keep, per stock code in ``valid_secs``, the announcement with the largest ``ann_weight``.

    Entries with an empty stock code are ignored. Ties keep the entry seen
    first; output order follows first appearance of each stock code.
    """
    champions: Dict[str, Dict] = {}  # stock code -> best announcement so far
    for ann in anns:
        code = str(ann.get("secCode", "") or "")
        if not (code and code in valid_secs):
            continue
        current = champions.get(code)
        if current is None or ann_weight(ann) > ann_weight(current):
            champions[code] = ann
    return list(champions.values())

# ========= Excel export =========
def write_selected_excel(anns: List[Dict]) -> None:
    """Write one row per mapping-file entry to ``OUT_XLSX``.

    Each row carries the bond and company columns from the mapping workbook
    plus the title, year and link of the company's announcement; the last
    three cells are empty when the company has no announcement in ``anns``.

    A null title or an unparseable timestamp no longer aborts the export: the
    title is treated as empty and the raw timestamp is handed to
    ``extract_year_for_display``, whose fallback does the conversion.

    Args:
        anns: Announcement dicts (as returned by the API) to export.
    """
    df_map = pd.read_excel(mapping_file, dtype=str).fillna("")
    df_map["sec_no_suf"] = df_map["正股代码"].str.split(".").str[0]

    # stock code -> announcement (later entries overwrite earlier ones)
    pick: Dict[str, Dict] = {}
    for it in anns:
        sec = str(it.get("secCode", "") or "")
        if sec:
            pick[sec] = it

    wb = openpyxl.Workbook()
    ws = wb.active
    ws.title = SHEET
    ws.append(["可转债代码","可转债名称","公司代码","公司简称","标题","年份","年报链接"])

    for _, row in df_map.iterrows():
        bond_code = row.get("代码", "")
        bond_name = row.get("名称", "")
        sec       = row.get("sec_no_suf", "")
        comp_name = row.get("正股名称", "")

        it = pick.get(sec)
        if it:
            # Strip markup; tolerate a null title.
            raw_title = re.sub(r"<.*?>", "", it.get("announcementTitle") or "")
            title = f"《{raw_title}》"
            yr = extract_year_for_display(raw_title, it.get("announcementTime") or 0)
            url = f"http://static.cninfo.com.cn/{it.get('adjunctUrl','')}"
            ws.append([bond_code, bond_name, sec, comp_name, title, yr, url])
        else:
            ws.append([bond_code, bond_name, sec, comp_name, "", "", ""])

    wb.save(OUT_XLSX)
    print(f"✅ 已输出：{OUT_XLSX}（工作表：{SHEET}）")

# ========= Main flow =========
def main():
    """Run the pipeline: load the company set, download, keep the best per company, export."""
    mapping = pd.read_excel(mapping_file, dtype=str).fillna("")
    # Stock codes without the exchange suffix (the part before the first ".").
    codes = mapping["正股代码"].str.split(".").str[0]
    valid_secs = set(codes.tolist())

    downloaded = collect_all_offering()
    write_selected_excel(filter_latest_per_company(downloaded, valid_secs))
    print("---- 可转债募集说明书检索完成 ----")

if __name__ == "__main__":
    main()


✅ 已输出：/Users/sam/Desktop/cninfo_output/募集说明书链接_选取公司【募集】.xlsx（工作表：募集说明书）
---- 可转债募集说明书检索完成 ----


In [4]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

"""
02_collect_offering_links.py
功能：按《正股映射结果.xlsx》的公司集合，从巨潮接口检索“募集说明书”，
     只保留与“可转债/可转换公司债券”相关的最新版；若标题不写“可转债”，
     也保留该公司“通用募集说明书”的最新版（排除优先股/非可转债公司债/股票增发等）。
输出：与年报版相同列顺序；每家公司仅一行（最新版）。
"""

import os
import re
import time
import requests
import openpyxl
import pandas as pd
from typing import List, Dict, Tuple
from datetime import date

# ========= Tunable parameters =========
DATE_BEGIN = "2000-01-01"
DATE_END   = date.today().strftime("%Y-%m-%d")

# The output directory is created at import time if it does not exist.
output_dir   = os.path.expanduser("~/Desktop/cninfo_output")
os.makedirs(output_dir, exist_ok=True)
mapping_file = os.path.expanduser("~/Desktop/正股映射结果.xlsx")

OUT_XLSX = os.path.join(output_dir, "募集说明书链接_选取公司【募集】.xlsx")
SHEET    = "募集说明书"

# Markets / boards
MARKETS = ["szse", "shse"]
PLATES  = ["", "sz", "szmb", "szcy", "sh", "shmb", "shkcp", "bj"]

# Positive keywords identifying a convertible bond (classification only; never sent in requests)
CONV_POS_KWS = ["可转换公司债券", "可转债", "可转换债券", "可转债券"]

# Unconditional veto (never wanted, whatever else the title says)
HARD_EXCLUDE = [
    "英文", "英文版", "Annual", "annual", "Summary", "summary",
    "摘要", "摘要版", "附录", "备查文件", "取消", "已取消",
    "提示性公告", "公告", "更名提示",
    "确认意见", "审核意见", "核准情况", "监事", "全体监事",
    "控股股东", "实际控制人", "一致行动人",
    "GDR", "全球存托凭证", "瑞士", "价格区间", "批准的公告",
    "公开增发", "增发"
]

# Instruments explicitly excluded because they are not convertible bonds
EQUITY_EXCLUDE = [  # pure equity offerings
    "向特定对象发行股票", "以简易程序向特定对象发行股票", "发行A股股票", "非公开发行股票",
]
NON_CONV_BOND_EXCLUDE = [  # non-convertible corporate bonds / preferred shares
    "优先股", "公司债券", "绿色公司债", "科技创新公司债", "扶贫专项", "乡村振兴", "可续期公司债",
]

# ========= Utility functions =========
_SPACE_RE = re.compile(r"[\s\u00A0\u3000]+")
def norm(s: str) -> str:
    """Return ``s`` as text without any whitespace.

    Falsy values become ``""``; other values are converted with ``str()``.
    The text is split on whitespace runs (NBSP and the ideographic space
    included) and the pieces are glued back together.
    """
    pieces = _SPACE_RE.split(str(s or "").strip())
    return "".join(pieces)

def any_kw_in(text: str, kws: List[str]) -> bool:
    """Return True when some keyword is a substring of ``text``.

    Both sides are compared after ``norm``; falsy keywords are dropped first.
    """
    haystack = norm(text)
    usable = filter(None, kws)
    return any(map(lambda kw: norm(kw) in haystack, usable))

def contains_any(text: str, kws: List[str]) -> bool:
    """Alias of ``any_kw_in``: True when any keyword occurs in ``text``."""
    return any_kw_in(text=text, kws=kws)

def extract_year_from_title(title: str) -> str:
    """Return the first standalone 19xx / 20xx year in a title, or "".

    Digit lookarounds keep a "year" from being cut out of a longer number
    (for example "120195" no longer yields "2019").

    Args:
        title: Raw announcement title; whitespace is removed before matching.

    Returns:
        The year as a string, or "" when the title carries no standalone year.
    """
    m = re.search(r"(?<!\d)(19\d{2}|20\d{2})(?!\d)", norm(title))
    return m.group(1) if m else ""

def year_from_timestamp_ms(ts: int) -> str:
    """Return the calendar year of a millisecond epoch timestamp, or "".

    The year is computed at a fixed UTC+8 offset so the result does not depend
    on the timezone of the machine running the script.
    NOTE(review): assumes announcements should be dated in China Standard
    Time (UTC+8) — confirm against the data source.

    Args:
        ts: Milliseconds since the Unix epoch (an int or a numeric string).

    Returns:
        A four-digit year string. "" is returned for missing, non-numeric or
        non-positive values, so an absent timestamp is not reported as "1970".
    """
    try:
        seconds = int(ts) / 1000
    except (TypeError, ValueError, OverflowError):
        return ""
    if seconds <= 0:
        return ""
    try:
        # gmtime plus a constant +8h shift is timezone-independent, unlike localtime.
        return time.strftime("%Y", time.gmtime(seconds + 8 * 3600))
    except (OverflowError, OSError, ValueError):
        return ""

def is_summary_or_bad(title: str) -> bool:
    """Return True when the title hits any unconditional veto term in ``HARD_EXCLUDE``
    (summaries, appendices, notices, confirmation opinions, and so on)."""
    return contains_any(norm(title), HARD_EXCLUDE)

def is_pure_equity(title: str) -> bool:
    """Return True when the title contains any ``EQUITY_EXCLUDE`` term (pure share offerings)."""
    return contains_any(norm(title), EQUITY_EXCLUDE)

def is_non_convertible_bond(title: str) -> bool:
    """Return True for non-convertible corporate-bond / preferred-share titles.

    A title that contains any ``CONV_POS_KWS`` entry is never flagged, even if
    it also contains a ``NON_CONV_BOND_EXCLUDE`` term such as "公司债券". The
    shared keyword list is used rather than a hard-coded subset, so every
    spelling recognised as convertible elsewhere (including "可转债券") is
    protected here as well.

    Args:
        title: Raw announcement title.

    Returns:
        True when the title should be dropped as a non-convertible instrument.
    """
    t = norm(title)
    if contains_any(t, CONV_POS_KWS):
        return False
    return contains_any(t, NON_CONV_BOND_EXCLUDE)

def is_convertible(title: str) -> bool:
    """Return True when the title contains any ``CONV_POS_KWS`` convertible-bond keyword."""
    return contains_any(norm(title), CONV_POS_KWS)

def ann_weight(it: Dict) -> Tuple[int, int, int]:
    """Build the ``(weight, timestamp, title length)`` sort key for an announcement.

    Selection priority:
      1) full text over summaries (a title containing 摘要 loses 5 points)
      2) 更新 / 修订 / 更正 in the title adds 10 points
      3) newer ``announcementTime`` wins (0 when it cannot be converted)
      4) last resort: the longer whitespace-free title wins
    """
    title = norm(it.get("announcementTitle", ""))
    penalty = -5 if "摘要" in title else 0
    bonus = 10 if contains_any(title, ["更新", "修订", "更正"]) else 0
    try:
        stamp = int(it.get("announcementTime", 0))
    except Exception:
        stamp = 0
    return (penalty + bonus, stamp, len(title))

# ========= API access =========
def get_report(page_num: int, date_range: str, column: str, plate: str) -> requests.Response:
    """POST one page of a "募集说明书" announcement search to the cninfo query endpoint.

    Args:
        page_num: Page to fetch (``pageNum``; the page size is fixed at 30).
        date_range: ``seDate`` value; callers pass ``"start~end"`` dates.
        column: ``column`` value; callers pass entries of ``MARKETS``.
        plate: ``plate`` value; callers pass entries of ``PLATES``.

    Returns:
        The ``requests.Response`` as received (15 s timeout, status unchecked).
    """
    endpoint = "http://www.cninfo.com.cn/new/hisAnnouncement/query"
    query = {
        "pageNum": page_num,
        "pageSize": 30,
        "column": column,
        "tabName": "fulltext",
        "category": "",
        "plate": plate,
        "searchkey": "募集说明书",
        "secid": "",
        "trade": "",
        "seDate": date_range,
        "sortName": "code",
        "sortType": "asc",
        "isHLtitle": "false",
    }
    request_headers = {
        "Accept": "*/*",
        "Accept-Language": "zh-CN,zh;q=0.9",
        "Content-Type": "application/x-www-form-urlencoded; charset=UTF-8",
        "Origin": "http://www.cninfo.com.cn",
        "Referer": "http://www.cninfo.com.cn/new/commonUrl/pageOfSearch?url=disclosure/list/search",
        "User-Agent": "Mozilla/5.0",
        "X-Requested-With": "XMLHttpRequest",
    }
    response = requests.post(endpoint, data=query, headers=request_headers, timeout=15)
    return response

def download_for_segments(segments: List[str], column: str, plate: str) -> List[Dict]:
    """Fetch every result page of each date segment and keep candidate announcements.

    For each segment the page count is read from the first response, then each
    page is requested with up to three attempts. An announcement is dropped
    when its title hits the unconditional veto list, is a pure equity
    offering, or is a non-convertible bond; the rest are de-duplicated on
    ``(secCode, adjunctUrl)`` within this call.

    Failures stay best-effort (nothing is raised), but a segment whose page
    count cannot be read, or a page that fails all attempts, is reported on
    stdout instead of being dropped silently.

    Args:
        segments: ``"start~end"`` date ranges to query one by one.
        column: Market value forwarded to ``get_report``.
        plate: Board value forwarded to ``get_report``.

    Returns:
        The accepted announcement dicts in the order they were received.
    """
    all_ann: List[Dict] = []
    seen = set()
    for date_range in segments:
        try:
            first = get_report(1, date_range, column, plate)
            # "totalpages" may be absent or null; coerce so the comparison below cannot raise.
            total = int(first.json().get("totalpages") or 0)
        except Exception as exc:
            print(f"⚠️ 无法获取总页数，已跳过：{column}/{plate} {date_range}（{exc}）")
            continue
        if total <= 0:
            continue

        for page in range(1, total + 1):
            for _ in range(3):  # retries
                try:
                    resp = get_report(page, date_range, column, plate)
                    resp.raise_for_status()
                    anns = resp.json().get("announcements", []) or []
                    for x in anns:
                        title = x.get("announcementTitle", "") or ""
                        t = norm(title)

                        # Unconditional veto
                        if is_summary_or_bad(t):
                            continue
                        # Clearly not heading towards a convertible bond
                        if is_pure_equity(t):
                            continue
                        if is_non_convertible_bond(t):
                            continue

                        key = (str(x.get("secCode", "")), str(x.get("adjunctUrl", "")))
                        if key in seen:
                            continue
                        seen.add(key)
                        all_ann.append(x)
                    break
                except Exception:
                    # Short pause before the next attempt.
                    time.sleep(1.2)
            else:
                # Every attempt failed: make the gap in the results visible.
                print(f"⚠️ 第 {page} 页重试 3 次仍失败，已跳过：{column}/{plate} {date_range}")
    return all_ann

def collect_all_offering() -> List[Dict]:
    """Download announcements for every market/board pair.

    The date range is sliced per calendar year to avoid truncated results;
    the first and last slices are clipped to ``DATE_BEGIN`` / ``DATE_END``.
    Results of all ``download_for_segments`` calls are concatenated as-is.
    """
    first_year, last_year = int(DATE_BEGIN[:4]), int(DATE_END[:4])
    segments: List[str] = [
        "{}~{}".format(
            DATE_BEGIN if year <= first_year else f"{year}-01-01",
            DATE_END if year >= last_year else f"{year}-12-31",
        )
        for year in range(first_year, last_year + 1)
    ]

    raw: List[Dict] = [
        ann
        for market in MARKETS
        for board in PLATES
        for ann in download_for_segments(segments, market, board)
    ]
    return raw

# ========= De-duplication (one announcement per company) =========
def pick_latest_per_company(anns: List[Dict], valid_secs: set) -> List[Dict]:
    """Choose one announcement per stock code in ``valid_secs``.

    Within a company, titles recognised by ``is_convertible`` are preferred;
    if there are none, all of the company's announcements compete. The winner
    is the entry with the largest ``ann_weight`` (first one on ties).
    """
    grouped: Dict[str, List[Dict]] = {}
    for ann in anns:
        code = str(ann.get("secCode", ""))
        if not code or code not in valid_secs:
            continue
        if code in grouped:
            grouped[code].append(ann)
        else:
            grouped[code] = [ann]

    picked: List[Dict] = []
    for candidates in grouped.values():
        # Convertible-bond titles first; otherwise fall back to the generic prospectuses.
        convertible = [c for c in candidates if is_convertible(c.get("announcementTitle","") or "")]
        pool = convertible or candidates
        picked.append(max(pool, key=ann_weight))
    return picked

# ========= 写 Excel =========
def write_selected_excel(anns: List[Dict]) -> None:
    """Write one worksheet row per mapping-file row, joined to its announcement.

    The mapping workbook (``mapping_file``) is read as strings; the stock code
    is the part of "正股代码" before the first ".". Announcements are indexed by
    ``secCode`` (a later duplicate code overwrites an earlier one). Rows with
    no matching announcement keep empty title/year/link cells so they can be
    checked by hand afterwards. The workbook is saved to ``OUT_XLSX``.
    """
    mapping = pd.read_excel(mapping_file, dtype=str).fillna("")
    mapping["sec_no_suf"] = mapping["正股代码"].str.split(".").str[0]

    by_code: Dict[str, Dict] = {}
    for ann in anns:
        by_code[str(ann.get("secCode", ""))] = ann

    book = openpyxl.Workbook()
    sheet = book.active
    sheet.title = SHEET
    sheet.append(["可转债代码","可转债名称","公司代码","公司简称","标题","年份","年报链接"])

    for _, rec in mapping.iterrows():
        code = rec.get("sec_no_suf", "")
        leading = [rec.get("代码", ""), rec.get("名称", ""), code, rec.get("正股名称", "")]

        ann = by_code.get(code)
        if not ann:
            sheet.append(leading + ["", "", ""])
            continue

        # Drop HTML tags first, then full-width colons, from the raw title.
        plain = re.sub(r"<.*?>","", ann.get("announcementTitle","")).replace("：","")
        # Prefer a year written in the title; otherwise use the publish time.
        year = extract_year_from_title(plain)
        if not year:
            year = year_from_timestamp_ms(ann.get("announcementTime",0))
        link = f"http://static.cninfo.com.cn/{ann.get('adjunctUrl','')}"
        sheet.append(leading + [f"《{plain}》", year, link])

    book.save(OUT_XLSX)
    print(f"✅ 已输出：{OUT_XLSX}（工作表：{SHEET}）")

# ========= 主流程 =========
def main():
    """Run the pipeline: load the company set, download, pick one per company, export."""
    mapping = pd.read_excel(mapping_file, dtype=str).fillna("")
    # Stock codes without the exchange suffix (text before the first ".").
    codes = mapping["正股代码"].str.split(".").str[0]
    valid_secs = set(codes.tolist())

    write_selected_excel(pick_latest_per_company(collect_all_offering(), valid_secs))
    print("---- 可转债募集说明书检索完成 ----")

# Entry point: run the pipeline only when this module is executed directly.
if __name__ == "__main__":
    main()


✅ 已输出：/Users/sam/Desktop/cninfo_output/募集说明书链接_选取公司【募集】.xlsx（工作表：募集说明书）
---- 可转债募集说明书检索完成 ----


In [5]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

"""
02_collect_offering_links.py
功能：按《正股映射结果.xlsx》的公司集合，从巨潮检索“募集说明书”，
     仅保留与“可转债”相关的最新版；若找不到可转债，则保留该公司最新一份募集说明书。
改进点：
- 仅使用 searchkey="募集说明书"
- 先全量拉取，再对“缺口公司”做按公司补抓（stock=代码,简称）
- 摘要不硬排除，但权重扣分，正文优先
- 同公司仅保留一条（修订/更新加分，其次公告时间近）
"""

import os
import re
import time
import requests
import openpyxl
import pandas as pd
from typing import List, Dict, Tuple, Iterable
from datetime import date

# ========= Tunable parameters =========
# Overall search window as ISO dates; the end is the day the script runs.
DATE_BEGIN = "2000-01-01"
DATE_END   = date.today().strftime("%Y-%m-%d")

# Output folder (created on load if missing) and the input mapping workbook.
output_dir   = os.path.expanduser("~/Desktop/cninfo_output")
os.makedirs(output_dir, exist_ok=True)
mapping_file = os.path.expanduser("~/Desktop/正股映射结果.xlsx")

# Result workbook path and the name of its worksheet.
OUT_XLSX = os.path.join(output_dir, "募集说明书链接_选取公司【募集】.xlsx")
SHEET    = "募集说明书"

# Markets / plates (cover as much as possible); sent as the "column" and
# "plate" form fields of each query.
MARKETS = ["szse", "shse"]
PLATES  = ["", "sz", "szmb", "szcy", "sh", "shmb", "shkcp", "bj"]

# Generic negative keywords (a title containing any of them is dropped outright).
# NOTE(review): matching is by substring of the normalized title, and "公司债券"
# is a substring of "可转换公司债券" — titles that use the full convertible-bond
# name match this list unless the caller masks that phrase first.
EXCLUDE_HARD = [
    "英文", "英文版", "annual", "summary", "概览", "备查", "备查文件",
    "取消", "已取消", "提示性公告", "公告", "更名提示",
    "监事会", "确认意见", "审核意见", "问询回复", "情况说明", "变更",
    "gdr", "全球存托凭证", "瑞士", "价格区间确定", "批准的公告",
    "封卷稿", "封卷", "附录",
    # Issuance types that are clearly not convertible bonds
    "优先股", "公司债券", "绿色公司债", "可续期公司债", "科技创新公司债",
    "扶贫专项", "乡村振兴",
    # Equity refinancing (not convertible bonds)
    "向特定对象发行股票", "以简易程序向特定对象发行股票", "发行a股股票"
]

# Positive keywords that mark a title as a convertible bond (any one suffices)
CB_POSITIVE = [
    "可转债", "可转债募集说明书",
    "可转换公司债", "可转换公司债券",
    "a股可转换公司债券",
    "向不特定对象发行可转换公司债券",
    "并在主板上市募集说明书", "并在创业板上市募集说明书", "并在科创板上市募集说明书"
]

# ========= 小工具 =========
_SPACE_RE = re.compile(r"[\s\u00A0\u3000]+")


def norm(s: str) -> str:
    """Return ``s`` as text with all whitespace removed and letters lower-cased.

    Falsy input (``None``, ``""``, ``0``) normalizes to the empty string.
    Besides ordinary whitespace, the no-break space and the full-width
    (ideographic) space are removed as well.
    """
    text = str(s) if s else ""
    squeezed = _SPACE_RE.sub("", text.strip())
    return squeezed.lower()

def strip_html(s: str) -> str:
    """Remove every ``<...>`` tag-like span from ``s``; falsy input yields ""."""
    raw = str(s) if s else ""
    tag_pattern = re.compile(r"<.*?>")
    return tag_pattern.sub("", raw)

def any_kw_in(text: str, kws: Iterable[str]) -> bool:
    """Report whether any non-empty keyword occurs in ``text``.

    Both the text and each keyword go through ``norm`` first, so the check
    ignores whitespace and letter case. Falsy keywords are skipped.
    """
    haystack = norm(text)
    for kw in kws:
        if kw and norm(kw) in haystack:
            return True
    return False

def extract_year_from_title(title: str) -> str:
    """Return the first standalone four-digit year (19xx/20xx) in ``title``.

    HTML tags are stripped first. The year must not be adjacent to other
    digits, so fragments of longer numbers (an amount such as "120000", a
    six-digit code such as "002019") are not mistaken for a year.

    Returns:
        The year as a string, or "" when the title carries no such year.
    """
    m = re.search(r"(?<!\d)(19\d{2}|20\d{2})(?!\d)", strip_html(title))
    return m.group(1) if m else ""

def year_from_timestamp_ms(ts: int) -> str:
    """Return the local-time calendar year of an epoch timestamp in milliseconds.

    Args:
        ts: Milliseconds since the Unix epoch; anything ``int()`` accepts.

    Returns:
        The four-digit year as a string, or "" when ``ts`` is missing,
        non-numeric, non-positive, or out of range for the platform.
        Non-positive values count as "no timestamp": callers fall back to 0
        when the field is absent, which would otherwise be reported as 1970.
    """
    try:
        millis = int(ts)
    except (TypeError, ValueError, OverflowError):
        return ""
    if millis <= 0:
        return ""
    try:
        # NOTE(review): the year is taken in the machine's local timezone —
        # confirm that is intended for timestamps close to a year boundary.
        return time.strftime("%Y", time.localtime(millis / 1000))
    except (OverflowError, OSError, ValueError):
        return ""

def is_hard_excluded(title: str) -> bool:
    """Return True when ``title`` contains any EXCLUDE_HARD keyword.

    Keywords are matched as substrings, and the exclusion keyword "公司债券"
    is itself a substring of "可转换公司债券" (the full name of a convertible
    bond). That phrase is therefore collapsed to "可转债" before matching, so
    convertible-bond prospectuses are not rejected as plain corporate bonds.
    Every other keyword still applies to the rest of the title.
    """
    masked = norm(title).replace("可转换公司债券", "可转债")
    return any_kw_in(masked, EXCLUDE_HARD)

def is_cb_title(title: str) -> bool:
    """Tell whether the title itself marks the document as a convertible bond.

    True when any CB_POSITIVE keyword occurs in the title; the matching is
    delegated to ``any_kw_in``.
    """
    looks_convertible = any_kw_in(title, CB_POSITIVE)
    return looks_convertible

def ann_weight(it: Dict) -> Tuple[int, int, int]:
    """Sort key for ranking a company's announcements; a larger key wins.

    The key is ``(score, timestamp, title_length)``:
      * score: -8 when the normalized title contains "摘要" (summary), plus 10
        when it contains a revision marker (修订 / 修正 / 更新 / 更正);
      * timestamp: ``announcementTime`` as an int, 0 when it cannot be parsed;
      * title_length: length of the normalized title, a final tie-breaker
        (original note: very short titles are usually summaries/overviews).
    """
    normalized = norm(strip_html(it.get("announcementTitle", "") or ""))

    score = 0
    if "摘要" in normalized:
        score -= 8
    if any_kw_in(normalized, ["修订", "修正", "更新", "更正"]):
        score += 10

    try:
        stamp = int(it.get("announcementTime", 0))
    except Exception:
        stamp = 0

    return (score, stamp, len(normalized))

# ========= API access =========
# Announcement query URL that get_report() POSTs to.
CNINFO_URL = "http://www.cninfo.com.cn/new/hisAnnouncement/query"
# Headers sent with every query; the payload is declared as form-encoded.
HEADERS = {
    "Accept": "*/*",
    "Accept-Language": "zh-CN,zh;q=0.9",
    "Content-Type": "application/x-www-form-urlencoded; charset=UTF-8",
    "Origin": "http://www.cninfo.com.cn",
    "Referer": "http://www.cninfo.com.cn/new/commonUrl/pageOfSearch?url=disclosure/list/search",
    "User-Agent": "Mozilla/5.0",
    "X-Requested-With": "XMLHttpRequest"
}

def get_report(page_num: int, date_range: str, column: str, plate: str, stock: str = "") -> requests.Response:
    """POST one page of the "募集说明书" search to CNINFO_URL and return the response.

    Args:
        page_num: Page number to request (30 results per page).
        date_range: Value of the ``seDate`` form field; callers pass "start~end".
        column: Value of the ``column`` form field (callers pass MARKETS entries).
        plate: Value of the ``plate`` form field (callers pass PLATES entries).
        stock: Optional ``stock`` form field, sent only when non-empty. The
            original note gives "603898,好莱客" as an example and says it
            markedly improves the per-company hit rate.

    Returns:
        The raw ``requests.Response``; its status is not checked here.
    """
    form = {
        "pageNum":   page_num,
        "pageSize":  30,
        "column":    column,
        "tabName":   "fulltext",
        "category":  "",
        "plate":     plate,
        "searchkey": "募集说明书",
        "secid":     "",
        "trade":     "",
        "seDate":    date_range,
        "sortName":  "code",
        "sortType":  "asc",
        "isHLtitle": "false",
    }
    # The company filter is optional and omitted entirely when blank.
    company_filter = {"stock": stock} if stock else {}
    form.update(company_filter)
    response = requests.post(CNINFO_URL, data=form, headers=HEADERS, timeout=15)
    return response

def _query_page(page: int, date_range: str, column: str, plate: str,
                stock: str = "", attempts: int = 3) -> Dict:
    """Fetch one result page and return its JSON object, retrying on failure.

    Returns {} (after printing a warning) when every attempt fails, so the
    caller can carry on with the remaining pages and segments.
    """
    for attempt in range(attempts):
        if attempt:
            time.sleep(1.0)  # brief pause before each retry
        try:
            resp = get_report(page, date_range, column, plate, stock)
            resp.raise_for_status()
            data = resp.json()
        except (requests.RequestException, OSError, ValueError):
            # Network/HTTP failure or a body that is not valid JSON: retry.
            continue
        if isinstance(data, dict):
            return data
    print(f"[warn] giving up on page {page} of {date_range} ({column}/{plate})")
    return {}


def paged_fetch(segments: List[str], column: str, plate: str, stock: str = "") -> List[Dict]:
    """Download and filter all result pages for each date window.

    Args:
        segments: Date windows in "start~end" form, queried one after another.
        column: Market passed to ``get_report``.
        plate: Plate passed to ``get_report``.
        stock: Optional company filter passed to ``get_report``.

    Returns:
        Announcements whose tag-stripped title contains "募集说明书" and is not
        hard-excluded, de-duplicated on (secCode, adjunctUrl) within this call.

    The first page of a window supplies both ``totalpages`` and the first
    batch of announcements, so it is requested only once and with the same
    retries as later pages. Announcements already present on that page are
    used even if ``totalpages`` is reported as 0 or is missing. Pages that
    still fail after the retries are skipped (best-effort).
    """
    out: List[Dict] = []
    seen = set()
    for date_range in segments:
        data = _query_page(1, date_range, column, plate, stock)
        try:
            total = int(data.get("totalpages") or 0)
        except (TypeError, ValueError):
            total = 0

        page = 1
        while True:
            for x in data.get("announcements") or []:
                title = strip_html(x.get("announcementTitle", "") or "")
                # Only prospectus titles that survive the hard exclusions.
                if "募集说明书" not in title or is_hard_excluded(title):
                    continue
                key = (str(x.get("secCode", "")), str(x.get("adjunctUrl", "")))
                if key not in seen:
                    seen.add(key)
                    out.append(x)
            page += 1
            if page > total:
                break
            data = _query_page(page, date_range, column, plate, stock)
    return out

def year_segments(begin: str = "", end: str = "") -> List[str]:
    """Split a date range into one "start~end" window per calendar year.

    Args:
        begin: First day as "YYYY-MM-DD"; DATE_BEGIN is used when empty.
        end: Last day as "YYYY-MM-DD"; DATE_END is used when empty.

    Returns:
        Windows in chronological order. The first window starts at ``begin``
        and the last ends at ``end``; every other boundary is 1 January or
        31 December. Empty when ``begin`` lies in a later year than ``end``.
    """
    begin = begin or DATE_BEGIN
    end = end or DATE_END
    first_year, last_year = int(begin[:4]), int(end[:4])

    segs: List[str] = []
    for year in range(first_year, last_year + 1):
        lower = begin if year == first_year else f"{year}-01-01"
        upper = end if year == last_year else f"{year}-12-31"
        segs.append(f"{lower}~{upper}")
    return segs

def collect_all_raw(stock: str = "") -> List[Dict]:
    """Fetch filtered announcements for every (market, plate) pair.

    All yearly windows from ``year_segments`` are queried for each pair, and
    ``stock`` is handed to ``paged_fetch`` unchanged. The per-pair results are
    concatenated as they come; nothing is de-duplicated across pairs here.
    """
    windows = year_segments()
    batches = (
        paged_fetch(windows, market, plate, stock)
        for market in MARKETS
        for plate in PLATES
    )
    return [ann for batch in batches for ann in batch]

# ========= 选择逻辑 =========
def choose_one_for_company(cands: List[Dict]) -> Dict:
    """Pick the single best announcement from one company's candidates.

    Candidates whose tag-stripped title passes ``is_cb_title`` take priority;
    when none does, every candidate competes. Within the chosen pool the
    winner is the entry with the largest ``ann_weight``.

    Returns:
        The winning announcement, or {} when ``cands`` is empty.
    """
    if not cands:
        return {}

    convertible = []
    for cand in cands:
        plain_title = strip_html(cand.get("announcementTitle", "") or "")
        if is_cb_title(plain_title):
            convertible.append(cand)

    # No title mentions a convertible bond: rank all candidates instead.
    pool = convertible if convertible else cands
    return max(pool, key=ann_weight)

def filter_latest_per_company(all_anns: List[Dict], valid_secs: set) -> Dict[str, Dict]:
    """Map each known company code to its single chosen announcement.

    Announcements are bucketed by ``secCode``; those with an empty code or a
    code missing from ``valid_secs`` are ignored. ``choose_one_for_company``
    then selects one entry per bucket, and buckets for which it returns a
    falsy value are left out of the result.
    """
    buckets: Dict[str, List[Dict]] = {}
    for ann in all_anns:
        code = str(ann.get("secCode", "") or "")
        if code == "" or code not in valid_secs:
            continue
        if code not in buckets:
            buckets[code] = []
        buckets[code].append(ann)

    selections = (
        (code, choose_one_for_company(group)) for code, group in buckets.items()
    )
    return {code: ann for code, ann in selections if ann}

# ========= 写 Excel =========
def write_selected_excel(chosen: Dict[str, Dict]) -> None:
    """Export one row per mapping-file row, with its chosen prospectus if any.

    The mapping workbook (``mapping_file``) is read as strings and the stock
    code is the part of "正股代码" before the first ".". For a code present in
    ``chosen`` the row carries the tag-stripped title wrapped in 《》, a year
    (from the title, else from ``announcementTime``) and the static download
    link; otherwise those three cells are empty. Saved to ``OUT_XLSX``.
    """
    def _detail_cells(ann):
        # Title / year / link cells for one announcement (blank when absent).
        if not ann:
            return ["", "", ""]
        plain = strip_html(ann.get("announcementTitle", "") or "")
        year = extract_year_from_title(plain)
        if not year:
            year = year_from_timestamp_ms(ann.get("announcementTime", 0))
        link = f"http://static.cninfo.com.cn/{ann.get('adjunctUrl', '')}"
        return [f"《{plain}》", year, link]

    mapping = pd.read_excel(mapping_file, dtype=str).fillna("")
    mapping["sec_no_suf"] = mapping["正股代码"].str.split(".").str[0]

    book = openpyxl.Workbook()
    sheet = book.active
    sheet.title = SHEET
    sheet.append(["可转债代码","可转债名称","公司代码","公司简称","标题","年份","年报链接"])

    for _, rec in mapping.iterrows():
        code = rec.get("sec_no_suf", "")
        identity = [rec.get("代码", ""), rec.get("名称", ""), code, rec.get("正股名称", "")]
        sheet.append(identity + _detail_cells(chosen.get(code)))

    book.save(OUT_XLSX)
    print(f"✅ 已输出：{OUT_XLSX}（工作表：{SHEET}）")

# ========= 主流程 =========
def main():
    """Run the pipeline: bulk crawl, per-company back-fill, then export.

    Steps:
      1. Load the company set from ``mapping_file`` (stock code = text before
         the first "." in "正股代码"); blank codes are ignored.
      2. Crawl all markets/plates once and keep one announcement per company.
      3. For every company still without a result, query again with the
         ``stock`` filter ("code,name", or just the code when no name is
         known) and apply the same selection.
      4. Write the workbook.
    """
    # Company set
    df_map = pd.read_excel(mapping_file, dtype=str).fillna("")
    df_map["sec_no_suf"] = df_map["正股代码"].str.split(".").str[0]
    valid_secs = set(df_map["sec_no_suf"].tolist())
    # A blank code would turn the back-fill below into an unfiltered re-crawl.
    valid_secs.discard("")
    name_by_sec = dict(zip(df_map["sec_no_suf"], df_map["正股名称"]))

    # 1) Bulk crawl (yearly windows, all markets/plates)
    raw_all = collect_all_raw()

    # 2) Keep a single announcement per company from the bulk result
    chosen = filter_latest_per_company(raw_all, valid_secs)

    # 3) Back-fill companies that are still missing, one query set per company
    for sec in sorted(valid_secs):
        if sec in chosen:
            continue
        comp = name_by_sec.get(sec, "")
        stock_q = f"{sec},{comp}" if comp else sec
        # Keep only this company's announcements: the response is not
        # otherwise checked against the requested code, and a foreign
        # document must never be attributed to ``sec``.
        extra = [
            ann for ann in collect_all_raw(stock_q)
            if str(ann.get("secCode", "") or "") == sec
        ]
        pick = choose_one_for_company(extra)
        if pick:
            chosen[sec] = pick

    # 4) Export
    write_selected_excel(chosen)
    print("---- 可转债募集说明书检索完成 ----")

# Entry point: run the pipeline only when this module is executed directly.
if __name__ == "__main__":
    main()


KeyboardInterrupt: 