In [1]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

"""
02_collect_offering_links.py
功能：按《正股映射结果.xlsx》的公司集合，从巨潮接口检索“募集说明书”公告，
     并导出 Excel（格式与年报版一致，只是标题/年份/链接改为募集说明书对应内容）。
"""

import os
import re
import time
import requests
import openpyxl
import pandas as pd
from typing import List, Dict, Tuple
from datetime import date

# ========= Tunable parameters =========
# Keywords: set to "募集说明书" (offering prospectus)
KEYWORDS = ["募集说明书"]
EXTRA_KWS: List[str] = []   # optional extra keywords, e.g. "更新稿" or "摘要"

# Date range (begin / end); the end is the day the script runs
DATE_BEGIN = "2000-01-01"
DATE_END   = date.today().strftime("%Y-%m-%d")

# The output directory is created at import time if it does not exist
output_dir   = os.path.expanduser("~/Desktop/cninfo_output")
os.makedirs(output_dir, exist_ok=True)
mapping_file = os.path.expanduser("~/Desktop/正股映射结果.xlsx")

# Output workbook and sheet name
OUT_XLSX = os.path.join(output_dir, "募集说明书链接_选取公司【募集】.xlsx")
SHEET    = "募集说明书"

# Markets / plates sent as the query's "column" and "plate" fields
MARKETS = ["szse", "shse"]
PLATES  = ["", "sz", "szmb", "szcy", "sh", "shmb", "shkcp", "bj"]

# Noise words: a title containing any of these is rejected
EXCLUDE_KWS = [
    "英文", "英文版", "Annual", "annual", "Summary", "summary",
    "摘要", "摘要版", "取消", "已取消", "提示性公告", "更名提示", "B股", "H股"
]

# ========= 工具函数 =========
_SPACE_RE = re.compile(r"[\s\u00A0\u3000]+")


def norm(s: str) -> str:
    """Return *s* as a string with all whitespace removed.

    Falsy input (``None``, ``""``, ``0``) yields ``""``. Non-breaking and
    full-width spaces are removed as well.
    """
    text = str(s) if s else ""
    return _SPACE_RE.sub("", text.strip())

def any_kw_in(text: str, kws: List[str]) -> bool:
    """Return True if any non-empty keyword occurs in *text*.

    Both sides are passed through ``norm`` first, so the comparison ignores
    whitespace.
    """
    haystack = norm(text)
    for kw in kws:
        if kw and norm(kw) in haystack:
            return True
    return False

# Tried in order: a 20xx year right before "募集说明书" (optionally followed
# by 年 / 度), then any 19xx / 20xx four-digit run.
_TITLE_YEAR_PATS = [
    re.compile(r"(20\d{2})\s*年?\s*度?\s*募\s*集\s*说\s*明\s*书"),
    re.compile(r"(20\d{2})\s*年?\s*募\s*集\s*说\s*明\s*书"),
    re.compile(r"(19\d{2}|20\d{2})")
]


def extract_year_from_title(title: str) -> str:
    """Return the year captured by the first matching pattern, or ``""``."""
    compact = norm(title)
    hits = (pat.search(compact) for pat in _TITLE_YEAR_PATS)
    return next((hit.group(1) for hit in hits if hit), "")

def year_from_timestamp_ms(ts: int) -> str:
    """Return the local-time year of a millisecond epoch timestamp.

    Any value that cannot be converted yields ``""``.
    """
    try:
        local_tm = time.localtime(int(ts) / 1000)
        year = time.strftime("%Y", local_tm)
    except Exception:
        return ""
    return year

def is_target_offering(title: str) -> bool:
    """Return True if *title* has a search keyword and no exclusion word."""
    compact = norm(title)
    wanted = [*KEYWORDS, *EXTRA_KWS]
    return any_kw_in(compact, wanted) and not any_kw_in(compact, EXCLUDE_KWS)

def ann_weight(it: Dict) -> Tuple[int, int]:
    """Return a ranking key ``(revision_bonus, announcement_time)``.

    The bonus is 10 when the title mentions 更新 / 修订 / 更正, otherwise 0.
    An unparsable ``announcementTime`` counts as 0.
    """
    compact = norm(it.get("announcementTitle", ""))
    revision_bonus = 10 if any_kw_in(compact, ["更新", "修订", "更正"]) else 0
    try:
        published = int(it.get("announcementTime", 0))
    except Exception:
        published = 0
    return (revision_bonus, published)

# ========= 调接口 =========
def get_report(page_num: int, date_range: str, column: str, plate: str, searchkey: str="") -> requests.Response:
    """POST one page of a cninfo historical-announcement query.

    Args:
        page_num: Page number, sent as ``pageNum``.
        date_range: Date range string, sent as ``seDate``.
        column: Market identifier, sent as ``column``.
        plate: Plate identifier, sent as ``plate``.
        searchkey: Keyword, sent as ``searchkey``.

    Returns:
        The raw ``requests.Response``. The status is not checked here, and the
        request times out after 15 seconds.
    """
    endpoint = "http://www.cninfo.com.cn/new/hisAnnouncement/query"
    # Form fields: 30 results per page, sort fields "code" / "asc".
    form = {
        "pageNum":   page_num,
        "pageSize":  30,
        "column":    column,
        "tabName":   "fulltext",
        "category":  "",
        "plate":     plate,
        "searchkey": searchkey,
        "secid":     "",
        "trade":     "",
        "seDate":    date_range,
        "sortName":  "code",
        "sortType":  "asc",
        "isHLtitle": "false"
    }
    # Browser-like AJAX headers for the site's search page.
    ajax_headers = {
        "Accept":           "*/*",
        "Accept-Language":  "zh-CN,zh;q=0.9",
        "Content-Type":     "application/x-www-form-urlencoded; charset=UTF-8",
        "Origin":           "http://www.cninfo.com.cn",
        "Referer":          "http://www.cninfo.com.cn/new/commonUrl/pageOfSearch?url=disclosure/list/search",
        "User-Agent":       "Mozilla/5.0",
        "X-Requested-With": "XMLHttpRequest"
    }
    response = requests.post(endpoint, data=form, headers=ajax_headers, timeout=15)
    return response

def download_for_segments(segments: List[str], column: str, plate: str, searchkey: str) -> List[Dict]:
    """Fetch every result page for each date segment and keep matching announcements.

    For each segment, page 1 is requested to learn the page count, then pages
    1..N are fetched with up to three attempts each. Announcements are filtered
    with ``is_target_offering`` and de-duplicated on ``(secCode, adjunctUrl)``
    across all segments of this call.

    Args:
        segments: Date-range strings passed through as the query's ``seDate``.
        column: Market identifier forwarded to ``get_report``.
        plate: Plate identifier forwarded to ``get_report``.
        searchkey: Search keyword forwarded to ``get_report``.

    Returns:
        Accepted announcement dicts in first-seen order. A segment whose page
        count cannot be read is skipped. A page that fails three times is
        skipped with a printed warning.
    """
    all_ann: List[Dict] = []
    seen = set()
    for date_range in segments:
        try:
            first = get_report(1, date_range, column, plate, searchkey)
            first.raise_for_status()
            # A null or missing "totalpages" is coerced to 0 here, so the
            # comparison below cannot raise TypeError.
            total = int(first.json().get("totalpages") or 0)
        except Exception:
            continue
        if total <= 0:
            continue

        for page in range(1, total + 1):
            for _ in range(3):
                try:
                    resp = get_report(page, date_range, column, plate, searchkey)
                    resp.raise_for_status()
                    anns = resp.json().get("announcements", []) or []
                    for x in anns:
                        title = x.get("announcementTitle", "")
                        if not is_target_offering(title):
                            continue
                        key = (str(x.get("secCode", "")), str(x.get("adjunctUrl", "")))
                        if key in seen:
                            continue
                        seen.add(key)
                        all_ann.append(x)
                    break
                except Exception:
                    time.sleep(1.2)
            else:
                # All attempts failed: report the gap instead of hiding it.
                print(f"⚠️ 第 {page} 页获取失败，已跳过：{date_range} {column}/{plate}")
    return all_ann

def collect_all_offering() -> List[Dict]:
    """Download matching announcements for every market, plate and keyword.

    The configured date range is split into one ``"start~end"`` segment per
    calendar year. The first and last segments are clipped to ``DATE_BEGIN``
    and ``DATE_END``.
    """
    first_year, last_year = int(DATE_BEGIN[:4]), int(DATE_END[:4])
    segments: List[str] = []
    for year in range(first_year, last_year + 1):
        lo = DATE_BEGIN if year <= first_year else f"{year}-01-01"
        hi = DATE_END if year >= last_year else f"{year}-12-31"
        segments.append(f"{lo}~{hi}")

    # Unique normalized keywords, falling back to the default when none exist.
    terms = list({norm(k) for k in (KEYWORDS + EXTRA_KWS) if k}) or ["募集说明书"]

    collected: List[Dict] = []
    for market in MARKETS:
        for plate in PLATES:
            for term in terms:
                collected.extend(download_for_segments(segments, market, plate, term))
    return collected

# ========= 去重 =========
def filter_latest_per_company(anns: List[Dict], valid_secs: set) -> List[Dict]:
    """Keep the highest-weighted announcement per ``(secCode, year)``.

    Announcements whose code is not in *valid_secs* are dropped. So are those
    with no year in either the title or the announcement time.
    """
    best: Dict[Tuple[str, str], Dict] = {}
    for ann in anns:
        code = str(ann.get("secCode", ""))
        if code not in valid_secs:
            continue
        year = (
            extract_year_from_title(ann.get("announcementTitle", ""))
            or year_from_timestamp_ms(ann.get("announcementTime", 0))
        )
        if not year:
            continue
        slot = (code, year)
        incumbent = best.get(slot)
        if incumbent is None or ann_weight(ann) > ann_weight(incumbent):
            best[slot] = ann
    return list(best.values())

# ========= 写 Excel =========
def write_selected_excel(anns: List[Dict]) -> None:
    """Write the selected announcements to ``OUT_XLSX``, driven by the mapping file.

    Each mapping row yields one sheet row per matched ``(company, year)``
    announcement, ordered by year. A company with no match gets one row with
    empty title, year and link cells.

    Args:
        anns: Announcement dicts; ``secCode``, ``announcementTitle``,
            ``announcementTime`` and ``adjunctUrl`` are read.
    """
    df_map = pd.read_excel(mapping_file, dtype=str).fillna("")
    df_map["sec_no_suf"] = df_map["正股代码"].str.split(".").str[0]

    # (sec, year) -> announcement; a later duplicate key overwrites the earlier one.
    ann_dict: Dict[Tuple[str, str], Dict] = {}
    for it in anns:
        sec = str(it.get("secCode", ""))
        yr  = extract_year_from_title(it.get("announcementTitle", "")) or year_from_timestamp_ms(it.get("announcementTime", 0))
        if sec and yr:
            ann_dict[(sec, yr)] = it

    # Group by company once, so each mapping row is a dict lookup instead of
    # a scan over every announcement.
    by_sec: Dict[str, List[Tuple[str, Dict]]] = {}
    for (sec, yr), it in ann_dict.items():
        by_sec.setdefault(sec, []).append((yr, it))
    for entries in by_sec.values():
        entries.sort(key=lambda pair: pair[0])

    wb = openpyxl.Workbook()
    ws = wb.active
    ws.title = SHEET
    ws.append(["可转债代码","可转债名称","公司代码","公司简称","标题","年份","年报链接"])

    for _, row in df_map.iterrows():
        bond_code = row.get("代码", "")
        bond_name = row.get("名称", "")
        sec       = row.get("sec_no_suf", "")
        comp_name = row.get("正股名称", "")

        matched = by_sec.get(sec)
        if matched:
            for yr, it in matched:
                # Strip markup tags and full-width colons from the title.
                raw_title = re.sub(r"<.*?>","", it.get("announcementTitle","")).replace("：","")
                title = f"《{raw_title}》"
                url   = f"http://static.cninfo.com.cn/{it.get('adjunctUrl','')}"
                ws.append([bond_code, bond_name, sec, comp_name, title, yr, url])
        else:
            ws.append([bond_code, bond_name, sec, comp_name, "", "", ""])

    wb.save(OUT_XLSX)
    print(f"✅ 已输出：{OUT_XLSX}（工作表：{SHEET}）")

# ========= 主流程 =========
def main():
    """Read the mapping file, collect and filter announcements, write the workbook."""
    mapping = pd.read_excel(mapping_file, dtype=str).fillna("")
    # Stock codes with the exchange suffix (the part after ".") removed.
    stock_codes = mapping["正股代码"].str.split(".").str[0]
    valid_secs = set(stock_codes.tolist())

    downloaded = collect_all_offering()
    chosen = filter_latest_per_company(downloaded, valid_secs)
    write_selected_excel(chosen)
    print("---- 募集说明书检索完成 ----")


if __name__ == "__main__":
    main()


✅ 已输出：/Users/sam/Desktop/cninfo_output/募集说明书链接_选取公司【募集】.xlsx（工作表：募集说明书）
---- 募集说明书检索完成 ----


In [2]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

"""
02_collect_offering_links.py  (仅可转债)
功能：
  - 按《正股映射结果.xlsx》的公司集合，从巨潮接口检索“募集说明书”，
  - 在结果中仅保留【可转债募集说明书】，显式排除摘要/附录/公告/意见/GDR/优先股/普通公司债/股票再融资等，
  - 公司维度仅保留一条最新版（修订/修正/更新优先，其次 announcementTime 更近），
  - 导出 Excel（与年报版列顺序一致），每家公司只一行。
"""

import os
import re
import time
import requests
import openpyxl
import pandas as pd
from typing import List, Dict, Tuple
from datetime import date

# ========= Tunable parameters =========
# 1) Search term: a single searchkey is used, as requested
KEYWORDS = ["募集说明书"]          # fixed single search term (not referenced by the functions in this cell)
EXTRA_KWS: List[str] = []         # no extra terms, to keep the request volume small

# 2) Date range (begin / end); the end is the day the script runs
DATE_BEGIN = "2000-01-01"
DATE_END   = date.today().strftime("%Y-%m-%d")

# 3) I/O; the output directory is created at import time
output_dir   = os.path.expanduser("~/Desktop/cninfo_output")
os.makedirs(output_dir, exist_ok=True)
mapping_file = os.path.expanduser("~/Desktop/正股映射结果.xlsx")

OUT_XLSX = os.path.join(output_dir, "募集说明书链接_选取公司【募集】.xlsx")
SHEET    = "募集说明书"

# 4) Markets / plates sent as the query's "column" and "plate" fields
MARKETS = ["szse", "shse"]
PLATES  = ["", "sz", "szmb", "szcy", "sh", "shmb", "shkcp", "bj"]

# ========= 规则 =========
_SPACE_RE = re.compile(r"[\s\u00A0\u3000]+")


def norm(s: str) -> str:
    """Return *s* as a string with all whitespace removed.

    Falsy input yields ``""``. Non-breaking and full-width spaces are removed
    as well.
    """
    text = str(s) if s else ""
    return _SPACE_RE.sub("", text.strip())

def any_kw_in(text: str, kws: List[str]) -> bool:
    """Return True if any non-empty keyword occurs in *text*, ignoring whitespace."""
    haystack = norm(text)
    for kw in kws:
        if kw and norm(kw) in haystack:
            return True
    return False

# Explicit exclusions: summaries, appendices, notices, opinions, inquiries,
# GDR, preferred shares, equity refinancing, and so on.
#
# Plain corporate bonds are NOT listed here on purpose. The phrase
# "公司债券募集说明书" is a substring of "可转换公司债券募集说明书", so listing
# it would reject every fully spelled convertible-bond prospectus.
# is_plain_corporate_bond() handles that case with the required "no 可转" check.
EXCLUDE_KWS = [
    # Document forms
    "摘要", "摘要版", "附录", "概览", "封卷稿", "封卷版",
    "公告", "情况说明", "说明的公告", "更正公告", "更正后",
    "审核意见", "确认意见", "监事会", "全体监事", "承诺", "问询回复", "反馈意见",
    # Other security types
    "优先股", "境内优先股",
    # Equity refinancing
    "向特定对象发行股票募集说明书", "以简易程序向特定对象发行股票募集说明书",
    "向特定对象发行A股股票募集说明书", "发行证券募集说明书", "并在主板上市募集说明书",
    # GDR / Swiss listing
    "GDR", "全球存托凭证", "瑞士交易所", "瑞士证券交易所", "价格区间确定", "批准的公告",
    # English versions and other noise
    "英文", "英文版", "Annual", "annual", "Summary", "summary",
    "取消", "已取消", "提示性公告", "更名提示", "B股", "H股"
]

# ——判定“普通公司债券”的辅助（只有含“公司债券募集说明书”且不含“可转/可转换”才视为普通债并排除）
def is_plain_corporate_bond(t: str) -> bool:
    """Return True for a corporate-bond prospectus title with no "可转" marker."""
    compact = norm(t)
    if "公司债券募集说明书" not in compact:
        return False
    # "可转换" contains "可转", so testing "可转" alone covers both spellings.
    return "可转" not in compact

# Inclusion rules for convertible-bond prospectus titles; one match is enough.
CB_INCLUDE_RES = [
    re.compile(r"可转(换)?(公司)?债(券)?募集说明书"),                 # e.g. 可转债募集说明书 / 可转换公司债券募集说明书
    re.compile(r"公开发行.*可转(换)?(公司)?债(券)?.*募集说明书"),     # public offering ... convertible bond ... prospectus
    re.compile(r"向不特定对象发行.*可转(换)?(公司)?债(券)?.*募集说明书"),
]


def is_target_cb_offering(title: str) -> bool:
    """Return True only for convertible-bond prospectus titles.

    Plain corporate bonds and titles containing any ``EXCLUDE_KWS`` entry are
    rejected before the inclusion patterns are tried.
    """
    compact = norm(title)
    rejected = is_plain_corporate_bond(compact) or any_kw_in(compact, EXCLUDE_KWS)
    if rejected:
        return False
    for pattern in CB_INCLUDE_RES:
        if pattern.search(compact):
            return True
    return False

# Year extraction, used for display only (de-duplication ignores the year).
# Tried in order: a 20xx year right before "募集说明书", then any 19xx / 20xx run.
_TITLE_YEAR_PATS = [
    re.compile(r"(20\d{2})\s*年?\s*度?\s*募\s*集\s*说\s*明\s*书"),
    re.compile(r"(20\d{2})\s*年?\s*募\s*集\s*说\s*明\s*书"),
    re.compile(r"(19\d{2}|20\d{2})")
]


def extract_year_from_title(title: str) -> str:
    """Return the year captured by the first matching pattern, or ``""``."""
    compact = norm(title)
    hits = (pat.search(compact) for pat in _TITLE_YEAR_PATS)
    return next((hit.group(1) for hit in hits if hit), "")

def year_from_timestamp_ms(ts: int) -> str:
    """Return the local-time year of a millisecond epoch timestamp, or ``""``."""
    try:
        local_tm = time.localtime(int(ts) / 1000)
        year = time.strftime("%Y", local_tm)
    except Exception:
        return ""
    return year

# Weighting: revisions and updates rank first, then small bonuses for the
# issue / registration / review / filing drafts; ties go to the newer time.
def ann_weight(it: Dict) -> Tuple[int, int]:
    """Return a ranking key ``(score, announcement_time)``.

    The score adds 10 for 修订/修正/更正/更新, 3 for 发行稿 and 1 for
    注册稿/上会稿/申报稿. An unparsable time counts as 0.
    """
    compact = norm(it.get("announcementTitle", ""))
    tiers = (
        (["修订", "修正", "更正", "更新"], 10),
        (["发行稿"], 3),
        (["注册稿", "上会稿", "申报稿"], 1),
    )
    score = sum(bonus for kws, bonus in tiers if any_kw_in(compact, kws))
    try:
        published = int(it.get("announcementTime", 0))
    except Exception:
        published = 0
    return (score, published)

# ========= 调接口 =========
def get_report(page_num: int, date_range: str, column: str, plate: str, searchkey: str="") -> requests.Response:
    """POST one page of a cninfo historical-announcement query.

    Args:
        page_num: Page number, sent as ``pageNum``.
        date_range: Date range string, sent as ``seDate``.
        column: Market identifier, sent as ``column``.
        plate: Plate identifier, sent as ``plate``.
        searchkey: Keyword, sent as ``searchkey``.

    Returns:
        The raw ``requests.Response``. The status is not checked here, and the
        request times out after 15 seconds.
    """
    endpoint = "http://www.cninfo.com.cn/new/hisAnnouncement/query"
    # Form fields: 30 results per page, sort fields "code" / "asc".
    form = {
        "pageNum":   page_num,
        "pageSize":  30,
        "column":    column,
        "tabName":   "fulltext",
        "category":  "",
        "plate":     plate,
        "searchkey": searchkey,
        "secid":     "",
        "trade":     "",
        "seDate":    date_range,
        "sortName":  "code",
        "sortType":  "asc",
        "isHLtitle": "false"
    }
    # Browser-like AJAX headers for the site's search page.
    ajax_headers = {
        "Accept":           "*/*",
        "Accept-Language":  "zh-CN,zh;q=0.9",
        "Content-Type":     "application/x-www-form-urlencoded; charset=UTF-8",
        "Origin":           "http://www.cninfo.com.cn",
        "Referer":          "http://www.cninfo.com.cn/new/commonUrl/pageOfSearch?url=disclosure/list/search",
        "User-Agent":       "Mozilla/5.0",
        "X-Requested-With": "XMLHttpRequest"
    }
    response = requests.post(endpoint, data=form, headers=ajax_headers, timeout=15)
    return response

def download_for_segments(segments: List[str], column: str, plate: str, searchkey: str) -> List[Dict]:
    """Fetch every result page for each date segment and keep convertible-bond prospectuses.

    For each segment, page 1 is requested to learn the page count, then pages
    1..N are fetched with up to three attempts each. Announcements are filtered
    with ``is_target_cb_offering`` and de-duplicated on ``(secCode, adjunctUrl)``
    across all segments of this call.

    Args:
        segments: Date-range strings passed through as the query's ``seDate``.
        column: Market identifier forwarded to ``get_report``.
        plate: Plate identifier forwarded to ``get_report``.
        searchkey: Search keyword forwarded to ``get_report``.

    Returns:
        Accepted announcement dicts in first-seen order. A segment whose page
        count cannot be read is skipped. A page that fails three times is
        skipped with a printed warning.
    """
    all_ann: List[Dict] = []
    seen = set()
    for date_range in segments:
        try:
            first = get_report(1, date_range, column, plate, searchkey)
            first.raise_for_status()
            # A null or missing "totalpages" is coerced to 0 here, so the
            # comparison below cannot raise TypeError.
            total = int(first.json().get("totalpages") or 0)
        except Exception:
            continue
        if total <= 0:
            continue

        for page in range(1, total + 1):
            for _ in range(3):
                try:
                    resp = get_report(page, date_range, column, plate, searchkey)
                    resp.raise_for_status()
                    anns = resp.json().get("announcements", []) or []
                    for x in anns:
                        title = x.get("announcementTitle", "")
                        if not is_target_cb_offering(title):
                            continue
                        key = (str(x.get("secCode", "")), str(x.get("adjunctUrl", "")))
                        if key in seen:
                            continue
                        seen.add(key)
                        all_ann.append(x)
                    break
                except Exception:
                    time.sleep(1.1)
            else:
                # All attempts failed: report the gap instead of hiding it.
                print(f"⚠️ 第 {page} 页获取失败，已跳过：{date_range} {column}/{plate}")
    return all_ann

def collect_all_offering() -> List[Dict]:
    """Download matching announcements for every market and plate.

    The date range is split into one ``"start~end"`` segment per calendar year
    so that no single query covers too wide a range. Only the keyword
    "募集说明书" is searched.
    """
    first_year, last_year = int(DATE_BEGIN[:4]), int(DATE_END[:4])
    segments: List[str] = []
    for year in range(first_year, last_year + 1):
        lo = DATE_BEGIN if year <= first_year else f"{year}-01-01"
        hi = DATE_END if year >= last_year else f"{year}-12-31"
        segments.append(f"{lo}~{hi}")

    collected: List[Dict] = []
    for market in MARKETS:
        for plate in PLATES:
            collected.extend(download_for_segments(segments, market, plate, "募集说明书"))
    return collected

# ========= 去重（公司维度，仅保留最新版） =========
def pick_latest_per_company(anns: List[Dict], valid_secs: set) -> List[Dict]:
    """Keep the highest-weighted announcement for each company in *valid_secs*."""
    winners: Dict[str, Dict] = {}
    for ann in anns:
        code = str(ann.get("secCode", ""))
        if code not in valid_secs:
            continue
        current = winners.get(code)
        if current is None or ann_weight(ann) > ann_weight(current):
            winners[code] = ann
    return list(winners.values())

# ========= 写 Excel（每家公司只输出一行） =========
def write_selected_excel(anns: List[Dict]) -> None:
    """Write one row per mapping-file entry to ``OUT_XLSX``.

    A company with a selected announcement gets its title, year and link.
    Other companies get empty cells in those three columns.
    """
    mapping = pd.read_excel(mapping_file, dtype=str).fillna("")
    mapping["sec_no_suf"] = mapping["正股代码"].str.split(".").str[0]

    # secCode -> announcement; entries with an empty code are ignored.
    by_code: Dict[str, Dict] = {}
    for ann in anns:
        code = str(ann.get("secCode", ""))
        if code:
            by_code[code] = ann

    workbook = openpyxl.Workbook()
    sheet = workbook.active
    sheet.title = SHEET
    sheet.append(["可转债代码","可转债名称","公司代码","公司简称","标题","年份","年报链接"])

    for _, record in mapping.iterrows():
        code = record.get("sec_no_suf", "")
        lead = [record.get("代码", ""), record.get("名称", ""), code, record.get("正股名称", "")]
        ann = by_code.get(code)
        if not ann:
            sheet.append(lead + ["", "", ""])
            continue
        # Strip markup tags and full-width colons from the title.
        clean = re.sub(r"<.*?>","", ann.get("announcementTitle","")).replace("：","")
        year = extract_year_from_title(clean) or year_from_timestamp_ms(ann.get("announcementTime", 0))
        link = f"http://static.cninfo.com.cn/{ann.get('adjunctUrl','')}"
        sheet.append(lead + [f"《{clean}》", year, link])

    workbook.save(OUT_XLSX)
    print(f"✅ 已输出：{OUT_XLSX}（工作表：{SHEET}）")

# ========= 主流程 =========
def main():
    """Run the pipeline: load companies, download, pick one per company, export."""
    # 1) Company set: stock codes with the exchange suffix removed.
    mapping = pd.read_excel(mapping_file, dtype=str).fillna("")
    stock_codes = mapping["正股代码"].str.split(".").str[0]
    valid_secs = set(stock_codes.tolist())

    # 2) Download with the single search key and strict title filtering.
    downloaded = collect_all_offering()

    # 3) Keep one announcement per company.
    chosen = pick_latest_per_company(downloaded, valid_secs)

    # 4) Export.
    write_selected_excel(chosen)

    print("---- 可转债募集说明书检索完成 ----")


if __name__ == "__main__":
    main()


✅ 已输出：/Users/sam/Desktop/cninfo_output/募集说明书链接_选取公司【募集】.xlsx（工作表：募集说明书）
---- 可转债募集说明书检索完成 ----


In [3]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

"""
02_collect_offering_links.py
功能：按《正股映射结果.xlsx》的公司集合，从巨潮接口检索“可转债募集说明书”公告，
     只保留与可转债相关的最新版（修订/更新/更正优先），并导出 Excel（格式与年报版一致）。
"""

import os
import re
import time
import requests
import openpyxl
import pandas as pd
from typing import List, Dict, Tuple
from datetime import date

# ========= Tunable parameters =========
# The only keyword sent to the API (a broader search risks truncation and is slow)
SEARCH_KEY = "募集说明书"

# Date range; the end is the day the script runs
DATE_BEGIN = "2000-01-01"
DATE_END   = date.today().strftime("%Y-%m-%d")

# The output directory is created at import time if it does not exist
output_dir   = os.path.expanduser("~/Desktop/cninfo_output")
os.makedirs(output_dir, exist_ok=True)
mapping_file = os.path.expanduser("~/Desktop/正股映射结果.xlsx")

# Output workbook and sheet name
OUT_XLSX = os.path.join(output_dir, "募集说明书链接_选取公司【募集】.xlsx")
SHEET    = "募集说明书"

# Markets / plates
MARKETS = ["szse", "shse"]               # Shenzhen / Shanghai
PLATES  = ["", "sz", "szmb", "szcy", "sh", "shmb", "shkcp", "bj"]  # plates (boards)

# Hard exclusions (summaries, notices, confirmation opinions, supervisors, GDR, follow-on offerings, ...)
EXCLUDE_HARD = [
    "英文","英文版","annual","summary","摘要","概览","概要","备查","备查文件",
    "取消","已取消","提示性公告","公告","问询回复","反馈意见","修订情况说明",
    "核准情况","承诺的说明","说明的公告","确认意见","监事","全体监事",
    "控股股东","实际控制人","一致行动人",
    "GDR","瑞士","价格区间","批准",
    "公开增发","增发"
]

# Synonyms for "convertible bond", used to recognize convertible-bond prospectuses
CONV_ALIASES = [
    "可转债", "可转换公司债券", "可转换债券", "可转换公司债", "可转公司债", "可转换债"
]

# ========= 工具函数 =========
_SPACE_RE = re.compile(r"[\s\u00A0\u3000]+")


def norm(s: str) -> str:
    """Return *s* as a string with all whitespace removed.

    Falsy input yields ``""``. Non-breaking and full-width spaces are removed
    as well.
    """
    text = str(s) if s else ""
    return _SPACE_RE.sub("", text.strip())

def any_kw_in(text: str, kws: List[str]) -> bool:
    """Return True if any non-empty keyword occurs in *text*, ignoring whitespace."""
    haystack = norm(text)
    for kw in kws:
        if kw and norm(kw) in haystack:
            return True
    return False

def year_from_timestamp_ms(ts: int) -> str:
    """Return the local-time year of a millisecond epoch timestamp, or ``""``."""
    try:
        local_tm = time.localtime(int(ts) / 1000)
        year = time.strftime("%Y", local_tm)
    except Exception:
        return ""
    return year

def extract_year_for_display(title: str, ts: int) -> str:
    """Return the year shown in the Excel sheet.

    The first 19xx / 20xx run in the title wins. Without one, the year of the
    announcement timestamp is used.
    """
    found = re.search(r"(19|20)\d{2}", norm(title))
    if found is None:
        return year_from_timestamp_ms(ts)
    return found.group(0)

def is_target_offering(title: str) -> bool:
    """Return True only for convertible-bond prospectus titles.

    A title is accepted when it contains "募集说明书", contains no
    ``EXCLUDE_HARD`` word, and contains at least one ``CONV_ALIASES`` entry.
    Prospectuses for plain corporate bonds, preferred shares and share
    issuance carry no alias, so the alias requirement rejects them.
    """
    t = norm(title)

    # Must be a prospectus.
    if "募集说明书" not in t:
        return False

    # Reject noise such as summaries, notices and opinions.
    if any_kw_in(t, EXCLUDE_HARD):
        return False

    # CONV_ALIASES is the only definition of "convertible" used here.
    # The earlier extra check tested just "可转换" / "可转债", so a title
    # matched through the alias "可转公司债" followed by "券" was rejected.
    return any_kw_in(t, CONV_ALIASES)

def ann_weight(it: Dict) -> Tuple[int, int]:
    """Return a ranking key ``(score, announcement_time)``.

    The score is additive: +10 for 更新/修订/更正, +8 for 发行稿, +6 for 注册稿,
    +4 for 上会稿 and +2 for 申报稿. Ties go to the larger announcement time.
    A missing or malformed ``announcementTime`` counts as 0.
    """
    title = norm(it.get("announcementTitle", ""))
    try:
        ts = int(it.get("announcementTime", 0) or 0)
    except (TypeError, ValueError):
        # One bad timestamp must not abort the ranking.
        ts = 0

    w = 0
    if any_kw_in(title, ["更新", "修订", "更正"]):
        w += 10
    if "发行稿" in title:
        w += 8
    if "注册稿" in title:
        w += 6
    if "上会稿" in title:
        w += 4
    if "申报稿" in title:
        w += 2

    return (w, ts)

# ========= 调接口 =========
def get_report(page_num: int, date_range: str, column: str, plate: str) -> requests.Response:
    """POST one page of a cninfo historical-announcement query for ``SEARCH_KEY``.

    Args:
        page_num: Page number, sent as ``pageNum``.
        date_range: Date range string, sent as ``seDate``.
        column: Market identifier (szse / shse), sent as ``column``.
        plate: Plate identifier, sent as ``plate``.

    Returns:
        The raw ``requests.Response``. The status is not checked here, and the
        request times out after 15 seconds.
    """
    endpoint = "http://www.cninfo.com.cn/new/hisAnnouncement/query"
    # Form fields: 30 results per page, sort fields "code" / "asc".
    form = {
        "pageNum":   page_num,
        "pageSize":  30,
        "column":    column,
        "tabName":   "fulltext",
        "category":  "",
        "plate":     plate,
        "searchkey": SEARCH_KEY,
        "secid":     "",
        "trade":     "",
        "seDate":    date_range,
        "sortName":  "code",
        "sortType":  "asc",
        "isHLtitle": "false"
    }
    # Browser-like AJAX headers for the site's search page.
    ajax_headers = {
        "Accept":           "*/*",
        "Accept-Language":  "zh-CN,zh;q=0.9",
        "Content-Type":     "application/x-www-form-urlencoded; charset=UTF-8",
        "Origin":           "http://www.cninfo.com.cn",
        "Referer":          "http://www.cninfo.com.cn/new/commonUrl/pageOfSearch?url=disclosure/list/search",
        "User-Agent":       "Mozilla/5.0",
        "X-Requested-With": "XMLHttpRequest"
    }
    response = requests.post(endpoint, data=form, headers=ajax_headers, timeout=15)
    return response

def download_for_segments(segments: List[str], column: str, plate: str) -> List[Dict]:
    """Fetch every result page for each date segment, filtering and de-duplicating as it goes.

    For each segment, page 1 is requested to learn the page count, then pages
    1..N are fetched with up to three attempts each. Announcements are filtered
    with ``is_target_offering`` and de-duplicated on ``(secCode, adjunctUrl)``
    across all segments of this call.

    Args:
        segments: Date-range strings passed through as the query's ``seDate``.
        column: Market identifier forwarded to ``get_report``.
        plate: Plate identifier forwarded to ``get_report``.

    Returns:
        Accepted announcement dicts in first-seen order. A segment whose page
        count cannot be read is skipped. A page that fails three times is
        skipped with a printed warning.
    """
    all_ann: List[Dict] = []
    seen = set()
    for date_range in segments:
        try:
            first = get_report(1, date_range, column, plate)
            first.raise_for_status()
            # A null or missing "totalpages" is coerced to 0 here, so the
            # comparison below cannot raise TypeError.
            total = int(first.json().get("totalpages") or 0)
        except Exception:
            continue
        if total <= 0:
            continue

        for page in range(1, total + 1):
            for _ in range(3):
                try:
                    resp = get_report(page, date_range, column, plate)
                    resp.raise_for_status()
                    anns = resp.json().get("announcements", []) or []
                    for x in anns:
                        title = x.get("announcementTitle", "") or ""
                        if not is_target_offering(title):
                            continue
                        key = (str(x.get("secCode", "")), str(x.get("adjunctUrl", "")))
                        if key in seen:
                            continue
                        seen.add(key)
                        all_ann.append(x)
                    break
                except Exception:
                    time.sleep(1.0)
            else:
                # All attempts failed: report the gap instead of hiding it.
                print(f"⚠️ 第 {page} 页获取失败，已跳过：{date_range} {column}/{plate}")
    return all_ann

def collect_all_offering() -> List[Dict]:
    """Download matching announcements for every market and plate.

    The date range is split into one ``"start~end"`` segment per calendar year
    to avoid truncated results.
    """
    first_year, last_year = int(DATE_BEGIN[:4]), int(DATE_END[:4])
    segments: List[str] = []
    for year in range(first_year, last_year + 1):
        lo = DATE_BEGIN if year <= first_year else f"{year}-01-01"
        hi = DATE_END if year >= last_year else f"{year}-12-31"
        segments.append(f"{lo}~{hi}")

    collected: List[Dict] = []
    for market in MARKETS:
        for plate in PLATES:
            collected.extend(download_for_segments(segments, market, plate))
    return collected

# ========= 公司维度去重（仅保留最新版） =========
def filter_latest_per_company(anns: List[Dict], valid_secs: set) -> List[Dict]:
    """Keep the highest-weighted announcement for each company in *valid_secs*.

    Announcements with an empty or unknown ``secCode`` are ignored.
    """
    winners: Dict[str, Dict] = {}
    for ann in anns:
        code = str(ann.get("secCode", "") or "")
        if not code:
            continue
        if code not in valid_secs:
            continue
        current = winners.get(code)
        if current is None or ann_weight(ann) > ann_weight(current):
            winners[code] = ann
    return list(winners.values())

# ========= 写 Excel =========
def write_selected_excel(anns: List[Dict]) -> None:
    """Write one row per mapping-file entry to ``OUT_XLSX``.

    A company with a selected announcement gets its title, year and link.
    Other companies get empty cells in those three columns.

    Args:
        anns: Announcement dicts; ``secCode``, ``announcementTitle``,
            ``announcementTime`` and ``adjunctUrl`` are read.
    """
    df_map = pd.read_excel(mapping_file, dtype=str).fillna("")
    df_map["sec_no_suf"] = df_map["正股代码"].str.split(".").str[0]

    # secCode -> announcement; entries with an empty code are ignored.
    pick: Dict[str, Dict] = {}
    for it in anns:
        sec = str(it.get("secCode", "") or "")
        if sec:
            pick[sec] = it

    wb = openpyxl.Workbook()
    ws = wb.active
    ws.title = SHEET
    ws.append(["可转债代码","可转债名称","公司代码","公司简称","标题","年份","年报链接"])

    for _, row in df_map.iterrows():
        bond_code = row.get("代码", "")
        bond_name = row.get("名称", "")
        sec       = row.get("sec_no_suf", "")
        comp_name = row.get("正股名称", "")

        it = pick.get(sec)
        if it:
            # Strip markup tags from the title.
            raw_title = re.sub(r"<.*?>","", it.get("announcementTitle",""))
            title = f"《{raw_title}》"
            # The raw time is passed through: extract_year_for_display falls
            # back to year_from_timestamp_ms, which converts it safely. A
            # malformed value cannot abort the export, and a missing time
            # gives a blank year instead of 1970.
            yr = extract_year_for_display(raw_title, it.get("announcementTime"))
            url = f"http://static.cninfo.com.cn/{it.get('adjunctUrl','')}"
            ws.append([bond_code, bond_name, sec, comp_name, title, yr, url])
        else:
            ws.append([bond_code, bond_name, sec, comp_name, "", "", ""])

    wb.save(OUT_XLSX)
    print(f"✅ 已输出：{OUT_XLSX}（工作表：{SHEET}）")

# ========= 主流程 =========
def main():
    """Read the mapping file, collect and filter announcements, write the workbook."""
    mapping = pd.read_excel(mapping_file, dtype=str).fillna("")
    # Stock codes with the exchange suffix (the part after ".") removed.
    stock_codes = mapping["正股代码"].str.split(".").str[0]
    valid_secs = set(stock_codes.tolist())

    downloaded = collect_all_offering()
    chosen = filter_latest_per_company(downloaded, valid_secs)
    write_selected_excel(chosen)
    print("---- 可转债募集说明书检索完成 ----")


if __name__ == "__main__":
    main()


✅ 已输出：/Users/sam/Desktop/cninfo_output/募集说明书链接_选取公司【募集】.xlsx（工作表：募集说明书）
---- 可转债募集说明书检索完成 ----


In [None]:
#5

In [8]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

"""
02_collect_offering_links.py
功能：按《正股映射结果.xlsx》的公司集合，从巨潮接口检索“募集说明书”，
     只保留与“可转债/可转换公司债券”相关的最新版；若标题不写“可转债”，
     也保留该公司“通用募集说明书”的最新版（排除优先股/非可转债公司债/股票增发等）。
输出：与年报版相同列顺序；每家公司仅一行（最新版）。
"""

import os
import re
import time
import requests
import openpyxl
import pandas as pd
from typing import List, Dict, Tuple
from datetime import date

# ========= Tunable parameters =========
# Date range; the end is the day the script runs
DATE_BEGIN = "2000-01-01"
DATE_END   = date.today().strftime("%Y-%m-%d")

# The output directory is created at import time if it does not exist
output_dir   = os.path.expanduser("~/Desktop/cninfo_output")
os.makedirs(output_dir, exist_ok=True)
mapping_file = os.path.expanduser("~/Desktop/正股映射结果.xlsx")

OUT_XLSX = os.path.join(output_dir, "募集说明书链接_选取公司【募集】.xlsx")
SHEET    = "募集说明书"

# Markets / plates
MARKETS = ["szse", "shse"]
PLATES  = ["", "sz", "szmb", "szcy", "sh", "shmb", "shkcp", "bj"]

# Positive keywords that identify a convertible bond (classification only, never sent in requests)
CONV_POS_KWS = ["可转换公司债券", "可转债", "可转换债券", "可转债券"]

# Veto list: a title containing any of these is always rejected
HARD_EXCLUDE = [
    "英文", "英文版", "Annual", "annual", "Summary", "summary",
    "摘要", "摘要版", "附录", "备查文件", "取消", "已取消",
    "提示性公告", "公告", "更名提示",
    "确认意见", "审核意见", "核准情况", "监事", "全体监事",
    "控股股东", "实际控制人", "一致行动人",
    "GDR", "全球存托凭证", "瑞士", "价格区间", "批准的公告",
    "公开增发", "增发"
]

# Instruments that are explicitly not convertible bonds
EQUITY_EXCLUDE = [  # pure equity offerings
    "向特定对象发行股票", "以简易程序向特定对象发行股票", "发行A股股票", "非公开发行股票",
]
NON_CONV_BOND_EXCLUDE = [  # non-convertible corporate bonds / preferred shares
    "优先股", "公司债券", "绿色公司债", "科技创新公司债", "扶贫专项", "乡村振兴", "可续期公司债",
]

# ========= 工具函数 =========
_SPACE_RE = re.compile(r"[\s\u00A0\u3000]+")


def norm(s: str) -> str:
    """Return *s* as a string with all whitespace removed.

    Falsy input yields ``""``. Non-breaking and full-width spaces are removed
    as well.
    """
    text = str(s) if s else ""
    return _SPACE_RE.sub("", text.strip())

def any_kw_in(text: str, kws: List[str]) -> bool:
    """Return True if any non-empty keyword occurs in *text*, ignoring whitespace."""
    haystack = norm(text)
    for kw in kws:
        if kw and norm(kw) in haystack:
            return True
    return False

def contains_any(text: str, kws: List[str]) -> bool:
    """Alias of ``any_kw_in``: True if any keyword occurs in *text*."""
    matched = any_kw_in(text, kws)
    return matched

def extract_year_from_title(title: str) -> str:
    """Return the first 19xx / 20xx four-digit run in *title*, or ``""``."""
    found = re.search(r"(19\d{2}|20\d{2})", norm(title))
    if found is None:
        return ""
    return found.group(1)

def year_from_timestamp_ms(ts: int) -> str:
    """Return the local-time year of a millisecond epoch timestamp, or ``""``."""
    try:
        local_tm = time.localtime(int(ts) / 1000)
        year = time.strftime("%Y", local_tm)
    except Exception:
        return ""
    return year

def is_summary_or_bad(title: str) -> bool:
    """Return True if *title* contains any ``HARD_EXCLUDE`` veto word."""
    return contains_any(norm(title), HARD_EXCLUDE)

def is_pure_equity(title: str) -> bool:
    """Return True if *title* contains any ``EQUITY_EXCLUDE`` phrase."""
    compact = norm(title)
    hit = contains_any(compact, EQUITY_EXCLUDE)
    return hit

def is_non_convertible_bond(title: str) -> bool:
    """Return True for non-convertible corporate bond or preferred-share titles.

    A title that carries a convertible-bond marker is never flagged, even if
    it also contains a ``NON_CONV_BOND_EXCLUDE`` phrase.
    """
    compact = norm(title)
    convertible_markers = ("可转换公司债券", "可转债", "可转换债券")
    if any(marker in compact for marker in convertible_markers):
        return False
    return contains_any(compact, NON_CONV_BOND_EXCLUDE)

def is_convertible(title: str) -> bool:
    """Return True if *title* contains any ``CONV_POS_KWS`` keyword."""
    compact = norm(title)
    hit = contains_any(compact, CONV_POS_KWS)
    return hit

def ann_weight(it: Dict) -> Tuple[int, int, int]:
    """Return a ranking key ``(score, announcement_time, title_length)``.

    The score subtracts 5 for a title containing 摘要 and adds 10 for
    更新/修订/更正. Ties go to the newer announcement, then to the longer
    normalized title. An unparsable time counts as 0.
    """
    compact = norm(it.get("announcementTitle", ""))
    summary_penalty = -5 if "摘要" in compact else 0
    revision_bonus = 10 if contains_any(compact, ["更新", "修订", "更正"]) else 0
    try:
        published = int(it.get("announcementTime", 0))
    except Exception:
        published = 0
    return (summary_penalty + revision_bonus, published, len(compact))

# ========= 调接口 =========
def get_report(page_num: int, date_range: str, column: str, plate: str) -> requests.Response:
    """POST one page of a cninfo historical-announcement query for "募集说明书".

    Args:
        page_num: Page number, sent as ``pageNum``.
        date_range: Date range string, sent as ``seDate``.
        column: Market identifier, sent as ``column``.
        plate: Plate identifier, sent as ``plate``.

    Returns:
        The raw ``requests.Response``. The status is not checked here, and the
        request times out after 15 seconds.
    """
    endpoint = "http://www.cninfo.com.cn/new/hisAnnouncement/query"
    # Form fields: 30 results per page, sort fields "code" / "asc".
    form = {
        "pageNum":   page_num,
        "pageSize":  30,
        "column":    column,
        "tabName":   "fulltext",
        "category":  "",
        "plate":     plate,
        "searchkey": "募集说明书",
        "secid":     "",
        "trade":     "",
        "seDate":    date_range,
        "sortName":  "code",
        "sortType":  "asc",
        "isHLtitle": "false"
    }
    # Browser-like AJAX headers for the site's search page.
    ajax_headers = {
        "Accept":           "*/*",
        "Accept-Language":  "zh-CN,zh;q=0.9",
        "Content-Type":     "application/x-www-form-urlencoded; charset=UTF-8",
        "Origin":           "http://www.cninfo.com.cn",
        "Referer":          "http://www.cninfo.com.cn/new/commonUrl/pageOfSearch?url=disclosure/list/search",
        "User-Agent":       "Mozilla/5.0",
        "X-Requested-With": "XMLHttpRequest"
    }
    response = requests.post(endpoint, data=form, headers=ajax_headers, timeout=15)
    return response

def download_for_segments(segments: List[str], column: str, plate: str) -> List[Dict]:
    """Fetch every result page for each date segment and drop clearly unwanted titles.

    For each segment, page 1 is requested to learn the page count, then pages
    1..N are fetched with up to three attempts each. A title is dropped when it
    hits the veto list, is a pure equity offering, or is a non-convertible
    bond. The rest are de-duplicated on ``(secCode, adjunctUrl)`` across all
    segments of this call.

    Args:
        segments: Date-range strings passed through as the query's ``seDate``.
        column: Market identifier forwarded to ``get_report``.
        plate: Plate identifier forwarded to ``get_report``.

    Returns:
        Accepted announcement dicts in first-seen order. A segment whose page
        count cannot be read is skipped. A page that fails three times is
        skipped with a printed warning.
    """
    all_ann: List[Dict] = []
    seen = set()
    for date_range in segments:
        try:
            first = get_report(1, date_range, column, plate)
            first.raise_for_status()
            # A null or missing "totalpages" is coerced to 0 here, so the
            # comparison below cannot raise TypeError.
            total = int(first.json().get("totalpages") or 0)
        except Exception:
            continue
        if total <= 0:
            continue

        for page in range(1, total + 1):
            for _ in range(3):  # retry
                try:
                    resp = get_report(page, date_range, column, plate)
                    resp.raise_for_status()
                    anns = resp.json().get("announcements", []) or []
                    for x in anns:
                        title = x.get("announcementTitle", "") or ""
                        t = norm(title)

                        # Veto list.
                        if is_summary_or_bad(t):
                            continue
                        # Clearly not a convertible-bond document.
                        if is_pure_equity(t):
                            continue
                        if is_non_convertible_bond(t):
                            continue

                        key = (str(x.get("secCode", "")), str(x.get("adjunctUrl", "")))
                        if key in seen:
                            continue
                        seen.add(key)
                        all_ann.append(x)
                    break
                except Exception:
                    time.sleep(1.2)
            else:
                # All attempts failed: report the gap instead of hiding it.
                print(f"⚠️ 第 {page} 页获取失败，已跳过：{date_range} {column}/{plate}")
    return all_ann

def collect_all_offering() -> List[Dict]:
    """Download candidate announcements for every market and plate.

    The date range is split into one ``"start~end"`` segment per calendar year
    to avoid truncated results.
    """
    first_year, last_year = int(DATE_BEGIN[:4]), int(DATE_END[:4])
    segments: List[str] = []
    for year in range(first_year, last_year + 1):
        lo = DATE_BEGIN if year <= first_year else f"{year}-01-01"
        hi = DATE_END if year >= last_year else f"{year}-12-31"
        segments.append(f"{lo}~{hi}")

    collected: List[Dict] = []
    for market in MARKETS:
        for plate in PLATES:
            collected.extend(download_for_segments(segments, market, plate))
    return collected

# ========= 去重（公司层面只保留一条）=========
def pick_latest_per_company(anns: List[Dict], valid_secs: set) -> List[Dict]:
    """Pick one announcement per company, preferring convertible-bond titles.

    For each company in *valid_secs*, the highest-weighted title that
    ``is_convertible`` accepts is chosen. If no title qualifies, the
    highest-weighted of all that company's announcements is used.
    """
    grouped: Dict[str, List[Dict]] = {}
    for ann in anns:
        code = str(ann.get("secCode", ""))
        if not code or code not in valid_secs:
            continue
        grouped.setdefault(code, []).append(ann)

    chosen: List[Dict] = []
    for candidates in grouped.values():
        convertible = [
            c for c in candidates
            if is_convertible(c.get("announcementTitle", "") or "")
        ]
        # Fall back to the generic prospectuses when no title names a convertible bond.
        chosen.append(max(convertible or candidates, key=ann_weight))
    return chosen

# ========= 写 Excel =========
def write_selected_excel(anns: List[Dict]) -> None:
    """
    Write one row per mapping-file entry to OUT_XLSX.

    Args:
        anns: the chosen announcements; they are indexed by their ``secCode``
            (if several share a code, the last one wins).

    Rows whose company has no announcement keep the bond/company columns and
    leave title, year and link blank so they can be checked by hand.
    """
    df_map = pd.read_excel(mapping_file, dtype=str).fillna("")
    df_map["sec_no_suf"] = df_map["正股代码"].str.split(".").str[0]

    ann_dict: Dict[str, Dict] = {str(it.get("secCode", "")): it for it in anns}

    wb = openpyxl.Workbook()
    ws = wb.active
    ws.title = SHEET
    ws.append(["可转债代码","可转债名称","公司代码","公司简称","标题","年份","年报链接"])

    for _, row in df_map.iterrows():
        bond_code = row.get("代码", "")
        bond_name = row.get("名称", "")
        sec       = row.get("sec_no_suf", "")
        comp_name = row.get("正股名称", "")

        it = ann_dict.get(sec)
        if not it:
            ws.append([bond_code, bond_name, sec, comp_name, "", "", ""])
            continue

        # The key may be present with a null value; coerce so re.sub never sees None.
        raw_title = re.sub(r"<.*?>", "", str(it.get("announcementTitle") or "")).replace("：", "")
        title = f"《{raw_title}》"
        # Prefer a year printed in the title, otherwise use the publication timestamp.
        yr = extract_year_from_title(raw_title) or year_from_timestamp_ms(it.get("announcementTime", 0))
        # Without an attachment path there is nothing to link to; leave the cell blank
        # instead of emitting the bare host (or ".../None").
        adjunct = str(it.get("adjunctUrl") or "")
        url = f"http://static.cninfo.com.cn/{adjunct}" if adjunct else ""
        ws.append([bond_code, bond_name, sec, comp_name, title, yr, url])

    wb.save(OUT_XLSX)
    print(f"✅ 已输出：{OUT_XLSX}（工作表：{SHEET}）")

# ========= 主流程 =========
def main():
    """Collect announcements, keep one per mapped company, and export the workbook."""
    mapping = pd.read_excel(mapping_file, dtype=str).fillna("")
    # Stock codes in the mapping carry an exchange suffix; keep the part before the dot.
    stock_codes = mapping["正股代码"].str.split(".").str[0]
    valid_secs = set(stock_codes.tolist())

    announcements = collect_all_offering()
    write_selected_excel(pick_latest_per_company(announcements, valid_secs))
    print("---- 可转债募集说明书检索完成 ----")

if __name__ == "__main__":
    main()


✅ 已输出：/Users/sam/Desktop/cninfo_output/募集说明书链接_选取公司【募集】.xlsx（工作表：募集说明书）
---- 可转债募集说明书检索完成 ----


In [7]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

"""
02_collect_offering_links.py
按《正股映射结果.xlsx》中的公司集合逐一在巨潮检索“募集说明书”。
规则：
- 仅用 searchkey="募集说明书"
- 每家公司优先用 stock=代码,简称 精确检索；无结果回退为 stock=代码；仍无结果再跨交易所兜底
- 标题含“可转债/可转换公司债券/向不特定对象发行可转换公司债券”等则“可转债优先”
- 摘要不剔除但扣分；“修订/修正/更新/更正”加分；再按公告时间新
- 若没有任何“可转债”标题，也保留该公司“通用募集说明书”中的最新一条（你截图里这种情况）
- 每家公司仅输出一行
"""

import os
import re
import time
import requests
import openpyxl
import pandas as pd
from typing import List, Dict, Tuple
from datetime import date

# ========= Basic parameters =========
# Announcement date window: fixed start date up to the day the script runs.
DATE_BEGIN = "2000-01-01"
DATE_END   = date.today().strftime("%Y-%m-%d")

# Output folder (created here if missing) and the input mapping workbook.
output_dir   = os.path.expanduser("~/Desktop/cninfo_output")
os.makedirs(output_dir, exist_ok=True)
mapping_file = os.path.expanduser("~/Desktop/正股映射结果.xlsx")

# Result workbook path and the title given to its worksheet.
OUT_XLSX = os.path.join(output_dir, "募集说明书链接_选取公司【募集】.xlsx")
SHEET    = "募集说明书"

# ========= Filtering / classification =========
# Title fragments that always disqualify an announcement (per the author's note these
# are not expected to hit convertible-bond titles). is_hard_excluded() compares them
# against the lower-cased title, so Latin entries must stay lower-case.
HARD_EXCLUDE = [
    "英文", "英文版", "annual", "summary",
    "附录", "备查", "备查文件", "封卷", "封卷稿",
    "取消", "已取消", "提示性公告", "公告",
    "确认意见", "审核意见", "问询", "回复", "核准情况",
    "监事", "全体监事", "控股股东", "实际控制人", "一致行动人",
    "gdr", "全球存托凭证", "瑞士", "价格区间", "批准的公告",
]
# Stock offerings and non-convertible bonds to drop. Stated intent: a title that itself
# mentions a convertible bond should not be dropped.
PURE_EQUITY = ["向特定对象发行股票", "以简易程序向特定对象发行股票", "发行a股股票", "非公开发行股票", "公开增发", "增发"]
NON_CONV_BOND = ["优先股", "公司债券", "绿色公司债", "科技创新公司债", "扶贫专项", "乡村振兴", "可续期公司债"]

# Fragments that mark a title as a convertible-bond prospectus (used by is_cb_title()).
CB_POSITIVE = [
    "可转债", "可转换公司债", "可转换公司债券",
    "向不特定对象发行可转换公司债券",
    "并在主板上市募集说明书", "并在创业板上市募集说明书", "并在科创板上市募集说明书"
]

# ========= 工具 =========
# Runs of ASCII whitespace, no-break spaces and ideographic spaces.
_SPACE_RE = re.compile(r"[\s\u00A0\u3000]+")


def norm(s: str) -> str:
    """Return ``s`` as a string with every whitespace run removed (falsy input -> "")."""
    text = str(s or "")
    return _SPACE_RE.sub("", text.strip())

def strip_html(s: str) -> str:
    """Remove every ``<...>`` tag-like span from ``s`` (falsy input -> "")."""
    text = str(s) if s else ""
    return re.sub(r"<.*?>", "", text)

def any_kw_in(text: str, kws: List[str]) -> bool:
    """Case-insensitive, whitespace-insensitive test: does ``text`` contain any keyword?

    Falsy keywords are skipped.
    """
    haystack = norm(text).lower()
    for kw in kws:
        if kw and norm(kw).lower() in haystack:
            return True
    return False

def is_hard_excluded(title: str) -> bool:
    """Return True when the title contains any HARD_EXCLUDE fragment.

    The title is whitespace-normalised and lower-cased first; the fragments are
    compared as stored, in list order.
    """
    lowered = norm(title).lower()
    return any(fragment in lowered for fragment in HARD_EXCLUDE)

def is_pure_equity(title: str) -> bool:
    """Return True when the title matches a PURE_EQUITY (stock offering) fragment.

    The comparison is case-insensitive on purpose: PURE_EQUITY stores
    "发行a股股票" in lower case while announcement titles spell it "A股", so a
    case-sensitive substring test would never match that entry.
    """
    # any_kw_in() normalises whitespace and lower-cases both the title and the keywords.
    return any_kw_in(title, PURE_EQUITY)

def is_non_conv_bond(title: str) -> bool:
    """Return True for non-convertible bond / preferred-share titles.

    A title that mentions a convertible bond is never flagged, even if it also
    contains one of the NON_CONV_BOND fragments.
    """
    t = norm(title)
    # "可转换公司债" is a prefix of "可转换公司债券", so one test covers both spellings.
    mentions_convertible = ("可转换公司债" in t) or ("可转债" in t)
    return (not mentions_convertible) and any(kw in t for kw in NON_CONV_BOND)

def is_cb_title(title: str) -> bool:
    """Return True when the title contains any CB_POSITIVE (convertible-bond) fragment."""
    matched = any_kw_in(text=title, kws=CB_POSITIVE)
    return matched

def extract_year_from_title(title: str) -> str:
    """Return the first 19xx/20xx four-digit run in the tag-stripped title, or ""."""
    hit = re.compile(r"(19\d{2}|20\d{2})").search(strip_html(title))
    if hit is None:
        return ""
    return hit.group(1)

def year_from_timestamp_ms(ts: int) -> str:
    """Convert a millisecond epoch timestamp to its local-time year ("" on any failure)."""
    try:
        seconds = int(ts) / 1000
        return time.strftime("%Y", time.localtime(seconds))
    except Exception:
        # Unparseable or out-of-range input: report "no year".
        return ""

def ann_weight(it: Dict) -> Tuple[int, int, int]:
    """
    Sort key for choosing among a company's announcements (larger is better).

    Returns (score, timestamp, title_length) where score is -8 for a summary
    ("摘要") title plus +10 for a revised/updated/corrected one; the timestamp
    is ``announcementTime`` as an int (0 if unusable); the title length breaks
    remaining ties.
    """
    flat = norm(strip_html(it.get("announcementTitle", "") or ""))
    score = -8 if "摘要" in flat else 0
    if any_kw_in(flat, ["修订", "修正", "更新", "更正"]):
        score += 10
    try:
        stamp = int(it.get("announcementTime", 0))
    except Exception:
        stamp = 0
    return (score, stamp, len(flat))

# ========= Requests =========
# Announcement query endpoint that fetch_one() POSTs to.
CNINFO_URL = "http://www.cninfo.com.cn/new/hisAnnouncement/query"
# Headers sent with every query (form-encoded body, XMLHttpRequest marker,
# Origin/Referer pointing at the site's own search page).
HEADERS = {
    "Accept": "*/*",
    "Accept-Language": "zh-CN,zh;q=0.9",
    "Content-Type": "application/x-www-form-urlencoded; charset=UTF-8",
    "Origin": "http://www.cninfo.com.cn",
    "Referer": "http://www.cninfo.com.cn/new/commonUrl/pageOfSearch?url=disclosure/list/search",
    "User-Agent": "Mozilla/5.0",
    "X-Requested-With": "XMLHttpRequest"
}

def fetch_one(session: requests.Session, column: str, stock: str) -> List[Dict]:
    """
    Page through every "募集说明书" hit for one exchange column and one stock filter.

    Args:
        session: requests session reused for all pages.
        column: value sent as the ``column`` form field (callers pass "shse" or "szse").
        stock: value sent as the ``stock`` form field ("code,short name" or just the code).

    Returns:
        The raw announcement dicts that survive the local title filters,
        de-duplicated on (secCode, adjunctUrl). If a page still fails after one
        retry, paging stops and whatever was collected so far is returned.
    """
    base = {
        "pageSize":  30,
        "column":    column,
        "tabName":   "fulltext",
        "category":  "",
        "plate":     "",  # no sub-board restriction, to avoid missing results
        "searchkey": "募集说明书",
        "secid":     "",
        "trade":     "",
        "seDate":    f"{DATE_BEGIN}~{DATE_END}",
        "sortName":  "code",
        "sortType":  "asc",
        "isHLtitle": "false",
        "stock":     stock
    }
    out: List[Dict] = []
    seen = set()
    page = 1
    while True:
        payload = dict(base, pageNum=page)

        # One attempt plus one lightweight retry; both attempts check the HTTP status.
        data = None
        for attempt in (1, 2):
            try:
                resp = session.post(CNINFO_URL, data=payload, headers=HEADERS, timeout=12)
                resp.raise_for_status()
                data = resp.json() or {}
                break
            except Exception:
                data = None
                if attempt == 1:
                    time.sleep(0.8)
        if not isinstance(data, dict):
            # Both attempts failed, or the body was not the expected JSON object.
            break

        anns = data.get("announcements", []) or []
        for x in anns:
            title = strip_html(x.get("announcementTitle", "") or "")
            if "募集说明书" not in title:
                continue
            if is_hard_excluded(title):     # summaries are not filtered at this step
                continue
            if is_pure_equity(title):
                continue
            if is_non_conv_bond(title):
                continue
            key = (str(x.get("secCode", "")), str(x.get("adjunctUrl", "")))
            if key in seen:
                continue
            seen.add(key)
            out.append(x)

        tp = int(data.get("totalpages", 0) or 0)
        if tp <= 0 or page >= tp:
            break
        page += 1
    return out

def fetch_company(session: requests.Session, code_no_suf: str, short_name: str, column_guess: str) -> List[Dict]:
    """
    Fetch one company's prospectus announcements with progressively looser filters.

    For the guessed exchange column first, then the other one ("szse" when the
    guess is "shse", otherwise "shse"), it queries with "code,short name" and,
    if that yields nothing, with the code alone (covers short names that differ
    from what the site expects).

    Returns:
        The first non-empty result list from fetch_one(), or [] if every
        combination came back empty.
    """
    fallback_col = "szse" if column_guess == "shse" else "shse"
    for col in (column_guess, fallback_col):
        if short_name:
            anns = fetch_one(session, col, f"{code_no_suf},{short_name}")
            if anns:
                return anns
        anns = fetch_one(session, col, code_no_suf)
        if anns:
            return anns
    return []

def choose_best_for_company(cands: List[Dict]) -> Dict:
    """Return the best candidate ({} when there are none).

    Convertible-bond titles (is_cb_title) take precedence; within the chosen
    pool the highest ann_weight() wins.
    """
    if len(cands) == 0:
        return {}
    convertible = [c for c in cands if is_cb_title(c.get("announcementTitle", "") or "")]
    pool = convertible if convertible else cands
    return max(pool, key=ann_weight)

# ========= 导出 =========
def write_selected_excel(chosen: Dict[str, Dict], df_map: pd.DataFrame) -> None:
    """
    Write one row per ``df_map`` row to OUT_XLSX.

    Args:
        chosen: stock code (without suffix) -> selected announcement dict.
        df_map: mapping table; reads the columns 代码, 名称, sec_no_suf, 正股名称.

    Companies missing from ``chosen`` get a row with blank title/year/link.
    """
    workbook = openpyxl.Workbook()
    sheet = workbook.active
    sheet.title = SHEET
    sheet.append(["可转债代码","可转债名称","公司代码","公司简称","标题","年份","年报链接"])

    for _, r in df_map.iterrows():
        sec = r.get("sec_no_suf", "")
        identity = [r.get("代码", ""), r.get("名称", ""), sec, r.get("正股名称", "")]

        ann = chosen.get(sec)
        if not ann:
            sheet.append(identity + ["", "", ""])
            continue

        plain_title = strip_html(ann.get("announcementTitle", "") or "")
        # Year printed in the title wins; otherwise derive it from the publish time.
        year = extract_year_from_title(plain_title) or year_from_timestamp_ms(ann.get("announcementTime", 0))
        link = f"http://static.cninfo.com.cn/{ann.get('adjunctUrl', '')}"
        sheet.append(identity + [f"《{plain_title}》", year, link])

    workbook.save(OUT_XLSX)
    print(f"✅ 已输出：{OUT_XLSX}（工作表：{SHEET}）")

# ========= 主流程 =========
def main():
    """
    Look up each mapped company once, choose its best prospectus, and export.

    Rows with a blank stock code are not queried (a blank code would send an
    unfiltered ``stock`` value), and a code that appears on several mapping rows
    is fetched only once; write_selected_excel() still emits every row.
    """
    df = pd.read_excel(mapping_file, dtype=str).fillna("")
    # Strip the exchange suffix and use it to guess the query column.
    df["sec_no_suf"] = df["正股代码"].str.split(".").str[0]
    df["ex_suffix"]  = df["正股代码"].str.split(".").str[-1].str.upper()
    df["column_guess"] = df["ex_suffix"].map({"SH": "shse", "SZ": "szse"}).fillna("shse")

    chosen: Dict[str, Dict] = {}
    processed = set()
    with requests.Session() as s:
        for _, row in df.iterrows():
            sec = row["sec_no_suf"]
            if not sec or sec in processed:
                continue
            processed.add(sec)
            nm  = row.get("正股名称", "") or ""
            col = row["column_guess"]

            cands = fetch_company(s, sec, nm, col)
            pick  = choose_best_for_company(cands)
            if pick:
                chosen[sec] = pick
            # Nothing found: the row stays blank in the output for manual review.

    write_selected_excel(chosen, df)
    print("---- 可转债募集说明书检索完成 ----")

if __name__ == "__main__":
    main()


✅ 已输出：/Users/sam/Desktop/cninfo_output/募集说明书链接_选取公司【募集】.xlsx（工作表：募集说明书）
---- 可转债募集说明书检索完成 ----


In [None]:
#6

In [13]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

"""
02_collect_offering_links.py  (按公司定向检索版)
功能：
  1) 读取《正股映射结果.xlsx》，仅按其中公司集合检索；
  2) 先用 topSearch 拿 orgId，再在 hisAnnouncement 中用 stock="代码,orgId"
     + column=shse/szse/neeq 精确过滤到该公司；
  3) 仅拉“募集说明书”，本地过滤出“可转债/可转换公司债券”方向；
     若标题不显式写“可转债”，也保留该公司“通用募集说明书”的最新版；
  4) 输出与年报版相同列顺序，但把“年报链接”列名改为“链接”。
"""

import os
import re
import time
import requests
import openpyxl
import pandas as pd
from typing import List, Dict, Tuple, Optional
from datetime import date

# ========= Tunable parameters =========
# Announcement date window: fixed start date up to the day the script runs.
DATE_BEGIN = "2000-01-01"
DATE_END   = date.today().strftime("%Y-%m-%d")

# Output folder (created here if missing) and the input mapping workbook.
output_dir   = os.path.expanduser("~/Desktop/cninfo_output")
os.makedirs(output_dir, exist_ok=True)
mapping_file = os.path.expanduser("~/Desktop/正股映射结果.xlsx")

# Result workbook path and the title given to its worksheet.
OUT_XLSX = os.path.join(output_dir, "募集说明书链接_选取公司【募集】.xlsx")
SHEET    = "募集说明书"

# -- Exchange -> query "column" mapping used here: Shanghai=shse, Shenzhen=szse, Beijing=neeq --
def parse_exchange(code_with_suf: str) -> str:
    """Map a stock code (ideally with .SH/.SZ/.BJ suffix) to the query ``column`` value.

    Without a recognised suffix the leading digits decide; anything unknown
    defaults to "shse".
    """
    code = (code_with_suf or "").strip().upper()
    suffix_table = ((".SH", "shse"), (".SZ", "szse"), (".BJ", "neeq"))
    for suffix, column in suffix_table:
        if code.endswith(suffix):
            return column

    # No suffix: rough guess from the numeric prefix (fallback only).
    digits = re.sub(r"\D", "", code)
    prefix_table = (
        (("600","601","603","605","688"), "shse"),
        (("000","001","002","003","300","301"), "szse"),
    )
    for prefixes, column in prefix_table:
        if digits.startswith(prefixes):
            return column
    return "shse"

# Positive keywords identifying a convertible-bond title (classification only;
# they are never sent in a request).
CONV_POS_KWS = ["可转换公司债券", "可转债", "可转换债券", "可转债券"]

# Hard veto: a title containing any of these is always discarded.
HARD_EXCLUDE = [
    "英文", "英文版", "Annual", "annual", "Summary", "summary",
    "摘要", "摘要版", "附录", "备查文件", "取消", "已取消",
    "提示性公告", "更名提示",
    "确认意见", "审核意见", "核准情况", "监事", "全体监事",
    "控股股东", "实际控制人", "一致行动人",
    "GDR", "全球存托凭证", "瑞士", "价格区间", "批准的公告",
    "公开增发", "增发"
]

# Instrument types that are explicitly not convertible bonds.
EQUITY_EXCLUDE = [  # pure stock offerings
    "向特定对象发行股票", "以简易程序向特定对象发行股票", "发行A股股票", "非公开发行股票",
]
NON_CONV_BOND_EXCLUDE = [  # non-convertible corporate bonds / preferred shares
    "优先股", "公司债券", "绿色公司债", "科技创新公司债", "扶贫专项", "乡村振兴", "可续期公司债",
]

# ========= 工具函数 =========
# Runs of ASCII whitespace, no-break spaces and ideographic spaces.
_SPACE_RE = re.compile(r"[\s\u00A0\u3000]+")


def norm(s: str) -> str:
    """Stringify ``s`` (falsy -> "") and delete all whitespace runs."""
    trimmed = str(s or "").strip()
    return _SPACE_RE.sub("", trimmed)

def any_kw_in(text: str, kws: List[str]) -> bool:
    """Return True if the whitespace-normalised text contains any non-empty keyword.

    Matching is case-sensitive.
    """
    haystack = norm(text)
    for kw in kws:
        if not kw:
            continue
        if norm(kw) in haystack:
            return True
    return False

def contains_any(text: str, kws: List[str]) -> bool:
    """Alias of any_kw_in(): True if ``text`` contains any of ``kws``."""
    found = any_kw_in(text, kws)
    return found

def extract_year_from_title(title: str) -> str:
    """Return the first 19xx/20xx four-digit run in the normalised title, or ""."""
    hit = re.compile(r"(19\d{2}|20\d{2})").search(norm(title))
    if hit is None:
        return ""
    return hit.group(1)

def year_from_timestamp_ms(ts: int) -> str:
    """Return the local-time year of a millisecond epoch timestamp, "" if it cannot be read."""
    try:
        local = time.localtime(int(ts) / 1000)
        year = time.strftime("%Y", local)
    except Exception:
        year = ""
    return year

def is_summary_or_bad(title: str) -> bool:
    """Return True when the title contains any HARD_EXCLUDE fragment.

    A bare "公告" is not in that list, so it is not filtered on its own; only
    the longer entries such as "提示性公告" are.
    """
    return contains_any(norm(title), HARD_EXCLUDE)

def is_pure_equity(title: str) -> bool:
    """Return True when the title contains an EQUITY_EXCLUDE (stock offering) fragment."""
    return contains_any(norm(title), EQUITY_EXCLUDE)

def is_non_convertible_bond(title: str) -> bool:
    """Flag non-convertible corporate bonds / preferred shares.

    Titles that mention a convertible bond are never flagged.
    """
    t = norm(title)
    convertible_markers = ("可转换公司债券", "可转债", "可转换债券")
    if any(marker in t for marker in convertible_markers):
        return False
    return contains_any(t, NON_CONV_BOND_EXCLUDE)

def is_convertible(title: str) -> bool:
    """Return True when the title contains any CONV_POS_KWS fragment."""
    return contains_any(norm(title), CONV_POS_KWS)

def ann_weight(it: Dict) -> Tuple[int, int, int]:
    """
    Sort key for choosing among announcements (larger is better).

    Returns (score, timestamp, title_length):
      - score: -5 for a summary ("摘要") title, +10 for updated/revised/corrected;
      - timestamp: ``announcementTime`` as an int, 0 when unusable;
      - title_length: tie-breaker on the normalised title.
    """
    flat = norm(it.get("announcementTitle", "") or "")
    score = -5 if "摘要" in flat else 0
    if contains_any(flat, ["更新", "修订", "更正", "修正"]):
        score += 10
    try:
        stamp = int(it.get("announcementTime", 0))
    except Exception:
        stamp = 0
    return (score, stamp, len(flat))

# ========= 接口封装 =========

# Headers sent with every request in this cell (form-encoded body,
# XMLHttpRequest marker, Origin/Referer pointing at the site's search page).
HEADERS_JSON = {
    "Accept":           "*/*",
    "Accept-Language":  "zh-CN,zh;q=0.9",
    "Content-Type":     "application/x-www-form-urlencoded; charset=UTF-8",
    "Origin":           "http://www.cninfo.com.cn",
    "Referer":          "http://www.cninfo.com.cn/new/commonUrl/pageOfSearch?url=disclosure/list/search",
    "User-Agent":       "Mozilla/5.0",
    "X-Requested-With": "XMLHttpRequest",
}

def cninfo_top_search_orgid(keyword: str, delay: float = 0.15) -> Optional[str]:
    """
    Look up an orgId through the topSearch endpoint.

    Args:
        keyword: search text; callers pass the stock code.
        delay: seconds to sleep after the call, success or failure (skipped if <= 0).

    Returns:
        The orgId of the first entry whose ``code`` equals the keyword; failing
        that, the orgId of the first entry returned. None when nothing usable
        came back or any error occurred.
    """
    try:
        resp = requests.post(
            "http://www.cninfo.com.cn/new/information/topSearch/query",
            data={"keyWord": str(keyword), "maxNum": "10"},
            headers=HEADERS_JSON,
            timeout=10,
        )
        resp.raise_for_status()
        hits = resp.json() or []
        wanted = str(keyword).strip()
        # Prefer the exact code match; otherwise settle for the first hit.
        entry = next((h for h in hits if str(h.get("code", "")).strip() == wanted), None)
        if entry is None and hits:
            entry = hits[0]
        if entry is None:
            return None
        return str(entry.get("orgId", "") or "") or None
    except Exception:
        return None
    finally:
        if delay > 0:
            time.sleep(delay)

def get_report_by_stock(page_num: int, seDate: str, column: str, stock: str) -> requests.Response:
    """
    POST one page of the announcement query for a single company.

    Args:
        page_num: page number sent as ``pageNum``.
        seDate: date range string, "YYYY-MM-DD~YYYY-MM-DD".
        column: exchange column (callers pass shse / szse / neeq).
        stock: company filter, "code,orgId".

    The search key is fixed to "募集说明书" and ``plate`` is left empty so no
    board is filtered out. Returns the raw response; status is not checked here.
    """
    form = dict(
        pageNum=page_num,
        pageSize=30,
        column=column,
        tabName="fulltext",
        category="",
        plate="",
        searchkey="募集说明书",
        secid="",
        stock=stock,
        trade="",
        seDate=seDate,
        sortName="code",
        sortType="asc",
        isHLtitle="false",
    )
    endpoint = "http://www.cninfo.com.cn/new/hisAnnouncement/query"
    return requests.post(endpoint, data=form, headers=HEADERS_JSON, timeout=15)

# ========= 主流程拉取 =========

def fetch_company_offering(sec_no_suf: str, column: str, orgid_cache: Dict[str,str]) -> List[Dict]:
    """
    Fetch and locally filter the "募集说明书" announcements of one company.

    Args:
        sec_no_suf: stock code without exchange suffix.
        column: exchange column passed through to get_report_by_stock().
        orgid_cache: code -> orgId cache; updated in place (failed lookups are
            cached as "" so they are not repeated).

    Returns:
        Announcement dicts that pass the title filters, de-duplicated on
        (secCode, adjunctUrl). Empty when no orgId could be resolved.

    Each page is requested once and retried once on failure; page 1 also
    supplies the total page count, so it is not requested a second time.
    """
    if sec_no_suf in orgid_cache:
        org_id = orgid_cache[sec_no_suf]
    else:
        org_id = cninfo_top_search_orgid(sec_no_suf) or ""
        orgid_cache[sec_no_suf] = org_id

    if not org_id:
        return []

    stock_str = f"{sec_no_suf},{org_id}"
    se_date = f"{DATE_BEGIN}~{DATE_END}"

    def _load_page(page_num: int) -> Optional[Dict]:
        """Return the parsed JSON object for one page, or None after two failed attempts."""
        for _ in range(2):
            try:
                resp = get_report_by_stock(page_num, se_date, column, stock_str)
                resp.raise_for_status()
                data = resp.json()
                return data if isinstance(data, dict) else {}
            except Exception:
                time.sleep(0.6)
        return None

    results: List[Dict] = []
    seen = set()  # (secCode, adjunctUrl)
    page, total = 1, 1
    while page <= total:
        data = _load_page(page)
        if data is not None:
            if page == 1:
                try:
                    total = max(int(data.get("totalpages", 0) or 0), 1)
                except (TypeError, ValueError):
                    total = 1
            for x in data.get("announcements", []) or []:
                t = norm(x.get("announcementTitle", "") or "")
                # Hard veto first, then drop titles that are clearly not convertible bonds.
                if is_summary_or_bad(t) or is_pure_equity(t) or is_non_convertible_bond(t):
                    continue
                key = (str(x.get("secCode", "")), str(x.get("adjunctUrl", "")))
                if key in seen:
                    continue
                seen.add(key)
                results.append(x)
        page += 1

    return results

def collect_all_offering_precise() -> List[Dict]:
    """
    Fetch announcements company by company, each on its own exchange column.

    Reads the mapping workbook, derives the suffix-free stock code, and calls
    fetch_company_offering() once per distinct code (several mapping rows can
    share a stock; re-querying would only return the same announcements).

    Returns:
        All announcements returned for the mapped companies.
    """
    df_map = pd.read_excel(mapping_file, dtype=str).fillna("")
    df_map["sec_no_suf"] = df_map["正股代码"].str.split(".").str[0]

    orgid_cache: Dict[str, str] = {}
    fetched = set()
    all_rec: List[Dict] = []

    for _, row in df_map.iterrows():
        sec_full = str(row.get("正股代码", "") or "")
        sec      = str(row.get("sec_no_suf", "") or "")
        if not sec or sec in fetched:
            continue
        fetched.add(sec)
        column = parse_exchange(sec_full)  # shse / szse / neeq
        all_rec.extend(fetch_company_offering(sec, column, orgid_cache))

    return all_rec

# ========= 去重（公司层面只保留一条）=========
def pick_latest_per_company(anns: List[Dict], valid_secs: set) -> List[Dict]:
    """
    Reduce the announcements to one per company.

    Only non-empty secCodes present in ``valid_secs`` are considered. Titles
    recognised by is_convertible() are preferred; without any, all of the
    company's announcements compete. The highest ann_weight() wins.
    """
    grouped: Dict[str, List[Dict]] = {}
    for ann in anns:
        code = str(ann.get("secCode", ""))
        if not code or code not in valid_secs:
            continue
        if code not in grouped:
            grouped[code] = []
        grouped[code].append(ann)

    winners: List[Dict] = []
    for candidates in grouped.values():
        convertible = [
            a for a in candidates
            if is_convertible(a.get("announcementTitle", "") or "")
        ]
        pool = convertible or candidates
        winners.append(max(pool, key=ann_weight))
    return winners

# ========= 写 Excel =========
def write_selected_excel(anns: List[Dict]) -> None:
    """
    Write one row per mapping-file entry to OUT_XLSX (last column is "链接").

    Args:
        anns: the chosen announcements; they are indexed by their ``secCode``
            (if several share a code, the last one wins).

    Rows whose company has no announcement keep the bond/company columns and
    leave title, year and link blank so they can be checked by hand.
    """
    df_map = pd.read_excel(mapping_file, dtype=str).fillna("")
    df_map["sec_no_suf"] = df_map["正股代码"].str.split(".").str[0]

    ann_dict: Dict[str, Dict] = {str(it.get("secCode", "")): it for it in anns}

    wb = openpyxl.Workbook()
    ws = wb.active
    ws.title = SHEET
    ws.append(["可转债代码","可转债名称","公司代码","公司简称","标题","年份","链接"])

    for _, row in df_map.iterrows():
        bond_code = row.get("代码", "")
        bond_name = row.get("名称", "")
        sec       = row.get("sec_no_suf", "")
        comp_name = row.get("正股名称", "")

        it = ann_dict.get(sec)
        if not it:
            ws.append([bond_code, bond_name, sec, comp_name, "", "", ""])
            continue

        # The key may be present with a null value; coerce so re.sub never sees None.
        raw_title = re.sub(r"<.*?>", "", str(it.get("announcementTitle") or "")).replace("：", "")
        title = f"《{raw_title}》"
        # Prefer a year printed in the title, otherwise use the publication timestamp.
        yr = extract_year_from_title(raw_title) or year_from_timestamp_ms(it.get("announcementTime", 0))
        # Links always use https; without an attachment path the cell stays blank
        # instead of showing the bare host (or ".../None").
        adjunct = str(it.get("adjunctUrl") or "")
        url = f"https://static.cninfo.com.cn/{adjunct}" if adjunct else ""
        ws.append([bond_code, bond_name, sec, comp_name, title, yr, url])

    wb.save(OUT_XLSX)
    print(f"✅ 已输出：{OUT_XLSX}（工作表：{SHEET}）")

# ========= 主流程 =========
def main():
    """Fetch per company, keep one announcement each, and export the workbook."""
    mapping = pd.read_excel(mapping_file, dtype=str).fillna("")
    # Stock codes carry an exchange suffix; keep the part before the dot.
    stock_codes = mapping["正股代码"].str.split(".").str[0]
    valid_secs = set(stock_codes.tolist())

    announcements = collect_all_offering_precise()
    write_selected_excel(pick_latest_per_company(announcements, valid_secs))
    print("---- 可转债募集说明书检索完成 ----")

if __name__ == "__main__":
    main()


✅ 已输出：/Users/sam/Desktop/cninfo_output/募集说明书链接_选取公司【募集】.xlsx（工作表：募集说明书）
---- 可转债募集说明书检索完成 ----


In [None]:
#7

In [14]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

"""
02_collect_offering_links.py  — 逐公司【标题关键词】检索版
按《正股映射结果.xlsx》的公司集合，逐公司用 searchkey="公司简称 募集说明书"
在对应交易所(column=shse/szse/neeq) + plate="" + 时间段(2000-01-01~今天)检索；
本地过滤掉摘要/纯股票类/非可转债公司债/优先股等噪音；
每家公司仅保留一条（可转债标题优先，其次最新）。

与年报版相同列顺序，但把“年报链接”列名改成“链接”。
"""

import os
import re
import time
import requests
import openpyxl
import pandas as pd
from typing import List, Dict, Tuple
from datetime import date

# ========= Parameters =========
# Announcement date window: fixed start date up to the day the script runs.
DATE_BEGIN = "2000-01-01"
DATE_END   = date.today().strftime("%Y-%m-%d")

# Output folder (created here if missing) and the input mapping workbook.
output_dir   = os.path.expanduser("~/Desktop/cninfo_output")
os.makedirs(output_dir, exist_ok=True)
mapping_file = os.path.expanduser("~/Desktop/正股映射结果.xlsx")

# Result workbook path and the title given to its worksheet.
OUT_XLSX = os.path.join(output_dir, "募集说明书链接_选取公司【募集】.xlsx")
SHEET    = "募集说明书"

# ========= Filter keywords =========
# Fragments that identify a convertible-bond title.
CONV_POS_KWS = ["可转换公司债券", "可转债", "可转换债券", "可转债券"]
# Hard veto: a title containing any of these is always discarded.
HARD_EXCLUDE = [
    "英文", "英文版", "Annual", "annual", "Summary", "summary",
    "摘要", "摘要版", "概要", "概览",
    "附录", "备查文件", "取消", "已取消",
    "提示性公告", "更名提示",
    "确认意见", "审核意见", "核准情况", "监事", "全体监事",
    "控股股东", "实际控制人", "一致行动人",
    "GDR", "全球存托凭证", "瑞士", "价格区间", "批准的公告",
    "公开增发", "增发"
]
# Pure stock offerings, and non-convertible bonds / preferred shares.
EQUITY_EXCLUDE = ["向特定对象发行股票", "以简易程序向特定对象发行股票", "发行A股股票", "非公开发行股票"]
NON_CONV_BOND_EXCLUDE = ["优先股", "公司债券", "绿色公司债", "科技创新公司债", "扶贫专项", "乡村振兴", "可续期公司债"]

# ========= 工具函数 =========
# Runs of ASCII whitespace, no-break spaces and ideographic spaces.
_SPACE_RE = re.compile(r"[\s\u00A0\u3000]+")


def norm(s: str) -> str:
    """Return ``s`` as text with all whitespace runs deleted (falsy input -> "")."""
    as_text = str(s or "")
    return _SPACE_RE.sub("", as_text.strip())

def any_kw_in(text: str, kws: List[str]) -> bool:
    """True if the whitespace-normalised text contains any non-empty keyword (case-sensitive)."""
    haystack = norm(text)
    for kw in kws:
        if kw and norm(kw) in haystack:
            return True
    return False

def contains_any(text: str, kws: List[str]) -> bool:
    """Alias of any_kw_in(): True if ``text`` contains any of ``kws``."""
    result = any_kw_in(text, kws)
    return result

def extract_year_from_title(title: str) -> str:
    """Return the first 19xx/20xx four-digit run in the normalised title, or ""."""
    hit = re.compile(r"(19\d{2}|20\d{2})").search(norm(title))
    return "" if hit is None else hit.group(1)

def year_from_timestamp_ms(ts: int) -> str:
    """Local-time year of a millisecond epoch timestamp; "" when it cannot be converted."""
    try:
        seconds = int(ts) / 1000
        return time.strftime("%Y", time.localtime(seconds))
    except Exception:
        return ""

def is_summary_or_bad(title: str) -> bool:
    """Return True when the title contains any HARD_EXCLUDE fragment."""
    flat = norm(title)
    return contains_any(flat, HARD_EXCLUDE)

def is_pure_equity(title: str) -> bool:
    """Return True when the title contains an EQUITY_EXCLUDE (stock offering) fragment."""
    flat = norm(title)
    return contains_any(flat, EQUITY_EXCLUDE)

def is_non_convertible_bond(title: str) -> bool:
    """Flag non-convertible bonds / preferred shares; convertible-bond titles are never flagged."""
    t = norm(title)
    for marker in ("可转换公司债券", "可转债", "可转换债券"):
        if marker in t:
            return False
    return contains_any(t, NON_CONV_BOND_EXCLUDE)

def is_convertible(title: str) -> bool:
    """Return True when the title contains any CONV_POS_KWS fragment."""
    flat = norm(title)
    return contains_any(flat, CONV_POS_KWS)

def ann_weight(it: Dict) -> Tuple[int, int, int]:
    """
    Sort key for choosing among announcements (larger is better).

    Returns (score, timestamp, title_length): score is -5 for a
    summary/outline/overview title plus +10 for an updated/revised/corrected
    one; timestamp is ``announcementTime`` as an int (0 when unusable); the
    normalised title length breaks remaining ties.
    """
    flat = norm(it.get("announcementTitle", "") or "")
    score = 0
    if any(marker in flat for marker in ("摘要", "概要", "概览")):
        score -= 5
    if contains_any(flat, ["更新","修订","更正","修正","（修订稿）","修订稿"]):
        score += 10
    try:
        stamp = int(it.get("announcementTime", 0))
    except Exception:
        stamp = 0
    return (score, stamp, len(flat))

def parse_exchange(code_with_suf: str) -> str:
    """Derive the query ``column`` from the code's suffix, falling back to its digit prefix.

    Unknown inputs default to "shse".
    """
    code = (code_with_suf or "").strip().upper()
    by_suffix = {".SH": "shse", ".SZ": "szse", ".BJ": "neeq"}
    for suffix, column in by_suffix.items():
        if code.endswith(suffix):
            return column

    digits = re.sub(r"\D", "", code)
    if digits.startswith(("600","601","603","605","688")):
        return "shse"
    if digits.startswith(("000","001","002","003","300","301")):
        return "szse"
    return "shse"

# ========= API calls (title-keyword search) =========
# Headers sent with every query (form-encoded body, XMLHttpRequest marker,
# Origin/Referer pointing at the site's search page over https).
HEADERS = {
    "Accept":           "*/*",
    "Accept-Language":  "zh-CN,zh;q=0.9",
    "Content-Type":     "application/x-www-form-urlencoded; charset=UTF-8",
    "Origin":           "https://www.cninfo.com.cn",
    "Referer":          "https://www.cninfo.com.cn/new/commonUrl/pageOfSearch?url=disclosure/list/search",
    "User-Agent":       "Mozilla/5.0",
    "X-Requested-With": "XMLHttpRequest",
}

def query_pages(page_num: int, column: str, searchkey: str, seDate: str) -> requests.Response:
    """
    POST one page of a keyword search to the announcement query endpoint.

    Args:
        page_num: page number sent as ``pageNum``.
        column: exchange column (callers pass shse / szse / neeq).
        searchkey: search text, e.g. "<short name> 募集说明书".
        seDate: date range string, "YYYY-MM-DD~YYYY-MM-DD".

    ``plate`` and ``stock`` are left empty, so the search is not restricted to
    a board or a company. Returns the raw response; status is not checked here.
    """
    form = dict(
        pageNum=page_num,
        pageSize=30,
        column=column,
        tabName="fulltext",
        category="",
        plate="",
        searchkey=searchkey,
        secid="",
        stock="",
        trade="",
        seDate=seDate,
        sortName="code",
        sortType="asc",
        isHLtitle="false",
    )
    endpoint = "https://www.cninfo.com.cn/new/hisAnnouncement/query"
    return requests.post(endpoint, data=form, headers=HEADERS, timeout=15)

def fetch_for_company(sec: str, comp_name: str, column: str) -> List[Dict]:
    """
    Search "<short name> 募集说明书" (or "<code> 募集说明书" when the name is blank)
    and keep only this company's usable announcements.

    Args:
        sec: stock code without suffix; results whose secCode differs are dropped.
        comp_name: company short name used in the search key.
        column: exchange column passed through to query_pages().

    Returns:
        Announcement dicts passing the title filters, de-duplicated on adjunctUrl.

    Each page is requested once and retried once on failure; page 1 also
    supplies the total page count, so it is not requested a second time.
    """
    key = f"{comp_name} 募集说明书" if comp_name else f"{sec} 募集说明书"
    seDate = f"{DATE_BEGIN}~{DATE_END}"

    def _load_page(page_num: int):
        """Return the parsed JSON object for one page, or None after two failed attempts."""
        for _ in range(2):
            try:
                r = query_pages(page_num, column, key, seDate)
                r.raise_for_status()
                data = r.json()
                return data if isinstance(data, dict) else {}
            except Exception:
                time.sleep(0.6)
        return None

    results: List[Dict] = []
    seen = set()
    page, total = 1, 1
    while page <= total:
        data = _load_page(page)
        if data is not None:
            if page == 1:
                try:
                    total = max(int(data.get("totalpages", 0) or 0), 1)
                except (TypeError, ValueError):
                    total = 1
            for x in data.get("announcements", []) or []:
                t = norm(x.get("announcementTitle", "") or "")
                # Hard veto first, then drop titles clearly unrelated to convertible bonds.
                if is_summary_or_bad(t) or is_pure_equity(t) or is_non_convertible_bond(t):
                    continue
                # Keyword search can return similarly named companies; match on secCode.
                if str(x.get("secCode", "")) != sec:
                    continue
                key2 = (sec, str(x.get("adjunctUrl", "")))
                if key2 in seen:
                    continue
                seen.add(key2)
                results.append(x)
        page += 1

    return results

# ========= 全量收集 & 公司维度择优 =========
def collect_all_offering() -> List[Dict]:
    """
    Query every mapped company once and keep its best announcement.

    Rows with neither a stock code nor a company name are skipped. Rows that
    repeat an already queried (code, name, column) combination are skipped too,
    because they would only repeat the same request and result.

    Returns:
        One announcement per company that produced any candidate:
        convertible-bond titles first, then highest ann_weight().
    """
    df_map = pd.read_excel(mapping_file, dtype=str).fillna("")
    df_map["sec_no_suf"] = df_map["正股代码"].str.split(".").str[0]
    all_rec: List[Dict] = []
    queried = set()

    for _, row in df_map.iterrows():
        sec_full  = str(row.get("正股代码", "") or "")
        sec       = str(row.get("sec_no_suf", "") or "")
        comp_name = str(row.get("正股名称", "") or "")
        if not sec and not comp_name:
            continue
        col = parse_exchange(sec_full)  # shse/szse/neeq
        lookup = (sec, comp_name, col)
        if lookup in queried:
            continue
        queried.add(lookup)

        cands = fetch_for_company(sec, comp_name, col)
        if not cands:
            continue
        conv = [x for x in cands if is_convertible(x.get("announcementTitle", "") or "")]
        all_rec.append(max(conv or cands, key=ann_weight))
    return all_rec

def pick_latest_per_company(anns: List[Dict], valid_secs: set) -> List[Dict]:
    """Keep one announcement per secCode in ``valid_secs``.

    Convertible-bond titles are preferred; the highest ann_weight() wins.
    """
    grouped: Dict[str, List[Dict]] = {}
    for ann in anns:
        code = str(ann.get("secCode",""))
        if code not in valid_secs:
            continue
        if code not in grouped:
            grouped[code] = []
        grouped[code].append(ann)

    winners = []
    for candidates in grouped.values():
        convertible = [c for c in candidates if is_convertible(c.get("announcementTitle","") or "")]
        pool = convertible or candidates
        winners.append(max(pool, key=ann_weight))
    return winners

# ========= 写 Excel =========
def write_selected_excel(anns: List[Dict]) -> None:
    """
    Write one row per mapping-file entry to OUT_XLSX (last column is "链接").

    Args:
        anns: the chosen announcements; they are indexed by their ``secCode``
            (if several share a code, the last one wins).

    Rows whose company has no announcement keep the bond/company columns and
    leave title, year and link blank.
    """
    df_map = pd.read_excel(mapping_file, dtype=str).fillna("")
    df_map["sec_no_suf"] = df_map["正股代码"].str.split(".").str[0]
    ann_dict = {str(it.get("secCode","")): it for it in anns}

    wb = openpyxl.Workbook()
    ws = wb.active
    ws.title = SHEET
    ws.append(["可转债代码","可转债名称","公司代码","公司简称","标题","年份","链接"])

    for _, row in df_map.iterrows():
        bond_code = row.get("代码","")
        bond_name = row.get("名称","")
        sec       = row.get("sec_no_suf","")
        comp_name = row.get("正股名称","")

        it = ann_dict.get(sec)
        if not it:
            ws.append([bond_code, bond_name, sec, comp_name, "", "", ""])
            continue

        # The key may be present with a null value; coerce so re.sub never sees None.
        raw_title = re.sub(r"<.*?>", "", str(it.get("announcementTitle") or "")).replace("：", "")
        title = f"《{raw_title}》"
        # Prefer a year printed in the title, otherwise use the publication timestamp.
        yr = extract_year_from_title(raw_title) or year_from_timestamp_ms(it.get("announcementTime",0))
        # Without an attachment path the link cell stays blank instead of showing
        # the bare host (or ".../None").
        adjunct = str(it.get("adjunctUrl") or "")
        url = f"https://static.cninfo.com.cn/{adjunct}" if adjunct else ""
        ws.append([bond_code, bond_name, sec, comp_name, title, yr, url])

    wb.save(OUT_XLSX)
    print(f"✅ 已输出：{OUT_XLSX}（工作表：{SHEET}）")

# ========= main =========
def main():
    """Search per company by title keyword, keep one announcement each, and export."""
    mapping = pd.read_excel(mapping_file, dtype=str).fillna("")
    stock_codes = mapping["正股代码"].str.split(".").str[0]
    valid_secs = set(stock_codes.tolist())

    announcements = collect_all_offering()
    write_selected_excel(pick_latest_per_company(announcements, valid_secs))
    print("---- 可转债募集说明书（逐公司·标题关键词）完成 ----")

if __name__ == "__main__":
    main()


✅ 已输出：/Users/sam/Desktop/cninfo_output/募集说明书链接_选取公司【募集】.xlsx（工作表：募集说明书）
---- 可转债募集说明书（逐公司·标题关键词）完成 ----


In [None]:
# 8

In [15]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import os, re, time, requests, openpyxl, pandas as pd
from typing import List, Dict, Tuple, Optional
from datetime import date

# Announcement date window: fixed start date up to the day the script runs.
DATE_BEGIN = "2000-01-01"
DATE_END   = date.today().strftime("%Y-%m-%d")

# Output folder (created here if missing) and the input mapping workbook.
output_dir   = os.path.expanduser("~/Desktop/cninfo_output")
os.makedirs(output_dir, exist_ok=True)
mapping_file = os.path.expanduser("~/Desktop/正股映射结果.xlsx")

# Result workbook path and the title given to its worksheet.
OUT_XLSX = os.path.join(output_dir, "募集说明书链接_选取公司【募集】.xlsx")
SHEET    = "募集说明书"

# ---- Filter keywords ----
# CONV_POS_KWS: fragments identifying a convertible-bond title.
# HARD_EXCLUDE: fragments that always disqualify a title.
# EQUITY_EXCLUDE / NON_CONV_BOND_EXCLUDE: stock offerings and non-convertible bonds.
CONV_POS_KWS = ["可转换公司债券", "可转债", "可转换债券", "可转债券"]
HARD_EXCLUDE = ["英文","英文版","Annual","annual","Summary","summary","摘要","摘要版","附录","备查文件",
                "取消","已取消","提示性公告","更名提示","确认意见","审核意见","核准情况","监事","全体监事",
                "控股股东","实际控制人","一致行动人","GDR","全球存托凭证","瑞士","价格区间","批准的公告","公开增发","增发"]
EQUITY_EXCLUDE = ["向特定对象发行股票","以简易程序向特定对象发行股票","发行A股股票","非公开发行股票"]
NON_CONV_BOND_EXCLUDE = ["优先股","公司债券","绿色公司债","科技创新公司债","扶贫专项","乡村振兴","可续期公司债"]

# Runs of ASCII whitespace, no-break spaces and ideographic spaces.
_SPACE_RE = re.compile(r"[\s\u00A0\u3000]+")


def norm(s: str) -> str:
    """Return ``s`` as text with all whitespace runs deleted (falsy input -> "")."""
    as_text = str(s or "")
    return _SPACE_RE.sub("", as_text.strip())
def contains_any(text: str, kws: List[str]) -> bool:
    """True if the whitespace-normalised text contains any truthy keyword (case-sensitive)."""
    haystack = norm(text)  # normalise the text once rather than once per keyword
    for kw in kws:
        if kw and norm(kw) in haystack:
            return True
    return False
def extract_year_from_title(title: str) -> str:
    """Return the first 19xx/20xx four-digit run in the normalised title, or ""."""
    hit = re.compile(r"(19\d{2}|20\d{2})").search(norm(title))
    if hit is None:
        return ""
    return hit.group(1)
def year_from_timestamp_ms(ts: int) -> str:
    """Local-time year of a millisecond epoch timestamp; "" when it cannot be converted."""
    try:
        seconds = int(ts) / 1000
        return time.strftime("%Y", time.localtime(seconds))
    except Exception:
        return ""
def is_summary_or_bad(title: str) -> bool:
    """Return True when the title contains any HARD_EXCLUDE fragment."""
    vetoed = contains_any(title, HARD_EXCLUDE)
    return vetoed
def is_pure_equity(title: str) -> bool:
    """Return True when the title contains an EQUITY_EXCLUDE (stock offering) fragment."""
    is_stock_offering = contains_any(title, EQUITY_EXCLUDE)
    return is_stock_offering
def is_non_convertible_bond(title: str) -> bool:
    t = norm(title)
    if ("可转换公司债券" in t) or ("可转债" in t) or ("可转换债券" in t): return False
    return contains_any(t, NON_CONV_BOND_EXCLUDE)
def is_convertible(title: str) -> bool: return contains_any(title, CONV_POS_KWS)
def ann_weight(it: Dict) -> Tuple[int, int, int]:
    """Build the sort key used to choose one announcement (larger is better).

    Key layout: (score, announcementTime as int, title length). The score is
    -5 for a title containing "摘要" plus +10 for an update/revision keyword.
    """
    cleaned = norm(it.get("announcementTitle", "") or "")
    penalty = -5 if "摘要" in cleaned else 0
    bonus = 10 if contains_any(cleaned, ["更新", "修订", "更正", "修正"]) else 0
    try:
        stamp = int(it.get("announcementTime", 0))
    except Exception:
        stamp = 0
    return (penalty + bonus, stamp, len(cleaned))

# ---- 交易所映射（关键修复：.SH -> sse）----
def parse_exchange(code_with_suf: str) -> str:
    """Map a stock code to the ``column`` value used in the cninfo query.

    A ``.SH`` / ``.SZ`` / ``.BJ`` suffix decides directly ("sse" / "szse" /
    "neeq"). Without a suffix the leading digits are used; codes that match
    no known range fall back to "sse".
    """
    s = (code_with_suf or "").strip().upper()
    # Shanghai must map to "sse" (this was the key fix of this revision).
    for suffix, column in ((".SH", "sse"), (".SZ", "szse"), (".BJ", "neeq")):
        if s.endswith(suffix):
            return column

    base = re.sub(r"\D", "", s)
    if base.startswith(("600", "601", "603", "605", "688", "689")):
        return "sse"
    if base.startswith(("000", "001", "002", "003", "300", "301")):
        return "szse"
    # Beijing Stock Exchange code ranges; previously these fell through to "sse".
    if base.startswith(("43", "83", "87", "920")):
        return "neeq"
    return "sse"

# Fallback order of `column` values per exchange: Shanghai tries "sse" first
# and, if that returns zero pages, tries "shse" once.
COLUMN_FALLBACKS = {
    "sse":  ["sse","shse"],
    "szse": ["szse"],
    "neeq": ["neeq"],
}

# Headers sent with every cninfo POST in this cell: a form-encoded
# XMLHttpRequest with the site's search page as Origin/Referer.
HEADERS_JSON = {
    "Accept": "*/*",
    "Accept-Language": "zh-CN,zh;q=0.9",
    "Content-Type": "application/x-www-form-urlencoded; charset=UTF-8",
    "Origin": "https://www.cninfo.com.cn",
    "Referer": "https://www.cninfo.com.cn/new/commonUrl/pageOfSearch?url=disclosure/list/search",
    "User-Agent": "Mozilla/5.0",
    "X-Requested-With": "XMLHttpRequest",
}

def cninfo_top_search_orgid(keyword: str, delay: float = 0.12) -> Optional[str]:
    """Look up the cninfo ``orgId`` for a stock code via the topSearch endpoint.

    The entry whose ``code`` equals *keyword* is preferred; if there is none,
    the first returned entry is used. Returns ``None`` when the request fails,
    the response is unusable, or the chosen entry has no orgId. Always sleeps
    *delay* seconds before returning, to throttle successive calls.
    """
    url = "https://www.cninfo.com.cn/new/information/topSearch/query"
    wanted = str(keyword).strip()
    try:
        r = requests.post(
            url,
            data={"keyWord": str(keyword), "maxNum": "10"},
            headers=HEADERS_JSON,
            timeout=10,
        )
        r.raise_for_status()
        arr = r.json() or []
        if not isinstance(arr, list):
            return None
        hits = [it for it in arr if isinstance(it, dict)]
        exact = next(
            (it for it in hits if str(it.get("code", "")).strip() == wanted),
            None,
        )
        chosen = exact if exact is not None else (hits[0] if hits else None)
        if chosen is None:
            return None
        # `or ""` keeps a null orgId from turning into the string "None".
        return str(chosen.get("orgId") or "") or None
    except (requests.RequestException, ValueError):
        # Best-effort lookup: network or JSON failures mean "not found".
        return None
    finally:
        time.sleep(delay)

def get_report_by_stock(page_num: int, seDate: str, column: str, stock: str) -> requests.Response:
    """POST one page of the hisAnnouncement query for a single stock.

    The search key is fixed to "募集说明书"; *stock* is the "code,orgId" pair.
    """
    endpoint = "https://www.cninfo.com.cn/new/hisAnnouncement/query"
    form = dict(
        pageNum=page_num, pageSize=30, column=column, tabName="fulltext",
        category="", plate="", searchkey="募集说明书",
        secid="", stock=stock, trade="", seDate=seDate,
        sortName="code", sortType="asc", isHLtitle="false",
    )
    return requests.post(endpoint, data=form, headers=HEADERS_JSON, timeout=15)

def fetch_company_offering(sec_no_suf: str, column_pref: str, orgid_cache: Dict[str,str]) -> List[Dict]:
    """Fetch the kept "募集说明书" announcements of one company.

    Args:
        sec_no_suf: Stock code without exchange suffix.
        column_pref: Preferred ``column`` value; its COLUMN_FALLBACKS entries
            are tried in order.
        orgid_cache: Mutable code -> orgId cache, updated in place.

    Returns:
        Announcement dicts whose secCode equals *sec_no_suf* and that pass the
        title filters, de-duplicated by adjunctUrl. Empty when no orgId is
        found.
    """
    res: List[Dict] = []
    org_id = orgid_cache.get(sec_no_suf) or cninfo_top_search_orgid(sec_no_suf) or ""
    orgid_cache[sec_no_suf] = org_id
    if not org_id:
        return res

    stock_str = f"{sec_no_suf},{org_id}"
    seDate = f"{DATE_BEGIN}~{DATE_END}"
    columns = COLUMN_FALLBACKS.get(column_pref, [column_pref])

    for idx, col in enumerate(columns):
        first_payload = None
        try:
            first = get_report_by_stock(1, seDate, col, stock_str)
            first.raise_for_status()
            payload0 = first.json() or {}
            total = int(payload0.get("totalpages", 0) or 0)
            first_payload = payload0
        except Exception:
            total = 0

        # Zero pages under this column and another column left: try that one.
        if total == 0 and idx < len(columns) - 1:
            continue

        seen = set()
        for page in range(1, max(total, 1) + 1):
            try:
                if page == 1 and first_payload is not None:
                    # Reuse the probe response instead of requesting page 1 again.
                    payload = first_payload
                else:
                    r = get_report_by_stock(page, seDate, col, stock_str)
                    r.raise_for_status()
                    payload = r.json() or {}
                for x in (payload.get("announcements", []) or []):
                    t = norm(x.get("announcementTitle", "") or "")
                    if is_summary_or_bad(t) or is_pure_equity(t) or is_non_convertible_bond(t):
                        continue
                    if str(x.get("secCode", "")) != sec_no_suf:
                        continue
                    key = (sec_no_suf, str(x.get("adjunctUrl", "")))
                    if key in seen:
                        continue
                    seen.add(key)
                    res.append(x)
            except Exception:
                # Best-effort: a failed page is skipped after a short pause.
                time.sleep(0.5)
        break  # one column was processed; do not try the remaining ones

    return res

def collect_all_offering_precise() -> List[Dict]:
    """Fetch candidate announcements for every stock in the mapping workbook.

    Reads ``mapping_file``, derives the suffix-less code from "正股代码" and
    calls fetch_company_offering once per distinct code. A stock that appears
    in several rows is queried only once.
    """
    df_map = pd.read_excel(mapping_file, dtype=str).fillna("")
    df_map["sec_no_suf"] = df_map["正股代码"].str.split(".").str[0]
    org_cache: Dict[str,str] = {}
    all_rec: List[Dict] = []
    visited = set()

    for _, row in df_map.iterrows():
        sec_full = str(row.get("正股代码","") or "")
        sec      = str(row.get("sec_no_suf","") or "")
        if not sec or sec in visited:
            continue
        visited.add(sec)
        column_pref = parse_exchange(sec_full)  # sse / szse / neeq
        all_rec.extend(fetch_company_offering(sec, column_pref, org_cache))
    return all_rec

def pick_latest_per_company(anns: List[Dict], valid_secs: set) -> List[Dict]:
    """Keep one announcement per secCode found in *valid_secs*.

    Within a company, titles recognised by is_convertible are preferred when
    any exist; the winner is the candidate with the largest ann_weight.
    """
    grouped: Dict[str, List[Dict]] = {}
    for ann in anns:
        code = str(ann.get("secCode", ""))
        if code not in valid_secs:
            continue
        grouped.setdefault(code, []).append(ann)

    winners = []
    for candidates in grouped.values():
        convertible = [
            c for c in candidates
            if is_convertible(c.get("announcementTitle", "") or "")
        ]
        pool = convertible or candidates
        winners.append(max(pool, key=ann_weight))
    return winners

def write_selected_excel(anns: List[Dict]) -> None:
    """Write one row per mapping-workbook row to OUT_XLSX.

    Rows whose stock has an entry in *anns* (matched on secCode) get the
    cleaned title, a year and a download link; other rows get empty cells.
    """
    df_map = pd.read_excel(mapping_file, dtype=str).fillna("")
    df_map["sec_no_suf"] = df_map["正股代码"].str.split(".").str[0]
    ann_dict = {str(it.get("secCode","")): it for it in anns}

    wb = openpyxl.Workbook()
    ws = wb.active
    ws.title = SHEET
    ws.append(["可转债代码","可转债名称","公司代码","公司简称","标题","年份","链接"])
    for _, row in df_map.iterrows():
        bond_code = row.get("代码","")
        bond_name = row.get("名称","")
        sec = row.get("sec_no_suf","")
        comp = row.get("正股名称","")
        it = ann_dict.get(sec)
        if not it:
            ws.append([bond_code, bond_name, sec, comp, "", "", ""])
            continue
        # `or ""` guards against a null title, which would crash re.sub.
        raw = re.sub(r"<.*?>", "", str(it.get("announcementTitle") or "")).replace("：","")
        title = f"《{raw}》"
        yr = extract_year_from_title(raw) or year_from_timestamp_ms(it.get("announcementTime",0))
        # No attachment path means no link, rather than a bare host URL.
        adjunct = str(it.get("adjunctUrl") or "").lstrip("/")
        url = f"https://static.cninfo.com.cn/{adjunct}" if adjunct else ""
        ws.append([bond_code, bond_name, sec, comp, title, yr, url])
    wb.save(OUT_XLSX)
    print(f"✅ 已输出：{OUT_XLSX}（工作表：{SHEET}）")

def main():
    """Run the pipeline: load valid codes, collect, pick one per company, export."""
    mapping = pd.read_excel(mapping_file, dtype=str).fillna("")
    codes = mapping["正股代码"].str.split(".").str[0]
    valid_secs = set(codes.tolist())
    picked = pick_latest_per_company(collect_all_offering_precise(), valid_secs)
    write_selected_excel(picked)
    print("---- 可转债募集说明书检索完成 ----")


if __name__ == "__main__":
    main()


✅ 已输出：/Users/sam/Desktop/cninfo_output/募集说明书链接_选取公司【募集】.xlsx（工作表：募集说明书）
---- 可转债募集说明书检索完成 ----


In [None]:
#9

In [16]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

"""
02_collect_offering_links.py — 逐公司·标题关键词（按交易所一次检索）
按《正股映射结果.xlsx》逐公司，用 searchkey="公司简称 募集说明书"
只在对应交易所(column=sse/szse/neeq, plate="")检索一次；
本地过滤摘要/纯股票类/非可转债公司债/优先股等；每家公司保留 1 条最新版。
导出列名把“年报链接”改为“链接”。
"""

import os, re, time, requests, openpyxl, pandas as pd
from typing import List, Dict, Tuple
from datetime import date

# ===== Parameters =====
# Disclosure-date window; the two values are joined as "begin~end" for the query's seDate field.
DATE_BEGIN = "2000-01-01"
DATE_END   = date.today().strftime("%Y-%m-%d")

# Output folder (created when this cell runs, if missing) and the input mapping workbook.
output_dir   = os.path.expanduser("~/Desktop/cninfo_output")
os.makedirs(output_dir, exist_ok=True)
mapping_file = os.path.expanduser("~/Desktop/正股映射结果.xlsx")

# Path of the result workbook and the title given to its worksheet.
OUT_XLSX = os.path.join(output_dir, "募集说明书链接_选取公司【募集】.xlsx")
SHEET    = "募集说明书"

# ===== Filter keywords =====
# Substrings that mark a title as a convertible-bond document (used by is_convertible).
CONV_POS_KWS = ["可转换公司债券", "可转债", "可转换债券", "可转债券"]
# A title containing any of these is always dropped (used by is_summary_or_bad).
HARD_EXCLUDE = [
    "英文","英文版","Annual","annual","Summary","summary",
    "摘要","摘要版","概要","概览",
    "附录","备查文件","取消","已取消",
    "提示性公告","更名提示",
    "确认意见","审核意见","核准情况","监事","全体监事",
    "控股股东","实际控制人","一致行动人",
    "GDR","全球存托凭证","瑞士","价格区间","批准的公告",
    "公开增发","增发"
]
# A title containing any of these is dropped as an equity offering (used by is_pure_equity).
EQUITY_EXCLUDE = ["向特定对象发行股票","以简易程序向特定对象发行股票","发行A股股票","非公开发行股票"]
# Dropped unless the title also carries a convertible-bond marker (used by is_non_convertible_bond).
NON_CONV_BOND_EXCLUDE = ["优先股","公司债券","绿色公司债","科技创新公司债","扶贫专项","乡村振兴","可续期公司债"]

# ===== 小工具 =====
_SPACE_RE = re.compile(r"[\s\u00A0\u3000]+")


def norm(s: str) -> str:
    """Return *s* as a string with all whitespace removed ("" for falsy input)."""
    text = str(s) if s else ""
    return _SPACE_RE.sub("", text.strip())


def contains_any(text: str, kws: List[str]) -> bool:
    """Return True when any truthy keyword, after ``norm``, occurs in the normalised text."""
    haystack = norm(text)
    for kw in kws:
        if kw and norm(kw) in haystack:
            return True
    return False
def extract_year_from_title(title: str) -> str:
    """Return the first 19xx/20xx digit run in the whitespace-stripped title, or ""."""
    found = re.search(r"(19\d{2}|20\d{2})", norm(title))
    if not found:
        return ""
    return found.group(1)
def year_from_timestamp_ms(ts: int) -> str:
    """Return the four-digit year of a millisecond epoch timestamp.

    Returns "" when *ts* is missing, non-numeric, non-positive or out of range.
    """
    # Local import: the file only imports `date` from datetime.
    from datetime import datetime, timedelta, timezone

    try:
        millis = int(ts)
    except (TypeError, ValueError, OverflowError):
        return ""
    if millis <= 0:
        # Callers pass 0 when "announcementTime" is absent; do not report "1970".
        return ""
    try:
        # Fixed UTC+8 so the result does not depend on the host timezone.
        # NOTE(review): assumes cninfo timestamps denote China Standard Time - confirm.
        cst = timezone(timedelta(hours=8))
        return datetime.fromtimestamp(millis / 1000, tz=cst).strftime("%Y")
    except (OverflowError, OSError, ValueError):
        return ""
def is_summary_or_bad(title: str) -> bool:
    """True when the title contains any HARD_EXCLUDE keyword."""
    return contains_any(title, HARD_EXCLUDE)


def is_pure_equity(title: str) -> bool:
    """True when the title contains any EQUITY_EXCLUDE keyword."""
    return contains_any(title, EQUITY_EXCLUDE)


def is_non_convertible_bond(title: str) -> bool:
    """True when the title hits NON_CONV_BOND_EXCLUDE and has no convertible-bond marker."""
    cleaned = norm(title)
    for marker in ("可转换公司债券", "可转债", "可转换债券"):
        if marker in cleaned:
            return False
    return contains_any(cleaned, NON_CONV_BOND_EXCLUDE)


def is_convertible(title: str) -> bool:
    """True when the normalised title contains any CONV_POS_KWS keyword."""
    cleaned = norm(title)
    return contains_any(cleaned, CONV_POS_KWS)

def ann_weight(it: Dict) -> Tuple[int, int, int]:
    """Build the sort key used to choose one announcement (larger is better).

    Key layout: (score, announcementTime as int, title length). The score is
    -5 for a summary/overview wording plus +10 for an update/revision keyword.
    """
    cleaned = norm(it.get("announcementTitle", "") or "")
    score = 0
    if any(word in cleaned for word in ("摘要", "概要", "概览")):
        score -= 5
    if contains_any(cleaned, ["更新","修订","更正","修正","修订稿","（修订稿）"]):
        score += 10
    try:
        stamp = int(it.get("announcementTime", 0))
    except Exception:
        stamp = 0
    return (score, stamp, len(cleaned))

# ===== 交易所解析（注意：沪市用 sse）=====
def parse_exchange(code_with_suf: str) -> str:
    """Map a stock code to the ``column`` value used in the cninfo query.

    A ``.SH`` / ``.SZ`` / ``.BJ`` suffix decides directly ("sse" / "szse" /
    "neeq"). Without a suffix the leading digits are used; codes that match
    no known range fall back to "sse".
    """
    s = (code_with_suf or "").strip().upper()
    # Shanghai must map to "sse".
    for suffix, column in ((".SH", "sse"), (".SZ", "szse"), (".BJ", "neeq")):
        if s.endswith(suffix):
            return column

    base = re.sub(r"\D", "", s)
    if base.startswith(("600", "601", "603", "605", "688", "689")):
        return "sse"
    if base.startswith(("000", "001", "002", "003", "300", "301")):
        return "szse"
    # Beijing Stock Exchange code ranges; previously these fell through to "sse".
    if base.startswith(("43", "83", "87", "920")):
        return "neeq"
    return "sse"

# ===== API =====
# Headers sent with every cninfo POST in this cell: a form-encoded
# XMLHttpRequest with the site's search page as Origin/Referer.
HEADERS = {
    "Accept": "*/*",
    "Accept-Language": "zh-CN,zh;q=0.9",
    "Content-Type": "application/x-www-form-urlencoded; charset=UTF-8",
    "Origin": "https://www.cninfo.com.cn",
    "Referer": "https://www.cninfo.com.cn/new/commonUrl/pageOfSearch?url=disclosure/list/search",
    "User-Agent": "Mozilla/5.0",
    "X-Requested-With": "XMLHttpRequest",
}

def query_once(page_num: int, column: str, searchkey: str, seDate: str) -> requests.Response:
    """POST one page of the hisAnnouncement query for a title keyword.

    ``plate`` is sent empty, so the search is limited only by *column*.
    """
    endpoint = "https://www.cninfo.com.cn/new/hisAnnouncement/query"
    form = dict(
        pageNum=page_num, pageSize=30,
        column=column, tabName="fulltext",
        category="", plate="",
        searchkey=searchkey, secid="", stock="", trade="",
        seDate=seDate, sortName="code", sortType="asc",
        isHLtitle="false",
    )
    return requests.post(endpoint, data=form, headers=HEADERS, timeout=15)

def fetch_for_company(sec: str, comp_name: str, column: str) -> List[Dict]:
    """Search one exchange for a company's "募集说明书" announcements.

    The search key is "<company name> 募集说明书", or "<code> 募集说明书" when
    no name is given. Only announcements whose secCode equals *sec* and that
    pass the title filters are kept, de-duplicated by adjunctUrl.
    """
    key = f"{comp_name} 募集说明书" if comp_name else f"{sec} 募集说明书"
    seDate = f"{DATE_BEGIN}~{DATE_END}"

    # Probe page 1 for the page count and keep its payload for reuse.
    first_payload = None
    try:
        first = query_once(1, column, key, seDate)
        first.raise_for_status()
        payload0 = first.json() or {}
        total = int(payload0.get("totalpages", 0) or 0)
        first_payload = payload0
    except Exception:
        total = 0

    results, seen = [], set()
    for page in range(1, max(total, 1) + 1):
        try:
            if page == 1 and first_payload is not None:
                # Reuse the probe response instead of requesting page 1 again.
                payload = first_payload
            else:
                r = query_once(page, column, key, seDate)
                r.raise_for_status()
                payload = r.json() or {}
            for x in (payload.get("announcements", []) or []):
                # Keep only this company's announcements.
                if str(x.get("secCode", "")) != sec:
                    continue
                t = norm(x.get("announcementTitle", "") or "")
                if is_summary_or_bad(t) or is_pure_equity(t) or is_non_convertible_bond(t):
                    continue
                k = (sec, str(x.get("adjunctUrl", "")))
                if k in seen:
                    continue
                seen.add(k)
                results.append(x)
        except Exception:
            # Best-effort: a failed page is skipped after a short pause.
            time.sleep(0.5)

    return results

# ===== 汇总 & 取优 =====
def collect_all_offering() -> List[Dict]:
    """Return the best announcement for each stock in the mapping workbook.

    Rows without a stock code are skipped, because fetch_for_company keeps
    only hits whose secCode equals the code. A stock that appears in several
    rows is queried only once.
    """
    df_map = pd.read_excel(mapping_file, dtype=str).fillna("")
    df_map["sec_no_suf"] = df_map["正股代码"].str.split(".").str[0]
    all_rec: List[Dict] = []
    visited = set()

    for _, row in df_map.iterrows():
        sec_full  = str(row.get("正股代码","") or "")
        sec       = str(row.get("sec_no_suf","") or "")
        comp_name = str(row.get("正股名称","") or "")
        if not sec or sec in visited:
            continue
        visited.add(sec)
        col = parse_exchange(sec_full)  # sse / szse / neeq
        cands = fetch_for_company(sec, comp_name, col)
        if not cands:
            continue
        # Convertible-bond titles win when present; ann_weight breaks ties.
        conv = [x for x in cands if is_convertible(x.get("announcementTitle","") or "")]
        all_rec.append(max(conv or cands, key=ann_weight))
    return all_rec

def pick_latest_per_company(anns: List[Dict], valid_secs: set) -> List[Dict]:
    """Keep one announcement per secCode found in *valid_secs*.

    Within a company, titles recognised by is_convertible are preferred when
    any exist; the winner is the candidate with the largest ann_weight.
    """
    grouped: Dict[str, List[Dict]] = {}
    for ann in anns:
        code = str(ann.get("secCode", ""))
        if code not in valid_secs:
            continue
        grouped.setdefault(code, []).append(ann)

    winners = []
    for candidates in grouped.values():
        convertible = [
            c for c in candidates
            if is_convertible(c.get("announcementTitle", "") or "")
        ]
        pool = convertible or candidates
        winners.append(max(pool, key=ann_weight))
    return winners

# ===== 写 Excel =====
def write_selected_excel(anns: List[Dict]) -> None:
    """Write one row per mapping-workbook row to OUT_XLSX.

    Rows whose stock has an entry in *anns* (matched on secCode) get the
    cleaned title, a year and a download link; other rows get empty cells.
    """
    df_map = pd.read_excel(mapping_file, dtype=str).fillna("")
    df_map["sec_no_suf"] = df_map["正股代码"].str.split(".").str[0]
    ann_dict = {str(it.get("secCode","")): it for it in anns}

    wb = openpyxl.Workbook()
    ws = wb.active
    ws.title = SHEET
    ws.append(["可转债代码","可转债名称","公司代码","公司简称","标题","年份","链接"])
    for _, row in df_map.iterrows():
        bond_code = row.get("代码","")
        bond_name = row.get("名称","")
        sec = row.get("sec_no_suf","")
        comp = row.get("正股名称","")
        it = ann_dict.get(sec)
        if not it:
            ws.append([bond_code, bond_name, sec, comp, "", "", ""])
            continue
        # `or ""` guards against a null title, which would crash re.sub.
        raw = re.sub(r"<.*?>", "", str(it.get("announcementTitle") or "")).replace("：","")
        title = f"《{raw}》"
        yr = extract_year_from_title(raw) or year_from_timestamp_ms(it.get("announcementTime",0))
        # No attachment path means no link, rather than a bare host URL.
        adjunct = str(it.get("adjunctUrl") or "").lstrip("/")
        url = f"https://static.cninfo.com.cn/{adjunct}" if adjunct else ""
        ws.append([bond_code, bond_name, sec, comp, title, yr, url])
    wb.save(OUT_XLSX)
    print(f"✅ 已输出：{OUT_XLSX}（工作表：{SHEET}）")

# ===== main =====
def main():
    """Run the pipeline: load valid codes, collect, pick one per company, export."""
    mapping = pd.read_excel(mapping_file, dtype=str).fillna("")
    codes = mapping["正股代码"].str.split(".").str[0]
    valid_secs = set(codes.tolist())
    picked = pick_latest_per_company(collect_all_offering(), valid_secs)
    write_selected_excel(picked)
    print("---- 可转债募集说明书（逐公司·标题关键词·单交易所一次）完成 ----")


if __name__ == "__main__":
    main()


✅ 已输出：/Users/sam/Desktop/cninfo_output/募集说明书链接_选取公司【募集】.xlsx（工作表：募集说明书）
---- 可转债募集说明书（逐公司·标题关键词·单交易所一次）完成 ----


In [None]:
#10

In [17]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

"""
02_collect_offering_links.py — 逐公司关键词检索（按交易所+板块精确查询）
- 读取《正股映射结果.xlsx》
- 逐公司构造 searchkey：
    * 优先：'正股名称 募集说明书'
    * 兜底：'正股代码 募集说明书'
- 只在该公司所属交易所(column=sse/szse/neeq)的**对应板块**里查（不再用 plate=""）：
    * 沪市(.SH)：
        - 科创板：代码以 688/689 开头 → plate="shkcp"，兜底 "sh"
        - 主板：其余沪市代码 → plate="shmb"，兜底 "sh"
    * 深市(.SZ)：
        - 创业板：代码以 300/301 开头 → plate="szcy"，兜底 "sz"
        - 主板：其余深市代码 → plate="szmb"，兜底 "sz"
    * 北交所(.BJ)：plate="bj"
- 本地过滤摘要/纯股票类/非可转债公司债/优先股；每家公司仅保留一条（可转债标题优先、时间近优先）。
- 导出列名把“年报链接”改为“链接”。
"""

import os, re, time, requests, openpyxl, pandas as pd
from typing import List, Dict, Tuple
from datetime import date

# ========== Tunable parameters ==========
# Disclosure-date window; the two values are joined as "begin~end" for the query's seDate field.
DATE_BEGIN = "2000-01-01"
DATE_END   = date.today().strftime("%Y-%m-%d")

# Output folder (created when this cell runs, if missing) and the input mapping workbook.
output_dir   = os.path.expanduser("~/Desktop/cninfo_output")
os.makedirs(output_dir, exist_ok=True)
mapping_file = os.path.expanduser("~/Desktop/正股映射结果.xlsx")

# Path of the result workbook and the title given to its worksheet.
OUT_XLSX = os.path.join(output_dir, "募集说明书链接_选取公司【募集】.xlsx")
SHEET    = "募集说明书"

# ========== Filter keywords ==========
# Substrings that mark a title as a convertible-bond document (used by is_convertible).
CONV_POS_KWS = ["可转换公司债券", "可转债", "可转换债券", "可转债券"]
# A title containing any of these is always dropped (used by is_summary_or_bad).
HARD_EXCLUDE = [
    "英文","英文版","Annual","annual","Summary","summary",
    "摘要","摘要版","概要","概览",
    "附录","备查文件","取消","已取消",
    "提示性公告","更名提示",
    "确认意见","审核意见","核准情况","监事","全体监事",
    "控股股东","实际控制人","一致行动人",
    "GDR","全球存托凭证","瑞士","价格区间","批准的公告",
    "公开增发","增发"
]
# A title containing any of these is dropped as an equity offering (used by is_pure_equity).
EQUITY_EXCLUDE = ["向特定对象发行股票","以简易程序向特定对象发行股票","发行A股股票","非公开发行股票"]
# Dropped unless the title also carries a convertible-bond marker (used by is_non_convertible_bond).
NON_CONV_BOND_EXCLUDE = ["优先股","公司债券","绿色公司债","科技创新公司债","扶贫专项","乡村振兴","可续期公司债"]

# ========== 小工具 ==========
_SPACE_RE = re.compile(r"[\s\u00A0\u3000]+")


def norm(s: str) -> str:
    """Return *s* as a string with all whitespace removed ("" for falsy input)."""
    as_text = str(s or "")
    return _SPACE_RE.sub("", as_text.strip())


def contains_any(text: str, kws: List[str]) -> bool:
    """Return True when any truthy keyword, after ``norm``, occurs in the normalised text."""
    haystack = norm(text)
    usable = [kw for kw in kws if kw]
    return any(norm(kw) in haystack for kw in usable)

def extract_year_from_title(title: str) -> str:
    """Return the first 19xx/20xx digit run in the whitespace-stripped title, or ""."""
    cleaned = norm(title)
    hit = re.search(r"(19\d{2}|20\d{2})", cleaned)
    if hit is None:
        return ""
    return hit.group(1)

def year_from_timestamp_ms(ts: int) -> str:
    """Return the four-digit year of a millisecond epoch timestamp.

    Returns "" when *ts* is missing, non-numeric, non-positive or out of range.
    """
    # Local import: the file only imports `date` from datetime.
    from datetime import datetime, timedelta, timezone

    try:
        millis = int(ts)
    except (TypeError, ValueError, OverflowError):
        return ""
    if millis <= 0:
        # Callers pass 0 when "announcementTime" is absent; do not report "1970".
        return ""
    try:
        # Fixed UTC+8 so the result does not depend on the host timezone.
        # NOTE(review): assumes cninfo timestamps denote China Standard Time - confirm.
        cst = timezone(timedelta(hours=8))
        return datetime.fromtimestamp(millis / 1000, tz=cst).strftime("%Y")
    except (OverflowError, OSError, ValueError):
        return ""

def is_summary_or_bad(title: str) -> bool:
    """True when the title contains any HARD_EXCLUDE keyword."""
    return contains_any(title, HARD_EXCLUDE)


def is_pure_equity(title: str) -> bool:
    """True when the title contains any EQUITY_EXCLUDE keyword."""
    return contains_any(title, EQUITY_EXCLUDE)


def is_non_convertible_bond(title: str) -> bool:
    """True when the title hits NON_CONV_BOND_EXCLUDE and has no convertible-bond marker."""
    cleaned = norm(title)
    has_marker = any(
        marker in cleaned for marker in ("可转换公司债券", "可转债", "可转换债券")
    )
    if has_marker:
        return False
    return contains_any(cleaned, NON_CONV_BOND_EXCLUDE)


def is_convertible(title: str) -> bool:
    """True when the normalised title contains any CONV_POS_KWS keyword."""
    cleaned = norm(title)
    return contains_any(cleaned, CONV_POS_KWS)

def ann_weight(it: Dict) -> Tuple[int, int, int]:
    """Build the sort key used to choose one announcement (larger is better).

    Key layout: (score, announcementTime as int, title length). The score is
    -5 for a summary/overview wording plus +10 for an update/revision keyword.
    """
    cleaned = norm(it.get("announcementTitle", "") or "")
    looks_like_summary = ("摘要" in cleaned) or ("概要" in cleaned) or ("概览" in cleaned)
    looks_revised = contains_any(cleaned, ["更新","修订","更正","修正","修订稿","（修订稿）"])
    score = (-5 if looks_like_summary else 0) + (10 if looks_revised else 0)
    try:
        stamp = int(it.get("announcementTime", 0))
    except Exception:
        stamp = 0
    return (score, stamp, len(cleaned))

# ========== 交易所 & 板块解析 ==========
def parse_exchange(code_with_suf: str) -> str:
    """Return the cninfo ``column``: Shanghai "sse", Shenzhen "szse", Beijing "neeq".

    A ``.SH`` / ``.SZ`` / ``.BJ`` suffix decides directly. Without a suffix
    the leading digits are used; codes that match no known range fall back
    to "sse".
    """
    s = (code_with_suf or "").strip().upper()
    for suffix, column in ((".SH", "sse"), (".SZ", "szse"), (".BJ", "neeq")):
        if s.endswith(suffix):
            return column

    base = re.sub(r"\D", "", s)
    if base.startswith(("600", "601", "603", "605", "688", "689")):
        return "sse"
    if base.startswith(("000", "001", "002", "003", "300", "301")):
        return "szse"
    # Beijing Stock Exchange code ranges; previously these fell through to "sse".
    if base.startswith(("43", "83", "87", "920")):
        return "neeq"
    return "sse"

def decide_plates_for_code(sec_no_suf: str, column: str) -> List[str]:
    """Return the plate values to try, in priority order, for a code.

    - sse:  codes starting 688/689 -> ['shkcp', 'sh']; otherwise ['shmb', 'sh']
    - szse: codes starting 300/301 -> ['szcy', 'sz']; otherwise ['szmb', 'sz']
    - neeq: ['bj']
    - any other column: ['sh', 'sz'] (fallback, not expected in practice)
    """
    code = str(sec_no_suf or "")
    if column == "neeq":
        return ["bj"]
    if column == "sse":
        return ["shkcp", "sh"] if code.startswith(("688", "689")) else ["shmb", "sh"]
    if column == "szse":
        return ["szcy", "sz"] if code.startswith(("300", "301")) else ["szmb", "sz"]
    return ["sh", "sz"]

# ========== API ==========
# Headers sent with every cninfo POST in this cell: a form-encoded
# XMLHttpRequest with the site's search page as Origin/Referer.
HEADERS = {
    "Accept": "*/*",
    "Accept-Language": "zh-CN,zh;q=0.9",
    "Content-Type": "application/x-www-form-urlencoded; charset=UTF-8",
    "Origin": "https://www.cninfo.com.cn",
    "Referer": "https://www.cninfo.com.cn/new/commonUrl/pageOfSearch?url=disclosure/list/search",
    "User-Agent": "Mozilla/5.0",
    "X-Requested-With": "XMLHttpRequest",
}

def query_once(page_num: int, column: str, plate: str, searchkey: str, seDate: str) -> requests.Response:
    """POST one page of the hisAnnouncement query for a keyword within one plate."""
    endpoint = "https://www.cninfo.com.cn/new/hisAnnouncement/query"
    form = dict(
        pageNum=page_num, pageSize=30,
        column=column, tabName="fulltext",
        category="",
        plate=plate,  # a concrete plate instead of ""
        searchkey=searchkey,
        secid="", stock="", trade="",
        seDate=seDate, sortName="code", sortType="asc",
        isHLtitle="false",
    )
    return requests.post(endpoint, data=form, headers=HEADERS, timeout=15)

# ========== 单公司检索 ==========
def _search_plate(sec: str, column: str, plate: str, key: str, seDate: str) -> List[Dict]:
    """Return the kept announcements of *sec* for one plate/keyword pair."""
    try:
        first = query_once(1, column, plate, key, seDate)
        first.raise_for_status()
        first_payload = first.json() or {}
        total = int(first_payload.get("totalpages", 0) or 0)
    except Exception:
        return []

    kept: List[Dict] = []
    seen = set()
    # NOTE(review): presumably totalpages can be 0 while page 1 still carries
    # rows - confirm. Page 1 is already in hand, so it is always inspected.
    for page in range(1, max(total, 1) + 1):
        try:
            if page == 1:
                payload = first_payload
            else:
                r = query_once(page, column, plate, key, seDate)
                r.raise_for_status()
                payload = r.json() or {}
            for x in (payload.get("announcements", []) or []):
                # Keep only this company's announcements.
                if str(x.get("secCode", "")) != sec:
                    continue
                t = norm(x.get("announcementTitle", "") or "")
                if is_summary_or_bad(t) or is_pure_equity(t) or is_non_convertible_bond(t):
                    continue
                k = (sec, str(x.get("adjunctUrl", "")))
                if k in seen:
                    continue
                seen.add(k)
                kept.append(x)
        except Exception:
            # Best-effort: a failed page is skipped after a short pause.
            time.sleep(0.4)
    return kept


def fetch_for_company(sec: str, comp_name: str, column: str) -> List[Dict]:
    """Search a company's "募集说明书" announcements plate by plate.

    Plates come from decide_plates_for_code (specific plate first, then the
    exchange-wide one). Within each plate the key "<name> 募集说明书" is tried
    first (when a name is given), then "<code> 募集说明书". The first
    plate/key pair that yields kept announcements ends the search.
    Only records whose secCode equals *sec* are kept.
    """
    seDate = f"{DATE_BEGIN}~{DATE_END}"
    keys = [f"{comp_name} 募集说明书"] if comp_name else []
    keys.append(f"{sec} 募集说明书")

    for plate in decide_plates_for_code(sec, column):
        for key in keys:
            hits = _search_plate(sec, column, plate, key, seDate)
            if hits:
                return hits
    return []

# ========== 汇总 & 公司维度取优 ==========
def collect_all_offering() -> List[Dict]:
    """Return the best announcement for each stock in the mapping workbook.

    Rows without a stock code are skipped, because fetch_for_company keeps
    only hits whose secCode equals the code. A stock that appears in several
    rows is queried only once.
    """
    df_map = pd.read_excel(mapping_file, dtype=str).fillna("")
    df_map["sec_no_suf"] = df_map["正股代码"].str.split(".").str[0]
    all_rec: List[Dict] = []
    visited = set()

    for _, row in df_map.iterrows():
        sec_full  = str(row.get("正股代码","") or "")
        sec       = str(row.get("sec_no_suf","") or "")
        comp_name = str(row.get("正股名称","") or "")
        if not sec or sec in visited:
            continue
        visited.add(sec)
        column = parse_exchange(sec_full)  # sse / szse / neeq
        cands = fetch_for_company(sec, comp_name, column)
        if not cands:
            continue
        # Convertible-bond titles win when present; ann_weight breaks ties.
        conv = [x for x in cands if is_convertible(x.get("announcementTitle","") or "")]
        all_rec.append(max(conv or cands, key=ann_weight))
    return all_rec

def pick_latest_per_company(anns: List[Dict], valid_secs: set) -> List[Dict]:
    """Keep one announcement per secCode found in *valid_secs*.

    Within a company, titles recognised by is_convertible are preferred when
    any exist; the winner is the candidate with the largest ann_weight.
    """
    grouped: Dict[str, List[Dict]] = {}
    for ann in anns:
        code = str(ann.get("secCode", ""))
        if code not in valid_secs:
            continue
        grouped.setdefault(code, []).append(ann)

    winners = []
    for candidates in grouped.values():
        convertible = [
            c for c in candidates
            if is_convertible(c.get("announcementTitle", "") or "")
        ]
        pool = convertible or candidates
        winners.append(max(pool, key=ann_weight))
    return winners

# ========== 写 Excel ==========
def write_selected_excel(anns: List[Dict]) -> None:
    """Write one row per mapping-workbook row to OUT_XLSX.

    Rows whose stock has an entry in *anns* (matched on secCode) get the
    cleaned title, a year and a download link; other rows get empty cells.
    """
    df_map = pd.read_excel(mapping_file, dtype=str).fillna("")
    df_map["sec_no_suf"] = df_map["正股代码"].str.split(".").str[0]
    ann_dict = {str(it.get("secCode","")): it for it in anns}

    wb = openpyxl.Workbook()
    ws = wb.active
    ws.title = SHEET
    ws.append(["可转债代码","可转债名称","公司代码","公司简称","标题","年份","链接"])
    for _, row in df_map.iterrows():
        bond_code = row.get("代码","")
        bond_name = row.get("名称","")
        sec = row.get("sec_no_suf","")
        comp = row.get("正股名称","")
        it = ann_dict.get(sec)
        if not it:
            ws.append([bond_code, bond_name, sec, comp, "", "", ""])
            continue
        # `or ""` guards against a null title, which would crash re.sub.
        raw = re.sub(r"<.*?>", "", str(it.get("announcementTitle") or "")).replace("：","")
        title = f"《{raw}》"
        yr = extract_year_from_title(raw) or year_from_timestamp_ms(it.get("announcementTime",0))
        # No attachment path means no link, rather than a bare host URL.
        adjunct = str(it.get("adjunctUrl") or "").lstrip("/")
        url = f"https://static.cninfo.com.cn/{adjunct}" if adjunct else ""
        ws.append([bond_code, bond_name, sec, comp, title, yr, url])
    wb.save(OUT_XLSX)
    print(f"✅ 已输出：{OUT_XLSX}（工作表：{SHEET}）")

# ========== main ==========
def main():
    """Run the pipeline: load valid codes, collect, pick one per company, export."""
    mapping = pd.read_excel(mapping_file, dtype=str).fillna("")
    codes = mapping["正股代码"].str.split(".").str[0]
    valid_secs = set(codes.tolist())
    picked = pick_latest_per_company(collect_all_offering(), valid_secs)
    write_selected_excel(picked)
    print("---- 可转债募集说明书（逐公司·按板块精确查询）完成 ----")


if __name__ == "__main__":
    main()


✅ 已输出：/Users/sam/Desktop/cninfo_output/募集说明书链接_选取公司【募集】.xlsx（工作表：募集说明书）
---- 可转债募集说明书（逐公司·按板块精确查询）完成 ----


In [19]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

"""
02_collect_offering_links.py — 逐公司关键词检索（按交易所→对应板块依次查询）
需求实现：
  1) 上交所: plate 依次 ["sh","shmb","shkcp"]；深交所: ["sz","szmb","szcy"]；北交所: ["bj"]
  2) 每家公司优先用 “正股名称 募集说明书” 搜索；若全板块皆空，再用 “正股代码 募集说明书”
  3) 同时存在“公开发行”与“向特定对象发行”时，尽量选择“公开发行”（权重偏好）
  4) 导出第 7 列列名为“链接”
"""

import os
import re
import time
import requests
import openpyxl
import pandas as pd
from typing import List, Dict, Tuple
from datetime import date

# ========= Tunable parameters =========
# Disclosure-date window; the two values are joined as "begin~end" into SEDATE.
DATE_BEGIN = "2000-01-01"
DATE_END   = date.today().strftime("%Y-%m-%d")

# Output folder (created when this cell runs, if missing) and the input mapping workbook.
output_dir   = os.path.expanduser("~/Desktop/cninfo_output")
os.makedirs(output_dir, exist_ok=True)
mapping_file = os.path.expanduser("~/Desktop/正股映射结果.xlsx")

# Path of the result workbook and the title given to its worksheet.
OUT_XLSX = os.path.join(output_dir, "募集说明书链接_选取公司【募集】.xlsx")
SHEET    = "募集说明书"

# ========= Filter keywords =========
# Substrings that mark a title as a convertible-bond document (used by is_convertible).
CONV_POS_KWS = ["可转换公司债券", "可转债", "可转换债券", "可转债券"]

# A title containing any of these is always dropped (used by is_summary_or_bad).
HARD_EXCLUDE = [
    "英文","英文版","Annual","annual","Summary","summary",
    "摘要","摘要版","概要","概览",
    "附录","备查文件","取消","已取消",
    "提示性公告","更名提示",
    "确认意见","审核意见","核准情况","监事","全体监事",
    "控股股东","实际控制人","一致行动人",
    "GDR","全球存托凭证","瑞士","价格区间","批准的公告",
    "公开增发","增发",
]

# A title containing any of these is dropped as an equity offering (used by is_pure_equity).
EQUITY_EXCLUDE = ["向特定对象发行股票","以简易程序向特定对象发行股票","发行A股股票","非公开发行股票"]
# Dropped unless the title also carries a convertible-bond marker (used by is_non_convertible_bond).
NON_CONV_BOND_EXCLUDE = ["优先股","公司债券","绿色公司债","科技创新公司债","扶贫专项","乡村振兴","可续期公司债"]

# ========= 工具函数 =========
_SPACE_RE = re.compile(r"[\s\u00A0\u3000]+")


def norm(s: str) -> str:
    """Return *s* as a string with all whitespace removed ("" for falsy input)."""
    text = str(s) if s else ""
    return _SPACE_RE.sub("", text.strip())


def contains_any(text: str, kws: List[str]) -> bool:
    """Return True when any truthy keyword, after ``norm``, occurs in the normalised text."""
    haystack = norm(text)
    for kw in kws:
        if not kw:
            continue
        if norm(kw) in haystack:
            return True
    return False

def extract_year_from_title(title: str) -> str:
    """Return the first 19xx/20xx digit run in the whitespace-stripped title, or ""."""
    hit = re.search(r"(19\d{2}|20\d{2})", norm(title))
    if hit is None:
        return ""
    return hit.group(1)

def year_from_timestamp_ms(ts: int) -> str:
    """Return the four-digit year of a millisecond epoch timestamp.

    Returns "" when *ts* is missing, non-numeric, non-positive or out of range.
    """
    # Local import: the file only imports `date` from datetime.
    from datetime import datetime, timedelta, timezone

    try:
        millis = int(ts)
    except (TypeError, ValueError, OverflowError):
        return ""
    if millis <= 0:
        # A missing timestamp defaults to 0 upstream; do not report "1970".
        return ""
    try:
        # Fixed UTC+8 so the result does not depend on the host timezone.
        # NOTE(review): assumes cninfo timestamps denote China Standard Time - confirm.
        cst = timezone(timedelta(hours=8))
        return datetime.fromtimestamp(millis / 1000, tz=cst).strftime("%Y")
    except (OverflowError, OSError, ValueError):
        return ""

def is_summary_or_bad(title: str) -> bool:
    """True when the title contains any HARD_EXCLUDE keyword."""
    blocked = contains_any(title, HARD_EXCLUDE)
    return blocked


def is_pure_equity(title: str) -> bool:
    """True when the title contains any EQUITY_EXCLUDE keyword."""
    equity_hit = contains_any(title, EQUITY_EXCLUDE)
    return equity_hit


def is_non_convertible_bond(title: str) -> bool:
    """True when the title hits NON_CONV_BOND_EXCLUDE and has no convertible-bond marker."""
    cleaned = norm(title)
    has_marker = any(
        marker in cleaned for marker in ("可转换公司债券", "可转债", "可转换债券")
    )
    if has_marker:
        return False
    return contains_any(cleaned, NON_CONV_BOND_EXCLUDE)


def is_convertible(title: str) -> bool:
    """True when the normalised title contains any CONV_POS_KWS keyword."""
    cleaned = norm(title)
    return contains_any(cleaned, CONV_POS_KWS)

def ann_weight(it: Dict) -> Tuple[int, int, int, int]:
    """Build the sort key used to choose one announcement (larger is better).

    Returns ``(score, timestamp, title_length, is_public)`` where the score is
    the sum of:
      - summary/overview wording ("摘要"/"概要"/"概览"): -5
      - update/revision wording: +10
      - public offering ("公开发行" or "向不特定对象发行"): +20
      - offering to specific investors ("向特定对象发行"): -15
      - convertible-bond title (is_convertible): +5
    ``timestamp`` is announcementTime as int (0 when unusable) and
    ``is_public`` is 1 for a public offering, else 0.
    """
    title = norm(it.get("announcementTitle","") or "")
    # "非公开发行" (non-public) contains "公开发行" as a substring; mask it so
    # a private placement is not given the public-offering bonus.
    public_view = title.replace("非公开发行", "")
    is_public = ("公开发行" in public_view) or ("向不特定对象发行" in title)

    w = 0
    if ("摘要" in title) or ("概要" in title) or ("概览" in title):
        w -= 5
    if contains_any(title, ["更新","修订","更正","修正","修订稿","（修订稿）"]):
        w += 10
    if is_public:
        w += 20
    if "向特定对象发行" in title:
        w -= 15
    if is_convertible(title):
        w += 5
    try:
        ts = int(it.get("announcementTime", 0))
    except Exception:
        ts = 0
    return (w, ts, len(title), 1 if is_public else 0)

# ========= 交易所 & 板块 =========
def parse_exchange(code_with_suf: str) -> str:
    """Return the cninfo ``column``: Shanghai "sse", Shenzhen "szse", Beijing "neeq".

    A ``.SH`` / ``.SZ`` / ``.BJ`` suffix decides directly. Without a suffix
    the leading digits are used; codes that match no known range fall back
    to "sse".
    """
    s = (code_with_suf or "").strip().upper()
    for suffix, column in ((".SH", "sse"), (".SZ", "szse"), (".BJ", "neeq")):
        if s.endswith(suffix):
            return column

    base = re.sub(r"\D", "", s)
    if base.startswith(("600", "601", "603", "605", "688", "689")):
        return "sse"
    if base.startswith(("000", "001", "002", "003", "300", "301")):
        return "szse"
    # Beijing Stock Exchange code ranges; previously these fell through to "sse".
    if base.startswith(("43", "83", "87", "920")):
        return "neeq"
    return "sse"

# Plate values queried, in order, for each `column` returned by parse_exchange.
EXCHANGE_PLATES = {
    "sse":  ["sh", "shmb", "shkcp"],   # Shanghai: whole market / main board / STAR board
    "szse": ["sz", "szmb", "szcy"],    # Shenzhen: whole market / main board / ChiNext
    "neeq": ["bj"],                    # Beijing Stock Exchange
}

# ========= API =========
# Headers sent with every cninfo POST in this cell: a form-encoded
# XMLHttpRequest with the site's search page as Origin/Referer.
HEADERS = {
    "Accept": "*/*",
    "Accept-Language": "zh-CN,zh;q=0.9",
    "Content-Type": "application/x-www-form-urlencoded; charset=UTF-8",
    "Origin": "https://www.cninfo.com.cn",
    "Referer": "https://www.cninfo.com.cn/new/commonUrl/pageOfSearch?url=disclosure/list/search",
    "User-Agent": "Mozilla/5.0",
    "X-Requested-With": "XMLHttpRequest",
}
# Date-range string ("begin~end") sent as the seDate field of every query.
SEDATE = f"{DATE_BEGIN}~{DATE_END}"

def query_once(page_num: int, column: str, plate: str, searchkey: str) -> requests.Response:
    """POST one page of the hisAnnouncement query for a keyword within one plate.

    The date range always comes from the module-level SEDATE.
    """
    endpoint = "https://www.cninfo.com.cn/new/hisAnnouncement/query"
    form = dict(
        pageNum=page_num, pageSize=30,
        column=column, tabName="fulltext",
        category="",
        plate=plate,          # restrict to one plate
        searchkey=searchkey,  # title keyword: company name or code + 募集说明书
        secid="", stock="", trade="",
        seDate=SEDATE, sortName="code", sortType="asc",
        isHLtitle="false",
    )
    return requests.post(endpoint, data=form, headers=HEADERS, timeout=15)

def pull_for_plate(sec: str, column: str, plate: str, key: str) -> List[Dict]:
    """Fetch one plate with one search key and keep only *sec*'s announcements.

    Returns [] when the first request fails. Later pages that fail are
    skipped. Kept records pass the title filters and are de-duplicated by
    adjunctUrl.
    """
    out, seen = [], set()
    try:
        first = query_once(1, column, plate, key)
        first.raise_for_status()
        first_payload = first.json() or {}
        # totalpages may be 0 while page 1 still has rows: always read one page.
        total = max(int(first_payload.get("totalpages", 0) or 0), 1)
    except Exception:
        return out

    for page in range(1, total + 1):
        try:
            if page == 1:
                # Reuse the probe response instead of requesting page 1 again.
                payload = first_payload
            else:
                r = query_once(page, column, plate, key)
                r.raise_for_status()
                payload = r.json() or {}
            for x in (payload.get("announcements", []) or []):
                if str(x.get("secCode", "")) != sec:
                    continue
                t = norm(x.get("announcementTitle", "") or "")
                if is_summary_or_bad(t) or is_pure_equity(t) or is_non_convertible_bond(t):
                    continue
                k = (sec, str(x.get("adjunctUrl", "")))
                if k in seen:
                    continue
                seen.add(k)
                out.append(x)
        except Exception:
            # Best-effort: a failed page is skipped after a short pause.
            time.sleep(0.3)
    return out

# ========= 单公司检索（先名称后代码；遍历该所对应的板块）=========
def fetch_for_company(sec: str, comp_name: str, column: str) -> List[Dict]:
    """Search one company's prospectus announcements, by name first and by code as fallback.

    Every plate listed for ``column`` in ``EXCHANGE_PLATES`` is queried. The
    code-based search runs only when the name-based search found nothing (or
    when no company name was given).

    Args:
        sec: Security code without suffix.
        comp_name: Company short name; may be empty.
        column: Exchange column used to look up the plates and sent with each query.

    Returns:
        The concatenated hits from ``pull_for_plate`` over all plates.
    """
    plates = EXCHANGE_PLATES.get(column, [])

    def _sweep(search_key: str) -> List[Dict]:
        # Run the same search text against each plate and pool the hits.
        pooled: List[Dict] = []
        for plate in plates:
            pooled.extend(pull_for_plate(sec, column, plate, search_key))
        return pooled

    # Round 1: company name + prospectus keyword.
    by_name = _sweep(f"{comp_name} 募集说明书") if comp_name else []
    if by_name:
        return by_name

    # Round 2 (fallback): security code + prospectus keyword.
    return _sweep(f"{sec} 募集说明书")

# ========= 汇总 & 去重取优 =========
def collect_all_offering() -> List[Dict]:
    """Walk the mapping workbook and keep one highest-weighted announcement per company.

    Reads ``mapping_file``, searches each row's company through
    ``fetch_for_company`` and ranks the candidates with ``ann_weight``
    (defined elsewhere; per the original note it is meant to favour
    public-offering prospectuses).

    Returns:
        One announcement dict per security code that produced any candidate.
    """
    mapping = pd.read_excel(mapping_file, dtype=str).fillna("")
    # Security code with the exchange suffix (the part after ".") dropped.
    mapping["sec_no_suf"] = mapping["正股代码"].str.split(".").str[0]
    best_by_sec: Dict[str, Dict] = {}

    for _, record in mapping.iterrows():
        full_code = str(record.get("正股代码", "") or "")
        sec = str(record.get("sec_no_suf", "") or "")
        comp_name = str(record.get("正股名称", "") or "")
        if not (sec or comp_name):
            continue

        # Original note: parse_exchange yields sse / szse / neeq.
        column = parse_exchange(full_code)
        candidates = fetch_for_company(sec, comp_name, column)
        if not candidates:
            continue

        top = max(candidates, key=ann_weight)
        incumbent = best_by_sec.get(sec)
        # Keep a single entry per company; replace it only on a strictly higher weight.
        if incumbent is None or ann_weight(top) > ann_weight(incumbent):
            best_by_sec[sec] = top

    return list(best_by_sec.values())

# ========= 写 Excel =========
def write_selected_excel(anns: List[Dict]) -> None:
    """Write one row per mapping entry to ``OUT_XLSX``, filled from the selected announcements.

    Rows follow the order of ``mapping_file``. A row whose security code has
    no announcement in ``anns`` gets blank title, year and link cells.

    Compared with the earlier version, a null ``announcementTitle`` no longer
    crashes ``re.sub``, and a null or empty ``adjunctUrl`` produces a blank
    link cell instead of a URL that points at nothing.

    Args:
        anns: Announcement dicts, looked up here by their ``secCode``.
    """
    df_map = pd.read_excel(mapping_file, dtype=str).fillna("")
    # Security code with the exchange suffix (the part after ".") dropped.
    df_map["sec_no_suf"] = df_map["正股代码"].str.split(".").str[0]
    ann_dict: Dict[str, Dict] = {str(it.get("secCode", "")): it for it in anns}

    wb = openpyxl.Workbook()
    ws = wb.active
    ws.title = SHEET
    ws.append(["可转债代码", "可转债名称", "公司代码", "公司简称", "标题", "年份", "链接"])

    for _, row in df_map.iterrows():
        bond_code = row.get("代码", "")
        bond_name = row.get("名称", "")
        sec = row.get("sec_no_suf", "")
        comp_name = row.get("正股名称", "")

        it = ann_dict.get(sec)
        if not it:
            ws.append([bond_code, bond_name, sec, comp_name, "", "", ""])
            continue

        # Strip markup tags and full-width colons from the title. `or ""`
        # guards against a null title, matching pull_for_plate.
        raw_title = re.sub(r"<.*?>", "", str(it.get("announcementTitle") or "")).replace("：", "")
        title = f"《{raw_title}》"
        # Prefer a year found in the title; otherwise use the announcement timestamp.
        yr = extract_year_from_title(raw_title) or year_from_timestamp_ms(it.get("announcementTime", 0))
        adjunct = str(it.get("adjunctUrl") or "")
        # Without an attachment path there is nothing to link to.
        url = f"https://static.cninfo.com.cn/{adjunct}" if adjunct else ""
        ws.append([bond_code, bond_name, sec, comp_name, title, yr, url])

    wb.save(OUT_XLSX)
    print(f"✅ 已输出：{OUT_XLSX}（工作表：{SHEET}）")

# ========= 主流程 =========
def main():
    """Collect the selected prospectus announcement per company and export the workbook.

    The mapping workbook is read inside ``collect_all_offering`` and
    ``write_selected_excel``; the extra read that only built an unused
    ``valid_secs`` set has been removed.
    """
    # collect_all_offering already keeps a single best announcement per
    # company, so no further selection pass is done here.
    raw_best = collect_all_offering()
    write_selected_excel(raw_best)
    print("---- 可转债募集说明书（逐公司·按所/板块·名称优先、公开发行优先）完成 ----")

# Run the pipeline only when this file is executed as a script.
if __name__ == "__main__":
    main()


✅ 已输出：/Users/sam/Desktop/cninfo_output/募集说明书链接_选取公司【募集】.xlsx（工作表：募集说明书）
---- 可转债募集说明书（逐公司·按所/板块·名称优先、公开发行优先）完成 ----
