In [1]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

"""
02_collect_offering_links.py
功能：按《正股映射结果.xlsx》的公司集合，从巨潮接口检索“募集说明书”公告，
     并导出 Excel（格式与年报版一致，只是标题/年份/链接改为募集说明书对应内容）。
"""

import os
import re
import time
import requests
import openpyxl
import pandas as pd
from typing import List, Dict, Tuple
from datetime import date

# ========= Tunable parameters =========
# Title keywords for the prospectus search. They are used both as the search
# key sent to the API (collect_all_offering) and as the required substring
# when filtering titles (is_target_offering).
KEYWORDS = ["募集说明书"]
# Optional extra keywords, treated exactly like KEYWORDS. Note that any title
# containing a word from EXCLUDE_KWS is still rejected by is_target_offering.
EXTRA_KWS: List[str] = []

# Date range (inclusive start / end, "YYYY-MM-DD"); collect_all_offering splits
# it into one query segment per calendar year.
DATE_BEGIN = "2000-02-01"
DATE_END   = date.today().strftime("%Y-%m-%d")

# Output directory (created at import time if missing) and the input mapping
# workbook that lists the companies of interest.
output_dir   = os.path.expanduser("~/Desktop/cninfo_output")
os.makedirs(output_dir, exist_ok=True)
mapping_file = os.path.expanduser("~/Desktop/正股映射结果.xlsx")

# Output workbook path and worksheet name
OUT_XLSX = os.path.join(output_dir, "募集说明书链接_选取公司【募集】.xlsx")
SHEET    = "募集说明书"

# Markets / plates: every (market, plate) pair is queried. MARKETS values are
# sent as the "column" form field and PLATES values as the "plate" field.
# NOTE(review): the empty plate presumably means "no plate filter" — confirm.
MARKETS = ["szse", "shse"]
PLATES  = ["", "sz", "szmb", "szcy", "sh", "shmb", "shkcp", "bj"]

# Noise words: a title containing any of these is discarded
EXCLUDE_KWS = [
    "英文", "英文版", "Annual", "annual", "Summary", "summary",
    "摘要", "摘要版", "取消", "已取消", "提示性公告", "更名提示", "B股", "H股"
]

# ========= 工具函数 =========
# Runs of whitespace, including the no-break space and the ideographic space.
_SPACE_RE = re.compile(r"[\s\u00A0\u3000]+")
def norm(s: str) -> str:
    """Return *s* as a string with every whitespace character removed.

    Any falsy input (None, "", 0, ...) normalises to the empty string.
    """
    if not s:
        return ""
    trimmed = str(s).strip()
    return _SPACE_RE.sub("", trimmed)

def any_kw_in(text: str, kws: List[str]) -> bool:
    """Return True if any non-empty keyword occurs in *text*.

    Both the text and each keyword are whitespace-normalised with ``norm``
    before the substring test; falsy keywords are ignored.
    """
    haystack = norm(text)
    for kw in kws:
        if kw and norm(kw) in haystack:
            return True
    return False

# Year patterns, tried in order; the first pattern that matches wins.
_TITLE_YEAR_PATS = [
    # A 20xx year directly followed by the prospectus phrase, with optional
    # year / annual suffix characters in between.
    re.compile(r"(20\d{2})\s*年?\s*度?\s*募\s*集\s*说\s*明\s*书"),
    # Same as above without the annual suffix; every title this matches is
    # already matched by the first pattern, so it never contributes a hit.
    re.compile(r"(20\d{2})\s*年?\s*募\s*集\s*说\s*明\s*书"),
    # Fallback: the first 19xx / 20xx digit run anywhere in the title.
    re.compile(r"(19\d{2}|20\d{2})")
]
def extract_year_from_title(title: str) -> str:
    """Return the four-digit year found in *title*, or "" when none is found.

    The title is whitespace-normalised first, then each pattern in
    ``_TITLE_YEAR_PATS`` is searched in turn.
    """
    cleaned = norm(title)
    # Lazy generator: later patterns are only searched if earlier ones miss.
    hits = (pat.search(cleaned) for pat in _TITLE_YEAR_PATS)
    first_hit = next((m for m in hits if m), None)
    return first_hit.group(1) if first_hit else ""

def year_from_timestamp_ms(ts: int) -> str:
    """Return the calendar year ("YYYY") of an epoch-milliseconds timestamp.

    Args:
        ts: Milliseconds since the Unix epoch (int, or anything ``int()``
            accepts, e.g. a numeric string).

    Returns:
        The four-digit year as a string, or "" when *ts* is missing, not
        numeric, non-positive, or out of the platform's time range.
    """
    # Fixed UTC+8 offset so the result does not depend on the machine's
    # local timezone (the original used time.localtime).
    # NOTE(review): assumes the API's timestamps denote Beijing time — confirm.
    utc_plus_8_seconds = 8 * 3600
    try:
        seconds = int(ts) / 1000
    except (TypeError, ValueError, OverflowError):
        return ""
    # Callers default a missing timestamp to 0; treat that as "no year"
    # instead of reporting the epoch year.
    if seconds <= 0:
        return ""
    try:
        return time.strftime("%Y", time.gmtime(seconds + utc_plus_8_seconds))
    except (OverflowError, OSError, ValueError):
        return ""

def is_target_offering(title: str) -> bool:
    """Decide whether *title* is a prospectus announcement worth keeping.

    A title qualifies when it contains at least one keyword from KEYWORDS or
    EXTRA_KWS and none of the noise words listed in EXCLUDE_KWS.
    """
    cleaned = norm(title)
    wanted = [*KEYWORDS, *EXTRA_KWS]
    return any_kw_in(cleaned, wanted) and not any_kw_in(cleaned, EXCLUDE_KWS)

def ann_weight(it: Dict) -> Tuple[int, int]:
    """Return a sort key ranking an announcement against its rivals.

    The first element is 10 when the title mentions an update / revision /
    correction keyword and 0 otherwise; the second is the announcement time
    as an int (0 when it is missing or not convertible). Larger tuples win.
    """
    is_revision = any_kw_in(
        norm(it.get("announcementTitle", "")), ["更新", "修订", "更正"]
    )
    try:
        stamp = int(it.get("announcementTime", 0))
    except Exception:
        stamp = 0
    return (10 if is_revision else 0, stamp)

# ========= 调接口 =========
def get_report(page_num: int, date_range: str, column: str, plate: str, searchkey: str="") -> requests.Response:
    """POST one page of a full-text announcement search to cninfo.

    Args:
        page_num: Page number sent as ``pageNum``.
        date_range: Value sent as ``seDate`` (callers pass "start~end").
        column: Value sent as ``column`` (callers pass a MARKETS entry).
        plate: Value sent as ``plate`` (callers pass a PLATES entry).
        searchkey: Search keyword; empty by default.

    Returns:
        The raw ``requests.Response``. The HTTP status is not checked here.
    """
    endpoint = "http://www.cninfo.com.cn/new/hisAnnouncement/query"
    # Form fields, in the same order as before; page size is fixed at 30 and
    # results are sorted by code, ascending.
    form = dict(
        pageNum=page_num,
        pageSize=30,
        column=column,
        tabName="fulltext",
        category="",
        plate=plate,
        searchkey=searchkey,
        secid="",
        trade="",
        seDate=date_range,
        sortName="code",
        sortType="asc",
        isHLtitle="false",
    )
    # Header set sent with every request (AJAX-style form post).
    request_headers = {
        "Accept":           "*/*",
        "Accept-Language":  "zh-CN,zh;q=0.9",
        "Content-Type":     "application/x-www-form-urlencoded; charset=UTF-8",
        "Origin":           "http://www.cninfo.com.cn",
        "Referer":          "http://www.cninfo.com.cn/new/commonUrl/pageOfSearch?url=disclosure/list/search",
        "User-Agent":       "Mozilla/5.0",
        "X-Requested-With": "XMLHttpRequest"
    }
    response = requests.post(endpoint, data=form, headers=request_headers, timeout=15)
    return response

def _fetch_page_json(page_num: int, date_range: str, column: str, plate: str,
                     searchkey: str, attempts: int = 3, pause: float = 1.2):
    """Fetch one result page and return its decoded JSON object.

    Retries up to *attempts* times, sleeping *pause* seconds between tries.
    Returns the decoded dict, or None when every attempt failed (transport
    error, HTTP error status, invalid JSON, or a non-object JSON payload).
    """
    for attempt in range(1, attempts + 1):
        try:
            resp = get_report(page_num, date_range, column, plate, searchkey)
            resp.raise_for_status()
            data = resp.json()
        except (requests.RequestException, ValueError):
            # ValueError covers JSON decoding failures.
            data = None
        if isinstance(data, dict):
            return data
        if attempt < attempts:
            time.sleep(pause)
    return None

def download_for_segments(segments: List[str], column: str, plate: str, searchkey: str) -> List[Dict]:
    """Download all matching announcements for the given date segments.

    Args:
        segments: Date ranges ("start~end"), each queried independently.
        column: Market identifier forwarded to ``get_report``.
        plate: Plate identifier forwarded to ``get_report``.
        searchkey: Search keyword forwarded to ``get_report``.

    Returns:
        Announcement dicts whose title passes ``is_target_offering``,
        de-duplicated on (secCode, adjunctUrl) within this call. Segments or
        pages that still fail after retries are skipped with a printed
        warning (best effort; nothing is raised).
    """
    all_ann: List[Dict] = []
    seen = set()
    for date_range in segments:
        # The first page doubles as the probe for the page count, so it is
        # fetched (with retries) exactly once.
        first = _fetch_page_json(1, date_range, column, plate, searchkey)
        if first is None:
            print(f"⚠️ 请求失败，跳过区间：{date_range}（column={column}, plate={plate!r}）")
            continue
        try:
            # "totalpages" may be absent or null; both count as zero pages.
            total = int(first.get("totalpages") or 0)
        except (TypeError, ValueError):
            total = 0
        # NOTE(review): the reported page count is trusted as-is — confirm
        # against the API that a final partial page is included in it.
        for page in range(1, total + 1):
            data = first if page == 1 else _fetch_page_json(page, date_range, column, plate, searchkey)
            if data is None:
                print(f"⚠️ 请求失败，跳过第 {page} 页：{date_range}（column={column}, plate={plate!r}）")
                continue
            for x in data.get("announcements") or []:
                if not isinstance(x, dict):
                    continue
                if not is_target_offering(x.get("announcementTitle", "")):
                    continue
                key = (str(x.get("secCode", "")), str(x.get("adjunctUrl", "")))
                if key in seen:
                    continue
                seen.add(key)
                all_ann.append(x)
    return all_ann

def collect_all_offering() -> List[Dict]:
    """Query every (market, plate, keyword) combination over the date range.

    DATE_BEGIN..DATE_END is split into one segment per calendar year; the
    first and last segments are clipped to the configured bounds.

    Returns:
        Announcement dicts in first-seen order, de-duplicated on
        (secCode, adjunctUrl) across all combinations.
    """
    y_begin = int(DATE_BEGIN[:4])
    y_end   = int(DATE_END[:4])
    segments: List[str] = []
    for y in range(y_begin, y_end + 1):
        seg_start = DATE_BEGIN if y == y_begin else f"{y}-01-01"
        seg_end   = DATE_END if y == y_end else f"{y}-12-31"
        segments.append(f"{seg_start}~{seg_end}")

    # dict.fromkeys removes duplicates while keeping the configured order, so
    # the request sequence is the same on every run (set iteration is not).
    # Keywords that normalise to "" are dropped: an empty search key would
    # not restrict the query at all.
    normalised = (norm(k) for k in (KEYWORDS + EXTRA_KWS) if k)
    search_terms = [term for term in dict.fromkeys(normalised) if term]
    if not search_terms:
        search_terms = ["募集说明书"]

    raw: List[Dict] = []
    # The same announcement can come back from more than one combination;
    # keep only its first occurrence.
    seen = set()
    for col in MARKETS:
        for pl in PLATES:
            for kw in search_terms:
                for ann in download_for_segments(segments, col, pl, kw):
                    key = (str(ann.get("secCode", "")), str(ann.get("adjunctUrl", "")))
                    if key in seen:
                        continue
                    seen.add(key)
                    raw.append(ann)
    return raw

# ========= 去重 =========
def filter_latest_per_company(anns: List[Dict], valid_secs: set) -> List[Dict]:
    """Keep the best announcement for each (company code, year) pair.

    Announcements whose ``secCode`` is not in *valid_secs*, or for which no
    year can be derived (title first, then timestamp), are dropped. Among
    rivals for the same pair, the one with the larger ``ann_weight`` wins;
    on a tie the earlier one is kept.
    """
    best: Dict[Tuple[str, str], Dict] = {}
    for ann in anns:
        code = str(ann.get("secCode", ""))
        if code not in valid_secs:
            continue

        year = extract_year_from_title(ann.get("announcementTitle", ""))
        if not year:
            year = year_from_timestamp_ms(ann.get("announcementTime", 0))
        if not year:
            continue

        slot = (code, year)
        incumbent = best.get(slot)
        if incumbent is None or ann_weight(ann) > ann_weight(incumbent):
            best[slot] = ann
    return list(best.values())

# ========= 写 Excel =========
def write_selected_excel(anns: List[Dict]) -> None:
    """Write one row per (mapping row, matched announcement) to OUT_XLSX.

    The mapping workbook is read again here. Each mapping row is joined to
    the announcements on the stock code with its exchange suffix removed;
    matches are written in ascending year order. A mapping row without any
    match still gets a single row with empty title / year / link cells.

    Args:
        anns: Announcement dicts (already filtered and de-duplicated).
    """
    df_map = pd.read_excel(mapping_file, dtype=str).fillna("")
    df_map["sec_no_suf"] = df_map["正股代码"].str.split(".").str[0]

    # Index announcements by company code once, so each mapping row is a
    # dict lookup instead of a scan over every announcement.
    by_sec: Dict[str, Dict[str, Dict]] = {}
    for it in anns:
        sec = str(it.get("secCode", ""))
        yr  = extract_year_from_title(it.get("announcementTitle", "")) or year_from_timestamp_ms(it.get("announcementTime", 0))
        if sec and yr:
            # A later announcement for the same (code, year) replaces an earlier one.
            by_sec.setdefault(sec, {})[yr] = it

    wb = openpyxl.Workbook()
    ws = wb.active
    ws.title = SHEET
    # Header kept identical to the annual-report export (same file format),
    # hence the last column's name.
    ws.append(["可转债代码","可转债名称","公司代码","公司简称","标题","年份","年报链接"])

    for _, row in df_map.iterrows():
        bond_code = row.get("代码", "")
        bond_name = row.get("名称", "")
        sec       = row.get("sec_no_suf", "")
        comp_name = row.get("正股名称", "")

        per_year = by_sec.get(sec)
        if not per_year:
            ws.append([bond_code, bond_name, sec, comp_name, "", "", ""])
            continue

        for yr, it in sorted(per_year.items(), key=lambda kv: kv[0]):
            # Strip markup tags and full-width colons; a None title or URL
            # is treated as empty rather than raising.
            raw_title = re.sub(r"<.*?>", "", str(it.get("announcementTitle") or "")).replace("：", "")
            title = f"《{raw_title}》"
            url   = f"http://static.cninfo.com.cn/{it.get('adjunctUrl') or ''}"
            ws.append([bond_code, bond_name, sec, comp_name, title, yr, url])

    wb.save(OUT_XLSX)
    print(f"✅ 已输出：{OUT_XLSX}（工作表：{SHEET}）")

# ========= 主流程 =========
def main():
    """Run the pipeline: load the mapping, crawl, de-duplicate, write Excel."""
    mapping = pd.read_excel(mapping_file, dtype=str).fillna("")
    # Stock codes with the exchange suffix (the part after ".") removed.
    bare_codes = mapping["正股代码"].str.split(".").str[0]
    valid_secs = set(bare_codes.tolist())

    collected = collect_all_offering()
    write_selected_excel(filter_latest_per_company(collected, valid_secs))
    print("---- 募集说明书检索完成 ----")

# Run the pipeline only when executed as a script, not when imported.
if __name__ == "__main__":
    main()


✅ 已输出：/Users/sam/Desktop/cninfo_output/募集说明书链接_选取公司【募集】.xlsx（工作表：募集说明书）
---- 募集说明书检索完成 ----
