In [2]:

import os
import re
import time
import requests
import openpyxl
import pandas as pd
from typing import List, Dict, Tuple
from datetime import date

# ========= 可调参数 =========
# 关键词：仅用“招股说明书”作服务端检索，以减少请求量
KEYWORDS = ["招股说明书"]
# 可选：额外关键词（留空，避免扩大服务器端召回导致跑得慢）
EXTRA_KWS: List[str] = []

# 时间范围（起止含边界）
DATE_BEGIN = "2000-01-01"
DATE_END   = date.today().strftime("%Y-%m-%d")

# 输出目录 / 输入映射表
output_dir   = os.path.expanduser("~/Desktop/cninfo_output")
os.makedirs(output_dir, exist_ok=True)
mapping_file = os.path.expanduser("~/Desktop/正股映射结果.xlsx")

# 输出文件（列名仍沿用“链接”）
OUT_XLSX = os.path.join(output_dir, "招股说明书链接_选取公司【招股】.xlsx")
SHEET    = "招股说明书"

# 市场/板块保持你的原设定
MARKETS = ["szse", "shse"]
PLATES  = ["", "sz", "szmb", "szcy", "sh", "shmb", "shkcp", "bj"]

# 显式排除的干扰词
EXCLUDE_KWS = [
    # 语言/摘要/附录/公告类
    "英文", "英文版", "Annual", "annual", "Summary", "summary", "摘要", "摘要版",
    "附录", "公告", "之更正公告", "更正公告",
    # 各类“确认意见”
    "确认意见", "监事对招股说明书", "全体监事对招股说明书",
    "控股股东、实际控制人", "实际控制人及一致行动人", "一致行动人对招股说明书", "发行人监事对招股说明书",
    # GDR/瑞交所相关
    "GDR", "瑞士交易所", "瑞士证券交易所", "价格区间确定", "批准的公告",
    # 非IPO：增发
    "公开增发", "增发",
    # 其他不需要
    "取消", "已取消", "提示性公告", "更名提示", "B股", "H股"
]

# ========= 工具函数 =========
_SPACE_RE = re.compile(r"[\s\u00A0\u3000]+")
def norm(s: str) -> str:
    return _SPACE_RE.sub("", str(s or "").strip())

def any_kw_in(text: str, kws: List[str]) -> bool:
    t = norm(text)
    return any(norm(k) in t for k in kws if k)

# 宽松年份提取：优先从标题中抽取，兜底用公告时间
_TITLE_YEAR_PATS = [
    re.compile(r"(20\d{2})\s*年?\s*度?\s*招\s*股\s*说\s*明\s*书"),
    re.compile(r"(20\d{2})\s*年?\s*招\s*股\s*说\s*明\s*书"),
    re.compile(r"(19\d{2}|20\d{2})")  # 兜底：任何四位年份
]

def extract_year_from_title(title: str) -> str:
    t = norm(title)
    for pat in _TITLE_YEAR_PATS:
        m = pat.search(t)
        if m:
            return m.group(1)
    return ""

def year_from_timestamp_ms(ts: int) -> str:
    try:
        return time.strftime("%Y", time.localtime(int(ts)/1000))
    except Exception:
        return ""

# ========= 关键筛选：只保留 IPO 正文版招股书 =========
def is_target_prospectus(title: str) -> bool:
    """
    仅保留【招股说明书（IPO正文）】：
      - 必含“招股说明书”
      - 显式排除：摘要/附录/公告/确认意见/监事/全体监事/控股股东/一致行动人/GDR/瑞士(交易所/证券交易所)/价格区间确定/批准的公告/增发
      - IPO 判断：
          * 若标题含“首次公开发行”→ 明确 IPO，保留
          * 若标题含“增发/公开增发”→ 非IPO，剔除
          * 其余情况（早期标题常仅写“招股说明书”）→ 默认按 IPO 处理
    """
    t = norm(title)
    if "招股说明书" not in t:
        return False
    if any_kw_in(t, EXCLUDE_KWS):
        return False
    # 出现“增发”则一律排除
    if ("增发" in t) or ("公开增发" in t):
        return False
    # 明确写“首次公开发行”的直接保留；否则默认也按IPO处理（兼容老标题）
    return True

# ========= 权重：修订/更正优先，时间近优先 =========
def ann_weight(it: Dict) -> Tuple[int, int]:
    """
    对同公司多条正文：修正/更正/更新/修订 加分；同级按时间近优先
    """
    title = norm(it.get("announcementTitle", ""))
    w = 100
    if ("修正稿" in title) or ("更正" in title) or ("更新" in title) or ("修订" in title):
        w += 10
    try:
        ts = int(it.get("announcementTime", 0))
    except Exception:
        ts = 0
    return (w, ts)

# ========= 调接口 =========
def get_report(page_num: int, date_range: str, column: str, plate: str, searchkey: str="") -> requests.Response:
    url = "http://www.cninfo.com.cn/new/hisAnnouncement/query"
    payload = {
        "pageNum":   page_num,
        "pageSize":  30,
        "column":    column,       # szse/shse
        "tabName":   "fulltext",
        "category":  "",           # 置空：用 searchkey
        "plate":     plate,
        "searchkey": searchkey,    # 招股说明书
        "secid":     "",
        "trade":     "",
        "seDate":    date_range,   # 2000-01-01~YYYY-MM-DD
        "sortName":  "code",
        "sortType":  "asc",
        "isHLtitle": "false"
    }
    headers = {
        "Accept":           "*/*",
        "Accept-Language":  "zh-CN,zh;q=0.9",
        "Content-Type":     "application/x-www-form-urlencoded; charset=UTF-8",
        "Origin":           "http://www.cninfo.com.cn",
        "Referer":          "http://www.cninfo.com.cn/new/commonUrl/pageOfSearch?url=disclosure/list/search",
        "User-Agent":       "Mozilla/5.0",
        "X-Requested-With": "XMLHttpRequest"
    }
    return requests.post(url, data=payload, headers=headers, timeout=15)

def download_for_segments(segments: List[str], column: str, plate: str, searchkey: str) -> List[Dict]:
    """
    分段分页拉取；在本地做“正文IPO招股书”过滤 + 去重（secCode, adjunctUrl）
    """
    all_ann: List[Dict] = []
    seen = set()
    for date_range in segments:
        try:
            total = get_report(1, date_range, column, plate, searchkey).json().get("totalpages", 0)
        except Exception:
            continue
        if total <= 0:
            continue

        page = 1
        while page <= total:
            for _ in range(3):  # 重试三次
                try:
                    resp = get_report(page, date_range, column, plate, searchkey)
                    resp.raise_for_status()
                    anns = resp.json().get("announcements", []) or []
                    for x in anns:
                        title = x.get("announcementTitle", "") or ""
                        if not is_target_prospectus(title):
                            continue
                        key = (str(x.get("secCode", "")), str(x.get("adjunctUrl", "")))
                        if key in seen:
                            continue
                        seen.add(key)
                        all_ann.append(x)
                    break
                except Exception:
                    time.sleep(1.0)
            page += 1
    return all_ann

def collect_all_prospectus() -> List[Dict]:
    """
    将 2000-01-01 ~ 今天 按年切片，关键词仅用“招股说明书”，避免多关键词造成请求量暴涨
    """
    y_begin = int(DATE_BEGIN[:4])
    y_end   = int(DATE_END[:4])
    segments: List[str] = []
    for y in range(y_begin, y_end + 1):
        seg_start = f"{y}-01-01" if y > y_begin else DATE_BEGIN
        seg_end   = f"{y}-12-31" if y < y_end   else DATE_END
        segments.append(f"{seg_start}~{seg_end}")

    raw: List[Dict] = []
    for col in MARKETS:
        for pl in PLATES:
            raw.extend(download_for_segments(segments, col, pl, "招股说明书"))
    return raw

# ========= 选公司 & 去重取优（公司维度，仅一条） =========
def filter_latest_per_company(anns: List[Dict], valid_secs: set) -> List[Dict]:
    """
    仅保留映射公司；每家公司只取一条：按（修订优先 + 时间近）择最新 IPO 正文招股书
    """
    best: Dict[str, Dict] = {}
    for it in anns:
        sec = str(it.get("secCode", ""))
        if sec not in valid_secs:
            continue
        if (sec not in best) or (ann_weight(it) > ann_weight(best[sec])):
            best[sec] = it
    return list(best.values())

# ========= 写 Excel：每家公司一行 =========
def write_selected_excel(anns: List[Dict]) -> None:
    """
    列顺序与“年报版”一致：可转债代码、可转债名称、公司代码、公司简称、标题、年份、链接
    每家公司仅输出 1 行（最新版 IPO 招股说明书正文）；无命中则空行
    """
    df_map = pd.read_excel(mapping_file, dtype=str).fillna("")
    df_map["sec_no_suf"] = df_map["正股代码"].str.split(".").str[0]

    # (secCode) -> 记录
    ann_by_sec: Dict[str, Dict] = {}
    for it in anns:
        sec = str(it.get("secCode", ""))
        if sec and (sec not in ann_by_sec or ann_weight(it) > ann_weight(ann_by_sec[sec])):
            ann_by_sec[sec] = it

    wb = openpyxl.Workbook()
    ws = wb.active
    ws.title = SHEET
    ws.append(["可转债代码","可转债名称","公司代码","公司简称","标题","年份","链接"])

    for _, row in df_map.iterrows():
        bond_code = row.get("代码", "")
        bond_name = row.get("名称", "")
        sec       = row.get("sec_no_suf", "")
        comp_name = row.get("正股名称", "")

        it = ann_by_sec.get(sec)
        if it:
            raw_title = re.sub(r"<.*?>","", it.get("announcementTitle","")).replace("：","")
            title = f"《{raw_title}》"
            yr  = extract_year_from_title(raw_title) or year_from_timestamp_ms(it.get("announcementTime", 0))
            url = f"http://static.cninfo.com.cn/{it.get('adjunctUrl','')}"
            ws.append([bond_code, bond_name, sec, comp_name, title, yr, url])
        else:
            ws.append([bond_code, bond_name, sec, comp_name, "", "", ""])

    wb.save(OUT_XLSX)
    print(f"✅ 已输出：{OUT_XLSX}（工作表：{SHEET}）")

# ========= 主流程 =========
def main():
    # 1) 公司集合
    df_map = pd.read_excel(mapping_file, dtype=str).fillna("")
    valid_secs = set(df_map["正股代码"].str.split(".").str[0].tolist())

    # 2) 拉取 + 粗过滤（仅“招股说明书”关键字）
    raw = collect_all_prospectus()

    # 3) 仅保留映射公司 + 去重取优（公司维度，仅一条）
    picked = filter_latest_per_company(raw, valid_secs)

    # 4) 导出（列顺序与年报版一致）
    write_selected_excel(picked)

    print("---- 招股说明书检索完成 ----")

if __name__ == "__main__":
    main()


✅ 已输出：/Users/sam/Desktop/cninfo_output/招股说明书链接_选取公司【招股】.xlsx（工作表：招股说明书）
---- 招股说明书检索完成 ----


In [3]:


import pandas as pd
import requests
import os
import re
import logging
from datetime import datetime
from concurrent.futures import ThreadPoolExecutor, as_completed

# ========== 日志 ==========
logging.basicConfig(level=logging.INFO, format='[%(asctime)s] %(message)s')

def log(msg: str):
    print(f"[{datetime.now().strftime('%Y-%m-%d %H:%M:%S')}] {msg}")

# ========== 参数 ==========
BASE_DIR    = os.path.expanduser("~/Desktop/cninfo_output")
EXCEL_FILE  = os.path.join(BASE_DIR, "招股说明书链接_选取公司【招股】.xlsx")
SHEET_NAME  = "招股说明书"  # 与采集脚本保持一致
PDF_DIR     = os.path.join(BASE_DIR, "pdf_prospectus_latest")
MAX_WORKERS = 16
os.makedirs(PDF_DIR, exist_ok=True)

# ========== 工具 ==========
def norm_code(x: str) -> str:
    """把类似 600000 / 600000.0 / '600000 ' 统一成 6 位字符串"""
    s = str(x).strip()
    s = s.split('.')[0]
    s = re.sub(r'\D', '', s)
    return s.zfill(6) if s else ""

def is_pdf_response(resp: requests.Response) -> bool:
    ct = (resp.headers.get("Content-Type") or "").lower()
    return ("pdf" in ct) or resp.content.startswith(b"%PDF")

def safe_name(s: str) -> str:
    return re.sub(r'[\\/:*?"<>|]', '', s)

# 直接使用一个 Session 以重用连接，提高下载效率
SESSION = requests.Session()
SESSION.headers.update({
    "User-Agent": "Mozilla/5.0",
    "Referer": "http://www.cninfo.com.cn/",
})

def download_pdf(record):
    code, name, year, title, url = record
    # 文件名：代码_简称_年份（或标题年缺失则留空）
    base = f"{norm_code(code)}_{str(name).strip()}_{str(int(year)) if str(year).isdigit() else str(year).strip()}"
    base = base.strip("_")
    path = os.path.join(PDF_DIR, f"{safe_name(base)}.pdf")

    if os.path.exists(path) and os.path.getsize(path) > 0:
        log(f"Exists, skip: {path}")
        return

    try:
        resp = SESSION.get(url, timeout=25, allow_redirects=True)
        if resp.status_code == 200 and is_pdf_response(resp):
            with open(path, 'wb') as f:
                f.write(resp.content)
            log(f"Downloaded: {path}")
        else:
            log(f"Failed ({resp.status_code}) or not PDF: {url}")
    except Exception as e:
        log(f"Error downloading {url}: {e}")

# ========== 主流程 ==========

def main():
    log(f"Reading Excel: {EXCEL_FILE}")
    if not os.path.exists(EXCEL_FILE):
        log("Excel file not found. 请检查路径和文件名是否为『招股说明书链接_选取公司【招股】.xlsx』。")
        return

    # 读取“招股说明书”sheet
    df = pd.read_excel(EXCEL_FILE, sheet_name=SHEET_NAME, dtype=str)
    df.columns = [str(c).strip() for c in df.columns]

    # 丢弃无链接/空白行（采集脚本对未命中的公司会留空）
    if "链接" not in df.columns:
        log("列名『链接』未找到，请确认采集脚本输出列是否未被修改。")
        return

    df = df.dropna(subset=["链接"])  # 只保留有链接的

    # 有些行年份为空，下载不依赖年份，但文件名里尽量带上
    if "年份" not in df.columns:
        df["年份"] = ""

    records = [
        (
            df.at[i, "公司代码"],
            df.at[i, "公司简称"],
            (str(df.at[i, "年份"]).split('.')[0] if pd.notna(df.at[i, "年份"]) else ""),
            df.at[i, "标题"],
            str(df.at[i, "链接"]).strip(),
        )
        for i in df.index
        if str(df.at[i, "链接"]).strip()
    ]

    log(f"Found {len(records)} prospectus records (latest per company). Start downloading...")

    start = datetime.now()
    with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
        futures = [executor.submit(download_pdf, rec) for rec in records]
        for _ in as_completed(futures):
            pass
    elapsed = (datetime.now() - start).total_seconds()
    log(f"All downloads completed in {elapsed:.2f}s. PDFs saved to: {PDF_DIR}")

if __name__ == '__main__':
    main()


[2025-08-28 18:07:15] Reading Excel: /Users/sam/Desktop/cninfo_output/招股说明书链接_选取公司【招股】.xlsx
[2025-08-28 18:07:15] Found 370 prospectus records (latest per company). Start downloading...
[2025-08-28 18:07:15] Downloaded: /Users/sam/Desktop/cninfo_output/pdf_prospectus_latest/600496_精工钢构_2002.pdf
[2025-08-28 18:07:15] Downloaded: /Users/sam/Desktop/cninfo_output/pdf_prospectus_latest/600370_三房巷_2003.pdf
[2025-08-28 18:07:15] Downloaded: /Users/sam/Desktop/cninfo_output/pdf_prospectus_latest/600531_豫光金铅_2002.pdf
[2025-08-28 18:07:15] Downloaded: /Users/sam/Desktop/cninfo_output/pdf_prospectus_latest/600419_天润乳业_2001.pdf
[2025-08-28 18:07:16] Downloaded: /Users/sam/Desktop/cninfo_output/pdf_prospectus_latest/600481_双良节能_2003.pdf
[2025-08-28 18:07:16] Downloaded: /Users/sam/Desktop/cninfo_output/pdf_prospectus_latest/600438_通威股份_2004.pdf
[2025-08-28 18:07:16] Downloaded: /Users/sam/Desktop/cninfo_output/pdf_prospectus_latest/600577_精达股份_2002.pdf
[2025-08-28 18:07:16] Downloaded: /Users/sam/

[2025-08-28 18:09:03,650] Connection pool is full, discarding connection: static.cninfo.com.cn. Connection pool size: 10


[2025-08-28 18:09:03] Downloaded: /Users/sam/Desktop/cninfo_output/pdf_prospectus_latest/003004_ST声迅_2020.pdf


[2025-08-28 18:09:03,984] Connection pool is full, discarding connection: static.cninfo.com.cn. Connection pool size: 10
[2025-08-28 18:09:04,093] Connection pool is full, discarding connection: static.cninfo.com.cn. Connection pool size: 10
[2025-08-28 18:09:04,223] Connection pool is full, discarding connection: static.cninfo.com.cn. Connection pool size: 10


[2025-08-28 18:09:04] Downloaded: /Users/sam/Desktop/cninfo_output/pdf_prospectus_latest/002937_兴瑞科技_2018.pdf
[2025-08-28 18:09:04] Downloaded: /Users/sam/Desktop/cninfo_output/pdf_prospectus_latest/002973_侨银股份_2019.pdf


[2025-08-28 18:09:04,327] Connection pool is full, discarding connection: static.cninfo.com.cn. Connection pool size: 10


[2025-08-28 18:09:04] Downloaded: /Users/sam/Desktop/cninfo_output/pdf_prospectus_latest/002961_瑞达期货_2019.pdf
[2025-08-28 18:09:04] Downloaded: /Users/sam/Desktop/cninfo_output/pdf_prospectus_latest/002941_新疆交建_2018.pdf


[2025-08-28 18:09:04,845] Connection pool is full, discarding connection: static.cninfo.com.cn. Connection pool size: 10


[2025-08-28 18:09:04] Downloaded: /Users/sam/Desktop/cninfo_output/pdf_prospectus_latest/002946_新乳业_2019.pdf
[2025-08-28 18:09:04] All downloads completed in 109.23s. PDFs saved to: /Users/sam/Desktop/cninfo_output/pdf_prospectus_latest
