In [1]:
# ==== Part 3A：IPO 招股说明书下载（独立新增，不修改 Part 1/2）====
import os
import re
import time
import pandas as pd
import requests
from datetime import datetime
from concurrent.futures import ThreadPoolExecutor, as_completed

# —— 和你原工程保持一致的目录风格 ——
BASE_DIR   = os.path.expanduser("~/Desktop/cninfo_output")
# 优先尝试这些现成清单（任选其一存在即可），也可以手动改成你自己的清单文件名
CANDIDATE_INPUTS = [
    os.path.join(BASE_DIR, "年报链接_2024_选取公司【年报】.xlsx"),
    os.path.join(BASE_DIR, "年报链接_2024_选取公司.xlsx"),
    os.path.join(BASE_DIR, "正股映射结果.xlsx"),
]
OUT_DIR    = os.path.join(BASE_DIR, "ipo_prospectus_pdf")
OUT_LOG    = os.path.join(BASE_DIR, "IPO招股说明书_清单.xlsx")

os.makedirs(OUT_DIR, exist_ok=True)

# 你的日志风格
def log(msg):
    print(f"[{datetime.now().strftime('%Y-%m-%d %H:%M:%S')}] {msg}")

HEADERS = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/139.0.0.0 Safari/537.36",
    "Accept": "application/json, text/javascript, */*; q=0.01",
    "Origin": "https://www.cninfo.com.cn",
    "Referer": "https://www.cninfo.com.cn/"
}
CNINFO_API   = "https://www.cninfo.com.cn/new/hisAnnouncement/query"
CNINFO_STATIC= "https://static.cninfo.com.cn/"

# 标题优先级（从高到低），避免下载“摘要”
TITLE_PRIORITY = [
    r"首次公开发行股票招股说明书(?!.*摘要).*",   # 正式版优先
    r"招股说明书(?!.*摘要).*",                    # 统配（排除摘要）
    r"招股说明书（申报稿）.*",
    r"招股说明书（(预披露|更新).*）.*",
    r"招股意向书.*",
]

def normalize_title(s: str) -> str:
    s = (s or "").strip()
    s = re.sub(r"\s+", "", s)
    s = s.replace("（", "(").replace("）", ")")
    return s

def pick_best(records):
    """在同一家公司候选里按标题优先级 + 公告时间(新) 选一条"""
    if not records:
        return None
    def score(rec):
        title = normalize_title(rec.get("announcementTitle", "") or rec.get("shortTitle", ""))
        ts    = rec.get("announcementTime", 0)
        for i, pat in enumerate(TITLE_PRIORITY):
            if re.search(pat, title):
                return (len(TITLE_PRIORITY) - i, ts)  # 越靠前分越高
        return (0, ts)
    return sorted(records, key=score, reverse=True)[0]

def query_ipo_prospectus(stock_code=None, stock_name=None, org_id=None, session=None, max_pages=40):
    """按公司代码/简称查询 IPO 招股说明书类公告，返回候选记录列表"""
    if session is None:
        session = requests.Session()
    candidates = []
    stock_param = ""
    if stock_code and org_id:
        stock_param = f"{stock_code},{org_id}"
    elif stock_code:
        stock_param = str(stock_code)

    # 先用关键词减少结果量；也可空关键词再本地过滤（更慢）
    search_keys = ["招股说明书", "招股意向书"]
    for sk in search_keys:
        page = 1
        while page <= max_pages:
            data = {
                "pageNum": str(page),
                "pageSize": "30",
                "column": "szse",
                "tabName": "fulltext",
                "plate": "",
                "stock": stock_param,   # 没有也行，会靠下面的本地过滤兜底
                "searchkey": sk,
                "secid": "",
                "category": "",
                "trade": "",
                "seDate": "",
                "sortName": "",
                "sortType": "",
                "isHLtitle": "true",
            }
            resp = session.post(CNINFO_API, headers=HEADERS, data=data, timeout=20)
            resp.raise_for_status()
            js = resp.json()
            anns = js.get("announcements") or []
            if not anns:
                break

            for a in anns:
                sc = str(a.get("secCode", "")).strip()
                sn = str(a.get("secName", "")).strip()
                title = normalize_title(a.get("announcementTitle", "") or a.get("shortTitle", ""))

                # 公司匹配（双保险）：优先代码一致，否则用简称包含
                if stock_code and sc != str(stock_code):
                    continue
                if (not stock_code) and stock_name and (stock_name not in sn):
                    continue

                # 只要“招股/意向书”，排除“摘要”
                if ("招股" in title or "意向书" in title) and ("摘要" not in title):
                    candidates.append(a)

            total_pages = js.get("totalpages") or 1
            page += 1
            if page > total_pages:
                break
            time.sleep(0.2)  # 礼貌性休眠，避免触发风控
    return candidates

def download_pdf(adjunct_url: str, save_path: str, session=None):
    if session is None:
        session = requests.Session()
    url = CNINFO_STATIC + adjunct_url.lstrip("/")
    r = session.get(url, headers=HEADERS, timeout=30)
    r.raise_for_status()
    with open(save_path, "wb") as f:
        f.write(r.content)

def load_company_list():
    """尝试从候选清单中读取公司列表，自动识别常见列名"""
    df = None
    for p in CANDIDATE_INPUTS:
        if os.path.exists(p):
            df = pd.read_excel(p)
            log(f"检测到公司清单：{p}")
            break
    if df is None:
        raise FileNotFoundError(
            "未找到公司清单。请把公司列表放到：\n"
            "  - 年报链接_2024_选取公司【年报】.xlsx 或\n"
            "  - 年报链接_2024_选取公司.xlsx 或\n"
            "  - 正股映射结果.xlsx\n"
            f"目录：{BASE_DIR}"
        )
    # 常见列名兜底
    code_cols = ["证券代码", "股票代码", "sec_code", "code", "stock_code"]
    name_cols = ["证券简称", "公司简称", "sec_name", "name", "stock_name"]
    org_cols  = ["orgId", "org_id", "orgid"]

    def first_col(cols):
        for c in cols:
            if c in df.columns:
                return c
        return None

    col_code = first_col(code_cols)
    col_name = first_col(name_cols)
    col_org  = first_col(org_cols)
    if not col_code and not col_name:
        raise ValueError("公司清单里没找到代码或简称列（示例：证券代码/股票代码/证券简称/公司简称）。")

    # 去重
    if col_code:
        df = df.drop_duplicates(subset=[col_code])
    elif col_name:
        df = df.drop_duplicates(subset=[col_name])

    return df, col_code, col_name, col_org

def main():
    log("开始：IPO 招股说明书下载（不涉及年报）")
    df, col_code, col_name, col_org = load_company_list()
    session = requests.Session()

    results = []
    not_found = []

    log(f"待处理公司数：{len(df)}")
    for _, row in df.iterrows():
        sc  = str(row[col_code]).strip() if col_code else None
        sn  = str(row[col_name]).strip() if col_name else None
        oid = str(row[col_org]).strip() if (col_org and not pd.isna(row[col_org])) else None

        cands = query_ipo_prospectus(sc, sn, oid, session=session)
        best  = pick_best(cands)
        if not best:
            not_found.append({"证券代码": sc or "", "证券简称": sn or "", "备注": "未找到招股说明书类公告"})
            continue

        title = best.get("announcementTitle") or best.get("shortTitle") or ""
        ts    = best.get("announcementTime")
        dt    = datetime.fromtimestamp(ts/1000).strftime("%Y-%m-%d") if ts else ""
        adj   = best.get("adjunctUrl") or ""
        sec   = best.get("secCode") or (sc or "")
        name  = best.get("secName") or (sn or "")

        safe_title = re.sub(r"[\\/:*?\"<>|]", "_", normalize_title(title))
        filename   = f"{sec}_{name}_{dt}_{safe_title}.pdf"
        save_path  = os.path.join(OUT_DIR, filename)

        results.append({
            "证券代码": sec,
            "证券简称": name,
            "公告日期": dt,
            "标题": title,
            "文件链接": CNINFO_STATIC + adj.lstrip("/"),
            "保存路径": save_path
        })

    log(f"匹配到 {len(results)} 条记录，开始下载…")

    def _job(item):
        try:
            download_pdf(item["文件链接"].replace(CNINFO_STATIC, ""), item["保存路径"], session=session)
            return (item["证券代码"], True, item["保存路径"])
        except Exception as e:
            return (item["证券代码"], False, str(e))

    ok, fail = 0, 0
    with ThreadPoolExecutor(max_workers=8) as ex:
        futs = [ex.submit(_job, it) for it in results]
        for fut in as_completed(futs):
            code, suc, msg = fut.result()
            if suc:
                ok += 1
            else:
                fail += 1
                log(f"[下载失败] {code}: {msg}")

    out_df = pd.DataFrame(results)
    if not_found:
        out_df = pd.concat([out_df,
                            pd.DataFrame([{"证券代码":"","证券简称":"","公告日期":"","标题":"","文件链接":"","保存路径":""}]),
                            pd.DataFrame(not_found)],
                           ignore_index=True)

    out_df.to_excel(OUT_LOG, index=False)
    log(f"完成：成功 {ok}，失败 {fail}\n清单：{OUT_LOG}\nPDF目录：{OUT_DIR}")

if __name__ == "__main__":
    main()


[2025-08-15 11:21:02] 开始：IPO 招股说明书下载（不涉及年报）
[2025-08-15 11:21:03] 检测到公司清单：/Users/sam/Desktop/cninfo_output/年报链接_2024_选取公司【年报】.xlsx
[2025-08-15 11:21:03] 待处理公司数：466


ReadTimeout: HTTPSConnectionPool(host='www.cninfo.com.cn', port=443): Read timed out. (read timeout=20)

In [2]:
# ==== Part 3A（稳健版）：IPO 招股说明书批量下载（独立新增，不修改 Part 1/2）====
import os
import re
import time
import random
import pandas as pd
import requests
from datetime import datetime
from concurrent.futures import ThreadPoolExecutor, as_completed
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
from requests.exceptions import ReadTimeout, ConnectTimeout, SSLError, ProxyError

# === 路径与输出 ===
BASE_DIR   = os.path.expanduser("~/Desktop/cninfo_output")
CANDIDATE_INPUTS = [
    os.path.join(BASE_DIR, "年报链接_2024_选取公司【年报】.xlsx"),
    os.path.join(BASE_DIR, "年报链接_2024_选取公司.xlsx"),
    os.path.join(BASE_DIR, "正股映射结果.xlsx"),
]
OUT_DIR    = os.path.join(BASE_DIR, "ipo_prospectus_pdf")
OUT_LOG    = os.path.join(BASE_DIR, "IPO招股说明书_清单.xlsx")
os.makedirs(OUT_DIR, exist_ok=True)

# === 并发&节流参数（可按需调整）===
MAX_WORKERS = 6            # 下载PDF的并发数，别太大，避免被限流
PER_COMPANY_SLEEP = 0.6     # 每家公司查询完后的小睡（含抖动）
PAGE_SLEEP_BASE = 0.35      # 翻页间隔（含抖动）

# === 日志 ===
def log(msg):
    print(f"[{datetime.now().strftime('%Y-%m-%d %H:%M:%S')}] {msg}")

# === 请求头 ===
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/139.0.0.0 Safari/537.36",
    "Accept": "application/json, text/javascript, */*; q=0.01",
    "Origin": "https://www.cninfo.com.cn",
    "Referer": "https://www.cninfo.com.cn/"
}

# === 接口与静态文件域名 ===
CNINFO_API    = "https://www.cninfo.com.cn/new/hisAnnouncement/query"
CNINFO_STATIC = "https://static.cninfo.com.cn/"

# === 标题优先级（从高到低），排除“摘要” ===
TITLE_PRIORITY = [
    r"首次公开发行股票招股说明书(?!.*摘要).*",   # 正式版
    r"招股说明书(?!.*摘要).*",                    # 统配（排除摘要）
    r"招股说明书（申报稿）.*",
    r"招股说明书（(预披露|更新).*）.*",
    r"招股意向书.*",
]

def normalize_title(s: str) -> str:
    s = (s or "").strip()
    s = re.sub(r"\s+", "", s)
    s = s.replace("（", "(").replace("）", ")")
    return s

def pick_best(records):
    """在同一家公司候选里按标题优先级 + 公告时间(新) 选一条"""
    if not records:
        return None
    def score(rec):
        title = normalize_title(rec.get("announcementTitle", "") or rec.get("shortTitle", ""))
        ts    = rec.get("announcementTime", 0)
        for i, pat in enumerate(TITLE_PRIORITY):
            if re.search(pat, title):
                return (len(TITLE_PRIORITY) - i, ts)  # 越靠前分越高，时间越新越好
        return (0, ts)
    return sorted(records, key=score, reverse=True)[0]

def jitter_sleep(base_sec):
    """加随机抖动，降低“齐刷刷触发风控”的概率"""
    time.sleep(base_sec + random.random() * 0.3)

def make_session():
    """带连接池与自动重试的稳健会话（对超时/5xx/429更耐受）"""
    sess = requests.Session()
    retry = Retry(
        total=6,                  # 总重试上限
        connect=3,                # 连接错误重试
        read=5,                   # 读超时重试
        status=5,                 # 状态码重试
        status_forcelist=[429, 500, 502, 503, 504],
        allowed_methods=frozenset(["GET", "POST"]),
        backoff_factor=0.7,       # 指数退避 0.7, 1.4, 2.8, ...
        raise_on_status=False,
        respect_retry_after_header=True,
    )
    adapter = HTTPAdapter(max_retries=retry, pool_connections=100, pool_maxsize=100)
    sess.mount("https://", adapter)
    sess.mount("http://", adapter)
    sess.headers.update(HEADERS)
    return sess

def safe_post(session, url, data, max_attempts=4, connect_timeout=10, read_timeout=45):
    """对 POST 再包一层：捕捉超时/SSL/代理异常，做手动重试 + 抖动"""
    last_err = None
    for attempt in range(1, max_attempts + 1):
        try:
            resp = session.post(url, data=data, timeout=(connect_timeout, read_timeout))
            if resp.status_code in (429, 500, 502, 503, 504):
                last_err = Exception(f"HTTP {resp.status_code}")
                ra = resp.headers.get("Retry-After")
                if ra and ra.isdigit():
                    time.sleep(int(ra))
                else:
                    jitter_sleep(1.0 * attempt)
                continue
            return resp
        except (ReadTimeout, ConnectTimeout, SSLError, ProxyError) as e:
            last_err = e
            jitter_sleep(1.0 * attempt)
            continue
    raise last_err if last_err else Exception("safe_post failed without explicit error")

def query_ipo_prospectus(stock_code=None, stock_name=None, org_id=None, session=None, max_pages=40):
    """按公司代码/简称查询 IPO 招股说明书类公告，返回候选记录列表"""
    if session is None:
        session = make_session()

    candidates = []
    stock_param = ""
    if stock_code and org_id:
        stock_param = f"{stock_code},{org_id}"
    elif stock_code:
        stock_param = str(stock_code)

    # 先用关键词缩小范围；如需兜底可再加空 searchkey 的慢路径
    search_keys = ["招股说明书", "招股意向书"]

    for sk in search_keys:
        page = 1
        while page <= max_pages:
            data = {
                "pageNum": str(page),
                "pageSize": "30",
                "column": "szse",
                "tabName": "fulltext",
                "plate": "",
                "stock": stock_param,
                "searchkey": sk,
                "secid": "",
                "category": "",
                "trade": "",
                "seDate": "",
                "sortName": "",
                "sortType": "",
                "isHLtitle": "true",
            }
            resp = safe_post(session, CNINFO_API, data,
                             max_attempts=4, connect_timeout=10, read_timeout=45)
            js = resp.json()
            anns = js.get("announcements") or []
            if not anns:
                break

            for a in anns:
                sc = str(a.get("secCode", "")).strip()
                sn = str(a.get("secName", "")).strip()
                title = normalize_title(a.get("announcementTitle", "") or a.get("shortTitle", ""))

                # 公司匹配（代码优先，否则简称包含）
                if stock_code and sc != str(stock_code):
                    continue
                if (not stock_code) and stock_name and (stock_name not in sn):
                    continue

                # 只要“招股/意向书”，排除“摘要”
                if ("招股" in title or "意向书" in title) and ("摘要" not in title):
                    candidates.append(a)

            total_pages = js.get("totalpages") or 1
            page += 1
            if page > total_pages:
                break

            jitter_sleep(PAGE_SLEEP_BASE)

    return candidates

def download_pdf(adjunct_url: str, save_path: str, session=None):
    """下载静态文件；也使用带重试的 session"""
    if session is None:
        session = make_session()
    url = CNINFO_STATIC + adjunct_url.lstrip("/")
    r = session.get(url, timeout=(10, 60))
    r.raise_for_status()
    with open(save_path, "wb") as f:
        f.write(r.content)

def load_company_list():
    """尝试从候选清单中读取公司列表，自动识别常见列名"""
    df = None
    for p in CANDIDATE_INPUTS:
        if os.path.exists(p):
            df = pd.read_excel(p)
            log(f"检测到公司清单：{p}")
            break
    if df is None:
        raise FileNotFoundError(
            "未找到公司清单。请把公司列表放到：\n"
            "  - 年报链接_2024_选取公司【年报】.xlsx 或\n"
            "  - 年报链接_2024_选取公司.xlsx 或\n"
            "  - 正股映射结果.xlsx\n"
            f"目录：{BASE_DIR}"
        )
    code_cols = ["证券代码", "股票代码", "sec_code", "code", "stock_code"]
    name_cols = ["证券简称", "公司简称", "sec_name", "name", "stock_name"]
    org_cols  = ["orgId", "org_id", "orgid"]

    def first_col(cols):
        for c in cols:
            if c in df.columns:
                return c
        return None

    col_code = first_col(code_cols)
    col_name = first_col(name_cols)
    col_org  = first_col(org_cols)
    if not col_code and not col_name:
        raise ValueError("公司清单里没找到代码或简称列（示例：证券代码/股票代码/证券简称/公司简称）。")

    # 去重
    if col_code:
        df = df.drop_duplicates(subset=[col_code])
    elif col_name:
        df = df.drop_duplicates(subset=[col_name])

    return df, col_code, col_name, col_org

def main():
    log("开始：IPO 招股说明书下载（不涉及年报）")
    df, col_code, col_name, col_org = load_company_list()
    session = make_session()

    results = []
    not_found = []

    log(f"待处理公司数：{len(df)}")
    for idx, row in df.iterrows():
        sc  = str(row[col_code]).strip() if col_code else None
        sn  = str(row[col_name]).strip() if col_name else None
        oid = str(row[col_org]).strip() if (col_org and not pd.isna(row[col_org])) else None

        try:
            cands = query_ipo_prospectus(sc, sn, oid, session=session)
            best  = pick_best(cands)
        except Exception as e:
            log(f"[查询异常] {sc or ''} {sn or ''}: {e}")
            best = None

        if not best:
            not_found.append({"证券代码": sc or "", "证券简称": sn or "", "备注": "未找到招股说明书类公告/或查询异常"})
            jitter_sleep(PER_COMPANY_SLEEP)
            continue

        title = best.get("announcementTitle") or best.get("shortTitle") or ""
        ts    = best.get("announcementTime")
        dt    = datetime.fromtimestamp(ts/1000).strftime("%Y-%m-%d") if ts else ""
        adj   = best.get("adjunctUrl") or ""
        sec   = best.get("secCode") or (sc or "")
        name  = best.get("secName") or (sn or "")

        safe_title = re.sub(r"[\\/:*?\"<>|]", "_", normalize_title(title))
        filename   = f"{sec}_{name}_{dt}_{safe_title}.pdf"
        save_path  = os.path.join(OUT_DIR, filename)

        results.append({
            "证券代码": sec,
            "证券简称": name,
            "公告日期": dt,
            "标题": title,
            "文件链接": CNINFO_STATIC + adj.lstrip("/"),
            "保存路径": save_path
        })

        # 每家公司完毕后小睡，降低限流概率
        jitter_sleep(PER_COMPANY_SLEEP)

    log(f"匹配到 {len(results)} 条记录，开始下载…")

    def _job(item):
        try:
            # download_pdf 接受 adjunct_url（不带静态前缀），这里做个转换
            download_pdf(item["文件链接"].replace(CNINFO_STATIC, ""), item["保存路径"], session=session)
            return (item["证券代码"], True, item["保存路径"])
        except Exception as e:
            return (item["证券代码"], False, str(e))

    ok, fail = 0, 0
    with ThreadPoolExecutor(max_workers=MAX_WORKERS) as ex:
        futs = [ex.submit(_job, it) for it in results]
        for fut in as_completed(futs):
            code, suc, msg = fut.result()
            if suc:
                ok += 1
            else:
                fail += 1
                log(f"[下载失败] {code}: {msg}")

    out_df = pd.DataFrame(results)
    if not_found:
        out_df = pd.concat([out_df,
                            pd.DataFrame([{"证券代码":"","证券简称":"","公告日期":"","标题":"","文件链接":"","保存路径":""}]),
                            pd.DataFrame(not_found)],
                           ignore_index=True)

    out_df.to_excel(OUT_LOG, index=False)
    log(f"完成：成功 {ok}，失败 {fail}\n清单：{OUT_LOG}\nPDF目录：{OUT_DIR}")

if __name__ == "__main__":
    main()


[2025-08-15 12:07:49] 开始：IPO 招股说明书下载（不涉及年报）
[2025-08-15 12:07:49] 检测到公司清单：/Users/sam/Desktop/cninfo_output/年报链接_2024_选取公司【年报】.xlsx
[2025-08-15 12:07:49] 待处理公司数：466


KeyboardInterrupt: 