In [7]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import os
import pandas as pd
import efinance as ef       # pip install efinance

def add_suffix(code: str) -> str:
    """Append an exchange suffix to a bare numeric stock code.

    Returns "" for anything that is not an all-digit string, e.g. the NaN
    left in rows that found no match in the left merge.
    """
    if not isinstance(code, str) or not code.isdigit():
        return ""
    if code.startswith(("60", "68")):
        return code + ".SH"
    # Beijing Stock Exchange listings use 4xxxxx / 8xxxxx / 92xxxx codes;
    # without this branch they would be mislabelled as Shenzhen.
    if code.startswith(("4", "8", "92")):
        return code + ".BJ"
    return code + ".SZ"


def main():
    """Map every convertible bond in the input workbook to its underlying stock.

    Reads the quote workbook from the Desktop, looks up each bond code in the
    table returned by ``ef.bond.get_all_base_info()``, and writes a copy of the
    input with two extra columns ("正股代码" with exchange suffix, "正股名称").

    Raises:
        KeyError: if the input workbook has no "代码" column.
    """
    # 1. Locate the input and output workbooks.
    in_file = os.path.expanduser("~/Desktop/行情与分析20250605.xlsx")
    out_file = os.path.expanduser("~/Desktop/正股映射结果.xlsx")

    # 2. Read everything as text so codes keep their leading zeros.
    df_raw = pd.read_excel(in_file, dtype=str)
    if "代码" not in df_raw.columns:
        raise KeyError(f"输入文件缺少“代码”列：{in_file}")

    # 3. Fetch the base-info table for convertible bonds.
    df_all = ef.bond.get_all_base_info()

    # 4. Build a clean lookup: bond code -> underlying stock code / name.
    #    One row per bond code, otherwise the left merge below would
    #    duplicate input rows.
    lookup = pd.DataFrame({
        "bond_code": df_all["债券代码"].astype(str).str.strip(),
        "stock_code": df_all["正股代码"].astype(str).str.strip(),
        "stock_name": df_all["正股名称"].astype(str).str.strip(),
    }).drop_duplicates(subset="bond_code", keep="first")

    # 5. Derive the bare bond code from the input (drop a '.SH' / '.SZ' suffix).
    df_raw["bond_code"] = df_raw["代码"].str.strip().str.split(".").str[0]

    # 6. Left merge keeps every input row, matched or not.
    df_merged = df_raw.merge(lookup, on="bond_code", how="left")

    # 7. Final columns: suffixed stock code and stock name ("" when unmatched).
    df_merged["正股代码"] = df_merged["stock_code"].apply(add_suffix)
    df_merged["正股名称"] = df_merged["stock_name"].fillna("")

    # 8. Drop the helper columns and save.
    df_final = df_merged.drop(columns=["bond_code", "stock_code", "stock_name"])
    df_final.to_excel(out_file, index=False, engine="openpyxl")

    print("✅ 正股映射已完成，结果保存在：", out_file)

# Entry point: build the bond -> stock mapping when this code is executed directly.
if __name__ == "__main__":
    main()


✅ 正股映射已完成，结果保存在： /Users/sam/Desktop/正股映射结果.xlsx


In [8]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import os
import re
import time
import requests
import openpyxl
import pandas as pd
from typing import List, Dict, Tuple

# —— Configuration ——
# YEAR: report year that announcements are filtered on.
# GZH: tag embedded in the output workbook's file name.
YEAR             = 2024
GZH              = "【年报】"

# Output directory (created at import time if it does not exist).
output_dir = os.path.expanduser("~/Desktop/cninfo_output")
os.makedirs(output_dir, exist_ok=True)

# Path of the bond -> stock mapping workbook (produced by the first script).
mapping_file = os.path.expanduser("~/Desktop/正股映射结果.xlsx")

# —— Market / plate selection (kept as originally configured) ——
# Every MARKETS value is sent as the query's "column" field and every
# PLATES value as its "plate" field; all combinations are queried.
MARKETS = ["szse", "shse"]
PLATES  = ["", "sz", "szmb", "szcy", "sh", "shmb", "shkcp", "bj"]

# —— Exclusion keywords: English versions, summaries, cancellations, notices, etc. ——
EXCLUDE_KWS = [
    "英文", "英文版", "Annual", "annual", "Summary", "summary",
    "摘要", "摘要版", "年报摘要", "年度报告摘要",
    "取消", "已取消", "披露日期变更", "变更的公告", "提示性公告",
    "H股公告", "B股"
]

# Strip every kind of whitespace (including full-width space and NBSP) before matching.
_SPACE_RE = re.compile(r"[\s\u00A0\u3000]+")
def norm(s: str) -> str:
    """Return *s* as text with all whitespace removed; falsy input yields ""."""
    text = str(s) if s else ""
    return _SPACE_RE.sub("", text.strip())

# Matches the Chinese wording for "annual report" (年度报告 / 年报).
_INCLUDE_AR_PAT = re.compile(r"(年\s*度\s*报\s*告|年\s*报)")
# Title keywords marking a corrected / revised / updated filing (used for ranking).
_REV_KWS = ["更正", "修订", "更新"]

def is_cn_annual_2024(title: str, year=None) -> bool:
    """Tell whether *title* looks like a Chinese annual report for the target year.

    All three conditions must hold on the whitespace-stripped title:
    it contains the year, it contains "年度报告"/"年报", and it contains
    none of the EXCLUDE_KWS.

    Args:
        title: announcement title as returned by the query.
        year: year to look for; defaults to the module-level YEAR so the
            filter follows the configuration instead of a hard-coded 2024.

    Returns:
        True when the title passes all three checks.
    """
    wanted = str(YEAR if year is None else year)
    t = norm(title)
    return (
        wanted in t
        and _INCLUDE_AR_PAT.search(t) is not None
        and not any(kw in t for kw in EXCLUDE_KWS)
    )

# Lenient year extraction. The first pattern covers "2024年度报告",
# "2024年年度报告", "2024年报告" and "2024报告"; the second covers "2024年报".
# The explicit "年度" alternative matters: without it the most common wording,
# "2024年年度报告", never matched and the function fell through to "first
# 20xx in the title", which is wrong when another year appears earlier.
_YEAR_PATS = [
    re.compile(r"(20\d{2})\s*年?\s*(?:年\s*度|度)?\s*报\s*告"),
    re.compile(r"(20\d{2})\s*年?\s*报(?!告)"),
]
def extract_year(title: str) -> str:
    """Return the report year named in *title* as a 4-digit string, or "".

    The year directly attached to "年度报告"/"年报" wins; only when no such
    phrase is found is the first 20xx number in the title used.
    """
    t = norm(title)
    for pat in _YEAR_PATS:
        m = pat.search(t)
        if m:
            return m.group(1)
    # Fallback: the title mentions a year but not in either form above.
    m2 = re.search(r"(20\d{2})", t)
    return m2.group(1) if m2 else ""

def ann_weight(it: Dict) -> Tuple[int, int]:
    """Sort key for an announcement: revised filings first, then newest timestamp."""
    normalized = norm(it.get("announcementTitle", ""))
    is_revision = any(kw in normalized for kw in _REV_KWS)
    try:
        stamp = int(it.get("announcementTime", 0))
    except Exception:
        # A missing or malformed timestamp ranks as the oldest.
        stamp = 0
    return (10 if is_revision else 0, stamp)

def get_report(page_num: int, date_range: str, column: str, plate: str) -> requests.Response:
    """POST one page of the cninfo historical-announcement query.

    Args:
        page_num: page number sent as "pageNum".
        date_range: value sent as "seDate".
        column: value sent as "column".
        plate: value sent as "plate".

    Returns:
        The raw ``requests.Response``; the status code is not checked here.
    """
    endpoint = "http://www.cninfo.com.cn/new/hisAnnouncement/query"
    referer = (
        "http://www.cninfo.com.cn/new/commonUrl/"
        "pageOfSearch?url=disclosure/list/search"
        "&checkedCategory=category_ndbg_szsh"
    )
    # Form fields, in the order they are sent.
    form = {
        "pageNum": page_num,
        "pageSize": 30,
        "column": column,
        "tabName": "fulltext",
        "category": "category_ndbg_szsh",
        "plate": plate,
        "searchkey": "",
        "secid": "",
        "trade": "",
        "seDate": date_range,
        "sortName": "code",
        "sortType": "asc",
        "isHLtitle": "false",
    }
    request_headers = {
        "Accept": "*/*",
        "Accept-Language": "zh-CN,zh;q=0.9",
        "Content-Type": "application/x-www-form-urlencoded; charset=UTF-8",
        "Origin": "http://www.cninfo.com.cn",
        "Referer": referer,
        "User-Agent": "Mozilla/5.0",
        "X-Requested-With": "XMLHttpRequest",
    }
    return requests.post(endpoint, data=form, headers=request_headers, timeout=15)

def download_reports_for_segments(segments: List[str], column: str, plate: str) -> List[Dict]:
    """Fetch all pages for one market (column) x plate over the given date segments.

    Announcements are filtered on the spot with ``is_cn_annual_2024`` and
    de-duplicated on (secCode, adjunctUrl).

    Each page, including the first one that reports ``totalpages``, is tried
    up to three times with an HTTP status check. A page that still fails is
    reported on stdout and skipped rather than dropped silently. The first
    page's payload is reused instead of being requested a second time.

    Args:
        segments: date ranges in the "YYYY-MM-DD~YYYY-MM-DD" form sent as seDate.
        column: market value forwarded to ``get_report``.
        plate: plate value forwarded to ``get_report``.

    Returns:
        The filtered, de-duplicated announcement dicts in arrival order.
    """
    def fetch_json(page_no: int, date_range: str) -> Dict:
        """Return the decoded payload of one page, or {} after three failures."""
        last_err = None
        for attempt in range(3):
            try:
                resp = get_report(page_no, date_range, column, plate)
                resp.raise_for_status()
                data = resp.json()
                return data if isinstance(data, dict) else {}
            except (requests.RequestException, ValueError) as err:
                last_err = err
                if attempt < 2:
                    time.sleep(1.5)
        print(
            f"⚠️ 获取失败（已重试3次）：column={column} plate={plate!r} "
            f"seDate={date_range} page={page_no}: {last_err}"
        )
        return {}

    all_ann: List[Dict] = []
    seen = set()  # (secCode, adjunctUrl)
    for date_range in segments:
        first = fetch_json(1, date_range)
        try:
            # totalpages may be missing or null in the payload.
            total = int(first.get("totalpages") or 0)
        except (TypeError, ValueError):
            total = 0
        if total <= 0:
            continue

        for page in range(1, total + 1):
            data = first if page == 1 else fetch_json(page, date_range)
            for x in data.get("announcements") or []:
                title = x.get("announcementTitle", "")
                if not is_cn_annual_2024(title):
                    continue
                key = (str(x.get("secCode", "")), str(x.get("adjunctUrl", "")))
                if key in seen:
                    continue
                seen.add(key)
                all_ann.append(x)
    return all_ann

def filter_latest_versions(anns: List[Dict]) -> List[Dict]:
    """Keep one announcement per (company, year): the one with the highest ann_weight.

    Announcements without a secCode, or whose extracted year differs from
    YEAR, are discarded.
    """
    target_year = str(YEAR)
    best: Dict[Tuple[str, str], Dict] = {}
    for ann in anns:
        code = str(ann.get("secCode", ""))
        if not code:
            continue
        year_found = extract_year(ann.get("announcementTitle", ""))
        if year_found != target_year:
            continue
        slot = (code, year_found)
        current = best.get(slot)
        if current is None or ann_weight(ann) > ann_weight(current):
            best[slot] = ann
    return list(best.values())

def collect_all_announcements(year: int) -> List[Dict]:
    """Query every market x plate over ten date windows of the following year.

    The windows are the original ten-way split of ``year + 1``. The combined
    result is passed through ``filter_latest_versions``.
    """
    ny = year + 1
    windows = [
        ("01-01", "04-01"), ("04-02", "04-15"),
        ("04-16", "04-22"), ("04-23", "04-26"),
        ("04-27", "04-28"), ("04-29", "04-30"),
        ("05-01", "07-31"), ("08-01", "10-31"),
        ("11-01", "11-30"), ("12-01", "12-31"),
    ]
    segments = [f"{ny}-{start}~{ny}-{end}" for start, end in windows]
    collected: List[Dict] = [
        ann
        for market in MARKETS
        for plate in PLATES
        for ann in download_reports_for_segments(segments, market, plate)
    ]
    return filter_latest_versions(collected)

def write_selected_excel(anns: List[Dict], year: int) -> None:
    """Write one row per mapping-table entry, in mapping order.

    Rows whose stock has no matching announcement for *year* get empty
    title / year / link cells. The workbook is saved under ``output_dir``.
    """
    mapping = pd.read_excel(mapping_file, dtype=str).fillna("")
    mapping['sec_no_suf'] = mapping['正股代码'].str.split('.').str[0]

    # Index announcements by (stock code, report year) for direct lookup.
    by_code_year = {}
    for ann in anns:
        lookup_key = (
            str(ann.get('secCode', '')),
            extract_year(ann.get('announcementTitle', '')),
        )
        by_code_year[lookup_key] = ann

    year_text = str(year)
    book = openpyxl.Workbook()
    sheet = book.active
    sheet.title = f"{year}年报"
    sheet.append(["可转债代码","可转债名称","公司代码","公司简称","标题","年份","年报链接"])

    for _, row in mapping.iterrows():
        sec = row.get('sec_no_suf', "")
        hit = by_code_year.get((sec, year_text))
        if hit:
            # Drop any markup tags and full-width colons from the title.
            stripped = re.sub(r"<.*?>","", hit.get('announcementTitle',''))
            cleaned = stripped.replace("：","")
            title = f"《{cleaned}》"
            url = f"http://static.cninfo.com.cn/{hit.get('adjunctUrl','')}"
            yr = year_text
        else:
            title, url, yr = "", "", ""

        sheet.append([
            row.get('代码', ""),
            row.get('名称', ""),
            sec,
            row.get('正股名称', ""),
            title,
            yr,
            url,
        ])

    dst = os.path.join(output_dir, f"年报链接_{year}_选取公司{GZH}.xlsx")
    book.save(dst)
    print(f"✅ 已输出：{dst}")

# Entry point: collect the announcements for YEAR, then write the link workbook.
if __name__ == "__main__":
    all_ann = collect_all_announcements(YEAR)
    write_selected_excel(all_ann, YEAR)
    print(f"---- {YEAR} 年下载完成 ----")


✅ 已输出：/Users/sam/Desktop/cninfo_output/年报链接_2024_选取公司【年报】.xlsx
---- 2024 年下载完成 ----


In [10]:
import pandas as pd
import requests
import os
import re
import logging
from datetime import datetime
from concurrent.futures import ThreadPoolExecutor, as_completed

logging.basicConfig(level=logging.INFO, format='[%(asctime)s] %(message)s')

def log(msg):
    """Print *msg* to stdout, prefixed with the current local time."""
    stamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    print(f"[{stamp}] {msg}")

# Working directory, the link workbook to read, and the PDF download folder.
BASE_DIR    = os.path.expanduser("~/Desktop/cninfo_output")
EXCEL_FILE  = os.path.join(BASE_DIR, "年报链接_2024_选取公司【年报】.xlsx")
PDF_DIR     = os.path.join(BASE_DIR, "pdf_reports_2024")
# Size of the download thread pool.
MAX_WORKERS = 16
# Create the PDF folder up front so workers can write into it.
os.makedirs(PDF_DIR, exist_ok=True)

def norm_code(x: str) -> str:
    """Normalise values such as 600000 / 600000.0 / '600000 ' to a 6-digit string.

    Everything from the first '.' onward is dropped, non-digits are removed,
    and the remainder is left-padded with zeros. Returns "" when no digits remain.
    """
    integer_part = str(x).strip().partition('.')[0]
    digits = re.sub(r'\D', '', integer_part)
    if not digits:
        return ""
    return digits.zfill(6)

def is_pdf_response(resp: requests.Response) -> bool:
    """Return True when the Content-Type mentions pdf or the body starts with %PDF."""
    content_type = resp.headers.get("Content-Type") or ""
    if "pdf" in content_type.lower():
        return True
    return resp.content.startswith(b"%PDF")

def download_pdf(record):
    """Download one annual-report PDF into PDF_DIR.

    Args:
        record: ``(code, name, year, url)``; the first three form the file
            name after characters illegal in file names are removed.

    An existing non-empty target file is skipped. The body is written to a
    per-thread temporary file and then moved into place with ``os.replace``.
    An interrupted write therefore never leaves a truncated file that a
    later run would skip as "Exists", and two workers given the same
    company cannot interleave writes into one file.

    Failures are logged and never raised.
    """
    import threading  # local import: not in this cell's import list

    code, name, year, url = record
    safe = re.sub(r'[\\/:*?"<>|]', '', f"{code}_{name}_{year}")
    path = os.path.join(PDF_DIR, f"{safe}.pdf")
    if os.path.exists(path) and os.path.getsize(path) > 0:
        log(f"Exists, skip: {path}")
        return

    # A worker thread handles one record at a time, so its id makes the name unique.
    tmp_path = f"{path}.{threading.get_ident()}.part"
    try:
        resp = requests.get(url, timeout=20, headers={
            "User-Agent": "Mozilla/5.0",
            "Referer": "http://www.cninfo.com.cn/",
        })
        if resp.status_code != 200 or not is_pdf_response(resp):
            log(f"Failed ({resp.status_code}) or not PDF: {url}")
            return
        with open(tmp_path, 'wb') as f:
            f.write(resp.content)
        os.replace(tmp_path, path)
        log(f"Downloaded: {path}")
    except Exception as e:
        # Catch-all at the thread boundary: one bad record must not kill the pool.
        log(f"Error downloading {url}: {e}")
    finally:
        # Remove a leftover partial file; after a successful replace it no longer exists.
        if os.path.exists(tmp_path):
            try:
                os.remove(tmp_path)
            except OSError:
                pass

def main():
    """Read the link workbook and download every 2024 annual report in parallel.

    Rows are kept when their "年份" is 2024 and "年报链接" is present. Each
    distinct URL is downloaded once, because several bonds of one company
    share the same report and would otherwise be fetched concurrently into
    the same file. Exceptions raised inside a worker are logged rather than
    silently discarded.
    """
    log(f"Reading Excel: {EXCEL_FILE}")
    if not os.path.exists(EXCEL_FILE):
        log("Excel file not found. 请检查路径。")
        return

    # Read the "2024年报" sheet explicitly and strip spaces from column names.
    df = pd.read_excel(EXCEL_FILE, sheet_name="2024年报")
    df.columns = [str(c).strip() for c in df.columns]

    # Compare the year numerically (tolerates 2024.0 / '2024 ').
    df["年份"] = pd.to_numeric(df["年份"], errors="coerce")
    df_2024 = df[df["年份"] == 2024].copy()

    # Rows without a link cannot be downloaded.
    df_2024 = df_2024.dropna(subset=["年报链接"])

    records = []
    seen_urls = set()
    for _, r in df_2024.iterrows():
        url = str(r["年报链接"]).strip()
        if not url or url in seen_urls:
            continue
        seen_urls.add(url)
        records.append(
            (norm_code(r["公司代码"]), str(r["公司简称"]).strip(), "2024", url)
        )

    log(f"Found {len(records)} records for 2024.")

    start = datetime.now()
    with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
        futures = {executor.submit(download_pdf, rec): rec for rec in records}
        for fut in as_completed(futures):
            try:
                # result() re-raises anything the worker did not handle itself.
                fut.result()
            except Exception as e:
                log(f"Worker failed for {futures[fut][3]}: {e}")
    elapsed = (datetime.now() - start).total_seconds()
    log(f"All downloads completed in {elapsed:.2f}s. PDFs saved to: {PDF_DIR}")

# Entry point: start the batch download when this code is executed directly.
if __name__ == '__main__':
    main()


[2025-08-09 17:45:20] Reading Excel: /Users/sam/Desktop/cninfo_output/年报链接_2024_选取公司【年报】.xlsx
[2025-08-09 17:45:21] Found 473 records for 2024.
[2025-08-09 17:45:21] Downloaded: /Users/sam/Desktop/cninfo_output/pdf_reports_2024/600567_山鹰国际_2024.pdf
[2025-08-09 17:45:21] Downloaded: /Users/sam/Desktop/cninfo_output/pdf_reports_2024/600498_烽火通信_2024.pdf
[2025-08-09 17:45:21] Downloaded: /Users/sam/Desktop/cninfo_output/pdf_reports_2024/600939_重庆建工_2024.pdf
[2025-08-09 17:45:21] Downloaded: /Users/sam/Desktop/cninfo_output/pdf_reports_2024/600326_西藏天路_2024.pdf
[2025-08-09 17:45:22] Downloaded: /Users/sam/Desktop/cninfo_output/pdf_reports_2024/600903_贵州燃气_2024.pdf
[2025-08-09 17:45:22] Downloaded: /Users/sam/Desktop/cninfo_output/pdf_reports_2024/600075_新疆天业_2024.pdf
[2025-08-09 17:45:22] Downloaded: /Users/sam/Desktop/cninfo_output/pdf_reports_2024/600029_南方航空_2024.pdf
[2025-08-09 17:45:22] Downloaded: /Users/sam/Desktop/cninfo_output/pdf_reports_2024/600496_精工钢构_2024.pdf
[2025-08-09 17:4