In [3]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import os
import pandas as pd
import efinance as ef       # pip install efinance

def main():
    """Map each convertible bond in the Desktop quote workbook to its underlying stock.

    Reads ``~/Desktop/行情与分析20250605.xlsx``, looks every bond code up in the
    table returned by ``ef.bond.get_all_base_info()``, appends the columns
    ``正股代码`` (with an exchange suffix) and ``正股名称``, and writes the result
    to ``~/Desktop/正股映射结果.xlsx``.

    Bonds that cannot be matched get empty strings in both new columns.
    """
    # 1. Locate the input and output workbooks.
    in_file  = os.path.expanduser("~/Desktop/行情与分析20250605.xlsx")
    out_file = os.path.expanduser("~/Desktop/正股映射结果.xlsx")

    # 2. Read the raw sheet; every column is read as text (dtype=str).
    df_raw = pd.read_excel(in_file, dtype=str)

    # 3. Fetch the base-info table for all convertible bonds. The columns used
    #    below are 债券代码, 正股代码 and 正股名称.
    df_all = ef.bond.get_all_base_info()

    # 4. Build a clean lookup table keyed by the suffix-less bond code.
    #    Duplicate keys are dropped so the left merge below can never emit more
    #    rows than the input sheet contains.
    mapping = pd.DataFrame({
        "bond_code":  df_all["债券代码"].astype(str).str.strip(),   # e.g. '110059'
        "stock_code": df_all["正股代码"].astype(str).str.strip(),   # e.g. '600000'
        "stock_name": df_all["正股名称"].astype(str).str.strip(),   # e.g. '浦发银行'
    }).drop_duplicates(subset="bond_code", keep="first")

    # 5. Derive the join key from the sheet: strip the '.SH' / '.SZ' suffix.
    df_keyed = df_raw.assign(
        bond_code=df_raw["代码"].str.split(".").str[0].str.strip()
    )

    # 6. Left merge: keep every input row, attach stock info where available.
    df_merged = df_keyed.merge(mapping, on="bond_code", how="left")

    # 7. Build the final columns: suffixed stock code + stock name.
    def add_suffix(code: str) -> str:
        """Return ``code`` with '.SH' or '.SZ' appended, or '' for non-numeric input."""
        if not isinstance(code, str) or not code.isdigit():
            return ""
        # NOTE(review): every code not starting with 60/68 is labelled .SZ;
        # confirm no other exchange can appear in the source data.
        return code + (".SH" if code.startswith(("60", "68")) else ".SZ")

    df_merged["正股代码"] = df_merged["stock_code"].apply(add_suffix)
    df_merged["正股名称"] = df_merged["stock_name"].fillna("")

    # 8. Drop the helper columns and save.
    df_final = df_merged.drop(columns=["bond_code", "stock_code", "stock_name"])
    df_final.to_excel(out_file, index=False, engine="openpyxl")

    print("✅ 正股映射已完成，结果保存在：", out_file)

# Run the bond-to-stock mapping when this cell/script is executed directly.
if __name__ == "__main__":
    main()


✅ 正股映射已完成，结果保存在： /Users/sam/Desktop/正股映射结果.xlsx


In [4]:
import requests
import re
import openpyxl
import time
import os

# All generated files go into a `cninfo_output` folder on the Desktop;
# the folder is created if it does not exist yet.
output_dir = os.path.expanduser("~/Desktop/cninfo_output")
os.makedirs(output_dir, exist_ok=True)

# Tag embedded in the name of the full link workbook (年报链接_<year><GZH>.xlsx).
GZH = "【年报】"

def get_report(page_num, date, timeout=30):
    """POST one page of the cninfo annual-report announcement query.

    Args:
        page_num: Page index sent as the ``pageNum`` form field.
        date: Date range sent as ``seDate``; callers build it as
            ``"YYYY-MM-DD~YYYY-MM-DD"``.
        timeout: Seconds to wait for the server before ``requests`` raises.
            Without a timeout a stalled connection blocks the run forever.

    Returns:
        The ``requests.Response`` of the query (not yet status-checked).

    Notes:
        The module-level names ``plate`` and ``trade`` are read at call time
        and must be defined before the first call.
    """
    url = "http://www.cninfo.com.cn/new/hisAnnouncement/query"
    # No Content-Length here: requests derives it from the encoded form body,
    # and a hard-coded value would not match once any field changes.
    headers = {
        "Accept": "*/*",
        "Accept-Encoding": "gzip, deflate",
        "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6",
        "Content-Type": "application/x-www-form-urlencoded; charset=UTF-8",
        "Host": "www.cninfo.com.cn",
        "Origin": "http://www.cninfo.com.cn",
        "Proxy-Connection": "keep-alive",
        "Referer": "http://www.cninfo.com.cn/new/commonUrl/pageOfSearch?url=disclosure/list/search&checkedCategory=category_ndbg_szsh",
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36 Edg/113.0.1774.42",
        "X-Requested-With": "XMLHttpRequest"
    }
    data = {
        "pageNum": page_num,
        "pageSize": 30,
        "column": "",
        "tabName": "fulltext",
        "plate": plate,
        "searchkey": "",
        "secid": "",
        "category": "category_ndbg_szsh",
        "trade": trade,
        "seDate": date,
        "sortName": "code",
        "sortType": "asc",
        "isHLtitle": "false"
    }
    return requests.post(url, data=data, headers=headers, timeout=timeout)


def download_report(date):
    """Fetch every announcement page for one date range.

    A first request reads ``totalpages``; pages 1 .. totalpages + 1 are then
    requested one by one, each with up to four attempts (5 s pause after every
    failed attempt). The module-level ``counter`` is incremented once per page
    and drives the progress line that is printed.

    Args:
        date: Date range string passed through to ``get_report``.

    Returns:
        A list with the ``announcements`` entries of all pages that could be read.
    """
    global counter
    collected = []
    page_num = 1
    probe = get_report(page_num, date)

    try:
        total_pages = probe.json().get("totalpages", 0)
    except (ValueError, KeyError) as e:
        print(f"获取总页数失败: {e}")
        return collected
    if total_pages == 0:
        return collected

    max_retries = 3
    last_page = total_pages + 1
    # NOTE(review): one page beyond `totalpages` is requested — confirm this
    # matches how the API counts pages.
    while page_num <= last_page:
        for _attempt in range(max_retries + 1):
            try:
                reply = get_report(page_num, date)
                reply.raise_for_status()
                payload = reply.json()
                rows = payload.get("announcements")
                if not rows:
                    # Empty page: nothing to add, move on to the next page.
                    break
                collected.extend(rows)
                if total_pages > 0:
                    per = counter / total_pages
                    if per < 1:
                        print(f"\r当前年份下载进度 {per * 100:.2f}%", end='')
                    else:
                        print(f"\r下载完成，正在保存……", end='')
                else:
                    print("无法计算下载进度，总页数为0。")
                break
            except Exception as e:
                print(f"第{page_num}页错误: {e}，5秒后重试...")
                time.sleep(5)
        page_num += 1
        counter += 1
    return collected


def main(year):
    """Collect annual-report announcements for fiscal year ``year`` and save them to Excel.

    The reports are searched in the publication windows of ``year + 1``. Titles
    containing an excluded keyword are removed, one announcement per
    (company code, year-in-title) is kept, and the rows are written to
    ``<output_dir>/年报链接_<year><GZH>.xlsx``.

    Args:
        year: Fiscal year of the reports (int).

    Notes:
        Reads the module-level names ``exclude_keywords``, ``output_dir`` and
        ``GZH``; they must be defined before the call.
    """
    global sum
    report_year = year  # kept unchanged for the output name and fallback year
    date_count = f"{report_year}-01-01~{report_year}-12-31"
    response = get_report(1, date_count)
    data = response.json()
    # NOTE(review): this rebinds the module-level name `sum`, shadowing the
    # builtin for the rest of the session. Nothing visible here reads it;
    # kept only to preserve the original side effect.
    sum = data.get("totalpages", 0)

    # Query the following year, split into short windows.
    publish_year = report_year + 1
    time_segments = [
        f"{publish_year}-01-01~{publish_year}-04-01",
        f"{publish_year}-04-02~{publish_year}-04-15",
        f"{publish_year}-04-16~{publish_year}-04-22",
        f"{publish_year}-04-23~{publish_year}-04-26",
        f"{publish_year}-04-27~{publish_year}-04-28",
        f"{publish_year}-04-29~{publish_year}-04-30",
        f"{publish_year}-05-01~{publish_year}-07-31",
        f"{publish_year}-08-01~{publish_year}-10-31",
        f"{publish_year}-11-01~{publish_year}-11-30",
        f"{publish_year}-12-01~{publish_year}-12-31"
    ]
    all_results = []
    for segment in time_segments:
        all_results.extend(download_report(segment))

    def filter_latest_versions(ans):
        """Keep one item per (secCode, year in title), preferring a revised title."""
        keywords = ['更正','更正后','更正版','修订后','修订版','更新后','更新版']
        latest = {}
        for item in ans:
            code = item['secCode']
            m = re.search(r"(\d{4})年", item['announcementTitle'])
            yr = m.group(1) if m else ''
            key = (code, yr)
            title = item['announcementTitle']
            if key not in latest or (any(kw in title for kw in keywords) and not any(kw in latest[key]['announcementTitle'] for kw in keywords)):
                latest[key] = item
        return list(latest.values())

    # Drop unwanted titles BEFORE de-duplicating. Otherwise an excluded title
    # (e.g. the English version) can win its (code, year) slot and then be
    # discarded when writing, leaving the company with no row at all.
    unwanted = set(exclude_keywords) | {"摘要"}
    all_results = [
        item for item in all_results
        if not any(kw in item["announcementTitle"] for kw in unwanted)
    ]
    all_results = filter_latest_versions(all_results)

    workbook = openpyxl.Workbook()
    ws = workbook.active
    ws.title = "年报"
    ws.append(["公司代码", "公司简称", "标题", "年份", "年报链接"])
    for item in all_results:
        company_code = item['secCode']
        company_name = item['secName']
        # Strip markup and full-width colons from the title.
        title = re.sub(r"<.*?>", "", item['announcementTitle'].strip()).replace('：','')
        title = f"《{title}》"
        m = re.search(r"(\d{4})年", title)
        # Fall back to the requested fiscal year, not a global setting.
        year_str = m.group(1) if m else report_year
        url = f"http://static.cninfo.com.cn/{item['adjunctUrl']}"
        # Second check on the cleaned title, in case markup split a keyword.
        if not any(kw in title for kw in exclude_keywords):
            ws.append([company_code, company_name, title, year_str, url])

    # Name the file after the requested year so that a multi-year loop writes
    # one workbook per year, which is the name filter_excel(year) looks for.
    save_path = os.path.join(output_dir, f"年报链接_{report_year}{GZH}.xlsx")
    workbook.save(save_path)
    print(f"已保存到 {save_path}")

if __name__ == '__main__':
    # pandas reads the mapping workbook below. Import it here so this cell
    # does not depend on an earlier cell having imported it.
    import pandas as pd

    # Titles containing any of these keywords are not written to the workbook.
    # '更正后' / '修订版' may be added to avoid duplicates or revised reports
    # that a company published for earlier years.
    exclude_keywords = ['英文','已取消','摘要']

    # Industry filter sent as the `trade` form field; empty means no filter.
    # Use values verbatim from this list, separated by ASCII semicolons:
    # "农、林、牧、渔业;电力、热力、燃气及水生产和供应业;建筑业;采矿业;制造业;批发和零售业;交通运输、仓储和邮政业;住宿和餐饮业;信息传输、软件和信息技术服务业;金融业;房地产业;租赁和商务服务业;科学研究和技术服务业;水利、环境和公共设施管理业;居民服务、修理和其他服务业;教育;卫生和社会工作;文化、体育和娱乐业;综合"
    trade = ""

    # Board filter sent as the `plate` form field, semicolon separated:
    # sz = Shenzhen, sh = Shanghai, szmb = Shenzhen main board,
    # shmb = Shanghai main board, szcy = ChiNext, shkcp = STAR Market,
    # bj = Beijing Stock Exchange.
    plate = "sz;sh"

    # Companies to keep, taken from the mapping workbook written by the first
    # cell. secCode holds bare numeric codes (exchange suffix removed), secName
    # the company short names. Empty cells are dropped so the lists hold only
    # real values; leaving both lists empty disables the filtering step.
    mapping_file = os.path.expanduser("~/Desktop/正股映射结果.xlsx")
    df_map = pd.read_excel(mapping_file, dtype=str)
    secCode = df_map["正股代码"].dropna().str.split(".").str[0].tolist()
    secName = df_map["正股名称"].dropna().tolist()

    counter = 1      # page counter advanced by download_report
    setYear = 2024   # fiscal year to download (last year when Flag is set)
    Flag = 0         # truthy: download every year from 2020 through setYear

    def filter_excel(year):
        """Copy the rows of the selected companies into 年报链接_<year>_选取公司.xlsx.

        Reads ``年报链接_<year><GZH>.xlsx`` from ``output_dir`` and keeps the
        header plus every row whose code (column 1) is in ``secCode`` or whose
        name (column 2) is in ``secName``. Does nothing when both lists are
        empty. Any failure is reported and skipped.
        """
        if not secCode and not secName:
            return
        # Sets make each membership test constant-time.
        wanted_codes = set(secCode)
        wanted_names = set(secName)
        try:
            src = os.path.join(output_dir, f"年报链接_{year}{GZH}.xlsx")
            wb = openpyxl.load_workbook(src)
            ws = wb.active
            rows = list(ws.iter_rows(values_only=True))
            header = rows[0]
            filtered = [header] + [
                r for r in rows[1:]
                if str(r[0]) in wanted_codes or str(r[1]) in wanted_names
            ]
            wb_new = openpyxl.Workbook()
            ws_new = wb_new.active
            for r in filtered:
                ws_new.append(r)
            dst = os.path.join(output_dir, f"年报链接_{year}_选取公司.xlsx")
            wb_new.save(dst)
            print(f"过滤完成: {dst} 已生成")
        except Exception as e:
            # Deliberate best-effort: the full workbook is already saved.
            print(f"过滤失败: {e}，已跳过")

    if Flag:
        for y in range(2020, setYear+1):
            counter = 1
            main(y)
            filter_excel(y)
            print(f"----{y}年下载完成")
    else:
        main(setYear)
        filter_excel(setYear)
        print(f"----{setYear}年下载完成")


下载完成，正在保存…….61%

KeyboardInterrupt: 

In [8]:
import requests
import re
import openpyxl
import time
import os
import pandas as pd

# All generated files go into a `cninfo_output` folder on the Desktop;
# the folder is created if it does not exist yet.
output_dir = os.path.expanduser("~/Desktop/cninfo_output")
os.makedirs(output_dir, exist_ok=True)

# Tag embedded in the name of the full link workbook (年报链接_<year><GZH>.xlsx).
GZH = "【年报】"

def get_report(page_num, date, timeout=30):
    """POST one page of the cninfo annual-report announcement query.

    Args:
        page_num: Page index sent as the ``pageNum`` form field.
        date: Date range sent as ``seDate``; callers build it as
            ``"YYYY-MM-DD~YYYY-MM-DD"``.
        timeout: Seconds to wait for the server before ``requests`` raises.
            Without a timeout a stalled connection blocks the run forever.

    Returns:
        The ``requests.Response`` of the query (not yet status-checked).

    Notes:
        The module-level names ``column``, ``plate`` and ``trade`` are read at
        call time and must be defined before the first call.
    """
    url = "http://www.cninfo.com.cn/new/hisAnnouncement/query"
    # No Content-Length here: requests derives it from the encoded form body,
    # and a hard-coded value would not match once any field changes.
    headers = {
        "Accept": "*/*",
        "Accept-Encoding": "gzip, deflate",
        "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6",
        "Content-Type": "application/x-www-form-urlencoded; charset=UTF-8",
        "Host": "www.cninfo.com.cn",
        "Origin": "http://www.cninfo.com.cn",
        "Proxy-Connection": "keep-alive",
        "Referer": "http://www.cninfo.com.cn/new/commonUrl/pageOfSearch?url=disclosure/list/search&checkedCategory=category_ndbg_szsh",
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36 Edg/113.0.1774.42",
        "X-Requested-With": "XMLHttpRequest"
    }
    data = {
        "pageNum": page_num,
        "pageSize": 30,
        "column": column,
        "tabName": "fulltext",
        "plate": plate,
        "searchkey": "",
        "secid": "",
        "category": "category_ndbg_szsh",
        "trade": trade,
        "seDate": date,
        "sortName": "code",
        "sortType": "asc",
        "isHLtitle": "false"
    }
    return requests.post(url, data=data, headers=headers, timeout=timeout)


def download_report(date):
    """Fetch every announcement page for one date range.

    A first request reads ``totalpages``; pages 1 .. totalpages are then
    requested one by one with up to three attempts each. The module-level
    ``counter`` is incremented once per page.

    Args:
        date: Date range string passed through to ``get_report``.

    Returns:
        A list with the ``announcements`` entries of all pages that could be
        read. Pages that still fail after the last attempt are reported and
        skipped.
    """
    global counter
    all_results = []
    page_num = 1
    resp = get_report(page_num, date)
    try:
        total_pages = resp.json().get("totalpages", 0)
    except Exception as e:
        # Was a bare `except:`, which also swallowed KeyboardInterrupt.
        print(f"获取总页数失败: {e}")
        return all_results
    if not total_pages:
        # Covers 0 and a null/missing value alike.
        return all_results

    max_retries = 3
    # NOTE(review): the earlier variant of this function in the same notebook
    # loops to total_pages + 1 — confirm which bound matches the API.
    while page_num <= total_pages:
        for attempt in range(1, max_retries + 1):
            try:
                resp = get_report(page_num, date)
                resp.raise_for_status()
                data = resp.json()
                rows = data.get("announcements")
                if rows:
                    all_results.extend(rows)
                break
            except Exception as e:
                print(f"第{page_num}页错误: {e}，5秒后重试...")
                if attempt < max_retries:
                    time.sleep(5)
        else:
            # Every attempt raised: report the gap instead of dropping it silently.
            print(f"第{page_num}页重试{max_retries}次后仍失败，已跳过")
        page_num += 1
        counter += 1
    return all_results


# ----- 全局去重函数 -----
def filter_latest_versions(ans):
    """Keep one announcement per (company code, year found in the title).

    The first announcement seen for a key is kept. It is replaced only when a
    later one carries a revision marker in its title and the stored one does
    not.

    Args:
        ans: Iterable of dicts with at least ``secCode`` and ``announcementTitle``.

    Returns:
        List of the retained dicts, ordered by first appearance of each key.
    """
    revision_markers = ('更正', '更正后', '更正版', '修订后', '修订版', '更新后', '更新版')
    year_pattern = re.compile(r"(\d{4})年")

    def is_revision(text):
        for marker in revision_markers:
            if marker in text:
                return True
        return False

    chosen = {}
    for entry in ans:
        company = entry['secCode']
        heading = entry['announcementTitle']
        found = year_pattern.search(heading)
        slot = (company, found.group(1) if found else '')
        if slot in chosen:
            # Only a revised title may displace a non-revised one.
            if not is_revision(heading):
                continue
            if is_revision(chosen[slot]['announcementTitle']):
                continue
        chosen[slot] = entry
    return list(chosen.values())


# ----- 只抓数据，返回所有公告列表 -----
def download_reports_for_year(year):
    """Download and de-duplicate the annual-report announcements for fiscal year ``year``.

    The reports are searched in the publication windows of ``year + 1``. Titles
    containing "摘要" or any keyword from the module-level ``exclude_keywords``
    (when defined) are removed first; ``filter_latest_versions`` then keeps
    one announcement per company and year.

    Args:
        year: Fiscal year of the reports (int).

    Returns:
        List of announcement dicts.
    """
    # The original issued an extra full-year query here and discarded the
    # result; it is dropped because it only added a network round trip.
    publish_year = year + 1
    time_segments = [
        f"{publish_year}-01-01~{publish_year}-04-01",
        f"{publish_year}-04-02~{publish_year}-04-15",
        f"{publish_year}-04-16~{publish_year}-04-22",
        f"{publish_year}-04-23~{publish_year}-04-26",
        f"{publish_year}-04-27~{publish_year}-04-28",
        f"{publish_year}-04-29~{publish_year}-04-30",
        f"{publish_year}-05-01~{publish_year}-07-31",
        f"{publish_year}-08-01~{publish_year}-10-31",
        f"{publish_year}-11-01~{publish_year}-11-30",
        f"{publish_year}-12-01~{publish_year}-12-31"
    ]
    all_results = []
    for segment in time_segments:
        all_results.extend(download_report(segment))

    # Remove unwanted titles BEFORE de-duplicating. Otherwise an excluded title
    # (e.g. the English version) can win its (code, year) slot and then be
    # discarded when the workbook is written, leaving the company with no row.
    unwanted = {"摘要", *globals().get("exclude_keywords", ())}
    results = [
        item for item in all_results
        if not any(kw in item["announcementTitle"] for kw in unwanted)
    ]
    return filter_latest_versions(results)


# ----- 把公告列表写到 Excel -----
def write_to_excel(results, filepath):
    """Write announcement records to a single-sheet workbook.

    Each record becomes one row (code, name, cleaned title wrapped in 《》,
    year taken from the title or '' if none, PDF link). Records whose cleaned
    title contains any keyword of the module-level ``exclude_keywords`` are
    left out.

    Args:
        results: Iterable of announcement dicts with ``secCode``, ``secName``,
            ``announcementTitle`` and ``adjunctUrl``.
        filepath: Destination path of the .xlsx file.
    """
    markup = re.compile(r"<.*?>")
    year_in_title = re.compile(r"(\d{4})年")

    book = openpyxl.Workbook()
    sheet = book.active
    sheet.title = "年报"
    sheet.append(["公司代码", "公司简称", "标题", "年份", "年报链接"])

    for entry in results:
        code = entry["secCode"]
        name = entry["secName"]
        plain = markup.sub("", entry["announcementTitle"]).replace("：", "")
        title = f"《{plain}》"
        hit = year_in_title.search(title)
        year_str = hit.group(1) if hit else ""
        url = f"http://static.cninfo.com.cn/{entry['adjunctUrl']}"
        if any(kw in title for kw in exclude_keywords):
            continue
        sheet.append([code, name, title, year_str, url])

    book.save(filepath)
    print(f"已保存：{filepath}")


if __name__ == '__main__':
    # Titles containing any of these keywords are skipped by write_to_excel.
    exclude_keywords = ['英文','已取消','摘要']
    # Industry filter sent by get_report as the `trade` form field (empty here).
    trade = ""
    # Companies to keep: codes (exchange suffix removed) and names from the
    # mapping workbook on the Desktop.
    mapping_file = os.path.expanduser("~/Desktop/正股映射结果.xlsx")
    df_map = pd.read_excel(mapping_file, dtype=str)
    secCode = df_map["正股代码"].str.split(".").str[0].tolist()
    secName = df_map["正股名称"].tolist()

    # `counter` is incremented by download_report once per requested page.
    global counter
    counter = 1
    # Fiscal year to download.
    setYear = 2024

    def filter_excel(year):
        """Copy the rows of the selected companies into 年报链接_<year>_选取公司.xlsx.

        Reads ``年报链接_<year><GZH>.xlsx`` from ``output_dir`` and keeps the
        header plus every row whose first column is in ``secCode`` or whose
        second column is in ``secName``.
        """
        src = os.path.join(output_dir, f"年报链接_{year}{GZH}.xlsx")
        wb = openpyxl.load_workbook(src)
        ws = wb.active
        rows = list(ws.iter_rows(values_only=True))
        header = rows[0]
        filtered = [header] + [
            r for r in rows[1:]
            if str(r[0]) in secCode or str(r[1]) in secName
        ]
        wb_new = openpyxl.Workbook()
        ws_new = wb_new.active
        for r in filtered:
            ws_new.append(r)
        dst = os.path.join(output_dir, f"年报链接_{year}_选取公司.xlsx")
        wb_new.save(dst)
        print(f"过滤完成: {dst}")

    # Download the announcements for each `column` value (Shenzhen, then
    # Shanghai) and merge them into one list. `column` and `plate` are
    # module-level names that get_report reads on every request, so they
    # must be assigned before download_reports_for_year is called.
    combined = []
    for column in ("szse", "shse"):
        plate = ""  # no board restriction
        batch = download_reports_for_year(setYear)
        combined.extend(batch)

    full_path = os.path.join(output_dir, f"年报链接_{setYear}{GZH}.xlsx")
    write_to_excel(combined, full_path)

    # Then keep only the companies listed in the mapping workbook.
    filter_excel(setYear)
    print(f"----{setYear}年下载完成，完整链接保存在 {full_path}")


已保存：/Users/sam/Desktop/cninfo_output/年报链接_2024【年报】.xlsx
过滤完成: /Users/sam/Desktop/cninfo_output/年报链接_2024_选取公司.xlsx
----2024年下载完成，完整链接保存在 /Users/sam/Desktop/cninfo_output/年报链接_2024【年报】.xlsx


In [None]:
import pandas as pd
import requests
import os
import re
import logging
from datetime import datetime
from concurrent.futures import ThreadPoolExecutor, as_completed

# Logging configuration: root logger at INFO with a "[time] message" format.
# NOTE(review): log() below prints directly and does not go through the
# logging module, so this configuration does not affect its output.
logging.basicConfig(level=logging.INFO, format='[%(asctime)s] %(message)s')

def log(msg):
    """Print ``msg`` prefixed with the current local time as ``[YYYY-MM-DD HH:MM:SS]``."""
    stamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    print("[{}] {}".format(stamp, msg))

# Configuration
BASE_DIR    = os.path.expanduser("~/Desktop/cninfo_output")
EXCEL_FILE  = os.path.join(BASE_DIR, "年报链接_2024_选取公司.xlsx")
PDF_DIR     = os.path.join(BASE_DIR, "pdf_reports_2024")
MAX_WORKERS = 16  # number of download threads; tune to the network

# Make sure the PDF output directory exists.
os.makedirs(PDF_DIR, exist_ok=True)

# 下载单个 PDF
def download_pdf(record):
    """Download one annual-report PDF into ``PDF_DIR``.

    Args:
        record: ``(code, name, year, url)`` tuple. The file is saved as
            ``<code>_<name>_<year>.pdf`` with characters that are invalid in
            file names removed.

    The download is skipped when the target file already exists. Errors are
    logged, never raised.
    """
    code, name, year, url = record
    safe = re.sub(r'[\\/:*?"<>|]', '', f"{code}_{name}_{year}")
    path = os.path.join(PDF_DIR, f"{safe}.pdf")
    if os.path.exists(path):
        log(f"Exists, skip: {path}")
        return
    # Write to a side file first and rename on success. Any file found at
    # `path` is treated as complete by the check above, so a write that was
    # interrupted half-way must never be left under the final name.
    tmp_path = f"{path}.part"
    try:
        resp = requests.get(url, timeout=15)
        content_type = resp.headers.get("Content-Type", "").lower()
        if resp.status_code == 200 and "pdf" in content_type:
            with open(tmp_path, 'wb') as f:
                f.write(resp.content)
            os.replace(tmp_path, path)
            log(f"Downloaded: {path}")
        else:
            log(f"Failed ({resp.status_code}) or not PDF: {url}")
    except Exception as e:
        log(f"Error downloading {url}: {e}")
    finally:
        # After a successful rename the side file no longer exists.
        if os.path.exists(tmp_path):
            try:
                os.remove(tmp_path)
            except OSError:
                # Best-effort cleanup; a stale .part file is never mistaken
                # for a finished download.
                pass

# 主流程
def main():
    """Download the 2024 annual-report PDFs listed in ``EXCEL_FILE``.

    Reads the workbook, keeps the rows whose 年份 column equals '2024', and
    hands each (code, name, year, url) tuple to ``download_pdf`` on a pool of
    ``MAX_WORKERS`` threads. Progress and total time are reported via ``log``.
    """
    log(f"Reading Excel: {EXCEL_FILE}")
    if not os.path.exists(EXCEL_FILE):
        log("Excel file not found. 请检查路径。")
        return

    sheet = pd.read_excel(EXCEL_FILE)
    records = []
    for _, row in sheet.iterrows():
        if str(row.get('年份')) != '2024':
            continue
        records.append((row['公司代码'], row['公司简称'], row['年份'], row['年报链接']))
    log(f"Found {len(records)} records for 2024.")

    started = datetime.now()
    with ThreadPoolExecutor(max_workers=MAX_WORKERS) as pool:
        pending = []
        for rec in records:
            pending.append(pool.submit(download_pdf, rec))
        # Drain the futures so the timing below covers every download.
        for _done in as_completed(pending):
            continue
    elapsed = (datetime.now() - started).total_seconds()
    log(f"All downloads completed in {elapsed:.2f}s. PDFs saved to: {PDF_DIR}")

# Start the PDF download when this cell/script is executed directly.
if __name__ == '__main__':
    main()
