In [1]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import os
import pandas as pd
import efinance as ef       # pip install efinance

def main():
    """Attach underlying-stock code and name to every bond row of the input workbook.

    Reads the quote workbook from the Desktop, fetches the convertible-bond
    base-info table through ``ef.bond.get_all_base_info()``, left-joins the two
    on the suffix-free bond code, and writes the original rows plus ``正股代码``
    (with exchange suffix) and ``正股名称`` to the output workbook.
    """
    # 1. Input and output workbooks.
    in_file  = os.path.expanduser("~/Desktop/行情与分析20250605.xlsx")
    out_file = os.path.expanduser("~/Desktop/正股映射结果.xlsx")

    # 2. Read the raw workbook with every column as str.
    df_raw = pd.read_excel(in_file, dtype=str)

    # 3. Base info for all convertible bonds (bond code/name, stock code/name, ...).
    df_all = ef.bond.get_all_base_info()

    # 4. Normalised lookup table: one row per bond code. Duplicate bond codes
    #    would multiply rows in the left merge below, so only the first is kept.
    lookup = pd.DataFrame({
        "bond_code":  df_all["债券代码"].astype(str).str.strip(),   # e.g. '110059'
        "stock_code": df_all["正股代码"].astype(str).str.strip(),   # e.g. '600000'
        "stock_name": df_all["正股名称"].astype(str).str.strip(),   # e.g. '浦发银行'
    }).drop_duplicates(subset="bond_code", keep="first")

    # 5. Bond code of each input row without the '.SH' / '.SZ' suffix.
    df_raw["bond_code"] = df_raw["代码"].str.split(".").str[0]

    # 6. Left merge keeps every input row, matched or not.
    df_merged = df_raw.merge(lookup, on="bond_code", how="left")

    # 7. Final columns: suffixed stock code and stock name.
    def add_suffix(code: str) -> str:
        """Return ``code`` plus '.SH' for 60*/68* codes, '.SZ' otherwise; '' if not all digits."""
        if not isinstance(code, str) or not code.isdigit():
            return ""
        return code + (".SH" if code.startswith(("60", "68")) else ".SZ")

    df_merged["正股代码"] = df_merged["stock_code"].apply(add_suffix)
    df_merged["正股名称"] = df_merged["stock_name"].fillna("")

    # 8. Drop the helper columns and save.
    df_final = df_merged.drop(columns=["bond_code", "stock_code", "stock_name"])
    df_final.to_excel(out_file, index=False, engine="openpyxl")

    print("✅ 正股映射已完成，结果保存在：", out_file)

# Run the mapping only when this code is the entry module (not when imported).
if __name__ == "__main__":
    main()


  from .autonotebook import tqdm as notebook_tqdm


✅ 正股映射已完成，结果保存在： /Users/sam/Desktop/正股映射结果.xlsx


In [3]:
import requests
import re
import openpyxl
import time
import os
import pandas as pd

# All outputs go to the cninfo_output folder on the Desktop (created if missing).
output_dir = os.path.expanduser("~/Desktop/cninfo_output")
os.makedirs(output_dir, exist_ok=True)

# Tag embedded in the full link-table file name: 年报链接_{year}{GZH}.xlsx
GZH = "【年报】"

def get_report(page_num, date, timeout=30):
    """POST one page of the cninfo historical-announcement query.

    The market filters come from the module-level ``column``, ``plate`` and
    ``trade`` variables, which must be set before calling.

    Args:
        page_num: Page index sent as ``pageNum`` (callers here start at 1).
        date: Date window sent as ``seDate``, e.g. ``"2025-01-01~2025-04-01"``.
        timeout: Seconds to wait for the server; requests never times out
            unless a value is given, so a stalled connection would hang the run.

    Returns:
        The ``requests.Response``; neither status nor JSON is checked here.
    """
    url = "http://www.cninfo.com.cn/new/hisAnnouncement/query"
    # Content-Length is intentionally not set: requests derives it from the
    # encoded form body, and a hard-coded value does not track the payload.
    headers = {
        "Accept": "*/*",
        "Accept-Encoding": "gzip, deflate",
        "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6",
        "Content-Type": "application/x-www-form-urlencoded; charset=UTF-8",
        "Host": "www.cninfo.com.cn",
        "Origin": "http://www.cninfo.com.cn",
        "Proxy-Connection": "keep-alive",
        "Referer": "http://www.cninfo.com.cn/new/commonUrl/pageOfSearch?url=disclosure/list/search&checkedCategory=category_ndbg_szsh",
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36 Edg/113.0.1774.42",
        "X-Requested-With": "XMLHttpRequest"
    }
    data = {
        "pageNum": page_num,
        "pageSize": 30,
        "column": column,
        "tabName": "fulltext",
        "plate": plate,
        "searchkey": "",
        "secid": "",
        "category": "category_ndbg_szsh",
        "trade": trade,
        "seDate": date,
        "sortName": "code",
        "sortType": "asc",
        "isHLtitle": "false"
    }
    return requests.post(url, data=data, headers=headers, timeout=timeout)


def download_report(date):
    """Collect every announcement in one ``seDate`` window, page by page.

    Page 1 is requested once and supplies both ``totalpages`` and its own
    rows; pages 2..totalpages follow. Each page is tried up to three times.
    The module-level ``counter`` is incremented once per page processed.

    Args:
        date: Date window passed through to ``get_report``.

    Returns:
        List of announcement dicts (empty if page 1 cannot be fetched/decoded).
    """
    global counter
    all_results = []
    max_retries = 3

    def fetch_page(page_num):
        """Return the decoded JSON of one page, or None once retries run out."""
        last_error = None
        for attempt in range(1, max_retries + 1):
            try:
                resp = get_report(page_num, date)
                resp.raise_for_status()
                return resp.json()
            except Exception as exc:
                last_error = exc
                if attempt < max_retries:
                    time.sleep(5)
        # Make the data loss visible instead of dropping the page silently.
        print(f"[download_report] {date} page {page_num} skipped after {max_retries} attempts: {last_error!r}")
        return None

    first = fetch_page(1)
    if not isinstance(first, dict):
        return all_results

    # Rows already in hand are kept whatever `totalpages` says.
    # NOTE(review): if the API reports totalpages=0 for a result set that fits
    # on one page, this is what keeps those rows — confirm on a live response.
    all_results.extend(first.get("announcements") or [])
    total_pages = first.get("totalpages") or 0

    for page_num in range(1, total_pages + 1):
        if page_num > 1:
            page = fetch_page(page_num)
            if isinstance(page, dict):
                all_results.extend(page.get("announcements") or [])
        counter += 1
    return all_results


# ----- 全局去重函数 -----
def filter_latest_versions(ans):
    """Keep one announcement per (secCode, 4-digit year found in the title).

    The first announcement seen for a key is kept; it is replaced only by the
    first later one whose title contains a revision keyword while the kept
    title contains none. Output follows first-seen key order.
    """
    revision_markers = ('更正', '更正后', '更正版', '修订后', '修订版', '更新后', '更新版')
    year_pattern = re.compile(r"(\d{4})年")

    def is_revision(text):
        return any(marker in text for marker in revision_markers)

    kept = {}
    for entry in ans:
        sec = entry['secCode']
        heading = entry['announcementTitle']
        found = year_pattern.search(heading)
        bucket = (sec, found.group(1) if found else '')
        if bucket not in kept:
            kept[bucket] = entry
            continue
        if is_revision(heading) and not is_revision(kept[bucket]['announcementTitle']):
            kept[bucket] = entry
    return list(kept.values())


# ----- 只抓数据，返回所有公告列表 -----
def download_reports_for_year(year):
    """Collect the announcements published in ``year + 1`` and de-duplicate them.

    All query windows lie in the calendar year after ``year``. The windows are
    much narrower in late April (presumably to keep each query small).
    Titles containing "摘要" are dropped before ``filter_latest_versions``.

    The previous version first issued a ``year``-wide request and discarded the
    result; that dead call cost a round-trip and could raise on a bad reply,
    so it has been removed.

    Args:
        year: Integer year; the queries cover ``year + 1``.

    Returns:
        List of announcement dicts, one per (company, title year).
    """
    publish_year = year + 1
    time_segments = [
        f"{publish_year}-01-01~{publish_year}-04-01",
        f"{publish_year}-04-02~{publish_year}-04-15",
        f"{publish_year}-04-16~{publish_year}-04-22",
        f"{publish_year}-04-23~{publish_year}-04-26",
        f"{publish_year}-04-27~{publish_year}-04-28",
        f"{publish_year}-04-29~{publish_year}-04-30",
        f"{publish_year}-05-01~{publish_year}-07-31",
        f"{publish_year}-08-01~{publish_year}-10-31",
        f"{publish_year}-11-01~{publish_year}-11-30",
        f"{publish_year}-12-01~{publish_year}-12-31"
    ]
    all_results = []
    for segment in time_segments:
        all_results.extend(download_report(segment))

    # Drop summaries, then keep one version per company and year.
    results = [item for item in all_results if "摘要" not in item["announcementTitle"]]
    return filter_latest_versions(results)


# ----- 把公告列表写到 Excel -----
def write_to_excel(results, filepath):
    """Write announcements to a one-sheet workbook at ``filepath``.

    Each row holds stock code, short name, the title (markup tags and '：'
    removed, wrapped in 《》), the first 4-digit year in the title, and the
    download URL. Rows whose title contains any of the module-level
    ``exclude_keywords`` are left out.
    """
    book = openpyxl.Workbook()
    sheet = book.active
    sheet.title = "年报"
    sheet.append(["公司代码", "公司简称", "标题", "年份", "年报链接"])

    tag_pattern = re.compile(r"<.*?>")
    for record in results:
        sec_code = record["secCode"]
        sec_name = record["secName"]
        stripped = tag_pattern.sub("", record["announcementTitle"]).replace("：", "")
        wrapped = f"《{stripped}》"
        year_hit = re.search(r"(\d{4})年", wrapped)
        year_text = "" if year_hit is None else year_hit.group(1)
        link = f"http://static.cninfo.com.cn/{record['adjunctUrl']}"
        if any(kw in wrapped for kw in exclude_keywords):
            continue
        sheet.append([sec_code, sec_name, wrapped, year_text, link])

    book.save(filepath)
    print(f"已保存：{filepath}")


if __name__ == '__main__':
    # Title keywords that keep a row out of the workbook (read by write_to_excel).
    exclude_keywords = ['英文','已取消','摘要']
    trade = ""
    # Underlying-stock codes and names taken from the bond mapping workbook.
    mapping_file = os.path.expanduser("~/Desktop/正股映射结果.xlsx")
    df_map = pd.read_excel(mapping_file, dtype=str)
    secCode = df_map["正股代码"].str.split(".").str[0].tolist()
    secName = df_map["正股名称"].tolist()

    global counter
    counter = 1
    setYear = 2024

    def filter_excel(year):
        """Copy the rows of the full link table whose code or name is in the mapping.

        Reads 年报链接_{year}{GZH}.xlsx from ``output_dir`` and writes
        年报链接_{year}_选取公司.xlsx next to it (header row always kept).
        """
        # Sets make each membership test O(1) instead of scanning a list per
        # row. Non-string entries (blank cells) can never equal a str() value,
        # so leaving them out does not change which rows match.
        wanted_codes = {c for c in secCode if isinstance(c, str)}
        wanted_names = {n for n in secName if isinstance(n, str)}

        src = os.path.join(output_dir, f"年报链接_{year}{GZH}.xlsx")
        wb = openpyxl.load_workbook(src)
        ws = wb.active
        rows = list(ws.iter_rows(values_only=True))
        header = rows[0]
        filtered = [header] + [
            r for r in rows[1:]
            if str(r[0]) in wanted_codes or str(r[1]) in wanted_names
        ]
        wb_new = openpyxl.Workbook()
        ws_new = wb_new.active
        for r in filtered:
            ws_new.append(r)
        dst = os.path.join(output_dir, f"年报链接_{year}_选取公司.xlsx")
        wb_new.save(dst)
        print(f"过滤完成: {dst}")

    # Fetch Shenzhen (szse) and Shanghai (shse) announcements and merge them
    # into one link table. `column` and `plate` are read by get_report.
    combined = []
    for column in ("szse", "shse"):
        plate = ""  # no board restriction
        batch = download_reports_for_year(setYear)
        combined.extend(batch)

    full_path = os.path.join(output_dir, f"年报链接_{setYear}{GZH}.xlsx")
    write_to_excel(combined, full_path)

    # Then keep only the companies present in the mapping workbook.
    filter_excel(setYear)
    print(f"----{setYear}年下载完成，完整链接保存在 {full_path}")


In [12]:
import requests
import re
import openpyxl
import time
import os
import pandas as pd

# All outputs go to the cninfo_output folder on the Desktop (created if missing).
output_dir = os.path.expanduser("~/Desktop/cninfo_output")
os.makedirs(output_dir, exist_ok=True)

# Tag embedded in the full link-table file name: 年报链接_{year}{GZH}.xlsx
GZH = "【年报】"

def get_report(page_num, date, timeout=30):
    """POST one page of the cninfo historical-announcement query.

    The market filters come from the module-level ``column``, ``plate`` and
    ``trade`` variables, which must be set before calling.

    Args:
        page_num: Page index sent as ``pageNum`` (callers here start at 1).
        date: Date window sent as ``seDate``, e.g. ``"2025-01-01~2025-04-01"``.
        timeout: Seconds to wait for the server; requests never times out
            unless a value is given, so a stalled connection would hang the run.

    Returns:
        The ``requests.Response``; neither status nor JSON is checked here.
    """
    url = "http://www.cninfo.com.cn/new/hisAnnouncement/query"
    # Content-Length is intentionally not set: requests derives it from the
    # encoded form body, and a hard-coded value does not track the payload.
    headers = {
        "Accept": "*/*",
        "Accept-Encoding": "gzip, deflate",
        "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6",
        "Content-Type": "application/x-www-form-urlencoded; charset=UTF-8",
        "Host": "www.cninfo.com.cn",
        "Origin": "http://www.cninfo.com.cn",
        "Proxy-Connection": "keep-alive",
        "Referer": "http://www.cninfo.com.cn/new/commonUrl/pageOfSearch?url=disclosure/list/search&checkedCategory=category_ndbg_szsh",
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36 Edg/113.0.1774.42",
        "X-Requested-With": "XMLHttpRequest"
    }
    data = {
        "pageNum": page_num,
        "pageSize": 30,
        "column": column,
        "tabName": "fulltext",
        "plate": plate,
        "searchkey": "",
        "secid": "",
        "category": "category_ndbg_szsh",
        "trade": trade,
        "seDate": date,
        "sortName": "code",
        "sortType": "asc",
        "isHLtitle": "false"
    }
    return requests.post(url, data=data, headers=headers, timeout=timeout)


def download_report(date):
    """Collect every announcement in one ``seDate`` window, page by page.

    Page 1 is requested once and supplies both ``totalpages`` and its own
    rows; pages 2..totalpages follow. Each page is tried up to three times.
    The module-level ``counter`` is incremented once per page processed.

    Args:
        date: Date window passed through to ``get_report``.

    Returns:
        List of announcement dicts (empty if page 1 cannot be fetched/decoded).
    """
    global counter
    all_results = []
    max_retries = 3

    def fetch_page(page_num):
        """Return the decoded JSON of one page, or None once retries run out."""
        last_error = None
        for attempt in range(1, max_retries + 1):
            try:
                resp = get_report(page_num, date)
                resp.raise_for_status()
                return resp.json()
            except Exception as exc:
                last_error = exc
                if attempt < max_retries:
                    time.sleep(5)
        # Make the data loss visible instead of dropping the page silently.
        print(f"[download_report] {date} page {page_num} skipped after {max_retries} attempts: {last_error!r}")
        return None

    first = fetch_page(1)
    if not isinstance(first, dict):
        return all_results

    # Rows already in hand are kept whatever `totalpages` says.
    # NOTE(review): if the API reports totalpages=0 for a result set that fits
    # on one page, this is what keeps those rows — confirm on a live response.
    all_results.extend(first.get("announcements") or [])
    total_pages = first.get("totalpages") or 0

    for page_num in range(1, total_pages + 1):
        if page_num > 1:
            page = fetch_page(page_num)
            if isinstance(page, dict):
                all_results.extend(page.get("announcements") or [])
        counter += 1
    return all_results


# ----- 全局去重函数 -----
def filter_latest_versions(ans):
    """Reduce ``ans`` to one announcement per (secCode, title year).

    The title year is the first ``NNNN年`` match ('' when absent). The first
    item seen for a key stays unless a later item carries a revision keyword
    and the held one does not. Output follows first-seen key order.
    """
    markers = ['更正', '更正后', '更正版', '修订后', '修订版', '更新后', '更新版']
    by_key = {}
    for record in ans:
        sec_code = record['secCode']
        heading = record['announcementTitle']
        hit = re.search(r"(\d{4})年", heading)
        slot = (sec_code, hit.group(1)) if hit else (sec_code, '')
        if slot in by_key:
            new_is_revised = any(tag in heading for tag in markers)
            if not new_is_revised:
                continue
            if any(tag in by_key[slot]['announcementTitle'] for tag in markers):
                continue
        by_key[slot] = record
    return [*by_key.values()]


# ----- 只抓数据，返回所有公告列表 -----
def download_reports_for_year(year):
    """Collect the announcements published in ``year + 1`` and de-duplicate them.

    All query windows lie in the calendar year after ``year``. The windows are
    much narrower in late April (presumably to keep each query small).
    Titles containing "摘要" are dropped before ``filter_latest_versions``.

    The previous version first issued a ``year``-wide request and discarded the
    result; that dead call cost a round-trip and could raise on a bad reply,
    so it has been removed.

    Args:
        year: Integer year; the queries cover ``year + 1``.

    Returns:
        List of announcement dicts, one per (company, title year).
    """
    publish_year = year + 1
    time_segments = [
        f"{publish_year}-01-01~{publish_year}-04-01",
        f"{publish_year}-04-02~{publish_year}-04-15",
        f"{publish_year}-04-16~{publish_year}-04-22",
        f"{publish_year}-04-23~{publish_year}-04-26",
        f"{publish_year}-04-27~{publish_year}-04-28",
        f"{publish_year}-04-29~{publish_year}-04-30",
        f"{publish_year}-05-01~{publish_year}-07-31",
        f"{publish_year}-08-01~{publish_year}-10-31",
        f"{publish_year}-11-01~{publish_year}-11-30",
        f"{publish_year}-12-01~{publish_year}-12-31"
    ]
    all_results = []
    for segment in time_segments:
        all_results.extend(download_report(segment))

    # Drop summaries, then keep one version per company and year.
    results = [item for item in all_results if "摘要" not in item["announcementTitle"]]
    return filter_latest_versions(results)


# ----- 把公告列表写到 Excel -----
def write_to_excel(results, filepath):
    """Save announcements as rows of a single-sheet workbook.

    Columns: stock code, short name, cleaned title in 《》, first 4-digit year
    in the title, and the static download URL. A row is skipped when its title
    contains any entry of the module-level ``exclude_keywords``.
    """
    workbook = openpyxl.Workbook()
    worksheet = workbook.active
    worksheet.title = "年报"
    worksheet.append(["公司代码", "公司简称", "标题", "年份", "年报链接"])

    def to_row(entry):
        """Build the output row for one announcement."""
        sec_code = entry["secCode"]
        sec_name = entry["secName"]
        cleaned = re.sub(r"<.*?>", "", entry["announcementTitle"]).replace("：", "")
        shown = f"《{cleaned}》"
        match = re.search(r"(\d{4})年", shown)
        year_part = match.group(1) if match else ""
        href = f"http://static.cninfo.com.cn/{entry['adjunctUrl']}"
        return [sec_code, sec_name, shown, year_part, href]

    for entry in results:
        row = to_row(entry)
        blocked = any(kw in row[2] for kw in exclude_keywords)
        if not blocked:
            worksheet.append(row)

    workbook.save(filepath)
    print(f"已保存：{filepath}")


def filter_excel(year):
    """Join the full link table to the bond mapping and save the matches.

    Reads 年报链接_{year}{GZH}.xlsx from ``output_dir`` and the workbook at the
    module-level ``mapping_file``. For every mapping row, in order, each link
    row whose 公司代码 equals the row's suffix-free 正股代码 is emitted with
    the bond code and name in front. Writes 年报链接_{year}_选取公司.xlsx.
    """
    import pandas as pd

    link_table = pd.read_excel(
        os.path.join(output_dir, f"年报链接_{year}{GZH}.xlsx"), dtype=str
    )
    bond_map = pd.read_excel(mapping_file, dtype=str)
    # Stock code without the exchange suffix.
    bond_map['stock_code_no_suffix'] = bond_map['正股代码'].str.split('.').str[0]

    out_columns = [
        '可转债代码', '可转债名称',
        '公司代码', '公司简称', '标题', '年份', '年报链接'
    ]
    copied = out_columns[2:]

    rows = []
    for _, bond in bond_map.iterrows():
        head = {'可转债代码': bond['代码'], '可转债名称': bond['名称']}
        hits = link_table[link_table['公司代码'] == bond['stock_code_no_suffix']]
        for _, hit in hits.iterrows():
            rows.append({**head, **{col: hit[col] for col in copied}})

    target = os.path.join(output_dir, f"年报链接_{year}_选取公司.xlsx")
    pd.DataFrame(rows, columns=out_columns).to_excel(target, index=False, engine="openpyxl")
    print(f"过滤完成: {target}")


if __name__ == '__main__':
    # Globals read by write_to_excel (exclude_keywords) and get_report (trade).
    exclude_keywords = ['英文', '已取消', '摘要']
    trade = ""
    # Path of the bond -> stock mapping workbook (also read by filter_excel).
    mapping_file = os.path.expanduser("~/Desktop/正股映射结果.xlsx")
    df_map = pd.read_excel(mapping_file, dtype=str)
    # NOTE: secCode / secName are not referenced by the functions in this cell.
    secCode = df_map["正股代码"].str.split(".").str[0].tolist()
    secName = df_map["正股名称"].tolist()

    # Page counter incremented by download_report.
    global counter
    counter = 1
    setYear = 2024

    # Step 1: fetch Shenzhen (szse) and Shanghai (shse) announcements.
    # `column` and `plate` are module-level names read by get_report.
    combined = []
    for column in ("szse", "shse"):
        plate = ""  # no board restriction
        batch = download_reports_for_year(setYear)
        combined.extend(batch)

    # Write the full link table.
    full_path = os.path.join(output_dir, f"年报链接_{setYear}{GZH}.xlsx")
    write_to_excel(combined, full_path)

    # Step 2: filter by the mapping and export in mapping order.
    filter_excel(setYear)
    print(f"----{setYear}年下载完成，完整链接保存在 {full_path}")


已保存：/Users/sam/Desktop/cninfo_output/年报链接_2024【年报】.xlsx
过滤完成: /Users/sam/Desktop/cninfo_output/年报链接_2024_选取公司.xlsx
----2024年下载完成，完整链接保存在 /Users/sam/Desktop/cninfo_output/年报链接_2024【年报】.xlsx


In [18]:
import requests
import re
import openpyxl
import time
import os
import pandas as pd

# Year setting: this is the only value that needs editing.
YEAR = 2024

# All outputs go to the cninfo_output folder on the Desktop (created if missing).
output_dir = os.path.expanduser("~/Desktop/cninfo_output")
os.makedirs(output_dir, exist_ok=True)

# Tag embedded in the full link-table file name: 年报链接_{year}{GZH}.xlsx
GZH = f"【年报{YEAR}】"
# Titles containing any of these keywords are not written to the workbook.
exclude_keywords = ['英文', '已取消', '摘要']
# Sent as the "trade" form field by get_report.
trade = ""

def get_report(page_num, date, timeout=30):
    """POST one page of the cninfo historical-announcement query.

    The market filters come from the module-level ``column``, ``plate`` and
    ``trade`` variables, which must be set before calling.

    Args:
        page_num: Page index sent as ``pageNum`` (callers here start at 1).
        date: Date window sent as ``seDate``, e.g. ``"2025-01-01~2025-04-01"``.
        timeout: Seconds to wait for the server; requests never times out
            unless a value is given, so a stalled connection would hang the run.

    Returns:
        The ``requests.Response``; neither status nor JSON is checked here.
    """
    url = "http://www.cninfo.com.cn/new/hisAnnouncement/query"
    headers = {
        "Accept": "*/*",
        "Accept-Encoding": "gzip, deflate",
        "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
        "Content-Type": "application/x-www-form-urlencoded; charset=UTF-8",
        "Origin": "http://www.cninfo.com.cn",
        "Referer": "http://www.cninfo.com.cn/new/commonUrl/pageOfSearch"
                   "?url=disclosure/list/search&checkedCategory=category_ndbg_szsh",
        "User-Agent": "Mozilla/5.0",
        "X-Requested-With": "XMLHttpRequest"
    }
    data = {
        "pageNum":      page_num,
        "pageSize":     30,
        "column":       column,
        "tabName":      "fulltext",
        "plate":        plate,
        "searchkey":    "",
        "secid":        "",
        "category":     "category_ndbg_szsh",
        "trade":        trade,
        "seDate":       date,
        "sortName":     "code",
        "sortType":     "asc",
        "isHLtitle":    "false"
    }
    return requests.post(url, data=data, headers=headers, timeout=timeout)


def download_report(date):
    """Collect every announcement in one ``seDate`` window, page by page.

    Page 1 is requested once and supplies both ``totalpages`` and its own
    rows; pages 2..totalpages follow. Each page is tried up to three times.
    The module-level ``counter`` is incremented once per page processed.

    Args:
        date: Date window passed through to ``get_report``.

    Returns:
        List of announcement dicts (empty if page 1 cannot be fetched/decoded).
    """
    global counter
    all_results = []
    max_retries = 3

    def fetch_page(page_num):
        """Return the decoded JSON of one page, or None once retries run out."""
        last_error = None
        for attempt in range(1, max_retries + 1):
            try:
                resp = get_report(page_num, date)
                resp.raise_for_status()
                return resp.json()
            except Exception as exc:
                last_error = exc
                if attempt < max_retries:
                    time.sleep(5)
        # Make the data loss visible instead of dropping the page silently.
        print(f"[download_report] {date} page {page_num} skipped after {max_retries} attempts: {last_error!r}")
        return None

    first = fetch_page(1)
    if not isinstance(first, dict):
        return all_results

    # Rows already in hand are kept whatever `totalpages` says.
    # NOTE(review): if the API reports totalpages=0 for a result set that fits
    # on one page, this is what keeps those rows — confirm on a live response.
    all_results.extend(first.get("announcements") or [])
    total_pages = first.get("totalpages") or 0

    for page_num in range(1, total_pages + 1):
        if page_num > 1:
            page = fetch_page(page_num)
            if isinstance(page, dict):
                all_results.extend(page.get("announcements") or [])
        counter += 1
    return all_results


# ----- 全局去重函数 -----
def filter_latest_versions(ans):
    """Keep one announcement per (secCode, 4-digit year found in the title).

    The first announcement seen for a key is kept; it is replaced only by the
    first later one whose title contains a revision keyword while the kept
    title contains none. Output follows first-seen key order.
    """
    revision_markers = ('更正', '更正后', '更正版', '修订后', '修订版', '更新后', '更新版')
    year_pattern = re.compile(r"(\d{4})年")

    def is_revision(text):
        return any(marker in text for marker in revision_markers)

    kept = {}
    for entry in ans:
        sec = entry['secCode']
        heading = entry['announcementTitle']
        found = year_pattern.search(heading)
        bucket = (sec, found.group(1) if found else '')
        if bucket not in kept:
            kept[bucket] = entry
            continue
        if is_revision(heading) and not is_revision(kept[bucket]['announcementTitle']):
            kept[bucket] = entry
    return list(kept.values())


# ----- 只抓数据，返回所有公告列表 -----
def download_reports_for_year(year):
    """Collect the announcements published in ``year + 1`` and de-duplicate them.

    All query windows lie in the calendar year after ``year``. The windows are
    much narrower in late April (presumably to keep each query small).
    Titles containing "摘要" are dropped before ``filter_latest_versions``.

    The previous version first issued a ``year``-wide request and discarded the
    result; that dead call cost a round-trip and could raise on a bad reply,
    so it has been removed.

    Args:
        year: Integer year; the queries cover ``year + 1``.

    Returns:
        List of announcement dicts, one per (company, title year).
    """
    next_year = year + 1
    time_segments = [
        f"{next_year}-01-01~{next_year}-04-01",
        f"{next_year}-04-02~{next_year}-04-15",
        f"{next_year}-04-16~{next_year}-04-22",
        f"{next_year}-04-23~{next_year}-04-26",
        f"{next_year}-04-27~{next_year}-04-28",
        f"{next_year}-04-29~{next_year}-04-30",
        f"{next_year}-05-01~{next_year}-07-31",
        f"{next_year}-08-01~{next_year}-10-31",
        f"{next_year}-11-01~{next_year}-11-30",
        f"{next_year}-12-01~{next_year}-12-31"
    ]

    all_results = []
    for segment in time_segments:
        all_results.extend(download_report(segment))

    # Drop summaries, then keep one version per company and year.
    filtered = [item for item in all_results if "摘要" not in item["announcementTitle"]]
    return filter_latest_versions(filtered)


# ----- 把公告列表写到 Excel —— 全量链接 —— 
def write_all_to_excel(results, filepath):
    """Write announcements to a one-sheet workbook titled ``{YEAR}年报``.

    Each row holds stock code, short name, the title (markup tags and '：'
    removed, wrapped in 《》), the first 4-digit year in the title, and the
    download URL. Rows whose title contains any of the module-level
    ``exclude_keywords`` are left out.
    """
    book = openpyxl.Workbook()
    sheet = book.active
    sheet.title = f"{YEAR}年报"
    sheet.append(["公司代码", "公司简称", "标题", "年份", "年报链接"])

    tag_pattern = re.compile(r"<.*?>")
    for record in results:
        sec_code = record["secCode"]
        sec_name = record["secName"]
        stripped = tag_pattern.sub("", record["announcementTitle"]).replace("：", "")
        wrapped = f"《{stripped}》"
        year_hit = re.search(r"(\d{4})年", wrapped)
        year_text = "" if year_hit is None else year_hit.group(1)
        link = f"http://static.cninfo.com.cn/{record['adjunctUrl']}"
        if any(kw in wrapped for kw in exclude_keywords):
            continue
        sheet.append([sec_code, sec_name, wrapped, year_text, link])

    book.save(filepath)
    print(f"✅ 全量链接已保存：{filepath}")


# ----- 把公告列表写到 Excel —— 按映射过滤 —— 
def filter_excel(year):
    """Export the link rows that belong to mapped convertible bonds.

    Reads 年报链接_{year}{GZH}.xlsx from ``output_dir`` and the workbook at the
    module-level ``mapping_file``. Mapping rows are walked in order; every link
    row whose 公司代码 equals the suffix-free 正股代码 becomes one output row
    prefixed with the bond code and name. Writes 年报链接_{year}_选取公司.xlsx.
    """
    import pandas as pd

    links = pd.read_excel(os.path.join(output_dir, f"年报链接_{year}{GZH}.xlsx"), dtype=str)
    mapping = pd.read_excel(mapping_file, dtype=str)
    mapping['stock_code_no_suffix'] = mapping['正股代码'].str.split('.').str[0]

    collected = []
    for _, bond in mapping.iterrows():
        bond_code, bond_name = bond['代码'], bond['名称']
        same_stock = links['公司代码'] == bond['stock_code_no_suffix']
        for hit in links[same_stock].to_dict('records'):
            collected.append({
                '可转债代码': bond_code,
                '可转债名称': bond_name,
                '公司代码':   hit['公司代码'],
                '公司简称':   hit['公司简称'],
                '标题':      hit['标题'],
                '年份':      hit['年份'],
                '年报链接':   hit['年报链接'],
            })

    ordered_columns = [
        '可转债代码', '可转债名称',
        '公司代码', '公司简称', '标题', '年份', '年报链接'
    ]
    result = pd.DataFrame(collected, columns=ordered_columns)
    out_path = os.path.join(output_dir, f"年报链接_{year}_选取公司.xlsx")
    result.to_excel(out_path, index=False, engine="openpyxl")
    print(f"✅ 过滤后链接已保存：{out_path}")


if __name__ == '__main__':
    # Load the bond -> stock mapping (mapping_file is also read by filter_excel).
    mapping_file = os.path.expanduser("~/Desktop/正股映射结果.xlsx")
    df_map = pd.read_excel(mapping_file, dtype=str)
    # NOTE: secCode / secName are not referenced by the functions in this cell.
    secCode = df_map["正股代码"].str.split(".").str[0].tolist()
    secName = df_map["正股名称"].tolist()

    # Page counter incremented by download_report.
    global counter
    counter = 1

    # Step 1: fetch all announcement links for both exchanges.
    # `column` and `plate` are module-level names read by get_report.
    all_links = []
    for column in ("szse", "shse"):
        plate = ""
        batch = download_reports_for_year(YEAR)
        all_links.extend(batch)

    # Write the full annual-report link table.
    full_path = os.path.join(output_dir, f"年报链接_{YEAR}{GZH}.xlsx")
    write_all_to_excel(all_links, full_path)

    # Step 2: filter by the mapping and export.
    filter_excel(YEAR)

    print(f"---- {YEAR} 年下载完成：全量链接={len(all_links)} 条 ----")


✅ 全量链接已保存：/Users/sam/Desktop/cninfo_output/年报链接_2024【年报2024】.xlsx
✅ 过滤后链接已保存：/Users/sam/Desktop/cninfo_output/年报链接_2024_选取公司.xlsx
---- 2024 年下载完成：全量链接=5343 条 ----


In [21]:
import os
import re
import time
import requests
import openpyxl
import pandas as pd

# Configuration
YEAR = 2024  # change this value to target another year
# Tag embedded in the full link-table file name: 年报链接_{year}{GZH}.xlsx
GZH = f"【年报{YEAR}】"
# Titles containing any of these keywords are not written to the workbook.
exclude_keywords = ['英文', '已取消', '摘要']
# Sent as the "trade" and "plate" form fields by get_report.
trade = ""
plate = ""

# Output directory (created if missing).
output_dir = os.path.expanduser("~/Desktop/cninfo_output")
os.makedirs(output_dir, exist_ok=True)

# Path of the bond -> stock mapping workbook (used by the filtering step).
mapping_file = os.path.expanduser("~/Desktop/正股映射结果.xlsx")


def get_report(page_num: int, date_range: str, timeout: float = 30) -> requests.Response:
    """
    POST one page of the cninfo historical-announcement query.

    The market filters come from the module-level ``column``, ``plate`` and
    ``trade`` variables, which must be set before calling.

    Args:
        page_num: Page index sent as ``pageNum`` (callers here start at 1).
        date_range: Date window sent as ``seDate``, e.g. ``"2025-01-01~2025-04-01"``.
        timeout: Seconds to wait for the server; requests never times out
            unless a value is given, so a stalled connection would hang the run.

    Returns:
        The raw response; neither status nor JSON is checked here.
    """
    url = "http://www.cninfo.com.cn/new/hisAnnouncement/query"
    headers = {
        "Accept": "*/*",
        "Accept-Encoding": "gzip, deflate",
        "Accept-Language": "zh-CN,zh;q=0.9",
        "Content-Type": "application/x-www-form-urlencoded; charset=UTF-8",
        "Origin": "http://www.cninfo.com.cn",
        "Referer": (
            "http://www.cninfo.com.cn/new/commonUrl/"
            "pageOfSearch?url=disclosure/list/search"
            "&checkedCategory=category_ndbg_szsh"
        ),
        "User-Agent": "Mozilla/5.0",
        "X-Requested-With": "XMLHttpRequest"
    }
    payload = {
        "pageNum": page_num,
        "pageSize": 30,
        "column": column,
        "tabName": "fulltext",
        "plate": plate,
        "searchkey": "",
        "secid": "",
        "category": "category_ndbg_szsh",
        "trade": trade,
        "seDate": date_range,
        "sortName": "code",
        "sortType": "asc",
        "isHLtitle": "false"
    }
    return requests.post(url, data=payload, headers=headers, timeout=timeout)


def download_report(date_range: str) -> list[dict]:
    """
    Download every announcement in one date window, page by page.

    Page 1 is requested once and supplies both ``totalpages`` and its own
    rows; pages 2..totalpages follow. Every request, including the HTTP call
    itself, is inside the retry loop (up to three attempts per page), so a
    connection error no longer aborts the whole run.

    Args:
        date_range: Date window passed through to ``get_report``.

    Returns:
        List of announcement dicts (empty if page 1 cannot be fetched/decoded).
    """
    results = []
    max_attempts = 3

    def fetch_page(page: int):
        """Return the decoded JSON of one page, or None once retries run out."""
        last_error = None
        for attempt in range(1, max_attempts + 1):
            try:
                resp = get_report(page, date_range)
                resp.raise_for_status()
                return resp.json()
            except Exception as exc:
                last_error = exc
                if attempt < max_attempts:
                    time.sleep(5)
        # Make the data loss visible instead of dropping the page silently.
        print(f"[download_report] {date_range} page {page} skipped after {max_attempts} attempts: {last_error!r}")
        return None

    first = fetch_page(1)
    if not isinstance(first, dict):
        return results

    # Rows already in hand are kept whatever `totalpages` says.
    # NOTE(review): if the API reports totalpages=0 for a result set that fits
    # on one page, this is what keeps those rows — confirm on a live response.
    results.extend(first.get("announcements") or [])
    total_pages = first.get("totalpages") or 0

    for page in range(2, total_pages + 1):
        body = fetch_page(page)
        if isinstance(body, dict):
            results.extend(body.get("announcements") or [])

    return results


def filter_latest_versions(announcements: list[dict]) -> list[dict]:
    """
    Keep one announcement per (secCode, 4-digit year found in the title).

    The first item seen for a key is kept; it is replaced only by the first
    later item whose title contains a revision keyword while the kept title
    contains none. Output follows first-seen key order.
    """
    revision_tags = ('更正', '更正后', '更正版', '修订后', '修订版', '更新后', '更新版')

    def revised(text: str) -> bool:
        return any(tag in text for tag in revision_tags)

    chosen = {}
    for entry in announcements:
        code = entry['secCode']
        heading = entry['announcementTitle']
        hit = re.search(r"(\d{4})年", heading)
        bucket = (code, hit.group(1) if hit else "")
        holder = chosen.setdefault(bucket, entry)
        if holder is entry:
            # Just inserted (or the very same object again): nothing to compare.
            continue
        # Overwrite only when this one is a revision and the held one is not.
        if revised(heading) and not revised(holder['announcementTitle']):
            chosen[bucket] = entry
    return list(chosen.values())


def download_reports_for_year(year: int) -> list[dict]:
    """
    Collect the announcements published in ``year + 1`` and de-duplicate them.

    All query windows lie in the calendar year after ``year``. The windows are
    much narrower in late April (presumably to keep each query small).
    Titles containing "摘要" are dropped before ``filter_latest_versions``.

    The previous version also issued a ``year``-wide "main range" request and
    discarded the result; that dead call cost a round-trip and could raise on
    a bad reply, so it has been removed.
    """
    ny = year + 1
    segments = [
        f"{ny}-01-01~{ny}-04-01", f"{ny}-04-02~{ny}-04-15",
        f"{ny}-04-16~{ny}-04-22", f"{ny}-04-23~{ny}-04-26",
        f"{ny}-04-27~{ny}-04-28", f"{ny}-04-29~{ny}-04-30",
        f"{ny}-05-01~{ny}-07-31", f"{ny}-08-01~{ny}-10-31",
        f"{ny}-11-01~{ny}-11-30", f"{ny}-12-01~{ny}-12-31"
    ]

    all_ann = []
    for seg in segments:
        all_ann.extend(download_report(seg))

    # Drop summaries; the remaining keyword exclusions happen at write time.
    filtered = [
        x for x in all_ann
        if "摘要" not in x['announcementTitle']
    ]
    return filter_latest_versions(filtered)


def write_to_excel(results: list[dict], filepath: str) -> None:
    """
    Write the announcement list to a one-sheet workbook titled ``{YEAR}年报``.

    The title has markup tags and '：' removed; the year and the exclusion
    check (module-level ``exclude_keywords``) use that cleaned title, while
    the cell itself stores it wrapped in 《》.
    """
    book = openpyxl.Workbook()
    sheet = book.active
    sheet.title = f"{YEAR}年报"
    sheet.append(["公司代码", "公司简称", "标题", "年份", "年报链接"])

    strip_tags = re.compile(r"<.*?>")
    for entry in results:
        code = entry["secCode"]
        short_name = entry["secName"]
        cleaned = strip_tags.sub("", entry["announcementTitle"]).replace("：", "")
        year_hit = re.search(r"(\d{4})年", cleaned)
        year_text = "" if year_hit is None else year_hit.group(1)
        link = f"http://static.cninfo.com.cn/{entry['adjunctUrl']}"
        if any(kw in cleaned for kw in exclude_keywords):
            continue
        sheet.append([code, short_name, f"《{cleaned}》", year_text, link])

    book.save(filepath)
    print(f"已保存：{filepath}")


def filter_excel(year: int) -> None:
    """
    Export the link rows that belong to mapped convertible bonds.

    Reads 年报链接_{year}{GZH}.xlsx from ``output_dir`` and the workbook at
    ``mapping_file``. Mapping rows are walked in order; every link row whose
    公司代码 equals the suffix-free 正股代码 becomes one output row prefixed
    with the bond code and name. Writes 年报链接_{year}_选取公司.xlsx.
    """
    link_path = os.path.join(output_dir, f"年报链接_{year}{GZH}.xlsx")
    links = pd.read_excel(link_path, dtype=str)
    bonds = pd.read_excel(mapping_file, dtype=str)
    bonds['code_no_suf'] = bonds['正股代码'].str.split('.').str[0]

    out_columns = [
        '可转债代码', '可转债名称',
        '公司代码', '公司简称', '标题', '年份', '年报链接'
    ]
    copied = out_columns[2:]

    rows = []
    for _, bond in bonds.iterrows():
        head = {'可转债代码': bond['代码'], '可转债名称': bond['名称']}
        hits = links[links['公司代码'] == bond['code_no_suf']]
        for _, hit in hits.iterrows():
            row = dict(head)
            for col in copied:
                row[col] = hit[col]
            rows.append(row)

    out_path = os.path.join(output_dir, f"年报链接_{year}_选取公司.xlsx")
    pd.DataFrame(rows, columns=out_columns).to_excel(out_path, index=False, engine="openpyxl")
    print(f"过滤完成：{out_path}")


if __name__ == "__main__":
    # Step 1: fetch both exchanges and write the full annual-report link table.
    # `column` is a module-level name read by get_report; `plate` is reset to
    # the same empty value on every pass.
    combined = []
    for column in ("szse", "shse"):
        plate = ""
        batch = download_reports_for_year(YEAR)
        combined.extend(batch)

    full_path = os.path.join(output_dir, f"年报链接_{YEAR}{GZH}.xlsx")
    write_to_excel(combined, full_path)

    # Step 2: filter by the mapping and export.
    filter_excel(YEAR)

    print(f"---- {YEAR} 年下载完成，共 {len(combined)} 条链接 ----")


已保存：/Users/sam/Desktop/cninfo_output/年报链接_2024【年报2024】.xlsx
过滤完成：/Users/sam/Desktop/cninfo_output/年报链接_2024_选取公司.xlsx
---- 2024 年下载完成，共 5343 条链接 ----


In [24]:
import requests
import re
import openpyxl
import time
import os
import pandas as pd

# Configuration
YEAR            = 2024        # change this line to fetch another year
GZH             = "【年报】"
exclude_keywords = ['英文', '摘要', 'annual report', '已取消']
trade           = ""
plate           = ""

# File paths (the output folder is created if missing).
BASE_DIR     = os.path.expanduser("~/Desktop/cninfo_output")
MAPPING_FILE = os.path.expanduser("~/Desktop/正股映射结果.xlsx")
os.makedirs(BASE_DIR, exist_ok=True)

# Module-level values read by get_report for the "column" and "secid" form
# fields; fetch_reports_for_mapping reassigns them before each query.
column = ""
secid  = ""

# —— 单页请求 —— 
def get_report(page_num, date_range, timeout=30):
    """POST one page of the cninfo historical-announcement query.

    Reads the module-level ``column``, ``plate``, ``secid``, ``trade`` and
    ``YEAR``; the search key is always ``"{YEAR}年年度报告"``.

    Args:
        page_num: Page index sent as ``pageNum`` (callers here start at 1).
        date_range: Date window sent as ``seDate``.
        timeout: Seconds to wait for the server; requests never times out
            unless a value is given, so a stalled connection would hang the run.

    Returns:
        The ``requests.Response``; neither status nor JSON is checked here.
    """
    url = "http://www.cninfo.com.cn/new/hisAnnouncement/query"
    headers = {
        "Accept": "*/*",
        "Content-Type": "application/x-www-form-urlencoded; charset=UTF-8",
        "User-Agent": "Mozilla/5.0",
        "X-Requested-With": "XMLHttpRequest"
    }
    data = {
        "pageNum":   page_num,
        "pageSize":  30,
        "column":    column,
        "tabName":   "fulltext",
        "plate":     plate,
        "secid":     secid,
        "category":  "category_ndbg_szsh",
        "trade":     trade,
        "searchkey": f"{YEAR}年年度报告",
        "seDate":    date_range,
        "sortName":  "code",
        "sortType":  "asc",
        "isHLtitle": "false"
    }
    return requests.post(url, data=data, headers=headers, timeout=timeout)

# —— 分页抓取 —— 
def download_report(date_range):
    """Download every announcement in one date window, page by page.

    Page 1 is requested once and supplies both ``totalpages`` and its own
    rows; pages 2..totalpages follow, each tried up to three times. A null
    ``announcements`` field is treated as an empty page instead of crashing
    ``extend``.

    Args:
        date_range: Date window passed through to ``get_report``.

    Returns:
        List of announcement dicts (empty if page 1 cannot be fetched/decoded).
    """
    results = []
    max_attempts = 3

    def fetch_page(pn):
        """Return the decoded JSON of one page, or None once retries run out."""
        last_error = None
        for attempt in range(1, max_attempts + 1):
            try:
                r = get_report(pn, date_range)
                r.raise_for_status()
                return r.json()
            except Exception as exc:
                last_error = exc
                if attempt < max_attempts:
                    time.sleep(2)
        # Make the data loss visible instead of dropping the page silently.
        print(f"[download_report] {date_range} page {pn} skipped after {max_attempts} attempts: {last_error!r}")
        return None

    first = fetch_page(1)
    if not isinstance(first, dict):
        return results

    # Rows already in hand are kept whatever `totalpages` says.
    # NOTE(review): if the API reports totalpages=0 for a result set that fits
    # on one page (typical for single-company queries), this is what keeps
    # those rows — confirm on a live response.
    results.extend(first.get("announcements") or [])
    total = first.get("totalpages") or 0

    for pn in range(2, total + 1):
        page = fetch_page(pn)
        if isinstance(page, dict):
            results.extend(page.get("announcements") or [])
    return results

# —— 去重：同一公司同一年仅保留最新“修订/更正”版本 —— 
def filter_latest_versions(items):
    """Reduce ``items`` to one announcement per (secCode, title year).

    The title year is the first ``NNNN年`` match ('' when absent). The first
    item seen for a key stays unless a later item carries a revision keyword
    and the held one does not. Output follows first-seen key order.
    """
    markers = ['更正', '更正后', '更正版', '修订后', '修订版', '更新后', '更新版']
    by_key = {}
    for record in items:
        sec_code = record['secCode']
        heading = record['announcementTitle']
        hit = re.search(r"(\d{4})年", heading)
        slot = (sec_code, hit.group(1)) if hit else (sec_code, '')
        if slot in by_key:
            new_is_revised = any(tag in heading for tag in markers)
            if not new_is_revised:
                continue
            if any(tag in by_key[slot]['announcementTitle'] for tag in markers):
                continue
        by_key[slot] = record
    return [*by_key.values()]

# —— 抓取映射表公司年报 —— 
def fetch_reports_for_mapping(sec_codes):
    """Query the ``YEAR`` annual report of each given stock code on both exchanges.

    For every exchange column ("szse", "shse") and every code, sets the
    module-level ``column`` / ``secid`` that ``get_report`` reads, downloads
    both date windows, and keeps titles that contain ``"{YEAR}年年度报告"`` and
    none of ``exclude_keywords``. The result goes through
    ``filter_latest_versions``.

    Args:
        sec_codes: Iterable of suffix-free stock codes. Entries that are not
            non-empty strings (e.g. NaN from blank cells) are skipped and
            duplicates are queried once, so no bogus or repeated requests
            are sent.

    Returns:
        List of announcement dicts.
    """
    global column, secid

    # Windows: the report year itself plus the first four months after it.
    segments = [
        f"{YEAR}-01-01~{YEAR}-12-31",
        f"{YEAR+1}-01-01~{YEAR+1}-04-30",
    ]
    # Order-preserving de-duplication of the usable codes.
    codes = list(dict.fromkeys(
        c.strip() for c in sec_codes if isinstance(c, str) and c.strip()
    ))
    wanted = f"{YEAR}年年度报告"

    all_items = []
    for col in ("szse", "shse"):
        column = col
        for code in codes:
            # NOTE(review): the "{column}_{code}" secid format is unverified
            # against the API — confirm it actually restricts the query.
            secid = f"{col}_{code}"
            for seg in segments:
                for it in download_report(seg):
                    title = it["announcementTitle"]
                    if wanted in title and not any(kw in title for kw in exclude_keywords):
                        all_items.append(it)
    return filter_latest_versions(all_items)

# —— 写全量映射链接 Excel —— 
def write_all_to_excel(items, filepath):
    """Write every announcement in ``items`` to a workbook sheet titled ``{YEAR}年报``.

    Columns: stock code, short name, title with markup tags and '：' removed,
    first 4-digit year in the title, and the static download URL. No rows
    are filtered here.
    """
    book = openpyxl.Workbook()
    sheet = book.active
    sheet.title = f"{YEAR}年报"
    sheet.append(["公司代码","公司简称","标题","年份","年报链接"])

    def to_row(entry):
        """Build the output row for one announcement."""
        sec_code = entry["secCode"]
        sec_name = entry["secName"]
        cleaned = re.sub(r"<.*?>","", entry["announcementTitle"]).replace("：","")
        hit = re.search(r"(\d{4})年", cleaned)
        year_part = "" if hit is None else hit.group(1)
        href = f"http://static.cninfo.com.cn/{entry['adjunctUrl']}"
        return [sec_code, sec_name, cleaned, year_part, href]

    for entry in items:
        sheet.append(to_row(entry))

    book.save(filepath)
    print("✅ 全量映射链接已保存：", filepath)

# —— 按可转债映射过滤并导出 —— 
def filter_excel(year):
    """Export the link rows that belong to mapped convertible bonds.

    Reads 年报链接_{year}{GZH}.xlsx from ``BASE_DIR`` and the workbook at
    ``MAPPING_FILE``. Mapping rows are walked in order; every link row whose
    公司代码 equals the suffix-free 正股代码 becomes one output row prefixed
    with the bond code and name. Writes 年报链接_{year}_选取公司.xlsx.
    """
    link_path = os.path.join(BASE_DIR, f"年报链接_{year}{GZH}.xlsx")
    links = pd.read_excel(link_path, dtype=str)
    bonds = pd.read_excel(MAPPING_FILE, dtype=str)
    bonds['stock_code_no_suffix'] = bonds['正股代码'].str.split('.').str[0]

    collected = []
    for _, bond in bonds.iterrows():
        bond_code, bond_name = bond['代码'], bond['名称']
        same_stock = links['公司代码'] == bond['stock_code_no_suffix']
        for hit in links[same_stock].to_dict('records'):
            collected.append({
                '可转债代码': bond_code,
                '可转债名称': bond_name,
                '公司代码':   hit['公司代码'],
                '公司简称':   hit['公司简称'],
                '标题':      hit['标题'],
                '年份':      hit['年份'],
                '年报链接':   hit['年报链接'],
            })

    ordered_columns = [
        '可转债代码','可转债名称',
        '公司代码','公司简称','标题','年份','年报链接'
    ]
    out_path = os.path.join(BASE_DIR, f"年报链接_{year}_选取公司.xlsx")
    pd.DataFrame(collected, columns=ordered_columns).to_excel(
        out_path, index=False, engine="openpyxl"
    )
    print("✅ 过滤后链接已保存：", out_path)

# —— 主流程 —— 
if __name__ == '__main__':
    # 1) Read the mapping and take the stock codes without exchange suffix.
    df_map   = pd.read_excel(MAPPING_FILE, dtype=str)
    secCodes = df_map['正股代码'].str.split('.').str[0].tolist()

    # 2) Fetch the annual-report links for the mapped stocks and save them.
    items = fetch_reports_for_mapping(secCodes)
    full_path = os.path.join(BASE_DIR, f"年报链接_{YEAR}{GZH}.xlsx")
    write_all_to_excel(items, full_path)

    # 3) Export the per-bond "selected companies" link table.
    filter_excel(YEAR)

    print(f"---- {YEAR}年 下载完成，共抓取{len(items)}条链接 ----")


✅ 全量映射链接已保存： /Users/sam/Desktop/cninfo_output/年报链接_2024【年报】.xlsx
✅ 过滤后链接已保存： /Users/sam/Desktop/cninfo_output/年报链接_2024_选取公司.xlsx
---- 2024年 下载完成，共抓取0条链接 ----


In [None]:
import pandas as pd
import requests
import os
import re
import logging
from datetime import datetime
from concurrent.futures import ThreadPoolExecutor, as_completed

# Configure the root logger (INFO level, "[timestamp] message" format).
# Note: the log() helper defined below prints directly and does not use logging.
logging.basicConfig(level=logging.INFO, format='[%(asctime)s] %(message)s')

def log(msg):
    """Print ``msg`` to stdout, prefixed with the current local time in brackets."""
    stamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    print("[{}] {}".format(stamp, msg))

# Settings
BASE_DIR    = os.path.expanduser("~/Desktop/cninfo_output")
EXCEL_FILE  = os.path.join(BASE_DIR, "年报链接_2024_选取公司.xlsx")
PDF_DIR     = os.path.join(BASE_DIR, "pdf_reports_2024")
MAX_WORKERS = 16  # number of download threads; tune to the network

# Make sure the PDF output directory exists.
os.makedirs(PDF_DIR, exist_ok=True)

# 下载单个 PDF
def download_pdf(record):
    """Download one annual-report PDF into ``PDF_DIR``.

    The body is written to a ``.part`` file and moved into place only after
    the write has finished, so a failed write cannot leave a truncated
    ``.pdf`` that later runs would skip as already downloaded.

    Args:
        record: ``(code, name, year, url)``; the file is named
            ``{code}_{name}_{year}.pdf`` with ``\\ / : * ? " < > |`` removed.

    Returns:
        None. Outcomes (skip, success, failure) are reported through ``log``;
        exceptions are caught and logged, not raised.
    """
    code, name, year, url = record
    safe = re.sub(r'[\\/:*?"<>|]', '', f"{code}_{name}_{year}")
    path = os.path.join(PDF_DIR, f"{safe}.pdf")
    if os.path.exists(path):
        log(f"Exists, skip: {path}")
        return

    tmp_path = f"{path}.part"
    try:
        resp = requests.get(url, timeout=15)
        content_type = resp.headers.get("Content-Type", "").lower()
        if resp.status_code == 200 and "pdf" in content_type:
            with open(tmp_path, 'wb') as f:
                f.write(resp.content)
            # The final name appears only once the file is complete.
            os.replace(tmp_path, path)
            log(f"Downloaded: {path}")
        else:
            log(f"Failed ({resp.status_code}) or not PDF: {url}")
    except Exception as e:
        # Best-effort cleanup of a half-written temp file.
        try:
            if os.path.exists(tmp_path):
                os.remove(tmp_path)
        except OSError:
            pass
        log(f"Error downloading {url}: {e}")

# 主流程
def main():
    """Download the 2024 annual-report PDFs listed in ``EXCEL_FILE`` concurrently.

    The workbook is read with every column as ``str`` so stock codes keep
    their leading zeros in the file names. Rows repeated for several bonds of
    the same stock are collapsed, so two threads never write the same file.
    """
    log(f"Reading Excel: {EXCEL_FILE}")
    if not os.path.exists(EXCEL_FILE):
        log("Excel file not found. 请检查路径。")
        return
    df = pd.read_excel(EXCEL_FILE, dtype=str)

    # dict.fromkeys removes duplicate records while keeping first-seen order.
    records = list(dict.fromkeys(
        (r['公司代码'], r['公司简称'], r['年份'], r['年报链接'])
        for _, r in df.iterrows()
        if str(r.get('年份')) == '2024'
    ))
    log(f"Found {len(records)} records for 2024.")

    start = datetime.now()
    with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
        futures = [executor.submit(download_pdf, rec) for rec in records]
        # Wait for every download; download_pdf logs its own failures.
        for future in as_completed(futures):
            pass
    elapsed = (datetime.now() - start).total_seconds()
    log(f"All downloads completed in {elapsed:.2f}s. PDFs saved to: {PDF_DIR}")

# Start the downloads only when this code is the entry module (not when imported).
if __name__ == '__main__':
    main()
