In [1]:
#!/usr/bin/env python
# -*- coding: UTF-8 -*-
'''
@Project ：PycharmProjects
@File    ：巨潮资讯年报2.0.py
@IDE     ：PyCharm
@Author  ：lingxiaotian
@Date    ：2023/5/20 12:38
'''

import requests
import re
import openpyxl
import time

# Tag embedded in the file names of the workbooks this script saves and reloads.
GZH = "【公众号：凌小添】"
def get_report(page_num, date, timeout=30):
    """POST one search query to the cninfo announcement endpoint and return the response.

    The query always uses the category ``category_ndbg_szsh`` and a page size of 30.
    It also reads the module-level ``plate`` and ``trade`` settings, which must be
    defined before this function is called.

    Args:
        page_num: Page number to request (callers in this file start at 1).
        date: Date range sent as ``seDate``, e.g. ``"2024-01-01~2024-04-01"``.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block forever and the callers'
            retry logic would never get a chance to run.

    Returns:
        The raw ``requests.Response``; the status code is NOT checked here.

    Raises:
        requests.exceptions.RequestException: On connection errors or timeouts.
    """
    url = "http://www.cninfo.com.cn/new/hisAnnouncement/query"
    # No explicit Content-Length: requests derives it from the encoded form body,
    # so a hard-coded value would only ever be stale.
    headers = {
        "Accept": "*/*",
        "Accept-Encoding": "gzip, deflate",
        "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6",
        "Content-Type": "application/x-www-form-urlencoded; charset=UTF-8",
        "Host": "www.cninfo.com.cn",
        "Origin": "http://www.cninfo.com.cn",
        "Proxy-Connection": "keep-alive",
        "Referer": "http://www.cninfo.com.cn/new/commonUrl/pageOfSearch?url=disclosure/list/search&checkedCategory=category_ndbg_szsh",
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36 Edg/113.0.1774.42",
        "X-Requested-With": "XMLHttpRequest",
    }
    data = {
        "pageNum": page_num,
        "pageSize": 30,
        "column": "szse",
        "tabName": "fulltext",
        "plate": plate,
        "searchkey": "",
        "secid": "",
        "category": "category_ndbg_szsh",
        "trade": trade,
        "seDate": date,
        "sortName": "code",
        "sortType": "asc",
        "isHLtitle": "false",
    }
    return requests.post(url, data=data, headers=headers, timeout=timeout)

# 发送HTTP请求并获取响应
def _fetch_page_with_retry(page_num, date, max_retries=3, retry_delay=5):
    """Return the parsed JSON dict of one result page, or None once every attempt failed."""
    for attempt in range(max_retries + 1):
        try:
            response = get_report(page_num, date)
            response.raise_for_status()
            payload = response.json()
            if not isinstance(payload, dict):
                # Callers use .get(); treat any other JSON shape as a parse failure.
                raise ValueError(f"unexpected payload type: {type(payload).__name__}")
            return payload
        except requests.exceptions.RequestException as e:
            print(f"出现网络请求错误！: {e}")
        except ValueError as e:
            print(f"解析响应数据失败: {e}")
        if attempt < max_retries:
            # Only announce and wait when another attempt will actually follow.
            print(f"{retry_delay}秒后重试...")
            time.sleep(retry_delay)
    print(f"{max_retries}次重试后均失败. 跳过第{page_num}页.")
    return None


def _show_progress(per):
    """Overwrite the current console line with the progress ratio ``per``."""
    if per < 1:
        print(f"\r当前年份下载进度 {per * 100:.2f} %", end='', flush=True)
    else:
        # Trailing spaces wipe the tail of a longer progress line that this
        # shorter message would otherwise leave behind (e.g. "….65 %").
        print("\r下载完成，正在保存……" + " " * 8, end='', flush=True)


def download_report(date):
    """Collect the announcements of every result page for one date range.

    The first page is fetched once and reused both for the ``totalpages`` value
    and for its announcements. Pages ``1 .. totalpages + 1`` are visited, as in
    the original implementation. A page that still fails after the retries is
    skipped, so the result may be incomplete rather than the run aborting.

    The module-level ``counter`` is incremented once per visited page and must
    be initialised by the caller.
    NOTE: ``counter`` keeps growing across successive calls while the divisor is
    this call's own page count, so the printed percentage is only approximate.

    Args:
        date: Date range string passed through to ``get_report``.

    Returns:
        A list of the announcement dicts found (empty if nothing was found or
        the first page could not be retrieved).
    """
    global counter
    all_results = []

    first_page = _fetch_page_with_retry(1, date)
    if first_page is None:
        print("获取总页数失败: 第1页请求未成功")
        return all_results

    total_pages = first_page.get("totalpages") or 0
    if not total_pages:
        return all_results

    page_num = 1
    while page_num <= total_pages + 1:
        # Page 1 is already in hand; do not request it a second time.
        data = first_page if page_num == 1 else _fetch_page_with_retry(page_num, date)
        announcements = data.get("announcements") if data is not None else None
        if announcements:
            all_results.extend(announcements)
            _show_progress(counter / total_pages)
        page_num += 1
        counter += 1
    return all_results

def main(year):
    """Download the announcement list for ``year`` and save it as an Excel workbook.

    Announcements are queried from the calendar year after ``year``
    (``year + 1``), split into several date segments. Each announcement becomes
    one row (code, name, cleaned title, year, link) unless its title contains one
    of the module-level ``exclude_keywords``. The workbook is written to
    ``年报链接_{year}{GZH}.xlsx`` in the current directory.

    Args:
        year: Year used for the output file name and as the fallback value of
            the year column when the title contains no ``NNNN年`` pattern.

    Raises:
        OSError: If the workbook cannot be written.
    """
    # NOTE: `sum` shadows the builtin and is not read anywhere in the code shown
    # here; the assignment is kept so module state stays as before.
    global sum
    # Remember the requested year: using it (instead of the global `setYear`)
    # keeps the file name in step with what filter_excel(year) later looks for.
    report_year = year
    try:
        response = get_report(1, f"{report_year}-01-01~{report_year}-12-31")
        sum = response.json().get("totalpages", 0)
    except (requests.exceptions.RequestException, ValueError, AttributeError) as e:
        # This figure is informational only; a failure must not abort the download.
        print(f"获取年度总页数失败: {e}")
        sum = 0

    query_year = report_year + 1
    time_segments = [
        f"{query_year}-01-01~{query_year}-04-01",
        f"{query_year}-04-02~{query_year}-04-15",
        f"{query_year}-04-16~{query_year}-04-22",
        f"{query_year}-04-23~{query_year}-04-26",
        f"{query_year}-04-27~{query_year}-04-28",
        f"{query_year}-04-29~{query_year}-04-30",
        f"{query_year}-05-01~{query_year}-07-31",
        f"{query_year}-08-01~{query_year}-10-31",
        f"{query_year}-11-01~{query_year}-11-30",
        f"{query_year}-12-01~{query_year}-12-31",
    ]
    all_results = []
    for segment in time_segments:
        all_results.extend(download_report(segment))

    workbook = openpyxl.Workbook()
    worksheet = workbook.active
    worksheet.title = "公众号 凌小添"
    worksheet.append(["公司代码", "公司简称", "标题", "年份", "年报链接"])

    # Compiled once instead of once per row.
    tag_pattern = re.compile(r"<.*?>")
    year_pattern = re.compile(r"(\d{4})年")

    for item in all_results:
        company_code = item["secCode"]
        company_name = item["secName"]
        adjunct_url = item["adjunctUrl"]
        # Drop markup tags and full-width colons from the title, then wrap it in 《》.
        title = tag_pattern.sub("", item["announcementTitle"].strip())
        title = f"《{title.replace('：', '')}》"
        if any(kw in title for kw in exclude_keywords):
            continue
        m = year_pattern.search(title)
        year_str = m.group(1) if m else report_year
        announcement_url = f"http://static.cninfo.com.cn/{adjunct_url}"
        worksheet.append([company_code, company_name, title, year_str, announcement_url])

    workbook.save(f"年报链接_{report_year}{GZH}.xlsx")

if __name__ == '__main__':
    # Titles containing any of these keywords are left out of the workbook.
    # '更正后' or '修订版' may be added to avoid duplicate rows or revised
    # editions of reports from earlier years.
    exclude_keywords = ['英文','已取消','摘要']
    # Industry filter ... see the original template for details.
    trade = ""
    plate = "sz;sh"

    # ========= Optional: custom company list, keep only these companies =========
    # Example: ['600519', '000001']; leave empty to keep everything.
    target_companies = []

    # Module-level state read by the functions above.
    counter = 1
    setYear = 2023
    Flag = 0

    def filter_excel(year):
        """Write a copy of the year's workbook that keeps only the target companies.

        Does nothing when ``target_companies`` is empty. Any error is reported
        on the console and otherwise ignored.
        """
        if not target_companies:
            return
        try:
            source_book = openpyxl.load_workbook(f"年报链接_{year}{GZH}.xlsx")
            all_rows = list(source_book.active.iter_rows(values_only=True))
            header = all_rows[0]

            filtered_book = openpyxl.Workbook()
            out_sheet = filtered_book.active
            out_sheet.append(header)
            for row in all_rows[1:]:
                # A row is kept when its first or second column matches an entry.
                if str(row[0]) in target_companies or str(row[1]) in target_companies:
                    out_sheet.append(row)

            filtered_book.save(f"年报链接_{year}_filtered.xlsx")
            print(f"---- 过滤完成: 年报链接_{year}_filtered.xlsx 已生成")
        except Exception as e:
            print(f"---- 过滤失败: {e}，已跳过")

    # A truthy Flag processes 2020-2023 in turn; otherwise only setYear is processed.
    for setYear in (range(2020, 2024) if Flag else (setYear,)):
        counter = 1
        main(setYear)
        filter_excel(setYear)
        print(f"---- {setYear}年下载完成")


下载完成，正在保存…….65 %

OSError: [Errno 30] Read-only file system: '年报链接_2023【公众号：凌小添】.xlsx'