# In [7]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

"""
02_collect_offering_links.py
功能：按《正股映射结果.xlsx》的公司集合，从巨潮接口检索“募集说明书”，
     只保留与“可转债/可转换公司债券”相关的最新版；若标题不写“可转债”，
     也保留该公司“通用募集说明书”的最新版（排除优先股/非可转债公司债/股票增发等）。
输出：与年报版相同列顺序；每家公司仅一行（最新版）。
"""

import os
import re
import time
import requests
import openpyxl
import pandas as pd
from typing import List, Dict, Tuple
from datetime import date

# ========= Tunable parameters =========
# Overall search window; collect_all_offering() splits it into one segment per
# calendar year. DATE_END is evaluated once, when the module is loaded.
DATE_BEGIN = "2000-01-01"
DATE_END   = date.today().strftime("%Y-%m-%d")

# NOTE: the output directory is created as a side effect of loading this module.
output_dir   = os.path.expanduser("~/Desktop/cninfo_output")
os.makedirs(output_dir, exist_ok=True)
# Input workbook; the code reads its "正股代码", "正股名称", "代码" and "名称" columns.
mapping_file = os.path.expanduser("~/Desktop/正股映射结果.xlsx")

# Output workbook path and the title given to its single worksheet.
OUT_XLSX = os.path.join(output_dir, "募集说明书链接_选取公司【募集】.xlsx")
SHEET    = "募集说明书"

# Markets / boards: MARKETS values are sent as the `column` form field and
# PLATES values as the `plate` form field of every query (all combinations).
MARKETS = ["szse", "shse"]
PLATES  = ["", "sz", "szmb", "szcy", "sh", "shmb", "shkcp", "bj"]

# Positive keywords that identify a convertible-bond title
# (used only to classify results, never sent in the request).
CONV_POS_KWS = ["可转换公司债券", "可转债", "可转换债券", "可转债券"]

# Hard veto: a title containing any of these substrings is always dropped.
# Matching is plain substring containment after whitespace removal.
HARD_EXCLUDE = [
    "英文", "英文版", "Annual", "annual", "Summary", "summary",
    "摘要", "摘要版", "附录", "备查文件", "取消", "已取消",
    "提示性公告", "公告", "更名提示",
    "确认意见", "审核意见", "核准情况", "监事", "全体监事",
    "控股股东", "实际控制人", "一致行动人",
    "GDR", "全球存托凭证", "瑞士", "价格区间", "批准的公告",
    "公开增发", "增发"
]

# Instrument types that are explicitly "not a convertible bond".
EQUITY_EXCLUDE = [  # pure equity offerings
    "向特定对象发行股票", "以简易程序向特定对象发行股票", "发行A股股票", "非公开发行股票",
]
NON_CONV_BOND_EXCLUDE = [  # non-convertible corporate bonds / preferred shares
    "优先股", "公司债券", "绿色公司债", "科技创新公司债", "扶贫专项", "乡村振兴", "可续期公司债",
]

# ========= 工具函数 =========
# Runs of ASCII/Unicode whitespace, including NBSP (U+00A0) and the
# ideographic space (U+3000).
_SPACE_RE = re.compile(r"[\s\u00A0\u3000]+")


def norm(s: str) -> str:
    """Return *s* as a string with every whitespace character removed.

    Falsy inputs (``None``, ``""``, ``0`` ...) normalise to the empty string;
    any other value is converted with ``str()`` first.
    """
    text = str(s) if s else ""
    return _SPACE_RE.sub("", text.strip())

def any_kw_in(text: str, kws: List[str]) -> bool:
    """Return True if any non-empty keyword occurs in *text*.

    Both the text and each keyword are whitespace-normalised with ``norm``
    before the substring test; falsy keywords are ignored.
    """
    haystack = norm(text)
    for kw in kws:
        if not kw:
            continue
        if norm(kw) in haystack:
            return True
    return False

def contains_any(text: str, kws: List[str]) -> bool:
    """Return True if *text* contains at least one of *kws*.

    Same contract as ``any_kw_in``: text and keywords are compared after
    whitespace normalisation, and falsy keywords are skipped.
    """
    normalized = norm(text)
    hits = (norm(kw) in normalized for kw in kws if kw)
    return any(hits)

def extract_year_from_title(title: str) -> str:
    """Return the first 19xx/20xx four-digit run in *title*, or "" if none.

    The title is whitespace-normalised first, so digits separated only by
    spaces are treated as adjacent.
    """
    found = re.search(r"(19\d{2}|20\d{2})", norm(title))
    if found is None:
        return ""
    return found.group(1)

def year_from_timestamp_ms(ts: int) -> str:
    """Return the four-digit year of a millisecond epoch timestamp.

    The conversion uses the machine's local timezone. Any value that cannot
    be converted (non-numeric, ``None``, out of range) yields "".
    """
    try:
        seconds = int(ts) / 1000
        local_time = time.localtime(seconds)
        year = time.strftime("%Y", local_time)
    except Exception:
        return ""
    return year

def is_summary_or_bad(title: str) -> bool:
    """Return True if *title* hits the hard-veto list (HARD_EXCLUDE).

    That list covers summaries, appendices, notices, confirmation opinions
    and similar documents that must never be selected.
    """
    return contains_any(norm(title), HARD_EXCLUDE)

def is_pure_equity(title: str) -> bool:
    """Return True if *title* contains any EQUITY_EXCLUDE keyword."""
    # any_kw_in normalises the title itself, so no explicit norm() is needed.
    return any_kw_in(title, EQUITY_EXCLUDE)

def is_non_convertible_bond(title: str) -> bool:
    """Return True for non-convertible corporate bonds / preferred shares.

    A title that mentions a convertible bond is never flagged, even though
    "可转换公司债券" itself contains the excluded substring "公司债券".
    """
    t = norm(title)
    convertible_markers = ("可转换公司债券", "可转债", "可转换债券")
    mentions_convertible = any(marker in t for marker in convertible_markers)
    return (not mentions_convertible) and contains_any(t, NON_CONV_BOND_EXCLUDE)

def is_convertible(title: str) -> bool:
    """Return True if *title* contains any convertible-bond keyword (CONV_POS_KWS)."""
    # any_kw_in normalises the title itself, so no explicit norm() is needed.
    return any_kw_in(title, CONV_POS_KWS)

def ann_weight(it: Dict) -> Tuple[int, int, int]:
    """Build the sort key used to pick the best announcement for a company.

    The key is ``(score, timestamp, title_length)``; larger compares as better:
      1. score: -5 if the title contains "摘要" (summary), +10 if it contains
         any of "更新" / "修订" / "更正" (updated / revised / corrected);
      2. timestamp: ``announcementTime`` as an int, 0 if missing or invalid;
      3. title length as a final tie-breaker (a longer title is usually the
         full document).
    """
    title = norm(it.get("announcementTitle", ""))

    score = -5 if "摘要" in title else 0
    if contains_any(title, ["更新", "修订", "更正"]):
        score += 10

    try:
        stamp = int(it.get("announcementTime", 0))
    except Exception:
        stamp = 0

    return score, stamp, len(title)

# ========= 调接口 =========
def get_report(page_num: int, date_range: str, column: str, plate: str) -> requests.Response:
    """POST one page of the cninfo announcement search and return the response.

    The search keyword is fixed to "募集说明书"; 30 results are requested per
    page, sorted by code ascending.

    Args:
        page_num: Page index, sent as ``pageNum``.
        date_range: Date window, sent as ``seDate``.
        column: Market identifier, sent as ``column``.
        plate: Board identifier, sent as ``plate`` (may be empty).

    Returns:
        The raw ``requests.Response``; the status code is not checked here.
    """
    endpoint = "http://www.cninfo.com.cn/new/hisAnnouncement/query"
    request_headers = {
        "Accept":           "*/*",
        "Accept-Language":  "zh-CN,zh;q=0.9",
        "Content-Type":     "application/x-www-form-urlencoded; charset=UTF-8",
        "Origin":           "http://www.cninfo.com.cn",
        "Referer":          "http://www.cninfo.com.cn/new/commonUrl/pageOfSearch?url=disclosure/list/search",
        "User-Agent":       "Mozilla/5.0",
        "X-Requested-With": "XMLHttpRequest"
    }
    form = {
        "pageNum":   page_num,
        "pageSize":  30,
        "column":    column,
        "tabName":   "fulltext",
        "category":  "",
        "plate":     plate,
        "searchkey": "募集说明书",
        "secid":     "",
        "trade":     "",
        "seDate":    date_range,
        "sortName":  "code",
        "sortType":  "asc",
        "isHLtitle": "false"
    }
    response = requests.post(endpoint, data=form, headers=request_headers, timeout=15)
    return response

def _fetch_page_json(page_num: int, date_range: str, column: str, plate: str,
                     attempts: int = 3, pause: float = 1.2):
    """Fetch one result page and return its JSON dict, or None if every attempt fails."""
    for attempt in range(attempts):
        try:
            resp = get_report(page_num, date_range, column, plate)
            resp.raise_for_status()
            data = resp.json()
        except (requests.RequestException, ValueError):
            # Network/HTTP error or a body that is not valid JSON.
            data = None
        if isinstance(data, dict):
            return data
        if attempt < attempts - 1:
            time.sleep(pause)
    return None


def _is_candidate_title(title: str) -> bool:
    """Return True if a title survives the hard-veto, equity and non-convertible filters."""
    t = norm(title)
    return not (is_summary_or_bad(t) or is_pure_equity(t) or is_non_convertible_bond(t))


def download_for_segments(segments: List[str], column: str, plate: str) -> List[Dict]:
    """Download and pre-filter announcements for every date segment.

    For each segment the first page is fetched (with retries) to learn the
    page count, then the remaining pages are fetched. Announcements whose
    titles hit any exclusion list are dropped, and results are de-duplicated
    on ``(secCode, adjunctUrl)`` within this call.

    This is best-effort: a segment or page that still fails after all retries
    is skipped with a printed warning instead of raising.

    Args:
        segments: Date windows, each passed to the query as ``seDate``.
        column: Market identifier forwarded to ``get_report``.
        plate: Board identifier forwarded to ``get_report``.

    Returns:
        The raw announcement dicts that passed the filters, in download order.
    """
    all_ann: List[Dict] = []
    seen = set()

    for date_range in segments:
        first = _fetch_page_json(1, date_range, column, plate)
        if first is None:
            print(f"⚠️ 请求失败，已跳过整段：column={column} plate={plate!r} seDate={date_range}")
            continue

        # `totalpages` may be missing or null in the payload; treat both as 0.
        try:
            total = int(first.get("totalpages") or 0)
        except (TypeError, ValueError):
            total = 0
        # NOTE(review): this trusts the server's `totalpages` as the last page
        # index — confirm it is not an off-by-one (floor) count.
        if total <= 0:
            continue

        for page in range(1, total + 1):
            # Page 1 is already in hand from the page-count probe.
            data = first if page == 1 else _fetch_page_json(page, date_range, column, plate)
            if data is None:
                print(f"⚠️ 请求失败，已跳过该页：column={column} plate={plate!r} "
                      f"seDate={date_range} page={page}")
                continue

            for x in data.get("announcements") or []:
                if not isinstance(x, dict):
                    continue
                if not _is_candidate_title(x.get("announcementTitle", "") or ""):
                    continue

                key = (str(x.get("secCode", "")), str(x.get("adjunctUrl", "")))
                if key in seen:
                    continue
                seen.add(key)
                all_ann.append(x)

    return all_ann

def collect_all_offering() -> List[Dict]:
    """Query every market/board combination over the configured date window.

    The window DATE_BEGIN..DATE_END is split into one segment per calendar
    year (to avoid the interface truncating results); the first and last
    segments are clipped to the configured begin/end dates.

    Returns:
        The concatenated results of ``download_for_segments`` for each
        (market, board) pair, markets in the outer loop.
    """
    first_year = int(DATE_BEGIN[:4])
    last_year = int(DATE_END[:4])

    segments: List[str] = []
    for year in range(first_year, last_year + 1):
        lower = DATE_BEGIN if year <= first_year else f"{year}-01-01"
        upper = DATE_END if year >= last_year else f"{year}-12-31"
        segments.append(f"{lower}~{upper}")

    collected: List[Dict] = []
    for market, board in ((m, p) for m in MARKETS for p in PLATES):
        collected += download_for_segments(segments, market, board)
    return collected

# ========= 去重（公司层面只保留一条）=========
def pick_latest_per_company(anns: List[Dict], valid_secs: set) -> List[Dict]:
    """Choose one announcement per company.

    Only announcements whose ``secCode`` is in *valid_secs* are considered.
    Within a company, titles recognised as convertible-bond prospectuses are
    preferred; if there are none, all of that company's remaining (generic)
    prospectuses compete instead. The winner is the entry with the largest
    ``ann_weight`` key.

    Returns:
        One announcement dict per company, in order of first appearance.
    """
    grouped: Dict[str, List[Dict]] = {}
    for ann in anns:
        code = str(ann.get("secCode", ""))
        if not code or code not in valid_secs:
            continue
        grouped.setdefault(code, []).append(ann)

    winners: List[Dict] = []
    for candidates in grouped.values():
        convertible = [
            c for c in candidates
            if is_convertible(c.get("announcementTitle", "") or "")
        ]
        # Fall back to the generic prospectuses when no title says "convertible".
        pool = convertible or candidates
        winners.append(max(pool, key=ann_weight))
    return winners

# ========= 写 Excel =========
def write_selected_excel(anns: List[Dict]) -> None:
    """Write one row per mapping-file entry to OUT_XLSX.

    The mapping workbook is re-read; each row's stock code (the part of
    "正股代码" before the first ".") is looked up among *anns* by ``secCode``.
    Rows without a matching announcement are still written, with empty
    title/year/link cells, so they can be checked by hand later.

    Args:
        anns: Selected announcements; if several share a ``secCode`` the last
            one wins.
    """
    df_map = pd.read_excel(mapping_file, dtype=str).fillna("")
    df_map["sec_no_suf"] = df_map["正股代码"].str.split(".").str[0]

    ann_dict: Dict[str, Dict] = {str(it.get("secCode", "")): it for it in anns}

    wb = openpyxl.Workbook()
    ws = wb.active
    ws.title = SHEET
    ws.append(["可转债代码", "可转债名称", "公司代码", "公司简称", "标题", "年份", "链接"])

    for _, row in df_map.iterrows():
        bond_code = row.get("代码", "")
        bond_name = row.get("名称", "")
        sec = row.get("sec_no_suf", "")
        comp_name = row.get("正股名称", "")

        it = ann_dict.get(sec)
        if not it:
            # Nothing was fetched: keep a blank row for later manual checking.
            ws.append([bond_code, bond_name, sec, comp_name, "", "", ""])
            continue

        # The field may be present but null; re.sub() would raise on None.
        raw_title = str(it.get("announcementTitle") or "")
        # Strip HTML tags and full-width colons from the title.
        raw_title = re.sub(r"<.*?>", "", raw_title).replace("：", "")
        title = f"《{raw_title}》"
        # Prefer a year printed in the title; fall back to the publish time.
        yr = extract_year_from_title(raw_title) or year_from_timestamp_ms(it.get("announcementTime", 0))
        # Only build a link when there is a path to link to.
        adjunct = str(it.get("adjunctUrl") or "").lstrip("/")
        url = f"http://static.cninfo.com.cn/{adjunct}" if adjunct else ""
        ws.append([bond_code, bond_name, sec, comp_name, title, yr, url])

    wb.save(OUT_XLSX)
    print(f"✅ 已输出：{OUT_XLSX}（工作表：{SHEET}）")

# ========= 主流程 =========
def main():
    """Run the pipeline: load the company set, download, select, write Excel."""
    mapping = pd.read_excel(mapping_file, dtype=str).fillna("")
    # Stock codes without the exchange suffix, e.g. the part before the first ".".
    codes = mapping["正股代码"].str.split(".").str[0]
    valid_secs = set(codes.tolist())

    picked = pick_latest_per_company(collect_all_offering(), valid_secs)
    write_selected_excel(picked)
    print("---- 可转债募集说明书检索完成 ----")


if __name__ == "__main__":
    main()
