In [None]:


from pathlib import Path
import pandas as pd
from pypinyin import lazy_pinyin, Style

base_dir = Path('csv_by_category')  # 相对当前 Notebook 所在目录
enc_candidates = ["utf-8-sig", "utf-8", "gb18030", "gbk"]

def build_display_name(val: object) -> str:
    if pd.isna(val):
        return ""
    s = str(val).strip()
    if not s:
        return ""
    py = ''.join(lazy_pinyin(s, style=Style.NORMAL, errors='default'))
    return f"{py}（{s}）"  # 拼音（原名字）

csv_files = list(base_dir.rglob('*.csv'))
print(f"发现 CSV 文件数：{len(csv_files)}")

for csv_path in csv_files:
    used_enc = None
    last_err = None
    for enc in enc_candidates:
        try:
            df = pd.read_csv(csv_path, encoding=enc)
            used_enc = enc
            break
        except Exception as e:
            last_err = e
    if used_enc is None:
        print(f"读取失败：{csv_path}，错误：{last_err}")
        continue

    if '名字' not in df.columns:
        print(f"跳过（无“名字”列）：{csv_path}")
        continue

    df['显示名字'] = df['名字'].apply(build_display_name)

    try:
        df.to_csv(csv_path, index=False, encoding=used_enc)
        print(f"已更新：{csv_path}（编码：{used_enc}）")
    except Exception as e:
        print(f"写回失败：{csv_path}，错误：{e}")

发现 CSV 文件数：46
已更新：csv_by_category\乐舞\乐名.csv（编码：utf-8-sig）
已更新：csv_by_category\乐舞\乐器.csv（编码：utf-8-sig）
已更新：csv_by_category\乐舞\舞名.csv（编码：utf-8-sig）
已更新：csv_by_category\人物\人名.csv（编码：utf-8-sig）
已更新：csv_by_category\人物\尸.csv（编码：utf-8-sig）
已更新：csv_by_category\人物\神名.csv（编码：utf-8-sig）
已更新：csv_by_category\其他\其他.csv（编码：utf-8-sig）
已更新：csv_by_category\其他\视肉.csv（编码：utf-8-sig）
已更新：csv_by_category\动物\兽名.csv（编码：utf-8-sig）
已更新：csv_by_category\动物\虫名.csv（编码：utf-8-sig）
已更新：csv_by_category\动物\螺.csv（编码：utf-8-sig）
已更新：csv_by_category\动物\贝.csv（编码：utf-8-sig）
已更新：csv_by_category\动物\鱼名.csv（编码：utf-8-sig）
已更新：csv_by_category\动物\鸟名.csv（编码：utf-8-sig）
已更新：csv_by_category\动物\龟鳖.csv（编码：utf-8-sig）
已更新：csv_by_category\器物\兵器.csv（编码：utf-8-sig）
已更新：csv_by_category\器物\刑具.csv（编码：utf-8-sig）
已更新：csv_by_category\器物\器物.csv（编码：utf-8-sig）
已更新：csv_by_category\器物\服饰.csv（编码：utf-8-sig）
已更新：csv_by_category\地名\山名.csv（编码：utf-8-sig）
已更新：csv_by_category\地名\山系.csv（编码：utf-8-sig）
已更新：csv_by_category\地名\建筑.csv（编码：utf-8-sig）
已更新：csv_by_category\地

In [None]:
from pathlib import Path
import os
import sys
import pandas as pd
import requests
import time
import random


# 基本参数
BASE_DIR = Path("csv_by_category")
TO_LANG = "en"  # 目标语言，可改为 "ja"、"ko"、"fr" 等

# 读取 Azure Translator 配置
AZ_ENDPOINT = 'https://api.cognitive.microsofttranslator.com/'
AZ_KEY = os.getenv("AZURE_TRANSLATOR_KEY")
AZ_REGION = 'global'

if not AZ_KEY or not AZ_REGION:
    print("缺少环境变量 AZURE_TRANSLATOR_KEY 或 AZURE_TRANSLATOR_REGION。请先 setx 后重试。")
    sys.exit(1)

def translate_texts(texts, to_lang=TO_LANG, max_retries=5):
    """使用 Azure 翻译，带限流重试"""
    if not texts:
        return []
    url = AZ_ENDPOINT.rstrip("/") + "/translate"
    params = {"api-version": "3.0", "to": to_lang}
    headers = {
        "Ocp-Apim-Subscription-Key": AZ_KEY,
        "Ocp-Apim-Subscription-Region": AZ_REGION,
        "Content-Type": "application/json",
    }
    body = [{"text": t} for t in texts]

    for attempt in range(max_retries):
        resp = requests.post(url, params=params, headers=headers, json=body, timeout=30)
        if resp.status_code == 429:
            ra = resp.headers.get("Retry-After")
            delay = float(ra) if ra else min(2 ** attempt, 16) + random.uniform(0, 0.5)
            time.sleep(delay)
            continue
        try:
            resp.raise_for_status()
        except Exception:
            # 对其它错误做一次短暂退避
            time.sleep(min(2 ** attempt, 8))
            if attempt < max_retries - 1:
                continue
            raise
        data = resp.json()
        return [item.get("translations", [{}])[0].get("text", "") for item in data]

    # 理论到不了这里
    return [""] * len(texts)

def safe_read_csv(p: Path):
    """简单编码回退读取"""
    for enc in ("utf-8-sig", "utf-8", "gb18030", "gbk"):
        try:
            df = pd.read_csv(p, encoding=enc)
            return df, enc
        except Exception:
            continue
    raise RuntimeError("无法读取，编码不支持")

def main():
    csv_files = list(BASE_DIR.rglob("*.csv"))
    print(f"发现 CSV：{len(csv_files)} 个")

    for csv_path in csv_files:
        try:
            df, used_enc = safe_read_csv(csv_path)
        except Exception as e:
            print(f"读取失败：{csv_path} -> {e}")
            continue

        if "prompt" not in df.columns:
            print(f"跳过（无 prompt 列）：{csv_path}")
            continue

        # 只翻译缺失的行
        if "prompt翻译" not in df.columns:
            df["prompt翻译"] = None
        mask_need = (
            df["prompt"].notna()
            & (df["prompt"].astype(str).str.strip() != "")
            & (df["prompt翻译"].isna() | (df["prompt翻译"].astype(str).str.strip() == ""))
        )
        idx_list = df.index[mask_need].tolist()
        texts = [str(df.at[i, "prompt"]).strip() for i in idx_list]

        BATCH = 20  # 减小批量，降低 429 概率
        results = []
        for s in range(0, len(texts), BATCH):
            batch = texts[s:s+BATCH]
            try:
                results.extend(translate_texts(batch))
            except Exception as e:
                results.extend([""] * len(batch))
                print(f"翻译失败（{csv_path}, 批 {s//BATCH}）：{e}")
            time.sleep(0.5)  # 批次间暂停，进一步避免 429

        for i, res in zip(idx_list, results):
            df.at[i, "prompt翻译"] = res

        try:
            df.to_csv(csv_path, index=False, encoding=used_enc)
            print(f"已更新：{csv_path}")
        except Exception as e:
            print(f"写入失败：{csv_path} -> {e}")

if __name__ == "__main__":
    main()

发现 CSV：46 个
已更新：csv_by_category\乐舞\乐名.csv
已更新：csv_by_category\乐舞\乐器.csv
已更新：csv_by_category\乐舞\舞名.csv
已更新：csv_by_category\人物\人名.csv
已更新：csv_by_category\人物\尸.csv
已更新：csv_by_category\人物\神名.csv


  df.at[i, "prompt翻译"] = res


已更新：csv_by_category\其他\其他.csv


  df.at[i, "prompt翻译"] = res


已更新：csv_by_category\其他\视肉.csv


  df.at[i, "prompt翻译"] = res


已更新：csv_by_category\动物\兽名.csv


  df.at[i, "prompt翻译"] = res


已更新：csv_by_category\动物\虫名.csv


  df.at[i, "prompt翻译"] = res


已更新：csv_by_category\动物\螺.csv


  df.at[i, "prompt翻译"] = res


已更新：csv_by_category\动物\贝.csv


  df.at[i, "prompt翻译"] = res


已更新：csv_by_category\动物\鱼名.csv


  df.at[i, "prompt翻译"] = res


已更新：csv_by_category\动物\鸟名.csv


  df.at[i, "prompt翻译"] = res


已更新：csv_by_category\动物\龟鳖.csv


  df.at[i, "prompt翻译"] = res


已更新：csv_by_category\器物\兵器.csv


  df.at[i, "prompt翻译"] = res


已更新：csv_by_category\器物\刑具.csv


  df.at[i, "prompt翻译"] = res


已更新：csv_by_category\器物\器物.csv


  df.at[i, "prompt翻译"] = res


已更新：csv_by_category\器物\服饰.csv


  df.at[i, "prompt翻译"] = res


已更新：csv_by_category\地名\山名.csv


  df.at[i, "prompt翻译"] = res


已更新：csv_by_category\地名\山系.csv


  df.at[i, "prompt翻译"] = res


已更新：csv_by_category\地名\建筑.csv


  df.at[i, "prompt翻译"] = res


已更新：csv_by_category\地名\森林.csv
已更新：csv_by_category\地名\水名.csv
已更新：csv_by_category\地名\泽名.csv
已更新：csv_by_category\地名\洞穴.csv
已更新：csv_by_category\地名\海名.csv
已更新：csv_by_category\地名\渊名.csv
已更新：csv_by_category\地名\荒漠.csv
已更新：csv_by_category\地名\谷野名.csv
已更新：csv_by_category\地名\郡国名.csv
已更新：csv_by_category\植物\木名.csv
已更新：csv_by_category\植物\草名.csv
已更新：csv_by_category\植物\谷物.csv
已更新：csv_by_category\疾病\疾病.csv
已更新：csv_by_category\祭祀\祭法.csv
已更新：csv_by_category\祭祀\祭祀用兽.csv
已更新：csv_by_category\祭祀\祭祀用器.csv
已更新：csv_by_category\祭祀\祭祀用玉.csv
已更新：csv_by_category\祭祀\祭祀用禽.csv
已更新：csv_by_category\祭祀\祭祀用粮.csv
已更新：csv_by_category\祭祀\祭祀用酒.csv
已更新：csv_by_category\自然\气候.csv
已更新：csv_by_category\自然\矿名.csv
已更新：csv_by_category\食物\粮食.csv
已更新：csv_by_category\食物\食物.csv


In [5]:
import pandas as pd
import re
from pypinyin import lazy_pinyin, Style

CSV_PATH = "shanhaijing_cooccurrence_expanded_result.csv"

def to_pinyin(text: str) -> str:
    s = str(text).strip()
    if not s:
        return ""
    py = ''.join(lazy_pinyin(s, style=Style.NORMAL, errors='default'))
    return f"{py}（{s}）"

def split_names(cell: object):
    if pd.isna(cell):
        return []
    parts = re.split(r'[，,]', str(cell))
    return [p.strip() for p in parts if p.strip()]

try_encodings = ["utf-8-sig", "utf-8", "gb18030", "gbk"]
df = None
for enc in try_encodings:
    try:
        df = pd.read_csv(CSV_PATH, encoding=enc)
        used_enc = enc
        break
    except Exception:
        continue

if df is None:
    print("读取失败，检查文件或编码。")
else:
    if "名字" in df.columns:
        df["名字（拼音）"] = df["名字"].apply(to_pinyin)
    else:
        print("缺少“名字”列，跳过。")

    if "相关人物" in df.columns:
        df["相关人物（拼音）"] = df["相关人物"].apply(
            lambda cell: ",".join(to_pinyin(name) for name in split_names(cell))
        )
    else:
        print("缺少“相关人物”列，跳过。")

    try:
        df.to_csv(CSV_PATH, index=False, encoding=used_enc)
        print(f"已更新文件：{CSV_PATH}（编码：{used_enc}）")
    except Exception as e:
        print(f"写入失败：{e}")

已更新文件：shanhaijing_cooccurrence_expanded_result.csv（编码：utf-8-sig）


In [None]:
import os
import re
import time
from pathlib import Path
import requests

ROOT = Path("csv_by_category_English")
BATCH = 50
API_VERSION = "3.0"
TO_LANG = "en"

AZ_ENDPOINT = 'https://api.cognitive.microsofttranslator.com/'
AZ_KEY = os.getenv("AZURE_TRANSLATOR_KEY")
AZ_REGION = 'global'

if not AZ_KEY:
    raise SystemExit("未找到环境变量 AZURE_TRANSLATOR_KEY")

invalid_chars = r'<>:"/\\|?*'
sanitize_tbl = str.maketrans({c: " " for c in invalid_chars})

def sanitize(name: str) -> str:
    s = (name or "").strip().translate(sanitize_tbl)
    s = re.sub(r"\s+", " ", s).strip(" .")
    return s or "untitled"

def need_process(name: str) -> bool:
    # 已经是 English（中文） 形式就跳过（粗略判断，有中文括号且包含中文字符）
    return not (("（" in name and "）" in name and re.search(r"[\u4e00-\u9fff]", name)))

def translate_batch(texts):
    if not texts:
        return []
    url = f"{AZ_ENDPOINT}/translate"
    params = {"api-version": API_VERSION, "to": TO_LANG}
    headers = {
        "Ocp-Apim-Subscription-Key": AZ_KEY,
        "Ocp-Apim-Subscription-Region": AZ_REGION,
        "Content-Type": "application/json",
    }
    body = [{"text": t} for t in texts]
    for attempt in range(5):
        r = requests.post(url, params=params, headers=headers, json=body, timeout=30)
        if r.status_code == 429:
            time.sleep(1 + attempt)
            continue
        r.raise_for_status()
        data = r.json()
        return [item.get("translations", [{}])[0].get("text", "") for item in data]
    return [""] * len(texts)

def ensure_unique(target: Path) -> Path:
    if not target.exists():
        return target
    stem, suffix = target.stem, target.suffix
    i = 1
    while True:
        cand = target.with_name(f"{stem} ({i}){suffix}")
        if not cand.exists():
            return cand
        i += 1

def format_new(english: str, original: str) -> str:
    e = english.strip()
    if not e:
        e = original  # 回退
    e = e[:1].upper() + e[1:]
    return f"{e}（{original}）"

def main():
    if not ROOT.exists():
        raise SystemExit(f"目录不存在: {ROOT}")

    # 收集文件与目录
    csv_files = list(ROOT.rglob("*.csv"))
    dirs = sorted([p for p in ROOT.rglob("*") if p.is_dir()],
                  key=lambda p: len(p.relative_to(ROOT).parts),
                  reverse=True)

    # 需要翻译的名称集合（文件 stem 与目录名）
    names_to_translate = []
    for p in csv_files:
        if need_process(p.stem):
            names_to_translate.append(p.stem)
    for d in dirs:
        if need_process(d.name):
            names_to_translate.append(d.name)

    names_to_translate = list(dict.fromkeys(names_to_translate))  # 去重保持顺序

    # 批量翻译
    mapping = {}
    for i in range(0, len(names_to_translate), BATCH):
        batch = names_to_translate[i:i+BATCH]
        trans = translate_batch(batch)
        for orig, eng in zip(batch, trans):
            mapping[orig] = eng

    # 重命名文件
    for f in csv_files:
        orig_cn = f.stem
        if not need_process(orig_cn):
            continue
        eng = mapping.get(orig_cn, "")
        new_base = format_new(eng, orig_cn)
        new_base = sanitize(new_base)
        if new_base == f.stem:
            continue
        target = ensure_unique(f.with_name(new_base + f.suffix))
        try:
            f.rename(target)
            print(f"[文件] {f.name} -> {target.name}")
        except Exception as e:
            print(f"[文件] 失败 {f} -> {target.name}: {e}")

    # 重命名目录（自底向上）
    for d in dirs:
        orig_cn = d.name
        if not need_process(orig_cn):
            continue
        eng = mapping.get(orig_cn, "")
        new_name = sanitize(format_new(eng, orig_cn))
        if new_name == d.name:
            continue
        target = ensure_unique(d.with_name(new_name))
        try:
            d.rename(target)
            print(f"[目录] {orig_cn} -> {target.name}")
        except Exception as e:
            print(f"[目录] 失败 {d} -> {new_name}: {e}")

if __name__ == "__main__":
    main()

In [4]:
# 将本单元作为一个新 Cell 运行，或替换你用于“关系”翻译的那个 Cell

import os
import sys
import re
import time
import random
import requests
import pandas as pd

CSV_PATH = "shanhaijing_cooccurrence_expanded_result.csv"
TO_LANG = "en"

# Azure Translator 配置（需先配置 AZURE_TRANSLATOR_KEY；区域一般用 global）
AZ_ENDPOINT = "https://api.cognitive.microsofttranslator.com/translate"
AZ_KEY = os.getenv("AZURE_TRANSLATOR_KEY")
AZ_REGION = "global"

if not AZ_KEY:
    print("缺少环境变量 AZURE_TRANSLATOR_KEY")
    sys.exit(1)

# 读取 CSV（简单编码回退）
def safe_read_csv(path):
    for enc in ("utf-8-sig", "utf-8", "gb18030", "gbk"):
        try:
            df = pd.read_csv(path, encoding=enc)
            return df, enc
        except Exception:
            pass
    raise RuntimeError("无法读取 CSV，请检查文件和编码")

# 仅翻译给定文本列表（带限流重试）
def translate_texts(texts, to_lang=TO_LANG, max_retries=6):
    if not texts:
        return []
    headers = {
        "Ocp-Apim-Subscription-Key": AZ_KEY,
        "Ocp-Apim-Subscription-Region": AZ_REGION,
        "Content-Type": "application/json",
    }
    params = {"api-version": "3.0", "to": to_lang}
    body = [{"text": t} for t in texts]

    for attempt in range(max_retries):
        resp = requests.post(AZ_ENDPOINT, params=params, headers=headers, json=body, timeout=30)
        # 429 限流：遵循 Retry-After，否则指数退避 + 抖动
        if resp.status_code == 429:
            ra = resp.headers.get("Retry-After")
            delay = float(ra) if ra else min(2 ** attempt, 16) + random.uniform(0, 0.5)
            time.sleep(delay)
            continue
        # 其它 5xx 当成瞬时错误重试
        if 500 <= resp.status_code < 600:
            delay = min(2 ** attempt, 16) + random.uniform(0, 0.5)
            time.sleep(delay)
            continue

        resp.raise_for_status()
        data = resp.json()
        return [item.get("translations", [{}])[0].get("text", "") for item in data]

    # 超过重试：返回空串占位，避免中断
    return [""] * len(texts)

# 分割“关系列”里的多值（支持中文逗号、顿号、分号等常见分隔符）
def split_relations(cell):
    if pd.isna(cell):
        return []
    parts = re.split(r"[，,、;；/|]+", str(cell))
    return [p.strip() for p in parts if p.strip()]

# 主流程
df, used_enc = safe_read_csv(CSV_PATH)

# 猜测“关系列”的列名
src_candidates = ["关系列", "关系", "关系类型"]
src_col = next((c for c in src_candidates if c in df.columns), None)
if not src_col:
    print(f"未找到关系列（候选：{src_candidates}），不做修改。")
    sys.exit(0)

# 1) 收集唯一关系词（按分隔符拆分，去空去重）
unique_tokens = sorted({
    tok
    for cell in df[src_col].dropna()
    for tok in split_relations(cell)
})
print("发现的关系种类（拆分后）：", unique_tokens)

# 2) 翻译唯一关系词（分批 + 批次间暂停）
BATCH = 20
translated = []
for i in range(0, len(unique_tokens), BATCH):
    batch = unique_tokens[i:i+BATCH]
    try:
        translated.extend(translate_texts(batch))
    except Exception as e:
        translated.extend([""] * len(batch))
        print(f"翻译失败（批 {i//BATCH}）：{e}")
    time.sleep(0.5)  # 批次间暂停，降低 429 概率

mapping = dict(zip(unique_tokens, translated))
print("关系翻译表：", mapping)

# 3) 构造新列 relation：逐行拆分 -> 映射 -> 拼回
def translate_cell(cell):
    toks = split_relations(cell)
    if not toks:
        return ""
    # 翻译缺失则回退原中文，避免信息丢失
    return ",".join(mapping.get(t, "") or t for t in toks)

df["relation"] = df[src_col].apply(translate_cell)

# 写回
df.to_csv(CSV_PATH, index=False, encoding=used_enc)
print(f"已写回：{CSV_PATH}（编码：{used_enc}）")

发现的关系种类（拆分后）： ['"彘身而八足', '“刉”是一种祭祀时的宰杀方式', '“投”指的是黄帝将峚山的玉荣投到钟山之阳的动作', '一牝豚刉”', '上', '下', '下友', '不', '不产生', '不厌', '东', '东临', '东北', '东南', '东南注', '东南注江', '东望', '东至', '临', '临于', '临近', '为', '为败', '为首', '主要对象“投”和待分析对象“盡澤”之间的关系是：无关。原文中', '举', '乘', '争神', '争神 断葬', '二者在文本中没有直接关联。', '交', '产', '产出', '产有', '产生', '产自', '享用', '代', '代关系：替代', '代替', '令', '令生', '伐', '伴随', '伺', '似', '位于', '作为', '作为席子使用', '作为槛', '作为祭品', '佩带', '佩戴', '佩戴\r\n践踏', '佩戴\r\n踩踏', '佩戴 踏踩', '使', '使不', '使役', '使用', '侄子', '供', '供奉', '供祭', '依据', '倒', '倒祠', '倒置', '像', '像声', '儛', '兄弟', '入', '共事', '共处', '共存', '共居', '共葬', '关押', '关系：包含', '关系：合', '关联', '关联关系：配偶', '具', '具有', '养育', '出', '出入', '出入必以', '出卫', '出注', '出现', '出现于', '出现则国家多', '出现则该县多土功', '出现在', '出生地', '出產', '出産', '出自', '出衛', '出衛於', '刉', '创作', '创建', '创造', '到', '到访', '到达', '制作', '制成', '制造', '削', '加', '动词：包含', '动词：有', '动词：有 \r\n\r\n这里的关系是描述性的', '动词：用  \r\n\r\n解释：原文中提到“其祠：毛用一雄鷄', '包含', '包裹', '化为', '北', '北旁', '北望', '北江出曼山', '北置', '升降', '协助', '南', '南有', '南望', '南流注于', '卫', '即彘（猪）身体上有着蛇的尾巴。但若严