In [1]:
import os
import csv
import glob

import pandas as pd

# 配置
BASE_DIR = r"d:\study\研一上\CHC5904\demo"
TEXT_DIR = os.path.join(BASE_DIR, "shanhaijing")
NAME_CSV = os.path.join(BASE_DIR, "merged.csv")
OUT_CSV = os.path.join(BASE_DIR, "shanhaijing_cooccurrence.csv")

# 读取名字列
df = pd.read_csv(NAME_CSV, dtype=str)
names = df["名字"].dropna().astype(str).str.strip().unique().tolist()
# 去掉空串
names = [n for n in names if n]

# 可选：按长度降序，减少短名字被长名字覆盖的影响
names_sorted = sorted(names, key=len, reverse=True)

rows = []

for txt_path in glob.glob(os.path.join(TEXT_DIR, "*.txt")):
    chapter = os.path.splitext(os.path.basename(txt_path))[0]
    # 读取文件（尝试常见编码）
    for enc in ("utf-8", "utf-8-sig", "gb18030"):
        try:
            with open(txt_path, "r", encoding=enc) as f:
                lines = f.readlines()
            break
        except UnicodeDecodeError:
            continue
    else:
        print(f"编码无法识别: {txt_path}")
        continue

    for raw_line in lines:
        line = raw_line.strip()
        if not line:
            continue
        hit = [n for n in names_sorted if n in line]
        # 去掉被更长名字完全包含的短名字（可选）
        filtered = []
        for n in hit:
            if any((n != m and n in m) for m in hit):
                # 如果 n 被更长 m 包含，且整行里没有明显分隔，可以选择保留或删除
                # 这里简单删除
                continue
            filtered.append(n)
        # 至少两个
        final_hits = filtered if len(filtered) >= 2 else []
        if final_hits:
            rows.append([line, ",".join(final_hits), chapter])

# 写出
with open(OUT_CSV, "w", newline="", encoding="utf-8-sig") as f:
    writer = csv.writer(f)
    writer.writerow(["原文行", "名字", "章节"])
    writer.writerows(rows)

print(f"完成，输出: {OUT_CSV} 总行数: {len(rows)}")

完成，输出: d:\study\研一上\CHC5904\demo\shanhaijing_cooccurrence.csv 总行数: 737


In [1]:
import os
import pandas as pd

# 读入上一部结果
BASE_DIR = r"d:\study\研一上\CHC5904\demo"
IN_CSV  = os.path.join(BASE_DIR, "shanhaijing_cooccurrence.csv")
OUT_CSV = os.path.join(BASE_DIR, "shanhaijing_cooccurrence_expanded.csv")

df = pd.read_csv(IN_CSV, dtype=str).fillna("")

rows = []
for _, r in df.iterrows():
    line = r["原文行"]
    chapter = r["章节"]
    names = [n.strip() for n in r["名字"].split(",") if n.strip()]
    # 对每个名字拆成一行，“相关人物”为同一行里的其他名字
    for n in names:
        others = [m for m in names if m != n]
        rows.append([n, line, ",".join(others), chapter])

out = pd.DataFrame(rows, columns=["名字", "原文行", "相关人物", "章节"])
out.to_csv(OUT_CSV, index=False, encoding="utf-8-sig")
print(f"完成，输出: {OUT_CSV} 共 {len(out)} 行")

完成，输出: d:\study\研一上\CHC5904\demo\shanhaijing_cooccurrence_expanded.csv 共 4680 行


In [8]:
from openai import OpenAI
import os

client = OpenAI(
    # 如果没有配置环境变量，请用API Key将下行替换为：api_key="sk-xxx"
    # 新加坡和北京地域的API Key不同。获取API Key：https://help.aliyun.com/zh/model-studio/get-api-key
    api_key=os.getenv("qwen"),
    # 以下是北京地域base_url，如果使用新加坡地域的模型，需要将base_url替换为：https://dashscope-intl.aliyuncs.com/compatible-mode/v1
    base_url="https://dashscope.aliyuncs.com/compatible-mode/v1",
)
main='甘棗之山'
object='𤠢鼠'
line='《中山經》薄山之首，曰甘棗之山。共水出焉，而西流注于河。其上多杻木，其下有草焉，葵本而杏葉，黃華而莢實，名曰蘀，可以已瞢。有獸焉，其狀如𤠢鼠而文題，其名曰㔮，食之已癭。'

completion = client.chat.completions.create(
    model="qwen-max",
    messages=[
        {
            "role": "system",
            "content": "你是《山海经》研究专家，我需要你分析山海经中指定的主要对象和待分析对象之间的关系，然后用简单的动词返回他们之间的关系，我会给你主要对象，待分析对象和原文内容，只返回关系。"
        },
        {
            "role": "user",
            "content": f"主要对象：{main}，待分析对象：{object}，原文内容：{line}", 
        },
    ],
    extra_body={"enable_search": True}
)

json_string = completion.choices[0].message.content
print(json_string)

OpenAIError: The api_key client option must be set either by passing api_key to the client or by setting the OPENAI_API_KEY environment variable

In [10]:
print(os.getenv("qwen"))

None
