In [1]:
#check
import os
import pandas as pd
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
import torch
import openpyxl
from tqdm import tqdm

# -----------------------------
# 1. 设置 Hugging Face 缓存到数据盘
# -----------------------------
os.environ["TRANSFORMERS_CACHE"] = "/root/autodl-tmp/hf_cache"
os.environ["HF_HOME"] = "/root/autodl-tmp/hf_home"
os.environ["HF_DATASETS_CACHE"] = "/root/autodl-tmp/hf_datasets"
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True,max_split_size_mb:128"

# -----------------------------
# 2. 模型路径
# -----------------------------
model_path = "/root/autodl-tmp/Qwen3-8B"

# -----------------------------
# 3. BitsAndBytes 4-bit 配置
# -----------------------------
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True,
)

# -----------------------------
# 4. 加载 tokenizer + 模型
# -----------------------------
print(">>> Loading tokenizer and model...")
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    model_path,
    device_map={"": 0},
    quantization_config=bnb_config,
    trust_remote_code=True
)

# -----------------------------
# 5. 批量检查函数
# -----------------------------
def check_metaphor(sentences, batch_size=2, max_new_tokens=64):
    results = []
    for i in tqdm(range(0, len(sentences), batch_size)):
        batch = sentences[i:i+batch_size]
        prompts = [
            f"以下の日本語文がメタファー（比喩表現）かどうか判断してください。返答は「メタファー」または「非メタファー」のみ。\n文: {s}"
            for s in batch
        ]
        for prompt in prompts:
            messages = [{"role": "user", "content": prompt}]
            text = tokenizer.apply_chat_template(
                messages,
                tokenize=False,
                add_generation_prompt=True,
                enable_thinking=False
            )
            model_inputs = tokenizer([text], return_tensors="pt").to(model.device)

            generated_ids = model.generate(
                **model_inputs,
                max_new_tokens=max_new_tokens,
                do_sample=False
            )
            # 解析输出，只取模型回答的最后一行
            output_text = tokenizer.decode(generated_ids[0], skip_special_tokens=True)
            last_line = output_text.strip().split("\n")[-1]
            results.append(last_line)

    return results

# -----------------------------
# 6. 读取 CSV
# -----------------------------
df = pd.read_excel("datasets/非隐喻.xlsx")
sentences = df["Sentence"].astype(str).tolist()

# -----------------------------
# 7. 批量检查
# -----------------------------
metaphor_results = check_metaphor(sentences, batch_size=8, max_new_tokens=64)

# -----------------------------
# 8. 保存结果
# -----------------------------
df["metaphor_check"] = metaphor_results
df.to_csv("datasets/literal_data_checked.csv", index=False, encoding="utf-8-sig")
print("✅ 检查完成，结果保存到 datasets/literal_dict_checked.csv")


  from .autonotebook import tqdm as notebook_tqdm


>>> Loading tokenizer and model...


Loading checkpoint shards: 100%|██████████| 5/5 [00:16<00:00,  3.31s/it]
  0%|          | 0/239 [00:00<?, ?it/s]The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
100%|██████████| 239/239 [08:43<00:00,  2.19s/it]

✅ 检查完成，结果保存到 datasets/literal_dict_checked.csv





In [None]:
#translate
import os
import pandas as pd
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
import torch
from tqdm import tqdm

# -----------------------------
# 1. 设置 Hugging Face 缓存到数据盘
# -----------------------------
os.environ["TRANSFORMERS_CACHE"] = "/root/autodl-tmp/hf_cache"
os.environ["HF_HOME"] = "/root/autodl-tmp/hf_home"
os.environ["HF_DATASETS_CACHE"] = "/root/autodl-tmp/hf_datasets"
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True,max_split_size_mb:128"

# -----------------------------
# 2. 模型路径
# -----------------------------
model_path = "/root/autodl-tmp/Qwen3-8B"

# -----------------------------
# 3. BitsAndBytes 4-bit 配置
# -----------------------------
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True,
)

# -----------------------------
# 4. 加载 tokenizer + 模型
# -----------------------------
print(">>> Loading tokenizer and model...")
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    model_path,
    device_map={"": 0},
    quantization_config=bnb_config,
    trust_remote_code=True
)

# -----------------------------
# 5. 批量翻译函数
# -----------------------------
def translate_batch(sentences, batch_size=2, max_new_tokens=128):
    translations = []
    for i in tqdm(range(0, len(sentences), batch_size)):
        batch = sentences[i:i+batch_size]
        for sentence in batch:
            # Prompt 强制要求 literal translation（非比喻）
            prompt = (
                f"Translate the following English sentence into Japanese literally, "
                f"do not use any metaphors or figurative expressions.\n"
                f"English: {sentence}\nJapanese (literal):"
            )
            messages = [{"role": "user", "content": prompt}]
            text = tokenizer.apply_chat_template(
                messages,
                tokenize=False,
                add_generation_prompt=True,
                enable_thinking=False
            )
            model_inputs = tokenizer([text], return_tensors="pt").to(model.device)
            generated_ids = model.generate(
                **model_inputs,
                max_new_tokens=max_new_tokens,
                do_sample=False
            )
            output_text = tokenizer.decode(generated_ids[0], skip_special_tokens=True)
            # 取最后一行作为翻译结果
            translation = output_text.strip().split("\n")[-1]
            translations.append(translation)
    return translations

# -----------------------------
# 6. 读取 CSV
# -----------------------------
df = pd.read_csv("datasets/moh_trofi_literal.csv")
sentences = df["context"].astype(str).tolist()

# -----------------------------
# 7. 批量翻译
# -----------------------------
translations = translate_batch(sentences, batch_size=8, max_new_tokens=128)

# -----------------------------
# 8. 保存结果
# -----------------------------
df["ja_literal"] = translations
df.to_csv("datasets/moh_trofi_literal_translated.csv", index=False, encoding="utf-8-sig")
print("✅ 翻译完成，结果保存到 datasets/moh_trofi_literal_translated.csv")


In [1]:
import os
import pandas as pd
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, pipeline
import torch
from tqdm import tqdm

# -----------------------------
# 1. 设置 Hugging Face 缓存到数据盘
# -----------------------------
os.environ["TRANSFORMERS_CACHE"] = "/root/autodl-tmp/hf_cache"
os.environ["HF_HOME"] = "/root/autodl-tmp/hf_home"
os.environ["HF_DATASETS_CACHE"] = "/root/autodl-tmp/hf_datasets"
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True,max_split_size_mb:128"

# -----------------------------
# 2. 模型路径
# -----------------------------
model_path = "/root/autodl-tmp/Qwen3-8B"

# -----------------------------
# 3. BitsAndBytes 4-bit 配置
# -----------------------------
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True,
)

# -----------------------------
# 4. 加载 tokenizer + 模型
# -----------------------------
print(">>> Loading tokenizer and model...")
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    model_path,
    device_map={"": 0},
    quantization_config=bnb_config,
    trust_remote_code=True
)


def detect_metaphor_batch(sentences, batch_size=2, max_new_tokens=256):
    results = []
    for i in tqdm(range(0, len(sentences), batch_size)):
        batch = sentences[i:i+batch_size]
        for sentence in batch:
            prompt = f"""你是一个隐喻识别与解释模型。请对下面的句子进行分析：
1. 判断其中是否包含隐喻性表达。
2. 如果有，指出隐喻词语。
3. 给出隐喻的释义（即对应的字面意思）。
4. 如果可能，请推测该隐喻的源域（Source domain）和目标域（Target domain）。

请严格按照 JSON 输出。
句子: "{sentence}"
"""
            messages = [{"role": "user", "content": prompt}]
            text = tokenizer.apply_chat_template(
                messages,
                tokenize=False,
                add_generation_prompt=True,
                enable_thinking=False
            )
            model_inputs = tokenizer([text], return_tensors="pt").to(model.device)
            generated_ids = model.generate(
                **model_inputs,
                max_new_tokens=max_new_tokens,
                do_sample=False
            )
            output_text = tokenizer.decode(generated_ids[0], skip_special_tokens=True)
            # 取最后一个 JSON 段作为输出
            result = output_text.strip().split("\n")[-1]
            results.append(result)
    return results

# -----------------------------
# 3. 测试单句
# -----------------------------
test_sentences = [
    "経済のエンジンを再び回す必要がある。",
    "私は昨日、友達と映画を見た。"
]

outputs = detect_metaphor_batch(test_sentences, batch_size=2, max_new_tokens=256)

for s, o in zip(test_sentences, outputs):
    print("原句:", s)
    print("模型输出:", o)
    print("-" * 50)

  from .autonotebook import tqdm as notebook_tqdm


>>> Loading tokenizer and model...


Loading checkpoint shards: 100%|██████████| 5/5 [00:14<00:00,  2.93s/it]
  0%|          | 0/1 [00:00<?, ?it/s]The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
100%|██████████| 1/1 [00:06<00:00,  6.52s/it]

原句: 経済のエンジンを再び回す必要がある。
模型输出: }
--------------------------------------------------
原句: 私は昨日、友達と映画を見た。
模型输出: ```
--------------------------------------------------





In [4]:
# -----------------------------
# 2. 隐喻识别函数 (one-shot)
# -----------------------------
def detect_metaphor_batch(sentences, batch_size=2, max_new_tokens=128):
    results = []
    for i in tqdm(range(0, len(sentences), batch_size)):
        batch = sentences[i:i+batch_size]
        for sentence in batch:
            # one-shot 示例
            prompt = f"""下面是隐喻识别的例子：
句子: "経済のエンジンを回す"
输出: {{"metaphor": true, "word": "エンジン", "source": "機械", "target": "経済"}}
句子: 私は昨日、友達と映画を見た。
输出: {{"metaphor": false}}
请按照相同的格式识别下面的句子：
句子: "{sentence}"
输出:"""

            messages = [{"role": "user", "content": prompt}]
            text = tokenizer.apply_chat_template(
                messages,
                tokenize=False,
                add_generation_prompt=True,
                enable_thinking=False
            )
            model_inputs = tokenizer([text], return_tensors="pt").to(model.device)
            generated_ids = model.generate(
                **model_inputs,
                max_new_tokens=max_new_tokens,
                do_sample=False
            )
            output_text = tokenizer.decode(generated_ids[0], skip_special_tokens=True)
            result = output_text.strip().split("\n")[-1]
            results.append(result)
    return results

# -----------------------------
# 3. 测试单句
# -----------------------------
test_sentences = [
    "国民の心に火を灯す必要がある。",
    "私は昨日、友達と映画を見た。"
]

outputs = detect_metaphor_batch(test_sentences, batch_size=2)

for s, o in zip(test_sentences, outputs):
    print("原句:", s)
    print("模型输出:", o)
    print("-" * 50)

100%|██████████| 1/1 [00:03<00:00,  3.56s/it]

原句: 国民の心に火を灯す必要がある。
模型输出: 输出: {"metaphor": true, "word": "火", "source": "自然現象", "target": "国民の心"}
--------------------------------------------------
原句: 私は昨日、友達と映画を見た。
模型输出: 输出: {"metaphor": false}
--------------------------------------------------





In [None]:
def detect_metaphor_batch(sentences, batch_size=8, max_new_tokens=128):
    results = []
    for i in tqdm(range(0, len(sentences), batch_size), desc="Processing"):
        batch = sentences[i:i+batch_size]

        prompts = []
        for sentence in batch:
            prompt = f"""下面是隐喻识别的例子：
句子: "経済のエンジンを回す"
输出: {{"metaphor": true, "word": "エンジン", "source": "機械", "target": "経済"}}

句子: "私は昨日、友達と映画を見た。"
输出: {{"metaphor": false}}

请按照相同的格式识别下面的句子：
句子: "{sentence}"
输出:"""
            messages = [{"role": "user", "content": prompt}]
            text = tokenizer.apply_chat_template(
                messages,
                tokenize=False,
                add_generation_prompt=True,
                enable_thinking=False
            )
            prompts.append(text)

        # ⚡ 一次性处理整个 batch
        model_inputs = tokenizer(prompts, return_tensors="pt", padding=True, truncation=True).to(model.device)

        generated_ids = model.generate(
            **model_inputs,
            max_new_tokens=max_new_tokens,
            do_sample=False
        )

        # ⚡ 解码整个 batch
        batch_outputs = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
        for output_text in batch_outputs:
            result = output_text.strip().split("\n")[-1]
            results.append(result)

    return results
# -----------------------------
# 3. 读取 TXT 文件
# -----------------------------
with open("abe_speech_5000.txt", "r", encoding="utf-8") as f:
    sentences = [line.strip() for line in f if line.strip()]

print(f"✅ 读取完成，共 {len(sentences)} 句")

# -----------------------------
# 4. 批量推理
# -----------------------------
outputs = detect_metaphor_batch(sentences, batch_size=8, max_new_tokens=128)

# -----------------------------
# 5. 保存为 CSV
# -----------------------------
df = pd.DataFrame({"sentence": sentences, "qwen_output": outputs})
df.to_csv("abe_metaphor_results.csv", index=False, encoding="utf-8-sig")
print("✅ 处理完成，结果已保存到 abe_metaphor_results.csv")