In [1]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

model_name = "Qwen/Qwen2.5-7B-Instruct"

tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    dtype=torch.float16,
    device_map="auto",
    trust_remote_code=True
)


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Some parameters are on the meta device because they were offloaded to the cpu.


In [2]:
# from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
# import torch

# # model_name = "facebook/nllb-200-3.3B"
# # model_name = 'Helsinki-NLP/opus-mt-zh-en'
# model_name = "facebook/m2m100_1.2B"
# tokenizer = AutoTokenizer.from_pretrained(model_name)
# model = AutoModelForSeq2SeqLM.from_pretrained(
#     model_name,
#     torch_dtype=torch.float16,
#     device_map="auto"
# )


In [3]:
import re

def split_sentences(text, max_chars=300):
    sentences = re.split(r"(?<=[。！？])", text)
    chunks, current = [], ""

    for s in sentences:
        if len(current) + len(s) < max_chars:
            current += s
        else:
            chunks.append(current)
            current = s
    if current:
        chunks.append(current)
    return chunks


In [4]:
def translate_chunk(chunk):
    prompt = f"""
你是一个专业的金融研报翻译助手。请将下面的中文内容翻译成专业英文，要求：
- 不要遗漏任何信息
- 不要添加解释
- 保持金融术语准确
- 保持句子结构清晰

中文内容：
{chunk}

英文翻译：
"""

    inputs = tokenizer(prompt, return_tensors="pt").to("cuda")

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=512,
            temperature=0.2,
            top_p=0.9,
            do_sample=False,
            eos_token_id=tokenizer.eos_token_id

        )

    text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    # 截取“英文翻译：”之后的部分
    return text.split("英文翻译：")[-1].strip()


In [5]:
# def translate_chunks(chunks, tokenizer, model, batch_size=8):
#     results = []
#     for i in range(0, len(chunks), batch_size):
#         batch = chunks[i:i+batch_size]
#         # inputs = tokenizer(batch, return_tensors="pt", padding=True, truncation=False).to("cuda")
#         inputs = tokenizer(batch, return_tensors="pt", padding=True, truncation=False).to("cuda")
#         with torch.no_grad():
#             outputs = model.generate(
#                 **inputs,
#                 forced_bos_token_id=tokenizer.get_lang_id("en"),
#                 max_new_tokens=256,
#                 num_beams=4
#             )
        


#         results.extend([tokenizer.decode(o, skip_special_tokens=True) for o in outputs])
#     return results


In [6]:
def translate_chunks(chunks):
    results = []
    for c in chunks:
        results.append(translate_chunk(c))
        print(len(c))

    return results


In [7]:
def translate_file(in_path, out_path):
    with open(in_path, "r", encoding="utf-8") as f:
        text = f.read()

    chunks = split_sentences(text)
    translated = translate_chunks(chunks)

    with open(out_path, "w", encoding="utf-8") as f:
        f.write("\n".join(translated))

    print(f"Translated: {in_path} → {out_path}")


In [8]:
# def translate_file(in_path, out_path, tokenizer, model):
#     with open(in_path, "r", encoding="utf-8") as f:
#         text = f.read()

#     chunks = split_sentences(text)
#     translated_chunks = translate_chunks(chunks, tokenizer, model)

#     with open(out_path, "w", encoding="utf-8") as f:
#         f.write("\n".join(translated_chunks))

#     print(f"Translated: {in_path} → {out_path}")


In [9]:
# import os

# def batch_translate(input_dir, output_dir, tokenizer, model):
#     for root, _, files in os.walk(input_dir):
#         for file in files:
#             if not file.endswith(".txt"):
#                 continue

#             in_path = os.path.join(root, file)
#             rel_path = os.path.relpath(root, input_dir)
#             out_dir = os.path.join(output_dir, rel_path)
#             os.makedirs(out_dir, exist_ok=True)

#             out_path = os.path.join(out_dir, file)
#             translate_file(in_path, out_path, tokenizer, model)


In [10]:
import os

def batch_translate(input_dir, output_dir):
    print(model.device)

    for root, _, files in os.walk(input_dir):
        for file in files:
            if not file.endswith(".txt"):
                continue

            in_path = os.path.join(root, file)
            rel_path = os.path.relpath(root, input_dir)
            out_dir = os.path.join(output_dir, rel_path)
            os.makedirs(out_dir, exist_ok=True)

            out_path = os.path.join(out_dir, file)
            translate_file(in_path, out_path)


In [11]:
input_dir = "reports_txt_by_quarter_cleaned"        # 中文研报目录
output_dir = "reports_txt_by_quarter_cleaned_en"    # 英文研报输出目录

batch_translate(input_dir, output_dir)


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


cuda:0
258
278
267
290
32
356
287
276
292
285
232
276
247
167
262
249
289
266
299
212
249
236
183
274
226
296
201
282
289
32
Translated: reports_txt_by_quarter_cleaned\2017_Q1\一汽解放\深度报告\20170209_西南证券_一汽解放_业绩反转确定，2017轻装上阵，扬帆起航.txt → reports_txt_by_quarter_cleaned_en\2017_Q1\一汽解放\深度报告\20170209_西南证券_一汽解放_业绩反转确定，2017轻装上阵，扬帆起航.txt
285
282
265
220
175
253
308
281
202
283
Translated: reports_txt_by_quarter_cleaned\2017_Q1\万华化学\20170316_群益证券_万华化学_公司产品产销两旺，17业绩同比大增203%.txt → reports_txt_by_quarter_cleaned_en\2017_Q1\万华化学\20170316_群益证券_万华化学_公司产品产销两旺，17业绩同比大增203%.txt
261
292
264
112
381
219
286
Translated: reports_txt_by_quarter_cleaned\2017_Q1\万科A\20170113_东莞证券_万科A_事件点评：地铁集团入驻 股权纷争缓解 利于重新着力经营.txt → reports_txt_by_quarter_cleaned_en\2017_Q1\万科A\20170113_东莞证券_万科A_事件点评：地铁集团入驻 股权纷争缓解 利于重新着力经营.txt
0
407
263
239
209
317
256
118
Translated: reports_txt_by_quarter_cleaned\2017_Q1\万科A\20170113_国金证券_万科A_深圳地铁强势入主，股权之争或近尾声.txt → reports_txt_by_quarter_cleaned_en\2017_Q1\万科A\20170113_国金证券_万科A_深圳地铁强势入主，股权之争或近

KeyboardInterrupt: 