In [10]:
# Base name shared by the input workbook and every derived output file.
BASE_NAME = 'Lorena Scaglia Dataset'
# Source Excel workbook that the notebook reads with pandas.
INPUT_FILE_NAME = f'processed_data/{BASE_NAME}.xlsx'
# Stage 1 output: CSV of origin / translation / polish / analysis records.
SHEET_FILE_NAME_STAGE_1 = f'train_dataset/{BASE_NAME} Stage1.csv'
# Stage 2 output: JSONL using a `contents` / `role` / `parts` chat layout.
SHEET_FILE_NAME_STAGE_2 = f'train_dataset/{BASE_NAME} Stage2.jsonl'
# Stage 3 output: JSONL of flat `prompt` / `completion` pairs.
SHEET_FILE_NAME_STAGE_3 = f'train_dataset/{BASE_NAME} Stage3.jsonl'

In [11]:
import pandas as pd

In [12]:
# Load the source workbook into a DataFrame; the bare name on the last line
# makes the notebook render it as the cell output.
data = pd.read_excel(INPUT_FILE_NAME)
data

Unnamed: 0.1,Unnamed: 0,章节Id,章节名,章节序号,章节内容,机翻内容,翻译精修
0,0,436551,Chapter 928 Getting Married Too,928,Chapter 928 Getting Married Too\nScarlet and Y...,Capítulo 928 Casando também\nScarlet e Yasmin ...,Capítulo 928 Me casar\nAssim que Scarlet e Yas...
1,1,436599,Chapter 930 Care,930,"Chapter 930 Care\n""Thank you,"" Yasmin said wit...","Capítulo 930 Cuidados\n“Obrigada”, disse Yasmi...","Capítulo 930 É claro que me importo\n""Obrigada..."
2,2,437022,Chapter 934 Sarcasm,934,Chapter 934 Sarcasm\nWhen they stepped out of ...,Capítulo 934 Sarcasmo\nQuando saíram da loja d...,Capítulo 934 Sarcasmo\nQuando saíram da loja d...
3,3,439936,Chapter 953 Misunderstanding,953,Chapter 953 Misunderstanding\nYasmin apologize...,Capítulo 953 Mal-entendido\nYasmin se desculpo...,Capítulo 953 No que você acreditou?\nYasmin se...
4,4,439976,Chapter 954 I Don't Like You,954,Chapter 954 I Don't Like You\nYasmin could fee...,Capítulo 954 Eu não gosto de você\nYasmin sent...,Capítulo 954 Eu não gosto de quem você se torn...
...,...,...,...,...,...,...,...
1174,1174,859654,Chapter 166 Thea Is Griffin,166,Chapter 166 Thea Is Griffin\nAfter leaving the...,Capítulo 166 Thea é Griffin\nApós deixar o Hos...,Capítulo 166 Thea é Griffin\nAinda perto do Ho...
1175,1175,859674,Chapter 171 You Really Don't Need To Do That F...,171,Chapter 171 You Really Don't Need To Do That F...,Capítulo 171 Você Realmente Não Precisa Fazer ...,"Capítulo 171 Não há necessidade disso\n""Que pe..."
1176,1176,859677,Chapter 172 I Won't Divorce You,172,Chapter 172 I Won't Divorce You\nIt had been j...,Capítulo 172 Não vou me divorciar\nApenas dois...,Capítulo 172 Eu me recuso a te dar o divórcio\...
1177,1177,859690,Chapter 174 He Knows That She Doesn't Trust Hi...,174,Chapter 174 He Knows That She Doesn't Trust Hi...,Capítulo 174 Ele Sabe Que Ela Não Confia Nele\...,Capítulo 174 Ela não acreditava em uma só pala...


In [13]:
# Convert the frame into a list of per-row dicts keyed by column name.
# The isinstance guard keeps this cell re-runnable: `data` is rebound in place,
# so on a second execution it is already a list and has no `.to_dict`.
if isinstance(data, pd.DataFrame):
    data = data.to_dict(orient='records')
data[0].keys()

dict_keys(['Unnamed: 0', '章节Id', '章节名', '章节序号', '章节内容', '机翻内容', '翻译精修'])

In [14]:
import json

def save_jsonl(data_list: list, file_path: str):
    """Write each item of ``data_list`` as one JSON document per line.

    The target file is opened in write mode (truncating any existing content)
    with UTF-8 encoding, and non-ASCII characters are written as-is rather
    than escaped.

    Args:
        data_list: JSON-serialisable items, written in order.
        file_path: Destination path of the JSONL file.
    """
    with open(file_path, 'w', encoding='utf-8') as sink:
        sink.writelines(
            json.dumps(record, ensure_ascii=False) + '\n'
            for record in data_list
        )

# 构建 SFT 数据

In [15]:
import time
from dotenv import load_dotenv
from any_llm import LLM
from prompt4py import GeneralTemplate
from vortezwohl.iter import sliding_window

# Load settings from a .env file before the client is built.
# NOTE(review): presumably this is where the provider credentials come from — confirm.
load_dotenv()
# Shared LLM client; chunk_analyse calls it to generate the analysis text.
claude = LLM(provider='new_api', model_name='claude-sonnet-4-5-20250929-thinking')


def chapter_chunking(text: str) -> list[str]:
    """Split chapter text into overlapping chunks of five consecutive lines.

    Leading whitespace is stripped from ``text`` before it is split into
    lines. Each window yielded by ``sliding_window`` (size 5, stride 1,
    ``None`` as fill value) that contains no ``None`` entry is joined with
    newlines into one chunk; windows containing ``None`` are dropped.

    Args:
        text: Raw chapter text.

    Returns:
        The list of newline-joined five-line chunks, in window order.
    """
    text_lines = text.lstrip().splitlines(keepends=False)
    windows = sliding_window(text_lines, window_size=5, stride=1, fill_value=None)
    # Presumably a None entry means the window was padded with the fill value.
    return ['\n'.join(window) for window in windows if None not in window]


def chunk_analyse(origin: str, translation: str, polish: str):
    """Ask the LLM to explain how a translation draft became its polished version.

    Builds a ``GeneralTemplate`` prompt that casts the model as a native
    Brazilian Portuguese fiction editor, supplies the source text, the draft
    translation and the polished text, and asks for the step-by-step polishing
    rationale (covering both changed and unchanged passages) without
    reproducing the polished text itself. The rendered prompt is sent to the
    module-level ``claude`` client.

    Args:
        origin: Source text, inserted into the prompt's "原文稿" input slot.
        translation: Draft translation, inserted into the "翻译稿" slot.
        polish: Polished translation, inserted into the "精修稿" slot.

    Returns:
        Whatever the ``claude`` call returns; the caller in this notebook
        treats it as a string.
    """
    prompt_template = GeneralTemplate()
    prompt_template.role = '你是一个以**巴西**葡萄牙语为母语的小说编辑, 你擅长对葡萄牙语小说进行精修.'
    # Background given to the model: the overall task, what polishing aims for,
    # and a reminder that each sentence depends on its surrounding context.
    prompt_template.context = {
        '任务背景-大背景': '我们的翻译同事收到了一份原文稿件([原文稿]), 并对其进行了初步的葡萄牙语翻译([翻译稿]), 但从英文翻译到葡萄牙语的[翻译稿]还是有很多细节对葡萄牙语母语者而言并不流畅地道, 所以现在请你对[翻译稿]进行精修([精修稿]).',
        '精修过程': '精修的目的通常是让文章具有更好流畅度、自然感与丰富的文体风格. 精修时主要对[翻译稿]进行精修, 有时也会直接参考[原文稿]进行"更好的翻译".',
        '上下文的重要性': '通常情况下, 一句话的精修是与其上下文(特别是上文)高度相关的, 你需要密切关注这种联系.'
    }
    # The {{...}} placeholders are presumably substituted by render() below,
    # which receives origin/translation/polish as keyword arguments.
    prompt_template.input = {
        '原文稿': '{{origin}}',
        '翻译稿': '{{translation}}',
        '精修稿': '{{polish}}'
    }
    prompt_template.instruction = {
        '具体任务': '摆在你眼前的有[原文稿][翻译稿]和[精修稿], 其中[精修稿]出自你手, 但你当初在精修的时候并没有充分解释你的精修过程, 现在我需要你对着[精修稿]重现你的精修思路, 也就是说, 我需要你通过讲解具体思路的方式, 讲解你是如何从[翻译稿]得到[精修稿]的(对于被修改的词汇/句式/语段, 你需要给出具体的修改理由; 对于没被修改的词汇/句式/语段, 你也需要说明为什么不需要修改; 你必须覆盖所有被修改的词汇/句式/语段, 同时尽可能覆盖更多的没被修改的词汇/句式/语段)',
    }
    # The reply is requested as plain text in Simplified Chinese, opening with a "[COT]" marker.
    prompt_template.output_format = '[COT] ...'
    prompt_template.output_dtype = 'plaintext'
    prompt_template.output_language = 'zh-cn'
    # Constraints: fixed opening sentence, no mention of retrospection, no
    # polished text in the output, and answer in Chinese (stated twice).
    prompt_template.constraint = [
        '输出以 "[COT]现在我将以一个小说编辑的身份, 详细讲解我是如何将这份翻译初稿精修成最终稿的. 我的目标是让文本不仅仅是“正确”的, 更是要让它在节奏、语感和文学性上都达到母语者的阅读标准. 对于语句 ..." 开始',
        '假设你正在进行精修的过程, 不要提到复盘等, 直接说明你的详细精修思路',
        '不要输出 "精修稿", 你要模拟真正的精修过程, 所以 "精修稿" 是在精修思路后才形成的, 而不是思路完整前',
        '请只输出精修的思路和过程, 不要输出任何精修后的完整文段', '请使用中文向我汇报', '请始终使用中文向我汇报']
    # temperature, top_p and seed are all pinned to 0; the trace label embeds the current time.
    return claude(prompt_template.render(origin=origin, translation=translation, polish=polish, markdown=True),
                  trace=f'精修思维链数据生成-[{time.time()}]', temperature=.0, top_p=.0, seed=0)

In [None]:
from threading import RLock
from vortezwohl.concurrent import ThreadPool

# Pool used to run the per-chapter analysis calls concurrently.
# NOTE(review): 32 is presumably the worker count — confirm against ThreadPool's signature.
threads = ThreadPool(32)
# Records collected by the worker function, one dict per analysed chapter.
train_dataset = []
# Guards appends to train_dataset and the periodic CSV checkpoint.
train_dataset_lock = RLock()


def context_chunk_analyse(origin: str, translation: str, polish: str):
    """Analyse one chapter and append the result to the shared dataset.

    Calls ``chunk_analyse`` for the three texts, removes every newline from
    the reply, and stores the record in ``train_dataset`` while holding
    ``train_dataset_lock``. Each time the dataset size reaches a multiple of
    25, the whole dataset is written to the stage-1 CSV as a checkpoint.

    Args:
        origin: Source chapter text.
        translation: Draft translation of the chapter.
        polish: Polished translation of the chapter.
    """
    # The LLM call happens outside the lock so workers are not serialised on it.
    raw_reply = chunk_analyse(origin=origin, translation=translation, polish=polish)
    record = {
        'origin': origin,
        'translation': translation,
        'polish': polish,
        'analysis': raw_reply.replace('\n', ''),
    }
    with train_dataset_lock:
        train_dataset.append(record)
        checkpoint_due = len(train_dataset) % 25 == 0
        if checkpoint_due:
            # Written under the lock so two checkpoints never hit the file at once.
            pd.DataFrame(train_dataset).to_csv(SHEET_FILE_NAME_STAGE_1, index=False)


# Queue one analysis task per chapter row: source text, machine translation
# and polished translation are read from the workbook's columns.
for d_item in data:
    threads.submit(context_chunk_analyse, origin=d_item['章节内容'], translation=d_item['机翻内容'],
                   polish=d_item['翻译精修'])


# Presumably blocks until every submitted task has finished — confirm against
# ThreadPool.wait_all. The last line displays the first task's traceback and return value.
# NOTE(review): only res[0] is inspected, so a failure in any other task is not
# surfaced here — consider checking every entry's traceback.
res = threads.wait_all()
res[0].traceback, res[0].returns

[DEBUG] 2025-10-29 10:24:34,241 any-llm : \
ENDPOINT: new_api/claude-sonnet-4-5-20250929-thinking
PROMPT: [{'role': 'user', 'content': '## _TIMESTAMP\n[434850.6877427]\n## ROLE\n你是一个以**巴西**葡萄牙语为母语的小说编辑, 你擅长对葡萄牙语小说进行精修.\n## INSTRUCTION\n- **具体任务**: 摆在你眼前的有[原文稿][翻译稿]和[精修稿], 其中[精修稿]出自你手, 但你当初在精修的时候并没有充分解释你的精修过程, 现在我需要你对着[精修稿]重现你的精修思路, 也就是说, 我需要你通过讲解具体思路的方式, 讲解你是如何从[翻译稿]得到[精修稿]的(对于被修改的词汇/句式/语段, 你需要给出具体的修改理由; 对于没被修改的词汇/句式/语段, 你也需要说明为什么不需要修改; 你必须覆盖所有被修改的词汇/句式/语段, 同时尽可能覆盖更多的没被修改的词汇/句式/语段) \n## CONTEXT\n- **任务背景-大背景**: 我们的翻译同事收到了一份原文稿件([原文稿]), 并对其进行了初步的葡萄牙语翻译([翻译稿]), 但从英文翻译到葡萄牙语的[翻译稿]还是有很多细节对葡萄牙语母语者而言并不流畅地道, 所以现在请你对[翻译稿]进行精修([精修稿]). \n- **精修过程**: 精修的目的通常是让文章具有更好流畅度、自然感与丰富的文体风格. 精修时主要对[翻译稿]进行精修, 有时也会直接参考[原文稿]进行"更好的翻译". \n- **上下文的重要性**: 通常情况下, 一句话的精修是与其上下文(特别是上文)高度相关的, 你需要密切关注这种联系. \n## INPUT\n- **原文稿**: Chapter 934 Sarcasm\nWhen they stepped out of the shop for baby products, they saw that the door of the Cartier shop was closed.\nIt seemed that the shop was attending to very important guests.\

In [None]:
# Final write of all collected records, so rows appended after the last
# 25-record checkpoint are also persisted.
pd.DataFrame(train_dataset).to_csv(SHEET_FILE_NAME_STAGE_1, index=False)

In [None]:
import pandas as pd

# Reload the stage-1 CSV as a list of row dicts; this rebinds `data`, which
# earlier held the rows of the source workbook.
# NOTE(review): read_csv parses empty fields as NaN by default, so a blank
# `analysis` cell comes back as a float rather than '' — confirm rows are non-empty.
data = pd.read_csv(SHEET_FILE_NAME_STAGE_1).to_dict(orient='records')
data[0].keys()

In [None]:
def format_prompt(origin: str, translation: str) -> str:
    """Render the training prompt that asks for the polishing thought process.

    Only the source text and the draft translation are supplied, with no
    polished text. The prompt asks for the reasoning alone, wrapped as
    ``<think> ... </think>`` and starting with ``<think>``.

    NOTE(review): the logged stage-1 prompt in this notebook shows a
    ``_TIMESTAMP`` section in the rendered text; if ``render`` adds one here
    as well, every training prompt embeds a run-specific value — confirm.

    Args:
        origin: Source text, inserted into the "原文" input slot.
        translation: Draft translation, inserted into the "译文" slot.

    Returns:
        The prompt rendered by ``GeneralTemplate.render`` with ``markdown=True``.
    """
    prompt_template = GeneralTemplate()
    prompt_template.role = '你是一个以**巴西**葡萄牙语为母语的小说编辑, 你擅长对葡萄牙语小说进行精修.'
    prompt_template.context = {
        '任务背景': '我们的翻译同事收到了一份原文稿件([原文]), 并对其进行了初步的葡萄牙语翻译([译文]), 但从英文翻译到葡萄牙语的[译文]还是有很多细节对葡萄牙语母语者而言并不流畅地道, 所以现在需要你对[译文]进行精修.'
    }
    # The {{...}} placeholders are presumably substituted by render() below.
    prompt_template.input = {
        '原文': '{{origin}}',
        '译文': '{{translation}}'
    }
    prompt_template.objective = '思考如何对[译文]进行精修, 仅输出思考决策过程细节.'
    prompt_template.instruction = '摆在你眼前的有[原文]和[译文], 我需要你深入思考如何对[译文]进行精修.'
    # Output contract: plain text wrapped in <think> tags, matching how the
    # stage-2/3 cells wrap the stored analysis.
    prompt_template.output_format = '<think> ... </think>'
    prompt_template.output_dtype = 'plaintext'
    prompt_template.constraint = '输出以 "<think>" 开始'
    return prompt_template.render(origin=origin, translation=translation, markdown=True)

In [None]:
# Build the stage-2 dataset: one two-turn conversation per stage-1 row, with
# the rendered prompt as the user turn and the <think>-wrapped analysis as the
# model turn.
dataset = []
skipped_rows = 0

for d_item in data:
    analysis = d_item['analysis']
    # pandas reads an empty CSV field back as NaN (a float), which has no
    # string methods; a blank analysis would also yield an empty <think> target.
    if not isinstance(analysis, str) or not analysis.strip():
        skipped_rows += 1
        continue
    # Drop the leading "[COT]" marker requested by the stage-1 prompt, if present.
    if analysis.lstrip().startswith('[COT]'):
        analysis = analysis.replace('[COT]', '', 1)
    analysis = f'<think>{analysis}</think>'
    dataset.append({
        'contents': [
            {
                'role': 'user',
                'parts': [{'text': format_prompt(origin=d_item['origin'], translation=d_item['translation'])}]
            },
            {
                'role': 'model',
                'parts': [{'text': analysis}]
            }
        ]
    })

# Make dropped rows visible instead of silently shrinking the dataset.
if skipped_rows:
    print(f'Skipped {skipped_rows} row(s) without a usable analysis')
save_jsonl(dataset, SHEET_FILE_NAME_STAGE_2)

In [None]:
# Build the stage-3 dataset: flat prompt/completion pairs, where the
# completion is the <think>-wrapped analysis from stage 1.
oss_dataset = []
skipped_rows = 0

for d_item in data:
    analysis = d_item['analysis']
    # pandas reads an empty CSV field back as NaN (a float), which has no
    # string methods; a blank analysis would also yield an empty <think> target.
    if not isinstance(analysis, str) or not analysis.strip():
        skipped_rows += 1
        continue
    # Drop the leading "[COT]" marker requested by the stage-1 prompt, if present.
    if analysis.lstrip().startswith('[COT]'):
        analysis = analysis.replace('[COT]', '', 1)
    analysis = f'<think>{analysis}</think>'
    oss_dataset.append({'prompt': format_prompt(origin=d_item['origin'], translation=d_item['translation']), 'completion': analysis})

# Make dropped rows visible instead of silently shrinking the dataset.
if skipped_rows:
    print(f'Skipped {skipped_rows} row(s) without a usable analysis')
save_jsonl(oss_dataset, SHEET_FILE_NAME_STAGE_3)