In [None]:
import os
import re
import json
from tqdm import tqdm

from utils import DATA_ROOT
from utils import load_jsonl_iteratively

response_fn = os.path.join(DATA_ROOT, "response", "zh", "science-trans.response.s0e400000.jsonl")
datasets_fn = os.path.join(DATA_ROOT, "datasets/scientific_bilingual", "en-zh", "zh-only.jsonl")

In [2]:
items = {}
for doc_item in tqdm(load_jsonl_iteratively(datasets_fn), desc=f"Reading documents from preprocessed data: {datasets_fn}"):
    items[str(doc_item['docid'])] = doc_item

Reading documents from preprocessed data: /data/xzhao/dataset/roman-pretrain/datasets/medical/en_pair/data.jsonl: 0it [00:00, ?it/s]

Reading documents from preprocessed data: /data/xzhao/dataset/roman-pretrain/datasets/medical/en_pair/data.jsonl: 614444it [00:07, 78925.57it/s] 


In [None]:
non_match_doc, less_5_qa = [], []
non_match_text = []
qaffix_affix = r"#+\s*(?:\d+\.\s*)?(?:[\u3040-\u30FF\u4E00-\u9FAF\uFF66-\uFF9F]{2,12})?(?:質問|問題|问题|质問|質问|质问|賨問|要問|膪問|贅問|背景的な質問|質急|质询|資問|落川|賶問)(?:\d+)?(?:\s*\d+)?(?::|：)"
qaffix_pattern = qaffix_affix + r"(?!\s*\.)"


def match_trans1(text):
    pattern = r'"title": "(.*?)",\s*"abstract": "(.*?)"'
    match = re.search(pattern, text, re.S)
    title, abstract = None, None
    if match:
        title = match.group(1).strip()
        abstract = match.group(2).strip()
    return title, abstract

def match_trans2(text):
    pattern = r'Title:(.*?)\s*Abstract:(.*?)'
    match = re.search(pattern, text, re.S)
    title, abstract = None, None
    if match:
        title = match.group(1).strip()
        abstract = match.group(2).strip()
    return title, abstract

chinese_char_pattern = r'[\u4e00-\u9fff]'
error_docs, error_ta = [], []
non_error_docs = []
for i, res_item in tqdm(enumerate(load_jsonl_iteratively(response_fn)), desc="Processing QA response"):
# for i, res_item in tqdm(enumerate(error_docs), desc="Processing QA response"):
    assert res_item['id'][9:] in items
    generation = res_item['choices'][0]['message']['content']
    if "</think>" in generation:
        qa_text = generation.split("</think>")[1].strip()
    elif "\n\n\n" in generation and len(generation.split("\n\n\n")) == 2:
        qa_text = generation.split("\n\n\n")[1].strip()
    
    
    try:
        title, abstract = match_trans1(qa_text)
        
        if title is None and abstract is None:
            title, abstract = match_trans2(qa_text)
        
        if title is None and abstract is None:
            error_docs.append(generation)
            print("1", generation)
        elif re.search(chinese_char_pattern, title) or re.search(chinese_char_pattern, abstract):
            error_ta.append((title, abstract))
        else:
            non_error_docs.append(res_item)
    except json.JSONDecodeError as e:
        print("2", generation)
        error_docs.append(generation)


In [25]:
print(len(non_error_docs))

359161


In [5]:
from utils import dump_jsonl


new_response_fn = os.path.join(DATA_ROOT, "response", "zh", "science-trans.response.s0e400000.jsonl")
dump_jsonl(non_error_docs, new_response_fn)

In [5]:
i = 1
print(error_ta[i][0])
print(error_ta[i][1])

Research Progress on Cytoplasmic Male Sterility in Rice
Discoveries in cytoplasmic male sterility have garnered extensive attention from plant breeders. The mechanisms underlying its development are closely linked to the theory of cytoplasmic inheritance. In hybrid rice production, sterile lines play a crucial role in determining the output and planting area of hybrid rice. In fertility restoration research, the primary focus has been on investigating the restoring capability of fertility restoration genes over sterility genes. Previous studies have predominantly focused on investigations involving a single restorer line corresponding to one sterile line. Additionally, some have attempted to use restored hybrid approaches to study the positioning of restoration genes. Different studies have yielded varying conclusions based on the types of interactions and approaches employed. This review provides an overview of the genetics and molecular tagging of CMS types, including the 野败型 (yěbàix

In [8]:
print(error_docs[20])

好，我现在需要将用户提供的中文论文标题和摘要翻译成英文。首先，我会仔细阅读标题和摘要，理解其主要内容和关键术语。

标题是“太湖底泥水华蓝藻复苏的模拟”，这主要涉及太湖、底泥、蓝藻复苏以及模拟研究。我需要确保这些术语在英文中准确无误。"太湖"直接翻译为“Lake Taihu”，“底泥”是“sediment”，“水华蓝藻”应译为“cyanobacteria blooms”，最后“复苏的模拟”可以翻译为“Simulation of Resurgence”。综合起来，标题可以译为“Simulation of Cyanobacteria Blooms from Lake Taihu Sediments”或者更简洁一点，“Simulation Study on the Resurgence of Cyanobacteria Blooms in Lake Taihu Sediments”。

接下来是摘要部分，内容比较详细，涉及实验方法、结果和结论。我需要确保专业术语的准确性，如“光照升温培养”译为“light and temperature incubation”，“色素含量”译为“pigment content”，“藻群体”译为“algal populations”。

在翻译过程中，要保持原意不变，同时确保语言流畅、专业。例如，“在室内模拟条件下”可以译为“under laboratory simulation conditions”，“与环境温度变化密切相关”则是“closely related to environmental temperature changes”。

此外，数字和单位要准确无误，比如“7.2-7.8 μm”保持不变，温度范围如“18-20℃”也要准确翻译。

最后，我会按照用户提供的结构化格式，确保标题和摘要分开，标点符号正确，内容完整。

总结一下，我会逐句翻译，确保每个术语和关键点都准确传达，并且整体读起来专业流畅。
</think>

## **Simulation Study on the Resurgence of Cyanobacteria Blooms in Lake Taihu Sediments**

### **Abstract**

In this experiment, sediment and overlying wat