## 1. 读取语料 

In [None]:
import argparse
import json
import csv
import time
from selenium import webdriver
from selenium.webdriver.common.by import By

class TranslateData:
    """Translate selected CSV columns through Baidu Translate and save them as JSONL.

    Each data row of the input CSV contributes an id (column 0), a title
    (column 4) and three text fields (columns 6, 7 and 8).  Every non-id field
    is split into sentences, each sentence is submitted to
    https://fanyi.baidu.com through a headless Chrome session, and the
    concatenated results are appended to the output file, one JSON object per
    line.
    """

    # Driver location used when the caller does not supply one.
    DEFAULT_DRIVER_PATH = r'F:\\chormedriver\\chromedriver_win32\\chromedriver.exe'
    # Text stored in place of a sentence whose translation raised an error.
    FAILED_MARKER = "该句翻译不成功"

    # Column positions read from each CSV row.
    ID_COLUMN = 0
    TITLE_COLUMN = 4
    INTRODUCTION_COLUMN = 6
    STRATEGY_COLUMN = 7
    POTENTIAL_COLUMN = 8

    def __init__(self, input_file, output_file, driver_path=None):
        """Store the file paths and start a headless Chrome session.

        Args:
            input_file: Path of the CSV file to read.
            output_file: Path of the JSONL file that results are appended to.
            driver_path: Optional chromedriver executable path; falls back to
                ``DEFAULT_DRIVER_PATH`` when omitted.
        """
        self.input_file = input_file
        self.output_file = output_file
        option = webdriver.ChromeOptions()
        # Hide the "Chrome is being controlled by automated test software" banner.
        option.add_experimental_option('excludeSwitches', ['enable-automation'])
        option.add_argument('headless')
        # NOTE(review): passing the driver path positionally is the older
        # Selenium calling convention -- confirm it matches the installed version.
        self.browser = webdriver.Chrome(driver_path or self.DEFAULT_DRIVER_PATH, options=option)

    def close(self):
        """Quit the browser session created in ``__init__``."""
        self.browser.quit()

    def load_data(self):
        """Read the input CSV, translate every kept row and append it to the output file."""
        # Finish reading first so the input file is not held open while translating.
        for document in self._read_documents():
            self.write_json(self._translate_document(document))

    def _read_documents(self):
        """Return one dict per CSV row that has at least one non-empty text field."""
        documents = []
        # newline='' lets the csv module handle line endings inside quoted fields.
        with open(self.input_file, 'r', encoding='utf-8', newline='') as csvfile:
            reader = csv.reader(csvfile, delimiter=',', quotechar='"')
            # Skip the header row; the default keeps an empty file from raising.
            next(reader, None)
            for row in reader:
                if not row:
                    # csv.reader yields [] for blank lines; nothing to extract.
                    continue
                # Strip the quote characters wrapping each field, then flatten
                # the "', '" separators left between list items.
                introduction = self.remove_quotes(row[self.INTRODUCTION_COLUMN])
                strategy = self.remove_quotes(row[self.STRATEGY_COLUMN]).replace("', '", ", ")
                potential = self.remove_quotes(row[self.POTENTIAL_COLUMN]).replace("', '", ". ")
                if not (introduction or strategy or potential):
                    continue
                documents.append({
                    "id": '##id:' + row[self.ID_COLUMN],
                    "title": row[self.TITLE_COLUMN],
                    "introduction": introduction,
                    "strategy": strategy,
                    "potential": potential,
                })
        return documents

    def _translate_document(self, document):
        """Return a copy of *document* with every field except "id" translated."""
        translated = {}
        for key, value in document.items():
            if key == "id":
                translated[key] = value
                continue
            print("待翻译的段落为：", value)
            pieces = []
            for sentence in self._split_sentences(value):
                print("待翻译的句子为：", sentence)
                try:
                    pieces.append(self.translate(sentence))
                except Exception as exc:
                    # Best effort: keep going with a marker, but show the cause
                    # so systematic failures are not hidden.
                    print("翻译失败：", repr(exc))
                    pieces.append(self.FAILED_MARKER)
            translated[key] = ''.join(pieces)
        return translated

    @staticmethod
    def _split_sentences(text):
        """Split *text* on ". " and return the non-empty sentences, each ending in a space."""
        sentences = []
        for piece in text.split(". "):
            piece = piece.strip()
            if not piece:
                # Avoids sending a bare ". " to the translator for empty fields.
                continue
            if not piece.endswith(('.', '!', '?')):
                # split() consumed the period of every sentence but the last.
                piece += "."
            sentences.append(piece + " ")
        return sentences

    def translate(self, input_content):
        """Submit *input_content* to Baidu Translate and return the translated text.

        Raises:
            IndexError: If no result element is found on the page.
        """
        # Open the translation page.
        self.browser.get("https://fanyi.baidu.com")
        start_time = time.time()
        # Element lookups below may wait up to this many seconds.
        self.browser.implicitly_wait(5)
        # Type the text to translate into the source box.
        source_input = self.browser.find_element(By.ID, "baidu_translate_input")
        source_input.send_keys(input_content)
        # Allow more time for the result element to appear.
        self.browser.implicitly_wait(20)
        # Query once with the By-based locator API and reuse the result.
        output = self.browser.find_elements(By.XPATH, "//*[@class='ordinary-output target-output clearfix']")
        print(output)
        output_content = output[0].text
        print(output_content)
        end_time = time.time()
        print("程序执行了%f秒" % (end_time - start_time))
        return output_content

    def write_json(self, doc):
        """Append *doc* to the output file as one JSON line, keeping non-ASCII text readable."""
        with open(self.output_file, 'a', encoding='utf-8') as f:
            f.write(json.dumps(doc, ensure_ascii=False) + "\n")

    def remove_quotes(self, string):
        """Return *string* without its wrapping quotes.

        The quotes are stripped only when the string both starts and ends with
        the same quote character (single quotes are checked first); otherwise
        the string is returned unchanged.
        """
        for quote in ("'", '"'):
            if string.startswith(quote) and string.endswith(quote):
                return string.strip(quote)
        return string

def parse_args():
    """Parse the command line and return the resulting namespace.

    Two optional string flags are recognised, ``--input_file`` and
    ``--output_file``; each falls back to its default path when omitted.
    """
    cli = argparse.ArgumentParser()
    # (flag, default path, help text) for every supported option.
    option_table = (
        ("--input_file", "data\\original_data\\biological_strategy.csv", "输入路径"),
        ("--output_file", "data\\original_data\\biological_strategy_chinese.jsonl", "输出路径"),
    )
    for flag, default_path, help_text in option_table:
        cli.add_argument(flag, type=str, default=default_path, help=help_text)
    return cli.parse_args()

if __name__ == '__main__':
    # Parse the command line once and reuse the namespace for both paths.
    args = parse_args()
    prepro_data = TranslateData(args.input_file, args.output_file)
    try:
        prepro_data.load_data()
    finally:
        # Always shut the Chrome session down, even if translation fails,
        # so no browser/driver process is left running.
        prepro_data.browser.quit()