-
Notifications
You must be signed in to change notification settings - Fork 0
/
gpt4_data_translate.py
41 lines (32 loc) · 1.2 KB
/
gpt4_data_translate.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
import json
from translate import Translate
import time
def translate_json(
    src_path="./data/comparison_gpt4_data_en.json",
    dest_path="./data/comparison_gpt4_data_zh.json",
    translator=None,
):
    """Translate a JSON dataset of instruction/output records.

    Loads a JSON list of records from *src_path*, passes each record's
    ``instruction`` and ``output`` through the translator, copies ``input``
    through untranslated, and writes the resulting list to *dest_path* as
    UTF-8 JSON (non-ASCII characters kept as-is, 4-space indent).

    Args:
        src_path: JSON file to read. Defaults to the English comparison
            dataset path used by this script.
        dest_path: JSON file to write. Defaults to the Chinese comparison
            dataset path used by this script.
        translator: Object providing ``translate(value)`` and
            ``save_cache()``. When omitted, ``Translate(src="en",
            dest="zh-cn")`` is constructed.

    Returns:
        None. The result is written to *dest_path*.

    Note:
        Exceptions raised by the translator are not caught here, so a
        failing record aborts the run before anything is written. As a
        consequence the "failed" figure in the final summary is always 0.
    """
    if translator is None:
        translator = Translate(src="en", dest="zh-cn")

    # Read everything up front so the input file is not held open while
    # the (potentially slow) translation loop runs.
    with open(src_path, encoding="UTF-8") as f:
        content = json.load(f)
    print(f"Loaded {len(content)} gpt4")

    count = len(content)
    translate_count = 0
    content_zh = []
    for record in content:
        instruction = record.get("instruction", "")
        # "output" defaults to a one-element list; it is handed to
        # translate() as-is.
        # NOTE(review): presumably Translate.translate accepts a list as
        # well as a string -- confirm against the translate module.
        output = record.get("output", [""])
        content_zh.append(
            {
                "instruction": translator.translate(instruction),
                # Copied through untranslated. Use .get() like the sibling
                # fields so a record without "input" does not raise KeyError.
                "input": record.get("input", ""),
                "output": translator.translate(output),
            }
        )
        translate_count += 1
        if translate_count % 1000 == 0:
            print(f"Translated {translate_count} / {count}")
            # Periodic checkpoint.
            # NOTE(review): save_cache() is project-defined; presumably it
            # persists the translator's cache -- confirm.
            translator.save_cache()

    # Final checkpoint so records after the last multiple of 1000 are
    # covered as well.
    translator.save_cache()

    with open(dest_path, "w", encoding="UTF-8") as f:
        json.dump(content_zh, f, ensure_ascii=False, indent=4)
    print(f"\n\nTranslated {translate_count} , {count - translate_count} failed. (Total {count} )")
# Run the translation only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    translate_json()