In [2]:
import os
import sys
import json

from tqdm import tqdm
from utils import DATA_ROOT, dump_json, load_json, load_jsonl_iteratively

In [None]:
## Prepare the English-Japanese balanced bilingual corpus
#
# Reads the tab-separated source file, keeps the pairs whose score is at
# least MIN_SCORE, and writes them as one JSON object per line to data.jsonl.

ROOT = os.path.join(DATA_ROOT, "datasets", "balanced_bilingual", "en-ja")

# Pairs scoring below this threshold are dropped.
MIN_SCORE = 0.7

infn = os.path.join(ROOT, "en-ja", "en-ja.bicleaner05.txt")
outfn = os.path.join(ROOT, "data.jsonl")

# Both files are managed by `with` so the output is flushed and closed even if
# a malformed line raises part-way through (the handle used to be left open).
with open(infn, 'r', encoding="utf8") as fp, \
        open(outfn, 'w', encoding="utf8") as outfp:
    for line in tqdm(fp):
        # Each line must have exactly five tab-separated fields; the first two
        # are ignored. A different field count raises ValueError on purpose so
        # that format drift is noticed instead of silently skipped.
        _, _, score, en_text, ja_text = line.split("\t")
        score = float(score)
        if score < MIN_SCORE:
            continue

        item = {
            "en": en_text.strip(),
            "ja": ja_text.strip(),  # strip() also removes the trailing newline
            "score": score
        }
        # ensure_ascii=False keeps the Japanese text readable in the output.
        string = json.dumps(item, ensure_ascii=False)
        outfp.write(f"{string}\n")
    

3514127it [00:53, 29803.63it/s] 

In [None]:
## Prepare the Chinese-Japanese balanced bilingual corpus
#
# Merges two tab-separated source files into a single data.jsonl, keeping only
# the pairs whose score is at least MIN_SCORE.

ROOT = os.path.join(DATA_ROOT, "datasets", "balanced_bilingual", "zh-ja")

# Pairs scoring below this threshold are dropped.
MIN_SCORE = 0.7

outfn = os.path.join(ROOT, "data.jsonl")

# The output is opened ONCE, in write mode, around the whole loop. Previously it
# was re-opened in append mode for every input file and never closed, which
# leaked a handle per iteration and duplicated the corpus each time the cell
# was re-run.
with open(outfn, 'w', encoding="utf8") as outfp:
    for filename in ["zh-ja.bicleaner05.txt", "zh-ja.crowdsourcing_b05l07.txt"]:
        infn = os.path.join(ROOT, "zh-ja", filename)

        with open(infn, 'r', encoding="utf8") as fp:
            for line in tqdm(fp):
                # The two sources have different layouts: the bicleaner file
                # has four tab-separated fields, the crowdsourcing file five.
                # In both, the last three are score, zh text, ja text.
                if filename == "zh-ja.bicleaner05.txt":
                    _, score, zh_text, ja_text = line.split("\t")
                else:
                    _, _, score, zh_text, ja_text = line.split("\t")
                score = float(score)
                if score < MIN_SCORE:
                    continue

                item = {
                    "zh": zh_text.strip(),
                    "ja": ja_text.strip(),  # strip() also removes the newline
                    "score": score
                }
                # ensure_ascii=False keeps the CJK text readable in the output.
                string = json.dumps(item, ensure_ascii=False)
                outfp.write(f"{string}\n")
        

83892it [00:04, 17261.45it/s]
4602328it [00:46, 98284.57it/s] 


In [None]:
## Prepare the Chinese-English balanced bilingual corpus
#
# Walks every sub-folder of the UM-Corpus "Bilingual" directory (each must hold
# exactly one file), reads the file as alternating lines, and writes one JSON
# object per pair to data.jsonl.

ROOT = os.path.join(DATA_ROOT, "datasets", "balanced_bilingual", "en-zh")

indir = os.path.join(ROOT, "UM-Corpus/data/Bilingual")
outfn = os.path.join(ROOT, "data.jsonl")

# Write mode + `with`: re-running the cell regenerates the file instead of
# appending a second copy, and the handle is always flushed and closed.
with open(outfn, 'w', encoding="utf8") as outfp:
    # sorted() makes the output order reproducible; os.listdir() returns
    # entries in arbitrary order.
    for folder in sorted(os.listdir(indir)):
        fns = os.listdir(os.path.join(indir, folder))
        assert len(fns) == 1
        infn = os.path.join(indir, folder, fns[0])

        # Reset per file. Previously `items` was created once before the loop
        # while the write loop ran inside it, so the pairs of every earlier
        # folder were written again for each later folder.
        items = []
        with open(infn, 'r', encoding="utf8") as fp:
            for i, line in tqdm(enumerate(fp)):
                if i % 2 == 0:
                    # Even-indexed lines start a new pair and are stored as "en".
                    items.append({
                        "en": line.strip()})
                else:
                    # Odd-indexed lines complete the pair as "zh".
                    items[-1]['zh'] = line.strip()
        # NOTE(review): a file with an odd number of lines leaves a final item
        # without a "zh" key; it is still written as-is — confirm the inputs
        # always contain complete pairs.

        for item in items:
            string = json.dumps(item, ensure_ascii=False)
            outfp.write(f"{string}\n")
        

10000it [00:00, 634069.15it/s]
600000it [00:02, 290433.02it/s]
440000it [00:00, 754503.53it/s]
540000it [00:05, 96676.25it/s] 
900000it [00:01, 822212.10it/s]
600000it [00:00, 773891.33it/s]
440000it [00:01, 265025.25it/s]
900000it [00:00, 940286.12it/s] 


In [None]:
## Prepare the scientific Chinese-English bilingual corpus
#
# Walks every sub-folder of the "csl" directory (each must hold exactly one
# file), reads the file as alternating lines, and writes one JSON object per
# pair to data.jsonl.

ROOT = os.path.join(DATA_ROOT, "datasets", "scientific_bilingual", "en-zh")

indir = os.path.join(ROOT, "csl")
outfn = os.path.join(ROOT, "data.jsonl")

# Write mode + `with`: re-running the cell regenerates the file instead of
# appending a second copy, and the handle is always flushed and closed.
with open(outfn, 'w', encoding="utf8") as outfp:
    # sorted() makes the output order reproducible; os.listdir() returns
    # entries in arbitrary order.
    for folder in sorted(os.listdir(indir)):
        fns = os.listdir(os.path.join(indir, folder))
        assert len(fns) == 1
        infn = os.path.join(indir, folder, fns[0])

        # Reset per file. Previously `items` was created once before the loop
        # while the write loop ran inside it, so the pairs of every earlier
        # folder were written again for each later folder.
        items = []
        with open(infn, 'r', encoding="utf8") as fp:
            for i, line in tqdm(enumerate(fp)):
                if i % 2 == 0:
                    # Even-indexed lines start a new pair and are stored as "en".
                    items.append({
                        "en": line.strip()})
                else:
                    # Odd-indexed lines complete the pair as "zh".
                    items[-1]['zh'] = line.strip()
        # NOTE(review): a file with an odd number of lines leaves a final item
        # without a "zh" key; it is still written as-is — confirm the inputs
        # always contain complete pairs.

        for item in items:
            string = json.dumps(item, ensure_ascii=False)
            outfp.write(f"{string}\n")
        