In [1]:
import os
import sys
import json

from tqdm import tqdm
from tqdm import tqdm
from utils import DATA_ROOT, load_jsonl, dump_jsonl

In [None]:
## Prepare J-stage raw dataset for ja and en(-pair)
def create_jstage_dataset():
    """Build the raw Japanese and English(-pair) J-stage datasets.

    Loads the metadata records from ``src``, keeps every record whose
    ``journalInfo.subjects`` has at least one Japanese subject containing one
    of the allowed subject strings, and writes two JSONL files (``ja`` and
    ``en_pair``) holding the keys ``docid``, ``title``, ``abstract``,
    ``subjects`` and ``keywords``. Both files contain the same records in the
    same order.

    A field that a record does not provide in a given language is written as
    an empty string (title/abstract) or an empty list (keywords); it is never
    inherited from a previously processed record.

    Nothing is done when both output files already exist.
    """
    src = "/data/xzhao/dataset/meta_med/clean/metadata_med_ddp.jsonl"

    ja_tgt_dir = os.path.join(DATA_ROOT, "datasets", "medical", "ja")
    en_tgt_dir = os.path.join(DATA_ROOT, "datasets", "medical", "en_pair")

    os.makedirs(ja_tgt_dir, exist_ok=True)
    os.makedirs(en_tgt_dir, exist_ok=True)
    ja_tgt = os.path.join(ja_tgt_dir, 'data.jsonl')
    en_tgt = os.path.join(en_tgt_dir, 'data.jsonl')

    if os.path.exists(ja_tgt) and os.path.exists(en_tgt): 
        print(f"Skip: {ja_tgt} and {en_tgt} is created.")
        return

    allowed_subjects = ["医学", "薬学", "歯学"]
    items = load_jsonl(src)

    en_new_items, ja_new_items = [], []
    for item in tqdm(items):
        # Subject filter first: records outside the allowed subjects are
        # dropped before any other field is touched.
        journal_subjects = item['journalInfo'].get('subjects') or []
        en_subjects = [subject['content'] for subject in journal_subjects if subject['language'] == "en"]
        ja_subjects = [subject['content'] for subject in journal_subjects if subject['language'] == "ja"]
        if not any(asub in subject for subject in ja_subjects for asub in allowed_subjects):
            continue

        doc_id = item['articleId']['journalCode'] + "@@" + item['articleId']['articleCode']

        # Start from empty values for every record so that a missing language
        # version cannot leak in from the previous loop iteration.
        titles = {'en': "", 'ja': ""}
        abstracts = {'en': "", 'ja': ""}
        # Any entry that is not tagged "en" is stored on the Japanese side;
        # when several entries share a side, the last one wins.
        for data in item.get('title') or []:
            titles['en' if data['language'] == "en" else 'ja'] = data['content']
        for data in item.get('abstract') or []:
            abstracts['en' if data['language'] == "en" else 'ja'] = data['content']

        keywords = item.get('keywords') or []
        en_keywords = [keyword['content'] for keyword in keywords if keyword['language'] == "en"]
        ja_keywords = [keyword['content'] for keyword in keywords if keyword['language'] == "ja"]

        en_new_items.append({
            'docid': doc_id,
            'title': titles['en'],
            'abstract': abstracts['en'],
            'subjects': en_subjects,
            'keywords': en_keywords,
        })

        ja_new_items.append({
            'docid': doc_id,
            'title': titles['ja'],
            'abstract': abstracts['ja'],
            'subjects': ja_subjects,
            'keywords': ja_keywords,
        })

    dump_jsonl(ja_new_items, ja_tgt)
    print(f"Prepared the raw Japanese J-stage dataset with {len(ja_new_items)} to {ja_tgt}.")
    dump_jsonl(en_new_items, en_tgt)
    print(f"Prepared the raw English J-stage dataset with {len(en_new_items)} to {en_tgt}.")

# Build the ja / en_pair J-stage files; returns early when both already exist.
create_jstage_dataset()

100%|██████████| 1083830/1083830 [00:11<00:00, 91542.11it/s] 


Prepared the raw Japanese J-stage dataset with 614444 to /data/xzhao/dataset/roman-pretrain/ja/data.jsonl.
Prepared the raw English J-stage dataset with 614444 to /data/xzhao/dataset/roman-pretrain/en_pair/data.jsonl.


In [None]:
## Prepare English Pubmed dataset
### Follow the instructions in https://github.com/thoppe/The-Pile-PubMed to download and process the data.
### NOTE: revise p2 to define our format; p3 does not need to be run.
### The code and data are saved at /model/data-scidoc/roman-pretrain/datasets/Pile

# This cell only verifies that the externally prepared file is in place.
# NOTE(review): the ja/zh medical datasets are written under
# DATA_ROOT/datasets/medical/<lang>; confirm that DATA_ROOT/en is still the
# intended location for the English data.
tgt_fn = os.path.join(DATA_ROOT, 'en', "data.jsonl")
assert os.path.exists(tgt_fn), f"English PubMed dataset not found: {tgt_fn}"

In [None]:
## Prepare Chinese medical domain dataset
# Keeps the rows of the raw TSV whose two subject columns (4th and 5th)
# contain one of the medical subject names, and writes them as JSONL.

tgt_fn = os.path.join(DATA_ROOT, "datasets", "medical", "zh", "data.jsonl")
src_fn = os.path.join(DATA_ROOT, 'datasets', 'raw', 'csl_camera_readly.tsv')
medical_subjects = ("医学", "药学", "医药")

# Plain condition instead of sys.exit(0): sys.exit raises SystemExit, which
# interrupts a notebook run and would terminate the whole process if this
# code were executed as a script.
if not os.path.exists(tgt_fn):
    # Create the directory the output is actually written to.
    tgt_dir = os.path.dirname(tgt_fn)
    os.makedirs(tgt_dir, exist_ok=True)

    items = []
    with open(src_fn, "r", encoding="utf8") as fp:
        for i, line in enumerate(fp):
            fields = line.strip().split("\t")
            subjects = [fields[3], fields[4]]
            # Exact match against either subject column.
            if any(name in subjects for name in medical_subjects):
                items.append({
                    "docid": i,  # zero-based line number in the source file
                    "title": fields[0],
                    "abstract": fields[1],
                    "keywords": fields[2].split("_"),
                    "subjects": subjects,
                })

    dump_jsonl(items, tgt_fn)

In [None]:
## Prepare for English-Japanese balanced bilingual corpora
# Converts the tab-separated sentence pairs into JSONL, keeping only the
# pairs whose score (third column) is at least `min_score`.

ROOT = os.path.join(DATA_ROOT, "datasets", "balanced_bilingual", "en-ja")

infn = os.path.join(ROOT, "en-ja", "en-ja.bicleaner05.txt")
outfn = os.path.join(ROOT, "data.jsonl")
min_score = 0.7

# Both files are managed by the `with` block so the output is flushed and
# closed even when a malformed line raises.
with open(infn, 'r', encoding="utf8") as fp, open(outfn, 'w', encoding="utf8") as outfp:
    for line in tqdm(fp):
        # Exactly five columns are expected; the first two are not used.
        _, _, score, en_text, ja_text = line.split("\t")
        score = float(score)
        if score < min_score:
            continue

        item = {
            "en": en_text.strip(),
            "ja": ja_text.strip(),
            "score": score
        }
        string = json.dumps(item, ensure_ascii=False)
        outfp.write(f"{string}\n")
    

In [None]:
## Prepare for Chinese-Japanese balanced bilingual corpora
# Merges the two tab-separated source files into one JSONL file, keeping only
# the pairs whose score is at least `min_score`.

ROOT = os.path.join(DATA_ROOT, "datasets", "balanced_bilingual", "zh-ja")
outfn = os.path.join(ROOT, "data.jsonl")
min_score = 0.7

# The output is opened once in write mode (not re-opened in append mode per
# input file), so re-running the cell rebuilds the file instead of appending
# a second copy, and the handle is closed when the block ends.
with open(outfn, 'w', encoding="utf8") as outfp:
    for filename in ["zh-ja.bicleaner05.txt", "zh-ja.crowdsourcing_b05l07.txt"]:
        infn = os.path.join(ROOT, "zh-ja", filename)

        with open(infn, 'r', encoding="utf8") as fp:
            for line in tqdm(fp):
                # The two sources differ in column count: four columns for
                # the bicleaner file, five for the crowdsourcing file.
                if filename == "zh-ja.bicleaner05.txt":
                    _, score, zh_text, ja_text = line.split("\t")
                else:
                    _, _, score, zh_text, ja_text = line.split("\t")
                score = float(score)
                if score < min_score:
                    continue

                item = {
                    "zh": zh_text.strip(),
                    "ja": ja_text.strip(),
                    "score": score
                }
                string = json.dumps(item, ensure_ascii=False)
                outfp.write(f"{string}\n")
        

In [None]:
## Prepare for Chinese-English balanced bilingual corpora
# Each source file alternates lines: an English line (even index) followed by
# its Chinese counterpart (odd index). Every pair becomes one JSONL record.

ROOT = os.path.join(DATA_ROOT, "datasets", "balanced_bilingual", "en-zh")

indir = os.path.join(ROOT, "UM-Corpus/data/Bilingual")
outfn = os.path.join(ROOT, "data.jsonl")

# Opened once in write mode and closed by the `with` block, so a re-run
# rebuilds the file rather than appending another copy.
with open(outfn, 'w', encoding="utf8") as outfp:
    # Sorted so the output order does not depend on the directory listing order.
    for folder in sorted(os.listdir(indir)):
        fns = os.listdir(os.path.join(indir, folder))
        assert len(fns) == 1
        infn = os.path.join(indir, folder, fns[0])

        # Collect the pairs of THIS file only. Sharing one list across
        # folders would write the earlier folders' pairs again for every
        # later folder.
        items = []
        with open(infn, 'r', encoding="utf8") as fp:
            for i, line in tqdm(enumerate(fp)):
                if i % 2 == 0:
                    items.append({
                        "en": line.strip()})
                else:
                    items[-1]['zh'] = line.strip()

        for item in items:
            string = json.dumps(item, ensure_ascii=False)
            outfp.write(f"{string}\n")
        

In [None]:
## Prepare the Chinese-only part of the scientific en-zh corpora
# Keeps the rows of the raw TSV whose two subject columns (4th and 5th)
# contain none of the medical subject names, and writes them to zh-only.jsonl.

ROOT = os.path.join(DATA_ROOT, "datasets", "scientific_bilingual", "en-zh")
tgt_fn = os.path.join(ROOT, "zh-only.jsonl")
src_fn = os.path.join(DATA_ROOT, 'datasets', 'raw', 'csl_camera_readly.tsv')
medical_subjects = ("医学", "药学", "医药")

# Plain condition instead of sys.exit(0): sys.exit raises SystemExit, which
# interrupts a notebook run and would terminate the whole process if this
# code were executed as a script.
if not os.path.exists(tgt_fn):
    # Create the directory the output is actually written to.
    tgt_dir = os.path.dirname(tgt_fn)
    os.makedirs(tgt_dir, exist_ok=True)

    items = []
    with open(src_fn, "r", encoding="utf8") as fp:
        for i, line in enumerate(fp):
            fields = line.strip().split("\t")
            subjects = [fields[3], fields[4]]
            # Exact match against either subject column; medical rows are excluded.
            if not any(name in subjects for name in medical_subjects):
                items.append({
                    "docid": i,  # zero-based line number in the source file
                    "title": fields[0],
                    "abstract": fields[1],
                    "keywords": fields[2].split("_"),
                    "subjects": subjects,
                })

    dump_jsonl(items, tgt_fn)
    

In [11]:
## Prepare for scientific English-Japanese balanced bilingual corpora
# Converts the "|||"-separated ASPEC-JE files into one JSONL file with the
# keys docid / ja / en.
import re
ROOT = os.path.join(DATA_ROOT, "datasets", "scientific_bilingual", "en-ja")
tgt_fn = os.path.join(ROOT, "data.jsonl")
os.makedirs(ROOT, exist_ok=True)
indir = os.path.join(DATA_ROOT, "datasets", "raw/ASPEC/ASPEC-JE")

# Plain condition instead of sys.exit(0): sys.exit raises SystemExit, which
# interrupts a notebook run and would terminate the whole process if this
# code were executed as a script.
if not os.path.exists(tgt_fn):
    docids = set()
    # The `with` block guarantees the output is flushed and closed.
    with open(tgt_fn, 'w', encoding="utf8") as outfp:
        for filename in ["dev/dev.txt", "devtest/devtest.txt", "test/test.txt", "train/train-1.txt", "train/train-2.txt", "train/train-3.txt"]:
            infn = os.path.join(indir, filename)
            with open(infn, 'r', encoding="utf8") as fp:
                for line in tqdm(fp):
                    # Train files carry one extra leading column, which is not used.
                    if 'train' not in filename:
                        docid, index, ja_text, en_text = line.split("|||")
                    else:
                        _, docid, index, ja_text, en_text = line.split("|||")
                    # Strip both parts for every split so the ids have the
                    # same shape regardless of which file they come from.
                    docid = f'{docid.strip()}-{index.strip()}'

                    assert re.match(r"^\s*[A-Z]-.*", docid), docid
                    # Documents whose id starts with C, E, G, X or Y are left out.
                    if re.match(r"^\s*[CEGXY]-.*", docid):
                        continue

                    # Every (document, sentence index) pair must be unique
                    # across all input files.
                    assert docid not in docids
                    docids.add(docid)
                    item = {
                        "docid": docid,
                        "ja": ja_text.strip(),
                        "en": en_text.strip(),
                    }
                    string = json.dumps(item, ensure_ascii=False)
                    outfp.write(f"{string}\n")

1790it [00:00, 89354.15it/s]
1790it [00:00, 89354.15it/s]
1784it [00:00, 89332.14it/s]
1812it [00:00, 90818.78it/s]
1000000it [00:08, 122964.64it/s]
1000000it [00:07, 126659.38it/s]
1008500it [00:07, 137539.66it/s]


In [15]:
## Prepare for scientific Chinese-Japanese balanced bilingual corpora
# Converts the "|||"-separated ASPEC-JC files into one JSONL file with the
# keys docid / ja / zh.

ROOT = os.path.join(DATA_ROOT, "datasets", "scientific_bilingual", "zh-ja")
tgt_fn = os.path.join(ROOT, "data.jsonl")
# Make sure the output directory exists before opening the file for writing.
os.makedirs(ROOT, exist_ok=True)

indir = os.path.join(DATA_ROOT, "datasets", "raw/ASPEC/ASPEC-JC")

docids = set()
# The `with` block guarantees the output is flushed and closed.
with open(tgt_fn, 'w', encoding="utf8") as outfp:
    for filename in ["dev/dev.txt", "devtest/devtest.txt", "test/test.txt", "train/train.txt"]:
        infn = os.path.join(indir, filename)

        with open(infn, 'r', encoding="utf8") as fp:
            for line in tqdm(fp):
                docid, ja_text, zh_text = line.split("|||")
                # Drop the whitespace around the separator so the stored id
                # is clean, like the two text fields.
                docid = docid.strip()

                # Ids must be unique across all input files.
                assert docid not in docids
                docids.add(docid)

                item = {
                    "docid": docid,
                    "ja": ja_text.strip(),
                    "zh": zh_text.strip(),
                }
                string = json.dumps(item, ensure_ascii=False)
                outfp.write(f"{string}\n")

2090it [00:00, 86970.41it/s]
2090it [00:00, 86970.41it/s]
2148it [00:00, 85994.30it/s]
2107it [00:00, 80710.52it/s]
672315it [00:05, 112536.16it/s]
