In [1]:
import re
import os
import sys
import json
import random

from tqdm import tqdm
from utils import DATA_ROOT, load_json, load_jsonl_iteratively, load_config

# One billion, kept as a float. write_text() multiplies its `b_tokens` budget by
# this value to turn "billions of tokens" into an absolute token count.
NUM_1B = 1e9

In [None]:
# Pull every setting this notebook needs out of the experiment configuration.
config = load_config("test_config")
dataset_cfg = config['dataset']

# Root directory the source corpora are read from, and root for experiment outputs.
data_root = dataset_cfg['dataset-dir']
save_root = dataset_cfg['exp-dir']

# Per-collection token budgets; write_text() scales them by NUM_1B.
collections = dataset_cfg['collection']
btoken_kg = collections['knowledge']
btoken_ct = collections['crosslingual-transfer']['en-ja']

# All files produced by this notebook are written under <exp-dir>/<name>.
save_dir = os.path.join(save_root, dataset_cfg['name'])
os.makedirs(save_dir, exist_ok=True)

# Key under which each data item stores its pre-computed token statistics.
tokenizer_type = dataset_cfg['tokenizer']['tokenizer-type']

In [46]:
from pathlib import Path

def write_text(src, tgt, b_tokens, tokenizer_type, lang=None, record_ids=False, docids=None):
    """Append documents from ``src`` to ``tgt`` until a token budget is reached.

    Each written line is a JSON object ``{"text": ...}``. ``src`` is re-read from
    the start as many times as needed, so a source smaller than the budget is
    repeated (up-sampled) until the budget is met.

    Args:
        src: Path of the source JSONL file. Every item must carry
            ``item[tokenizer_type]['num_tokens']`` and ``item['text']``.
        tgt: Path of the output JSONL file (opened in append mode).
        b_tokens: Token budget, expressed in billions (multiplied by ``NUM_1B``).
        tokenizer_type: Key under which each item stores its token statistics.
        lang: Language tag stored alongside each recorded doc id. Required when
            ``record_ids`` is true, unused otherwise.
        record_ids: If true, append ``{"docid", "lang"}`` for every item read to
            ``doc_ids.jsonl`` in the directory of ``tgt``.
        docids: Optional set of doc ids; when given, only items whose ``docid``
            is in the set are written. Mutually exclusive with ``record_ids``.

    Raises:
        NotImplementedError: If an item has no ``tokenizer_type`` entry.
        ValueError: If a full pass over ``src`` adds no tokens (empty source or a
            ``docids`` filter matching nothing), which would otherwise loop forever.
    """
    if record_ids:
        assert docids is None, "docids shouldn't be provided if record_ids is True"
        assert lang is not None, "lang is required if record_ids is True"
    if docids is not None:
        assert isinstance(docids, set), "docids should be a set"
        assert record_ids is False, "docids shouldn't be provided if record_ids is True"

    num_tokens = b_tokens * NUM_1B
    desc = f"{src} → {tgt} ({num_tokens:.2e} tokens)"
    all_cnts = 0

    docid_fp = None
    try:
        if record_ids:
            docid_fn = Path(tgt).parent / "doc_ids.jsonl"
            docid_fp = open(docid_fn, 'a', encoding="utf8")
            print(f"Writing doc ids to {docid_fn}")

        with open(tgt, 'a', encoding="utf8") as fp:
            while all_cnts < num_tokens:
                cnts_before_pass = all_cnts
                for item in tqdm(load_jsonl_iteratively(src), desc=desc):
                    if f'{tokenizer_type}' not in item:
                        raise NotImplementedError(f"Please run `python3 tokenization` first to get tokens for each data item for file {src}, with tokenizer {tokenizer_type}")
                    if record_ids:
                        docid_fp.write(f"{json.dumps({'docid': item['docid'], 'lang': lang}, ensure_ascii=False)}\n")
                    if docids is not None and item['docid'] not in docids:
                        continue

                    string = json.dumps({'text': item['text']}, ensure_ascii=False)
                    fp.write(f"{string}\n")

                    all_cnts += item[f'{tokenizer_type}']['num_tokens']
                    if all_cnts >= num_tokens:
                        break

                # A pass that contributed nothing will never reach the budget;
                # fail loudly instead of re-reading the source forever.
                if all_cnts == cnts_before_pass:
                    raise ValueError(
                        f"No tokens could be collected from {src}; cannot reach "
                        f"the budget of {num_tokens:.2e} tokens"
                    )
    finally:
        # Close explicitly so the recorded ids are flushed before anyone reads them.
        if docid_fp is not None:
            docid_fp.close()

    print(f"Finished writing {all_cnts} tokens to {tgt} from {src}")

In [47]:

# Build the knowledge corpus: one pass of write_text() per language whose budget
# is positive, all appended to the same output file, recording the doc ids used.
kg_fn = os.path.join(save_dir, "knolwedge.jsonl")

# (language sub-directory / tag, budget key in `btoken_kg`, source file name)
kg_sources = [
    ("ja", "ja-medical", "medical_native.jsonl"),
    ("en", "en-medical", "native.subset.jsonl"),
    ("zh", "zh-medical", "medical_native.jsonl"),
]

# write_text() opens its outputs in append mode, so re-running this cell would
# silently duplicate every document. Start from a clean slate instead.
for stale_fn in (kg_fn, os.path.join(save_dir, "doc_ids.jsonl")):
    if os.path.exists(stale_fn):
        os.remove(stale_fn)

for lang, budget_key, basename in kg_sources:
    # A language missing from the config is treated like a zero budget.
    b_tokens = btoken_kg.get(budget_key, 0)
    if b_tokens > 0:
        native_fn = os.path.join(data_root, lang, basename)
        write_text(native_fn, kg_fn, b_tokens=b_tokens, tokenizer_type=tokenizer_type, lang=lang, record_ids=True, docids=None)



Writing doc ids to /data/xzhao/experiments/roman-pretrain/exp-datasets/test/doc_ids.jsonl


/data/xzhao/dataset/roman-pretrain/instructions/ja/medical_native.jsonl → /data/xzhao/experiments/roman-pretrain/exp-datasets/test/knolwedge.jsonl (5.00e+08 tokens): 3461201it [02:51, 20178.45it/s]


Finished writing 500000571 tokens to /data/xzhao/experiments/roman-pretrain/exp-datasets/test/knolwedge.jsonl from /data/xzhao/dataset/roman-pretrain/instructions/ja/medical_native.jsonl
Writing doc ids to /data/xzhao/experiments/roman-pretrain/exp-datasets/test/doc_ids.jsonl


/data/xzhao/dataset/roman-pretrain/instructions/en/native.subset.jsonl → /data/xzhao/experiments/roman-pretrain/exp-datasets/test/knolwedge.jsonl (5.00e+08 tokens): 2050597it [02:39, 12831.11it/s]

Finished writing 500000264 tokens to /data/xzhao/experiments/roman-pretrain/exp-datasets/test/knolwedge.jsonl from /data/xzhao/dataset/roman-pretrain/instructions/en/native.subset.jsonl





In [None]:
# Build the cross-lingual transfer corpus from the per-language-pair budgets.
ct_fn = os.path.join(save_dir, "transfer.jsonl")
# write_text() appends to its target; remove a stale file so that re-running
# this cell does not duplicate the data.
if os.path.exists(ct_fn):
    os.remove(ct_fn)

# Group the doc ids recorded while writing the knowledge corpus by language.
# doc_ids.jsonl holds one JSON object per line, so it is read with the JSONL reader.
docid_fn = f"{save_dir}/doc_ids.jsonl"
lang2docid = {}
if os.path.exists(docid_fn):
    for item in load_jsonl_iteratively(docid_fn):
        lang2docid.setdefault(item['lang'], set()).add(item['docid'])

for lang_pair in collections['crosslingual-transfer']:
    btoken_ct = collections['crosslingual-transfer'][lang_pair]
    # A pair configured as 0 (or left empty) has nothing to iterate over.
    if not btoken_ct:
        continue

    for data_type in btoken_ct:
        if data_type == "zh-ja":
            raise NotImplementedError('zh-ja is not supported currently')
        if btoken_ct[data_type] == 0:
            continue

        assert lang_pair.startswith("en-"), f"Only en-* language pairs are supported, got {lang_pair}"

        # The part after "en-" names the sub-directory holding the source files.
        tgt_lang = lang_pair[3:]
        data_dir = os.path.join(data_root, tgt_lang)
        filename = os.path.join(data_dir, f"{data_type.replace('-', '_')}.jsonl")
        if not os.path.exists(filename):
            raise FileNotFoundError(f"File not found: {filename}")

        # NOTE(review): "medical-*" data types are validated above but not written
        # here, and `lang2docid` is not consumed in this cell — presumably they are
        # meant to be filtered by the recorded doc ids; confirm the intended handling.
        if not data_type.startswith("medical-"):
            write_text(filename, ct_fn, b_tokens=btoken_ct[data_type], tokenizer_type=tokenizer_type, lang=tgt_lang)
        
        
        

/data/xzhao/dataset/roman-pretrain/instructions/ja/balanced_trans.jsonl → /data/xzhao/experiments/roman-pretrain/exp-datasets/test/transfer.jsonl (1.00e+09 tokens): 0it [00:00, ?it/s]

/data/xzhao/dataset/roman-pretrain/instructions/ja/balanced_trans.jsonl → /data/xzhao/experiments/roman-pretrain/exp-datasets/test/transfer.jsonl (1.00e+09 tokens): 16967008it [06:45, 41841.06it/s]

Finished writing 1000000020 tokens to /data/xzhao/experiments/roman-pretrain/exp-datasets/test/transfer.jsonl from /data/xzhao/dataset/roman-pretrain/instructions/ja/balanced_trans.jsonl



