In [2]:
import json
import jsonlines
from tqdm.notebook import tqdm
from datasets import load_dataset

In [5]:
# Load the "kde4" dataset configured with lang1="en" and lang2="fr"
# (each example carries an English/French translation pair).
raw_datasets = load_dataset("kde4", lang1="en", lang2="fr")

Using custom data configuration en-fr-lang1=en,lang2=fr
Reusing dataset kde4 (C:\Users\asus\.cache\huggingface\datasets\kde4\en-fr-lang1=en,lang2=fr\0.0.0\243129fb2398d5b0b4f7f6831ab27ad84774b7ce374cf10f60f6e1ff331648ac)


  0%|          | 0/1 [00:00<?, ?it/s]

In [6]:
# Keep 90% of the "train" split for training and hold out the rest;
# seed=20 is fixed so the same split can be reproduced.
split_datasets = raw_datasets["train"].train_test_split(train_size=0.9, seed=20)

Loading cached split indices for dataset at C:\Users\asus\.cache\huggingface\datasets\kde4\en-fr-lang1=en,lang2=fr\0.0.0\243129fb2398d5b0b4f7f6831ab27ad84774b7ce374cf10f60f6e1ff331648ac\cache-496be247a58b47c1.arrow and C:\Users\asus\.cache\huggingface\datasets\kde4\en-fr-lang1=en,lang2=fr\0.0.0\243129fb2398d5b0b4f7f6831ab27ad84774b7ce374cf10f60f6e1ff331648ac\cache-2c0faebb61cdd12e.arrow


In [7]:
# Rename the held-out split from "test" to "validation".
# NOTE(review): pop("test") presumably raises KeyError if this cell is re-run
# without re-creating split_datasets first — confirm.
split_datasets["validation"] = split_datasets.pop("test")

In [8]:
# Peek at a single training example (index 1) to see the record layout.
split_datasets["train"][1]

{'id': '152754',
 'translation': {'en': 'Default to expanded threads',
  'fr': 'Par défaut, développer les fils de discussion'}}

In [9]:
# Display the available splits with their features and row counts.
split_datasets

DatasetDict({
    train: Dataset({
        features: ['id', 'translation'],
        num_rows: 189155
    })
    validation: Dataset({
        features: ['id', 'translation'],
        num_rows: 21018
    })
})

In [10]:
# Export the training split to disk. The file is read back later with
# read_jsonl_files(), so it is presumably one JSON record per line despite
# the ".json" extension — confirm.
split_datasets["train"].to_json("../data/train_datasets.json")

Creating json from Arrow format:   0%|          | 0/19 [00:00<?, ?ba/s]

29903961

In [11]:
# Export the validation split to disk under the "eval" file name. The file is
# read back later with read_jsonl_files(), so it is presumably one JSON record
# per line despite the ".json" extension — confirm.
split_datasets["validation"].to_json("../data/eval_datasets.json")

Creating json from Arrow format:   0%|          | 0/3 [00:00<?, ?ba/s]

3317985

In [3]:
def read_jsonl_files(filepath):
    """Read a JSON Lines file and return all of its records.

    Args:
        filepath: Path of the file to read. Each record is parsed by
            ``jsonlines.Reader``.

    Returns:
        list: The parsed records, in file order.
    """
    # Decode explicitly as UTF-8. Without an encoding, open() falls back to
    # the locale's preferred encoding (e.g. cp1252 on Windows), which can fail
    # on or silently mangle non-ASCII text such as accented French characters.
    with open(filepath, encoding="utf-8") as f:
        return list(jsonlines.Reader(f))

In [4]:
from transformers import AutoTokenizer

# Load the tokenizer published with the Helsinki-NLP/opus-mt-en-fr checkpoint.
# NOTE(review): the following cell rebinds `tokenizer` to
# "bert-base-multilingual-cased", so when the cells run in order this tokenizer
# is replaced before any later cell uses it — confirm which one is intended.
model_checkpoint = "Helsinki-NLP/opus-mt-en-fr"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [5]:
# Rebind `tokenizer` to the "bert-base-multilingual-cased" tokenizer. This
# replaces the Helsinki-NLP tokenizer loaded in the previous cell, and it is
# the module-level `tokenizer` that get_token_info_main() picks up.
tokenizer = AutoTokenizer.from_pretrained("bert-base-multilingual-cased")

In [9]:
def get_token_info(text_list, tokenizer):
    """Collect the tokens used by a corpus and renumber them compactly.

    Every text is tokenized and each produced token is recorded together with
    its original id. The tokens are then given new consecutive ids: the pad
    token always gets id 0 and the remaining tokens follow in order of first
    appearance.

    Args:
        text_list: Iterable of texts to tokenize.
        tokenizer: Callable applied to one text at a time. Its result must
            expose ``tokens()`` and ``input_ids``, and the tokenizer itself
            must expose ``pad_token`` and ``pad_token_id``.

    Returns:
        dict: Four mappings:
            ``"token2oldId"``: token -> original id,
            ``"token2newId"``: token -> new compact id,
            ``"old2new"``: original id -> new id,
            ``"new2old"``: new id -> original id.
    """
    token2oldId = {}
    for text in tqdm(text_list, leave=False):
        encoded = tokenizer(text)
        # Later occurrences overwrite earlier ones but keep the token's
        # original position, so insertion order is "first appearance".
        token2oldId.update(zip(encoded.tokens(), encoded.input_ids))

    # The pad token must always be present, even if no text produced it.
    pad_token = tokenizer.pad_token
    token2oldId.setdefault(pad_token, tokenizer.pad_token_id)

    # Pad token first (new id 0), everything else in first-appearance order.
    ordered_tokens = [pad_token]
    ordered_tokens.extend(tok for tok in token2oldId if tok != pad_token)

    token2newId = {tok: new_id for new_id, tok in enumerate(ordered_tokens)}
    old2new = {token2oldId[tok]: new_id for tok, new_id in token2newId.items()}
    new2old = {new_id: token2oldId[tok] for tok, new_id in token2newId.items()}

    return {
        "token2oldId": token2oldId,
        "token2newId": token2newId,
        "old2new": old2new,
        "new2old": new2old,
    }

In [10]:
def get_token_info_main():
    """Build and save the token-id remapping tables for English and French.

    Reads ../data/eval_datasets.json and ../data/train_datasets.json, gathers
    the "en" and "fr" strings stored under each record's "translation" key
    (evaluation records first, then training records), runs
    ``get_token_info`` on each language with the module-level ``tokenizer``,
    and dumps the results to ../data/en_token_info.json and
    ../data/fr_token_info.json.

    Note: ``json.dump`` writes non-string dictionary keys as strings, so the
    keys of ``old2new`` / ``new2old`` come back as ``str`` after reloading
    (assuming the tokenizer ids are ints).
    """
    # Evaluation records come first; this order determines the new token ids.
    records = read_jsonl_files("../data/eval_datasets.json")
    records += read_jsonl_files("../data/train_datasets.json")

    texts_by_lang = {
        lang: [record["translation"][lang] for record in records]
        for lang in ("en", "fr")
    }

    # Both languages are processed before anything is written to disk.
    info_by_lang = {
        lang: get_token_info(texts, tokenizer)
        for lang, texts in texts_by_lang.items()
    }

    destinations = (
        ("en", "../data/en_token_info.json"),
        ("fr", "../data/fr_token_info.json"),
    )
    for lang, destination in destinations:
        with open(destination, "w") as out_file:
            json.dump(info_by_lang[lang], out_file)

In [11]:
# Build and save the English and French token-info files; one progress bar is
# shown per language.
get_token_info_main()

  0%|          | 0/210173 [00:00<?, ?it/s]

  0%|          | 0/210173 [00:00<?, ?it/s]

In [7]:
# Notebook-level repeat of the loading steps in get_token_info_main(), keeping
# the intermediate lists as globals so they can be inspected.
eval_data = read_jsonl_files("../data/eval_datasets.json")
train_data = read_jsonl_files("../data/train_datasets.json")

# Evaluation texts first, then training texts (same order as in
# get_token_info_main(), which fixes the order in which tokens are numbered).
en_text_list = [x["translation"]["en"] for x in eval_data] + [x["translation"]["en"] for x in train_data]
fr_text_list = [x["translation"]["fr"] for x in eval_data] + [x["translation"]["fr"] for x in train_data] 

# Only the English token info is computed here; the French call is disabled.
# The texts are tokenized without truncation, which is why the tokenizer warns
# about a sequence longer than the model maximum (623 > 512) in the output.
en_token_info = get_token_info(en_text_list, tokenizer)
# fr_token_info = get_token_info(fr_text_list, tokenizer)

  0%|          | 0/210173 [00:00<?, ?it/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (623 > 512). Running this sequence through the model will result in indexing errors
