In [5]:
! pip install -U datasets
! pip install -U sentencepiece

Collecting sentencepiece
  Downloading sentencepiece-0.2.0-cp312-cp312-win_amd64.whl.metadata (8.3 kB)
Downloading sentencepiece-0.2.0-cp312-cp312-win_amd64.whl (991 kB)
   ---------------------------------------- 0.0/992.0 kB ? eta -:--:--
   ---------------------------------------- 992.0/992.0 kB 5.8 MB/s eta 0:00:00
Installing collected packages: sentencepiece
Successfully installed sentencepiece-0.2.0


In [6]:
import sentencepiece as spm
import re
import unicodedata

In [7]:
from datasets import load_dataset
dataset = load_dataset("wmt14", "de-en")

  from .autonotebook import tqdm as notebook_tqdm
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Generating train split: 100%|██████████| 4508785/4508785 [00:01<00:00, 2512399.10 examples/s]
Generating validation split: 100%|██████████| 3000/3000 [00:00<00:00, 998881.64 examples/s]
Generating test split: 100%|██████████| 3003/3003 [00:00<00:00, 1488125.58 examples/s]


In [8]:
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['translation'],
        num_rows: 4508785
    })
    validation: Dataset({
        features: ['translation'],
        num_rows: 3000
    })
    test: Dataset({
        features: ['translation'],
        num_rows: 3003
    })
})


In [9]:
# Check if examples are not empty
def is_valid(example):
    return example["translation"]["en"].strip() != "" and example["translation"]["de"].strip() != ""

# Remove commas from inside numbers 
def remove_number_commas(text):
    return re.sub(r'(?<=\d),(?=\d)', '', text)

# Space out punctuation 
def space_out_punctuation(text):
    return re.sub(r'([^\w\s])', r' \1 ', text)

# Remove control characters
def remove_control_chars(text):
    return re.sub(r'[\x00-\x1F\x7F]', '', text)

# Normalize unicode
def normalize_unicode(text):
    return unicodedata.normalize("NFKC", text)

#clean data
def clean_text(example):
    for lang in ["en", "de"]:
        text = example["translation"][lang]
        text = normalize_unicode(text)
        text = remove_control_chars(text)
        text = remove_number_commas(text)
        text = space_out_punctuation(text)
        text = text.strip()
        example["translation"][lang] = text
    return example

# Remove duplicates
def is_not_duplicate(example):
    return example["translation"]["en"] != example["translation"]["de"]

# Apply cleaning
for split in ["train", "validation", "test"]:
    dataset[split] = dataset[split].filter(is_valid)
    dataset[split] = dataset[split].map(clean_text)
    dataset[split] = dataset[split].filter(is_not_duplicate)

Filter: 100%|██████████| 4508785/4508785 [00:21<00:00, 207309.97 examples/s]
Map: 100%|██████████| 4508785/4508785 [03:50<00:00, 19527.53 examples/s]
Filter: 100%|██████████| 4508785/4508785 [00:21<00:00, 210595.08 examples/s]
Filter: 100%|██████████| 3000/3000 [00:00<00:00, 114707.12 examples/s]
Map: 100%|██████████| 3000/3000 [00:00<00:00, 18892.67 examples/s]
Filter: 100%|██████████| 3000/3000 [00:00<00:00, 115701.75 examples/s]
Filter: 100%|██████████| 3003/3003 [00:00<00:00, 111473.44 examples/s]
Map: 100%|██████████| 3003/3003 [00:00<00:00, 17993.95 examples/s]
Filter: 100%|██████████| 3003/3003 [00:00<00:00, 111264.67 examples/s]


In [10]:
# Save English and German translations to separate files
def save_translation_files(dataset_split, split="train"):
    with open(f"data/{split}.en", "w", encoding="utf-8") as f_en, \
            open(f"data/{split}.de", "w", encoding="utf-8") as f_de:
        for example in dataset_split:
            f_en.write(example["translation"]["en"] + "\n")
            f_de.write(example["translation"]["de"] + "\n")

save_translation_files(dataset["train"], split="train")
save_translation_files(dataset["validation"], split="validation")
save_translation_files(dataset["test"], split="test")

FileNotFoundError: [Errno 2] No such file or directory: 'data/train.en'

In [None]:
# Train a joint BPE tokenizer on both English and German data
tokenizer = spm.SentencePieceTrainer.train(
    input="train.en,train.de",
    model_prefix="bpe_joint",
    model_type="bpe"
)

In [None]:
def tokenize_and_save(dataset_split, split="train", src_lang="en", tgt_lang="de", limit=None):
    with open(f"{split}.src", "w", encoding="utf-8") as f_src, \
            open(f"{split}.tgt", "w", encoding="utf-8") as f_tgt:
        for i, example in enumerate(dataset_split):
            if limit is not None and i >= limit:
                break
            f_src.write(" ".join(tokenizer.encode(example["translation"][src_lang], out_type=str)) + "\n")
            f_tgt.write(" ".join(tokenizer.encode(example["translation"][tgt_lang], out_type=str)) + "\n")

tokenize_and_save(dataset["train"], split="train", limit=None)
tokenize_and_save(dataset["validation"], split="validation", limit=None)
tokenize_and_save(dataset["test"], split="test", limit=None)