In [88]:
#!pip install -U datasets
#!pip install -U sentencepiece

In [89]:
import sentencepiece as spm
import re

In [90]:
from datasets import load_dataset
# Load the "de-en" configuration of WMT14. The printed summary below shows the
# result is a DatasetDict with train/validation/test splits, each row holding
# a single "translation" feature.
dataset = load_dataset("wmt14", "de-en")

In [91]:
# Show the splits and their row counts before any filtering is applied.
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['translation'],
        num_rows: 4508785
    })
    validation: Dataset({
        features: ['translation'],
        num_rows: 3000
    })
    test: Dataset({
        features: ['translation'],
        num_rows: 3003
    })
})


In [92]:
#Remove empty examples
def is_valid(example):
    """Return True when both sides of the translation pair contain text.

    A side counts as empty if it is the empty string or whitespace only.
    The English side is checked first; the German side is only inspected
    when the English side is non-empty.
    """
    pair = example["translation"]
    return all(pair[lang].strip() != "" for lang in ("en", "de"))

# DatasetDict.filter applies the predicate to every split (train, validation,
# test) in one call, replacing three copy-pasted per-split statements.
dataset = dataset.filter(is_valid)

In [93]:
# Drop pairs whose English and German texts are identical (this is not a row-level dedup)
def remove_duplicates(example):
    """Keep an example only if its English and German texts differ.

    Despite the name, this is a per-row predicate: it rejects a pair whose
    two sides are the exact same string and never compares rows with each
    other.
    """
    pair = example["translation"]
    english, german = pair["en"], pair["de"]
    return english != german
# DatasetDict.filter runs the predicate over every split (train, validation,
# test) in one call, replacing three copy-pasted per-split statements.
dataset = dataset.filter(remove_duplicates)

In [94]:
# Remove thousands-separator commas from numbers
def remove_number_commas(text):
    """Strip commas that group digits in threes, e.g. "1,234,567" -> "1234567".

    A comma is removed only when it sits between digits and is followed by
    exactly three digits. Any other digit-adjacent comma is kept, so decimal
    commas ("3,5") and comma-separated values ("1,2") are not fused into a
    different number.

    Args:
        text: Input string.

    Returns:
        The string with grouping commas removed.
    """
    # (?<=\d)        a digit directly before the comma
    # (?=\d{3}(?!\d)) exactly three digits directly after it
    return re.sub(r'(?<=\d),(?=\d{3}(?!\d))', '', text)

# Surround punctuation marks with spaces
def space_out_punctuation(text):
    """Return `text` with a space inserted on each side of . , ! ? ( and ).

    Existing whitespace is left untouched, so the result can contain runs of
    spaces as well as a leading or trailing space.
    """
    punctuation = r'([.,!?()])'
    padded = r' \1 '
    return re.sub(punctuation, padded, text)

# Clean both sides of a translation pair
def clean_text(example):
    """Clean the English and German text of one example in place.

    For each language (English first, then German) the text is passed
    through `remove_number_commas` and then `space_out_punctuation`, and the
    result is written back into example["translation"].

    Returns:
        The same example object that was passed in.
    """
    pair = example["translation"]
    for lang in ("en", "de"):
        pair[lang] = space_out_punctuation(remove_number_commas(pair[lang]))
    return example

# Apply the cleaning function to every split. DatasetDict.map runs it over
# train, validation and test in one call instead of three copy-pasted ones.
dataset = dataset.map(clean_text)

In [95]:
# Save English and German translations to separate, line-aligned files
def save_translation_files(dataset_split, split="train"):
    """Write one split to `{split}.en` and `{split}.de`, one example per line.

    Line i of both files belongs to the same example. To keep that alignment,
    line breaks inside a text are replaced by a single space before writing.

    Args:
        dataset_split: Iterable of examples; each must provide
            example["translation"]["en"] and example["translation"]["de"].
        split: Prefix used for the two output file names.
    """
    def single_line(text):
        # A raw "\n" or "\r" would start a new line in only one of the two
        # files and shift every following pair out of alignment.
        return text.replace("\r\n", " ").replace("\r", " ").replace("\n", " ")

    with open(f"{split}.en", "w", encoding="utf-8") as f_en, \
            open(f"{split}.de", "w", encoding="utf-8") as f_de:
        for example in dataset_split:
            pair = example["translation"]
            f_en.write(single_line(pair["en"]) + "\n")
            f_de.write(single_line(pair["de"]) + "\n")

# Write the .en/.de files for each of the three splits
for split_name in ("train", "validation", "test"):
    save_translation_files(dataset[split_name], split=split_name)

In [96]:
# Train one BPE model shared by English and German
bpe_trainer_options = {
    "input": "train.en,train.de",  # both language files feed the same model
    "model_prefix": "bpe_joint",   # the next cell loads "bpe_joint.model"
    "vocab_size": 8000,
    "model_type": "bpe",
}
spm.SentencePieceTrainer.train(**bpe_trainer_options)

In [97]:
# Load the joint BPE model trained above; tokenize_and_save below reads this
# module-level processor.
tokenizer = spm.SentencePieceProcessor(model_file="bpe_joint.model")

def tokenize_and_save(dataset_split, split="train", src_lang="en", tgt_lang="de", limit=10000):
    """Tokenize a split with the module-level `tokenizer` and write it to disk.

    Each example becomes one line in `{split}.src` (source language) and one
    line in `{split}.tgt` (target language). A line is the list of tokens
    returned by `tokenizer.encode(text, out_type=str)`, joined by single
    spaces.

    Args:
        dataset_split: Iterable of examples providing
            example["translation"][src_lang] and
            example["translation"][tgt_lang].
        split: Prefix of the two output file names.
        src_lang: Key of the source text inside example["translation"].
        tgt_lang: Key of the target text inside example["translation"].
        limit: Maximum number of examples to write, counted from the start
            of the split. The default keeps only the first 10000 examples;
            pass None to write the whole split.
    """
    def to_line(text):
        return " ".join(tokenizer.encode(text, out_type=str)) + "\n"

    with open(f"{split}.src", "w", encoding="utf-8") as f_src, \
            open(f"{split}.tgt", "w", encoding="utf-8") as f_tgt:
        for i, example in enumerate(dataset_split):
            # limit=None disables the cap; any integer keeps the original
            # "stop once i reaches limit" behaviour.
            if limit is not None and i >= limit:
                break
            pair = example["translation"]
            f_src.write(to_line(pair[src_lang]))
            f_tgt.write(to_line(pair[tgt_lang]))

# Tokenize each split with the default language pair and example limit
for split_name in ("train", "validation", "test"):
    tokenize_and_save(dataset[split_name], split=split_name)