In [1]:
import pickle
import unicodedata

import sentencepiece as spm

In [2]:
def load_parallel_corpus(src, tgt):
    """
    Load a line-aligned parallel corpus from two utf-8 text files.

    Line ``i`` of the source file is paired with line ``i`` of the target
    file. Pairs in which either side is blank (empty or whitespace only) are
    dropped from both lists, so the two returned lists stay index-aligned.

    Parameters
    ----------
    src: str
        Path of the source language file to load.
    tgt: str
        Path of the target language file to load.

    Returns
    -------
    (list, list)
        Source and target sentences, index-aligned. Each kept sentence is
        returned exactly as read, including its line terminator.

    Raises
    ------
    ValueError
        If the two files do not contain the same number of lines.
    """
    with open(src, encoding="utf-8") as src_f, open(tgt, encoding="utf-8") as tgt_f:
        src_lines = src_f.readlines()
        tgt_lines = tgt_f.readlines()

    if len(src_lines) != len(tgt_lines):
        raise ValueError("Number of records in source and target files are not equal")

    # readlines() keeps the trailing newline, so a blank line is "\n" (truthy);
    # test the stripped text instead. New lists are built rather than popping
    # while indexing, which shifts later elements and runs past the end.
    kept = [
        (src_line, tgt_line)
        for src_line, tgt_line in zip(src_lines, tgt_lines)
        if src_line.strip() and tgt_line.strip()
    ]
    return [pair[0] for pair in kept], [pair[1] for pair in kept]

In [3]:
def train_spm(iterable, prefix, vocab=2_000, sentence_size=50_000, model_type="bpe"):
    """
    Train a sentence-piece tokenizer.
    The pad, unk, bos, eos tokens correspond to ids 0, 1, 2 and 3 respectively.

    Parameters
    ----------
    iterable: Sequence[str]
        Sentences to train the tokenizer on.
    prefix: str
        A prefix for .model, .vocab files.
    vocab: int, optional
        Size of the vocabulary (default is 2_000)
    sentence_size: int, optional
        Size of sentences to sample for training. (default is 50_000)
    model_type: str, optional
        Type of model. Either "bpe" or "unigram". (default is "bpe")
    """
    spm.SentencePieceTrainer.train(
        sentence_iterator=iter(iterable),
        model_prefix=prefix,
        vocab_size=vocab,
        # Forward the caller's choice; it was previously hard-coded to "bpe",
        # so passing model_type="unigram" silently had no effect.
        model_type=model_type,
        # The "identity" rule name is passed as-is; this notebook applies
        # Unicode normalization itself before training.
        normalization_rule_name="identity",
        input_sentence_size=sentence_size,
        shuffle_input_sentence=True,
        pad_id=0,
        unk_id=1,
        bos_id=2,
        eos_id=3,
        unk_surface='<unk>',
    )

In [4]:
def load_spm(path):
    """
    Load a previously trained sentence-piece tokenizer from disk.

    Parameters
    ----------
    path: str
        Path of the model without the ".model" extension; the extension is
        appended before loading.

    Returns
    -------
    SentencePieceProcessor
        Processor built from the "<path>.model" file.
    """
    model_file = f"{path}.model"
    return spm.SentencePieceProcessor(model_file=model_file)

In [5]:
def data_split(arr, train_size=0.8, val_size=0.1):
    """
    Cut a sequence into consecutive train, validation and test portions.

    The train and validation fractions should sum to at most 1. Whatever is
    left after the train and validation portions becomes the test portion.
    No shuffling is performed; the portions are contiguous slices of ``arr``.

    Parameters
    ----------
    arr: list
        A list of sentences.
    train_size: float, optional
        Fraction of elements for the train set. (default is 0.8)
    val_size: float, optional
        Fraction of elements for the validation set. (default is 0.1)

    Returns
    -------
    (list, list, list)
        A tuple of train, validation and test set.
    """
    total = len(arr)
    train_end = int(total * train_size)
    # The validation slice ends at the cumulative train + validation fraction.
    val_end = int(total * (train_size + val_size))

    train_part = arr[:train_end]
    val_part = arr[train_end:val_end]
    test_part = arr[val_end:]
    return train_part, val_part, test_part

In [6]:
def normalize(arr, form="NFC"):
    """
    Performs unicode normalization on sentences.

    Parameters
    ----------
    arr: Iterable[str]
        Sentences to normalize.
    form: str, optional
        Normalization form to use. NFD/NFC/NFKD/NFKC. (default is "NFC")

    Returns
    -------
    list
        A list of normalized sentences, in the same order as the input.
    """
    return [unicodedata.normalize(form, seq) for seq in arr]

In [7]:
# Read the line-aligned English (source) and French (target) corpus files.
en, fr = load_parallel_corpus(
    src="./en_fr/en",
    tgt="./en_fr/fr",
)

In [8]:
# Apply NFC Unicode normalization to both sides of the corpus. Both results
# are computed before either name is rebound.
en, fr = [normalize(corpus, "NFC") for corpus in (en, fr)]

In [9]:
# 70% train / 15% validation / remainder test. The same fractions are used
# for both languages so the X* and y* splits stay index-aligned.
Xtr, Xdev, Xte = data_split(
    en,
    train_size=0.7,
    val_size=0.15,
)
ytr, ydev, yte = data_split(
    fr,
    train_size=0.7,
    val_size=0.15,
)

In [10]:
# Fit one 5,000-piece tokenizer per language, using the training split only.
# Output files are written under the "./tokenizer/en" and "./tokenizer/fr" prefixes.
train_spm(
    iterable=Xtr,
    prefix="./tokenizer/en",
    vocab=5_000,
    sentence_size=1_000_000,
)
train_spm(
    iterable=ytr,
    prefix="./tokenizer/fr",
    vocab=5_000,
    sentence_size=1_000_000,
)

In [11]:
# Load the English and French tokenizers from their "<prefix>.model" files.
en_spm, fr_spm = (
    load_spm(path="./tokenizer/en"),
    load_spm(path="./tokenizer/fr"),
)

In [12]:
# Tokenize every split. French (target) sequences get bos/eos markers added;
# English (source) sequences do not.
# NOTE(review): `alpha=0.1` is passed for the training splits, but no
# `enable_sampling=True` is given; presumably encoding is therefore
# deterministic and `alpha` has no effect — confirm against the sentencepiece
# encode() documentation and the intended behaviour.
# NOTE(review): this cell rebinds Xtr/ytr/... from raw sentences to tokenized
# output, so it is not safe to run twice without re-running the split cell.
Xtr, Xdev, Xte = (
    en_spm.tokenize(Xtr, alpha=0.1),
    en_spm.tokenize(Xdev),
    en_spm.tokenize(Xte),
)
ytr, ydev, yte = (
    fr_spm.tokenize(ytr, alpha=0.1, add_bos=True, add_eos=True),
    fr_spm.tokenize(ydev, add_bos=True, add_eos=True),
    fr_spm.tokenize(yte, add_bos=True, add_eos=True),
)

In [13]:
# Write each (source, target) training pair as its own pickle record followed
# by a b'\n\n' delimiter. `pickle` comes from the notebook's import cell.
# NOTE(review): pickle payloads are binary and can themselves contain b'\n\n'
# (for instance the integer 2570 serialises with the bytes 0x0A 0x0A), so a
# reader should call pickle.load() per record and then skip the two delimiter
# bytes rather than splitting the file on the delimiter — confirm how the
# consumer parses this file before changing the format.
with open("./data_splits/en_fr_train_tokenized.pickle", 'wb') as f:
    for data in zip(Xtr, ytr):
        f.write(pickle.dumps(data))
        f.write(b'\n\n')

In [14]:
# Write each (source, target) validation pair as its own pickle record followed
# by a b'\n\n' delimiter. `pickle` comes from the notebook's import cell.
# NOTE(review): pickle payloads are binary and can themselves contain b'\n\n',
# so a reader should call pickle.load() per record and then skip the two
# delimiter bytes rather than splitting the file on the delimiter — confirm
# how the consumer parses this file before changing the format.
with open("./data_splits/en_fr_val_tokenized.pickle", 'wb') as f:
    for data in zip(Xdev, ydev):
        f.write(pickle.dumps(data))
        f.write(b'\n\n')

In [15]:
# Write each (source, target) test pair as its own pickle record followed by a
# b'\n\n' delimiter. `pickle` comes from the notebook's import cell.
# NOTE(review): pickle payloads are binary and can themselves contain b'\n\n',
# so a reader should call pickle.load() per record and then skip the two
# delimiter bytes rather than splitting the file on the delimiter — confirm
# how the consumer parses this file before changing the format.
with open("./data_splits/en_fr_test_tokenized.pickle", 'wb') as f:
    for data in zip(Xte, yte):
        f.write(pickle.dumps(data))
        f.write(b'\n\n')