In [1]:
def load_data(path: str):
    """
    Reads a UTF-8 encoded text file and returns its lines as a list of strings.

    Each returned string keeps its line terminator, exactly as produced by
    iterating over the open file.
    """
    with open(path, encoding="utf-8") as handle:
        lines = list(handle)
    return lines

In [2]:
# Read the English and French text files into lists with one entry per line.
# load_data returns the raw lines, so each entry still carries its line terminator.
# NOTE(review): the two files are presumably line-aligned translations; confirm against the dataset.
en = load_data("./en_fr/en.txt")
fr = load_data("./en_fr/fr.txt")

In [3]:
import string, random

def add_noise(sentence, noise=0.05, rng=None):
    """
    Adds noise to a sentence by randomly modifying its characters.

    Every character is selected with probability `noise`. A selected character
    is then either deleted, preceded by a random ASCII letter, or replaced by a
    random ASCII letter (one of the three edits is picked uniformly).

    Args:
        sentence: String to corrupt.
        noise: Per-character corruption probability, within [0, 1].
        rng: Optional source of randomness exposing random() and choice(),
            e.g. a seeded random.Random instance, for reproducible noise.
            Defaults to the global `random` module.

    Returns:
        The noised sentence as a new string.

    Raises:
        ValueError: If noise lies outside [0, 1].
    """
    if not 0 <= noise <= 1:
        raise ValueError("Noise must be between 0 and 1")

    if rng is None:
        # Fall back to the module-level generator so existing callers
        # (and global random.seed() calls) behave exactly as before.
        rng = random

    modsentence = []
    for char in sentence:
        if rng.random() < noise:
            choice = rng.choice(["delete", "insert", "replace"])
            # Drawn for every edit kind (even "delete") so the sequence of
            # random draws stays identical to the historical behaviour.
            new_char = rng.choice(string.ascii_letters)
            if choice == "delete":
                # Drop the character entirely.
                continue
            elif choice == "insert":
                # The new letter goes in front of the original character.
                modsentence.append(new_char)
            elif choice == "replace":
                char = new_char
        modsentence.append(char)

    return ''.join(modsentence)

In [4]:
# Quick sanity check of add_noise on a short sentence, using its default noise level.
# No random seed is set in this cell, so the printed result differs between runs.
sentence = "This is a brown fox."
print(add_noise(sentence))

This s a brown fox.


In [5]:
import unicodedata

# Strip only the trailing newline that readlines() leaves on each entry. Slicing with
# [:-1] would also remove a real character from a final line that has no newline
# terminator, and would eat one more character every time this cell is re-run.
# Both sides are NFC-normalized; character noise (1%) is injected into the English side only.
# NOTE: this cell rebinds `en`/`fr`, so re-running it applies another round of noise to `en`.
en = [add_noise(unicodedata.normalize("NFC", sentence.rstrip("\n")), noise=0.01) for sentence in en]
fr = [unicodedata.normalize("NFC", sentence.rstrip("\n")) for sentence in fr]

In [6]:
# Print the first five entries of both lists side by side to eyeball the result of the
# previous cell (noised English next to the French entry at the same index).
for i in range(5):
    print(f"English: {en[i]}")
    print(f"French: {fr[i]}")

English: Approval of the minutesLof the previous sAitting: see Minutes
French: Approbation du procès-verbal de la séance précédente: voir procès-verbal
English: 5.
French: 5.
English: Environmental quality standards in the field of water policy (vote)
French: Normes de qualité environnementale dans le domaine de l'eau (vote)
English: - Report: Laperrouze
French: Rapport: Laperrouze
English: Aprroval of the Minutes
French: Adoption du procès-verbal


In [7]:
# Number of entries (one per input line) in the English list.
len(en)

2455458

In [8]:
import sentencepiece as spm

def train_spm(it, prefix, vocab=2_000, sentence_size=50_000, model_type="bpe"):
    """
    Trains a SentencePiece model from an iterable of sentences.

    Args:
        it: Iterable of sentences; wrapped with iter() before being handed
            to the trainer as sentence_iterator.
        prefix: Forwarded as model_prefix (the prefix of the files the
            trainer writes).
        vocab: Forwarded as vocab_size.
        sentence_size: Forwarded as input_sentence_size; input shuffling is
            always enabled.
        model_type: Forwarded as model_type. Defaults to "bpe".

    Returns:
        None. The trainer's output is written to disk by SentencePiece.
    """
    spm.SentencePieceTrainer.train(
        sentence_iterator=iter(it),
        model_prefix=prefix,
        vocab_size=vocab,
        # Honour the caller's choice; this used to be hard-coded to "bpe",
        # which silently ignored the model_type argument.
        model_type=model_type,
        # "identity": no extra normalization rule on the SentencePiece side
        # (see the SentencePiece normalization options).
        normalization_rule_name="identity",
        input_sentence_size=sentence_size,
        shuffle_input_sentence=True,
    )

In [9]:
# Train one tokenizer per language with a 5,000-entry vocabulary and
# input_sentence_size=1_000_000 (train_spm enables input shuffling).
# The "en" / "fr" prefixes match the ./en.model and ./fr.model files loaded by load_spm.
train_spm(en, "en", vocab=5_000, sentence_size=1_000_000)
train_spm(fr, "fr", vocab=5_000, sentence_size=1_000_000)

In [10]:
def load_spm(prefix):
    """
    Loads the SentencePiece model stored at ./<prefix>.model.

    Args:
        prefix: Model file name without the ".model" extension, resolved
            relative to the current working directory.

    Returns:
        A SentencePieceProcessor built from that model file.
    """
    return spm.SentencePieceProcessor(model_file=f"./{prefix}.model")

In [11]:
# Load the tokenizers from ./en.model and ./fr.model (the paths load_spm builds from the prefix).
en_spm = load_spm("en")
fr_spm = load_spm("fr")

In [12]:
# Check the number of pieces each loaded tokenizer reports; both were trained with vocab=5_000.
print(f"English vocab:{en_spm.get_piece_size()}")
print(f"French vocab:{fr_spm.get_piece_size()}")

English vocab:5000
French vocab:5000


In [20]:
import pickle

# Encode first and only then open the output files: opening with "wb" truncates an
# existing pickle immediately, so a failure while encoding the whole corpus would
# otherwise leave empty or clobbered files behind.
en_enc = en_spm.encode(en)
# Only the French sequences get BOS/EOS ids added.
fr_enc = fr_spm.encode(fr, add_bos=True, add_eos=True)

with open("./en_fr/en_enc.pickle", "wb") as en_f:
    pickle.dump(en_enc, en_f)
with open("./en_fr/fr_enc.pickle", "wb") as fr_f:
    pickle.dump(fr_enc, fr_f)