# Cleaner


Code to clean (the French – English words) and pair the txt file. For example, it removes the upper case or some of the punctuation of specific characters.


#### Import libraries

In [1]:
import string
import re
from pickle import dump
from pickle import load
from unicodedata import normalize
from numpy import array

In [2]:
## function ##
# load doc into memory
def load_doc(filename):
    # open the file as read only
    file = open(filename, mode='rt', encoding='utf-8')
    # read all text
    text = file.read()
    # close the file
    file.close()
    return text

# split a loaded document into sentences
def to_pairs(doc):
    lines = doc.strip().split('\n')
    pairs = [line.split('\t') for line in  lines]
    return pairs

def clean_file(lines):
    cleaned = list()
    # prepare regex for char filtering
    string_print = ' a-zA-ZàâäôéèëêïîçùûüÿæœÀÂÄÔÉÈËÊÏÎŸÇÙÛÜÆŒ0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~ \t\n\r\x0b\x0c'
    re_print = re.compile('[^%s]' % re.escape(string_print))
    string_punc = '!"#$%&\()*+,-./:;<=>?@[\\]^_{|}~'
    # prepare translation table for removing punctuation
    table = str.maketrans('', '', string_punc)
    for pair in lines:
        clean_pairs = list()
        for line in pair:
            line = normalize('NFKD', line).encode('latin1', 'ignore')
            line = line.decode('latin1')
            # tokenize on white space
            line = line.split()
            # convert to lowercase
            line = [word.lower() for word in line]
            # remove punctuation from each token
            line = [word.translate(table) for word in line]
            # remove non-printable chars form each token
            line = [re_print.sub('', w) for w in line]
            # remove tokens with numbers in them
        #    line = [word for word in line if word.isalpha()]
            # store as string
            clean_pairs.append(" ".join(line))
        cleaned.append(clean_pairs)
    return array(cleaned)


# save a list of clean sentences to file
def save_clean_data(sentences, filename):
    dump(sentences, open(filename, 'wb'))
    print('Saved: %s' % filename)

In [3]:
# load dataset
filename = 'fra.txt'
doc = load_doc(filename)
# split into english and french pairs
pairs = to_pairs(doc)

#adding vocabulary
text = [["dogs","chiens"],
        ["I love dogs","j'aime les chiens"],
        ['cats',"chats"],
        ["basketball","basketball"],
        ["I like football and baseball","j'aime le football et le baseball"],
        ['football',"football"],
        ["my favorite sport is football", "mon sport favori est le football"],
        ["baseball is the best sprot","le baseball c'est le meilleur sport"],
        ['baseball','baseball'],
        ["apples","pommes"],
        ["I like a lot apples","j'aime beaucoup les pommes"]
               ]

text.extend(pairs)
#get pairs
clean_pairs = clean_file(text)
# save clean pairs to file
save_clean_data(clean_pairs, 'english-french_final.pkl')

for i in range(100):
    print('[%s] => [%s]' % (clean_pairs[i,0], clean_pairs[i,1]))

Saved: english-french_final.pkl
[dogs] => [chiens]
[i love dogs] => [j'aime les chiens]
[cats] => [chats]
[basketball] => [basketball]
[i like football and baseball] => [j'aime le football et le baseball]
[football] => [football]
[my favorite sport is football] => [mon sport favori est le football]
[baseball is the best sprot] => [le baseball c'est le meilleur sport]
[baseball] => [baseball]
[apples] => [pommes]
[i like a lot apples] => [j'aime beaucoup les pommes]
[go] => [va ]
[run] => [cours ]
[run] => [courez ]
[fire] => [au feu ]
[help] => [a l'aide ]
[jump] => [saute]
[stop] => [ca suffit ]
[stop] => [stop ]
[stop] => [arretetoi ]
[wait] => [attends ]
[wait] => [attendez ]
[go on] => [poursuis]
[go on] => [continuez]
[go on] => [poursuivez]
[i see] => [je comprends]
[i try] => [j'essaye]
[i won] => [j'ai gagne ]
[i won] => [je l'ai emporte ]
[oh no] => [oh non ]
[attack] => [attaque ]
[attack] => [attaquez ]
[cheers] => [sante ]
[cheers] => [a votre sante ]
[cheers] => [merci ]
[