# Cleaning the German-English Language Sentence Pair

The German-English data set is downloaded from http://www.manythings.org/anki/<br>

This script does the following clean up steps:
1. Ignore all chars that cannot be represented in ASCII
2. Convert all chars to lowercase 
3. Remove punctuations 
4. Remove all non-pretable chars 
5. Remove none alphabet words 

This code is refereneced from https://machinelearningmastery.com/develop-neural-machine-translation-system-keras/ 

In [1]:
import string
import re
from pickle import dump 
from unicodedata import normalize 
from numpy import array 

In [2]:
def load_doc(filename):
    file = open(filename, mode='rt', encoding='utf-8')
    text = file.read()
    file.close()
    return text

In [3]:
def to_pairs(doc):
    lines = doc.strip().split('\n')
    pairs = [l.split('\t')[0:2] for l in lines]
    pairs = pairs[:20000]
    return pairs

In [4]:
line = "This is a sentence with non-ASCII characters."
# Do you know why Tom wasn't there yesterday?	Weißt du, warum Tom gestern nicht anwesend war? 
line = "Weißt du, warum Tom gestern nicht anwesend war?"
# Do you really believe that Tom is reliable?	Haltet ihr Tom wirklich für verlässlich?
line = "Haltet ihr Tom wirklich für verlässlich?"
print(line)
#  Converts all of the characters to their decomposed form
normalized_line = normalize('NFD', line)
print(normalized_line)
# converts the string to ASCII, ignoring any characters that cannot be represented in ASCII
encoded_line = normalized_line.encode('ascii', 'ignore')
print(encoded_line)

Haltet ihr Tom wirklich für verlässlich?
Haltet ihr Tom wirklich für verlässlich?
b'Haltet ihr Tom wirklich fur verlasslich?'


In [5]:
# re_print = re.compile('[^%s]' % re.escape(string.printable))
print(string.printable +'\n')
print(re.escape(string.printable) + '\n')
print('%s' % re.escape(string.printable) + '\n')

0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~ 	


0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!"\#\$%\&'\(\)\*\+,\-\./:;<=>\?@\[\\\]\^_`\{\|\}\~\ \	\
\\

0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!"\#\$%\&'\(\)\*\+,\-\./:;<=>\?@\[\\\]\^_`\{\|\}\~\ \	\
\\



In [6]:
def clean_pairs(pariedLines):
    cleaned = list()
    # remove non-printable chars
    re_print = re.compile('[^%s]' % re.escape(string.printable))
    table = str.maketrans('', '', string.punctuation)
    for pair in pariedLines:
        clean_pair = list()
        for sentence in pair:
            sentence = normalize('NFD', sentence).encode('ascii', 'ignore')
            sentence = sentence.decode('UTF-8')
            sentence = sentence.split()
            sentence = [word.lower() for word in sentence]
            sentence = [word.translate(table) for word in sentence]
            # remove non-printable chars
            sentence = [re_print.sub('', word) for word in sentence]
            # Remove words with numbers? how to deal with numbers? 
            # How to deal with upper case? And , . ? % these signs? 
            sentence = [word for word in sentence if word.isalpha()] 
            clean_pair.append(' '.join(sentence))
        cleaned.append(clean_pair)
    return array(cleaned)

In [7]:
def save_clean_data(sentences, filename):
    dump(sentences, open(filename, 'wb'))
    print(f'Saved: {filename}')

In [8]:
if __name__ == '__main__' :
    filename = 'deu-eng/deu.txt'
    doc = load_doc(filename)
    pairs = to_pairs(doc)
    cleaned_pairs = clean_pairs(pairs)
    print(f'number of setences = {len(cleaned_pairs)}')
    print(f'max_sentence_len = {len(cleaned_pairs[-1][0].split())}, {len(cleaned_pairs[-1][1].split())}')        
    save_clean_data(cleaned_pairs, 'english-german.pkl')
    for i in range(50):
        print('[%s] -> [%s]' % (cleaned_pairs[-i-1,0], cleaned_pairs[-i-1,1]))

number of setences = 100000
max_sentence_len = 5, 5
Saved: english-german.pkl
[tom really needs help now] -> [tom braucht jetzt wirklich hilfe]
[tom really made it happen] -> [tom hat es wirklich moglich gemacht]
[tom really likes swimming] -> [tom schwimmt fur sein leben gern]
[tom really got a bad deal] -> [tom hat wirklich einen schlechten deal bekommen]
[tom really enjoys his job] -> [tom gefallt seine arbeit wirklich sehr]
[tom really does hate dogs] -> [tom hasst hunde wirklich]
[tom realized he was wrong] -> [tom erkannte dass er sich getauscht hatte]
[tom read a selfhelp book] -> [tom hat ein selbsthilfebuch gelesen]
[tom reached for his sword] -> [tom griff nach seinem schwert]
[tom rarely makes mistakes] -> [tom macht selten fehler]
[tom rarely asks questions] -> [tom stellt selten fragen]
[tom raises racing pigeons] -> [tom zieht renntauben auf]
[tom raised his right hand] -> [tom hob seine rechte hand]
[tom quickly lost interest] -> [tom verlor schnell das interesse]
[tom q