# Cleaning the German-English Language Sentence Pair

The German-English data set is downloaded from http://www.manythings.org/anki/<br>

This script does the following clean up steps:
1. Ignore all chars that cannot be represented in ASCII
2. Convert all chars to lowercase 
3. Remove punctuations 
4. Remove all non-pretable chars 
5. Remove none alphabet words 

This code is refereneced from https://machinelearningmastery.com/develop-neural-machine-translation-system-keras/ 

In [1]:
import string
import re
from pickle import dump 
from unicodedata import normalize 
from numpy import array 

In [2]:
def load_doc(filename):
    file = open(filename, mode='rt', encoding='utf-8')
    text = file.read()
    file.close()
    return text

In [3]:
def to_pairs(doc):
    lines = doc.strip().split('\n')
    pairs = [l.split('\t')[0:2] for l in lines]
    pairs = pairs[:20000]
    return pairs

In [4]:
line = "This is a sentence with non-ASCII characters."
# Do you know why Tom wasn't there yesterday?	Weißt du, warum Tom gestern nicht anwesend war? 
line = "Weißt du, warum Tom gestern nicht anwesend war?"
# Do you really believe that Tom is reliable?	Haltet ihr Tom wirklich für verlässlich?
line = "Haltet ihr Tom wirklich für verlässlich?"
print(line)
#  Converts all of the characters to their decomposed form
normalized_line = normalize('NFD', line)
print(normalized_line)
# converts the string to ASCII, ignoring any characters that cannot be represented in ASCII
encoded_line = normalized_line.encode('ascii', 'ignore')
print(encoded_line)

Haltet ihr Tom wirklich für verlässlich?
Haltet ihr Tom wirklich für verlässlich?
b'Haltet ihr Tom wirklich fur verlasslich?'


In [5]:
# re_print = re.compile('[^%s]' % re.escape(string.printable))
print(string.printable +'\n')
print(re.escape(string.printable) + '\n')
print('%s' % re.escape(string.printable) + '\n')

0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~ 	


0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!"\#\$%\&'\(\)\*\+,\-\./:;<=>\?@\[\\\]\^_`\{\|\}\~\ \	\
\\

0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!"\#\$%\&'\(\)\*\+,\-\./:;<=>\?@\[\\\]\^_`\{\|\}\~\ \	\
\\



In [6]:
def clean_pairs(pariedLines):
    cleaned = list()
    # remove non-printable chars
    re_print = re.compile('[^%s]' % re.escape(string.printable))
    table = str.maketrans('', '', string.punctuation)
    for pair in pariedLines:
        clean_pair = list()
        for sentence in pair:
            sentence = normalize('NFD', sentence).encode('ascii', 'ignore')
            sentence = sentence.decode('UTF-8')
            sentence = sentence.split()
            sentence = [word.lower() for word in sentence]
            sentence = [word.translate(table) for word in sentence]
            # remove non-printable chars
            sentence = [re_print.sub('', word) for word in sentence]
            # Remove words with numbers? how to deal with numbers? 
            # How to deal with upper case? And , . ? % these signs? 
            sentence = [word for word in sentence if word.isalpha()] 
            clean_pair.append(' '.join(sentence))
        cleaned.append(clean_pair)
    return array(cleaned)

In [7]:
def save_clean_data(sentences, filename):
    dump(sentences, open(filename, 'wb'))
    print(f'Saved: {filename}')

In [8]:
if __name__ == '__main__' :
    filename = 'deu-eng/deu.txt'
    doc = load_doc(filename)
    pairs = to_pairs(doc)
    cleaned_pairs = clean_pairs(pairs)
    print(f'number of setences = {len(cleaned_pairs)}')
    print(f'max_sentence_len = {len(cleaned_pairs[-1][0].split())}, {len(cleaned_pairs[-1][1].split())}')        
    save_clean_data(cleaned_pairs, 'english-german.pkl')
    for i in range(50):
        print('[%s] -> [%s]' % (cleaned_pairs[-i-1,0], cleaned_pairs[-i-1,1]))

number of setences = 20000
max_sentence_len = 4, 4
Saved: english-german.pkl
[she is very wise] -> [sie ist sehr weise]
[she is very busy] -> [sie ist sehr beschaftigt]
[she is shameless] -> [sie ist schamlos]
[she is on a diet] -> [sie ist auf diat]
[she is obstinate] -> [sie ist hartnackig]
[she is obstinate] -> [sie ist eigensinnig]
[she is obstinate] -> [sie ist stur]
[she is not young] -> [sie ist nicht jung]
[she is not wrong] -> [sie hat nicht unrecht]
[she is no beauty] -> [sie ist keine schonheit]
[she is mad at me] -> [sie ist wutend auf mich]
[she is easygoing] -> [sie ist lassig]
[she is beautiful] -> [sie ist schon]
[she is an expert] -> [sie ist vom fach]
[she is a teacher] -> [sie ist lehrerin]
[she is a student] -> [sie ist schulerin]
[she is a student] -> [sie ist studentin]
[she is ethiopian] -> [sie ist athiopierin]
[she insulted him] -> [sie hat ihn beleidigt]
[she insulted him] -> [sie beleidigte ihn]
[she idolized him] -> [sie hat ihn vergottert]
[she idolized him

## Split the data into training and testing

In [9]:
from pickle import load, dump 
from numpy.random import rand
from numpy.random import shuffle

In [10]:
def load_clean_sentences(filename):
    return load(open(filename, 'rb'))

In [11]:
def save_clean_data(sentences, filename):
    dump(sentences, open(filename, 'wb'))
    print(f'Saved: {filename}')

In [12]:
raw_dataset = load_clean_sentences('english-german.pkl')
len(raw_dataset)

20000

In [13]:
n_sentences = 10000
train_test_ratio = 0.9
dataset = raw_dataset[:n_sentences, :]
shuffle(dataset)
train, test = dataset[ : int(n_sentences*train_test_ratio)], dataset[int(n_sentences*train_test_ratio):]
print(len(train), len(test))

9000 1000


In [14]:
save_clean_data(dataset, 'english-german-both.pkl')
save_clean_data(train, 'english-german-train.pkl')
save_clean_data(test, 'english-german-test.pkl')

Saved: english-german-both.pkl
Saved: english-german-train.pkl
Saved: english-german-test.pkl
