# Cleaning the German-English Language Sentence Pair

This code is refereneced from https://machinelearningmastery.com/develop-neural-machine-translation-system-keras/ </br> 
and https://github.com/zzc01/Transformer/blob/main/Data_Cleaning/clean_pairs.ipynb

In [7]:
import string
import re
from pickle import dump, load 
from unicodedata import normalize 
from numpy import array 
from numpy.random import shuffle
from numpy import savetxt

In [2]:
def load_doc(filename):
    file = open(filename, mode='rt', encoding='utf-8')
    text = file.read()
    file.close()
    return text

In [3]:
def to_pairs(doc):
    lines = doc.strip().split('\n')
    pairs = [[l.split('\t')[1], l.split('\t')[0]] for l in lines]
    return pairs

In [4]:
def clean_pairs(pariedLines):
    cleaned = list()
    # remove non-printable chars
    re_print = re.compile('[^%s]' % re.escape(string.printable))
    table = str.maketrans('', '', string.punctuation)
    for pair in pariedLines:
        clean_pair = list()
        for sentence in pair:
            sentence = normalize('NFD', sentence).encode('ascii', 'ignore')
            sentence = sentence.decode('UTF-8')
            sentence = sentence.split()
            sentence = [word.lower() for word in sentence]
            sentence = [word.translate(table) for word in sentence]
            # remove non-printable chars
            sentence = [re_print.sub('', word) for word in sentence]
            # Remove words with numbers? how to deal with numbers? 
            # How to deal with upper case? And , . ? % these signs? 
            sentence = [word for word in sentence if word.isalpha()] 
            clean_pair.append(' '.join(sentence))
        cleaned.append(clean_pair)
    return array(cleaned)

In [5]:
def save_clean_data(sentences, filename):
    dump(sentences, open(filename, 'wb'))
    print(f'Saved: {filename}')

In [6]:
filename = './data/deu.txt'
doc = load_doc(filename)
pairs = to_pairs(doc)
cleaned_pairs = clean_pairs(pairs)
print(f'number of setences = {len(cleaned_pairs)}')
print(f'max_sentence_len = {len(cleaned_pairs[-1][0].split())}, {len(cleaned_pairs[-1][1].split())}')        
save_clean_data(cleaned_pairs, './data/german-english.pkl')    

number of setences = 261499
max_sentence_len = 75, 101
Saved: ./data/german-english.pkl


In [7]:
for i in range(5):    print('[%s] -> [%s]' % (cleaned_pairs[0+i,0], cleaned_pairs[0+i,1]))
for i in range(5):    print('[%s] -> [%s]' % (cleaned_pairs[10000+i,0], cleaned_pairs[10000+i,1]))
for i in range(5):    print('[%s] -> [%s]' % (cleaned_pairs[-i-1,0], cleaned_pairs[-i-1,1]))

[geh] -> [go]
[hallo] -> [hi]
[gru gott] -> [hi]
[lauf] -> [run]
[lauf] -> [run]
[lass uns nach hause gehen] -> [let us go home]
[lasst uns nach hause gehen] -> [let us go home]
[lasst uns mutig sein] -> [lets be brave]
[rufen wir tom an] -> [lets call tom]
[lasst uns weitermachen] -> [lets continue]
[ohne zweifel findet sich auf dieser welt zu jedem mann genau die richtige ehefrau und umgekehrt wenn man jedoch in betracht zieht dass ein mensch nur gelegenheit hat mit ein paar hundert anderen bekannt zu sein von denen ihm nur ein dutzend oder weniger nahesteht darunter hochstens ein oder zwei freunde dann erahnt man eingedenk der millionen einwohner dieser weltleicht dass seit erschaffung ebenderselben wohl noch nie der richtige mann der richtigen frau begegnet ist] -> [doubtless there exists in this world precisely the right woman for any given man to marry and vice versa but when you consider that a human being has the opportunity of being acquainted with only a few hundred people an

## Split the data into training, validation, and testing

In [8]:
def load_clean_sentences(filename):
    return load(open(filename, 'rb'))

In [9]:
def save_clean_data(sentences, filename):
    dump(sentences, open(filename, 'wb'))
    print(f'Saved: {filename}')

In [10]:
raw_dataset = load_clean_sentences('./data/german-english.pkl')
len(raw_dataset)

261499

In [11]:
n_sentences = 10000
train_ratio = 0.8
val_ratio = 0.1
# test_ratio = 0.1
dataset = raw_dataset[:n_sentences, :]
shuffle(dataset)
train = dataset[ : int(n_sentences*train_ratio)]
val   = dataset[int(n_sentences*train_ratio):int(n_sentences*(train_ratio+val_ratio))]
test  = dataset[int(n_sentences*(train_ratio+val_ratio)):]
print(len(train), len(val), len(test))

8000 1000 1000


In [12]:
for i in range(5): print(train[i])

['komm her' 'get over here']
['ich habe verloren' 'ive lost']
['tom wurde adoptiert' 'toms adopted']
['tom liebt rum' 'tom likes rum']
['tom kannte ihn' 'tom knew it']


In [13]:
for i in range(5): print(val[i])

['sie hasste ihn' 'she hated him']
['ich rieche kaffee' 'i smell coffee']
['ach sei still' 'oh be quiet']
['mir tut der kiefer weh' 'my jaw hurts']
['du bist ja stark' 'youre strong']


In [14]:
for i in range(5): print(test[i])

['tom erbricht sich' 'tom is puking']
['welche gehort uns' 'which is ours']
['ich bin arzt' 'im a medic']
['scher dich fort' 'get away']
['er legte auf' 'he hung up']


In [15]:
save_clean_data(dataset, './data/german-english-both.pkl')
save_clean_data(train, './data/german-english-train.pkl')
save_clean_data(val, './data/german-english-val.pkl')
save_clean_data(test, './data/german-english-test.pkl')

Saved: ./data/german-english-both.pkl
Saved: ./data/german-english-train.pkl
Saved: ./data/german-english-val.pkl
Saved: ./data/german-english-test.pkl


In [10]:
with open('./data/german-english-test.pkl', 'rb') as f:
    text = load(f)
print(len(text))
print(text)

In [19]:
with open('./data/german-english-test.txt', 'wb') as f:
    for pair in text:
        word = pair[0]+'\t'+pair[1] + '\n'
        f.write(word.encode('utf-8'))
f.close()
    
# savetxt('./data/german-english-test.txt', text, fmt='%s')