# Data Cleaning

## Library

In [22]:
# import pathlib
# import tensorflow as tf
# import tensorflow_text as tftxt
# from tensorflow_text.tools.wordpiece_vocab import bert_vocab_from_dataset as bert_vocab
# import tensorflow_datasets as tfds
from pickle import dump, load
import numpy as np
from numpy.random import rand, shuffle
import re
from unicodedata import normalize

## Load the data

In [15]:
# Read the whole tab-separated corpus into memory as one string.
filename = '../data/deu.txt'
with open(filename, 'rt', encoding='utf-8') as f:
  # The `with` block closes the file on exit, so no explicit close() is needed.
  text = f.read()

In [16]:
# Sanity check: show the Python type of the object returned by f.read().
print(type(text))

<class 'str'>


In [17]:
# One corpus entry per line; strip() drops leading/trailing whitespace so the
# split does not yield an empty trailing entry.
lines = text.strip().split('\n')

# Preview the first ten raw entries.
for raw_line in lines[:10]:
  print(raw_line)

Go.	Geh.	CC-BY 2.0 (France) Attribution: tatoeba.org #2877272 (CM) & #8597805 (Roujin)
Hi.	Hallo!	CC-BY 2.0 (France) Attribution: tatoeba.org #538123 (CM) & #380701 (cburgmer)
Hi.	Grüß Gott!	CC-BY 2.0 (France) Attribution: tatoeba.org #538123 (CM) & #659813 (Esperantostern)
Run!	Lauf!	CC-BY 2.0 (France) Attribution: tatoeba.org #906328 (papabear) & #941078 (Fingerhut)
Run.	Lauf!	CC-BY 2.0 (France) Attribution: tatoeba.org #4008918 (JSakuragi) & #941078 (Fingerhut)
Wow!	Potzdonner!	CC-BY 2.0 (France) Attribution: tatoeba.org #52027 (Zifre) & #2122382 (Pfirsichbaeumchen)
Wow!	Donnerwetter!	CC-BY 2.0 (France) Attribution: tatoeba.org #52027 (Zifre) & #2122391 (Pfirsichbaeumchen)
Duck!	Kopf runter!	CC-BY 2.0 (France) Attribution: tatoeba.org #280158 (CM) & #9968521 (wolfgangth)
Fire!	Feuer!	CC-BY 2.0 (France) Attribution: tatoeba.org #1829639 (Spamster) & #1958697 (Tamy)
Help!	Hilfe!	CC-BY 2.0 (France) Attribution: tatoeba.org #435084 (lukaszpp) & #575889 (MUIRIEL)


## Remove the third column

In [18]:
# Keep only the first two tab-separated fields of every line; this discards
# the attribution column.
pairs = [entry.split('\t')[:2] for entry in lines]

# Preview the first ten pairs.
for sample in pairs[:10]:
  print(sample)

['Go.', 'Geh.']
['Hi.', 'Hallo!']
['Hi.', 'Grüß Gott!']
['Run!', 'Lauf!']
['Run.', 'Lauf!']
['Wow!', 'Potzdonner!']
['Wow!', 'Donnerwetter!']
['Duck!', 'Kopf runter!']
['Fire!', 'Feuer!']
['Help!', 'Hilfe!']


## Normalization - Canonical Decomposition 

In [24]:
# For every pair: apply Unicode canonical decomposition (NFD) to each sentence,
# encode it as UTF-8 bytes, and reverse the order of the two sentences within
# the pair.
cleaned = [
  [normalize('NFD', sentence).encode('utf-8') for sentence in reversed(pair)]
  for pair in pairs
]

# Preview the first ten cleaned pairs.
for sample in cleaned[:10]:
  print(sample)

[b'Geh.', b'Go.']
[b'Hallo!', b'Hi.']
[b'Gru\xcc\x88\xc3\x9f Gott!', b'Hi.']
[b'Lauf!', b'Run!']
[b'Lauf!', b'Run.']
[b'Potzdonner!', b'Wow!']
[b'Donnerwetter!', b'Wow!']
[b'Kopf runter!', b'Duck!']
[b'Feuer!', b'Fire!']
[b'Hilfe!', b'Help!']


## Split data set into train, validation, and test sets

In [26]:
# Number of cleaned sentence pairs. Computed directly from `cleaned` because
# `n_sentence` is only assigned in the following cell, so referencing it here
# fails on a fresh kernel.
print(len(cleaned))

261499


In [27]:
# Fixed seed so the shuffle (and therefore the train/val/test membership) is
# identical on every Restart & Run All.
SEED = 42

n_sentence = len(cleaned)
train_ratio = 0.8
val_ratio   = 0.1
test_ratio  = 0.1
# The test split takes everything after the validation split, so the three
# ratios must cover the whole data set.
assert abs(train_ratio + val_ratio + test_ratio - 1.0) < 1e-9, "split ratios must sum to 1"

dataset = np.array(cleaned)
np.random.seed(SEED)  # seeds the global generator used by numpy.random.shuffle
shuffle(dataset)      # in-place shuffle along the first axis (whole pairs move together)

# Preview the first ten shuffled pairs.
for sample in dataset[:10]:
  print(sample)

# Integer split boundaries, computed once and reused so adjacent splits share
# exactly the same edge.
train_end = int(n_sentence*train_ratio)
val_end = int(n_sentence*(train_ratio+val_ratio))

train = dataset[ : train_end]
val = dataset[train_end : val_end]
# Slice to the end instead of int(n_sentence * ratio_sum): a float sum that
# lands just below 1.0 would otherwise silently drop the last sample.
test = dataset[val_end : ]

assert len(train) + len(val) + len(test) == n_sentence

[b'Tom zahlt ein Bu\xc3\x9fgeld.' b'Tom is paying a fine.']
[b'Unter dem Bett ist eine Katze.' b'There is a cat under the bed.']
[b'Tom wollte, dass ich lu\xcc\x88ge.' b'Tom wanted me to lie.']
[b'Gefa\xcc\x88llt es euch, so zu leben?'
 b'Do you enjoy living like this?']
[b'Tom und Maria sa\xc3\x9fen auf ihren u\xcc\x88blichen Pla\xcc\x88tzen.'
 b'Tom and Mary were sitting in their usual places.']
[b'Tom ist einer der Besten in der Branche.'
 b'Tom is one of the best in the business.']
[b'Der britische Kommandeur befahl seinen Ma\xcc\x88nnern, sich zur Nachtruhe zu begeben.'
 b'The British commander ordered his men to rest for the night.']
[b'Tom und Maria sind getrennte Wege gegangen.'
 b'Tom and Mary have gone their separate ways.']
[b'Das ist eine schwierige Sache.' b"It's a difficult question."]
[b'Seit seinem Schulabschluss versucht Tom, Mary dazu zu bringen, ihn zu heiraten.'
 b'Tom has been trying to get Mary to marry him ever since he graduated from high school.']


In [28]:
# Report the number of pairs in the train, validation and test splits.
print(*map(len, (train, val, test)))

209199 26150 26150


## Save to txt

Sentences are saved as decoded UTF-8 strings, one tab-separated pair per line.

In [46]:
%%time
with open("../data/deu-eng-train.txt", "w", encoding="utf-8") as output:
  for i, p in enumerate(train):
    output.write(p[0].decode('utf-8') + '\t' + p[1].decode('utf-8') + '\n')

with open("../data/deu-eng-val.txt", "w", encoding="utf-8") as output:
  for i, p in enumerate(val):
    output.write(p[0].decode('utf-8') + '\t' + p[1].decode('utf-8') + '\n')

with open("../data/deu-eng-test.txt", "w", encoding="utf-8") as output:
  for i, p in enumerate(test):
    output.write(p[0].decode('utf-8') + '\t' + p[1].decode('utf-8') + '\n')

CPU times: total: 453 ms
Wall time: 488 ms


## Save to Pkl

Sentences are saved as UTF-8 encoded `bytes` objects.

In [49]:
%%time
dump(train, open("../data/deu-eng-train.pkl", 'wb'))
dump(val, open("../data/deu-eng-val.pkl", 'wb'))
dump(test, open("../data/deu-eng-test.pkl", 'wb'))

CPU times: total: 125 ms
Wall time: 271 ms


## Load from Pkl

In [53]:
# Reload the pickled training split as a round-trip check.
# NOTE: pickle can execute arbitrary code on load; only open files produced by
# this notebook.
filename = '../data/deu-eng-train.pkl'
with open(filename, 'rb') as pkl_in:
  dataset = load(pkl_in)

# For the first three pairs, show each sentence as raw bytes and then decoded.
for pair in dataset[:3]:
  for raw in (pair[0], pair[1]):
    print(raw)
    print(raw.decode('utf-8'))
  print('\n')

b'Tom zahlt ein Bu\xc3\x9fgeld.'
Tom zahlt ein Bußgeld.
b'Tom is paying a fine.'
Tom is paying a fine.


b'Unter dem Bett ist eine Katze.'
Unter dem Bett ist eine Katze.
b'There is a cat under the bed.'
There is a cat under the bed.


b'Tom wollte, dass ich lu\xcc\x88ge.'
Tom wollte, dass ich lüge.
b'Tom wanted me to lie.'
Tom wanted me to lie.




# Conclusion