The dataset (tab-separated English–German sentence pairs, `deu.txt`) is available at http://www.manythings.org/anki/

In [1]:
import string
from unicodedata import normalize
import numpy as np
from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import LSTM
from keras.layers import Dense
from keras.layers import Embedding
from keras.layers import RepeatVector
from keras.layers import TimeDistributed
from numpy import argmax

Using TensorFlow backend.


In [2]:
# Number of sentence pairs kept from the start of the corpus; used further
# down to slice final_corpus_X before tokenization and training.
total_examples_taken = 20000

In [3]:
# Read the raw corpus, one tab-separated sentence pair per line.
# The encoding is given explicitly so the German special characters decode
# the same way on every platform, and the context manager closes the handle
# as soon as the lines have been read.
with open('deu.txt', encoding='utf-8') as file:
    text = file.readlines()

We need to convert all text to lowercase, remove punctuation, separate the English and German sentences, and transliterate the special German characters (e.g. umlauts) to plain ASCII.

In [4]:
def word_for_id(integer, tokenizer):
    """Reverse-lookup a word from its integer id.

    Scans ``tokenizer.word_index`` (a word -> id mapping) and returns the
    first word whose id equals ``integer``. Returns ``None`` when no word
    has that id.
    """
    matching_words = (
        candidate
        for candidate, candidate_id in tokenizer.word_index.items()
        if candidate_id == integer
    )
    # next() with a default yields None when the id is not in the vocabulary.
    return next(matching_words, None)

In [5]:
def one_hot_encoding(trainY):
    """One-hot encode every integer sequence in ``trainY``.

    Each row of ``trainY`` is expanded with ``to_categorical`` into
    ``english_vocab_size`` classes (a notebook-level global). The result is
    returned as an array of shape
    ``(trainY.shape[0], trainY.shape[1], english_vocab_size)``.
    """
    encoded_rows = [
        to_categorical(row, num_classes=english_vocab_size) for row in trainY
    ]
    stacked = np.asarray(encoded_rows)
    return stacked.reshape(trainY.shape[0], trainY.shape[1], english_vocab_size)

In [6]:
def define_model(src_vocab, tar_vocab, src_timesteps, tar_timesteps, n_units):
    """Build the (uncompiled) sequence-to-sequence translation model.

    Args:
        src_vocab: size of the source vocabulary (Embedding input dimension).
        tar_vocab: size of the target vocabulary (width of the softmax output).
        src_timesteps: length of the padded source sequences.
        tar_timesteps: number of output timesteps to generate.
        n_units: embedding width and number of units in each LSTM.

    Returns:
        A ``Sequential`` model: Embedding -> LSTM -> RepeatVector ->
        LSTM (returning sequences) -> TimeDistributed softmax Dense.
    """
    layers = [
        # mask_zero=True makes downstream layers ignore the 0 padding id.
        Embedding(src_vocab, n_units, input_length=src_timesteps, mask_zero=True),
        LSTM(n_units),
        # Repeat the single summary vector once per output timestep.
        RepeatVector(tar_timesteps),
        LSTM(n_units, return_sequences=True),
        TimeDistributed(Dense(tar_vocab, activation='softmax')),
    ]
    model = Sequential()
    for layer in layers:
        model.add(layer)
    return model

In [7]:
def encode_sequences(tokenizer, length, lines):
    """Turn text lines into fixed-length integer sequences.

    ``lines`` are mapped to word-id sequences with
    ``tokenizer.texts_to_sequences`` and then padded at the end
    (``padding='post'``) to ``length`` entries via ``pad_sequences``.
    """
    id_sequences = tokenizer.texts_to_sequences(lines)
    return pad_sequences(id_sequences, maxlen=length, padding='post')

In [8]:
def max_length(lines):
    """Return the largest whitespace-separated word count among ``lines``.

    Raises ``ValueError`` (from ``max``) when ``lines`` is empty.
    """
    word_counts = [len(sentence.split()) for sentence in lines]
    return max(word_counts)

In [9]:
def create_tokenizer(lines):
    """Create a ``Tokenizer`` with default settings and fit it on ``lines``.

    Returns the fitted tokenizer.
    """
    fitted_tokenizer = Tokenizer()
    fitted_tokenizer.fit_on_texts(lines)
    return fitted_tokenizer

In [10]:
# Clean the raw corpus into a list of per-line field lists.
# Each line is lowercased, stripped of its newline and split on tabs; every
# field then has ASCII punctuation removed and is reduced to plain ASCII.
# NOTE: the NFD + encode('ascii', 'ignore') step keeps the base letter of
# accented characters but drops characters that have no ASCII decomposition.
exclude = set(string.punctuation)
final_corpus_X = []
for raw_line in text:
    fields = raw_line.lower().replace('\n','').split('\t')
    cleaned_fields = []
    for field in fields:
        without_punctuation = ''.join(filter(lambda ch: ch not in exclude, field))
        ascii_bytes = normalize('NFD', without_punctuation).encode('ascii', 'ignore')
        cleaned_fields.append(ascii_bytes.decode('UTF-8'))
    final_corpus_X.append(cleaned_fields)

In [33]:
def predict_sentences(sentence,model):
    """Translate one German sentence with ``model`` and print the result.

    The sentence is encoded with the notebook-level ``german_tokenization``
    and ``german_length``, passed through ``model.predict``, and the most
    probable id at each output timestep is mapped back to a word with
    ``english_tokenization``. Decoding stops at the first id that has no
    word. The list of predicted words is printed; nothing is returned.
    """
    encoded_input = encode_sequences(german_tokenization, german_length, [sentence])
    prediction = model.predict(encoded_input)
    target = []
    # prediction[0] holds one score vector per output timestep.
    for timestep_scores in prediction[0]:
        word = word_for_id(argmax(timestep_scores), english_tokenization)
        if word is None:
            break
        target.append(word)
    print(target)

In [11]:
# Spot-check a few cleaned pairs (rows 1-4): field 0 is the English sentence,
# field 1 is the German one.
for i in range(1,5):
    print(final_corpus_X[i][0],'=>' ,final_corpus_X[i][1])

hi => gru gott
run => lauf
wow => potzdonner
wow => donnerwetter


In [12]:
# Convert the list of cleaned pairs to a NumPy array so the English and
# German columns can be sliced with [:, 0] and [:, 1]; show the first 5 rows.
final_corpus_X = np.asarray(final_corpus_X)
final_corpus_X[:5,:]

array([['hi', 'hallo'],
       ['hi', 'gru gott'],
       ['run', 'lauf'],
       ['wow', 'potzdonner'],
       ['wow', 'donnerwetter']], dtype='<U370')

In [13]:
# Keep only the first total_examples_taken sentence pairs.
final_corpus_X = final_corpus_X[:total_examples_taken]

In [14]:
# Fit the English tokenizer on column 0 of the corpus.
english_tokenization = create_tokenizer(final_corpus_X[:,0])
# +1 accounts for id 0, which is not in word_index (padding uses 0).
english_vocab_size = len(english_tokenization.word_index) + 1
# Longest English sentence, in words; used as the padded target length.
english_length = max_length(final_corpus_X[:, 0])
print(english_tokenization, english_vocab_size, english_length)

<keras_preprocessing.text.Tokenizer object at 0x1354ebb00> 3808 6


In [15]:
# Fit the German tokenizer on column 1 of the corpus.
german_tokenization = create_tokenizer(final_corpus_X[:,1])
# +1 accounts for id 0, which is not in word_index (padding uses 0).
german_vocab_size = len(german_tokenization.word_index) + 1
# Longest German sentence, in words; used as the padded source length.
german_length = max_length(final_corpus_X[:,1])
print(german_tokenization, german_vocab_size, german_length)

<keras_preprocessing.text.Tokenizer object at 0x1390ef630> 5852 10


In [16]:
# Model input: German sentences as padded integer sequences.
trainX = encode_sequences(german_tokenization, german_length, final_corpus_X[:, 1])
# Model target: English sentences as padded integer sequences.
trainY = encode_sequences(english_tokenization, english_length, final_corpus_X[:, 0])

In [17]:
# Sanity check: (number of examples, german_length).
trainX.shape

(20000, 10)

In [18]:
# Sanity check: (number of examples, english_length).
trainY.shape

(20000, 6)

In [19]:
# One-hot encode the English targets, as required by the
# categorical_crossentropy loss used when compiling the model.
y = one_hot_encoding(trainY)

In [20]:
# Sanity check: (number of examples, english_length, english_vocab_size).
y.shape

(20000, 6, 3808)

In [21]:
# Build the German -> English model with 256 units per embedding/LSTM layer.
model = define_model(german_vocab_size, english_vocab_size, german_length, english_length, 256)
model.compile(optimizer='adam', loss='categorical_crossentropy')
# Train on the full subset for 30 epochs in batches of 64 (no validation
# split); verbose=2 prints one line per epoch.
model.fit(trainX, y, epochs=30, batch_size=64, verbose=2)

Epoch 1/30
 - 39s - loss: 3.7255
Epoch 2/30
 - 38s - loss: 3.0967
Epoch 3/30
 - 35s - loss: 2.7955
Epoch 4/30
 - 35s - loss: 2.5220
Epoch 5/30
 - 36s - loss: 2.2922
Epoch 6/30
 - 36s - loss: 2.0758
Epoch 7/30
 - 36s - loss: 1.8731
Epoch 8/30
 - 35s - loss: 1.6954
Epoch 9/30
 - 35s - loss: 1.5349
Epoch 10/30
 - 35s - loss: 1.3892
Epoch 11/30
 - 48s - loss: 1.2572
Epoch 12/30
 - 36s - loss: 1.1326
Epoch 13/30
 - 35s - loss: 1.0189
Epoch 14/30
 - 36s - loss: 0.9157
Epoch 15/30
 - 35s - loss: 0.8209
Epoch 16/30
 - 52s - loss: 0.7343
Epoch 17/30
 - 44s - loss: 0.6582
Epoch 18/30
 - 38s - loss: 0.5887
Epoch 19/30
 - 35s - loss: 0.5278
Epoch 20/30
 - 39s - loss: 0.4763
Epoch 21/30
 - 35s - loss: 0.4293
Epoch 22/30
 - 35s - loss: 0.3879
Epoch 23/30
 - 35s - loss: 0.3524
Epoch 24/30
 - 35s - loss: 0.3221
Epoch 25/30
 - 35s - loss: 0.2949
Epoch 26/30
 - 35s - loss: 0.2706
Epoch 27/30
 - 35s - loss: 0.2508
Epoch 28/30
 - 35s - loss: 0.2325
Epoch 29/30
 - 35s - loss: 0.2153
Epoch 30/30
 - 36s - lo

<keras.callbacks.History at 0x135750f60>

In [40]:
# Demo inputs: German sentences written in the same cleaned form as the
# training data (lowercase, no punctuation, plain ASCII).
sentences = ['er ging surfen','ich konnte nicht gehen','das gefallt mir','ich liebe dich','ich sagte du sollst den mund halten','wie geht es eurem vater','ich gehe immer zu fu']
# Print the predicted English word list for each sentence.
for i in sentences:
    predict_sentences(i,model)

['he', 'went', 'surfing']
['i', 'couldnt', 'walk']
['i', 'like', 'that']
['i', 'love', 'you']
['i', 'said', 'shut', 'it']
['hows', 'your', 'dad']
['i', 'always', 'walk']
