In [2]:
import load_util
import numpy as np
import pickle as pkl
import tensorflow.keras as K

from nltk.translate.bleu_score import corpus_bleu

In [3]:
# Where the ModelCheckpoint callback in the training cell saves the best model.
MODEL_FILEPATH = 'models/model.h5'
# Passed to build_model as `embed_dim` (output dimension of the Embedding layer).
EMBED_DIM = 256
# Passed to build_model as `lstm1_units` (the LSTM that reads the source sequence).
LSTM1_SIZE = 128
# Passed to build_model as `lstm2_units` (the LSTM that emits the target sequence).
LSTM2_SIZE = 128

In [7]:
# Load and preprocess the sentence pairs, keeping only the first 20000 rows.
# NOTE(review): the slice is taken *before* shuffling, so the subset is always
# the first 20000 rows of the file, not a random sample — confirm this is intended.
# presumably `preprocess` returns a 2-D NumPy array (the `[:, 0]` indexing below
# requires it) — TODO confirm against load_util.
data = load_util.preprocess(load_util.load_data('deu-eng/deu.txt'))[:20000]
# Fixed seed so the in-place shuffle (and therefore the later train/test split)
# is reproducible across runs.
np.random.seed(0)
np.random.shuffle(data)
# Column 0 is used as the source sentences, column 1 as the target sentences.
SRC, TGT = data[:, 0], data[:, 1]

In [5]:
def encode_and_pad(X):
    """Integer-encode sentences and right-pad them to a common length.

    A fresh Keras ``Tokenizer`` is fitted on ``X``, every sentence is converted
    to a sequence of word ids, and the sequences are zero-padded at the end
    (``padding='post'``) to the length of the longest *encoded* sequence.

    The padded width is taken from the encoded sequences rather than from
    ``len(s.split())``: the tokenizer also splits on the punctuation it
    filters out, so a whitespace-based length can be shorter than the encoded
    sequence, in which case ``pad_sequences`` would silently truncate tokens
    from the front of the sentence.

    Args:
        X: iterable of sentences (strings).

    Returns:
        A pair ``(X_enc_pad, T)`` where ``X_enc_pad`` is the 2-D integer array
        produced by ``pad_sequences`` (0 is the padding id) and ``T`` is the
        fitted tokenizer.
    """
    T = K.preprocessing.text.Tokenizer()
    T.fit_on_texts(X)
    X_enc = T.texts_to_sequences(X)
    # maxlen is left at its default (None) so the width is the longest encoded
    # sequence and nothing is ever truncated.
    X_enc_pad = K.preprocessing.sequence.pad_sequences(X_enc, padding='post')
    return X_enc_pad, T

def onehot_3d(X, vocab_len):
    """One-hot encode every id sequence in ``X``.

    Args:
        X: iterable of integer id sequences.
        vocab_len: number of classes, i.e. the size of the one-hot axis.

    Returns:
        A NumPy array built from one ``to_categorical`` matrix per sequence.
    """
    encoded_rows = []
    for id_sequence in X:
        # Each sequence becomes a (len(sequence), vocab_len) matrix.
        encoded_rows.append(K.utils.to_categorical(id_sequence, vocab_len))
    return np.array(encoded_rows)

def vocab_len(X):
    """Return the number of classes needed to represent every id in ``X``.

    The result is ``max id + 1`` so that it is always a valid ``num_classes``
    for one-hot encoding and a valid ``input_dim`` for an embedding, with id 0
    (padding) counted as a class. Counting *distinct* ids gives the same number
    only when the padding id 0 actually occurs; if no sequence is padded the
    distinct count is one too small and the largest id falls out of range.

    Args:
        X: iterable of integer id sequences (nested lists or a 2-D array).

    Returns:
        ``int``: the largest id found plus one, or 0 when ``X`` holds no ids.
    """
    # default=-1 makes empty input yield 0 instead of raising.
    largest_id = max((num for seq in X for num in seq), default=-1)
    return int(largest_id) + 1

In [8]:
# Integer-encode and pad each side of the corpus; each side gets its own tokenizer.
SRC_enc_pad, SRC_tokenizer = encode_and_pad(SRC)
TGT_enc_pad, TGT_tokenizer = encode_and_pad(TGT)

# Vocabulary sizes, passed to build_model as input_vocab_len / output_vocab_len.
SRC_vocab_len = vocab_len(SRC_enc_pad)
TGT_vocab_len = vocab_len(TGT_enc_pad)

# NOTE(review): these one-hot tensors cover the whole corpus and are not
# referenced by any other cell visible in this notebook (training uses one-hot
# targets rebuilt from the train/test splits). They are large dense arrays —
# confirm they are needed, otherwise consider deleting these two lines.
SRC_enc_pad_onehot = onehot_3d(SRC_enc_pad, SRC_vocab_len)
TGT_enc_pad_onehot = onehot_3d(TGT_enc_pad, TGT_vocab_len)

In [54]:
def train_split(X, Y, train_size=.9):
    """Split two aligned sequences into leading train and trailing test parts.

    No shuffling is done here; the first ``int(len(X) * train_size)`` items
    form the training part and the remainder the test part.

    Args:
        X: sliceable sequence of inputs.
        Y: sliceable sequence of targets, aligned with ``X``.
        train_size: fraction of the items assigned to the training part.

    Returns:
        ``(X_train, X_test, Y_train, Y_test)``.

    Raises:
        AssertionError: if ``X`` and ``Y`` differ in length.
    """
    # Check the arguments themselves, not notebook-level globals, so the
    # function is valid for any pair of inputs.
    assert len(X) == len(Y), 'X and Y must have the same number of samples'
    cutoff = int(len(X) * train_size)
    return X[:cutoff], X[cutoff:], Y[:cutoff], Y[cutoff:]

In [55]:
# Hold out the last 10% of the (already shuffled) pairs for validation/testing.
SRC_train, SRC_test, TGT_train, TGT_test = train_split(SRC_enc_pad, TGT_enc_pad, train_size=.9)

# One-hot targets for the categorical cross-entropy loss. Each split must be
# encoded from its own ids: the test targets come from TGT_test, not TGT_train,
# so that they line up with SRC_test.
TGT_train_onehot = onehot_3d(TGT_train, TGT_vocab_len)
TGT_test_onehot = onehot_3d(TGT_test, TGT_vocab_len)

In [12]:
def build_model(
    input_vocab_len,
    input_max_len,
    output_vocab_len,
    output_max_len,
    embed_dim,
    lstm1_units,
    lstm2_units):
    """Assemble the (uncompiled) sequence-to-sequence translation network.

    The stack is: masked embedding -> LSTM that reduces the source sequence to
    a single vector -> that vector repeated once per output timestep -> LSTM
    returning the full sequence -> per-timestep softmax over the output
    vocabulary.

    Args:
        input_vocab_len: ``input_dim`` of the embedding layer.
        input_max_len: ``input_length`` of the embedding layer.
        output_vocab_len: width of the softmax at every output timestep.
        output_max_len: number of output timesteps.
        embed_dim: embedding output dimension.
        lstm1_units: units of the first (source-reading) LSTM.
        lstm2_units: units of the second (target-emitting) LSTM.

    Returns:
        A ``Sequential`` model; compiling it is left to the caller.
    """
    # mask_zero=True makes the first LSTM skip the 0-padded timesteps.
    source_embedding = K.layers.Embedding(
        input_dim=input_vocab_len,
        output_dim=embed_dim,
        mask_zero=True,
        input_length=input_max_len)
    source_reader = K.layers.LSTM(units=lstm1_units)
    repeat_per_step = K.layers.RepeatVector(n=output_max_len)
    target_writer = K.layers.LSTM(units=lstm2_units, return_sequences=True)
    word_distribution = K.layers.TimeDistributed(
        K.layers.Dense(output_vocab_len, activation='softmax'))

    return K.models.Sequential([
        source_embedding,
        source_reader,
        repeat_per_step,
        target_writer,
        word_distribution,
    ])

In [13]:
# Instantiate the network. Sequence lengths are read from the padded arrays
# (axis 1) so the model's input/output lengths always match the data.
model = build_model(
    input_vocab_len=SRC_vocab_len,
    input_max_len=SRC_train.shape[1],
    output_vocab_len=TGT_vocab_len,
    output_max_len=TGT_train_onehot.shape[1],
    embed_dim=EMBED_DIM,
    lstm1_units=LSTM1_SIZE,
    lstm2_units=LSTM2_SIZE
)
# Print the layer-by-layer shapes and parameter counts as a sanity check.
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 5, 256)            944384    
_________________________________________________________________
lstm (LSTM)                  (None, 128)               197120    
_________________________________________________________________
repeat_vector (RepeatVector) (None, 10, 128)           0         
_________________________________________________________________
lstm_1 (LSTM)                (None, 10, 128)           131584    
_________________________________________________________________
time_distributed (TimeDistri (None, 10, 5742)          740718    
Total params: 2,013,806
Trainable params: 2,013,806
Non-trainable params: 0
_________________________________________________________________


In [14]:
# categorical_crossentropy is paired with the one-hot targets (TGT_*_onehot)
# that the training cell passes to model.fit.
model.compile(optimizer='adam', loss='categorical_crossentropy')

In [15]:
# Save the model to MODEL_FILEPATH whenever the validation loss improves, so
# the file always holds the best epoch seen so far.
model_checkpoint = K.callbacks.ModelCheckpoint(
    filepath=MODEL_FILEPATH,
    monitor='val_loss',
    verbose=True,
    save_best_only=True)
# validation_data is passed as an (x_val, y_val) tuple, the form documented by
# Model.fit; a list can be interpreted as a multi-input `x` by some versions.
model.fit(
    x=SRC_train,
    y=TGT_train_onehot,
    batch_size=64,
    epochs=30,
    verbose=True,
    callbacks=[model_checkpoint],
    validation_data=(SRC_test, TGT_test_onehot))

Train on 18000 samples, validate on 2000 samples
Epoch 1/30
Epoch 00001: val_loss improved from inf to 2.48037, saving model to models/model.h5
Epoch 2/30
Epoch 00002: val_loss improved from 2.48037 to 2.30046, saving model to models/model.h5
Epoch 3/30
Epoch 00003: val_loss improved from 2.30046 to 2.21923, saving model to models/model.h5
Epoch 4/30
Epoch 00004: val_loss improved from 2.21923 to 2.11934, saving model to models/model.h5
Epoch 5/30
Epoch 00005: val_loss improved from 2.11934 to 2.02586, saving model to models/model.h5
Epoch 6/30
Epoch 00006: val_loss improved from 2.02586 to 1.95704, saving model to models/model.h5
Epoch 7/30
Epoch 00007: val_loss improved from 1.95704 to 1.86880, saving model to models/model.h5
Epoch 8/30
Epoch 00008: val_loss improved from 1.86880 to 1.80243, saving model to models/model.h5
Epoch 9/30
Epoch 00009: val_loss improved from 1.80243 to 1.73454, saving model to models/model.h5
Epoch 10/30
Epoch 00010: val_loss improved from 1.73454 to 1.670

<tensorflow.python.keras.callbacks.History at 0x6469e6810>

In [78]:
def bleu(true, pred):
    """Score predictions with corpus-level BLEU for n-gram orders 1 through 4.

    Each weight tuple spreads the weight uniformly over the first ``n`` n-gram
    orders and gives zero weight to the rest.

    Args:
        true: references, forwarded unchanged as the first argument of
            ``corpus_bleu``.
        pred: hypotheses, forwarded unchanged as the second argument of
            ``corpus_bleu``.

    Returns:
        A list of four scores, ordered from the 1-gram to the 4-gram weighting.
    """
    ngram_weightings = (
        (1,     0,   0,   0),
        (1/2, 1/2,   0,   0),
        (1/3, 1/3, 1/3,   0),
        (1/4, 1/4, 1/4, 1/4),
    )
    scores = []
    for weighting in ngram_weightings:
        scores.append(corpus_bleu(true, pred, weighting))
    return scores

def evaluate(model, X, Y, X_tokenizer, Y_tokenizer):
    """Decode model predictions, print samples, and print BLEU scores.

    For every sample the reference ids ``Y[i]`` and the per-timestep argmax of
    the prediction are mapped back to words; ids missing from the target
    tokenizer's vocabulary (such as the padding id 0) are dropped. The first 20
    reference/prediction pairs are printed, followed by the four scores
    returned by ``bleu``.

    Args:
        model: object whose ``predict(X)`` yields, per sample, one score vector
            per output timestep.
        X: encoded source sequences fed to ``model.predict``.
        Y: encoded reference target sequences, aligned with ``X``.
        X_tokenizer: source tokenizer. Not needed to compute the output; kept
            so existing call sites keep working.
        Y_tokenizer: target tokenizer; its ``word_index`` is inverted to turn
            ids back into words.

    Returns:
        None. All results are printed.
    """
    TGT_idx2word = {idx: word for word, idx in Y_tokenizer.word_index.items()}

    def decode(indices):
        # Skip ids with no word attached (e.g. padding).
        return [TGT_idx2word[idx] for idx in indices if idx in TGT_idx2word]

    predictions = model.predict(X)
    y = []
    y_pred = []
    for i, pred in enumerate(predictions):
        tgt_sent = decode(Y[i])
        # Most likely word id at every timestep, computed in one call.
        tgt_pred = decode(np.argmax(pred, axis=-1))
        # corpus_bleu expects a *list of references* per hypothesis.
        y.append([tgt_sent])
        y_pred.append(tgt_pred)
        if i < 20:
            print(' '.join(tgt_sent), ' --> ', ' '.join(tgt_pred))
    bleu_scores = bleu(y, y_pred)
    for idx, bleu_score in enumerate(bleu_scores):
        print('{}-gram BLEU: {:.4f}'.format(idx + 1, bleu_score))

In [76]:
# `finalized_model` is not created by any earlier cell, so on a fresh kernel
# this cell would fail with a NameError. Restore the best checkpoint written
# by the ModelCheckpoint callback during training.
# NOTE(review): assumes the "finalized" model is the checkpoint at
# MODEL_FILEPATH — confirm this matches how it was originally obtained.
finalized_model = K.models.load_model(MODEL_FILEPATH)

# Scores on the training split (an optimistic, in-sample figure).
evaluate(
    model=finalized_model,
    X=SRC_train,
    Y=TGT_train,
    X_tokenizer=SRC_tokenizer,
    Y_tokenizer=TGT_tokenizer)

er starb gestern er ist gestern gestorben
ich habe zweifel ich habe zweifel
tom erblindete tom erblindete
er kann dir nicht helfen er kann dir nicht helfen
ich bin kein fachmann ich bin kein fachmann fach
seid tapfer seid tapfer
hndige es aus hndige es aus
lasst tom nachhause gehen lass tom nachhause gehen
dein hund hat mich gebissen die hund hat mich gebissen
tom ist gemein tom ist gemein
1-gram BLEU: 0.7341
2-gram BLEU: 0.6401
3-gram BLEU: 0.5506
4-gram BLEU: 0.4073


In [77]:
# Scores on the held-out split.
# NOTE(review): `finalized_model` is not assigned in this cell; it must
# already exist in the kernel when this cell runs.
evaluate(
    model=finalized_model,
    X=SRC_test,
    Y=TGT_test,
    X_tokenizer=SRC_tokenizer,
    Y_tokenizer=TGT_tokenizer)

tom hat einen dachschaden tom hat ein wahnsinniger
bin ich frh dran bin ich frh
ich bin kein spion ich bin ein feigling
ich mag knoblauch ich liebe australien
bist du in gefahr sind sie in gefahr
fhre mich nicht in versuchung fhren sie mich mich mich versuchung
stimmt das nicht ist es nicht wahr
tom war naiv tom war naiv
er verlie das zimmer er ist den dem
ich sehe den jungen ich sehe auto
1-gram BLEU: 0.5429
2-gram BLEU: 0.4113
3-gram BLEU: 0.3002
4-gram BLEU: 0.1873
