In [0]:
from __future__ import print_function

import os

In [0]:
import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow.keras as keras

from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Dense

import numpy as np

In [0]:
batch_size = 64  # Batch size for training.
epochs = 100  # Number of epochs to train for.
latent_dim = 256  # Latent dimensionality of the encoding space.
num_samples = 25000  # Number of samples to train on.


In [0]:
def prepare_data(filename, is_train=True, score_filter=3, limit=None):
    usecols = [1, 2, 5] if is_train else [1, 2, 3]
    
    df = pd.read_csv(filename, sep='\t', usecols=usecols, names=['source', 'target', 'score'])[:limit]
    df = df[df['score'] >= score_filter]
    
    source = df['source']
    target = df['target']
    
    source = '\t ' + source + '\n'
    target = '\t ' + target + '\n'
    
    source_chars = {char for string in source.values for char in string}
    target_chars = {char for string in target.values for char in string}
    
    return source.values, target.values, source_chars, target_chars

In [0]:
train_input_texts, train_target_texts, train_input_chars, train_target_chars = prepare_data('en-train-100K.txt', limit=num_samples)
# dev_input_texts, dev_target_texts, dev_input_chars, dev_target_chars = prepare_data('en-dev.txt', False)
# test_input_texts, test_targset_texts, test_input_chars, test_target_chars = prepare_data('en-test.txt', False)

In [22]:
print(train_input_chars)

{'ó', 'g', '♪', '7', ' ', '|', 'Z', ')', '/', 'm', '\u200b', 'Q', '$', '(', 'j', '0', 'ο', 'é', '6', 'S', 'c', 'f', 's', 'u', ':', '3', 'í', 'W', 'p', 'e', 'ï', 'v', 'V', '8', 'J', 'C', 'h', 'x', '9', ',', 'X', 'B', 'l', 'F', 'i', 'z', '\x9d', '!', 'G', 'O', "'", 'á', 'K', '@', 'M', 'd', '"', 'I', 'Y', 'H', '?', 'T', '1', 'N', 'ν', 'a', 'L', '5', 'R', 'o', 'n', 'P', 'U', '4', 'b', '%', '-', '2', 'E', 'r', '\n', 'y', '.', '\t', 't', 'w', 'k', 'D', 'A', 'q'}


In [23]:
train_input_texts[:4]

array(['\t Jumby now wants to be born .\n',
       '\t It was a difficult and long delivery .\n',
       '\t I like to be beautiful everyday .\n',
       '\t Bernadette wants a prenup .\n'], dtype=object)

In [24]:
train_target_texts[:4]

array(['\t Jumby want birth .\n',
       '\t The delivery was difficult and long .\n',
       '\t I like to be pretty everyday .\n',
       '\t Bernadette wants to get a prenup .\n'], dtype=object)

In [25]:
input_characters = sorted(list(train_input_chars))
target_characters = sorted(list(train_target_chars))
num_encoder_tokens = len(train_input_chars) + 1
num_decoder_tokens = len(train_target_chars) + 1
max_encoder_seq_length = max([len(txt) for txt in train_input_texts])
max_decoder_seq_length = max([len(txt) for txt in train_target_texts])

print('Number of samples:', len(train_input_texts))
print('Number of unique input tokens:', num_encoder_tokens)
print('Number of unique output tokens:', num_decoder_tokens)
print('Max sequence length for inputs:', max_encoder_seq_length)
print('Max sequence length for outputs:', max_decoder_seq_length)

Number of samples: 25000
Number of unique input tokens: 91
Number of unique output tokens: 112
Max sequence length for inputs: 212
Max sequence length for outputs: 127


In [0]:
def prepare_enc_dec(input_texts, target_texts, input_characters, target_characters):
    input_token_index = dict(
        [(char, i) for i, char in enumerate(input_characters)])
    target_token_index = dict(
        [(char, i) for i, char in enumerate(target_characters)])

    encoder_input_data = np.zeros(
        (len(input_texts), max_encoder_seq_length, num_encoder_tokens),
        dtype='float32')
    decoder_input_data = np.zeros(
        (len(input_texts), max_decoder_seq_length, num_decoder_tokens),
        dtype='float32')
    decoder_target_data = np.zeros(
        (len(target_texts), max_decoder_seq_length, num_decoder_tokens),
        dtype='float32')

    for i, (input_text, target_text) in enumerate(zip(input_texts, target_texts)):
        for t, char in enumerate(input_text):
            encoder_input_data[i, t, input_token_index[char]] = 1.
        encoder_input_data[i, t + 1:, input_token_index['\n']] = 1.
        for t, char in enumerate(target_text):
            decoder_input_data[i, t, target_token_index[char]] = 1.
            if t > 0:
                decoder_target_data[i, t - 1, target_token_index[char]] = 1.
        decoder_input_data[i, t + 1:, target_token_index['\n']] = 1.
        decoder_target_data[i, t:, target_token_index['\n']] = 1.
    return input_token_index, target_token_index, encoder_input_data, decoder_input_data, decoder_target_data


input_token_index, target_token_index, encoder_input_data, decoder_input_data, decoder_target_data = prepare_enc_dec(train_input_texts, train_target_texts, input_characters, target_characters)

In [0]:
learning_rate = 0.005

In [0]:
def scheduler(epoch, lr):
    if epoch:
        if epoch % 10 == 0:
            return learning_rate / 2
        return lr*0.8
    return lr

In [14]:
# Define an input sequence and process it.
encoder_inputs = Input(shape=(None, num_encoder_tokens))
# encoder = LSTM(latent_dim, return_state=True, dropout=0.4, recurrent_dropout=0.4)
encoder = LSTM(latent_dim, return_state=True, dropout=0.3)
encoder_outputs, state_h, state_c = encoder(encoder_inputs)
# We discard `encoder_outputs` and only keep the states.
encoder_states = [state_h, state_c]

# Set up the decoder, using `encoder_states` as initial state.
decoder_inputs = Input(shape=(None, num_decoder_tokens))
# We set up our decoder to return full output sequences,
# and to return internal states as well. We don't use the
# return states in the training model, but we will use them in inference.
# decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True, dropout=0.4, recurrent_dropout=0.4)
decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True, dropout=0.3)
decoder_outputs, _, _ = decoder_lstm(decoder_inputs,
                                     initial_state=encoder_states)
decoder_dense = Dense(num_decoder_tokens, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

# Define the model that will turn
# `encoder_input_data` & `decoder_input_data` into `decoder_target_data`
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

opt = keras.optimizers.RMSprop(learning_rate=learning_rate)


# Run training
model.compile(loss='categorical_crossentropy', optimizer=opt, metrics=['accuracy'])

model.fit(
    [encoder_input_data, decoder_input_data], 
    decoder_target_data,
    batch_size=batch_size,
    epochs=epochs,
    validation_split=0.2,
    callbacks=[
        keras.callbacks.LearningRateScheduler(scheduler, verbose=1),
        keras.callbacks.EarlyStopping(
          monitor='val_loss',
          min_delta=0,
          patience=5,
          verbose=1,
          mode='auto',
        )
    ],
)


Epoch 00001: LearningRateScheduler reducing learning rate to 0.004999999888241291.
Epoch 1/100

Epoch 00002: LearningRateScheduler reducing learning rate to 0.003999999910593033.
Epoch 2/100

Epoch 00003: LearningRateScheduler reducing learning rate to 0.0031999997794628144.
Epoch 3/100

Epoch 00004: LearningRateScheduler reducing learning rate to 0.0025599997490644458.
Epoch 4/100

Epoch 00005: LearningRateScheduler reducing learning rate to 0.0020479997619986534.
Epoch 5/100

Epoch 00006: LearningRateScheduler reducing learning rate to 0.0016383998095989229.
Epoch 6/100

Epoch 00007: LearningRateScheduler reducing learning rate to 0.0013107198290526869.
Epoch 7/100

Epoch 00008: LearningRateScheduler reducing learning rate to 0.0010485759004950524.
Epoch 8/100

Epoch 00009: LearningRateScheduler reducing learning rate to 0.0008388606831431389.
Epoch 9/100

Epoch 00010: LearningRateScheduler reducing learning rate to 0.0006710885558277369.
Epoch 10/100

Epoch 00011: LearningRateSched

<tensorflow.python.keras.callbacks.History at 0x7f0de0061c50>

In [0]:
# adam, cycle each 10th epoch, starting at 0.001
# Epoch 00050: LearningRateScheduler reducing learning rate to 0.0032804996240884065.
# Epoch 50/100
# 300/300 [==============================] - 13s 44ms/step - loss: 0.1955 - accuracy: 0.9415 - val_loss: 0.1606 - val_accuracy: 0.9533 - lr: 0.0033
# Epoch 00050: early stopping

# rms, cycle each 10th epoch, starting at 0.001
# Epoch 00035: LearningRateScheduler reducing learning rate to 0.0032804996240884065.
# Epoch 35/100
# 300/300 [==============================] - 13s 44ms/step - loss: 0.1837 - accuracy: 0.9454 - val_loss: 0.1561 - val_accuracy: 0.9552 - lr: 0.0033
# Epoch 00035: early stopping

# rms, cycle each 10th epoch, starting at 0.005
# Epoch 00075: LearningRateScheduler reducing learning rate to 0.0010239998809993267.
# Epoch 75/100
# 300/300 [==============================] - 13s 45ms/step - loss: 0.1643 - accuracy: 0.9507 - val_loss: 0.1547 - val_accuracy: 0.9565 - lr: 0.0010
# Epoch 00075: early stopping

# rms, cycle each 15th epoch, starting at 0.005
# Epoch 00064: LearningRateScheduler reducing learning rate to 0.0012799998745322229.
# Epoch 64/100
# 300/300 [==============================] - 12s 40ms/step - loss: 0.1877 - accuracy: 0.9440 - val_loss: 0.1623 - val_accuracy: 0.9533 - lr: 0.0013
# Epoch 00064: early stopping

# no decay
# Epoch 46/100
# 300/300 [==============================] - 12s 41ms/step - loss: 0.1739 - accuracy: 0.9479 - val_loss: 0.1529 - val_accuracy: 0.9566
# Epoch 00046: early stopping

In [0]:
# Define sampling models
encoder_model = Model(encoder_inputs, encoder_states)

decoder_state_input_h = Input(shape=(latent_dim,))
decoder_state_input_c = Input(shape=(latent_dim,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]
decoder_outputs, state_h, state_c = decoder_lstm(
    decoder_inputs, initial_state=decoder_states_inputs)
decoder_states = [state_h, state_c]
decoder_outputs = decoder_dense(decoder_outputs)
decoder_model = Model(
    [decoder_inputs] + decoder_states_inputs,
    [decoder_outputs] + decoder_states)

# Reverse-lookup token index to decode sequences back to
# something readable.
reverse_input_char_index = dict(
    (i, char) for char, i in input_token_index.items())
reverse_target_char_index = dict(
    (i, char) for char, i in target_token_index.items())

In [0]:
def decode_sequence(input_seq):
    # Encode the input as state vectors.
    states_value = encoder_model.predict(input_seq)

    # Generate empty target sequence of length 1.
    target_seq = np.zeros((1, 1, num_decoder_tokens))
    # Populate the first character of target sequence with the start character.
    target_seq[0, 0, target_token_index['\t']] = 1.

    # Sampling loop for a batch of sequences
    # (to simplify, here we assume a batch of size 1).
    stop_condition = False
    decoded_sentence = ''
    while not stop_condition:
        output_tokens, h, c = decoder_model.predict(
            [target_seq] + states_value)

        # Sample a token
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_char = reverse_target_char_index[sampled_token_index]
        decoded_sentence += sampled_char

        # Exit condition: either hit max length
        # or find stop character.
        if (sampled_char == '\n' or
           len(decoded_sentence) > max_decoder_seq_length):
            stop_condition = True

        # Update the target sequence (of length 1).
        target_seq = np.zeros((1, 1, num_decoder_tokens))
        target_seq[0, 0, sampled_token_index] = 1.

        # Update states
        states_value = [h, c]

    return decoded_sentence

In [32]:
from rouge import Rouge 

rouge = Rouge()


r1 = []

for seq_index in range(100):
    # Take one sequence (part of the training set)
    # for trying out decoding.
    input_seq = encoder_input_data[seq_index: seq_index + 1]
    decoded_sentence = decode_sequence(input_seq)
    score = rouge.get_scores(decoded_sentence, train_input_texts[seq_index])[0]
    r1.append(score["rouge-1"]["f"])
    print('-------')
    print('Input sentence:', train_input_texts[seq_index])
    print('Decoded sentence:', decoded_sentence)
    print('Rouge-1:', (score["rouge-1"]["f"]))

-------
Input sentence: 	 Jumby now wants to be born .

Decoded sentence:  But that 's not going to happen .

Rouge-1: 0.266666661688889
-------
Input sentence: 	 It was a difficult and long delivery .

Decoded sentence:  It 's not that say .

Rouge-1: 0.2857142808163266
-------
Input sentence: 	 I like to be beautiful everyday .

Decoded sentence:  I think I 'm gonna go the time .

Rouge-1: 0.24999999507812506
-------
Input sentence: 	 Bernadette wants a prenup .

Decoded sentence:  I 'm not sure you 're all right .

Rouge-1: 0.14285713826530627
-------
Input sentence: 	 Don 't say you don 't remember me .

Decoded sentence:  Don 't you forget that .

Rouge-1: 0.5333333285333334
-------
Input sentence: 	 Hyah ! Hmm .

Decoded sentence:  The tool .

Rouge-1: 0.2857142808163266
-------
Input sentence: 	 He believes in you .

Decoded sentence:  He 's not going to be okay .

Rouge-1: 0.3076923029585799
-------
Input sentence: 	 Sun 's going to come up soon .

Decoded sentence:  She 's gon

In [34]:
f'Rouge-1 average: {np.mean(np.array(r1))}'

'Rouge-1 average: 0.34554883725468355'