In [1]:
import numpy as np
# fix random seed for reproducibility
seed = 7
np.random.seed(seed)

In [2]:
from keras import backend as K
from keras.models import Model
from keras.layers import Add, Bidirectional, Concatenate, Input, LSTM, Dense, Conv2D, Flatten, MaxPooling2D, Reshape, Activation, Embedding, TimeDistributed
from keras.callbacks import EarlyStopping
from keras.preprocessing.text import Tokenizer
import pandas as pd

from IPython.display import SVG
from keras.utils.vis_utils import model_to_dot

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [3]:
df = pd.read_csv('reuters.csv', delimiter='\t', names=['id', 'timestamp', 'title', 'url', 'first_line'])

In [4]:
df = df.set_index('id')

In [5]:
batch_size = 64  # Batch size for training.
epochs = 10000  # Number of epochs to train for.
latent_dim = 10#24  # Latent dimensionality of the encoding space (number of nodes per LSTM layer).
dense_dim = 10#24 # Number of nodes in the Dense layer of the attention module
num_samples = min(10, len(df))  # Number of samples to train on.
num_lstm = 4 # Number of LSTM layers in the encoder and decoder.

In [6]:
input_texts = df['first_line'][:num_samples]
output_texts = df['title'][:num_samples]

In [7]:
input_texts = input_texts.apply(lambda i: i.strip().lower() + ' \n')
output_texts = output_texts.apply(lambda o: '\t '+ o.strip().lower() + ' \n')

In [8]:
tokenizer = Tokenizer(filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~', oov_token='<unk>')
tokenizer.fit_on_texts(input_texts)
tokenizer.fit_on_texts(output_texts)

In [9]:
input_sequences = tokenizer.texts_to_sequences(input_texts)
output_sequences = tokenizer.texts_to_sequences(output_texts)
word_id_dict = tokenizer.word_index
id_word_dict = dict()
for k in word_id_dict:
    id_word_dict[word_id_dict[k]] = k;

In [10]:
num_samples = len(input_texts)
max_encoder_seq_length = max([len(seq) for seq in input_sequences])
max_decoder_seq_length = max([len(seq) for seq in output_sequences])
num_dict_size = len(tokenizer.word_index)
# num_input_tokens = len(input_dict)
# num_output_tokens = len(output_dict)

In [11]:
# input_data = np.zeros((num_samples, max_encoder_seq_length, latent_dim))
# output_data = np.zeros((num_samples, max_decoder_seq_length, latent_dim))

In [12]:
print('Number of samples:', num_samples)
print('Max sequence length for inputs:', max_encoder_seq_length)
# print('Num of tokens for inputs:', num_input_tokens)
print('Max sequence length for outputs:', max_decoder_seq_length)
# print('Num of tokens for outputs:', num_output_tokens)
print('Number of words in the dictionary (including OOV token):', num_dict_size)

Number of samples: 10
Max sequence length for inputs: 42
Max sequence length for outputs: 14
Number of words in the dictionary (including OOV token): 258


In [13]:
# encoder_input_data = np.zeros((num_samples, max_encoder_seq_length, num_input_tokens),dtype='float32')
# reversed_encoder_input_data = np.zeros((num_samples, max_encoder_seq_length, num_input_tokens),dtype='float32')
# decoder_input_data = np.zeros((num_samples, max_decoder_seq_length, num_output_tokens),dtype='float32')
# decoder_target_data = np.zeros((num_samples, max_decoder_seq_length, num_output_tokens),dtype='float32')

In [14]:
# for i in range(num_samples):
#     input_text = input_tokenizer.texts_to_sequences(input_texts[i])
#     input_text = input_dict
#     encoder_input_data[i, :, :] = input_tokenizer.texts_to_sequences(input_texts[i])
#     reversed_encoder_input_data[i, :, :] = encoder_input_data[i, ::-1, :]
#     decoder_input_data[i, :, :] = output_tokenizer.texts_to_sequences(output_texts[i])
#     decoder_target_data[i, :, :] = np.row(decoder_input_data[i, :, :], 1)
#     decoder_target_data[i, 0, :] = 0;

In [82]:
## GNMT variant 1: 
# Define an input sequence and process it.
encoder_inputs = Input(shape=(max_encoder_seq_length,), name='encoder_inputs')
# print(encoder_inputs.shape)
first_encoder_lstm = Bidirectional(LSTM(latent_dim, return_sequences=True), merge_mode='concat')
# first_encoder_lstm = LSTM(latent_dim)
next_input = Reshape((max_encoder_seq_length, 1))(encoder_inputs)
next_input = first_encoder_lstm(next_input)
# next_input = Reshape((1, latent_dim))(next_input)
current_output = LSTM(latent_dim, return_sequences=True)(next_input)
next_input = current_output
# Assuming there are at least 3 LSTM layers
for i in range(num_lstm-3):
#     current_output = Reshape((1, latent_dim))(next_input)
    current_output = LSTM(latent_dim, return_sequences=True)(next_input)
    next_input = Add()([current_output, next_input])

# current_output = Reshape((1, latent_dim))(next_input)
encoder_outputs = LSTM(latent_dim, return_sequences=True, name='encoder_outputs')(next_input)

# First layer of decoder
decoder_inputs = Input(shape=(max_decoder_seq_length,), name='decoder_inputs')
next_input = Reshape((max_decoder_seq_length, 1))(decoder_inputs)
first_decoder_lstm = LSTM(latent_dim, return_sequences=True)
decoder_output = first_decoder_lstm(next_input)
# decoder_output = Reshape((latent_dim, 1))(decoder_output)

# Attention module as a model
attention_input = Input((latent_dim,))
attention_input_reshaped = Reshape((1, latent_dim))(attention_input)
attention_input_with_encoded = Concatenate(name='attention_input', axis=1)([encoder_outputs, attention_input_reshaped])
attention_output = Dense(dense_dim, activation='relu')(attention_input_with_encoded)
attention_output = Dense(max_encoder_seq_length, activation='softmax')(attention_output)
attention_model = Model(inputs=attention_input, outputs=attention_output)
attention_model.summary()

# Rest of the decoder module
print(decoder_output.shape)
attention_model_output = TimeDistributed(attention_model)(decoder_output)
print(attention_model_output.shape)
# attention_model_output = Reshape()
next_input = Concatenate()([attention_model_output, decoder_output])
current_output = LSTM(latent_dim, return_sequences=True)(next_input)
# current_output = Reshape((latent_dim, 1))(current_output)
next_input = current_output
for _ in range(num_lstm-3):
    next_input_with_attention = Concatenate()([attention_model_output, next_input])
    current_output = LSTM(latent_dim, return_sequences=True)(next_input_with_attention)
#     current_output = Reshape((latent_dim, 1))(current_output)
    next_input = Add()([current_output, next_input])

next_input_with_attention = Concatenate()([attention_output, next_input])
current_output = LSTM(latent_dim, return_sequences=True)(next_input_with_attention)
decoder_output = TimeDistributed(Dense(num_dict_size, activation='softmax'))(current_output)

# model = Model(inputs=[encoder_inputs, decoder_inputs], outputs=decoder_output)
# model.compile(optimizer='rmsprop', loss='binary_crossentropy')
# model.summary()

RuntimeError: Graph disconnected: cannot obtain value for tensor Tensor("encoder_inputs_41:0", shape=(?, 42), dtype=float32) at layer "encoder_inputs". The following previous layers were accessed without issue: []

In [44]:
encoder_input_data = np.zeros((num_samples, latent_dim, max_encoder_seq_length))
decoder_input_data = np.zeros((num_samples, latent_dim, max_decoder_seq_length))
decoder_target_data = np.zeros((num_samples, max_decoder_seq_length, num_dict_size))

In [36]:
for i in range(num_samples):
    for j in range(latent_dim):
        input_row = input_sequences[i]
        output_row = output_sequences[i]
        encoder_input_data[i, j, :len(input_row)] = input_row
        decoder_input_data[i, j, :len(output_row)] = output_row
    for k in range(len(output_row)-1):
        decoder_target_data[i, k, output_row[k+1]] = 1
    for k in range(len(output_row), max_decoder_seq_length):
        decoder_target_data[i, k, :] = 1/max_decoder_seq_length

In [37]:
encoder_input_data[0, 0, :]

array([102., 103.,  22.,  32.,   3.,   9.,   1., 104.,  12., 105.,   4.,
       106.,   1., 107.,   7.,   1., 108., 109., 110., 111.,  33.,   5.,
        34., 112., 113., 114.,  35., 115., 116., 117., 118.,  11.,  36.,
       119.,   1., 120.,   7., 121.,   2.,   0.,   0.,   0.])

In [38]:
decoder_input_data[0, 0, :]

array([  6., 235.,  12., 236.,  11.,  36.,   3., 237.,   7.,  33.,   2.,
         0.,   0.,   0.])

In [39]:
decoder_target_data[0, 0, :]

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0.

In [40]:
# for k in range(num_samples):
#     encoder_input_data[k][:len(input_sequences[k])] = input_sequences[k][:]
#     decoder_input_data[k][:len(output_sequences[k])] = output_sequences[k][:]
#     decoder_target_data[k][:len(output_sequences[k])-1] = output_sequences[k][1:]

In [41]:
# encoder_input_data = encoder_input_data.reshape((*encoder_input_data.shape, 1))
# decoder_input_data = decoder_input_data.reshape((*decoder_input_data.shape, 1))
# decoder_target_data = decoder_target_data.reshape((*decoder_target_data.shape, 1))

In [42]:
# Note that `decoder_target_data` needs to be one-hot encoded,
# rather than sequences of integers like `decoder_input_data`!
model.fit([encoder_input_data, decoder_input_data], decoder_target_data,
          batch_size=batch_size,
          epochs=epochs#,
          #validation_split=0.2
         )

ValueError: Error when checking target: expected time_distributed_2 to have shape (10, 258) but got array with shape (14, 258)

In [None]:
# # Next: inference mode (sampling).
# # Here's the drill:
# # 1) encode input and retrieve initial decoder state
# # 2) run one step of decoder with this initial state
# # and a "start of sequence" token as target.
# # Output will be the next target token
# # 3) Repeat with the current target token and current states

# # Define sampling models
# encoder_model = Model(encoder_inputs, encoder_states)
# encoder_model.summary()

# decoder_state_input_h = Input(shape=(latent_dim,))
# decoder_state_input_c = Input(shape=(latent_dim,))
# decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]
# decoder_mapped_input = decoder_embedding(decoder_inputs)
# decoder_outputs, state_h, state_c = decoder_lstm(decoder_mapped_input, initial_state=decoder_states_inputs)
# decoder_states = [state_h, state_c]
# decoder_outputs = Flatten()(decoder_outputs)
# decoder_outputs = decoder_dense(decoder_outputs)
# decoder_model = Model(
#     [decoder_inputs] + decoder_states_inputs,
#     [decoder_outputs] + decoder_states)
# decoder_model.summary()

In [None]:
# def decode_sequence(input_seq):
#     # Encode the input as state vectors.
#     states_value = encoder_model.predict(input_seq)

#     # Generate empty target sequence of length 1.
#     target_seq = np.zeros((1, max_decoder_seq_length), dtype='int32')
#     # Populate the first character of target sequence with the start character.
#     target_seq[0, 0] = word_id_dict['\t']

#     # Sampling loop for a batch of sequences
#     # (to simplify, here we assume a batch of size 1).
#     stop_condition = False
#     decoded_sentence = ''
#     i = 1
#     id_word_dict_cpy = id_word_dict.copy()
#     id_word_dict_cpy[0] = '0'
#     while not stop_condition:
# #         print(target_seq)
#         output_tokens, h, c = decoder_model.predict(
#             [target_seq] + states_value)

#         # Sample a token
# #         print("output_token:", output_tokens)
# #         sampled_char = id_word_dict[int(output_tokens)]
#         decoded_sentence = str([id_word_dict_cpy[int(round(output_token))] for output_token in output_tokens[0]])
#         if i > 14:
#             stop_condition = True
#         else:
#             target_seq[0, 1:i] = output_tokens[0, 1:i]
#             i += 1
# #         # Exit condition: either hit max length
# #         # or find stop character.
# #         if (sampled_char == '\n' or
# #            len(decoded_sentence) > max_decoder_seq_length):
# #             stop_condition = True

# #         # Update the target sequence (of length 1).
# #         target_seq[0, i] = output_tokens
# #         i += 1

# #         # Update states
# #         states_value = [h, c]

#     return decoded_sentence

In [None]:
# for seq_index in range(num_samples):
#     # Take one sequence (part of the training set)
#     # for trying out decoding.
#     input_seq = encoder_input_data[seq_index: seq_index + 1]
#     decoded_sentence = decode_sequence(input_seq)
#     print('-')
#     print('Input sentence:', input_texts[seq_index])
#     print('Decoded sentence:', decoded_sentence)