Inspired by the [tutorial](https://keras.io/examples/lstm_seq2seq/), I am going to implement character-level seq2seq for paraphrasing.

In [1]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Dense
import tensorflow as tf
import numpy as np
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import Sequence

In [2]:
batch_size = 16  # Batch size for training.
epochs = 100  # Number of epochs to train for.
latent_dim = 256  # Latent dimensionality of the encoding space.
# num_samples = 10000  # Number of samples to train on.

In [3]:
def load_data(file):
    with open(file, 'r', encoding='utf-8') as f:
        lines = f.read().split('\n')
        return lines

In [4]:
data_path = 'opusparcus_v2/en-train-100K.txt'

In [5]:
def parse_data(file, line_validator, bi_directional=False):
    lines = load_data(file)
    input_texts, target_texts =[], []
    
    for line in lines:
        input_text, target_text = line_validator(line)
        
        if input_text is not None and target_text is not None:
            
            input_texts.append(input_text)
            target_texts.append(target_text)
            
            if bi_directional:
                input_texts.append(target_text)
                target_texts.append(input_text)
        
    
    return input_texts, target_texts


In [6]:
def validate_for_charectr_level(line):
    splited_lines = line.split('\t')
    if len(splited_lines) < 3:
        return None, None
    
    input_text = splited_lines[1]
    target_text = splited_lines[2]


    # input_text, target_text, _ = line.split('\t')
    # We use "tab" as the "start sequence" character
    # for the targets, and "\n" as "end sequence" character.
    target_text = '\t' + target_text + '\n'
   
    return input_text, target_text

In [8]:
input_texts, target_texts = parse_data(data_path, validate_for_charectr_level)

In [9]:
input_texts, val_input_texts, target_texts, val_target_texts = train_test_split(input_texts, target_texts, test_size=0.20, random_state=42)

In [13]:
input_characters = set()
target_characters = set()

for i in range(len(input_characters)):
    for char in input_texts[i]:
        if char not in input_characters:
            input_characters.add(char)
    
    for char in target_texts[i]:
        if char not in target_characters:
            target_characters.add(char)


In [9]:
input_characters = sorted(list(input_characters))
target_characters = sorted(list(target_characters))
num_encoder_tokens = len(input_characters) + 1
num_decoder_tokens = len(target_characters) + 1
max_encoder_seq_length = max([len(txt) for txt in input_texts])
max_decoder_seq_length = max([len(txt) for txt in target_texts])

In [10]:
print('Number of samples:', len(input_texts))
print('Number of unique input tokens:', num_encoder_tokens)
print('Number of unique output tokens:', num_decoder_tokens)
print('Max sequence length for inputs:', max_encoder_seq_length)
print('Max sequence length for outputs:', max_decoder_seq_length)

Number of samples: 80000
Number of unique input tokens: 105
Number of unique output tokens: 143
Max sequence length for inputs: 209
Max sequence length for outputs: 210


In [11]:
input_token_index = dict(
    [(char, i) for i, char in enumerate(input_characters)])
input_token_index['oov'] = len(input_token_index)

target_token_index = dict(
    [(char, i) for i, char in enumerate(target_characters)])
target_token_index['oov'] = len(target_token_index)

As the training data is big, and creating sparse matrix cannot be load into RAM, let's create Dataset Generator to avoid out of memory error.

https://towardsdatascience.com/keras-data-generators-and-how-to-use-them-b69129ed779c

In [12]:
from dataset_generator import DataGenerator


In [13]:
dataset = DataGenerator(input_texts=input_texts, target_texts=target_texts,
                        input_token_index=input_token_index, target_token_index=target_token_index,
                        max_encoder_seq_length=max_encoder_seq_length, num_encoder_tokens=num_encoder_tokens,
                        max_decoder_seq_length=max_decoder_seq_length, num_decoder_tokens=num_decoder_tokens,
                        batch_size=batch_size)

In [14]:
val_dataset = DataGenerator(input_texts=val_input_texts, target_texts=val_target_texts,
                        input_token_index=input_token_index, target_token_index=target_token_index,
                        max_encoder_seq_length=max_encoder_seq_length, num_encoder_tokens=num_encoder_tokens,
                        max_decoder_seq_length=max_decoder_seq_length, num_decoder_tokens=num_decoder_tokens,
                        batch_size=batch_size)

In [15]:
# Define an input sequence and process it.
encoder_inputs = Input(shape=(None, num_encoder_tokens))
encoder = LSTM(latent_dim, return_state=True)
encoder_outputs, state_h, state_c = encoder(encoder_inputs)
# We discard `encoder_outputs` and only keep the states.
encoder_states = [state_h, state_c]

In [16]:
# Set up the decoder, using `encoder_states` as initial state.
decoder_inputs = Input(shape=(None, num_decoder_tokens))
# We set up our decoder to return full output sequences,
# and to return internal states as well. We don't use the
# return states in the training model, but we will use them in inference.
decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder_inputs,
                                     initial_state=encoder_states)
decoder_dense = Dense(num_decoder_tokens, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

In [17]:
# Define the model that will turn
# `encoder_input_data` & `decoder_input_data` into `decoder_target_data`

model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

# Run training
model.compile(optimizer='rmsprop', loss='categorical_crossentropy',
              metrics=['accuracy'])


In [18]:
%%time 
model.fit(dataset,
          batch_size=batch_size,
          epochs=epochs,
          validation_data=val_dataset)


Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100


Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100
CPU times: user 5h 36min 59s, sys: 15min 37s, total: 5h 52min 36s
Wall time: 4h 13min 1s


<tensorflow.python.keras.callbacks.History at 0x7f9b3804c9d0>

In [19]:
# Save model
model.save('s2s_paraphrase.h5')

### Run several examples to see the results

In [20]:
# Define sampling models
encoder_model = Model(encoder_inputs, encoder_states)

decoder_state_input_h = Input(shape=(latent_dim,))
decoder_state_input_c = Input(shape=(latent_dim,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]
decoder_outputs, state_h, state_c = decoder_lstm(
    decoder_inputs, initial_state=decoder_states_inputs)
decoder_states = [state_h, state_c]
decoder_outputs = decoder_dense(decoder_outputs)
decoder_model = Model(
    [decoder_inputs] + decoder_states_inputs,
    [decoder_outputs] + decoder_states)

# Reverse-lookup token index to decode sequences back to
# something readable.
reverse_input_char_index = dict(
    (i, char) for char, i in input_token_index.items())
reverse_target_char_index = dict(
    (i, char) for char, i in target_token_index.items())

In [21]:
def decode_sequence(input_seq):
    # Encode the input as state vectors.
    states_value = encoder_model.predict(input_seq)

    # Generate empty target sequence of length 1.
    target_seq = np.zeros((1, 1, num_decoder_tokens))
    # Populate the first character of target sequence with the start character.
    target_seq[0, 0, target_token_index['\t']] = 1.

    # Sampling loop for a batch of sequences
    # (to simplify, here we assume a batch of size 1).
    stop_condition = False
    decoded_sentence = ''
    while not stop_condition:
        output_tokens, h, c = decoder_model.predict(
            [target_seq] + states_value)

        # Sample a token
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_char = reverse_target_char_index[sampled_token_index]
        decoded_sentence += sampled_char

        # Exit condition: either hit max length
        # or find stop character.
        if (sampled_char == '\n' or
           len(decoded_sentence) > max_decoder_seq_length):
            stop_condition = True

        # Update the target sequence (of length 1).
        target_seq = np.zeros((1, 1, num_decoder_tokens))
        target_seq[0, 0, sampled_token_index] = 1.

        # Update states
        states_value = [h, c]

    return decoded_sentence

In [None]:
def validate_for_dev_corpus(line):
    splited_lines = line.split('\t')
    if len(splited_lines) < 3:
        return None, None

    input_text = splited_lines[1]
    target_text = splited_lines[2]
    mark = float(splited_lines[3])
    
    if mark < 3.0 or mark > 4.0:
        return None, None
    
    target_text = '\t' + target_text + '\n'
    
    return input_text, target_text

In [60]:
dev_input_texts, dev_target_texts = parse_data("opusparcus_v2/en-dev.txt", validate_for_dev_corpus)

In [61]:
for seq_index in range(10):
    # Take one sequence (part of the training set)
    # for trying out decoding.
    input_seq = dataset.create_encoder_input_item_for_text(dev_input_texts[seq_index])
    input_seq = np.array([input_seq])
#     print(input_seq)
    decoded_sentence = decode_sequence(input_seq)
    print('----------------------------------------------------')
    print('Input sentence:', dev_input_texts[seq_index])
    print('Decoded sentence:', decoded_sentence)
    
    print("-Reverse example-")
    target_text = dev_target_texts[seq_index].strip()
    input_seq = dataset.create_encoder_input_item_for_text(target_text)
    input_seq = np.array([input_seq])
#     print(input_seq)
    decoded_sentence = decode_sequence(input_seq)
    print('Input sentence:', target_text)
    print('Decoded sentence:', decoded_sentence)




----------------------------------------------------
Input sentence: When 'd you last see him ?
Decoded sentence: When will you be back ?

-Reverse example-
Input sentence: When was the last time you saw him ?
Decoded sentence: When will you be back ?

----------------------------------------------------
Input sentence: Anyone who can verify that ?
Decoded sentence: Is there anything you need ?

-Reverse example-
Input sentence: Can anyone corroborate that ?
Decoded sentence: Can anyone hear me ?

----------------------------------------------------
Input sentence: I 'm not promising anything .
Decoded sentence: I don 't feel so good .

-Reverse example-
Input sentence: No promises , okay ?
Decoded sentence: There 's no need to .

----------------------------------------------------
Input sentence: Nothing 's changed .
Decoded sentence: Nothing that matters .

-Reverse example-
Input sentence: Things ain 't no different .
Decoded sentence: Things will be fine .

-----------------------

### Metric

In [62]:
import rouge

In [63]:
dev_data_path = 'opusparcus_v2/en-dev.txt'
dev_lines = load_data(data_path)

In [64]:
evaluator = rouge.Rouge(['rouge-l'])

In [65]:
def evaluate(input_texts, target_texts, evaluator):
    predicted_texts = []
    for input_text in input_texts:
        input_seq = dataset.create_encoder_input_item_for_text(input_text)
        input_seq = np.array([input_seq])

        decoded_sentence = decode_sequence(input_seq)

        predicted_texts.append(decoded_sentence)

    return evaluator.get_scores(target_texts, predicted_texts, avg=True)

In [66]:
scores = evaluate(dev_input_texts, dev_target_texts, evaluator)

In [67]:
scores

{'rouge-l': {'f': 0.320527282671693,
  'p': 0.3191297290558371,
  'r': 0.3353515450067171}}

## Word level seq2seq for paraphrasing.

### Preprocess data

In [6]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from scipy.ndimage.interpolation import shift

In [7]:
train_lines = load_data(data_path)

In [8]:
train_lines[:10]

['en-N7\tJumby now wants to be born .\tJumby want birth .\t77.5163\t2.5\t5\t9',
 'en-N8\tIt was a difficult and long delivery .\tThe delivery was difficult and long .\t77.5163\t2.5\t5\t14',
 'en-N12\tI like to be beautiful everyday .\tI like to be pretty everyday .\t77.5163\t2.5\t5\t8',
 'en-N22\tBernadette wants a prenup .\tBernadette wants to get a prenup .\t77.5163\t2.5\t5\t7',
 "en-N45\tDon 't say you don 't remember me .\tDon 't tell me you don 't remember me .\t74.3904\t3.33333\t5\t7",
 'en-N71\tHyah ! Hmm .\tWiggle your big toe .\t72.2903\t3.35714\t5\t9',
 'en-N127\tHe believes in you .\tHe has faith in you .\t70.2803\t3.66667\t5\t9',
 "en-N153\tSun 's going to come up soon .\tThe sun 's coming up soon .\t69.6198\t0.842262\t5\t12",
 'en-N180\tMars-1 , Houston .\tMars-1 , this is Houston .\t69.1678\t2.50758\t5\t7',
 "en-N202\tBut we have no money .\tBut we haven 't got any money .\t68.7652\t1.35256\t5\t8"]

In [9]:
def validate_for_word_level(line):
    splited_lines = line.split('\t')
    if len(splited_lines) < 3:
        return None, None
    
    input_text = splited_lines[1].strip().lower()
    target_text = splited_lines[2].strip().lower()
    
    
    input_text = 'start_ '+ input_text + ' _end'
    target_text = 'start_ '+ target_text + ' _end'

   
    return input_text, target_text

In [10]:
train_input_texts, target_input_texts = parse_data(data_path, validate_for_word_level)

In [11]:
oov_token='<oov>'

In [12]:
tokenizer = Tokenizer(filters='', oov_token=oov_token)

In [13]:
tokenizer.fit_on_texts(np.concatenate([train_input_texts, target_input_texts]))

In [14]:
max_length = max([len(txt) for txt in tokenizer.texts_to_sequences(np.concatenate([train_input_texts, target_input_texts]))])

In [15]:
max_length

47

In [16]:
train_input_texts, val_input_texts, target_input_texts, val_target_input_texts = train_test_split(train_input_texts, target_input_texts, test_size=0.20, random_state=42)

#### decoder_target_data should be one-hot-encoded 

As matrix which contains one hot vectors is too big, sometimes it is caused to out of memory, so we need to have dataset generator to avoid this issue.

In [17]:
class WordLevelDatasetGenerator(Sequence):

    def __init__(self, input_texts, 
                 target_texts, 
                 word_index,
                 batch_size,
                 vocab_size,
                 max_length):
        
        self.input_texts = input_texts
        self.target_texts = target_texts        
        self.indexes = np.arange(len(self.input_texts))
        self.word_index = word_index
        
        self.batch_size = batch_size
        self.vocab_size = vocab_size
        self.max_length = max_length

    
    def create_encoder_input_item_for_text(self, text):
        encoder_input_data = np.zeros(self.max_length, dtype='float32')
        
        for t, word in enumerate(text.split()):
            if t >= len(encoder_input_data):
                break
            encoder_input_data[t] = self.__get_token_value(word)
            
        return encoder_input_data

    def __getitem__(self, index):
        
        indexes = self.indexes[index * self.batch_size:(index + 1) * self.batch_size]
        encoder_input_data, decoder_input_data, decoder_target_data = self.__create_zero_data()
        
        
        # generate data
        for i, idx in enumerate(indexes):
            input_text = self.input_texts[idx]
            target_text = self.target_texts[idx]

            for t, word in enumerate(input_text.split()):
                encoder_input_data[i, t] = self.word_index[word]
            for t, word in enumerate(target_text.split()):
                # decoder_target_data is ahead of decoder_input_data by one timestep
                decoder_input_data[i, t] = self.__get_token_value(word)
                if t > 0:
                    # decoder_target_data will be ahead by one timestep
                    # and will not include the start character.
                    decoder_target_data[i, t - 1, self.__get_token_value(word)] = 1.


        X = [encoder_input_data, decoder_input_data]
        y = decoder_target_data

        return X, y    

    
    def __len__(self):
        """Denotes the number of batches per epoch
        :return: number of batches per epoch
        """
        return int(np.floor(len(self.input_texts) / self.batch_size))
    
    def __create_zero_data(self):
        encoder_input_data = np.zeros(
            (self.batch_size, self.max_length),
            dtype='float32')
        decoder_input_data = np.zeros(
            (self.batch_size, self.max_length),
            dtype='float32')
        decoder_target_data = np.zeros(
            (self.batch_size, self.max_length, self.vocab_size),
            dtype='float32')
        
        return encoder_input_data, decoder_input_data, decoder_target_data
    
    def __get_token_value(self, token):
        if token in self.word_index:
            return self.word_index[token]
        
        return self.word_index[oov_token]

### Seq2SEQ

In [33]:
import tensorflow as tf

In [34]:
vocab_size = len(tokenizer.word_index) + 1
num_encoder_tokens = vocab_size
num_decoder_tokens = vocab_size

In [35]:
batch_size = 16  # Batch size for training.
epochs = 10  # Number of epochs to train for.
embedding_size = 256
latent_dim = 256  # Latent dimensionality of the encoding space.
# num_samples = 10000  # Number of samples to train on.

In [36]:
dataset = WordLevelDatasetGenerator(train_input_texts, 
                                    target_input_texts, 
                                    tokenizer.word_index,
                                    batch_size, 
                                    vocab_size,
                                    max_length)

In [37]:
val_dataset = WordLevelDatasetGenerator(val_input_texts, 
                                    val_target_input_texts, 
                                    tokenizer.word_index,
                                    batch_size, 
                                    vocab_size,
                                    max_length)

### encoder

In [38]:
# Define an input sequence and process it.
encoder_inputs = Input(shape=(None,))
x = tf.keras.layers.Embedding(num_encoder_tokens, embedding_size)(encoder_inputs)
x, state_h, state_c = LSTM(latent_dim,
                           return_state=True)(x)
encoder_states = [state_h, state_c]

### decoder

In [39]:
# Set up the decoder, using `encoder_states` as initial state.
decoder_inputs = Input(shape=(None,))
dex = tf.keras.layers.Embedding(num_decoder_tokens, embedding_size)
final_dex = dex(decoder_inputs)

decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(final_dex, initial_state=encoder_states)

decoder_dense = Dense(num_decoder_tokens, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

### model

In [40]:
# Define the model that will turn
# `encoder_input_data` & `decoder_input_data` into `decoder_target_data`
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

# Compile & run training
model.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics=['acc'])


In [41]:
model.fit(dataset,
          validation_data=val_dataset,
          batch_size=batch_size,
          epochs=epochs)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7f9140798dd0>

In [42]:
# define the encoder model 
encoder_model = Model(encoder_inputs, encoder_states)
encoder_model.summary()

Model: "functional_7"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_5 (InputLayer)         [(None, None)]            0         
_________________________________________________________________
embedding_4 (Embedding)      (None, None, 256)         2494208   
_________________________________________________________________
lstm_4 (LSTM)                [(None, 256), (None, 256) 525312    
Total params: 3,019,520
Trainable params: 3,019,520
Non-trainable params: 0
_________________________________________________________________


In [43]:
# Redefine the decoder model with decoder will be getting below inputs from encoder while in prediction
decoder_state_input_h = Input(shape=(latent_dim,))
decoder_state_input_c = Input(shape=(latent_dim,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]
final_dex2= dex(decoder_inputs)

decoder_outputs2, state_h2, state_c2 = decoder_lstm(final_dex2, initial_state=decoder_states_inputs)
decoder_states2 = [state_h2, state_c2]
decoder_outputs2 = decoder_dense(decoder_outputs2)

In [44]:
# sampling model will take encoder states and decoder_input(seed initially) and output the predictions(french word index) We dont care about decoder_states2
decoder_model = Model(
    [decoder_inputs] + decoder_states_inputs,
    [decoder_outputs2] + decoder_states2)

In [45]:
# Reverse-lookup token index to decode sequences back to
# something readable.
reverse_input_char_index = dict(
    (i, char) for char, i in tokenizer.word_index.items())
reverse_target_char_index = dict(
    (i, char) for char, i in tokenizer.word_index.items())


In [46]:
def decode_sequence(input_text):
    input_seq = np.array([dataset.create_encoder_input_item_for_text(input_text)])

    # Encode the input as state vectors.
    states_value = encoder_model.predict(input_seq)
    # Generate empty target sequence of length 1.
    target_seq = np.zeros((1,1))
    # Populate the first character of target sequence with the start character.
    target_seq[0, 0] = tokenizer.word_index['start_']
# Sampling loop for a batch of sequences
    # (to simplify, here we assume a batch of size 1).
    stop_condition = False
    decoded_sentence = ''
    while not stop_condition:
        output_tokens, h, c = decoder_model.predict(
            [target_seq] + states_value)
# Sample a token
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_char = reverse_target_char_index[sampled_token_index]
        decoded_sentence += ' '+sampled_char
# Exit condition: either hit max length
        # or find stop character.
        if (sampled_char == '_end' or
           len(decoded_sentence) > max_length):
            stop_condition = True
# Update the target sequence (of length 1).
        target_seq = np.zeros((1,1))
        target_seq[0, 0] = sampled_token_index
# Update states
        states_value = [h, c]
    
    return decoded_sentence


## Evaluate on dev-corpus

In [47]:
def validate_for_word_level_dev_corpus(line):
    splited_lines = line.split('\t')
    if len(splited_lines) < 3:
        return None, None
    
    input_text = splited_lines[1].strip().lower()
    target_text = splited_lines[2].strip().lower()
    mark = float(splited_lines[3])
    
    if mark < 3.0 or mark > 4.0:
        return None, None
    
    
    input_text = 'start_ '+ input_text + ' _end'
    target_text = 'start_ '+ target_text + ' _end'

   
    return input_text, target_text

In [48]:
dev_input_texts, dev_target_texts = parse_data("opusparcus_v2/en-dev.txt", validate_for_word_level_dev_corpus)

In [49]:
def remove_start_end(text):
    text = text.replace("start_ ", "")
    text = text.replace(" _end", "")
    return text

In [50]:
original_input_texts, original_target_texts = [], []
predicted_texts = []
for i in range(len(dev_input_texts)):
    input_text = dev_input_texts[i]
    predicted_text = decode_sequence(input_text)
    predicted_text = remove_start_end(predicted_text)
    target_text = dev_target_texts[i]
    
    original_input_texts.append(remove_start_end(input_text))
    original_target_texts.append(remove_start_end(target_text))
    
    predicted_texts.append(predicted_text)

In [53]:
for i in range(10):
    print('----------------------------------------------------')
    print('Input sentence:', original_input_texts[i])
    print('Decoded sentence:', predicted_texts[i])

----------------------------------------------------
Input sentence: when 'd you last see him ?
Decoded sentence:  when did you get here ?
----------------------------------------------------
Input sentence: anyone who can verify that ?
Decoded sentence:  did they find him ?
----------------------------------------------------
Input sentence: i 'm not promising anything .
Decoded sentence:  i can 't do anything .
----------------------------------------------------
Input sentence: nothing 's changed .
Decoded sentence:  nothing to say .
----------------------------------------------------
Input sentence: anybody hearing me ?
Decoded sentence:  hear me ?
----------------------------------------------------
Input sentence: i 'm not familiar with who that is .
Decoded sentence:  i 'm not sure you are .
----------------------------------------------------
Input sentence: i need you to trust me .
Decoded sentence:  you have to trust me .
----------------------------------------------------


In [55]:
import rouge

In [56]:
evaluator = rouge.Rouge(['rouge-l'])

In [80]:
evaluator.get_scores(original_input_texts, predicted_texts, avg=True)

{'rouge-l': {'f': 0.39533632010766623,
  'p': 0.3864904498894639,
  'r': 0.4244862417276205}}

As wee can see word level seq2seq gave us better rouge-l average (by the way, we trained word level se2seq only on 10 epoch while charecter level on 100). 

## Annotaion of 50 examples

In [61]:
import pandas as pd

In [64]:
df = pd.DataFrame({
    'text': original_input_texts,
    'paraphrase_text': predicted_texts,
    'macthed' : len(predicted_texts) * [None]
})

In [65]:
df.head(50)

Unnamed: 0,text,paraphrase_text,macthed
0,when 'd you last see him ?,when did you get here ?,
1,anyone who can verify that ?,did they find him ?,
2,i 'm not promising anything .,i can 't do anything .,
3,nothing 's changed .,nothing to say .,
4,anybody hearing me ?,hear me ?,
5,i 'm not familiar with who that is .,i 'm not sure you are .,
6,i need you to trust me .,you have to trust me .,
7,we hope you enjoy the flight .,you 've had a good call .,
8,that 's why you rollin ' with the p.l.c. .,this is your and i know what it 's .,
9,did you see him ?,you 've seen him ?,


In [66]:
df.head(50).to_csv("generated-data.csv")

#### Review annotated examples

In [68]:
df_annotated = pd.read_csv("generated-data-annotated.csv")

In [69]:
df_annotated

Unnamed: 0.1,Unnamed: 0,text,paraphrase_text,macthed
0,0,when 'd you last see him ?,when did you get here ?,No
1,1,anyone who can verify that ?,did they find him ?,No
2,2,i 'm not promising anything .,i can 't do anything .,No
3,3,nothing 's changed .,nothing to say .,Yes
4,4,anybody hearing me ?,hear me ?,Yes
5,5,i 'm not familiar with who that is .,i 'm not sure you are .,No
6,6,i need you to trust me .,you have to trust me .,Yes
7,7,we hope you enjoy the flight .,you 've had a good call .,No
8,8,that 's why you rollin ' with the p.l.c. .,this is your and i know what it 's .,No
9,9,did you see him ?,you 've seen him ?,Yes


In [71]:
df['macthed'].values

array([None, None, None, ..., None, None, None], dtype=object)

In [72]:
df_yes = df_annotated.loc[df_annotated['macthed'] == 'Yes']

In [79]:
df_yes

Unnamed: 0.1,Unnamed: 0,text,paraphrase_text,macthed
3,3,nothing 's changed .,nothing to say .,Yes
4,4,anybody hearing me ?,hear me ?,Yes
6,6,i need you to trust me .,you have to trust me .,Yes
9,9,did you see him ?,you 've seen him ?,Yes
10,10,do you need any help ?,you need help ?,Yes
14,14,i 'm joking .,i 'm kidding .,Yes
15,15,let me out of here .,let me see .,Yes
16,16,ain 't no next time .,there 's no time .,Yes
22,22,there 's an issue .,there 's a problem .,Yes
30,30,come sit over here .,come here again .,Yes


The are several interesting behaviours, where sense are same but predicted paraphrase is grammatically incorrectly. For example: <br>
*do you need any help ? - you need help ?* <br>	
*anybody hearing me ? - hear me ?* <br>
It looks like informal speaking language. <br><br>
Also, there are some interesing good examples like: <br>
*i 'm joking . - i 'm kidding .*  <br>
*there 's an issue  - there 's a problem .*

In [76]:
acc = len(df_yes) / len(df_annotated)

In [78]:
print(f"The abstract estimation accuracy by annotation: {acc}")

The abstract estimation accuracy by annotation: 0.44


The average rouge-l estimation:

In [83]:
evaluator.get_scores(original_input_texts[:50], predicted_texts[:50], avg=True)

{'rouge-l': {'f': 0.41068242427474727,
  'p': 0.3900670995670996,
  'r': 0.44760317460317467}}

It is wonderful that averaged F1 score is less than annotated accuracy because I accepted vice versa behaviors. However, as we calculated the Longest Common Subsequence, maybe it is not so good metric for this task, because it should not consider synonyms and different order of subsequences, which was included by the annotator.