# Big Data Content Analytics - AUEB

## Siamese LSTM Networks Implementation 

* Lab Assistant: George Perakis
* Email: gperakis[at]aueb.gr | perakisgeorgios[at]gmail.com

### Import Modules

In [None]:
import re
from pathlib import Path
from time import time

import tensorflow.python.keras.backend as K
import numpy as np
import pandas as pd
from tensorflow.python.keras.callbacks import ModelCheckpoint, EarlyStopping
from tensorflow.python.keras.layers import Input, Embedding, LSTM, Lambda, Dense
from tensorflow.python.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.python.keras.preprocessing.sequence import pad_sequences
from tensorflow.python.keras.preprocessing.text import Tokenizer
from sklearn.model_selection import StratifiedShuffleSplit
from tqdm import tqdm

tqdm.pandas()

### ETL related functions

In [None]:
# Ordered (pattern, replacement) pairs. The order matters: later rules operate
# on the output of the earlier ones.
_CLEANING_RULES = (
    (r"[^A-Za-z0-9^,!.\/'+-=]", " "),
    (r"what's", "what is "),
    (r"\'s", " "),
    (r"\'ve", " have "),
    (r"can't", "cannot "),
    (r"n't", " not "),
    (r"i'm", "i am "),
    (r"\'re", " are "),
    (r"\'d", " would "),
    (r"\'ll", " will "),
    (r",", " "),
    (r"\.", " "),
    (r"!", " ! "),
    (r"\/", " "),
    (r"\^", " ^ "),
    (r"\+", " + "),
    (r"\-", " - "),
    (r"\=", " = "),
    (r"'", " "),
    (r"(\d+)(k)", r"\g<1>000"),
    (r":", " : "),
    (r" e g ", " eg "),
    (r" b g ", " bg "),
    (r" u s ", " american "),
    (r"\0s", "0"),
    (r" 9 11 ", "911"),
    (r"e - mail", "email"),
    (r"j k", "jk"),
    (r"\s{2,}", " "),
)


def clean_text(text):
    """
    Normalise a raw text: cast to string, lower-case it and apply a fixed,
    ordered series of regex substitutions (character filtering, contraction
    expansion, punctuation spacing, whitespace collapsing).

    :param text: The raw text. Any object is accepted; it is passed through str().
    :return: The cleaned text as a single string.
    """
    cleaned = str(text).lower()

    for pattern, replacement in _CLEANING_RULES:
        cleaned = re.sub(pattern, replacement, cleaned)

    return cleaned

### Pre-trained Embeddings (GLOVE) related functions

In [None]:
def load_glove_embeddings(glove_dir: Path,
                          dim: int = 100) -> dict:
    """
    Load the GloVe vectors of the requested size from a text file into memory.

    :param glove_dir: Directory that contains the `glove.6B.{dim}d.txt` file.
    :param dim: The embeddings size (dimensions)
    :return: A dictionary mapping each word to its float32 embedding array.
    """
    embed_index = dict()  # word -> embedding

    fname = glove_dir.joinpath(f'glove.6B.{dim}d.txt')

    # The context manager guarantees the file is closed even if parsing fails.
    with open(fname, encoding="utf8") as f:
        # Each line holds one embedding: the word followed by its values.
        for line in tqdm(f, desc='Loading Embeddings', unit='word'):
            values = line.split()

            # Skip blank lines instead of failing on values[0]
            if not values:
                continue

            embed_index[values[0]] = np.asarray(values[1:], dtype='float32')

    print(f'Found {len(embed_index)} word vectors.')

    return embed_index

In [None]:
def create_embeddings_matrix(emb_index: dict,
                             max_words: int,
                             tokenizer: "Tokenizer",
                             emb_dim: int = 100) -> np.ndarray:
    """
    Build the weight matrix for an embedding layer from pre-trained vectors.

    Row `i` holds the pre-trained vector of the word whose tokenizer index
    is `i`. Rows without a pre-trained vector (unknown words, and index 0 if
    the tokenizer never assigns it) keep a random initialisation drawn from a
    normal distribution with the mean / std of all pre-trained values.

    :param emb_index: Embeddings Index (word -> vector).
    :param max_words: Upper bound on the number of rows (vocabulary size cap).
    :param tokenizer: Keras fitted tokenizer. Only `word_index` is read.
    :param emb_dim: Embeddings dimension.
    :return: A matrix of shape (nb_words, emb_dim) containing the GloVe embeddings,
        where nb_words = min(max_words, len(word_index) + 1).
    """
    assert emb_dim in [50, 100, 200, 300]

    # np.stack needs a real sequence; a dict view is rejected by recent NumPy.
    all_embs = np.stack(list(emb_index.values()))

    emb_mean = all_embs.mean()
    emb_std = all_embs.std()

    print(f"Embeddings AVG: {emb_mean} | STD: {emb_std}")

    word_index = tokenizer.word_index

    # Word indices are used directly as row numbers and start at 1, so a
    # vocabulary of N words needs N + 1 rows (unless capped by max_words).
    nb_words = min(max_words, len(word_index) + 1)

    # Random matrix with the same mean and std as the pre-trained embeddings
    embedding_matrix = np.random.normal(emb_mean,
                                        emb_std,
                                        (nb_words, emb_dim))

    for word, i in word_index.items():
        # Indices beyond the matrix belong to words we do not keep
        if i >= nb_words:
            continue

        embedding_vector = emb_index.get(word)

        # Words without a pre-trained vector keep their random row
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector

    return embedding_matrix

### Siamese model implementation related functions

In [None]:
def exponent_neg_manhattan_distance(left, right):
    """
    Similarity estimate between two LSTM outputs: exp(-L1 distance).

    :param left: Tensor with the output of the left branch.
    :param right: Tensor with the output of the right branch.
    :return: exp(-sum(|left - right|)), summed over axis 1 with that axis kept.
    """
    absolute_difference = K.abs(left - right)
    manhattan_distance = K.sum(absolute_difference, axis=1, keepdims=True)

    return K.exp(-manhattan_distance)

<img src="https://miro.medium.com/max/900/1*SZM2gDnr-OTx9ytVKQEuOg.png">

In [None]:
def build_model(max_seq_len,
                emb_matrix,
                emb_dim,
                grad_clip_norm,
                n_hidden: int = 50) -> Model:
    """
    Build and compile the siamese LSTM classifier.

    Both inputs go through the same frozen embedding layer and the same LSTM.
    The element-wise absolute difference of the two encodings feeds a single
    sigmoid unit.

    :param max_seq_len: Length of the (padded) input sequences.
    :param emb_matrix: Pre-trained embedding weights; its number of rows is
        used as the vocabulary size of the embedding layer.
    :param emb_dim: Size of each embedding vector.
    :param grad_clip_norm: Gradient clipping norm passed to the optimizer.
    :param n_hidden: Number of units of the shared LSTM.
    :return: The compiled Keras model.
    """
    # The visible layer
    left_input = Input(shape=(max_seq_len,),
                       dtype='int32',
                       name='left_input')

    right_input = Input(shape=(max_seq_len,),
                        dtype='int32',
                        name='right_input')

    # Pre-trained weights are kept frozen (trainable=False)
    embedding_layer = Embedding(len(emb_matrix),
                                emb_dim,
                                weights=[emb_matrix],
                                input_length=max_seq_len,
                                trainable=False,
                                name='emb_layer')

    # Embedded version of the inputs
    encoded_left = embedding_layer(left_input)
    encoded_right = embedding_layer(right_input)

    # Since this is a siamese network, both sides share the same LSTM
    shared_lstm = LSTM(n_hidden, name='shared_lstm')

    left_output = shared_lstm(encoded_left)
    right_output = shared_lstm(encoded_right)

    # Custom layer computing the absolute difference between the encodings.
    # A trainable sigmoid unit on top of it replaces the fixed
    # exp(-manhattan distance) similarity.
    l1_layer = Lambda(lambda tensors: K.abs(tensors[0] - tensors[1]))

    l1_distance = l1_layer([left_output, right_output])

    output = Dense(units=1, activation='sigmoid',
                   name='output')(l1_distance)

    # Pack it all up into a model
    malstm = Model(inputs=[left_input,
                           right_input],
                   outputs=[output])

    # Adam optimizer, with gradient clipping by norm
    optimizer = Adam(clipnorm=grad_clip_norm)

    malstm.compile(loss='binary_crossentropy',
                   optimizer=optimizer,
                   metrics=['accuracy'])

    # summary() prints by itself and returns None, so it must not be wrapped in print()
    malstm.summary()

    return malstm

### Configuration (Hyper-parameters)

In [None]:
class Config:
    """Hyper-parameters shared by the pre-processing and the model cells."""

    # Vocabulary size cap (tokenizer `num_words` / embedding matrix rows)
    MAX_FEATURES = 20000

    # Pre-trained embeddings size
    EMB_DIMENSIONS = 300

    # Units of the shared LSTM
    N_HIDDEN = 50

    # Optimizer gradient clipping norm
    GRADIENT_CLIPPING_NORM = 1.2

    # Training loop settings
    BATCH_SIZE = 128
    NB_EPOCHS = 25

In [None]:
# Download the pre-trained GloVe embeddings archive
!wget http://nlp.stanford.edu/data/glove.6B.zip
# Unzip the GloVe embeddings
!unzip glove.6B.zip

In [None]:
# Extract the Quora train dataset (train.csv.zip must be in the working directory)
! unzip train.csv.zip

## Example

### Load Data

In [None]:
# Read the extracted question pairs into a DataFrame
data = pd.read_csv(filepath_or_buffer='train.csv')

In [None]:
# Show one example question pair. Both questions are printed explicitly,
# because a bare expression is only displayed when it is the last one of a cell.
sample_pair = data.iloc[5]
print(sample_pair['question1'])
print(sample_pair['question2'])
data.head()

#### Clean Questions

In [None]:
# Clean both question columns in place (with a progress bar)
for question_col in ('question1', 'question2'):
    data[question_col] = data[question_col].progress_apply(clean_text)

In [None]:
# Features: the two question texts. Target: the duplicate flag.
feature_columns = ['question1', 'question2']
target_column = 'is_duplicate'

X = data[feature_columns]
y = data[target_column]

#### Stratified Split

In [None]:
# We will use this object to split in Train - Validation in a stratified manner.
val_sss = StratifiedShuffleSplit(n_splits=5, test_size=0.2, random_state=0)

# Only the last of the five generated splits is used, so the frames are
# indexed once instead of once per split.
*_, (train_index, val_index) = val_sss.split(X, y)

# split() yields positional indices, hence .iloc. The feature frames are
# copied because new columns are added to them later on.
X_train, X_val = X.iloc[train_index].copy(), X.iloc[val_index].copy()
y_train, y_val = y.iloc[train_index], y.iloc[val_index]

# Class balance of each part
print('Train')
print(y_train.value_counts() / y_train.count())
print('Validation')
print(y_val.value_counts() / y_val.count())

### Create Tokenizer

In [None]:
max_features = Config.MAX_FEATURES

tokenizer = Tokenizer(num_words=max_features, oov_token='<OOV>')

# The vocabulary is learned from the train questions only
train_texts = X_train['question1'].tolist() + X_train['question2'].tolist()
tokenizer.fit_on_texts(train_texts)

#### Data Vectorization

In [None]:
# Convert every train question to its sequence of token ids
for question_col in ('question1', 'question2'):
    X_train[f'{question_col}_seqs'] = tokenizer.texts_to_sequences(X_train[question_col])

In [None]:
# Convert every validation question to its sequence of token ids
for question_col in ('question1', 'question2'):
    X_val[f'{question_col}_seqs'] = tokenizer.texts_to_sequences(X_val[question_col])

In [None]:
# Lengths of ALL train sequences, i.e. of both question columns
all_train_lengths = (list(X_train['question1_seqs'].apply(len))
                     + list(X_train['question2_seqs'].apply(len)))

# The 90th percentile is used as the common (padded / truncated) length
max_len = int(np.percentile(all_train_lengths, q=90))
print(f'Max Length: {max_len}')

print(X_train.sample(5))

In [None]:
# Every set is padded / truncated at the end, to the same length
pad_kwargs = dict(maxlen=max_len,
                  padding='post',
                  truncating='post')

X_train_q1 = pad_sequences(X_train['question1_seqs'], **pad_kwargs)
X_train_q2 = pad_sequences(X_train['question2_seqs'], **pad_kwargs)

X_val_q1 = pad_sequences(X_val['question1_seqs'], **pad_kwargs)
X_val_q2 = pad_sequences(X_val['question2_seqs'], **pad_kwargs)

In [None]:
# Make sure everything is ok: left / right inputs must have the same shape ...
assert X_train_q1.shape == X_train_q2.shape
assert X_val_q1.shape == X_val_q2.shape

# ... and every input must have exactly one label, for train and validation alike
assert len(X_train_q1) == len(y_train)
assert len(X_train_q2) == len(y_train)

assert len(X_val_q1) == len(y_val)
assert len(X_val_q2) == len(y_val)

### Load pre-trained embeddings

In [None]:
# The GloVe text files are expected in the current working directory
gl_dir = Path('.')
embedding_size = Config.EMB_DIMENSIONS

glove_embeddings = load_glove_embeddings(gl_dir, embedding_size)

emb_matrix = create_embeddings_matrix(glove_embeddings,
                                      max_features,
                                      tokenizer,
                                      embedding_size)
print(emb_matrix.shape)

### Siamese LSTM Model

#### Build Model

In [None]:
# Gather the model settings first, then build the network
model_params = dict(max_seq_len=max_len,
                    emb_matrix=emb_matrix,
                    emb_dim=Config.EMB_DIMENSIONS,
                    grad_clip_norm=Config.GRADIENT_CLIPPING_NORM,
                    n_hidden=Config.N_HIDDEN)

model = build_model(**model_params)

#### Load previous weights

In [None]:
weights_fname = 'quora_siamese_lstm_weights.h5'

# Resume from a previous run when a weights file exists. Loading stays
# best-effort, but only load failures are handled (a bare except would also
# swallow e.g. KeyboardInterrupt) and the real reason is reported.
if Path(weights_fname).is_file():
    try:
        model.load_weights(weights_fname)
    except (OSError, ValueError) as err:
        print(f'Could not load pre-trained weights ({err}). Fitting from start')
else:
    print('Pre-trained weights not found. Fitting from start')

#### Create Callbacks

In [None]:
monitor_metric = 'val_loss'

# Stop when the monitored metric has not improved for 3 epochs and roll back
# to the best weights
early_stopping = EarlyStopping(monitor=monitor_metric,
                               patience=3,
                               verbose=1,
                               restore_best_weights=True)

# Persist the weights (only) each time the monitored metric improves
checkpoint = ModelCheckpoint(filepath=weights_fname,
                             monitor=monitor_metric,
                             verbose=1,
                             save_best_only=True,
                             save_weights_only=True)

callbacks = [early_stopping, checkpoint]

#### Fit Model

In [None]:
# Inputs are keyed by the names of the model's input layers
train_inputs = {'left_input': X_train_q1,
                'right_input': X_train_q2}

val_inputs = {'left_input': X_val_q1,
              'right_input': X_val_q2}

s = time()
history = model.fit(train_inputs,
                    y_train,
                    batch_size=Config.BATCH_SIZE,
                    epochs=Config.NB_EPOCHS,
                    validation_data=(val_inputs, y_val),
                    verbose=1,
                    callbacks=callbacks)

# Wall-clock duration of the whole fit
duration = time() - s
print(f"Training time finished. Duration {duration} secs")