<a href="https://colab.research.google.com/github/ujjalkumarmaity/research-paper-implementation/blob/main/Manhattan-LSTM-Model/Manhattan-LSTM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Siamese Recurrent Architectures for Learning Sentence Similarity

https://ojs.aaai.org/index.php/AAAI/article/view/10350/10209

<!-- ![image.png](attachment:image.png) -->
<img src = 'model.png'>

In [1]:
import pandas as pd
import numpy as np
import keras
from keras.preprocessing.text import Tokenizer
from keras.utils import pad_sequences
from keras.models import Model
from keras import layers
from keras import backend as K
import random
from sklearn.model_selection import train_test_split
from keras.optimizers import SGD
import tensorflow as tf

In [2]:
df = pd.read_excel('https://github.com/ujjalkumarmaity/research-paper-implementation/raw/main/Manhattan-LSTM-Model/resume-similarity-dataset.xlsx')

In [3]:
df.head(3)

Unnamed: 0.1,Unnamed: 0,resume1,label1,resume2,label2
0,0,Skills * Programming Languages: Python (pandas...,Data Science,Skills * Programming Languages: Python (pandas...,Data Science
1,1,Skills * Programming Languages: Python (pandas...,Data Science,Education Details _x000D_\nMay 2013 to May 201...,Data Science
2,2,Skills * Programming Languages: Python (pandas...,Data Science,"Areas of Interest Deep Learning, Control Syste...",Data Science


In [4]:
df = df[['resume1','resume2','label1','label2']]

In [5]:
!wget http://nlp.stanford.edu/data/glove.6B.zip
!unzip glove*.zip

--2023-09-02 11:00:18--  http://nlp.stanford.edu/data/glove.6B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/data/glove.6B.zip [following]
--2023-09-02 11:00:18--  https://nlp.stanford.edu/data/glove.6B.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip [following]
--2023-09-02 11:00:18--  https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 862182613 (822M) [application/zip]
Saving to: ‘glove.6B.zip’


202

In [6]:
def load_embeddings(file_name):
    embeddings_index = {}
    f = open(file_name, encoding='utf-8')
    for line in f:
        word = line.split()
        embeddings_index[word[0]] = np.asarray(word[1:],dtype='float32')
    return embeddings_index
file_name = '/content/glove.6B.100d.txt'
embdding_mat = load_embeddings(file_name)

In [7]:
embdding_mat['python']

array([ 0.24934  ,  0.68318  , -0.044711 , -1.3842   , -0.0073079,
        0.651    , -0.33958  , -0.19785  , -0.33925  ,  0.26691  ,
       -0.033062 ,  0.15915  ,  0.89547  ,  0.53999  , -0.55817  ,
        0.46245  ,  0.36722  ,  0.1889   ,  0.83189  ,  0.81421  ,
       -0.11835  , -0.53463  ,  0.24158  , -0.038864 ,  1.1907   ,
        0.79353  , -0.12308  ,  0.6642   , -0.77619  , -0.45713  ,
       -1.054    , -0.20557  , -0.13296  ,  0.12239  ,  0.88458  ,
        1.024    ,  0.32288  ,  0.82105  , -0.069367 ,  0.024211 ,
       -0.51418  ,  0.8727   ,  0.25759  ,  0.91526  , -0.64221  ,
        0.041159 , -0.60208  ,  0.54631  ,  0.66076  ,  0.19796  ,
       -1.1393   ,  0.79514  ,  0.45966  , -0.18463  , -0.64131  ,
       -0.24929  , -0.40194  , -0.50786  ,  0.80579  ,  0.53365  ,
        0.52732  ,  0.39247  , -0.29884  ,  0.009585 ,  0.99953  ,
       -0.061279 ,  0.71936  ,  0.32901  , -0.052772 ,  0.67135  ,
       -0.80251  , -0.25789  ,  0.49615  ,  0.48081  , -0.6840

In [8]:
def prepare_dataset(df):
    # df = pd.read_csv(path, sep='\t', lineterminator='\n',header=None)
    # df = df.rename(columns = {'resume1':'name1','resume2':'name2',3:'label'})
    x1 = []
    x2 = []
    label = []
    resume1 = df.resume1.values.tolist()
    resume2 = df.resume2.values.tolist()
    label1 = df.label1.values.tolist()
    label2 = df.label2.values.tolist()
    for ind,(n1,n2) in enumerate(zip(resume1,resume2)):
        if random.random()>0.5:
            x1.append(n1)
            x2.append(n2)
        else:
            x1.append(n2)
            x2.append(n1)
        if label1[ind]==label2[ind]:
            label.append(1)
        else:
            label.append(0)
    return pd.DataFrame({'resume1':x1,'resume2':x2,'label':label})

In [15]:
data = prepare_dataset(df)

In [16]:
data.label.value_counts()

0    883328
1     42116
Name: label, dtype: int64

In [23]:
# down sample
from sklearn.utils import resample
match_label_data = data.loc[data.label==1]
no_match_label_data = data.loc[data.label==0]
no_match_label_data = resample(no_match_label_data,replace=True,n_samples=70000,random_state=13)
df1 = pd.concat([match_label_data,no_match_label_data])

In [24]:
df1 = df1.reset_index(drop=True)
df1.head(2)

Unnamed: 0,resume1,resume2,label
0,Skills * Programming Languages: Python (pandas...,Skills * Programming Languages: Python (pandas...,1
1,Skills * Programming Languages: Python (pandas...,Education Details _x000D_\nMay 2013 to May 201...,1


In [25]:
import re
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
stop_word = list(stopwords.words('english'))
stop_word.extend(['skills'])
def per_processing_data(text):
    text = text.lower() #
    text = re.sub(r'http\S+','',text) # remove url
    text = re.sub(r'[^\w ]+', " ", text) # remove spacial charecter
    text = ' '.join(text.split()) # remove white space
    text = ' '.join([i for i in text.split() if i not in stop_word]) #remove stopword
    return text
text = 'Skills * Programming Languages: Python (pandas'
per_processing_data(text)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


'programming languages python pandas'

In [26]:
df1['resume1'] = df1['resume1'].apply(per_processing_data)
df1['resume2'] = df1['resume2'].apply(per_processing_data)

In [30]:
df1.to_excel('Manhattan-LSTM.xlsx',index=False)

In [29]:
df1.to_csv('Manhattan-LSTM.csv',index=False)

In [31]:
MAX_SEQ_LEN = 1000

In [32]:
def prepere_training_data(df,tokenizer):
    name1_seq = tokenizer.texts_to_sequences(df['resume1'])
    name2_seq = tokenizer.texts_to_sequences(df['resume2'])

    name1_seq = pad_sequences(name1_seq,maxlen=MAX_SEQ_LEN)
    name2_seq = pad_sequences(name2_seq,maxlen=MAX_SEQ_LEN)
    return name1_seq,name2_seq,df['label'].values

tokenizer = Tokenizer()
tokenizer.fit_on_texts(df1['resume1'])
train,test = train_test_split(df1,test_size=0.2,stratify = df1['label'])
train_text2seq_1,train_text2seq_2,train_label = prepere_training_data(train,tokenizer)
test_text2seq_1,test_text2seq_2,test_label = prepere_training_data(test,tokenizer)


In [None]:
train_text2seq_1[0]

In [40]:
vocab_size = len(tokenizer.word_index)+1
embdding_matrix = np.zeros((vocab_size,100))
for word,ind in tokenizer.word_index.items():
    if embdding_mat.get(word)is not None:
        embdding_matrix[ind] = embdding_mat.get(word)

## Model

In [48]:
import keras
num_word = len(tokenizer.word_index)+1
def euclidean_distance(vects):
    x, y = vects
    sum_square = tf.math.reduce_sum(tf.math.square(x - y), axis=1, keepdims=True)
    return tf.math.sqrt(tf.math.maximum(sum_square, tf.keras.backend.epsilon()))

inp_seq = layers.Input(shape=(MAX_SEQ_LEN,))
x = layers.Embedding(input_dim=vocab_size, output_dim=100, embeddings_initializer=keras.initializers.Constant(embdding_matrix), trainable=False)(inp_seq)

# x = layers.Embedding(num_word,output_dim=16,mask_zero=False)(inp_seq)
x = layers.BatchNormalization()(x)
x = layers.Bidirectional(layers.LSTM(128,return_sequences=True))(x)
x = layers.Bidirectional(layers.LSTM(128,return_sequences=True))(x)
x = layers.GlobalAveragePooling1D()(x)
x = layers.Dense(512)(x)

embed_network = keras.Model(inp_seq, x)

inp_seq1 = layers.Input(shape=(MAX_SEQ_LEN,))
inp_seq2 = layers.Input(shape=(MAX_SEQ_LEN,))

network1 = embed_network(inp_seq1)
network2 = embed_network(inp_seq2)


merge = layers.Lambda(euclidean_distance)([network1, network2])
merge = layers.BatchNormalization()(merge)
out = layers.Dense(1,activation='sigmoid')(merge)

model = Model(inputs=[inp_seq1,inp_seq2],outputs = out)
opt = SGD(lr=0.01)
# model.compile(optimizer=opt,loss='binary_crossentropy',metrics='acc')
# model.fit([train_text2seq_1,train_text2seq_2,],train_label,epochs=5,batch_size=64,verbose=1,
#           validation_data=([test_text2seq_1,test_text2seq_2],test_label))

In [49]:
def loss(margin=1):
    """Provides 'contrastive_loss' an enclosing scope with variable 'margin'.

    Arguments:
        margin: Integer, defines the baseline for distance for which pairs
                should be classified as dissimilar. - (default is 1).

    Returns:
        'contrastive_loss' function with data ('margin') attached.
    """

    # Contrastive loss = mean( (1-true_value) * square(prediction) +
    #                         true_value * square( max(margin-prediction, 0) ))
    def contrastive_loss(y_true, y_pred):
        """Calculates the contrastive loss.

        Arguments:
            y_true: List of labels, each label is of type float32.
            y_pred: List of predictions of same length as of y_true,
                    each label is of type float32.

        Returns:
            A tensor containing contrastive loss as floating point value.
        """
        y_pred = tf.cast(y_pred, tf.float32)
        y_true = tf.cast(y_true, tf.float32)


        square_pred = tf.math.square(y_pred)
        margin_square = tf.math.square(tf.math.maximum(margin - (y_pred), 0))
        return tf.math.reduce_mean(
            (1 - y_true) * square_pred + (y_true) * margin_square
        )

    return contrastive_loss


In [50]:
margin =1
model.compile(loss=loss(margin=margin), optimizer="adam", metrics=["accuracy"])#RMSprop
model.summary()


Model: "model_3"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_7 (InputLayer)           [(None, 1000)]       0           []                               
                                                                                                  
 input_8 (InputLayer)           [(None, 1000)]       0           []                               
                                                                                                  
 model_2 (Functional)           (None, 512)          1515020     ['input_7[0][0]',                
                                                                  'input_8[0][0]']                
                                                                                                  
 lambda_1 (Lambda)              (None, 1)            0           ['model_2[0][0]',          

In [None]:
model.fit([train_text2seq_1,train_text2seq_2,],train_label,epochs=5,batch_size=128,verbose=1,
          validation_data=([test_text2seq_1,test_text2seq_2],test_label))


Epoch 1/5
