In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
import spacy
import nltk

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from tensorflow.keras.layers import Input, Embedding, LSTM, Lambda, Dense, Dropout
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.metrics import binary_accuracy
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from nltk.stem import WordNetLemmatizer

# Data Pre-processing

## Read data from tsv file
Link to the MSRP data: https://drive.google.com/file/d/1EMx67iZn5fsn4FJ1B2rUscEDhPj2zfwH/view?usp=sharing

In [None]:
# Read the tab-separated sentence-pair file. Lines the parser cannot handle are
# dropped (on_bad_lines='skip') instead of raising.
data = pd.read_csv(
    '/content/data.tsv',
    sep='\t',
    on_bad_lines='skip',
)
data

Unnamed: 0,Quality,#1 ID,#2 ID,#1 String,#2 String
0,1,702876,702977,"Amrozi accused his brother, whom he called ""th...","Referring to him as only ""the witness"", Amrozi..."
1,0,2108705,2108831,Yucaipa owned Dominick's before selling the ch...,Yucaipa bought Dominick's in 1995 for $693 mil...
2,1,1330381,1330521,They had published an advertisement on the Int...,"On June 10, the ship's owners had published an..."
3,0,3344667,3344648,"Around 0335 GMT, Tab shares were up 19 cents, ...","Tab shares jumped 20 cents, or 4.6%, to set a ..."
4,1,1236820,1236712,"The stock rose $2.11, or about 11 percent, to ...",PG&E Corp. shares jumped $1.63 or 8 percent to...
...,...,...,...,...,...
5572,1,1620264,1620507,"At this point, Mr. Brando announced: 'Somebody...","Brando said that ""somebody ought to put a bull..."
5573,0,1848001,1848224,"Martin, 58, will be freed today after serving ...",Martin served two thirds of a five-year senten...
5574,1,747160,747144,We have concluded that the outlook for price s...,"In a statement, the ECB said the outlook for p..."
5575,1,2539933,2539850,The notification was first reported Friday by ...,MSNBC.com first reported the CIA request on Fr...


## Stop Words Removal

In [None]:
# spaCy pipeline that remove_stopwords uses to tokenize text and read stop-word flags.
nlp = spacy.load(name="en_core_web_sm")

def remove_stopwords(text):
  """Return `text` lower-cased with every stop-word token removed.

  The text is tokenized by the module-level `nlp` pipeline; tokens flagged
  `is_stop` are discarded and the remaining token texts are lower-cased and
  joined with single spaces.
  """
  kept_tokens = []
  for token in nlp(text):
    if token.is_stop:
      continue
    kept_tokens.append(token.text.lower())
  return " ".join(kept_tokens)

In [None]:
# Strip stop words from both sentences of every pair. A pair is kept only when
# BOTH sentences could be processed; rows for which remove_stopwords raises
# ValueError/TypeError are skipped, and the number of skipped rows is reported
# once at the end instead of echoing every processed row.
valid1 = []
valid2 = []
quality = []
skipped_rows = 0

for sentence1, sentence2, label in zip(data['#1 String'], data['#2 String'], data['Quality']):
  try:
    cleaned1 = remove_stopwords(sentence1)
    cleaned2 = remove_stopwords(sentence2)
  except (ValueError, TypeError):
    skipped_rows += 1
    continue
  valid1.append(cleaned1)
  valid2.append(cleaned2)
  quality.append(label)

print("Kept {} sentence pairs, skipped {} unprocessable rows".format(len(quality), skipped_rows))

In [None]:
# Rebuild `data` from the pairs that survived stop-word removal; from here on it
# holds the cleaned sentences together with their labels.
cleaned_columns = {
    'Quality': quality,
    '#1 String': valid1,
    '#2 String': valid2,
}
data = pd.DataFrame(cleaned_columns)
data

Unnamed: 0,Quality,#1 String,#2 String
0,1,"amrozi accused brother , called "" witness "" , ...","referring "" witness "" , amrozi accused brother..."
1,0,yucaipa owned dominick selling chain safeway 1...,yucaipa bought dominick 1995 $ 693 million sol...
2,1,"published advertisement internet june 10 , off...","june 10 , ship owners published advertisement ..."
3,0,"0335 gmt , tab shares 19 cents , 4.4 % , a$ 4....","tab shares jumped 20 cents , 4.6 % , set recor..."
4,1,"stock rose $ 2.11 , 11 percent , close friday ...",pg&e corp. shares jumped $ 1.63 8 percent $ 21...
...,...,...,...
5542,1,"point , mr. brando announced : ' somebody ough...","brando said "" somebody ought bullet "" head , a..."
5543,0,"martin , 58 , freed today serving thirds - yea...",martin served thirds - year sentence manslaugh...
5544,1,concluded outlook price stability medium term ...,"statement , ecb said outlook price stability m..."
5545,1,notification reported friday msnbc .,msnbc.com reported cia request friday .


## Part Of Speech (POS) Tagging and Lemmatization

In [None]:
from nltk.corpus import wordnet

# Fetch the NLTK resources used by the tagging / lemmatization code in this cell.
for resource in ('averaged_perceptron_tagger', 'wordnet', 'punkt'):
  nltk.download(resource)

# Shared lemmatizer instance
lemmatizer = WordNetLemmatizer()

# Define function to lemmatize each word with its POS tag

# POS_TAGGER_FUNCTION : TYPE 1
def pos_tagger(nltk_tag):
    """Translate a POS tag string into the corresponding WordNet POS constant.

    Tags starting with 'J', 'V', 'N' or 'R' map to wordnet.ADJ, wordnet.VERB,
    wordnet.NOUN and wordnet.ADV respectively; any other tag yields None.
    """
    # (tag prefix, attribute name on `wordnet`), checked in this order; the
    # attribute is only looked up for the prefix that actually matches.
    prefix_to_constant = (('J', 'ADJ'), ('V', 'VERB'), ('N', 'NOUN'), ('R', 'ADV'))
    for prefix, constant_name in prefix_to_constant:
        if nltk_tag.startswith(prefix):
            return getattr(wordnet, constant_name)
    return None

def lemmatize_sentence(sentence):
  """POS-tag `sentence` and return it with every taggable word lemmatized.

  The sentence is split with nltk.word_tokenize and tagged with nltk.pos_tag.
  Each tag is mapped through pos_tagger; words whose tag maps to None are kept
  unchanged, all others are lemmatized with the mapped WordNet POS. The
  resulting tokens are joined with single spaces.
  """
  tagged_tokens = nltk.pos_tag(nltk.word_tokenize(sentence))
  lemmas = []
  for word, tag in tagged_tokens:
    wordnet_pos = pos_tagger(tag)
    if wordnet_pos is None:
      # no usable tag: keep the token as is
      lemmas.append(word)
    else:
      lemmas.append(lemmatizer.lemmatize(word, wordnet_pos))
  return " ".join(lemmas)

# POS tagging and lemmatization of both sentence columns.
# The result is assigned back as a whole column: writing through
# `data[col].loc[i] = ...` is chained assignment, which raises
# SettingWithCopyWarning and is not guaranteed to modify `data` at all.
for column in ('#1 String', '#2 String'):
  data[column] = data[column].apply(lemmatize_sentence)

In [None]:
# Working frame for the augmentation, tokenization and splitting steps.
df = pd.DataFrame(data=data)


## Data Augmentation, Sequencing, Padding, Data Splitting

In [None]:
# Data augmentation: oversample the paraphrase pairs (Quality == 1) by appending
# one extra copy of each. The filter has to use the label column; comparing the
# text column '#1 String' with the integer 1 never matches, which turned this
# step into a silent no-op.
# NOTE(review): the copies are added before the train/validation/test split, so
# identical pairs can end up on both sides of the split; consider augmenting the
# training portion only.

similar_samples = df[df['Quality'] == 1]
df = pd.concat([df, similar_samples], ignore_index = True)

In [None]:
tokenizer = Tokenizer()
# Learn the vocabulary (one integer ID per word) from every sentence of both columns.
# The two columns are stacked rather than added: `df['#1 String'] + df['#2 String']`
# concatenates each pair with no separator, so the last word of the first sentence
# and the first word of the second are fused into one token that does not occur
# in either sentence.
all_sentences = pd.concat([df['#1 String'], df['#2 String']], ignore_index=True)
tokenizer.fit_on_texts(all_sentences.tolist())

In [None]:
# Sequencing: turn every sentence into its list of word IDs

sentence1_sequences = tokenizer.texts_to_sequences(df['#1 String'])
sentence2_sequences = tokenizer.texts_to_sequences(df['#2 String'])

# The padding length is the longest *token* sequence. Measuring the sentences
# with `apply(len)` counts characters, which is several times larger than any
# token sequence and makes every padded input mostly zeros.
max_sequence_length = max(
    max(map(len, sentence1_sequences)),
    max(map(len, sentence2_sequences))
)

In [None]:
# Padding: bring both sets of sequences to max_sequence_length
X1, X2 = (
    pad_sequences(sequences, maxlen=max_sequence_length)
    for sequences in (sentence1_sequences, sentence2_sequences)
)
# Labels as a NumPy array
y = df['Quality'].values

In [None]:
# Data splitting: hold out 20% of the pairs, then halve the held-out part into
# validation and test sets (same random_state for both splits).
(X1_train, X1_val_test,
 X2_train, X2_val_test,
 y_train, y_val_test) = train_test_split(X1, X2, y, test_size=0.2, random_state=42)
(X1_val, X1_test,
 X2_val, X2_test,
 y_val, y_test) = train_test_split(X1_val_test, X2_val_test, y_val_test, test_size=0.5, random_state=42)

# Cast all three label arrays to float32
y_train, y_val, y_test = (
    labels.astype(np.float32) for labels in (y_train, y_val, y_test)
)

# Model Building

In [None]:
# Siamese network model: one input of length max_sequence_length per sentence of a pair

input1, input2 = (Input(shape=(max_sequence_length,)) for _ in range(2))

In [None]:
# Embedding input size: number of words known to the tokenizer, plus one
vocab_size = 1 + len(tokenizer.word_index)

# Each layer is created once and reused for both inputs, so the two branches share weights
embedding_layer = Embedding(vocab_size, 100)
lstm_layer = LSTM(units=100)

In [None]:
# Encode each sentence with the same embedding and LSTM layers
embedded1 = embedding_layer(input1)
encoded1 = lstm_layer(embedded1)
embedded2 = embedding_layer(input2)
encoded2 = lstm_layer(embedded2)

In [None]:
# A single Dropout layer (rate 0.5, fixed seed) applied to both encodings
dropout = Dropout(rate=0.5, seed=42)

encoded1, encoded2 = dropout(encoded1), dropout(encoded2)

In [None]:
def contrastive_loss(y_true, y_pred, margin=1):
  """Contrastive loss for a network whose output is a similarity score.

  In this notebook `y_pred` is the sigmoid output of the model, and it is
  evaluated as the probability of label 1: it is tracked with binary_accuracy
  and thresholded with `> 0.5` to predict label 1. The loss therefore has to
  push `y_pred` towards `margin` for pairs labelled 1 and towards 0 for pairs
  labelled 0. The earlier form, with the two terms swapped, trained the score
  in the opposite direction to the one the metric and threshold assume.

  Args:
    y_true: labels, 1 for a matching pair and 0 otherwise.
    y_pred: predicted similarity scores.
    margin: score that matching pairs are pulled up to (default 1).

  Returns:
    The mean loss over all elements.
  """
  # penalty for label-0 pairs: any score above 0
  square_pred = tf.square(y_pred)
  # penalty for label-1 pairs: how far the score falls short of the margin
  margin_square = tf.square(tf.maximum(margin - y_pred, 0))
  return tf.reduce_mean(y_true * margin_square + (1 - y_true) * square_pred)

In [None]:
# Element-wise absolute difference between the two sentence encodings
distance = Lambda(
    lambda pair: tf.keras.backend.abs(pair[0] - pair[1])
)([encoded1, encoded2])

In [None]:
# One sigmoid unit on top of the difference vector
output_layer = Dense(units=1, activation="sigmoid")
output = output_layer(distance)

In [None]:
# Functional model: two sentence inputs, one score output
model = Model(
    inputs=[input1, input2],
    outputs=output,
)

In [None]:
# Train with the custom contrastive loss and Adam; report binary accuracy
model.compile(
    optimizer=Adam(learning_rate=0.001),
    loss=contrastive_loss,
    metrics=[binary_accuracy],
)

## Early stopping

In [None]:
# Take the callbacks from tensorflow.keras, the same package the layers, model and
# optimizer in this notebook are imported from, instead of the standalone
# `keras` package; mixing the two can pair objects from different Keras builds.
from tensorflow.keras import callbacks

# Stop when validation binary accuracy has not improved for 8 epochs and
# restore the weights of the best epoch.
earlystopping = callbacks.EarlyStopping(monitor="val_binary_accuracy",
                                        mode="max", patience=8,
                                        restore_best_weights=True)

## Model training

In [None]:
# Up to 25 epochs, batches of 64, validated each epoch; early stopping may end training sooner
model.fit(
    x=[X1_train, X2_train],
    y=y_train,
    validation_data=([X1_val, X2_val], y_val),
    epochs=25,
    batch_size=64,
    callbacks=[earlystopping],
)

Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25


<keras.src.callbacks.History at 0x78560412eb00>

In [None]:
# Raw model scores for the held-out test pairs
y_pred = model.predict(x=[X1_test, X2_test])



In [None]:
# Collapse the scores to one dimension and binarise them: > 0.5 becomes 1, otherwise 0
y_pred = np.where(y_pred.flatten() > 0.5, 1, 0)
y_pred

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,

In [None]:
# Accuracy and F1 of the binarised predictions against the test labels
accuracy, f1 = (
    metric(y_test, y_pred) for metric in (accuracy_score, f1_score)
)

In [None]:
# Accuracy is shown as a percentage with two decimals, F1 as the raw value
accuracy_percent = accuracy * 100
print("Accuracy: {:.2f}%".format(accuracy_percent))
print("F1 Score: {:}".format(f1))

Accuracy: 66.42%
F1 Score: 0.7932584269662922


# Save model to Google Drive for future use

In [None]:
# Colab-only step: mount Google Drive at /content/drive so that paths under
# that directory can be used for saving and loading the model.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Persist the trained model as an HDF5 file on the mounted Drive.
# NOTE(review): presumably the 'Trained Models' folder has to exist already; confirm.
model_path = '/content/drive/MyDrive/Trained Models/Siamese_Networks_for_Paraphase_Identification_v2.h5'
model.save(model_path)

# Load model from Drive

In [None]:
# siamese = load_model('/content/drive/MyDrive/Trained Models/Siamese_Networks_for_Paraphase_Identification.h5', custom_objects={'contrastive_loss': contrastive_loss})