# Siamese BiLSTM Neural Network with Attention

<p>A Siamese BiLSTM for sentence similarity scores is a type of deep learning model that is designed to compare two input sentences and produce a score indicating how similar or dissimilar they are.</p>
<p>The Siamese BiLSTM architecture consists of two identical sub-networks that take in the two input sentences separately and process them through a Bidirectional Long Short-Term Memory (BiLSTM) layer. The BiLSTM layer captures the contextual information of the input sentences by processing them in both forward and backward directions, and produces a sequence of hidden states for each sentence. The output of each BiLSTM layer is typically fed through a fully connected layer to produce a final similarity score. The fully connected layer is essentially a linear transformation that maps the BiLSTM output to a scalar score</p>
<p>Various loss functions are applied:
    <li> MSE loss - such that loss can be differentiable </li>
    <li> Pearson Loss - assumes the linear relationship </li>
</p>
<p>During training, the model learns to adjust its parameters to minimize the difference between the predicted similarity scores and the true similarity scores.</p>

<p>Word2vec embeddings are fed as input to the BiLSTM models</p>

   

In [18]:
from gensim.models import KeyedVectors
import pandas as pd
import pickle

In [3]:
word2vec_path = "../data/GoogleNews-vectors-negative300.bin"
word2vec = KeyedVectors.load_word2vec_format(word2vec_path, binary=True)

In [4]:
embedding_matrix = word2vec.vectors

In [5]:
word2idx = {word: i for i, word in enumerate(word2vec.index_to_key)}

In [367]:
df = pd.read_csv('../data/cleaned_train_df1.csv')

In [368]:
val_df = pd.read_csv('../data/cleaned_val_df1.csv')
test_df = pd.read_csv('../data/cleaned_test_df1.csv')

In [369]:
df.head()

Unnamed: 0,sent1,sent2,score
0,"['plane', 'taking']","['air', 'plane', 'taking']",5.0
1,"['man', 'playing', 'large', 'flute']","['man', 'playing', 'flute']",3.8
2,"['man', 'spreading', 'shreded', 'cheese', 'piz...","['man', 'spreading', 'shredded', 'cheese', 'un...",3.8
3,"['three', 'men', 'playing', 'chess']","['two', 'men', 'playing', 'chess']",2.6
4,"['man', 'playing', 'cello']","['man', 'seated', 'playing', 'cello']",4.25


In [370]:
df['sent1'] = df['sent1'].apply(eval)
df['sent2'] = df['sent2'].apply(eval)

In [371]:
sent1 = list(df['sent1'])
sent2 = list(df['sent2'])

In [372]:
with open('../data/word_dict_v1.pickle', 'rb') as f:
    vocab = pickle.load(f)

In [373]:
vocab_list = list(vocab.keys())

In [374]:
len(vocab_list)

10072

In [375]:
vocab_list.append("unk")

In [376]:
len(vocab_list)

10073

In [377]:
vocab_list = [i for i in vocab_list if i in word2idx]

In [378]:
len(vocab_list)

8500

In [379]:
vocab_dict = {k:i for i,k in enumerate(vocab_list)}

In [380]:
len(vocab_dict)

8500

In [381]:
vocab_dict['unk']

8499

In [382]:
word2idx_trunc = {}
for i in list(vocab_dict.keys()):
  word2idx_trunc[vocab_dict[i]] = word2idx[i]

In [383]:
word2idx['unk']

1459665

In [384]:
word2idx_trunc[8499]

1459665

In [385]:
word_indexes = list(word2idx_trunc.values())

In [386]:
subset_embedding_matrix = word2vec.vectors[word_indexes]

In [387]:
import numpy as np

In [388]:
np.array_equal(subset_embedding_matrix[8499], embedding_matrix[1459665])

True

In [441]:
import torch
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence

class MyDataset(Dataset):
    def __init__(self, sentences1, sentences2, scores, word_to_ix):
        self.sentences1 = sentences1
        self.sentences2 = sentences2
        self.scores = scores
        self.word_to_ix = word_to_ix

    def __len__(self):
        return max(len(self.sentences1),len(self.sentences2))

    def __getitem__(self, idx):
        unk_token = self.word_to_ix['unk']
        sentence1 = self.sentences1[idx]
        sentence2 = self.sentences2[idx]
        score = self.scores[idx]
        seq1 = [self.word_to_ix[word] if word in self.word_to_ix else unk_token for word in sentence1]
        seq2 = [self.word_to_ix[word] if word in self.word_to_ix else unk_token for word in sentence2]
        #seq1 = [self.word_to_ix[word] for word in sentence1 if word in self.word_to_ix]
        #seq2 = [self.word_to_ix[word] for word in sentence2 if word in self.word_to_ix]
        return seq1, seq2, score

    def collate_fn(self, batch):
        sequences1, sequences2, scores = zip(*batch)
        padded_seqs1 = pad_sequence([torch.LongTensor(seq) for seq in sequences1], batch_first=True, padding_value=0)
        padded_seqs2 = pad_sequence([torch.LongTensor(seq) for seq in sequences2], batch_first=True, padding_value=0)
        #return padded_seqs1, padded_seqs2, torch.tensor(scores, dtype=torch.float)
        return padded_seqs1, padded_seqs2, torch.LongTensor(scores)

In [442]:
sent1_tokens = list(df['sent1'])
sent2_tokens = list(df['sent2'])
scores = list(df['score'])

In [443]:
word_to_ix = vocab_dict
train_dataset = MyDataset(sent1_tokens, sent2_tokens, scores, word_to_ix)
train_dataloader = DataLoader(train_dataset, batch_size=16, shuffle=True, collate_fn=train_dataset.collate_fn)

In [444]:
val_sent1_tokens = list(val_df['sent1'])
val_sent2_tokens = list(val_df['sent2'])
val_scores = list(val_df['score'])

In [445]:
val_dataset = MyDataset(val_sent1_tokens, val_sent2_tokens, val_scores, word_to_ix)
val_dataloader = DataLoader(val_dataset, batch_size=16, shuffle=True, collate_fn=val_dataset.collate_fn)

In [446]:
test_sent1_tokens = list(test_df['sent1'])
test_sent2_tokens = list(test_df['sent2'])
test_scores = list(test_df['score'])

In [447]:
test_dataset = MyDataset(test_sent1_tokens, test_sent2_tokens, test_scores, word_to_ix)
test_dataloader = DataLoader(test_dataset, batch_size=16, shuffle=True, collate_fn=test_dataset.collate_fn)

In [396]:
len(subset_embedding_matrix)

8500

In [397]:
subset_embedding_matrix.shape

(8500, 300)

In [522]:
import torch
import torch.nn as nn


class SiameseBiLSTM(nn.Module):
    def __init__(self, hidden_size, num_layers, embedding_dim, embd_matrix, dropout=0.2):
        super(SiameseBiLSTM, self).__init__()

        # LSTM parameters
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.embedding_dim = embedding_dim
        self.embd_matrix = embd_matrix

        # Word embeddings
        self.word_embeddings = nn.Embedding(len(embd_matrix), embedding_dim)
        self.word_embeddings.weight = nn.Parameter(torch.from_numpy(self.embd_matrix))
        self.word_embeddings.weight.requires_grad = False

        # BiLSTM layers
        self.bilstm = nn.LSTM(input_size=embedding_dim, hidden_size=hidden_size, num_layers=num_layers,
                              batch_first=True, bidirectional=True)

        # Dropout layer
        self.dropout = nn.Dropout(dropout)

        # Attention layers
        self.attention_fc = nn.Linear(hidden_size * 2, 1)
        self.attention_softmax = nn.Softmax(dim=1)

        # Similarity scoring layer
        self.fc = nn.Linear(hidden_size * 4, 1)  # 4 because we concatenate forward and backward hidden states of both LSTMs

    def forward_once(self, sentence):
        # Word embeddings
        embeds = self.word_embeddings(sentence)

        # BiLSTM
        lstm_out, _ = self.bilstm(embeds)

        # Apply dropout to hidden layers
        lstm_out = self.dropout(lstm_out)

        # Attention mechanism
        attention_weights = self.attention_softmax(self.attention_fc(lstm_out))
        lstm_out = lstm_out * attention_weights
        lstm_out = lstm_out.sum(dim=1)

        return lstm_out

    def forward(self, sentence1, sentence2):
        # Process sentence 1
        output1 = self.forward_once(sentence1)

        # Process sentence 2
        output2 = self.forward_once(sentence2)

        # Concatenate outputs of both LSTMs
        concatenated = torch.cat((output1, output2), dim=1)

        # Pass through similarity scoring layer
        similarity_score = torch.sigmoid(self.fc(concatenated))

        return similarity_score


In [523]:
import torch
from scipy.stats import pearsonr

class PearsonLoss(nn.Module):
    def __init__(self):
        super(PearsonLoss, self).__init__()
        
    def forward(self, pred, target):
        pred = pred.view(-1)
        target = target.view(-1)
        pearson_r, _ = pearsonr(pred.detach().cpu().numpy(), target.detach().cpu().numpy())
        loss = 1 - pearson_r
        return torch.tensor(loss, requires_grad=True, device=pred.device)


In [524]:
# Define model and optimizer
model = SiameseBiLSTM(hidden_size=50, num_layers=2, embedding_dim=300, embd_matrix = subset_embedding_matrix)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# Define loss function
#criterion = nn.MSELoss()
criterion = PearsonLoss()

num_epochs = 10

# Train model
for epoch in range(num_epochs):
    epoch_loss = 0.0
    for i, (sentence1, sentence2, score) in enumerate(train_dataloader):
        # Convert inputs and output to PyTorch tensors
        sentence1_tensor = sentence1
        sentence2_tensor = sentence2
        score_tensor = torch.tensor(score, dtype=torch.float)/5.0
        
        # Zero gradients
        optimizer.zero_grad()
        
        # Forward pass
        output = model(sentence1_tensor, sentence2_tensor)
        
        #print(score_tensor.squeeze().shape)
        # Compute loss
        #loss = criterion(outputs, score_tensor.unsqueeze(-1))
        loss = criterion(output.squeeze(), score_tensor.squeeze())
        #loss = model.loss(output, score_tensor.unsqueeze(-1))
        
        # Backward pass
        loss.backward()
        
        # Update weights
        optimizer.step()
        
        epoch_loss += loss.item()  # add batch loss to total epoch loss
        
     # Validation loop
    model.eval()  # set model to evaluation mode
    total_val_loss = 0
    with torch.no_grad():
        for j, (val_sentence1, val_sentence2, val_score) in enumerate(val_dataloader):
            val_sentence1_tensor = val_sentence1
            val_sentence2_tensor = val_sentence2
            val_score_tensor = torch.tensor(val_score, dtype=torch.float)/5.0
            outputs = model(val_sentence1_tensor, val_sentence2_tensor)
            #val_loss = criterion(outputs, val_score_tensor.unsqueeze(-1))
            val_loss = criterion(outputs.squeeze(), val_score_tensor.squeeze())
            #val_loss = model.loss(outputs, val_score_tensor.unsqueeze(-1))
            total_val_loss += val_loss.item()
            
    avg_train_loss = epoch_loss / len(train_dataloader) 
    avg_val_loss = total_val_loss / len(val_dataloader)
    print('Epoch [{}/{}], Train Loss: {:.4f}, Val Loss: {:.4f}'.format(epoch+1, num_epochs, avg_train_loss, avg_val_loss))

  return padded_seqs1, padded_seqs2, torch.LongTensor(scores)
  score_tensor = torch.tensor(score, dtype=torch.float)/5.0
  val_score_tensor = torch.tensor(val_score, dtype=torch.float)/5.0


Epoch [1/10], Train Loss: 0.7830, Val Loss: 0.8916
Epoch [2/10], Train Loss: 0.7847, Val Loss: 0.8830
Epoch [3/10], Train Loss: 0.7799, Val Loss: 0.8753
Epoch [4/10], Train Loss: 0.7798, Val Loss: 0.8873
Epoch [5/10], Train Loss: 0.7750, Val Loss: 0.8754
Epoch [6/10], Train Loss: 0.7777, Val Loss: 0.8769
Epoch [7/10], Train Loss: 0.7726, Val Loss: 0.8778
Epoch [8/10], Train Loss: 0.7726, Val Loss: 0.8916
Epoch [9/10], Train Loss: 0.7816, Val Loss: 0.8686
Epoch [10/10], Train Loss: 0.7775, Val Loss: 0.8709


In [540]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
test_predictions = []
test_labels = []
model.eval()  # set model to evaluation mode
with torch.no_grad():
    for k, (test_sentence1, test_sentence2, test_score) in enumerate(test_dataloader):
        test_sentence1_tensor = test_sentence1
        test_sentence2_tensor = test_sentence2
        test_score_tensor = torch.tensor(test_score, dtype=torch.float)/5.0
        test_output = model(test_sentence1_tensor, test_sentence2_tensor)
        test_predictions.extend(test_output.tolist())
        test_labels.extend(test_score)
test_predictions = np.array(test_predictions)
test_labels = np.array(test_labels)
test_mse = mean_squared_error(test_labels, test_predictions)
print('Test MSE: {:.4f}'.format(test_mse))

  return padded_seqs1, padded_seqs2, torch.LongTensor(scores)
  test_score_tensor = torch.tensor(test_score, dtype=torch.float)/5.0


Test MSE: 5.6369


In [401]:
test_predictions

array([[0.68017125],
       [0.61859626],
       [0.62256104],
       ...,
       [0.59949923],
       [0.68392903],
       [0.64198142]])

In [362]:
len(test_labels)

1379

In [363]:
test_predictions.shape, test_labels.shape

((1379, 1), (1379,))

In [364]:
from scipy.stats import pearsonr

def pearson_corr(y_true, y_pred):
    """
    Calculate Pearson correlation coefficient between two arrays.
    """
    
    corr, _ = pearsonr(y_true, y_pred)
    return corr

In [541]:
corr = pearson_corr(test_labels, test_predictions.ravel())

In [542]:
corr

0.1299918837761417

In [209]:
len(vocab_dict)

8500

In [210]:
vocab_dict['unk']

8499

In [543]:
torch.save(model, "../data/siamese_model_v1.pt")