<h1> Doc2Vec embeddings </h1>
<p>Doc2Vec model is trained to generate representative embeddings of sentences and with these embeddings following approaches are applied to generate STS scores:</p>
<li>Normalized cosine similarity score </li>
<li>BiLSTM Regression neural network model</li>
<li>BiGRU Regression neural network model</li>

In [138]:
import pandas as pd
import numpy as np
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from scipy import spatial

In [29]:
train_df = pd.read_csv('../data/cleaned_train_df.csv')
val_df = pd.read_csv('../data/cleaned_val_df.csv')
test_df = pd.read_csv('../data/cleaned_test_df.csv')

In [37]:
train_df['sent1'] = train_df['sent1'].apply(eval)
train_df['sent2'] = train_df['sent2'].apply(eval)

In [51]:
val_df['sent1'] = val_df['sent1'].apply(eval)
val_df['sent2'] = val_df['sent2'].apply(eval)

In [52]:
test_df['sent1'] = test_df['sent1'].apply(eval)
test_df['sent2'] = test_df['sent2'].apply(eval)

In [39]:
total_sents_unk = list(train_df['sent1'])
total_sents_unk.extend(list(train_df['sent2']))

In [40]:
documents = []
for idx, sent in enumerate(total_sents_unk):
    documents.append(TaggedDocument(sent, [idx]))

In [41]:
documents[0:5]

[TaggedDocument(words=['a', 'plane', 'is', 'take', 'off'], tags=[0]),
 TaggedDocument(words=['a', 'man', 'is', 'play', 'a', 'larg', 'flute'], tags=[1]),
 TaggedDocument(words=['a', 'man', 'is', 'spread', 'unk', 'chees', 'on', 'a', 'pizza'], tags=[2]),
 TaggedDocument(words=['three', 'men', 'are', 'play', 'chess'], tags=[3]),
 TaggedDocument(words=['a', 'man', 'is', 'play', 'the', 'cello'], tags=[4])]

In [42]:
model = Doc2Vec(documents, vector_size=25, window=6, min_count=1, workers=1, epochs=30, alpha=0.1, min_alpha=0.001, hs=1)

In [43]:
# Generate sentence embeddings
embedding1 = model.infer_vector(train_df['sent1'][0])
embedding2 = model.infer_vector(train_df['sent2'][0])

# Compare the sentence embeddings using cosine similarity
similarity = 1 - spatial.distance.cosine(embedding1, embedding2)

In [44]:
def sts_score(sim_score):
    sts_score = (sim_score+1) * 2.5
    return sts_score

In [45]:
model.random.seed(42)
train_df['sent1_embedding'] = train_df['sent1'].apply(lambda x: model.infer_vector(x))
train_df['sent2_embedding'] = train_df['sent2'].apply(lambda x: model.infer_vector(x))

In [46]:
train_df['y_pred'] = train_df.apply(lambda x: sts_score(1 - spatial.distance.cosine(x['sent1_embedding'], x['sent2_embedding'])), axis=1)

In [47]:
y_pred = train_df['y_pred']

In [48]:
y_train = train_df['score']

In [49]:
from scipy.stats import pearsonr
def pearson_corr(y_true, y_pred):
    """
    Calculate Pearson correlation coefficient between two arrays.
    """
    corr, _ = pearsonr(y_true, y_pred)
    return corr

In [50]:
# Calculate Pearson correlation coefficient between predicted values and target values
corr = pearson_corr(y_train, y_pred)

# Print the correlation coefficient
print("Pearson correlation coefficient: {:.2f}".format(corr))

Pearson correlation coefficient: 0.52


In [53]:
model.random.seed(42)
val_df['sent1_embedding'] = val_df['sent1'].apply(lambda x: model.infer_vector(x))
val_df['sent2_embedding'] = val_df['sent2'].apply(lambda x: model.infer_vector(x))

In [54]:
val_df['y_pred'] = val_df.apply(lambda x: sts_score(1 - spatial.distance.cosine(x['sent1_embedding'], x['sent2_embedding'])), axis=1)

In [55]:
y_val_pred = val_df['y_pred']

In [56]:
y_val = val_df['score']

In [59]:
# Calculate Pearson correlation coefficient between predicted values and target values
corr = pearson_corr(y_val, y_val_pred)

# Print the correlation coefficient
print("Pearson correlation coefficient: {:.2f}".format(corr))

Pearson correlation coefficient: 0.61


In [60]:
model.random.seed(42)
test_df['sent1_embedding'] = test_df['sent1'].apply(lambda x: model.infer_vector(x))
test_df['sent2_embedding'] = test_df['sent2'].apply(lambda x: model.infer_vector(x))

In [61]:
test_df['y_pred'] = test_df.apply(lambda x: sts_score(1 - spatial.distance.cosine(x['sent1_embedding'], x['sent2_embedding'])), axis=1)

In [62]:
y_test_pred = test_df['y_pred']

In [63]:
y_test = test_df['score']

In [64]:
# Calculate Pearson correlation coefficient between predicted values and target values
corr = pearson_corr(y_test, y_test_pred)

# Print the correlation coefficient
print("Pearson correlation coefficient: {:.2f}".format(corr))

Pearson correlation coefficient: 0.52


# BILstm model

In [66]:
embeddings1 = list(train_df['sent1_embedding'])
embeddings2 = list(train_df['sent2_embedding'])
scores = list(train_df['score'])

In [68]:
val_embeddings1 = list(val_df['sent1_embedding'])
val_embeddings2 = list(val_df['sent2_embedding'])
val_scores = list(val_df['score'])

In [69]:
import torch
import torch.nn as nn
import torch.optim as optim
# Convert the data into PyTorch tensors
embeddings1 = torch.tensor(embeddings1, dtype=torch.float)
embeddings2 = torch.tensor(embeddings2, dtype=torch.float)
scores = torch.tensor(scores, dtype=torch.float)

  from .autonotebook import tqdm as notebook_tqdm


In [70]:
val_embeddings1 = torch.tensor(val_embeddings1, dtype=torch.float)
val_embeddings2 = torch.tensor(val_embeddings2, dtype=torch.float)
val_scores = torch.tensor(val_scores, dtype=torch.float)

In [71]:
train_embeddings1 = embeddings1
train_embeddings2 = embeddings2
train_scores = scores

In [113]:
# Define the hyperparameters
input_dim = 25 # The dimension of the sentence embeddings
hidden_dim = 25
lr = 0.001
num_epochs = 5
#batch_size = 10

In [114]:
class BiLSTMRegression(nn.Module):
    def __init__(self, input_dim, hidden_dim, num_layers):
        super().__init__()
        self.input_dim = input_dim
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers
        self.bilstm = nn.LSTM(input_dim, hidden_dim, num_layers=num_layers, batch_first=True, bidirectional=True)
        self.fc = nn.Linear(hidden_dim*2, 1)

    def forward(self, x1, x2):
        x = torch.cat((x1, x2), dim=1)
        x = x.view(len(x), 1, -1)
        h0 = torch.zeros(self.num_layers*2, len(x), self.hidden_dim).to(x.device)
        c0 = torch.zeros(self.num_layers*2, len(x), self.hidden_dim).to(x.device)
        out, _ = self.bilstm(x, (h0, c0))
        out = self.fc(out[:, -1, :])
        return out

In [115]:
# Create the model and optimizer
#model = SentenceSimilarityModel(input_dim*2, hidden_dim)
model = BiLSTMRegression(input_dim*2, hidden_dim, num_layers=2)
optimizer = optim.Adam(model.parameters(), lr=lr)

# Define the loss function
loss_fn = nn.MSELoss()

In [116]:
import torch.utils.data as data

# Define a custom dataset class
class SentenceSimilarityDataset(data.Dataset):
    def __init__(self, embeddings1, embeddings2, scores):
        self.embeddings1 = embeddings1
        self.embeddings2 = embeddings2
        self.scores = scores

    def __len__(self):
        return len(self.embeddings1)

    def __getitem__(self, index):
        return self.embeddings1[index], self.embeddings2[index], self.scores[index]

In [117]:
# Create the dataset
train_dataset = SentenceSimilarityDataset(train_embeddings1, train_embeddings2, train_scores)
val_dataset = SentenceSimilarityDataset(val_embeddings1, val_embeddings2, val_scores)

In [118]:
# Define the batch size
batch_size = 10

# Create the DataLoader
train_dataloader = data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_dataloader = data.DataLoader(val_dataset, batch_size=batch_size, shuffle=True)

In [119]:
# Train the model
for epoch in range(num_epochs):
    model.train()
    train_loss = 0.0
    for batch in train_dataloader:
        optimizer.zero_grad()
        embeddings1_batch, embeddings2_batch, scores_batch = batch
        output = model(embeddings1_batch, embeddings2_batch)
        loss = loss_fn(output.squeeze(), scores_batch)
        loss.backward()
        optimizer.step()
        train_loss += loss.item() * len(embeddings1_batch)
    train_loss /= len(train_embeddings1)

    # Evaluate the model on the validation set
    model.eval()
    val_loss = 0.0
    with torch.no_grad():
        for batch in val_dataloader:
            embeddings1_batch, embeddings2_batch, scores_batch = batch
            val_output = model(embeddings1_batch, embeddings2_batch)
            val_loss += loss_fn(val_output.squeeze(), scores_batch).item() * len(embeddings1_batch)
        val_loss /= len(val_embeddings1)

    print('Epoch {} - Training Loss: {:.4f}, Validation Loss: {:.4f}'.format(epoch+1, train_loss, val_loss))

Epoch 1 - Training Loss: 2.8065, Validation Loss: 2.0717
Epoch 2 - Training Loss: 1.6861, Validation Loss: 1.9265
Epoch 3 - Training Loss: 1.4261, Validation Loss: 1.9203
Epoch 4 - Training Loss: 1.2171, Validation Loss: 1.9439
Epoch 5 - Training Loss: 1.0567, Validation Loss: 2.0148


In [120]:
test_embeddings1 = list(test_df['sent1_embedding'])
test_embeddings2 = list(test_df['sent2_embedding'])
test_scores = list(test_df['score'])
test_embeddings1 = torch.tensor(test_embeddings1, dtype=torch.float)
test_embeddings2 = torch.tensor(test_embeddings2, dtype=torch.float)
test_scores = torch.tensor(test_scores, dtype=torch.float)
model.eval()
with torch.no_grad():
    test_output = model(test_embeddings1, test_embeddings2)

In [121]:
y_pred_test = test_output.squeeze().tolist()

In [122]:
y_test = test_scores

In [123]:
# Calculate Pearson correlation coefficient between predicted values and target values
corr = pearson_corr(y_test, y_pred_test)

# Print the correlation coefficient
print("Pearson correlation coefficient: {:.2f}".format(corr))

Pearson correlation coefficient: 0.33


In [128]:
class GRURegression(nn.Module):
    def __init__(self, input_dim, hidden_dim, num_layers):
        super().__init__()
        self.input_dim = input_dim
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers
        self.gru = nn.GRU(input_dim, hidden_dim, num_layers=num_layers, batch_first=True, bidirectional=True)
        self.fc = nn.Linear(hidden_dim*2, 1)

    def forward(self, x1, x2):
        x = torch.cat((x1, x2), dim=1)
        x = x.view(len(x), 1, -1)
        h0 = torch.zeros(self.num_layers*2, len(x), self.hidden_dim).to(x.device)
        out, _ = self.gru(x, h0)
        out = self.fc(out[:, -1, :])
        return out

In [129]:
model = GRURegression(input_dim*2, hidden_dim, num_layers=2)

In [130]:
# Train the model
for epoch in range(num_epochs):
    model.train()
    train_loss = 0.0
    for batch in train_dataloader:
        optimizer.zero_grad()
        embeddings1_batch, embeddings2_batch, scores_batch = batch
        output = model(embeddings1_batch, embeddings2_batch)
        loss = loss_fn(output.squeeze(), scores_batch)
        loss.backward()
        optimizer.step()
        train_loss += loss.item() * len(embeddings1_batch)
    train_loss /= len(train_embeddings1)

    # Evaluate the model on the validation set
    model.eval()
    val_loss = 0.0
    with torch.no_grad():
        for batch in val_dataloader:
            embeddings1_batch, embeddings2_batch, scores_batch = batch
            val_output = model(embeddings1_batch, embeddings2_batch)
            val_loss += loss_fn(val_output.squeeze(), scores_batch).item() * len(embeddings1_batch)
        val_loss /= len(val_embeddings1)

    print('Epoch {} - Training Loss: {:.4f}, Validation Loss: {:.4f}'.format(epoch+1, train_loss, val_loss))

Epoch 1 - Training Loss: 9.2868, Validation Loss: 7.6936
Epoch 2 - Training Loss: 9.2868, Validation Loss: 7.6936
Epoch 3 - Training Loss: 9.2868, Validation Loss: 7.6936
Epoch 4 - Training Loss: 9.2868, Validation Loss: 7.6936
Epoch 5 - Training Loss: 9.2868, Validation Loss: 7.6936


In [131]:
test_embeddings1 = list(test_df['sent1_embedding'])
test_embeddings2 = list(test_df['sent2_embedding'])
test_scores = list(test_df['score'])
test_embeddings1 = torch.tensor(test_embeddings1, dtype=torch.float)
test_embeddings2 = torch.tensor(test_embeddings2, dtype=torch.float)
test_scores = torch.tensor(test_scores, dtype=torch.float)
model.eval()
with torch.no_grad():
    test_output = model(test_embeddings1, test_embeddings2)

In [132]:
y_pred_test = test_output.squeeze().tolist()
y_test = test_scores

In [133]:
corr = pearson_corr(y_test, y_pred_test)

# Print the correlation coefficient
print("Pearson correlation coefficient: {:.2f}".format(corr))

Pearson correlation coefficient: -0.02


In [135]:
embeddings1 = list(train_df['sent1_embedding'])
embeddings2 = list(train_df['sent2_embedding'])
scores = list(train_df['score'])

In [136]:
val_embeddings1 = list(val_df['sent1_embedding'])
val_embeddings2 = list(val_df['sent2_embedding'])
val_scores = list(val_df['score'])

In [142]:
test_embeddings1 = list(test_df['sent1_embedding'])
test_embeddings2 = list(test_df['sent2_embedding'])
test_scores = list(test_df['score'])

In [141]:
# generate sentence embeddings
X1 = np.array(embeddings1)
X2 = np.array(embeddings2)
# generate sentence embeddings
val_X1 = np.array(val_embeddings1)
val_X2 = np.array(val_embeddings2)

In [143]:
test_X1 = np.array(test_embeddings1)
test_X2 = np.array(test_embeddings2)

In [145]:
from sklearn.linear_model import LinearRegression
#concatenate sentence embeddings to create feature matrix
X = np.concatenate([X1, X2], axis=1)

# train a linear regression model
reg = LinearRegression().fit(X, scores)

In [146]:
val_X = np.concatenate([val_X1, val_X2], axis=1)
val_y_pred = reg.predict(val_X)

In [150]:
corr = pearson_corr(val_scores, val_y_pred)
print("Pearson correlation coefficient: {:.2f}".format(corr))

Pearson correlation coefficient: 0.07


In [147]:
# concatenate sentence embeddings to create feature matrix
test_X = np.concatenate([test_X1, test_X2], axis=1)
test_y_pred = reg.predict(test_X)

In [149]:
# Calculate Pearson correlation coefficient between predicted values and target values
corr = pearson_corr(test_scores, test_y_pred)

# Print the correlation coefficient
print("Pearson correlation coefficient: {:.2f}".format(corr))

Pearson correlation coefficient: 0.11


Unnamed: 0,sent1,sent2,score,sent1_embedding,sent2_embedding,y_pred
0,"[a, plane, is, take, off]","[an, air, plane, is, take, off]",5.0,"[-0.4224272, -2.6225648, 1.2613771, -1.0724506...","[-1.5233037, -1.2017787, 1.3848532, -1.6483687...",4.266091
1,"[a, man, is, play, a, larg, flute]","[a, man, is, play, a, flute]",3.8,"[-0.5728938, 1.937188, -0.59581786, 2.0941627,...","[0.30465078, 0.5849875, -0.21188323, 0.2666212...",4.220219
2,"[a, man, is, spread, unk, chees, on, a, pizza]","[a, man, is, spread, shred, chees, on, an, unk...",3.8,"[-0.34313184, -0.74568987, 0.19610271, -0.7560...","[0.6708356, 0.45349512, 1.2122866, 9.2444825e-...",3.646576
3,"[three, men, are, play, chess]","[two, men, are, play, chess]",2.6,"[1.751888, -0.30320084, 0.28331283, 0.30995673...","[1.272545, -0.33465645, -0.6245619, 0.8188742,...",4.547019
4,"[a, man, is, play, the, cello]","[a, man, seat, is, play, the, cello]",4.25,"[-0.06809825, -0.06937426, -2.022168, -0.36487...","[-1.2138389, -2.2551486, -2.0910704, 1.0399933...",4.203823
