In [23]:
# import all necessary packages for SS_LSTM
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from collections import Counter
import random
import os
import matplotlib.pyplot as plt
import pickle
from tqdm import tqdm
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import collections
import itertools
import re
from sklearn.manifold import TSNE
from sklearn.metrics.pairwise import cosine_similarity
from torchtext.vocab import GloVe
from datasets import load_dataset

In [24]:
# Pick the compute device: use CUDA when torch reports it as available, otherwise the CPU
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(device)

cuda


In [25]:
# Load the data

def load_data():
    """Load the "paws" dataset ("labeled_final" configuration) and return its splits.

    Returns:
        tuple: ``(train, validation, test)`` splits, in that order, taken from the
        object returned by ``load_dataset("paws", "labeled_final")``.
    """
    splits = load_dataset("paws", "labeled_final")
    # Note the return order: validation comes before test.
    return splits['train'], splits['validation'], splits['test']


In [26]:
# Fetch the three splits; load_data() returns them in (train, validation, test) order
ltrain_dataset, lval_dataset, ltest_dataset = load_data()

In [27]:
# Use only the first TRAIN_SUBSET_SIZE training pairs for fine-tuning; the validation
# and test splits are left at full size.
TRAIN_SUBSET_SIZE = 15000
# min() keeps the selection valid if the split has fewer rows than the requested size.
ltrain_dataset = ltrain_dataset.select(range(min(TRAIN_SUBSET_SIZE, len(ltrain_dataset))))

In [28]:
# Sanity check: print the type of the (subsetted) training split
print(type(ltrain_dataset))

<class 'datasets.arrow_dataset.Dataset'>


In [29]:
from sentence_transformers import SentenceTransformer

# Load the 'bert-base-nli-mean-tokens' checkpoint. This same `model` object is first
# evaluated as-is and then fine-tuned in place by model.fit() in a later cell.
model = SentenceTransformer('bert-base-nli-mean-tokens')
# Move the model to the device chosen earlier; as the last expression of the cell,
# the value returned by .to() is echoed as the cell output.
model.to(device)

SentenceTransformer(
  (0): Transformer({'max_seq_length': 128, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False})
)

In [30]:
# find test accuracy
def find_accuracy(model, test_dataset, threshold=0.5):
    """Compute paraphrase-classification accuracy by thresholding cosine similarity.

    Each row's two sentences are encoded separately with ``model.encode``; the pair
    is predicted as label 1 when the cosine similarity of the two embeddings is at
    least ``threshold`` and as label 0 otherwise.

    Args:
        model: Object exposing ``encode(sentence)`` that returns an array supporting
            ``.reshape(1, -1)``.
        test_dataset: Sized, integer-indexable collection whose rows provide the
            keys ``'sentence1'``, ``'sentence2'`` and ``'label'``.
        threshold: Minimum cosine similarity (inclusive) for predicting label 1.
            The comparison is inclusive so that it matches ``get_label``.

    Returns:
        float: Fraction of rows whose predicted label equals ``row['label']``;
        ``0.0`` when ``test_dataset`` is empty.
    """
    total = len(test_dataset)
    if total == 0:
        # Nothing to score; avoid dividing by zero.
        return 0.0

    correct = 0
    for i in tqdm(range(total)):
        # Look the row up once instead of once per field.
        row = test_dataset[i]
        # cosine_similarity expects 2-D inputs, hence the (1, dim) reshape.
        sent1_emb = model.encode(row['sentence1']).reshape(1, -1)
        sent2_emb = model.encode(row['sentence2']).reshape(1, -1)
        # The result is a 1x1 matrix; pull out the scalar before comparing.
        cos = cosine_similarity(sent1_emb, sent2_emb)[0][0]
        pred = 1 if cos >= threshold else 0
        if pred == row['label']:
            correct += 1
    return correct / total

In [31]:
# Baseline: score the model on the test split before the fine-tuning cell below has run
print(find_accuracy(model, ltest_dataset))

100%|██████████| 8000/8000 [04:18<00:00, 30.92it/s]

0.442





In [32]:
# Function to get the model label
def get_label(model, sentence1, sentence2, threshold=0.5):
    """Predict whether two sentences are paraphrases from their embedding similarity.

    Args:
        model: Object exposing ``encode(sentence, convert_to_tensor=True)`` whose
            result supports ``.cpu().detach().numpy()``.
        sentence1: First sentence.
        sentence2: Second sentence.
        threshold: Minimum cosine similarity (inclusive) for predicting label 1.

    Returns:
        tuple: ``(pred, cos)`` where ``pred`` is 1 if ``cos >= threshold`` else 0,
        and ``cos`` is the cosine similarity of the two embeddings.
    """
    # encode() is asked for tensors; convert them to NumPy arrays on the host
    # before handing them to scikit-learn.
    sent1_emb = model.encode(sentence1, convert_to_tensor=True).cpu().detach().numpy()
    sent2_emb = model.encode(sentence2, convert_to_tensor=True).cpu().detach().numpy()
    # cosine_similarity normalises its inputs itself, so no manual L2 normalisation
    # is done here (dividing by a zero norm would only introduce NaNs).
    cos = cosine_similarity([sent1_emb], [sent2_emb])[0][0]
    pred = 1 if cos >= threshold else 0
    return pred, cos

In [33]:
# Try a hard pair before fine-tuning: the two sentences share almost all of their
# words but swap "angular" and "polar" (and differ in "standards"/"scales" and
# "done"/"performed").
sentence1= "This was a series of nested angular standards , so that measurements in azimuth and elevation could be done directly in polar coordinates relative to the ecliptic ."
sentence2= "This was a series of nested polar scales , so that measurements in azimuth and elevation could be performed directly in angular coordinates relative to the ecliptic ."
# get_label returns (predicted label, cosine similarity), in that order
pred,sim = get_label(model, sentence1, sentence2)
print("Cosine similarity before: ", sim)
print("Predicted label: ", pred)

Cosine similarity before:  0.99204415
Predicted label:  1


In [34]:
import pandas as pd
from sentence_transformers import InputExample
from torch.utils.data import DataLoader
from sentence_transformers import losses

# Dataset: wrap every training pair in an InputExample; the integer label is cast to
# float before being handed to the cosine-similarity loss below.

train_examples = [InputExample(texts=[item['sentence1'], item['sentence2']], label=float(item['label'])) for item in ltrain_dataset]
train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=16)


# Loss: CosineSimilarityLoss built around the model that is being fine-tuned
train_loss = losses.CosineSimilarityLoss(model)

# Training loop (fine-tunes `model` in place)
# NOTE(review): no random seed is set in this cell and the dataloader shuffles,
# so repeated runs are not expected to reproduce the same numbers — consider seeding.
num_epochs = 4
warmup_steps = int(len(train_dataloader) * num_epochs * 0.1) # warm-up lasts 10% of the total training steps (batches per epoch * epochs)
model.fit(train_objectives=[(train_dataloader, train_loss)],
          epochs=num_epochs,
          warmup_steps=warmup_steps)

Iteration: 100%|██████████| 938/938 [02:17<00:00,  6.82it/s]
Iteration: 100%|██████████| 938/938 [02:18<00:00,  6.79it/s]
Iteration: 100%|██████████| 938/938 [02:18<00:00,  6.79it/s]
Iteration: 100%|██████████| 938/938 [02:18<00:00,  6.79it/s]
Epoch: 100%|██████████| 4/4 [09:12<00:00, 138.02s/it]


In [35]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

def find_accuracy(model, test_dataset, threshold=0.5):
    """Compute paraphrase-classification accuracy by thresholding cosine similarity.

    Each row's two sentences are encoded separately with ``model.encode``; the pair
    is predicted as label 1 when the cosine similarity of the two embeddings is at
    least ``threshold`` and as label 0 otherwise.

    Args:
        model: Object exposing ``encode(sentence, convert_to_tensor=True)`` whose
            result supports ``.cpu().detach().numpy()``.
        test_dataset: Sized, integer-indexable collection whose rows provide the
            keys ``'sentence1'``, ``'sentence2'`` and ``'label'``.
        threshold: Minimum cosine similarity (inclusive) for predicting label 1.

    Returns:
        float: Fraction of rows whose predicted label equals ``row['label']``;
        ``0.0`` when ``test_dataset`` is empty.
    """
    total = len(test_dataset)
    if total == 0:
        # Nothing to score; avoid dividing by zero.
        return 0.0

    correct = 0
    for i in tqdm(range(total)):
        # Look the row up once instead of once per field.
        row = test_dataset[i]
        # encode() is asked for tensors; convert them to NumPy arrays on the host
        # before handing them to scikit-learn.
        sent1_emb = model.encode(row['sentence1'], convert_to_tensor=True).cpu().detach().numpy()
        sent2_emb = model.encode(row['sentence2'], convert_to_tensor=True).cpu().detach().numpy()
        # cosine_similarity normalises its inputs itself, so no manual L2
        # normalisation is done here (a zero norm would only introduce NaNs).
        cos = cosine_similarity([sent1_emb], [sent2_emb])[0][0]
        pred = 1 if cos >= threshold else 0
        if pred == row['label']:
            correct += 1
    return correct / total

# Score the model on the test split again, now that the fine-tuning cell has run
# (find_accuracy's default threshold of 0.5 is used)
accuracy = find_accuracy(model, ltest_dataset)
print(f"Accuracy: {accuracy}")


100%|██████████| 8000/8000 [04:21<00:00, 30.64it/s]

Accuracy: 0.755375



