# [Sentence-BERT](https://arxiv.org/pdf/1908.10084.pdf)

[Reference Code](https://www.pinecone.io/learn/series/nlp/train-sentence-transformers-softmax/)

In [1]:
import os
import math
import re
from   random import *
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# device = 'cpu'
torch.set_default_device('cpu')
print(device)

cuda


## 1. Data

### Train, Test, Validation 

In [2]:
import datasets
snli = datasets.load_dataset('snli')
mnli = datasets.load_dataset('glue', 'mnli')
mnli['train'].features, snli['train'].features

  from .autonotebook import tqdm as notebook_tqdm


({'premise': Value(dtype='string', id=None),
  'hypothesis': Value(dtype='string', id=None),
  'label': ClassLabel(names=['entailment', 'neutral', 'contradiction'], id=None),
  'idx': Value(dtype='int32', id=None)},
 {'premise': Value(dtype='string', id=None),
  'hypothesis': Value(dtype='string', id=None),
  'label': ClassLabel(names=['entailment', 'neutral', 'contradiction'], id=None)})

In [3]:
# List of datasets to remove 'idx' column from
mnli.column_names.keys()

dict_keys(['train', 'validation_matched', 'validation_mismatched', 'test_matched', 'test_mismatched'])

In [4]:
# Remove 'idx' column from each dataset
for column_names in mnli.column_names.keys():
    mnli[column_names] = mnli[column_names].remove_columns('idx')

In [5]:
mnli.column_names.keys()

dict_keys(['train', 'validation_matched', 'validation_mismatched', 'test_matched', 'test_mismatched'])

In [6]:
import numpy as np
np.unique(mnli['train']['label']), np.unique(snli['train']['label'])
#snli also have -1

(array([0, 1, 2]), array([-1,  0,  1,  2]))

In [7]:
# there are -1 values in the label feature, these are where no class could be decided so we remove
snli = snli.filter(
    lambda x: 0 if x['label'] == -1 else 1
)

In [8]:
import numpy as np
np.unique(mnli['train']['label']), np.unique(snli['train']['label'])
#snli also have -1

(array([0, 1, 2]), array([0, 1, 2]))

In [9]:
# Assuming you have your two DatasetDict objects named snli and mnli
from datasets import DatasetDict
# Merge the two DatasetDict objects
raw_dataset = DatasetDict({
    'train': datasets.concatenate_datasets([snli['train'], mnli['train']]).shuffle(seed=55).select(list(range(1000))),
    'test': datasets.concatenate_datasets([snli['test'], mnli['test_mismatched']]).shuffle(seed=55).select(list(range(100))),
    'validation': datasets.concatenate_datasets([snli['validation'], mnli['validation_mismatched']]).shuffle(seed=55).select(list(range(1000)))
})
#remove .select(list(range(1000))) in order to use full dataset
# Now, merged_dataset_dict contains the combined datasets from snli and mnli
raw_dataset

DatasetDict({
    train: Dataset({
        features: ['premise', 'hypothesis', 'label'],
        num_rows: 1000
    })
    test: Dataset({
        features: ['premise', 'hypothesis', 'label'],
        num_rows: 100
    })
    validation: Dataset({
        features: ['premise', 'hypothesis', 'label'],
        num_rows: 1000
    })
})

## 2. Preprocessing

In [10]:
# from transformers import BertTokenizer

# tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [11]:
# def preprocess_function(examples):
#     max_seq_length = 512
#     padding = 'max_length'
#     # Tokenize the premise
#     premise_result = tokenizer(
#         examples['premise'], padding=padding, max_length=max_seq_length, truncation=True)
#     #num_rows, max_seq_length
#     # Tokenize the hypothesis
#     hypothesis_result = tokenizer(
#         examples['hypothesis'], padding=padding, max_length=max_seq_length, truncation=True)
#     #num_rows, max_seq_length
#     # Extract labels
#     labels = examples["label"]
#     #num_rows
#     return {
#         "premise_input_ids": premise_result["input_ids"],
#         "premise_attention_mask": premise_result["attention_mask"],
#         "hypothesis_input_ids": hypothesis_result["input_ids"],
#         "hypothesis_attention_mask": hypothesis_result["attention_mask"],
#         "labels" : labels
#     }

# tokenized_datasets = raw_dataset.map(
#     preprocess_function,
#     batched=True,
# )

# tokenized_datasets = tokenized_datasets.remove_columns(['premise','hypothesis','label'])
# tokenized_datasets.set_format("torch")

In [12]:
import spacy
import pickle

tokenizer = spacy.load("en_core_web_sm")
word2id = pickle.load(open('./model/elements/word2id.pkl', 'rb'))

In [13]:
raw_dataset

DatasetDict({
    train: Dataset({
        features: ['premise', 'hypothesis', 'label'],
        num_rows: 1000
    })
    test: Dataset({
        features: ['premise', 'hypothesis', 'label'],
        num_rows: 100
    })
    validation: Dataset({
        features: ['premise', 'hypothesis', 'label'],
        num_rows: 1000
    })
})

In [14]:
import re

def preprocess_function(examples):
    max_seq_length = 512
    # Tokenize the premise

    # tokenized_premise = []
    # for sent in examples['premise']:
    #     print(sent)
    #     tokenized_premise.append(tokenizer(re.sub("[.,!?\\-]=", '', sent.lower())))

    tokenized_premise = [tokenizer(re.sub("[.,!?\\-']=", ' ', sent.lower())) for sent in examples['premise']]

    # premise_input_ids = []
    # for tokens in tokenized_premise:
    #     premise_input_ids.append([])
    #     premise_input_ids[-1].append([word2id['[CLS]']])
    #     for token in tokens:
    #         try:
    #             premise_input_ids[-1].append(word2id[token])
    #         except KeyError as k:
    #             pass
    #     premise_input_ids[-1].append([word2id['[SEP]']])
    
    premise_input_ids = [[word2id['[CLS]']] + [word2id[str(token)] for token in tokens if str(token) in word2id] + [word2id['[SEP]']] for tokens in tokenized_premise]
    premise_pad_len = [max_seq_length - len(premise) for premise in premise_input_ids]
    premise_attn_mask = [([1] * len(premise)) + ([0] * pad_len) for premise, pad_len in zip(premise_input_ids, premise_pad_len)]
    premise_input_ids = [premise + [word2id['[PAD]']] * pad_len for premise, pad_len in zip(premise_input_ids, premise_pad_len)]

    #num_rows, max_seq_length
    # Tokenize the hypothesis
    tokenized_hypothesis = [tokenizer(re.sub("[.,!?\\-]=", '', sent.lower())) for sent in examples['hypothesis']]

    # hypothesis_input_ids = []
    # for tokens in tokenized_hypothesis:
    #     premise_input_ids.append([])
    #     hypothesis_input_ids[-1].append([word2id['[CLS]']])
    #     for token in tokens:
    #         try:
    #             hypothesis_input_ids[-1].append(word2id[token])
    #         except KeyError as k:
    #             pass
    #     hypothesis_input_ids[-1].append([word2id['[SEP]']])

    hypothesis_input_ids = [[word2id['[CLS]']] + [word2id[str(token)] for token in tokens if str(token) in word2id] + [word2id['[SEP]']] for tokens in tokenized_hypothesis]
    hypothesis_pad_len = [max_seq_length - len(hypothesis) for hypothesis in hypothesis_input_ids]
    hypothesis_attn_mask = [([1] * len(hypothesis)) + ([0] * pad_len) for hypothesis, pad_len in zip(hypothesis_input_ids, hypothesis_pad_len)]
    hypothesis_input_ids = [hypothesis + [word2id['[PAD]']] * pad_len for hypothesis, pad_len in zip(hypothesis_input_ids, hypothesis_pad_len)]
    #num_rows, max_seq_length
    # Extract labels
    labels = examples["label"]
    #num_rows
    return {
        "premise_input_ids": premise_input_ids,
        "premise_attention_mask": premise_attn_mask,
        "hypothesis_input_ids": hypothesis_input_ids,
        "hypothesis_attention_mask": hypothesis_attn_mask,
        "labels" : labels
    }

tokenized_datasets = raw_dataset.map(
    preprocess_function,
    batched=True,
)

tokenized_datasets = tokenized_datasets.remove_columns(['premise','hypothesis','label'])
tokenized_datasets.set_format("torch")

In [15]:
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['premise_input_ids', 'premise_attention_mask', 'hypothesis_input_ids', 'hypothesis_attention_mask', 'labels'],
        num_rows: 1000
    })
    test: Dataset({
        features: ['premise_input_ids', 'premise_attention_mask', 'hypothesis_input_ids', 'hypothesis_attention_mask', 'labels'],
        num_rows: 100
    })
    validation: Dataset({
        features: ['premise_input_ids', 'premise_attention_mask', 'hypothesis_input_ids', 'hypothesis_attention_mask', 'labels'],
        num_rows: 1000
    })
})

## 3. Data loader

In [16]:
from torch.utils.data import DataLoader

# initialize the dataloader
batch_size = 8
train_dataloader = DataLoader(
    tokenized_datasets['train'], 
    batch_size=batch_size, 
    shuffle=True
)
eval_dataloader = DataLoader(
    tokenized_datasets['validation'], 
    batch_size=batch_size
)
test_dataloader = DataLoader(
    tokenized_datasets['test'], 
    batch_size=batch_size
)

In [17]:
torch.set_default_device('cpu')

In [18]:
for batch in train_dataloader:
    print(batch['premise_input_ids'].shape)
    print(batch['premise_attention_mask'].shape)
    print(batch['hypothesis_input_ids'].shape)
    print(batch['hypothesis_attention_mask'].shape)
    print(batch['labels'].shape)
    break

torch.Size([8, 512])
torch.Size([8, 512])
torch.Size([8, 512])
torch.Size([8, 512])
torch.Size([8])


## 4. Model

In [19]:
from model import bert

In [20]:
import importlib
importlib.reload(bert);

In [21]:
import os
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"

In [22]:
model = bert.BERT().to(device)
model.load_state_dict(torch.load('./model/BERT3.pth'))

<All keys matched successfully>

### Pooling
SBERT adds a pooling operation to the output of BERT / RoBERTa to derive a fixed sized sentence embedding

In [23]:
# define mean pooling function
def mean_pool(token_embeds, attention_mask):
    # reshape attention_mask to cover 768-dimension embeddings
    in_mask = attention_mask.unsqueeze(-1).expand(
        token_embeds.size()
    ).float()
    # perform mean-pooling but exclude padding tokens (specified by in_mask)
    pool = torch.sum(token_embeds * in_mask, 1) / torch.clamp(
        in_mask.sum(1), min=1e-9
    )
    pool = pool.to(device)

    return pool

## 5. Loss Function

## Classification Objective Function 
We concatenate the sentence embeddings $u$ and $v$ with the element-wise difference  $\lvert u - v \rvert $ and multiply the result with the trainable weight  $ W_t ∈  \mathbb{R}^{3n \times k}  $:

$ o = \text{softmax}\left(W^T \cdot \left(u, v, \lvert u - v \rvert\right)\right) $

where $n$ is the dimension of the sentence embeddings and k the number of labels. We optimize cross-entropy loss. This structure is depicted in Figure 1.

## Regression Objective Function. 
The cosine similarity between the two sentence embeddings $u$ and $v$ is computed (Figure 2). We use means quared-error loss as the objective function.

(Manhatten / Euclidean distance, semantically  similar sentences can be found.)

<img src="./figures/sbert-architecture.png" >

In [24]:
def configurations(u,v):
    # build the |u-v| tensor
    uv = torch.sub(u, v)   # batch_size,hidden_dim
    uv_abs = torch.abs(uv) # batch_size,hidden_dim
    
    # concatenate u, v, |u-v|
    x = torch.cat([u, v, uv_abs], dim=-1) # batch_size, 3*hidden_dim
    return x

def cosine_similarity(u, v):
    dot_product = np.dot(u, v)
    norm_u = np.linalg.norm(u)
    norm_v = np.linalg.norm(v)
    similarity = dot_product / (norm_u * norm_v)
    return similarity

In [25]:
classifier_head = torch.nn.Linear(768*3, 3).to(device)

optimizer = torch.optim.Adam(model.parameters(), lr=2e-5)
optimizer_classifier = torch.optim.Adam(classifier_head.parameters(), lr=2e-5)

criterion = nn.CrossEntropyLoss()

In [26]:
from transformers import get_linear_schedule_with_warmup

# and setup a warmup for the first ~10% steps
total_steps = int(len(raw_dataset) / batch_size)
warmup_steps = int(0.1 * total_steps)
scheduler = get_linear_schedule_with_warmup(
		optimizer, num_warmup_steps=warmup_steps,
  	num_training_steps=total_steps - warmup_steps
)

# then during the training loop we update the scheduler per step
scheduler.step()

scheduler_classifier = get_linear_schedule_with_warmup(
		optimizer_classifier, num_warmup_steps=warmup_steps,
  	num_training_steps=total_steps - warmup_steps
)

# then during the training loop we update the scheduler per step
scheduler_classifier.step()



## 6. Training

In [27]:
max_len = bert.max_len
max_len

512

In [28]:
# embedding(inputs_ids_a, segment_ids)

In [29]:
# # def get_attn_pad_mask(seq_q, seq_k):
# #     batch_size, len_q = seq_q.size()
# #     batch_size, len_k = seq_k.size()
# #     # eq(zero) is PAD token
# #     pad_attn_mask = seq_k.data.eq(0).unsqueeze(1)  # batch_size x 1 x len_k(=len_q), one is masking
# #     pad_attn_mask.to(device)
# #     return pad_attn_mask.expand(batch_size, len_q, len_k)  # batch_size x len_q x len_k


# enc_self_attn_mask = bert2.get_attn_pad_mask(inputs_ids_a, inputs_ids_a)
# embedding = bert2.Embedding()
# output = embedding(inputs_ids_a, segment_ids)

# for i, layer in enumerate(test):
#     print(i)
#     output, _ = layer(output, enc_self_attn_mask)

In [30]:
test = nn.ModuleList([bert.EncoderLayer() for _ in range(6)])

In [31]:
for layer in test:
    print(layer)

EncoderLayer(
  (enc_self_attn): MultiHeadAttention(
    (W_Q): Linear(in_features=768, out_features=512, bias=True)
    (W_K): Linear(in_features=768, out_features=512, bias=True)
    (W_V): Linear(in_features=768, out_features=512, bias=True)
  )
  (pos_ffn): PoswiseFeedForwardNet(
    (fc1): Linear(in_features=768, out_features=3072, bias=True)
    (fc2): Linear(in_features=3072, out_features=768, bias=True)
  )
)
EncoderLayer(
  (enc_self_attn): MultiHeadAttention(
    (W_Q): Linear(in_features=768, out_features=512, bias=True)
    (W_K): Linear(in_features=768, out_features=512, bias=True)
    (W_V): Linear(in_features=768, out_features=512, bias=True)
  )
  (pos_ffn): PoswiseFeedForwardNet(
    (fc1): Linear(in_features=768, out_features=3072, bias=True)
    (fc2): Linear(in_features=3072, out_features=768, bias=True)
  )
)
EncoderLayer(
  (enc_self_attn): MultiHeadAttention(
    (W_Q): Linear(in_features=768, out_features=512, bias=True)
    (W_K): Linear(in_features=768, out_fe

In [32]:
# class Embedding(nn.Module):
#     def __init__(self):
#         super(Embedding, self).__init__()
#         self.tok_embed = nn.Embedding(17751, 768)  # token embedding
#         self.pos_embed = nn.Embedding(max_len, 768)      # position embedding
#         self.seg_embed = nn.Embedding(2, 768)  # segment(token type) embedding
#         self.norm = nn.LayerNorm(768)

#     def forward(self, x, seg):
#         #x, seg: (bs, len)
#         seq_len = x.size(1)
#         pos = torch.arange(seq_len, dtype=torch.long)
#         pos = pos.unsqueeze(0).expand_as(x)  # (len,) -> (bs, len)
#         ### the error is in the next line

#         # global x_fd, seg_fd, pos_fd
#         # x_fd = x
#         # seg_fd = seg
#         # pos_fd = pos

#         # print(x_fd.max(), pos_fd.max(), seg_fd.max())
#         pos = pos.to(device)
#         embedding = self.tok_embed(x) + self.pos_embed(pos) + self.seg_embed(seg)
#         out = self.norm(embedding)
#         print(out.device)
#         return out

In [33]:
from tqdm.auto import tqdm

num_epoch = 5
# 1 epoch should be enough, increase if wanted
for epoch in range(num_epoch):
    model.train()  
    classifier_head.train()
    # initialize the dataloader loop with tqdm (tqdm == progress bar)
    torch.set_default_device('cpu')
    for step, batch in enumerate(tqdm(train_dataloader, leave=True)):
        # zero all gradients on each new step
        optimizer.zero_grad()
        optimizer_classifier.zero_grad()
        
        # prepare batches and more all to the active device
        inputs_ids_a = batch['premise_input_ids'].to(device)
        inputs_ids_b = batch['hypothesis_input_ids'].to(device)
        attention_a = batch['premise_attention_mask'].to(device)
        attention_b = batch['hypothesis_attention_mask'].to(device)
        label = batch['labels'].to(device)
        
        # # extract token embeddings from BERT at last_hidden_state
        # u = model(inputs_ids_a, attention_mask=attention_a)  
        # v = model(inputs_ids_b, attention_mask=attention_b)  

        segment_ids = torch.zeros(batch_size, max_len, dtype=torch.int32).to(device)

        u_last_hidden_state = model.last_hidden_state(inputs_ids_a, segment_ids) # all token embeddings A = batch_size, seq_len, hidden_dim
        v_last_hidden_state = model.last_hidden_state(inputs_ids_b, segment_ids) # all token embeddings B = batch_size, seq_len, hidden_dim

         # get the mean pooled vectors
        u_mean_pool = mean_pool(u_last_hidden_state, attention_a) # batch_size, hidden_dim
        v_mean_pool = mean_pool(v_last_hidden_state, attention_b) # batch_size, hidden_dim
        
        # build the |u-v| tensor
        uv = torch.sub(u_mean_pool, v_mean_pool)   # batch_size,hidden_dim
        uv_abs = torch.abs(uv) # batch_size,hidden_dim
        
        # concatenate u, v, |u-v|
        x = torch.cat([u_mean_pool, v_mean_pool, uv_abs], dim=-1) # batch_size, 3*hidden_dim
        
        # process concatenated tensor through classifier_head
        x = classifier_head(x) #batch_size, classifer
        
        # calculate the 'softmax-loss' between predicted and true label
        loss = criterion(x, label)
        
        # using loss, calculate gradients and then optimizerize
        loss.backward()
        optimizer.step()
        optimizer_classifier.step()

        scheduler.step() # update learning rate scheduler
        scheduler_classifier.step()
        torch.set_default_device('cpu')
        
    print(f'Epoch: {epoch + 1} | loss = {loss.item():.6f}')
    

100%|██████████| 125/125 [02:14<00:00,  1.08s/it]


Epoch: 1 | loss = 2.628225


100%|██████████| 125/125 [02:27<00:00,  1.18s/it]


Epoch: 2 | loss = 4.527915


100%|██████████| 125/125 [02:35<00:00,  1.24s/it]


Epoch: 3 | loss = 3.147285


100%|██████████| 125/125 [02:38<00:00,  1.27s/it]


Epoch: 4 | loss = 4.277563


100%|██████████| 125/125 [02:40<00:00,  1.28s/it]

Epoch: 5 | loss = 2.206258





In [36]:
model.eval()
classifier_head.eval()
total_similarity = 0
with torch.no_grad():
    for step, batch in enumerate(eval_dataloader):
        # prepare batches and more all to the active device
        inputs_ids_a = batch['premise_input_ids'].to(device)
        inputs_ids_b = batch['hypothesis_input_ids'].to(device)
        attention_a = batch['premise_attention_mask'].to(device)
        attention_b = batch['hypothesis_attention_mask'].to(device)
        label = batch['labels'].to(device)
        
        # extract token embeddings from BERT at last_hidden_state
        segment_ids = torch.zeros(batch_size, max_len, dtype=torch.int32).to(device)

        u = model.last_hidden_state(inputs_ids_a, segment_ids) # all token embeddings A = batch_size, seq_len, hidden_dim
        v = model.last_hidden_state(inputs_ids_b, segment_ids) # all token embeddings B = batch_size, seq_len, hidden_dim

        # get the mean pooled vectors
        u_mean_pool = mean_pool(u, attention_a).detach().cpu().numpy().reshape(-1) # batch_size, hidden_dim
        v_mean_pool = mean_pool(v, attention_b).detach().cpu().numpy().reshape(-1) # batch_size, hidden_dim

        similarity_score = cosine_similarity(u_mean_pool, v_mean_pool)
        total_similarity += similarity_score
    
average_similarity = total_similarity / len(eval_dataloader)
print(f"Average Cosine Similarity: {average_similarity:.4f}")

Average Cosine Similarity: 0.9960


In [35]:
torch.save(model, './model/SBERT.pt')
torch.save(model.state_dict(), './model/SBERT.pth')

## 7. Inference

In [37]:
from model import bert

In [38]:
import importlib
importlib.reload(bert);

In [39]:
model = bert.BERT().to(device)
model.load_state_dict(torch.load('./model/SBERT.pth'))

<All keys matched successfully>

In [42]:
import spacy
import pickle

tokenizer = spacy.load("en_core_web_sm")
word2id = pickle.load(open('./model/elements/word2id.pkl', 'rb'))

In [51]:
device

device(type='cuda')

In [62]:
def my_tokenizer(sent):
    max_seq_length = 512
    tokens = tokenizer(re.sub("[.,!?\\-']=", ' ', sent.lower()))
    input_ids = [word2id['[CLS]']] + [word2id[str(token)] for token in tokens if str(token) in word2id] + [word2id['[SEP]']]
    pad_len = max_seq_length - len(input_ids)
    attn_mask = ([1] * len(input_ids)) + ([0] * pad_len)
    input_ids += [word2id['[PAD]']] * pad_len

    input_ids_tensor = torch.tensor(input_ids).unsqueeze(0).to(device)
    attn_mask_tensor = torch.tensor(attn_mask).unsqueeze(0).to(device)

    return input_ids_tensor, attn_mask_tensor

In [78]:
batch_size, max_len

(8, 512)

In [63]:
import torch
from sklearn.metrics.pairwise import cosine_similarity

def calculate_similarity(model, sentence_a, sentence_b, device):
    # Tokenize and convert sentences to input IDs and attention masks
    inputs_ids_a, attention_a = my_tokenizer(sentence_a)
    inputs_ids_b, attention_b = my_tokenizer(sentence_b)

    # Extract token embeddings from BERT
    segment_ids = torch.zeros(batch_size, max_len, dtype=torch.int32).to(device)

    u = model.last_hidden_state(inputs_ids_a, segment_ids) # all token embeddings A = batch_size, seq_len, hidden_dim
    v = model.last_hidden_state(inputs_ids_b, segment_ids) # all token embeddings B = batch_size, seq_len, hidden_dim

    # Get the mean-pooled vectors
    u = mean_pool(u, attention_a).detach().cpu().numpy().reshape(-1)  # batch_size, hidden_dim
    v = mean_pool(v, attention_b).detach().cpu().numpy().reshape(-1)  # batch_size, hidden_dim

    # Calculate cosine similarity
    similarity_score = cosine_similarity(u.reshape(1, -1), v.reshape(1, -1))[0, 0]

    return similarity_score

# Example usage:
sentence_a = 'Your contribution helped make it possible for us to provide our students with a quality education.'
sentence_b = "Your contributions were of no help with our students' education."
similarity = calculate_similarity(model, sentence_a, sentence_b, device)
print(f"Cosine Similarity: {similarity:.4f}")

Cosine Similarity: 0.9988


## 8. Evaluation

In [55]:
from sentence_transformers import SentenceTransformer
pretrained_model = SentenceTransformer("all-MiniLM-L6-v2")

modules.json: 100%|██████████| 349/349 [00:00<?, ?B/s] 
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
config_sentence_transformers.json: 100%|██████████| 116/116 [00:00<00:00, 136kB/s]
README.md: 100%|██████████| 10.7k/10.7k [00:00<00:00, 10.7MB/s]
sentence_bert_config.json: 100%|██████████| 53.0/53.0 [00:00<?, ?B/s]
config.json: 100%|██████████| 612/612 [00:00<?, ?B/s] 
pytorch_model.bin: 100%|██████████| 90.9M/90.9M [00:13<00:00, 6.75MB/s]
  return self.fget.__get__(instance, owner)()
tokenizer_config.json: 100%|██████████| 350/350 [00:00<?, ?B/s] 
vocab.txt: 100%|██████████| 232k/232k [00:00<00:00, 5.63MB/s]
tokenizer.json: 100%|██████████| 466k/466k [00:00<00:00, 548kB/s]
special_tokens_map.json: 100%|██████████| 112/112 [00:00<?, ?B/s] 
1_Pooling/config.json: 100%|█

In [64]:
import torch
from sklearn.metrics.pairwise import cosine_similarity

def calculate_similarity2(model, sentence_a, sentence_b):
    encoded_a = model.encode(sentence_a)
    encoded_b = model.encode(sentence_b)
    return cosine_similarity(encoded_a.reshape(1, -1), encoded_b.reshape(1, -1))[0, 0]

# Example usage:
sentence_a = 'Your contribution helped make it possible for us to provide our students with a quality education.'
sentence_b = "Your contributions were of no help with our students' education."
similarity = calculate_similarity2(pretrained_model, sentence_a, sentence_b)
print(f"Cosine Similarity: {similarity:.4f}")

Cosine Similarity: 0.5605


In [71]:
for i in range(10):
    p, h, l = ['premise', 'hypothesis', 'label']
    p = raw_dataset['train'][p][i]
    h = raw_dataset['train'][h][i]
    l = raw_dataset['train'][l][i]
    l = f"{l}  ({['entailment; expected 1', 'neutral; expected 0.5', 'contradition; expected 0'][l]})"
    my_similarity = calculate_similarity(model, p, h, device)
    pt_similarity = calculate_similarity2(pretrained_model, p, h)
    print(f'Premise: \t{p}')
    print(f'Hypothesis: \t{h}')
    print(f'\tLabel: \t\t{l}')
    print(f'\tMy sim: \t{my_similarity}')
    print(f'\tPt sim: \t{pt_similarity}')
    print('\n\n')

Premise: 	well they're dangerous
Hypothesis: 	They offer nothing to worry about.
	Label: 		2  (contradition; expected 0)
	My sim: 	0.9972643852233887
	Pt sim: 	0.24803051352500916



Premise: 	A man wearing snow gear hiking.
Hypothesis: 	A man hikes in winter.
	Label: 		0  (entailment; expected 1)
	My sim: 	0.9988747239112854
	Pt sim: 	0.8531558513641357



Premise: 	A man in a blue shirt, jeans, and wearing a tool belt is climbing down a metal rod.
Hypothesis: 	A person is flying an airplane.
	Label: 		2  (contradition; expected 0)
	My sim: 	0.9990731477737427
	Pt sim: 	0.043636757880449295



Premise: 	A baby wearing a "my best buddy" shirt on a bed.
Hypothesis: 	THe baby is sitting in the highchair.
	Label: 		2  (contradition; expected 0)
	My sim: 	0.997869610786438
	Pt sim: 	0.40219008922576904



Premise: 	Tours originate from the center.
Hypothesis: 	Tours start in the plaza downtown.
	Label: 		1  (neutral; expected 0.5)
	My sim: 	0.9990047216415405
	Pt sim: 	0.6614198684692383



In [None]:
def transform_label(example):
    label_map = {0: 1,  # entailment sentences (label == 0) should have a cosine similarity of 1
                 1: 0,  # neutral sentences (label == 1) should have a cosine similarity of 0
                 2: -1  # contradiction sentences (label == 2) should have a cosine similarity of -1
                }
    
    example['label'] = label_map[example['label']]

    return example

In [76]:
from scipy.stats import spearmanr

result = []

my_similarities = []
pt_similarities = []
l_cossims = []

for sample in raw_dataset['test']:
    p, h, l = ['premise', 'hypothesis', 'label']
    p = sample[p]
    h = sample[h]
    l = sample[l]
    l_cossim = [1, 0.5, 0][l]

    my_similarity = calculate_similarity(model, p, h, device)
    pt_similarity = calculate_similarity2(pretrained_model, p, h)

    my_similarities.append(my_similarity)
    pt_similarities.append(pt_similarity)
    l_cossims.append(l_cossim)

correlation_my = spearmanr(my_similarities, l_cossims)[0]
correlation_pt = spearmanr(pt_similarities, l_cossims)[0]

print(f"Spearman correlation between my_similarity and l_cossim: {correlation_my:.4f}")
print(f"Spearman correlation between pt_similarity and l_cossim: {correlation_pt:.4f}")

Spearman correlation between my_similarity and l_cossim: 0.0079
Spearman correlation between pt_similarity and l_cossim: 0.1344
