# [Sentence-BERT](https://arxiv.org/pdf/1908.10084.pdf)

[Reference Code](https://www.pinecone.io/learn/series/nlp/train-sentence-transformers-softmax/)

In [1]:
import os
import math
import re
from   random import *
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

# Set GPU device
#os.environ["CUDA_VISIBLE_DEVICES"] = "0"

#os.environ['http_proxy']  = 'http://192.41.170.23:3128'
#os.environ['https_proxy'] = 'http://192.41.170.23:3128'
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device = 'cpu'
device

'cpu'

## 1. Data

### Train, Test, Validation 

In [3]:
import datasets
snli = datasets.load_dataset('snli')
mnli = datasets.load_dataset('glue', 'mnli')
mnli['train'].features, snli['train'].features

Downloading readme:   0%|          | 0.00/16.0k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/412k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/413k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/19.6M [00:00<?, ?B/s]

Generating test split:   0%|          | 0/10000 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/10000 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/550152 [00:00<?, ? examples/s]

({'premise': Value(dtype='string', id=None),
  'hypothesis': Value(dtype='string', id=None),
  'label': ClassLabel(names=['entailment', 'neutral', 'contradiction'], id=None),
  'idx': Value(dtype='int32', id=None)},
 {'premise': Value(dtype='string', id=None),
  'hypothesis': Value(dtype='string', id=None),
  'label': ClassLabel(names=['entailment', 'neutral', 'contradiction'], id=None)})

In [4]:
# List of datasets to remove 'idx' column from
mnli.column_names.keys()

dict_keys(['train', 'validation_matched', 'validation_mismatched', 'test_matched', 'test_mismatched'])

In [5]:
# Remove 'idx' column from each dataset
for column_names in mnli.column_names.keys():
    mnli[column_names] = mnli[column_names].remove_columns('idx')

In [6]:
mnli.column_names.keys()

dict_keys(['train', 'validation_matched', 'validation_mismatched', 'test_matched', 'test_mismatched'])

In [7]:
import numpy as np
np.unique(mnli['train']['label']), np.unique(snli['train']['label'])
#snli also have -1

(array([0, 1, 2]), array([-1,  0,  1,  2]))

In [8]:
# there are -1 values in the label feature, these are where no class could be decided so we remove
snli = snli.filter(
    lambda x: 0 if x['label'] == -1 else 1
)

mnli = mnli.filter(
    lambda x: 0 if x['label'] == -1 else 1
)

Filter:   0%|          | 0/10000 [00:00<?, ? examples/s]

Filter:   0%|          | 0/10000 [00:00<?, ? examples/s]

Filter:   0%|          | 0/550152 [00:00<?, ? examples/s]

Filter:   0%|          | 0/392702 [00:00<?, ? examples/s]

Filter:   0%|          | 0/9815 [00:00<?, ? examples/s]

Filter:   0%|          | 0/9832 [00:00<?, ? examples/s]

Filter:   0%|          | 0/9796 [00:00<?, ? examples/s]

Filter:   0%|          | 0/9847 [00:00<?, ? examples/s]

In [9]:
import numpy as np
np.unique(mnli['train']['label']), np.unique(snli['train']['label'])
#snli also have -1

(array([0, 1, 2]), array([0, 1, 2]))

In [10]:
# Assuming you have your two DatasetDict objects named snli and mnli
from datasets import DatasetDict
# Merge the two DatasetDict objects
raw_dataset = DatasetDict({
    'train': datasets.concatenate_datasets([snli['train'], mnli['train']]).shuffle(seed=55).select(list(range(1000))),
    'test': datasets.concatenate_datasets([snli['test'], mnli['test_mismatched']]).shuffle(seed=55).select(list(range(100))),
    'validation': datasets.concatenate_datasets([snli['validation'], mnli['validation_mismatched']]).shuffle(seed=55).select(list(range(1000)))
})
#remove .select(list(range(1000))) in order to use full dataset
# Now, merged_dataset_dict contains the combined datasets from snli and mnli
raw_dataset

DatasetDict({
    train: Dataset({
        features: ['premise', 'hypothesis', 'label'],
        num_rows: 1000
    })
    test: Dataset({
        features: ['premise', 'hypothesis', 'label'],
        num_rows: 100
    })
    validation: Dataset({
        features: ['premise', 'hypothesis', 'label'],
        num_rows: 1000
    })
})

## 2. Preprocessing

In [13]:
import torchtext

tokenizer = torchtext.data.utils.get_tokenizer('basic_english')
vocab = torch.load('./model/vocab.pth')

In [14]:
max_seq_length = 256  # Define the maximum sequence length for padding/truncation

def preprocess_function(examples):
    """
    Preprocesses input examples by tokenizing premises and hypotheses, generating input IDs, 
    attention masks, and converting labels for model training.

    Args:
        examples: A batch from the dataset, containing premises, hypotheses, and labels.

    Returns:
        A dictionary with preprocessed model inputs and labels.
    """
    
    # Tokenize the premise and clean special characters, then lowercase
    tokenized_premise = [tokenizer(re.sub("[.,!?\\-]", '', sent.lower())) for sent in examples['premise']]
    # Convert tokens to IDs, adding special tokens '[CLS]' at the start and '[SEP]' at the end of each sequence
    premise_input_ids = [[vocab['[CLS]']] + [vocab[token] for token in tokens] + [vocab['[SEP]']] for tokens in tokenized_premise]
    # Calculate the number of padding tokens needed to reach max_seq_length
    premise_n_pad = [max_seq_length - len(tokens) for tokens in premise_input_ids]
    # Generate attention masks (1 for tokens, 0 for padding)
    premise_attn_mask = [([1] * len(tokens)) + ([0] * n_pad) for tokens, n_pad in zip(premise_input_ids, premise_n_pad)]
    # Apply padding to input IDs to ensure uniform sequence length
    premise_input_ids = [tokens + ([0] * n_pad) for tokens, n_pad in zip(premise_input_ids, premise_n_pad)]

    # Repeat the process for hypotheses
    tokenized_hypothesis = [tokenizer(re.sub("[.,!?\\-]", '', sent.lower())) for sent in examples['hypothesis']]
    hypothesis_input_ids = [[vocab['[CLS]']] + [vocab[token] for token in tokens] + [vocab['[SEP]']] for tokens in tokenized_hypothesis]
    hypothesis_n_pad = [max_seq_length - len(tokens) for tokens in hypothesis_input_ids]
    hypothesis_attn_mask = [([1] * len(tokens)) + ([0] * n_pad) for tokens, n_pad in zip(hypothesis_input_ids, hypothesis_n_pad)]
    hypothesis_input_ids = [tokens + ([0] * n_pad) for tokens, n_pad in zip(hypothesis_input_ids, hypothesis_n_pad)]

    # Extract and return labels directly
    labels = examples["label"]

    return {
        "premise_input_ids": premise_input_ids,
        "premise_attention_mask": premise_attn_mask,
        "hypothesis_input_ids": hypothesis_input_ids,
        "hypothesis_attention_mask": hypothesis_attn_mask,
        "labels": labels
    }

# Apply the preprocessing function to each batch in the dataset
tokenized_datasets = raw_dataset.map(preprocess_function, batched=True)

# Remove original columns to keep only the processed ones and set the data format to PyTorch tensors
tokenized_datasets = tokenized_datasets.remove_columns(['premise', 'hypothesis', 'label'])
tokenized_datasets.set_format("torch")


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [15]:
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['premise_input_ids', 'premise_attention_mask', 'hypothesis_input_ids', 'hypothesis_attention_mask', 'labels'],
        num_rows: 1000
    })
    test: Dataset({
        features: ['premise_input_ids', 'premise_attention_mask', 'hypothesis_input_ids', 'hypothesis_attention_mask', 'labels'],
        num_rows: 100
    })
    validation: Dataset({
        features: ['premise_input_ids', 'premise_attention_mask', 'hypothesis_input_ids', 'hypothesis_attention_mask', 'labels'],
        num_rows: 1000
    })
})

## 3. Data loader

In [16]:
from torch.utils.data import DataLoader

# Set the batch size for loading the data
batch_size = 4

# Initialize the DataLoader for the training dataset
train_dataloader = DataLoader(
    tokenized_datasets['train'], 
    batch_size=batch_size, 
    shuffle=True  # Shuffling is important for the training dataset to reduce overfitting and improve generalization
)

# Initialize the DataLoader for the validation dataset
# Shuffling is typically not needed for validation and testing
eval_dataloader = DataLoader(
    tokenized_datasets['validation'], 
    batch_size=batch_size  # No shuffling for evaluation/validation
)

# Initialize the DataLoader for the testing dataset
test_dataloader = DataLoader(
    tokenized_datasets['test'], 
    batch_size=batch_size  # No shuffling for testing
)


In [17]:
for batch in train_dataloader:
    print(batch['premise_input_ids'].shape)
    print(batch['premise_attention_mask'].shape)
    print(batch['hypothesis_input_ids'].shape)
    print(batch['hypothesis_attention_mask'].shape)
    print(batch['labels'].shape)
    break

torch.Size([4, 256])
torch.Size([4, 256])
torch.Size([4, 256])
torch.Size([4, 256])
torch.Size([4])


## 4. Model

In [19]:
from bert import BERT  # Import the BERT model definition

# Specify the path where the model and its parameters are saved
save_path = './model/bert.pt'

# Load the saved model parameters and state dictionary
params, state_dict = torch.load(save_path, map_location=device)

# Recreate the model instance with the loaded parameters
model = BERT(**params, device=device).to(device)

# Load the saved state dictionary into the model
model.load_state_dict(state_dict)

model.eval()


BERT(
  (embedding): Embedding(
    (tok_embed): Embedding(6944, 768)
    (pos_embed): Embedding(256, 768)
    (seg_embed): Embedding(2, 768)
    (norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (layers): ModuleList(
    (0-5): 6 x EncoderLayer(
      (enc_self_attn): MultiHeadAttention(
        (W_Q): Linear(in_features=768, out_features=512, bias=True)
        (W_K): Linear(in_features=768, out_features=512, bias=True)
        (W_V): Linear(in_features=768, out_features=512, bias=True)
        (fc): Linear(in_features=512, out_features=768, bias=True)
        (layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      )
      (pos_ffn): PoswiseFeedForwardNet(
        (fc1): Linear(in_features=768, out_features=3072, bias=True)
        (fc2): Linear(in_features=3072, out_features=768, bias=True)
      )
    )
  )
  (fc): Linear(in_features=768, out_features=768, bias=True)
  (activ): Tanh()
  (linear): Linear(in_features=768, out_features=768, bias=Tru

### Pooling
SBERT adds a pooling operation to the output of BERT / RoBERTa to derive a fixed sized sentence embedding

In [20]:
# define mean pooling function
def mean_pool(token_embeds, attention_mask):
    # reshape attention_mask to cover 768-dimension embeddings
    in_mask = attention_mask.unsqueeze(-1).expand(
        token_embeds.size()
    ).float()
    # perform mean-pooling but exclude padding tokens (specified by in_mask)
    pool = torch.sum(token_embeds * in_mask, 1) / torch.clamp(
        in_mask.sum(1), min=1e-9
    )
    return pool

## 5. Loss Function

## Classification Objective Function 
We concatenate the sentence embeddings $u$ and $v$ with the element-wise difference  $\lvert u - v \rvert $ and multiply the result with the trainable weight  $ W_t ∈  \mathbb{R}^{3n \times k}  $:

$ o = \text{softmax}\left(W^T \cdot \left(u, v, \lvert u - v \rvert\right)\right) $

where $n$ is the dimension of the sentence embeddings and k the number of labels. We optimize cross-entropy loss. This structure is depicted in Figure 1.

## Regression Objective Function. 
The cosine similarity between the two sentence embeddings $u$ and $v$ is computed (Figure 2). We use means quared-error loss as the objective function.

(Manhatten / Euclidean distance, semantically  similar sentences can be found.)

<img src="./figures/sbert-architecture.png" >

In [21]:
def configurations(u,v):
    # build the |u-v| tensor
    uv = torch.sub(u, v)   # batch_size,hidden_dim
    uv_abs = torch.abs(uv) # batch_size,hidden_dim
    
    # concatenate u, v, |u-v|
    x = torch.cat([u, v, uv_abs], dim=-1) # batch_size, 3*hidden_dim
    return x

def cosine_similarity(u, v):
    dot_product = np.dot(u, v)
    norm_u = np.linalg.norm(u)
    norm_v = np.linalg.norm(v)
    similarity = dot_product / (norm_u * norm_v)
    return similarity

<img src="./figures/sbert-ablation.png" width="350" height="300">

In [22]:
classifier_head = torch.nn.Linear(768*3, 3).to(device)

optimizer = torch.optim.Adam(model.parameters(), lr=2e-5)
optimizer_classifier = torch.optim.Adam(classifier_head.parameters(), lr=2e-5)

criterion = nn.CrossEntropyLoss()

In [23]:
from transformers import get_linear_schedule_with_warmup

# and setup a warmup for the first ~10% steps
total_steps = int(len(raw_dataset) / batch_size)
warmup_steps = int(0.1 * total_steps)
scheduler = get_linear_schedule_with_warmup(
		optimizer, num_warmup_steps=warmup_steps,
  	num_training_steps=total_steps - warmup_steps
)

# then during the training loop we update the scheduler per step
scheduler.step()

scheduler_classifier = get_linear_schedule_with_warmup(
		optimizer_classifier, num_warmup_steps=warmup_steps,
  	num_training_steps=total_steps - warmup_steps
)

# then during the training loop we update the scheduler per step
scheduler_classifier.step()



## 6. Training

In [25]:
from tqdm.auto import tqdm

num_epoch = 1
# 1 epoch should be enough, increase if wanted
for epoch in range(num_epoch):
    model.train()  
    classifier_head.train()
    # initialize the dataloader loop with tqdm (tqdm == progress bar)
    for step, batch in enumerate(tqdm(train_dataloader, leave=True)):
        # zero all gradients on each new step
        optimizer.zero_grad()
        optimizer_classifier.zero_grad()
        
        # prepare batches and more all to the active device
        inputs_ids_a = batch['premise_input_ids'].to(device)
        inputs_ids_b = batch['hypothesis_input_ids'].to(device)
        attention_a = batch['premise_attention_mask'].to(device)
        attention_b = batch['hypothesis_attention_mask'].to(device)
        segment_ids = torch.zeros(batch_size, max_seq_length, dtype=torch.int32).to(device)  # each input contains only one sentence hence we define them all as sentence '0'
        label = batch['labels'].to(device)
        
        # extract token embeddings from BERT at last_hidden_state
        u_last_hidden_state = model.get_last_hidden_state(inputs_ids_a, segment_ids)  
        v_last_hidden_state = model.get_last_hidden_state(inputs_ids_b, segment_ids)  

        # u_last_hidden_state = u.last_hidden_state # all token embeddings A = batch_size, seq_len, hidden_dim
        # v_last_hidden_state = v.last_hidden_state # all token embeddings B = batch_size, seq_len, hidden_dim

         # get the mean pooled vectors
        u_mean_pool = mean_pool(u_last_hidden_state, attention_a) # batch_size, hidden_dim
        v_mean_pool = mean_pool(v_last_hidden_state, attention_b) # batch_size, hidden_dim
        
        # build the |u-v| tensor
        uv = torch.sub(u_mean_pool, v_mean_pool)   # batch_size,hidden_dim
        uv_abs = torch.abs(uv) # batch_size,hidden_dim
        
        # concatenate u, v, |u-v|
        x = torch.cat([u_mean_pool, v_mean_pool, uv_abs], dim=-1) # batch_size, 3*hidden_dim
        
        # process concatenated tensor through classifier_head
        x = classifier_head(x) #batch_size, classifer
        
        # calculate the 'softmax-loss' between predicted and true label
        loss = criterion(x, label)
        
        # using loss, calculate gradients and then optimizerize
        loss.backward()
        optimizer.step()
        optimizer_classifier.step()

        scheduler.step() # update learning rate scheduler
        scheduler_classifier.step()
        
    print(f'Epoch: {epoch + 1} | loss = {loss.item():.6f}')

  0%|          | 0/250 [00:00<?, ?it/s]

Epoch: 1 | loss = 7.144242


In [27]:
# Saving the classifier head component of the model
classifier_head_path = './model/classifier_head_bert.pt'  
torch.save(classifier_head.state_dict(), classifier_head_path) 

# Save the entire custom BERT model configuration and state
custom_model_path = './model/custom_s_bert.pt' 
torch.save({'params': model.params, 'state_dict': model.state_dict()}, custom_model_path)


In [28]:
model.eval()
classifier_head.eval()
total_similarity = 0
with torch.no_grad():
    for step, batch in enumerate(eval_dataloader):
        # prepare batches and more all to the active device
        inputs_ids_a = batch['premise_input_ids'].to(device)
        inputs_ids_b = batch['hypothesis_input_ids'].to(device)
        attention_a = batch['premise_attention_mask'].to(device)
        attention_b = batch['hypothesis_attention_mask'].to(device)
        segment_ids = torch.zeros(batch_size, max_seq_length, dtype=torch.int32).to(device)
        label = batch['labels'].to(device)
        
        # extract token embeddings from BERT at last_hidden_state
        u = model.get_last_hidden_state(inputs_ids_a, segment_ids)  # all token embeddings A = batch_size, seq_len, hidden_dim
        v = model.get_last_hidden_state(inputs_ids_b, segment_ids)  # all token embeddings B = batch_size, seq_len, hidden_dim

        # get the mean pooled vectors
        u_mean_pool = mean_pool(u, attention_a).detach().cpu().numpy().reshape(-1) # batch_size, hidden_dim
        v_mean_pool = mean_pool(v, attention_b).detach().cpu().numpy().reshape(-1) # batch_size, hidden_dim

        similarity_score = cosine_similarity(u_mean_pool, v_mean_pool)
        total_similarity += similarity_score
    
average_similarity = total_similarity / len(eval_dataloader)
print(f"Average Cosine Similarity: {average_similarity:.4f}")

Average Cosine Similarity: 1.0000


## 7. Inference

In [31]:
model_path = './model/custom_s_bert.pt'
# Load the saved model data
saved_model = torch.load(model_path)

# Extract the 'params' and 'state_dict' from the loaded model data
params = saved_model['params']
state_dict = saved_model['state_dict']

device = params.pop('device', device)  # Use the loaded 'device' or fallback to the global 'device' variable

# Reconstruct the model using the loaded parameters, now explicitly including 'device'
model = BERT(**params, device=device).to(device)

# Load the saved state dictionary into the model
model.load_state_dict(state_dict)


<All keys matched successfully>

In [32]:
def get_inputs(sentence, tokenizer, vocab, max_seq_length):
    tokens = tokenizer(re.sub("[.,!?\\-]", '', sentence.lower()))
    input_ids = [vocab['[CLS]']] + [vocab[token] for token in tokens] + [vocab['[SEP]']]
    n_pad = max_seq_length - len(input_ids)
    attention_mask = ([1] * len(input_ids)) + ([0] * n_pad)
    input_ids = input_ids + ([0] * n_pad)

    return {'input_ids': torch.LongTensor(input_ids).reshape(1, -1),
            'attention_mask': torch.LongTensor(attention_mask).reshape(1, -1)}

In [33]:
import torch
from sklearn.metrics.pairwise import cosine_similarity

def calculate_similarity(model, tokenizer, vocab, sentence_a, sentence_b, device):
    # Tokenize and convert sentences to input IDs and attention masks
    inputs_a = get_inputs(sentence_a, tokenizer, vocab, max_seq_length)
    inputs_b = get_inputs(sentence_b, tokenizer, vocab, max_seq_length)
    

    # Move input IDs and attention masks to the active device
    inputs_ids_a = inputs_a['input_ids'].to(device)
    attention_a = inputs_a['attention_mask'].to(device)
    inputs_ids_b = inputs_b['input_ids'].to(device)
    attention_b = inputs_b['attention_mask'].to(device)
    segment_ids = torch.zeros(1, max_seq_length, dtype=torch.int32).to(device)

    # Extract token embeddings from BERT
    u = model.get_last_hidden_state(inputs_ids_a, segment_ids)
    v = model.get_last_hidden_state(inputs_ids_b, segment_ids)

    u = mean_pool(u, attention_a).detach().cpu().numpy().reshape(-1)  # batch_size, hidden_dim
    v = mean_pool(v, attention_b).detach().cpu().numpy().reshape(-1)  # batch_size, hidden_dim

    # Calculate cosine similarity
    similarity_score = cosine_similarity(u.reshape(1, -1), v.reshape(1, -1))[0, 0]

    return similarity_score
# Example usage:
sentence_a = 'Your contribution helped make it possible for us to provide our students with a quality education.'
sentence_b = "Your contributions were of no help with our students' education."
similarity = calculate_similarity(model, tokenizer,vocab, sentence_a, sentence_b, device)
print(f"Cosine Similarity: {similarity:.4f}")

Cosine Similarity: 1.0000


## 8. Evaluation and Analysis

### Performance Evaluation with Custom Predictions


In [43]:
def predict_similarity_labels(model, inputs_ids_a, inputs_ids_b, attention_a, attention_b, segment_ids):
    """
    Predict similarity labels for pairs of sentences using a model.
    
    Args:
        model: The BERT model or a similar model that outputs embeddings.
        inputs_ids_a (Tensor): Input IDs for the first set of sentences.
        inputs_ids_b (Tensor): Input IDs for the second set of sentences.
        attention_a (Tensor): Attention mask for the first set of sentences.
        attention_b (Tensor): Attention mask for the second set of sentences.
        segment_ids (Tensor): Segment IDs for distinguishing sentence pairs.
    
    Returns:
        Tensor: Predicted classes based on cosine similarity scores.
    """
    # Extract token embeddings from the model for both sets of input IDs.
    u_last_hidden_state = model.get_last_hidden_state(inputs_ids_a, segment_ids)  
    v_last_hidden_state = model.get_last_hidden_state(inputs_ids_b, segment_ids)  

    # Apply mean pooling to the extracted embeddings to get sentence-level representations.
    u_mean_pool = mean_pool(u_last_hidden_state, attention_a)  # Shape: [batch_size, hidden_dim]
    v_mean_pool = mean_pool(v_last_hidden_state, attention_b)  # Shape: [batch_size, hidden_dim]

    # Compute cosine similarity between the pooled embeddings of each sentence pair.
    similarity_scores = torch.cosine_similarity(u_mean_pool, v_mean_pool, dim=1)

    # Classify the similarity scores into categories based on predefined thresholds.
    predictions = [0 if score >= 0.5 else 1 if score > -0.5 else 2 for score in similarity_scores]
    predicted_classes = torch.tensor(predictions)  # Convert the list of predictions to a tensor.

    return predicted_classes.view(-1, 1)  # Reshape for compatibility with evaluation metrics.

def evaluate_model_performance(model, dataloader):
    """
    Evaluate the model's performance on a given dataset.
    
    Args:
        model: The model to evaluate.
        dataloader (DataLoader): DataLoader containing the dataset for evaluation.
    """
    model.eval()  # Set the model to evaluation mode.
    all_labels = []  # Collect all true labels from the dataset.
    all_predictions = []  # Collect all predictions made by the model.

    with torch.no_grad():  # Disable gradient calculation for efficiency.
        for batch in dataloader:
            # Extract input IDs, attention masks, and labels from the current batch.
            inputs_ids_a = batch['premise_input_ids'].to(device)
            inputs_ids_b = batch['hypothesis_input_ids'].to(device)
            attention_a = batch['premise_attention_mask'].to(device)
            attention_b = batch['hypothesis_attention_mask'].to(device)
            segment_ids = torch.zeros(batch_size, max_seq_length, dtype=torch.int32).to(device)  # Initialize segment IDs.
            labels = batch['labels'].to(device)

            # Predict similarity scores for each pair of sentences in the batch.
            similarity_scores = predict_similarity_labels(model, inputs_ids_a, inputs_ids_b, attention_a, attention_b, segment_ids)

            # Convert similarity scores to predictions and extend the collection of predictions and labels.
            predictions = torch.argmax(similarity_scores, dim=1).cpu().numpy()  # Determine predicted labels.
            all_labels.extend(labels.cpu().numpy())  # Append true labels to the collection.
            all_predictions.extend(predictions)  # Append predicted labels to the collection.

    # Print a classification report comparing true labels and model predictions.
    print(classification_report(all_labels, all_predictions))


In [44]:
# Evaluate on the test dataset
test_dataloader = DataLoader(
    tokenized_datasets['test'],
    batch_size=batch_size
)
evaluate_model_performance(model, test_dataloader)

              precision    recall  f1-score   support

           0       0.33      1.00      0.50        33
           1       0.00      0.00      0.00        38
           2       0.00      0.00      0.00        29

    accuracy                           0.33       100
   macro avg       0.11      0.33      0.17       100
weighted avg       0.11      0.33      0.16       100



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


### Interpretation of Metrics

Precision measures the accuracy of the positive predictions for each class. For class 0, it's 0.33, meaning that when the model predicts class 0, it is correct 33% of the time. Classes 1 and 2 have a precision of 0.00, indicating the model never correctly predicts these classes.

Recall indicates the fraction of positives that were correctly identified. For class 0, it's 1.00, showing that the model identified all actual instances of class 0 but possibly at the expense of misclassifying instances from other classes into class 0.

F1-Score provides a balance between precision and recall. For class 0, it's 0.50, which is higher compared to classes 1 and 2, which both have an F1-score of 0.00. This suggests the model's predictions are biased toward class 0.

Support shows the number of actual occurrences of each class in the dataset. The distribution seems relatively balanced, which indicates that the issue isn't due to class imbalance.

Accuracy of 0.33 across the dataset shows that the model only correctly predicts the class about one-third of the time, which is not better than random guessing in a three-class scenario.

### Comparing with Another Pre-trained Model


In [36]:
!pip install sentence-transformers

Collecting sentence-transformers
  Downloading sentence_transformers-2.6.1-py3-none-any.whl.metadata (11 kB)
Downloading sentence_transformers-2.6.1-py3-none-any.whl (163 kB)
   ---------------------------------------- 163.3/163.3 kB 9.6 MB/s eta 0:00:00
Installing collected packages: sentence-transformers
Successfully installed sentence-transformers-2.6.1


In [52]:
from sentence_transformers import SentenceTransformer

# Initialize the pre-trained model from Sentence Transformers
other_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

def calculate_similarity_other(model, sentence_a, sentence_b):
    """
    Calculate cosine similarity between two sentences using a pre-trained model.
    """
    embeddings = model.encode([sentence_a, sentence_b])
    return cosine_similarity([embeddings[0]], [embeddings[1]])[0][0]



In [53]:
sentence_a = "A gentle breeze soothes my mind."
sentence_b = "A gentle breeze whispers secrets in its own mysterious language."

# Calculate cosine similarity using  custom model
similarity = calculate_similarity(model, tokenizer, vocab, sentence_a, sentence_b, device)

# Calculate cosine similarity using another pre-trained model from Sentence Transformers
other_similarity = calculate_similarity_other(other_model, sentence_a, sentence_b)

# Print the cosine similarity scores
print(f"Cosine Similarity with custom model: {similarity:.4f}")
print(f"Cosine Similarity with SentenceTransformer model: {other_similarity:.4f}")

Cosine Similarity with custom model: 1.0000
Cosine Similarity with SentenceTransformer model: 0.5258


In [54]:
sentence_a = "Your contribution helped make it possible for us to provide our students with a quality education."
sentence_b = "Your contribution helped make it possible for us to provide our students with a quality education."

# Calculate cosine similarity using  custom model
similarity = calculate_similarity(model, tokenizer, vocab, sentence_a, sentence_b, device)

# Calculate cosine similarity using another pre-trained model from Sentence Transformers
other_similarity = calculate_similarity_other(other_model, sentence_a, sentence_b)

# Print the cosine similarity scores
print(f"Cosine Similarity with custom model: {similarity:.4f}")
print(f"Cosine Similarity with SentenceTransformer model: {other_similarity:.4f}")

Cosine Similarity with custom model: 1.0000
Cosine Similarity with SentenceTransformer model: 1.0000


# Inference Cosine Similarity Analysis

## Similar Sentences

- **Custom Model**: Demonstrates exceptionally high cosine similarity scores (close to 1) for similar sentences, indicating strong performance in capturing semantic similarity. This suggests that the model effectively identifies and represents the similarities between sentences that are closely related in context.
  
- **Pretrained Model**: Similarly, shows very high cosine similarity (almost 1) for similar sentences, aligning with expectations for models trained on large and diverse datasets, thereby validating its capability to discern semantic similarity accurately.

## Dissimilar Sentences

- **Custom Model**: Unexpectedly maintains a high cosine similarity score during inference with dissimilar sentences. Ideally, the model should register a lower similarity score for sentences that are contextually unrelated, indicating a potential area for model adjustment to better distinguish between semantically unrelated sentences.

- **Pretrained Model**: Exhibits lower cosine similarity scores for dissimilar sentences compared to the custom model, which is a desired outcome. This performance suggests that the pretrained model is more attuned to differences in semantic context, thus better distinguishing between unrelated sentences.


## Analyze the impact of hyperparameter choices on the model’s performance.



## Key Hyperparameters and Their Impacts

### Learning Rate
- **Impact**: Crucial for determining the optimization step size. Too high can cause overshooting, too low may lead to slow convergence.
- **Strategy**: Employ adaptive learning rate methods and consider schedules like warm-up or decay to fine-tune training.

### Batch Size
- **Impact**: Influences training stability and speed. Smaller sizes may improve generalization but introduce noise in gradient estimates.
- **Strategy**: Experiment with sizes and use gradient accumulation for larger effective batches on constrained hardware.

### Number of Epochs
- **Impact**: Determines the number of training cycles through the dataset. Balancing is key to avoid underfitting or overfitting.
- **Strategy**: Implement early stopping based on validation performance to prevent overfitting.

### Model Architecture Parameters (n_layers, n_heads, d_model, d_ff, d_k, d_v)
- **Impact**: These define the model's capacity. An optimal configuration is vital for learning complex patterns without overfitting.
- **Strategy**: Start with known configurations for similar tasks. Adjust based on validation performance and apply regularization techniques as necessary.

### Regularization Parameters
- **Impact**: Controls the model's generalization capabilities to prevent over-reliance on specific features.
- **Strategy**: Tune based on validation set performance. Adjust dropout rates in conjunction with model size and learning rate.


##  Discuss any limitations and improvements or modifications.

- **Dataset Imbalance**: Exploring data augmentation techniques could help address imbalance and improve model robustness.
- **Computational Resources**: Utilizing more efficient architectures or cloud-based resources could alleviate computational constraints.
- **Hyperparameter Optimization**: Employing automated hyperparameter tuning methods might identify configurations that enhance model performance.
- **Model Generalization**: Further fine-tuning on diverse datasets or incorporating domain-specific pretraining could improve model generalization.


## Overview

## Datasets Used

### BookCorpus
- **Source**: [Hugging Face Datasets](https://huggingface.co/datasets/bookcorpus)
- **Description**: The BookCorpus dataset contains a large collection of books from various genres, offering rich textual content for training language models and semantic analysis tasks.
- **Usage**: We used the first 10,000 samples from the 'train' split for training our model.

### SNLI (Stanford Natural Language Inference)
- **Source**: [Hugging Face Datasets](https://huggingface.co/datasets/snli)
- **Description**: A collection of sentence pairs labeled with entailment relations, serving as a foundational dataset for natural language inference tasks.

### MNLI (MultiNLI)
- **Source**: [Hugging Face Datasets](https://huggingface.co/datasets/multi_nli)
- **Description**: An extension of SNLI, MNLI provides sentence pairs across a wider range of genres, challenging the model's generalization across different domains.

## Model Details

### Custom Model Configuration
- **Hyperparameters**:
  - `n_layers`: 6
  - `n_heads`: 8
  - `d_model`: 768
  - `d_ff`: 3072 (768 * 4)
  - `d_k` and `d_v`: 64
  - `n_segments`: 2
