In [1]:
import torch
from torch.utils.data import DataLoader
from transformers import BertTokenizer
from sklearn.metrics import accuracy_score, classification_report
from datasets import load_dataset
from tqdm.auto import tqdm
from model import SBERT

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load Tokenizer
tokenizer = BertTokenizer.from_pretrained("distilbert-base-uncased")

# Load SNLI dataset
dataset = load_dataset("snli")

# Tokenization function
def tokenize_function(examples):
    return tokenizer(examples["premise"], examples["hypothesis"], padding="max_length", truncation=True, max_length=64)

# Tokenize dataset
tokenized_datasets = dataset.map(tokenize_function, batched=True)

# Remove unnecessary columns
columns_to_remove = ['premise', 'hypothesis']
available_columns = tokenized_datasets['train'].column_names
columns_to_remove = [col for col in columns_to_remove if col in available_columns]
tokenized_datasets = tokenized_datasets.remove_columns(columns_to_remove)

# Filter out invalid labels
def filter_invalid_labels(example):
    return example["label"] in [0, 1, 2]

tokenized_datasets = tokenized_datasets.filter(filter_invalid_labels)

# Format dataset for PyTorch
tokenized_datasets.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])

# Create evaluation dataloader
eval_subset = tokenized_datasets['validation'].select(range(min(10000, len(tokenized_datasets['validation']))))
eval_dataloader = DataLoader(eval_subset, batch_size=4, num_workers=8)

print("Dataset Loaded & Tokenized!")

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'DistilBertTokenizer'. 
The class this function is called from is 'BertTokenizer'.


Dataset Loaded & Tokenized!


In [3]:
# Load Model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = SBERT().to(device)
model.load_state_dict(torch.load("sbert_task2.pth", map_location=device))
model.eval()

print("Model Loaded!")

You are using a model of type distilbert to instantiate a model of type bert. This is not supported for all configurations of models and can yield errors.
Some weights of BertModel were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['embeddings.LayerNorm.bias', 'embeddings.LayerNorm.weight', 'embeddings.position_embeddings.weight', 'embeddings.token_type_embeddings.weight', 'embeddings.word_embeddings.weight', 'encoder.layer.0.attention.output.LayerNorm.bias', 'encoder.layer.0.attention.output.LayerNorm.weight', 'encoder.layer.0.attention.output.dense.bias', 'encoder.layer.0.attention.output.dense.weight', 'encoder.layer.0.attention.self.key.bias', 'encoder.layer.0.attention.self.key.weight', 'encoder.layer.0.attention.self.query.bias', 'encoder.layer.0.attention.self.query.weight', 'encoder.layer.0.attention.self.value.bias', 'encoder.layer.0.attention.self.value.weight', 'encoder.layer.0.intermediate.dense.bias', 'encoder.layer.0.inte

Model Loaded!


In [4]:
all_preds = []
all_labels = []

with torch.no_grad():
    for batch in tqdm(eval_dataloader):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)

        mid_point = input_ids.shape[1] // 2
        input_ids_a, input_ids_b = input_ids[:, :mid_point], input_ids[:, mid_point:]
        attention_mask_a, attention_mask_b = attention_mask[:, :mid_point], attention_mask[:, mid_point:]

        outputs = model(input_ids_a, attention_mask_a, input_ids_b, attention_mask_b)
        preds = torch.argmax(outputs, dim=1)

        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())

print("Evaluation Completed!")

100%|██████████| 2461/2461 [09:21<00:00,  4.38it/s]


Evaluation Completed!


In [5]:
# Compute Metrics
accuracy = accuracy_score(all_labels, all_preds)
print(f"Validation Accuracy: {accuracy:.4f}")

report = classification_report(all_labels, all_preds, target_names=["Entailment", "Neutral", "Contradiction"])
print(report)

# Save results to a file
with open("evaluation_results.txt", "w") as f:
    f.write(f"Validation Accuracy: {accuracy:.4f}\n")
    f.write(report)

Validation Accuracy: 0.3287
               precision    recall  f1-score   support

   Entailment       0.00      0.00      0.00      3329
      Neutral       0.33      1.00      0.49      3235
Contradiction       0.00      0.00      0.00      3278

     accuracy                           0.33      9842
    macro avg       0.11      0.33      0.16      9842
 weighted avg       0.11      0.33      0.16      9842



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [6]:
import pandas as pd

# Create Performance Table
performance_data = {"Model Type": ["Our Model"], "SNLI Performance": [f"{accuracy:.4f}"]}
df = pd.DataFrame(performance_data)

# Display Table
df

Unnamed: 0,Model Type,SNLI Performance
0,Our Model,0.3287
