In [None]:
import collections
import random
import re
import string
import time

import numpy as np
import pandas as pd
import torch
from datasets import load_dataset
from sklearn.metrics import precision_score, recall_score,  f1_score
from transformers import AutoModelForSequenceClassification, AutoTokenizer, AutoModelForQuestionAnswering
from transformers import pipeline


# Seed every random number generator used in this notebook so runs are repeatable
SEED = 42
for _seed_fn in (random.seed, np.random.seed, torch.manual_seed):
    _seed_fn(SEED)

_cuda_ok = torch.cuda.is_available()
if _cuda_ok:
    # Seed the CUDA generators as well when a GPU is present
    torch.cuda.manual_seed_all(SEED)

# Pick the compute device: GPU when available, otherwise CPU
device = torch.device("cuda" if _cuda_ok else "cpu")
print(f"Using device: {device}")

Using device: cpu


## Sentiment Analysis Benchmark

- Model: DistilBERT fine-tuned on the SST-2 dataset.
- This model classifies text as having positive or negative sentiment.

In [None]:
model_name = "distilbert-base-uncased-finetuned-sst-2-english"

tokenizer_sentiment = AutoTokenizer.from_pretrained(model_name)
model_sentiment = AutoModelForSequenceClassification.from_pretrained(model_name)
# The tokenization helper moves its tensors to `device`; the model has to live on
# the same device or the forward pass fails when a GPU is selected.
model_sentiment.to(device)

print("Sentiment analysis model and tokenizer are ready.")

Sentiment analysis model and tokenizer are ready.


The tokenizer converts the input text into token IDs, applying padding and truncation, and returns the result as PyTorch tensors.

`sequence_length` (default 128) is the maximum sequence length argument of `tokenize_sentiment_data`.

Each tensor in the tokenized output is then moved to `device` using a dictionary comprehension, and the resulting dictionary is returned.

In [None]:
def tokenize_sentiment_data(input_text, tokenizer, sequence_length=128):
  """Tokenize text for the sentiment model and move the tensors to `device`.

  Args:
    input_text: text (or batch of texts) passed straight to `tokenizer`.
    tokenizer: callable tokenizer returning a mapping of name -> tensor.
    sequence_length: maximum number of tokens to keep; longer inputs are
      truncated to this length.

  Returns:
    dict mapping each tokenizer output name to its tensor on `device`.
  """
  tokenized = tokenizer(
      input_text,
      return_tensors="pt",
      padding=True,
      truncation=True,
      # Previously `sequence_length` was accepted but never forwarded, so the
      # documented limit had no effect.
      max_length=sequence_length,
  )
  return {key: value.to(device) for key, value in tokenized.items()}

## Metrics

- precision_score measures the proportion of correctly predicted positive cases out of all predicted positive cases

- recall_score measures the proportion of actual positive cases correctly identified.

In [None]:
def calculate_metrics(true_labels, predictions):
  """Return `(precision, recall)` for the predictions against the true labels.

  Both values come from scikit-learn's `precision_score` and `recall_score`,
  called with their default settings.
  """
  return (
      precision_score(true_labels, predictions),
      recall_score(true_labels, predictions),
  )

## Evaluation

The model is set to evaluation mode using model.eval()

Generate model predictions by passing inputs to model and extracting the predicted label using argmax.

Append prediction to predictions and item['label'] to true_labels.

In [None]:
def evaluate_sentiment(model, tokenizer, dataset):
  """Evaluate a sentiment classifier one example at a time.

  Args:
    model: torch module called as `model(**inputs)`; its output must expose
      a `logits` attribute.
    tokenizer: tokenizer forwarded to `tokenize_sentiment_data`.
    dataset: iterable of items with `'sentence'` and `'label'` keys.

  Returns:
    Tuple `(accuracy, precision, recall)`.

  Raises:
    ValueError: if `dataset` yields no items.
  """
  model.eval()  # Set model to evaluation mode

  # Run on whatever device the model actually lives on, so inputs and weights
  # can never end up on different devices.
  first_param = next(model.parameters(), None)
  model_device = first_param.device if first_param is not None else torch.device("cpu")

  predictions = []
  true_labels = []

  # Inference only: skip autograd bookkeeping to save memory and time.
  with torch.no_grad():
    for item in dataset:
      inputs = tokenize_sentiment_data(item['sentence'], tokenizer)
      inputs = {key: value.to(model_device) for key, value in inputs.items()}

      outputs = model(**inputs)
      predictions.append(outputs.logits.argmax(-1).item())
      true_labels.append(item['label'])

  if not true_labels:
    raise ValueError("evaluate_sentiment received an empty dataset")

  correct = sum(int(pred == label) for pred, label in zip(predictions, true_labels))
  accuracy = correct / len(true_labels)

  precision, recall = calculate_metrics(true_labels, predictions)

  return accuracy, precision, recall

In [None]:
# Load the SST-2 validation split (sentiment analysis dataset from GLUE)
sentiment_validation = load_dataset("glue", "sst2", split="validation")

# Sample a subset for faster evaluation (e.g., 500 samples).
# A dedicated generator seeded with SEED is used instead of the global `random`
# state, so this cell selects the same subset every time it runs, regardless of
# how many random draws other cells have made before it.
sample_size = 500
subset_rng = random.Random(SEED)
sampled_indices = subset_rng.sample(
    range(len(sentiment_validation)),
    min(sample_size, len(sentiment_validation)),
)
sentiment_dataset = sentiment_validation.select(sampled_indices)

# Evaluate the model
accuracy, precision, recall = evaluate_sentiment(model_sentiment, tokenizer_sentiment, sentiment_dataset)

print(f"Sentiment Analysis Accuracy: {accuracy:.2f}")
print(f"Sentiment Analysis Precision: {precision:.2f}")
print(f"Sentiment Analysis Recall: {recall:.2f}")

README.md: 0.00B [00:00, ?B/s]

sst2/train-00000-of-00001.parquet:   0%|          | 0.00/3.11M [00:00<?, ?B/s]

sst2/validation-00000-of-00001.parquet:   0%|          | 0.00/72.8k [00:00<?, ?B/s]

sst2/test-00000-of-00001.parquet:   0%|          | 0.00/148k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/67349 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/872 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1821 [00:00<?, ? examples/s]

Sentiment Analysis Accuracy: 0.91
Sentiment Analysis Precision: 0.90
Sentiment Analysis Recall: 0.92


# Question Answering Benchmark

- model name : "distilbert-base-uncased-distilled-squad"
- SQuAD dataset



In [None]:
# Define the model for question answering (SQuAD).
# The checkpoint name gets its own variable so `model_qna` only ever refers to
# the loaded model, never to a string.
qna_model_name = "distilbert-base-uncased-distilled-squad"
tokenizer_qna = AutoTokenizer.from_pretrained(qna_model_name)
model_qna = AutoModelForQuestionAnswering.from_pretrained(qna_model_name)
model_qna.to(device)

print("Question answering model and tokenizer are ready.")

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/451 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/265M [00:00<?, ?B/s]

Question answering model and tokenizer are ready.


In [None]:
def _normalize_answer(text):
    """Lower-case, strip punctuation, drop English articles and collapse whitespace."""
    text = text.lower()
    text = "".join(ch for ch in text if ch not in string.punctuation)
    text = re.sub(r"\b(a|an|the)\b", " ", text)
    return " ".join(text.split())


def _token_f1(prediction, reference):
    """Token-overlap F1 between a predicted and a reference answer string."""
    pred_tokens = _normalize_answer(prediction).split()
    ref_tokens = _normalize_answer(reference).split()
    if not pred_tokens or not ref_tokens:
        # Both empty counts as a perfect match; exactly one empty scores zero.
        return float(pred_tokens == ref_tokens)
    overlap = sum((collections.Counter(pred_tokens) & collections.Counter(ref_tokens)).values())
    if overlap == 0:
        return 0.0
    precision = overlap / len(pred_tokens)
    recall = overlap / len(ref_tokens)
    return 2 * precision * recall / (precision + recall)


def evaluate_qna(model, tokenizer, dataset):
    """Evaluate an extractive question-answering model with exact match and token F1.

    The previous version scored "F1" with scikit-learn's classification
    `f1_score` on a single pair of raw strings, which is 1.0 only when the two
    strings are byte-identical; it was a case-sensitive exact match, not an
    overlap score. This version computes token-overlap F1 on normalized text
    and, for both metrics, takes the best score over every reference answer
    of an item.

    Args:
        model: torch module called as `model(**inputs)`; its output must expose
            `start_logits` and `end_logits`.
        tokenizer: callable as `tokenizer(question, context, ...)` returning a
            mapping with an `'input_ids'` tensor, and providing `decode`.
        dataset: iterable of items with `'question'`, `'context'` and
            `'answers'` keys, where `answers['text']` is a list of strings.

    Returns:
        Tuple `(accuracy, avg_exact_match, avg_f1)`. Accuracy equals the
        exact-match rate and is kept for backward compatibility.

    Raises:
        ValueError: if `dataset` yields no items.
    """
    model.eval()  # Set model to evaluation mode

    # Use the device the model lives on rather than a notebook-level global.
    first_param = next(model.parameters(), None)
    model_device = first_param.device if first_param is not None else torch.device("cpu")

    exact_matches = []
    f1_scores = []

    # Inference only: skip autograd bookkeeping.
    with torch.no_grad():
        for item in dataset:
            question = item['question']
            context = item['context']
            # Guard against items without any reference answer.
            references = list(item['answers']['text']) or [""]

            inputs = tokenizer(question, context, return_tensors="pt", padding=True, truncation=True, max_length=512)
            inputs = {key: value.to(model_device) for key, value in inputs.items()}
            outputs = model(**inputs)

            start_idx = int(torch.argmax(outputs.start_logits))
            end_idx = int(torch.argmax(outputs.end_logits))

            if end_idx < start_idx:
                # Independent argmaxes can yield an inverted span: no answer.
                predicted_answer = ""
            else:
                span_ids = inputs['input_ids'][0][start_idx:end_idx + 1].tolist()
                # NOTE(review): the decoded text is rebuilt from tokens, so spacing
                # around punctuation may differ from the gold text; confirm whether
                # slicing the context by character offsets is worth the extra code.
                predicted_answer = tokenizer.decode(span_ids, skip_special_tokens=True)

            normalized_prediction = _normalize_answer(predicted_answer)
            exact_matches.append(
                int(any(normalized_prediction == _normalize_answer(ref) for ref in references))
            )
            f1_scores.append(max(_token_f1(predicted_answer, ref) for ref in references))

    if not exact_matches:
        raise ValueError("evaluate_qna received an empty dataset")

    avg_exact_match = sum(exact_matches) / len(exact_matches)
    avg_f1 = sum(f1_scores) / len(f1_scores)
    accuracy = avg_exact_match

    return accuracy, avg_exact_match, avg_f1


In [None]:
# Load the SQuAD validation split (question answering dataset)
qna_validation = load_dataset("squad", split="validation")

# Sample a subset for faster evaluation (e.g., 100 samples).
# A dedicated generator seeded with SEED is used instead of the global `random`
# state, so this cell selects the same subset every time it runs, regardless of
# how many random draws other cells have made before it.
sample_size = 100
subset_rng = random.Random(SEED)
sampled_indices = subset_rng.sample(
    range(len(qna_validation)),
    min(sample_size, len(qna_validation)),
)
qna_dataset = qna_validation.select(sampled_indices)

# Evaluate the model
qna_accuracy, qna_exact_match, qna_f1 = evaluate_qna(model_qna, tokenizer_qna, qna_dataset)

print(f"Question Answering Accuracy: {qna_accuracy:.2f}")
print(f"Question Answering Exact Match: {qna_exact_match:.2f}")
print(f"Question Answering F1 Score: {qna_f1:.2f}")

README.md: 0.00B [00:00, ?B/s]

plain_text/train-00000-of-00001.parquet:   0%|          | 0.00/14.5M [00:00<?, ?B/s]

plain_text/validation-00000-of-00001.par(…):   0%|          | 0.00/1.82M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/87599 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/10570 [00:00<?, ? examples/s]

Question Answering Accuracy: 0.56
Question Answering Exact Match: 0.56
Question Answering F1 Score: 0.28


In [None]:
def run_all_benchmarks():
    """Run the sentiment and question-answering benchmarks back to back.

    Uses the notebook-level models, tokenizers and sampled datasets.

    Returns:
        Tuple `(sentiment_results, qna_results)`; each is a dict with a single
        task key mapping metric names to values.
    """
    # Sentiment analysis
    print("\n=== Running Sentiment Analysis Benchmark ===")
    sentiment_scores = evaluate_sentiment(model_sentiment, tokenizer_sentiment, sentiment_dataset)
    sentiment_results = {
        "sentiment_analysis": dict(zip(("accuracy", "precision", "recall"), sentiment_scores))
    }

    # Question answering
    print("\n=== Running Question Answering Benchmark ===")
    qna_scores = evaluate_qna(model_qna, tokenizer_qna, qna_dataset)
    qna_results = {
        "question_answering": dict(zip(("accuracy", "exact_match", "f1_score"), qna_scores))
    }

    return sentiment_results, qna_results

In [None]:
# Execute both benchmarks and keep the (sentiment, qna) result pair
benchmark_results = run_all_benchmarks()

# Show the heading and the raw results, one per line
print("\nBenchmark Results:", benchmark_results, sep="\n")


=== Running Sentiment Analysis Benchmark ===

=== Running Question Answering Benchmark ===

Benchmark Results:
({'sentiment_analysis': {'accuracy': 0.908, 'precision': 0.9007633587786259, 'recall': 0.921875}}, {'question_answering': {'accuracy': 0.56, 'exact_match': 0.56, 'f1_score': 0.28}})
