In [1]:
import torch
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import numpy as np
import evaluate

  from .autonotebook import tqdm as notebook_tqdm


## Select Device (CPU, CUDA, or MPS if on Apple Silicon)

In [2]:

if torch.cuda.is_available():
    device = torch.device("cuda")
elif hasattr(torch.backends, "mps") and torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")

print(f"Using device: {device}")


Using device: mps


## Load the SST2 Dataset

In [3]:
# The SST2 dataset is part of the GLUE benchmark. 
# We use the "validation" split for demonstration.
dataset_sst2 = load_dataset("glue", "sst2")
dataset = dataset_sst2["validation"]

## Load Model and Tokenizer

In [4]:
model_name = "HuggingFaceTB/SmolLM-1.7B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)
model.to(device)
model.eval()

Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at HuggingFaceTB/SmolLM-1.7B-Instruct and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


LlamaForSequenceClassification(
  (model): LlamaModel(
    (embed_tokens): Embedding(49152, 2048, padding_idx=2)
    (layers): ModuleList(
      (0-23): 24 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (k_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (v_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (o_proj): Linear(in_features=2048, out_features=2048, bias=False)
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=2048, out_features=8192, bias=False)
          (up_proj): Linear(in_features=2048, out_features=8192, bias=False)
          (down_proj): Linear(in_features=8192, out_features=2048, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((2048,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((2048,), eps=1e-05)
      )
    )
    (norm): LlamaRMSNorm((2048,)

## Preprocessing Function

In [5]:
def preprocess(examples):
    """
    Tokenize the text from the SST2 dataset.
    SST2 has a "sentence" key. The label is in "label".
    """
    return tokenizer(
        examples["sentence"],
        truncation=True,
        padding="max_length",
        max_length=128
    )

## Preprocess Dataset

In [6]:
encoded_dataset = dataset.map(preprocess, batched=True)
encoded_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])

## Perform Inference to Obtain Model Outputs

In [7]:
all_logits = []
all_labels = []

with torch.no_grad():
    for i in range(len(encoded_dataset)):
        batch = {
            "input_ids": encoded_dataset[i]["input_ids"].unsqueeze(0).to(device),
            "attention_mask": encoded_dataset[i]["attention_mask"].unsqueeze(0).to(device),
        }
        outputs = model(**batch)
        logits = outputs.logits
        all_logits.append(logits.cpu().numpy()[0])  # move logits back to CPU for processing
        all_labels.append(encoded_dataset[i]["label"].item())

all_logits = np.array(all_logits)
all_labels = np.array(all_labels)

## Convert Logits to Probabilities and Predictions

In [8]:
probabilities = torch.nn.functional.softmax(torch.tensor(all_logits), dim=-1).numpy()
pred_labels = np.argmax(probabilities, axis=-1)

## Compute Accuracy

In [10]:
accuracy_score = np.mean(pred_labels == all_labels)

## Compute Negative Log-Likelihood (NLL)

In [11]:
def compute_nll(probs, labels):
    """
    NLL is defined as -log(p(correct_label)).
    We'll compute the average NLL over the dataset.
    """
    p_correct = probs[np.arange(len(labels)), labels]
    # clip to avoid numerical instability
    p_correct = np.clip(p_correct, 1e-12, 1.0)
    return -np.mean(np.log(p_correct))

nll_score = compute_nll(probabilities, all_labels)

## Compute Expected Calibration Error (ECE)

In [12]:
# A simple ECE implementation using equally spaced bins.
def compute_ece(probs, labels, num_bins=10):
    """
    :param probs: Array of shape (N, num_classes) with predicted probabilities
    :param labels: 1D array of shape (N,) with ground-truth labels
    :param num_bins: Number of confidence bins
    :return: ECE value (float)
    """
    confidences = np.max(probs, axis=-1)
    predictions = np.argmax(probs, axis=-1)
    
    bin_boundaries = np.linspace(0.0, 1.0, num_bins + 1)
    ece = 0.0
    
    for i in range(num_bins):
        start = bin_boundaries[i]
        end = bin_boundaries[i+1]
        mask = (confidences >= start) & (confidences < end)
        if np.sum(mask) == 0:
            continue
        
        bin_accuracy = np.mean(predictions[mask] == labels[mask])
        bin_confidence = np.mean(confidences[mask])
        bin_proportion = np.mean(mask)
        
        ece += abs(bin_confidence - bin_accuracy) * bin_proportion
    
    return ece

ece_score = compute_ece(probabilities, all_labels, num_bins=10)

print(f"Number of examples: {len(dataset)}")
print(f"Accuracy: {accuracy_score:.4f}")
print(f"NLL: {nll_score:.4f}")
print(f"ECE: {ece_score:.4f}")


Number of examples: 872
Accuracy: 0.5103
NLL: 0.7362
ECE: 0.1326
