# ImpPres Baseline

This notebook illustrates how to use the DeBERTa-v3-base-mnli-fever-anli model to perform specialized inference on the ImpPres dataset.

In [22]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

model_name = "MoritzLaurer/DeBERTa-v3-base-mnli-fever-anli"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

In [23]:
label_names = ["entailment", "neutral", "contradiction"]
def evaluate(premise, hypothesis):
    input = tokenizer(premise, hypothesis, truncation=True, return_tensors="pt")
    output = model(input["input_ids"].to(device))
    prediction = torch.softmax(output["logits"][0], -1).tolist()
    prediction = {name: round(float(pred) * 100, 1) for pred, name in zip(prediction, label_names)}
    return prediction

In [24]:
evaluate("The weather is nice today.", "It is sunny outside.")

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


{'entailment': 0.1, 'neutral': 99.8, 'contradiction': 0.0}

In [25]:
def get_prediction(pred_dict):
    if pred_dict["entailment"] > pred_dict["contradiction"]  and pred_dict["entailment"] > pred_dict["neutral"]:
        return "entailment"
    elif pred_dict["contradiction"] > pred_dict["entailment"]:
        return "contradiction"
    else:
        return "neutral"

## Load ImpPres Dataset

In [26]:
from datasets import load_dataset

sections = ['presupposition_all_n_presupposition', 
            'presupposition_both_presupposition', 
            'presupposition_change_of_state', 
            'presupposition_cleft_existence', 
            'presupposition_cleft_uniqueness', 
            'presupposition_only_presupposition', 
            'presupposition_possessed_definites_existence', 
            'presupposition_possessed_definites_uniqueness', 
            'presupposition_question_presupposition']

dataset = {}
for section in sections:
    print(f"Loading dataset for section: {section}")
    dataset[section] = load_dataset("facebook/imppres", section)


Loading dataset for section: presupposition_all_n_presupposition
Loading dataset for section: presupposition_both_presupposition
Loading dataset for section: presupposition_change_of_state
Loading dataset for section: presupposition_cleft_existence
Loading dataset for section: presupposition_cleft_uniqueness
Loading dataset for section: presupposition_only_presupposition
Loading dataset for section: presupposition_possessed_definites_existence
Loading dataset for section: presupposition_possessed_definites_uniqueness
Loading dataset for section: presupposition_question_presupposition


In [27]:
dataset

{'presupposition_all_n_presupposition': DatasetDict({
     all_n_presupposition: Dataset({
         features: ['premise', 'hypothesis', 'trigger', 'trigger1', 'trigger2', 'presupposition', 'gold_label', 'UID', 'pairID', 'paradigmID'],
         num_rows: 1900
     })
 }),
 'presupposition_both_presupposition': DatasetDict({
     both_presupposition: Dataset({
         features: ['premise', 'hypothesis', 'trigger', 'trigger1', 'trigger2', 'presupposition', 'gold_label', 'UID', 'pairID', 'paradigmID'],
         num_rows: 1900
     })
 }),
 'presupposition_change_of_state': DatasetDict({
     change_of_state: Dataset({
         features: ['premise', 'hypothesis', 'trigger', 'trigger1', 'trigger2', 'presupposition', 'gold_label', 'UID', 'pairID', 'paradigmID'],
         num_rows: 1900
     })
 }),
 'presupposition_cleft_existence': DatasetDict({
     cleft_existence: Dataset({
         features: ['premise', 'hypothesis', 'trigger', 'trigger1', 'trigger2', 'presupposition', 'gold_label', 'UI

In [28]:
# Evaluate the model on the ImpPres dataset
from tqdm import tqdm
def evaluate_on_dataset(dataset):
    results = []
    label_names = ["entailment", "neutral", "contradiction"]
    for example in tqdm(dataset):
        premise = example['premise']
        hypothesis = example['hypothesis']
        prediction = evaluate(premise, hypothesis)
        results.append({
            'premise': premise,
            'hypothesis': hypothesis,
            'prediction': prediction,
            'pred_label': get_prediction(prediction),
            'gold_label': label_names[example['gold_label']],
        })
    return results

## Evaluate Metrics

Let's use the huggingface `evaluate` package to compute the performance of the baseline.


In [29]:
import evaluate as eval

accuracy = eval.load("accuracy")
precision = eval.load("precision")
recall = eval.load("recall")
f1 = eval.load("f1")


In [30]:
clf_metrics = eval.combine(["accuracy", "f1", "precision", "recall"])

In [31]:
clf_metrics.compute(predictions=[0, 1, 0], references=[0, 1, 1])

{'accuracy': 0.6666666666666666,
 'f1': 0.6666666666666666,
 'precision': 1.0,
 'recall': 0.5}

## Your Turn

Compute the classification metrics on the baseline model on each section of the ImpPres dataset.

https://www.kaggle.com/code/faijanahamadkhan/llm-evaluation-framework-hugging-face provides good documentation on how to use the Huggingface evaluate library.

In [33]:
label_names = ["entailment", "neutral", "contradiction"]
label2id = {label: i for i, label in enumerate(label_names)}
section_to_split = {
    'presupposition_all_n_presupposition': 'all_n_presupposition',
    'presupposition_both_presupposition': 'both_presupposition',
    'presupposition_change_of_state': 'change_of_state',
    'presupposition_cleft_existence': 'cleft_existence',
    'presupposition_cleft_uniqueness': 'cleft_uniqueness',
    'presupposition_only_presupposition': 'only_presupposition',
    'presupposition_possessed_definites_existence': 'possessed_definites_existence',
    'presupposition_possessed_definites_uniqueness': 'possessed_definites_uniqueness',
    'presupposition_question_presupposition': 'question_presupposition',
}


def compute_metrics(results):
    pred_labels = [label2id[r['pred_label']] for r in results]
    gold_labels = [label2id[r['gold_label']] for r in results]
    return {
        "accuracy": accuracy.compute(predictions=pred_labels, references=gold_labels)["accuracy"],
        "precision": precision.compute(predictions=pred_labels, references=gold_labels, average="macro")["precision"],
        "recall": recall.compute(predictions=pred_labels, references=gold_labels, average="macro")["recall"],
        "f1": f1.compute(predictions=pred_labels, references=gold_labels, average="macro")["f1"],
    }


all_results = []
results_dict = {}
for section in sections:
    print(f"\n🔍 Evaluating section: {section}")
   
    # Access the actual dataset split from your dict
    split_name = section_to_split[section]
    ds = dataset[section][split_name]


    results = evaluate_on_dataset(ds)


    metrics = compute_metrics(results)
    results_dict[section] = metrics
    all_results.extend(results)


# Evaluate combined
print("\n🔁 Evaluating ALL sections combined")
metrics_all = compute_metrics(all_results)
results_dict["ALL"] = metrics_all


import pandas as pd


# Display table
df = pd.DataFrame.from_dict(results_dict, orient="index")
df = df[["accuracy", "precision", "recall", "f1"]]
df = df.round(4)


print("\n📊 Final results:")
print(df)


🔍 Evaluating section: presupposition_all_n_presupposition


100%|██████████| 1900/1900 [03:33<00:00,  8.90it/s]



🔍 Evaluating section: presupposition_both_presupposition


100%|██████████| 1900/1900 [03:33<00:00,  8.89it/s]



🔍 Evaluating section: presupposition_change_of_state


100%|██████████| 1900/1900 [03:48<00:00,  8.30it/s]



🔍 Evaluating section: presupposition_cleft_existence


100%|██████████| 1900/1900 [03:31<00:00,  8.97it/s]



🔍 Evaluating section: presupposition_cleft_uniqueness


100%|██████████| 1900/1900 [03:21<00:00,  9.44it/s]



🔍 Evaluating section: presupposition_only_presupposition


100%|██████████| 1900/1900 [03:31<00:00,  8.98it/s]



🔍 Evaluating section: presupposition_possessed_definites_existence


100%|██████████| 1900/1900 [03:31<00:00,  8.97it/s]



🔍 Evaluating section: presupposition_possessed_definites_uniqueness


100%|██████████| 1900/1900 [03:29<00:00,  9.08it/s]



🔍 Evaluating section: presupposition_question_presupposition


100%|██████████| 1900/1900 [05:41<00:00,  5.56it/s]  



🔁 Evaluating ALL sections combined

📊 Final results:
                                               accuracy  precision  recall  \
presupposition_all_n_presupposition              0.4626     0.4334  0.4750   
presupposition_both_presupposition               0.3968     0.2751  0.3937   
presupposition_change_of_state                   0.3084     0.3343  0.3244   
presupposition_cleft_existence                   0.6411     0.6781  0.6938   
presupposition_cleft_uniqueness                  0.1953     0.2145  0.1859   
presupposition_only_presupposition               0.5832     0.6459  0.6396   
presupposition_possessed_definites_existence     0.6705     0.7987  0.7368   
presupposition_possessed_definites_uniqueness    0.3900     0.2517  0.3826   
presupposition_question_presupposition           0.6253     0.6988  0.6947   
ALL                                              0.4748     0.4932  0.5029   

                                                   f1  
presupposition_all_n_presupposi