In [17]:
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import torch

In [18]:
device = "cuda" if torch.cuda.is_available() else "cpu"
model_name = "facebook/mbart-large-50-many-to-many-mmt"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name).to(device)
model.eval()

MBartForConditionalGeneration(
  (model): MBartModel(
    (shared): MBartScaledWordEmbedding(250054, 1024, padding_idx=1)
    (encoder): MBartEncoder(
      (embed_tokens): MBartScaledWordEmbedding(250054, 1024, padding_idx=1)
      (embed_positions): MBartLearnedPositionalEmbedding(1026, 1024)
      (layers): ModuleList(
        (0-11): 12 x MBartEncoderLayer(
          (self_attn): MBartSdpaAttention(
            (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (activation_fn): ReLU()
          (fc1): Linear(in_features=1024, out_features=4096, bias=True)
          (fc2): Linear(in_features=4096, out_features=1024, bias=True)
    

In [19]:
datasets = {
    "MT": [
        ("Ik hou van programmeren.", "I love programming."),
        ("Het regent vandaag.", "It is raining today."),
        ("De kat zit op de mat.", "The cat is sitting on the mat.")
    ],
    "TS": [
        ("De overheid heeft vandaag aangekondigd dat de belastingen volgend jaar zullen stijgen vanwege economische omstandigheden.", "Belastingen stijgen volgend jaar."),
        ("Na een spannende wedstrijd won Ajax met 2-1 van PSV in de laatste minuut.", "Ajax wint met 2-1 van PSV."),
        ("Wetenschappers ontdekten een nieuwe planeet buiten ons zonnestelsel.", "Nieuwe planeet ontdekt.")
    ],
    "QA": [
        ("Wat is de hoofdstad van Nederland? Amsterdam is de grootste stad van Nederland en ook de hoofdstad.", "Amsterdam"),
        ("Wie schreef het boek 'De Avonden'? Gerard Reve schreef het in 1947.", "Gerard Reve"),
        ("Wat is de langste rivier in Nederland? De Rijn stroomt door meerdere landen en is de langste rivier.", "De Rijn")
    ]
}

for task_name, samples in datasets.items():
    for input_text, reference in tqdm(samples):
        print(input_text)
        inputs = tokenizer(input_text, return_tensors="pt", truncation=True).to(device)
        print(inputs)

  0%|          | 0/3 [00:00<?, ?it/s]Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
100%|██████████| 3/3 [00:00<00:00, 87.21it/s]


Ik hou van programmeren.
{'input_ids': tensor([[250004,   3256,  19660,    131,  77848,     33,      5,      2]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1]])}
Het regent vandaag.
{'input_ids': tensor([[250004,   1947, 119555,     18,  64697,      5,      2]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1]])}
De kat zit op de mat.
{'input_ids': tensor([[250004,    262,   3133,  21583,    233,      8,   2589,      5,      2]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1]])}


100%|██████████| 3/3 [00:00<00:00, 283.34it/s]


De overheid heeft vandaag aangekondigd dat de belastingen volgend jaar zullen stijgen vanwege economische omstandigheden.
{'input_ids': tensor([[250004,    262, 118541,   3188,  64697, 233033,     71,    607,      8,
         110119,     33,  73172,     71,   3325,  33607,  13629,    170,   1409,
         142726, 221584, 210492,      5,      2]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}
Na een spannende wedstrijd won Ajax met 2-1 van PSV in de laatste minuut.
{'input_ids': tensor([[250004,    353,    293, 124238,  78336,  23742, 157715,    435,  58671,
            131,   7940,    856,     23,      8,  31836, 141259,      5,      2]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}
Wetenschappers ontdekten een nieuwe planeet buiten ons zonnestelsel.
{'input_ids': tensor([[250004,   1401,    510,  28624,  21777,  98178,    510,    293,   9800,
         157402,  28605,   1260, 148906,  96638,  

100%|██████████| 3/3 [00:00<00:00, 275.75it/s]

Wat is de hoofdstad van Nederland? Amsterdam is de grootste stad van Nederland en ook de hoofdstad.
{'input_ids': tensor([[250004,   6586,     83,      8,  35912,   5481,    131,  13666,     32,
          21391,     83,      8,  53653,  18961,    131,  13666,     22,   1232,
              8,  35912,   5481,      5,      2]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}
Wie schreef het boek 'De Avonden'? Gerard Reve schreef het in 1947.
{'input_ids': tensor([[250004,   4887, 116648,    225,  14666,    242,   4657, 147202,    555,
             25,     32, 125947, 110230, 116648,    225,     23,  40191,      5,
              2]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}
Wat is de langste rivier in Nederland? De Rijn stroomt door meerdere landen en is de langste rivier.
{'input_ids': tensor([[250004,   6586,     83,      8,   1937,    824, 141808,     23,  13666,
             32,    262,  




In [1]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from lm_polygraph.estimators import *
from lm_polygraph.utils.model import WhiteboxModel
from lm_polygraph.utils.dataset import Dataset
from lm_polygraph.utils.processor import Logger
from lm_polygraph.utils.manager import UEManager
from lm_polygraph.ue_metrics import PredictionRejectionArea
from lm_polygraph.generation_metrics import RougeMetric, BartScoreSeqMetric, ModelScoreSeqMetric, ModelScoreTokenwiseMetric, AggregatedMetric
from lm_polygraph.utils.builder_enviroment_stat_calculator import (
    BuilderEnvironmentStatCalculator
)
from lm_polygraph.defaults.register_default_stat_calculators import (
    register_default_stat_calculators,
)
from lm_polygraph.utils.factory_stat_calculator import StatCalculatorContainer
from omegaconf import OmegaConf
from lm_polygraph import estimate_uncertainty



  from .autonotebook import tqdm as notebook_tqdm


In [2]:
base_model = AutoModelForCausalLM.from_pretrained('bigscience/bloomz-560m', device_map='cpu',)
tokenizer = AutoTokenizer.from_pretrained('bigscience/bloomz-560m')
model = WhiteboxModel(base_model, tokenizer)

In [3]:
estimator = MaximumTokenProbability()
estimate_uncertainty(model, estimator, input_text='Qui est George Bush?')

UncertaintyOutput(uncertainty=array([-0.16278946, -0.40806034, -0.38379776], dtype=float32), input_text='Qui est George Bush?', generation_text=' le président américain', generation_tokens=[578, 17635, 52762], model_path=None, estimator='MaximumTokenProbability')

In [4]:
from evaluate import load
bertscore = load("bertscore")
predictions = ["de kat slaapt op de bank"]
references = ["een poes ligt op een sofa"]
bertscore.compute(predictions=predictions, references=references, lang="nl")

{'precision': [0.7690080404281616],
 'recall': [0.7674813270568848],
 'f1': [0.7682439088821411],
 'hashcode': 'bert-base-multilingual-cased_L9_no-idf_version=0.3.12(hug_trans=4.50.1)'}

In [5]:
device = "cuda"
model_type = "Whitebox"
dataset_name = ("trivia_qa", "rc.nocontext")
batch_size = 4
seed = 42

In [6]:
dataset = Dataset.load(
    dataset_name,
    'question', 'answer',
    batch_size=batch_size,
    prompt="Question: {question}\nAnswer:{answer}",
    split="validation"
)
dataset.subsample(1, seed=seed)

train_dataset = Dataset.load(
    dataset_name,
    'question', 'answer',
    batch_size=batch_size,
    prompt="Question: {question}\nAnswer:{answer}",
    split="train"
)
train_dataset.subsample(1, seed=seed)

In [7]:
ue_methods = [MaximumSequenceProbability(), 
              SemanticEntropy(),
              MahalanobisDistanceSeq("decoder"),
             ]

ue_metrics = [PredictionRejectionArea(), PredictionRejectionArea(max_rejection=0.5)]

# Wrap generation metric in AggregatedMetric, since trivia_qa is a multi-reference dataset
# (y is a list of possible correct answers)
metrics = [AggregatedMetric(RougeMetric('rougeL'))]

loggers = [Logger()]

In [8]:
TrainingStatistic_config = {
    "dataset": dataset_name,
    "text_column": 'question',
    "label_column": 'answer',
    "description": '',
    "prompt": "Question: {question}\nAnswer:",
    "few_shot_split": 'train',
    "train_split": 'train',
    "load_from_disk": False,
    "subsample_train_dataset": 10,
    "n_shot": 5,
    "train_dataset": dataset_name,
    "train_test_split": False,
    "background_train_dataset": 'allenai/c4',
    "background_train_dataset_text_column": 'text',
    "background_train_dataset_label_column": 'url',
    "background_train_dataset_data_files": 'en/c4-train.00000-of-01024.json.gz',
    "background_load_from_disk": False,
    "subsample_background_train_dataset": 10,
    "batch_size": 1,
    "seed": 1,
    "size": 1,
    "bg_size": 1
}

In [9]:
result_stat_calculators = dict()
scs = register_default_stat_calculators(model_type)
for sc in scs:
    result_stat_calculators[sc.name] = sc

# register TrainingStatisticExtractionCalculator for the Mahalanobis Distance method
result_stat_calculators.update(
    {
        "TrainingStatisticExtractionCalculator": StatCalculatorContainer(
            name="TrainingStatisticExtractionCalculator",
            cfg=OmegaConf.create(TrainingStatistic_config),
            stats=["train_embeddings", "background_train_embeddings", "train_greedy_log_likelihoods"],
            dependencies=[],
            builder="lm_polygraph.defaults.stat_calculator_builders.default_TrainingStatisticExtractionCalculator",
        )
    }
)
    
builder_env_stat_calc = BuilderEnvironmentStatCalculator(model=model)
available_stat_calculators = list(result_stat_calculators.values())

In [10]:
man = UEManager(
    data=dataset,
    model=model,
    estimators=ue_methods,
    builder_env_stat_calc=builder_env_stat_calc,
    available_stat_calculators=available_stat_calculators,
    generation_metrics=metrics,
    ue_metrics=ue_metrics,
    processors=loggers,
    ignore_exceptions=False,
    max_new_tokens=10, 
    
)

Some weights of the model checkpoint at microsoft/deberta-large-mnli were not used when initializing DebertaForSequenceClassification: ['config']
- This IS expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Generating train split: 356317 examples [00:06, 59020.36 examples/s]


In [11]:
results = man()


  0%|          | 0/1 [00:00<?, ?it/s]
[A
100%|██████████| 1/1 [00:01<00:00,  1.34s/it]

[A
100%|██████████| 1/1 [00:03<00:00,  3.74s/it]
  cov_scaled = torch.cov(train_features.T)
100%|██████████| 1/1 [00:17<00:00, 17.14s/it]
  prr_score = np.sum(scores) / num_rej
  0%|          | 0/1 [00:17<?, ?it/s]


In [12]:
for key in results.keys():
    print(f"UE Score: {key[1]}, Metric: {key[2]}, UE Metric: {key[3]}, Score: {results[key]:.3f}")

UE Score: MaximumSequenceProbability, Metric: Rouge_rougeL, UE Metric: prr, Score: 0.500
UE Score: MaximumSequenceProbability, Metric: Rouge_rougeL, UE Metric: prr_normalized, Score: 0.500
UE Score: SemanticEntropy, Metric: Rouge_rougeL, UE Metric: prr, Score: 0.500
UE Score: SemanticEntropy, Metric: Rouge_rougeL, UE Metric: prr_normalized, Score: 0.500
UE Score: MahalanobisDistanceSeq_decoder, Metric: Rouge_rougeL, UE Metric: prr, Score: 0.500
UE Score: MahalanobisDistanceSeq_decoder, Metric: Rouge_rougeL, UE Metric: prr_normalized, Score: 0.500
UE Score: MaximumSequenceProbability, Metric: Rouge_rougeL, UE Metric: prr_0.5, Score: nan
UE Score: MaximumSequenceProbability, Metric: Rouge_rougeL, UE Metric: prr_0.5_normalized, Score: nan
UE Score: SemanticEntropy, Metric: Rouge_rougeL, UE Metric: prr_0.5, Score: nan
UE Score: SemanticEntropy, Metric: Rouge_rougeL, UE Metric: prr_0.5_normalized, Score: nan
UE Score: MahalanobisDistanceSeq_decoder, Metric: Rouge_rougeL, UE Metric: prr_0.5,