In [1]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from lm_polygraph.estimators import *
from lm_polygraph.utils.model import WhiteboxModel
from lm_polygraph.utils.dataset import Dataset
from lm_polygraph.utils.processor import Logger
from lm_polygraph.utils.manager import UEManager
from lm_polygraph.ue_metrics import PredictionRejectionArea
from lm_polygraph.generation_metrics import RougeMetric, BartScoreSeqMetric, ModelScoreSeqMetric, ModelScoreTokenwiseMetric, AggregatedMetric
from lm_polygraph.utils.builder_enviroment_stat_calculator import (
    BuilderEnvironmentStatCalculator
)
from lm_polygraph.defaults.register_default_stat_calculators import (
    register_default_stat_calculators,
)
from lm_polygraph.utils.factory_stat_calculator import StatCalculatorContainer
from omegaconf import OmegaConf
from lm_polygraph import estimate_uncertainty



  from .autonotebook import tqdm as notebook_tqdm


In [2]:
base_model = AutoModelForCausalLM.from_pretrained('bigscience/bloomz-560m', device_map='cpu',)
tokenizer = AutoTokenizer.from_pretrained('bigscience/bloomz-560m')
model = WhiteboxModel(base_model, tokenizer)

In [3]:
estimator = MaximumTokenProbability()
estimate_uncertainty(model, estimator, input_text='Qui est George Bush?')

UncertaintyOutput(uncertainty=array([-0.16278946, -0.40806034, -0.38379776], dtype=float32), input_text='Qui est George Bush?', generation_text=' le président américain', generation_tokens=[578, 17635, 52762], model_path=None, estimator='MaximumTokenProbability')

In [4]:
from evaluate import load
bertscore = load("bertscore")
predictions = ["de kat slaapt op de bank"]
references = ["een poes ligt op een sofa"]
bertscore.compute(predictions=predictions, references=references, lang="nl")

{'precision': [0.7690080404281616],
 'recall': [0.7674813270568848],
 'f1': [0.7682439088821411],
 'hashcode': 'bert-base-multilingual-cased_L9_no-idf_version=0.3.12(hug_trans=4.50.1)'}

In [5]:
device = "cuda"
model_type = "Whitebox"
dataset_name = ("trivia_qa", "rc.nocontext")
batch_size = 4
seed = 42

In [6]:
dataset = Dataset.load(
    dataset_name,
    'question', 'answer',
    batch_size=batch_size,
    prompt="Question: {question}\nAnswer:{answer}",
    split="validation"
)
dataset.subsample(1, seed=seed)

train_dataset = Dataset.load(
    dataset_name,
    'question', 'answer',
    batch_size=batch_size,
    prompt="Question: {question}\nAnswer:{answer}",
    split="train"
)
train_dataset.subsample(1, seed=seed)

In [7]:
ue_methods = [MaximumSequenceProbability(), 
              SemanticEntropy(),
              MahalanobisDistanceSeq("decoder"),
             ]

ue_metrics = [PredictionRejectionArea(), PredictionRejectionArea(max_rejection=0.5)]

# Wrap generation metric in AggregatedMetric, since trivia_qa is a multi-reference dataset
# (y is a list of possible correct answers)
metrics = [AggregatedMetric(RougeMetric('rougeL'))]

loggers = [Logger()]

In [8]:
TrainingStatistic_config = {
    "dataset": dataset_name,
    "text_column": 'question',
    "label_column": 'answer',
    "description": '',
    "prompt": "Question: {question}\nAnswer:",
    "few_shot_split": 'train',
    "train_split": 'train',
    "load_from_disk": False,
    "subsample_train_dataset": 10,
    "n_shot": 5,
    "train_dataset": dataset_name,
    "train_test_split": False,
    "background_train_dataset": 'allenai/c4',
    "background_train_dataset_text_column": 'text',
    "background_train_dataset_label_column": 'url',
    "background_train_dataset_data_files": 'en/c4-train.00000-of-01024.json.gz',
    "background_load_from_disk": False,
    "subsample_background_train_dataset": 10,
    "batch_size": 1,
    "seed": 1,
    "size": 1,
    "bg_size": 1
}

In [9]:
result_stat_calculators = dict()
scs = register_default_stat_calculators(model_type)
for sc in scs:
    result_stat_calculators[sc.name] = sc

# register TrainingStatisticExtractionCalculator for the Mahalanobis Distance method
result_stat_calculators.update(
    {
        "TrainingStatisticExtractionCalculator": StatCalculatorContainer(
            name="TrainingStatisticExtractionCalculator",
            cfg=OmegaConf.create(TrainingStatistic_config),
            stats=["train_embeddings", "background_train_embeddings", "train_greedy_log_likelihoods"],
            dependencies=[],
            builder="lm_polygraph.defaults.stat_calculator_builders.default_TrainingStatisticExtractionCalculator",
        )
    }
)
    
builder_env_stat_calc = BuilderEnvironmentStatCalculator(model=model)
available_stat_calculators = list(result_stat_calculators.values())

In [10]:
man = UEManager(
    data=dataset,
    model=model,
    estimators=ue_methods,
    builder_env_stat_calc=builder_env_stat_calc,
    available_stat_calculators=available_stat_calculators,
    generation_metrics=metrics,
    ue_metrics=ue_metrics,
    processors=loggers,
    ignore_exceptions=False,
    max_new_tokens=10, 
    
)

Some weights of the model checkpoint at microsoft/deberta-large-mnli were not used when initializing DebertaForSequenceClassification: ['config']
- This IS expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Generating train split: 356317 examples [00:06, 59020.36 examples/s]


In [11]:
results = man()


  0%|          | 0/1 [00:00<?, ?it/s]
[A
100%|██████████| 1/1 [00:01<00:00,  1.34s/it]

[A
100%|██████████| 1/1 [00:03<00:00,  3.74s/it]
  cov_scaled = torch.cov(train_features.T)
100%|██████████| 1/1 [00:17<00:00, 17.14s/it]
  prr_score = np.sum(scores) / num_rej
  0%|          | 0/1 [00:17<?, ?it/s]


In [12]:
for key in results.keys():
    print(f"UE Score: {key[1]}, Metric: {key[2]}, UE Metric: {key[3]}, Score: {results[key]:.3f}")

UE Score: MaximumSequenceProbability, Metric: Rouge_rougeL, UE Metric: prr, Score: 0.500
UE Score: MaximumSequenceProbability, Metric: Rouge_rougeL, UE Metric: prr_normalized, Score: 0.500
UE Score: SemanticEntropy, Metric: Rouge_rougeL, UE Metric: prr, Score: 0.500
UE Score: SemanticEntropy, Metric: Rouge_rougeL, UE Metric: prr_normalized, Score: 0.500
UE Score: MahalanobisDistanceSeq_decoder, Metric: Rouge_rougeL, UE Metric: prr, Score: 0.500
UE Score: MahalanobisDistanceSeq_decoder, Metric: Rouge_rougeL, UE Metric: prr_normalized, Score: 0.500
UE Score: MaximumSequenceProbability, Metric: Rouge_rougeL, UE Metric: prr_0.5, Score: nan
UE Score: MaximumSequenceProbability, Metric: Rouge_rougeL, UE Metric: prr_0.5_normalized, Score: nan
UE Score: SemanticEntropy, Metric: Rouge_rougeL, UE Metric: prr_0.5, Score: nan
UE Score: SemanticEntropy, Metric: Rouge_rougeL, UE Metric: prr_0.5_normalized, Score: nan
UE Score: MahalanobisDistanceSeq_decoder, Metric: Rouge_rougeL, UE Metric: prr_0.5,