# Libraries

In [1]:
import random
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, AutoModelForSequenceClassification
from datasets import load_from_disk

from config import DEVICE
from generate_answers import generate_answers
from my_utils.data import sample_ds, load_ds
from my_utils.metrics import calculate_auroc

# Load models

In [None]:
print(DEVICE)
seed = 42
random.seed(seed)

# LLM
Gemma_model = AutoModelForCausalLM.from_pretrained("google/gemma-2-2b-it", device_map="auto", torch_dtype=torch.bfloat16)
Gemma_tokenizer = AutoTokenizer.from_pretrained("google/gemma-2-2b-it")

# Entailment Transformer
Roberta_model = AutoModelForSequenceClassification.from_pretrained("roberta-large-mnli").to(DEVICE)
Roberta_tokenizer = AutoTokenizer.from_pretrained("roberta-large-mnli")
print(torch.cuda.memory_allocated())

# Entailment LLM
Qwen_model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen2.5-1.5B-Instruct", torch_dtype="auto", device_map="auto").to(DEVICE)
Qwen_tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-1.5B-Instruct")

torch.cuda.empty_cache()

cuda
0


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

5229745664


Some weights of the model checkpoint at roberta-large-mnli were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


6651206656
9739567616


# Load datasets

In [3]:
n_samples = 4
triviaqa_train, triviaqa_val = load_ds("trivia_qa", seed)
squad_train, squad_val = load_ds("squad", seed)
svamp_train, svamp_val = load_ds("svamp", seed)
nq_train, nq_val = load_ds("nq", seed)


triviaqa_sample = sample_ds(triviaqa_val, n_samples, seed, "trivia_qa")
squad_sample = sample_ds(squad_val, n_samples, seed, "squad")
svamp_sample = sample_ds(svamp_val, n_samples, seed, "svamp")
nq_sample = sample_ds(nq_val, n_samples, seed, "nq")

datasets = [triviaqa_sample, squad_sample, svamp_sample, nq_sample]

Dataset:  trivia_qa
Dataset({
    features: ['id', 'question', 'context', 'answers'],
    num_rows: 4
}) 

Dataset:  squad
Dataset({
    features: ['id', 'title', 'context', 'question', 'answers'],
    num_rows: 4
}) 

Dataset:  svamp
Dataset({
    features: ['ID', 'Body', 'Question', 'Equation', 'Answer', 'Type', 'question_concat', 'question', 'context', 'type', 'equation', 'id', 'answers'],
    num_rows: 4
}) 

Dataset:  nq
Dataset({
    features: ['question', 'answer', 'answers', 'context', 'id'],
    num_rows: 4
}) 



# Generate answers and calculate Semantic Entropy

In [None]:
data_transformer_path = "data/transformer/"
data_llm_path = "data/llm/"
generate_answers(datasets, data_transformer_path, data_llm_path, Gemma_model, Gemma_tokenizer, Roberta_model, Roberta_tokenizer, Qwen_model, Qwen_tokenizer)

del triviaqa_train, triviaqa_val, squad_train, squad_val, svamp_train, svamp_val, nq_train, nq_val 
del triviaqa_sample, squad_sample, svamp_sample, nq_sample, datasets
torch.cuda.empty_cache()


Generating responses for trivia_qa dataset...


  attn_output = torch.nn.functional.scaled_dot_product_attention(
100%|██████████| 4/4 [02:19<00:00, 34.82s/it]


Saving the dataset (0/1 shards):   0%|          | 0/4 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/4 [00:00<?, ? examples/s]


Generating responses for squad dataset...


100%|██████████| 4/4 [03:08<00:00, 47.02s/it]


Saving the dataset (0/1 shards):   0%|          | 0/4 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/4 [00:00<?, ? examples/s]


Generating responses for svamp dataset...


100%|██████████| 4/4 [01:26<00:00, 21.66s/it]


Saving the dataset (0/1 shards):   0%|          | 0/4 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/4 [00:00<?, ? examples/s]


Generating responses for nq dataset...


100%|██████████| 4/4 [02:21<00:00, 35.50s/it]


Saving the dataset (0/1 shards):   0%|          | 0/4 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/4 [00:00<?, ? examples/s]

# Load Results

In [5]:
triviaqa_sample = load_from_disk(data_transformer_path + "trivia_qa")
squad_sample = load_from_disk(data_transformer_path + "squad")
svamp_sample = load_from_disk(data_transformer_path + "svamp")
nq_sample = load_from_disk(data_transformer_path + "nq")
datasets_transformers = [triviaqa_sample, squad_sample, svamp_sample, nq_sample]

triviaqa_sample = load_from_disk(data_llm_path + "trivia_qa")
squad_sample = load_from_disk(data_transformer_path + "squad")
svamp_sample = load_from_disk(data_llm_path + "svamp")
nq_sample = load_from_disk(data_llm_path + "nq")
datasets_llm = [triviaqa_sample, squad_sample, svamp_sample, nq_sample]


# Calculate Metrics

In [6]:
print("AUROC scores for Transformer")
calculate_auroc(datasets_transformers)

print("\nAUROC scores for LLM")
calculate_auroc(datasets_llm)

AUROC scores for Transformer
AUROC score for trivia_qa dataset: 0.0
AUROC score for squad dataset: 0.0
AUROC score for svamp dataset: 0.6666666666666667
AUROC score for nq dataset: 1.0

AUROC scores for LLM
AUROC score for trivia_qa dataset: 0.33333333333333337
AUROC score for squad dataset: 0.0
AUROC score for svamp dataset: 0.6666666666666667
AUROC score for nq dataset: 0.5


[0.33333333333333337, 0.0, 0.6666666666666667, 0.5]