In [13]:
from pathlib import Path
import sys

PROJECT_ROOT = Path.cwd().resolve().parent  # если CWD = notebooks
sys.path.insert(0, str(PROJECT_ROOT / "external" / "semantic_uncertainty"))
sys.path.insert(0, str(PROJECT_ROOT / "external" / "semantic_uncertainty" / "semantic_uncertainty"))
sys.path.insert(0, str(PROJECT_ROOT / "external" / "semantic_uncertainty" / "semantic_uncertainty" / "uncertainty"))


import semantic_uncertainty
import uncertainty

print("uncertainty:", getattr(uncertainty, "__file__", None))

from semantic_uncertainty.uncertainty.uncertainty_measures.semantic_entropy import (
    EntailmentDeberta,
    get_semantic_ids,
    logsumexp_by_id,
    predictive_entropy_rao,
)
print("semantic_entropy imports OK")

uncertainty: /home/zazamrykh/projects/internal_probing/external/semantic_uncertainty/semantic_uncertainty/uncertainty/__init__.py
semantic_entropy imports OK


In [38]:
# Imports
import os 

import numpy as np
from tqdm import tqdm
from datasets import load_dataset
import random
import pandas as pd

import torch
import torch.nn.functional as F
from transformers import AutoModelForCausalLM, AutoTokenizer

print("torch:", torch.__version__)
print("cuda available:", torch.cuda.is_available())
if torch.cuda.is_available():
    print("gpu:", torch.cuda.get_device_name(0))

def pick_dtype():
    # bf16 работает только на GPU с поддержкой bf16 (обычно Ampere/Hopper и новее).
    if torch.cuda.is_available() and torch.cuda.is_bf16_supported():
        return torch.bfloat16
    return torch.float16

DTYPE = pick_dtype()
DTYPE


torch: 2.9.1+cu128
cuda available: True
gpu: NVIDIA GeForce RTX 3060


torch.bfloat16

In [15]:
# Params
SEED = 42

In [16]:
from datasets import load_dataset
import random
from itertools import islice

SEED = 42
random.seed(SEED)

ds_stream = load_dataset(
    "mandarjoshi/trivia_qa",
    "rc.nocontext",
    split="validation",
    streaming=True,
)

ds_stream = ds_stream.shuffle(seed=SEED, buffer_size=2_000)

sample = list(islice(ds_stream, 10))

len(sample), sample[0].keys()

Resolving data files:   0%|          | 0/26 [00:00<?, ?it/s]

'HTTPSConnectionPool(host='cas-bridge.xethub.hf.co', port=443): Read timed out.' thrown while requesting GET https://huggingface.co/datasets/mandarjoshi/trivia_qa/resolve/0f7faf33a3908546c6fd5b73a660e0f8ff173c2f/rc.nocontext/validation-00000-of-00001.parquet
Retrying in 1s [Retry 1/5].


(10,
 dict_keys(['question', 'question_id', 'question_source', 'entity_pages', 'search_results', 'answer']))

In [17]:
def extract_gold_normalized(ex):
    """
    Возвращает список "нормализованных" корректных ответов.
    В TriviaQA они уже подготовлены как normalized_aliases / normalized_value. [web:352][web:359]
    """
    ans = ex["answer"]
    golds = ans.get("normalized_aliases") or []
    if not golds:
        nv = ans.get("normalized_value")
        if nv:
            golds = [nv]
    return golds

rows = []
for ex in sample:
    rows.append({
        "question_id": ex.get("question_id"),
        "question": ex["question"],
        "gold_normalized": extract_gold_normalized(ex),
        "gold_value_raw": ex["answer"].get("value"),
        "gold_aliases_raw": ex["answer"].get("aliases"),
    })

df = pd.DataFrame(rows)
df


Unnamed: 0,question_id,question,gold_normalized,gold_value_raw,gold_aliases_raw
0,tc_2167,Which US city was named after a British Prime ...,"[steel city, climate of pittsburgh pennsylvani...",Pittsburgh,"[Smoky City, Pittsburgh (Pa.), Pittsburgh, Pen..."
1,qb_9081,Harold Holt became Prime Minister of which cou...,"[australie, orstraya, federal australia, austr...",Australia,"[Australia (Commonwealth realm), AustraliA, Co..."
2,qb_6552,The Suez Canal joins the Red Sea and which oth...,"[sea of mediterranea, mediterranian sea, roman...",Mediterranean Sea,"[Mediterranian, Meditiranean, West Mediterrane..."
3,qb_2020,Selenology is the scientific study of which ce...,"[moonless, earth and moon, lunar mass, luna sa...",The moon,"[Sol 3a, Moon-like, Mass of the Moon, Solar an..."
4,qb_1830,In September 2006 the government of Prime Mini...,"[thailand, kingdom of thailand, kingdom of tha...",Thailand,"[Muang Thai, Taihland, ISO 3166-1:TH, Thai Emp..."
5,dpql_452,Anatomy. Where are the intercostal muscles sit...,[between ribs],Between the RIBS,[Between the RIBS]
6,tc_2090,Who first drew Mickey Mouse when ?Disney first...,"[celebrity productions, iwerks ub, ub iwerks, ...",Ub Iwerks,"[Iwerks, Ub, Ub Iwerks, Ub Iwerks Studio, Cele..."
7,qb_7419,A quadruped is an animal with how many feet?,"[four, 4]",Four,"[Four, four, 4]"
8,qz_2194,"Who was part man, part machine, all cop and ha...","[robocop 1987 film, robotic police officer, ro...",Robocop,"[I'd buy that for a dollar, RoboCop, RobotCop,..."
9,tc_2250,"What kind of disaster claimed some 100,000 liv...","[地震, earth quakes, earthquakes, tectonic earth...",Earthquake,"[Seism, Earthquake, Seismic event, The kinds o..."


In [18]:
i = 0
print("Q:", df.loc[i, "question"])
print("Gold normalized:", df.loc[i, "gold_normalized"])
print("Gold raw value:", df.loc[i, "gold_value_raw"])
print("Gold raw aliases (first 5):", (df.loc[i, "gold_aliases_raw"] or [])[:5])

Q: Which US city was named after a British Prime Minister?
Gold normalized: ['steel city', 'climate of pittsburgh pennsylvania', 'pittsbrugh', 'un locode uspit', 'pittsburgh', 'pittsburgh frick 6–8 middle school', 'pittsburgh pennsylvania usa', 'pittsburgh style', 'city of pittsburgh', 'pittsburgh pennsylvania u s', 'st justin s high school', 'pittsburgh pa', 'glenwood pennsylvania', 'da burgh', 'pittsburgh style of literature', 'pitsburgh', 'east end pittsburgh', 'pittsburgh pennsylvania us', 'pittsburgh usa', 'smoky city', 'city of bridges', 'fort du quesne', 'pittsburg pennsylvania', 'pittsburgh frick 6 8 middle school', 'pittsburgh pennsylvania', 'pittsburgh united states of america', 'education in pittsburgh', 'pittsburg pa', 'pittsburgh pennsyvania', 'burgh', 'frick international studies academy middle school', 'pittsburgh allegheny county pennsylvania', 'pittsburgh pgh']
Gold raw value: Pittsburgh
Gold raw aliases (first 5): ['Smoky City', 'Pittsburgh (Pa.)', 'Pittsburgh, Pennsy

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

DTYPE = torch.bfloat16 

MODEL = "../models/mistral-7b-instruct" # "mistralai/Mistral-7B-Instruct-v0.1" # choose model name instead of path to local folder with model

tokenizer = AutoTokenizer.from_pretrained(MODEL)

model = AutoModelForCausalLM.from_pretrained(
    MODEL,
    device_map="auto",
    torch_dtype=DTYPE,
)

model.eval()
print("Loaded from:", MODEL)

tokenizer.pad_token = tokenizer.eos_token
model.config.pad_token_id = tokenizer.pad_token_id

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Some parameters are on the meta device because they were offloaded to the cpu.


Loaded from: ../models/mistral-7b-instruct


In [33]:
messages = [
    {"role": "user", "content": "Tell me about yourself. Who are you and how can you help me?"},
]

input_ids = tokenizer.apply_chat_template(
    messages,
    tokenize=True,
    add_generation_prompt=True,
    return_tensors="pt",
    padding=True,
)

attention_mask = torch.ones_like(input_ids)

In [35]:
input_ids = input_ids.to(model.device)  # Because device is auto
attention_mask = attention_mask.to(model.device)

print("input_ids shape:", tuple(input_ids.shape))
print("model.device:", model.device)

input_ids shape: (1, 23)
model.device: cuda:0


In [37]:
with torch.no_grad():
    out_ids = model.generate(
        input_ids,
        attention_mask=attention_mask,
        max_new_tokens=128,
        do_sample=True,
        temperature=0.7,
        top_p=0.9,
        eos_token_id=tokenizer.eos_token_id,
    )

text = tokenizer.decode(out_ids[0], skip_special_tokens=True)
print(text)

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


[INST] Tell me about yourself. Who are you and how can you help me? [/INST] I am Mistral, a language model trained by the Mistral AI team. I am here to help you with any language-related task you might have. I can assist you with translation, writing, editing, and answering questions in a variety of languages. I can also help you with more specific tasks like finding information, making recommendations, and even generating creative content. How can I help you today?


In [39]:
def generate_one(model, tokenizer, question, max_new_tokens=64, temperature=1.0, top_p=0.95):
    messages = [{"role": "user", "content": question}]
    input_ids = tokenizer.apply_chat_template(
        messages, tokenize=True, add_generation_prompt=True, return_tensors="pt"
    ).to(model.device)

    out = model.generate(
        input_ids,
        max_new_tokens=max_new_tokens,
        do_sample=True,
        temperature=temperature,
        top_p=top_p,
        return_dict_in_generate=True,
        output_logits=True,   # важно: нужны logits на каждом шаге
    )

    # generated tokens только после prompt
    prompt_len = input_ids.shape[1]
    seq = out.sequences[0]
    gen_tokens = seq[prompt_len:]  # (T,)
    # logits: tuple length T, each (batch=1, vocab)
    logits_steps = out.logits

    # log p для каждого реально выбранного токена
    token_logprobs = []
    for t, step_logits in enumerate(logits_steps):
        tok = gen_tokens[t].item()
        lp = F.log_softmax(step_logits[0], dim=-1)[tok]
        token_logprobs.append(lp)

    # средний log-likelihood токенов (как у них в коде: np.mean(log_lik)) [web:596]
    avg_token_loglik = torch.stack(token_logprobs).mean().item() if len(token_logprobs) else float("-inf")

    text = tokenizer.decode(gen_tokens, skip_special_tokens=True).strip()
    return text, avg_token_loglik

In [40]:
def semantic_entropy_for_question(
    question: str,
    responses: list[str],
    avg_token_logliks: list[float],
    entailment_model,
    strict_entailment: bool = False,
):
    example = {"question": question}

    semantic_ids = get_semantic_ids(
        responses,
        model=entailment_model,
        strict_entailment=strict_entailment,
        example=example,
    )

    # p(cluster) по нормализованной сумме вероятностей ответов в кластере [web:596][web:594]
    logp_per_cluster = logsumexp_by_id(semantic_ids, np.array(avg_token_logliks), agg="sum_normalized")
    sem_entropy = predictive_entropy_rao(np.array(logp_per_cluster))

    return {
        "semantic_ids": semantic_ids,
        "semantic_entropy": float(sem_entropy),
        "logp_per_cluster": logp_per_cluster,
    }

In [52]:
import importlib
import semantic_uncertainty.uncertainty.uncertainty_measures.semantic_entropy as semantic_entropy

# Перечитать модуль с диска
importlib.reload(semantic_entropy)

# Переимпортировать нужные объекты (чтобы они ссылались на обновлённый код)
from semantic_uncertainty.uncertainty.uncertainty_measures.semantic_entropy import (
    EntailmentDeberta,
    EntailmentRoBERTa,
    get_semantic_ids,
    logsumexp_by_id,
    predictive_entropy_rao,
)

# Проверка



In [None]:
ent_model = EntailmentRoBERTa()

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/688 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.43G [00:00<?, ?B/s]

In [47]:
# entailment модель для кластеризации смыслов (локально) [web:594]
ent_model = EntailmentDeberta()

M = 10  # число сэмплов ответов на один вопрос (для демо)
results = []

for ex in sample:  # sample = 10 items из шага 1
    q = ex["question"]

    responses = []
    avg_lls = []
    for _ in range(M):
        ans, avg_ll = generate_one(
            model, tokenizer, q,
            max_new_tokens=64,
            temperature=1.0,
            top_p=0.95,
        )
        responses.append(ans)
        avg_lls.append(avg_ll)

    sem = semantic_entropy_for_question(
        question=q,
        responses=responses,
        avg_token_logliks=avg_lls,
        entailment_model=ent_model,
        strict_entailment=False,
    )

    results.append({
        "question": q,
        "responses": responses,
        "avg_token_logliks": avg_lls,
        "semantic_ids": sem["semantic_ids"],
        "semantic_entropy": sem["semantic_entropy"],
    })

len(results), results[0]["semantic_entropy"]

AttributeError: 'NoneType' object has no attribute 'endswith'

In [None]:
i = 0
print("Q:", results[i]["question"])
print("Semantic entropy:", results[i]["semantic_entropy"])
print()

for r, sid in zip(results[i]["responses"], results[i]["semantic_ids"]):
    print(f"[cluster {sid}] {r}")
