In [2]:
import os
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

# Expose only the first GPU to this process.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

model_id = "Qwen/Qwen2.5-7B-Instruct"   # instruction-tuned 7B checkpoint
cache_root = "/home/tim/xai-hackathon-12-06-25/assets/huggingface"

# Options shared by the tokenizer and the model loader: both read from the
# same on-disk cache and both allow repository-provided code.
hub_kwargs = dict(cache_dir=cache_root, trust_remote_code=True)

tok = AutoTokenizer.from_pretrained(model_id, **hub_kwargs)

# Load the weights in bfloat16 and let `device_map="auto"` decide placement.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    dtype=torch.bfloat16,
    device_map="auto",
    **hub_kwargs,
)


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [3]:
# Greedy-decode a short completion and report the log-probability that the
# returned per-step scores assign to each token that was actually generated.
prompt = "Write one sentence about attention mechanisms."
inputs = tok(prompt, return_tensors="pt").to(model.device)
prompt_len = inputs["input_ids"].shape[1]

# NOTE(review): with do_sample=False, transformers warns that temperature /
# top_p / top_k "are not valid and may be ignored" -- presumably those values
# come from the checkpoint's default generation config; confirm before silencing.
with torch.no_grad():
    gen = model.generate(
        **inputs,
        max_new_tokens=60,
        do_sample=False,
        return_dict_in_generate=True,
        output_scores=True,
    )

seq = gen.sequences[0]
gen_ids = seq[prompt_len:]  # strip the prompt; keep only newly generated tokens
scores = gen.scores  # one (batch, vocab) score tensor per generated token
# NOTE(review): `scores` are the values generate() selected from, i.e. after any
# logits processors configured for the model. If raw model log-probs are wanted,
# verify whether `output_logits=True` / `gen.logits` should be used instead.

if len(scores) > 0:
    # Stack the steps into (steps, vocab) for the single batch row, normalise
    # once in float32, then pick out the chosen token at every step. This
    # replaces a per-token loop that forced one host sync per token via .item().
    step_scores = torch.stack(tuple(scores), dim=0)[:, 0, :].float()
    chosen = gen_ids.to(step_scores.device).unsqueeze(-1)
    logprobs = torch.log_softmax(step_scores, dim=-1).gather(-1, chosen).squeeze(-1).tolist()
else:
    # Nothing was generated, so there is nothing to score.
    logprobs = []

text = tok.decode(gen_ids, skip_special_tokens=True)
print(text)
print("logprobs:", logprobs)
print("total logprob:", sum(logprobs))

The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


 Attention mechanisms allow neural networks to focus on specific parts of input data when making predictions or decisions, improving performance in tasks such as natural language processing and image recognition.
logprobs: [-0.014956246130168438, -5.376194530981593e-05, -0.5838623046875, -0.7278666496276855, -0.0004903068183921278, -4.768360213347478e-06, -0.009762637317180634, -1.3708974620385561e-05, -0.5793320536613464, -0.0046726795844733715, -1.1920928244535389e-07, -0.04663544520735741, -0.000303818320389837, -0.06301615387201309, -0.24273931980133057, -0.018959958106279373, -0.4965890049934387, -0.01852274499833584, -0.5918033123016357, -0.03130823001265526, -0.32091835141181946, -0.15038171410560608, -0.0969526395201683, -0.4741094708442688, 0.0, -0.925885796546936, -3.814689989667386e-06, -7.676783570786938e-05, -0.00016306500765495002, -0.1457817703485489, -0.12014768272638321, -0.00011169286881340668, -0.08405431360006332]
total logprob: -5.749480303435568


In [8]:
import json, random, os
import torch
import pandas as pd
from tqdm import tqdm
import transformers

from utils.parse_utils import parse_answer
from constants import *
from utils.power_samp_utils import format_prompt, AutoregressiveSampler, mcmc_power_samp

# Use the GPU when one is available; this string is handed to the sampler and
# used for placing prompt tensors.
device = "cuda" if torch.cuda.is_available() else "cpu"
model_str = "Qwen/Qwen2.5-7B"
json_file = "data/MATH500.json"

# JSON text is UTF-8; state it explicitly so loading does not depend on the
# platform's default locale encoding.
with open(json_file, "r", encoding="utf-8") as f:
    dataset = json.load(f)

tokenizer = transformers.AutoTokenizer.from_pretrained(model_str, trust_remote_code=True)
hf_model = transformers.AutoModelForCausalLM.from_pretrained(
    model_str, torch_dtype="auto", device_map="auto", trust_remote_code=True
)
# Project helper wrapping the model/tokenizer pair; consumed by mcmc_power_samp.
autoreg_sampler = AutoregressiveSampler(hf_model, tokenizer, device)

def run_qwen(dataset, start=0, end=100, cot=True, temp=0.25, mcmc_steps=10, max_new_tokens=3072, seed=0):
    """Compare three decoding strategies on ``dataset[start:end]``.

    For every problem this produces a completion from
    (1) sampling with ``temperature=temp``,
    (2) sampling with the model's default sampling settings, and
    (3) ``mcmc_power_samp``,
    and runs ``parse_answer`` on each completion.

    Relies on the module-level ``tokenizer``, ``hf_model``, ``autoreg_sampler``
    and ``device``.

    Args:
        dataset: Sequence of dicts with ``"prompt"`` and ``"answer"`` keys.
        start: Index of the first problem to run (inclusive).
        end: Index one past the last problem to run.
        cot: Forwarded to ``format_prompt``.
        temp: Temperature for the first strategy; also forwarded to
            ``mcmc_power_samp``.
        mcmc_steps: Forwarded to ``mcmc_power_samp``.
        max_new_tokens: Generation budget used by all three strategies.
        seed: Seed applied to both Python's ``random`` and torch's RNG.

    Returns:
        pandas.DataFrame with one row per problem and the columns
        ``question``, ``correct_answer``, ``naive_completion``,
        ``naive_answer``, ``std_completion``, ``std_answer``,
        ``mcmc_completion``, ``mcmc_answer`` and ``acceptance_ratio``.
    """
    # generate(do_sample=True) draws from torch's RNG, so seeding `random`
    # alone does not make a run repeatable -- seed both.
    random.seed(seed)
    torch.manual_seed(seed)

    def _sample_completion(input_ids, prompt_len, **sampling_kwargs):
        """Sample one continuation and decode only the newly generated tokens."""
        # Per-step scores are never read here, so they are not requested:
        # keeping them would retain a (batch, vocab) tensor for every step.
        out = hf_model.generate(
            input_ids,
            max_new_tokens=max_new_tokens,
            return_dict_in_generate=True,
            do_sample=True,
            **sampling_kwargs,
        )
        # Index the single batch row directly so the result is always 1-D,
        # even when exactly one token was generated.
        new_ids = out.sequences[0, prompt_len:].to("cpu")
        return tokenizer.decode(new_ids, skip_special_tokens=True)

    results = []
    for data in tqdm(dataset[start:end], desc="MATH"):
        question = data["prompt"]
        answer = data["answer"]

        input_text = format_prompt(question, "qwen", tokenizer, cot)
        input_ids = tokenizer.encode(input_text, return_tensors="pt").to(device)
        prefx = input_ids[0].tolist()

        # Same call order as before (temperature sample, default sample, MCMC)
        # so the RNG stream is consumed in a fixed sequence.
        naive_completion = _sample_completion(input_ids, len(prefx), temperature=temp)
        std_completion = _sample_completion(input_ids, len(prefx))
        mcmc_ids, _, _, acc = mcmc_power_samp(
            autoreg_sampler, prefx, temp, mcmc_steps, max_new_tokens=max_new_tokens
        )

        # NOTE(review): unlike the two generate() outputs, mcmc_ids is decoded
        # without slicing off the prompt -- confirm whether mcmc_power_samp
        # returns completion-only ids or prompt + completion.
        mcmc_ids_t = torch.tensor(mcmc_ids, dtype=torch.long).to("cpu")
        mcmc_completion = tokenizer.decode(mcmc_ids_t, skip_special_tokens=True)

        results.append({
            "question": question,
            "correct_answer": answer,
            "naive_completion": naive_completion,
            "naive_answer": parse_answer(naive_completion),
            "std_completion": std_completion,
            "std_answer": parse_answer(std_completion),
            "mcmc_completion": mcmc_completion,
            "mcmc_answer": parse_answer(mcmc_completion),
            "acceptance_ratio": acc,
        })
    return pd.DataFrame(results)

# df = run_qwen(dataset, start=0, end=100, cot=True, temp=0.25, mcmc_steps=10)
# df


ImportError: attempted relative import beyond top-level package