In [None]:
import json
# JSON Lines dataset: one JSON object per line, each providing the keys
# "problem" and "answer" (both are read below).
file_path = "/home/xhl/eval/my_eval/data/math/train.jsonl"

problems = []
answers = []

with open(file_path, "r", encoding="utf-8") as f:
    for line in f:
        # Blank lines (e.g. a trailing empty line at the end of the file) are
        # not valid JSON and would make json.loads raise; skip them.
        if not line.strip():
            continue
        item = json.loads(line)
        problems.append(item["problem"])
        answers.append(item["answer"])

# Wrap every problem in the fixed prompt template: an instruction line, the
# problem as the "User" turn, and an "Assistant" turn opened with "<think>".
examples = [
    "Please reason step by step, and put your final answer within \\boxed{}.\nUser: " + prompt + "\nAssistant: <think>"
    for prompt in problems
]

In [None]:
import json
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from repeng import ControlVector, ControlModel, DatasetEntry
import time

# Local checkpoint directory handed to both the tokenizer and the model loader.
model_name = "/data/zju-46/shenyl/hf/model/deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B/"

tokenizer = AutoTokenizer.from_pretrained(model_name)
# Pad on the left so that padding matches the masking logic of the control module.
tokenizer.padding_side = "left"
# Keep the pre-existing pad id setting.
# NOTE(review): id 0 may be a regular vocabulary token for this tokenizer, in
# which case "token id == pad id" is not a reliable padding test — confirm.
tokenizer.pad_token_id = 0

model = AutoModelForCausalLM.from_pretrained(model_name)
print(model)

# Device selection: CUDA first, then Apple MPS, then CPU.
if torch.cuda.is_available():
    # Prefer the second GPU as before, but fall back to the first one when only
    # a single GPU is visible (otherwise "cuda:1" is an invalid device ordinal).
    device = "cuda:1" if torch.cuda.device_count() > 1 else "cuda:0"
elif torch.backends.mps.is_available():
    device = "mps:0"
else:
    device = "cpu"
model = model.to(device)

# Wrap the model so that layers 1..27 can receive a control vector.
model = ControlModel(model, list(range(1, 28)))

# Keyword arguments forwarded to model.generate(...).
# "early_stopping" was dropped: it only applies to beam-based decoding, and this
# configuration decodes greedily (do_sample=False, default num_beams), so the
# flag had no effect other than triggering a generation-config warning.
settings = {
    "pad_token_id": tokenizer.pad_token_id,  # use pad id for proper masking/counting
    "do_sample": False,  # temperature=0
    "max_new_tokens": 2048,
    # "repetition_penalty": 1.1,  # reduce control jank
}

# Load the exported control vector and register it with coefficient 0.
# NOTE(review): a coefficient of 0 presumably means no steering is applied
# (baseline run) — confirm this is intended.
loaded_vector = ControlVector.import_gguf("execution_avg_vector.gguf")
model.set_control(loaded_vector, 0)

In [None]:
# Generate the whole batch in a single call.
batch_size = 64
batch_prompts = examples[:batch_size]
# The slice can be shorter than batch_size when fewer prompts are available,
# so per-request statistics must use the real count.
num_requests = len(batch_prompts)
if num_requests == 0:
    raise ValueError("no prompts available: `examples` is empty")

batch_inputs = tokenizer(
    batch_prompts,
    return_tensors="pt",
    padding=True,
)
batch_inputs = {k: v.to(model.device) for k, v in batch_inputs.items()}

time1 = time.time()
outputs = model.generate(**batch_inputs, **settings)
time2 = time.time()
elapsed = time2 - time1

# Count generated tokens per sample.
# generate() returns the (padded) prompt followed by the new tokens, so the
# first `prompt_width` columns are prompt for every row. Only the remaining
# columns are inspected, which keeps prompt tokens out of the count entirely.
pad_id = settings.get("pad_token_id", tokenizer.pad_token_id if tokenizer.pad_token_id is not None else -100)
prompt_width = batch_inputs["input_ids"].shape[1]
generated = outputs[:, prompt_width:].cpu()

# Rows that finish early are filled with pad_id up to the batch length, so the
# padding is the trailing run of pad_id. Counting only that trailing run (rather
# than every occurrence of pad_id) avoids dropping real tokens whose id happens
# to equal the pad id. flip + cumprod keeps a 1 only while every token seen from
# the end is a pad token.
trailing_pad = (generated == pad_id).flip(dims=[1]).long().cumprod(dim=1).sum(dim=1)
generated_per_sample = generated.shape[1] - trailing_pad
total_output_tokens = int(generated_per_sample.sum().item())

print(total_output_tokens)
print(f"{elapsed/num_requests:.8f}s/req")
toks_per_sec = total_output_tokens / elapsed if elapsed > 0 else 0
print(f"output toks/s: {toks_per_sec:.2f}")
# print(tokenizer.decode(outputs[0].squeeze()))

