In [1]:
import os, psutil, gc
import time 
import json
import pprint

from collections import defaultdict
import random
import numpy as np

In [2]:
import torch 
from torch.nn import CrossEntropyLoss
import torch.nn.functional as F

from transformers import AutoModelForCausalLM, AutoTokenizer
from vllm import LLM, SamplingParams, PoolingParams

from sal.config import Config

In [3]:
# base_dir
base_dir = '/groups/kjun/tnn/datasets/'

# dataset path
dataset_dir = base_dir + "/prm800k/math_splits"

# llm and prm path
llm_dir = base_dir + "/Llama-3.2-1B-Instruct-GGUF/Llama-3.2-1B-Instruct.Q4_K_M.gguf"
prm_dir = base_dir + "/Llama3.1-8B-PRM-Deepseek-Data-GGUF/Llama3.1-8B-PRM-Deepseek-Data.Q4_K_M.gguf"

llm_tokenizer_dir = base_dir + "/Llama-3.2-1B-Instruct"
prm_tokenizer_dir = base_dir + "/Llama3.1-8B-PRM-Deepseek-Data"

In [4]:
tokenizer = AutoTokenizer.from_pretrained(llm_tokenizer_dir)
llm_tf = AutoModelForCausalLM.from_pretrained(llm_tokenizer_dir).to("cuda:3")
llm_tf.eval()
# model_regular.generation_config.pad_token_id = tokenizer.eos_token_id
gc.collect();torch.cuda.empty_cache();
print('#--- memory:', torch.cuda.memory_allocated(0)/(1024**3))

#--- memory: 0.0


In [5]:
texts = [
    "Hello, how are you?",
    "What is your name?",
    "Tell me a joke.",
    "Explain quantum computing in simple terms."
]

In [6]:
# Case 1: no batching 

torch.manual_seed(100000+0)
torch.cuda.manual_seed(100000+0)

outputs_no_padding = []
for text in texts:
    inputs = tokenizer(text, return_tensors="pt").to(llm_tf.device)
    with torch.no_grad():
        output_tokens = llm_tf.generate(**inputs, max_new_tokens=20, pad_token_id=tokenizer.eos_token_id)
    outputs_no_padding.append(tokenizer.decode(output_tokens[0], skip_special_tokens=True))

# Display results
for i, (input_text, output_text) in enumerate(zip(texts, outputs_no_padding)):
    print(f"Input {i+1}: {input_text}")
    print(f"Output {i+1}: {output_text}")
    print()

Input 1: Hello, how are you?
Output 1: Hello, how are you? I'm excited to be here today to talk about one of the most amazing things I've experienced in

Input 2: What is your name?
Output 2: What is your name? I am a teacher at a school in a rural area. I have been teaching for many years and

Input 3: Tell me a joke.
Output 3: Tell me a joke. Why did the chicken cross the road?
To get to the other side...of the joke!

Input 4: Explain quantum computing in simple terms.
Output 4: Explain quantum computing in simple terms. Imagine you have a magic box that can do calculations faster than any computer.

## Step 1:



In [7]:
# Case 2: batching with padding left

torch.manual_seed(100000+0)
torch.cuda.manual_seed(100000+0)

llm_tf.eval()

# Tokenize all texts as a batch
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = 'left'
inputs = tokenizer(texts, return_tensors="pt", padding=True).to(llm_tf.device)

# Generate outputs for the entire batch
with torch.no_grad():
    output_tokens = llm_tf.generate(
        **inputs,
        max_new_tokens=20,
        pad_token_id=tokenizer.eos_token_id  # prevent warning for models without pad_token
    )

output_texts = tokenizer.batch_decode(output_tokens, skip_special_tokens=True)

# Ensure output size matches input size
assert len(output_texts) == len(texts)

# Display results
for i, (input_text, output_text) in enumerate(zip(texts, output_texts)):
    print(f"Input {i+1}: {input_text}")
    print(f"Output {i+1}: {output_text}")
    print()

Input 1: Hello, how are you?
Output 1: Hello, how are you? I'm excited to be here today to talk about one of the most amazing things I've experienced in

Input 2: What is your name?
Output 2: What is your name? I am a friend of a friend, and I am not a professional in any field, but I

Input 3: Tell me a joke.
Output 3: Tell me a joke. Why did the chicken cross the playground?
I don't know, why?

Input 4: Explain quantum computing in simple terms.
Output 4: Explain quantum computing in simple terms. Quantum computing is a type of computer that uses the principles of quantum mechanics to perform calculations. It's



In [8]:
# Case 3: batching with padding left

torch.manual_seed(100000+0)
torch.cuda.manual_seed(100000+0)

llm_tf.eval()

# Tokenize all texts as a batch
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = 'right'
inputs = tokenizer(texts, return_tensors="pt", padding=True).to(llm_tf.device)

# Generate outputs for the entire batch
with torch.no_grad():
    output_tokens = llm_tf.generate(
        **inputs,
        max_new_tokens=20,
        pad_token_id=tokenizer.eos_token_id  # prevent warning for models without pad_token
    )

output_texts = tokenizer.batch_decode(output_tokens, skip_special_tokens=True)

# Ensure output size matches input size
assert len(output_texts) == len(texts)

# Display results
for i, (input_text, output_text) in enumerate(zip(texts, output_texts)):
    print(f"Input {i+1}: {input_text}")
    print(f"Output {i+1}: {output_text}")
    print()

A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


Input 1: Hello, how are you?
Output 1: Hello, how are you?assistant

Hello! I'm just a language model, I don't have feelings or

Input 2: What is your name?
Output 2: What is your name???
By the way, I am trying to get a better understanding of what constitutes a "good

Input 3: Tell me a joke.
Output 3: Tell me a joke.!
Here's one: A man walked into a library and asked the librarian, "Do you have

Input 4: Explain quantum computing in simple terms.
Output 4: Explain quantum computing in simple terms. Quantum computing is a type of computer that uses the principles of quantum mechanics to perform calculations. It's



In [9]:
torch.manual_seed(100000+0)
torch.cuda.manual_seed(100000+0)

# Tokenize as batch with padding
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = 'right'
inputs = tokenizer(texts, return_tensors="pt", padding=True).to(llm_tf.device)


with torch.no_grad():
    # Obtain hidden states from the LLM model 
    outputs = llm_tf(
        **inputs,
        output_hidden_states=True
    )

    # Obtain hidden states and logits
    # The hidden states are structured with a shape 
    # (num_layers, batch_size, seq_len, embed_dim)
    hidden_states = outputs.hidden_states[-1] # the last hidden layer
    
    # The logits structured with a shape 
    # (batch_size, seq_len, embed_dim)
    logits = outputs.logits

# Prepare logits and labels
# Shift logits and labels for LM scoring (predict token i based on 0:i-1)
shifted_labels = inputs['input_ids'][:, 1:]   
shifted_logits = outputs.logits[:, :-1, :]
# loss_fct = CrossEntropyLoss(reduction='sum')
# completion_log_prob = -loss_fct(shifted_logits.view(-1, shifted_logits.size(-1)), labels.view(-1)).detach().cpu().numpy()
# completion_ppl = np.exp(completion_log_prob/len(labels))

# Compute log_probs
log_probs = F.log_softmax(shifted_logits, dim=-1)
selected_log_probs = log_probs.gather(2, shifted_labels.unsqueeze(-1)).squeeze(-1)
print(selected_log_probs.shape)

# Mask padding tokens
attention_mask = inputs['attention_mask'][:, 1:]
selected_log_probs = selected_log_probs * attention_mask

total_log_probs = selected_log_probs.sum(dim=1)

# Extract last token embedding (prompt only)
# Find the last non-padding token index in each sequence
last_token_indices = inputs['attention_mask'].sum(dim=1) - 1
last_token_embeddings = hidden_states[torch.arange(hidden_states.size(0)), last_token_indices]

# Display results
for i, text in enumerate(texts):
    print(f"\n=== Input {i+1}: {text}")
    print(f"Last Token Embedding (5 dims): {last_token_embeddings[i][:5]}")
    print(f"Total Log Probability         : {total_log_probs[i].item()}")

torch.Size([4, 8])

=== Input 1: Hello, how are you?
Last Token Embedding (5 dims): tensor([-1.2149,  3.4383,  0.7283,  0.3183,  3.6557], device='cuda:3')
Total Log Probability         : -19.602453231811523

=== Input 2: What is your name?
Last Token Embedding (5 dims): tensor([-1.9130,  2.6492,  1.4096,  0.7731,  1.9185], device='cuda:3')
Total Log Probability         : -19.482925415039062

=== Input 3: Tell me a joke.
Last Token Embedding (5 dims): tensor([-0.3699,  5.0867,  2.6664,  1.7028,  4.2388], device='cuda:3')
Total Log Probability         : -19.041711807250977

=== Input 4: Explain quantum computing in simple terms.
Last Token Embedding (5 dims): tensor([ 3.3191,  2.3091, -0.4360,  0.5728,  2.2908], device='cuda:3')
Total Log Probability         : -31.33698844909668


In [12]:
torch.manual_seed(100000+0)
torch.cuda.manual_seed(100000+0)

outputs_no_padding = []
for text in texts:
    inputs = tokenizer(text, return_tensors="pt").to(llm_tf.device)
    with torch.no_grad():
        outputs = llm_tf(**inputs, output_hidden_states=True)
        
        logits = outputs.logits  # (1, seq_len, vocab_size)
        hidden_states = outputs.hidden_states[-1]  # (1, seq_len, hidden_size)

    # Shift logits and labels
    shifted_logits = logits[:, :-1, :]
    shifted_labels = inputs['input_ids'][:, 1:]

    # Compute log-probabilities
    log_probs = F.log_softmax(shifted_logits, dim=-1)
    selected_log_probs = log_probs.gather(2, shifted_labels.unsqueeze(-1)).squeeze(-1)

    # Sum log-probs (all tokens except the first, since LM can't predict first token)
    total_log_prob = selected_log_probs.sum()

    # Compute log probabilities using CrossEntropyLoss 
    # loss_fct = CrossEntropyLoss(reduction='sum')
    # total_log_prob = -loss_fct(shifted_logits.view(-1, shifted_logits.size(-1)), shifted_labels.view(-1))

    # Extract last token embedding (prompt only)
    last_token_index = inputs['input_ids'].shape[1] - 1
    last_token_embedding = hidden_states[0, -1]

    # Display results
    print(f"\n=== Input: {text}")
    print(f"Last Token Embedding (5 dims): {last_token_embedding[:5]}")
    print(f"Total Log Probability         : {total_log_prob.item()}")


=== Input: Hello, how are you?
Last Token Embedding (5 dims): tensor([-1.2149,  3.4383,  0.7283,  0.3183,  3.6557], device='cuda:3')
Total Log Probability         : -19.602453231811523

=== Input: What is your name?
Last Token Embedding (5 dims): tensor([-1.9130,  2.6492,  1.4096,  0.7731,  1.9185], device='cuda:3')
Total Log Probability         : -19.48291778564453

=== Input: Tell me a joke.
Last Token Embedding (5 dims): tensor([-0.3699,  5.0867,  2.6664,  1.7029,  4.2388], device='cuda:3')
Total Log Probability         : -19.04171371459961

=== Input: Explain quantum computing in simple terms.
Last Token Embedding (5 dims): tensor([ 3.3191,  2.3091, -0.4360,  0.5728,  2.2908], device='cuda:3')
Total Log Probability         : -31.336971282958984


In [11]:
# general params
config = Config()
config.n = 128

level = '1'
num_questions = len(data_by_levels[level])
# num_questions = 2
num_trials = 1
print(f"num_questions = {num_questions}")

method_number = 2
if method_number == 1:
    test_method = test_best_of_n_v11
else:
    test_method = test_best_of_n_v12
print(test_method)

batch_of_prompts = [data_by_levels[level][q_idx]['problem'] for q_idx in range(num_questions)]
start_time = time.time()
for t_idx in range(num_trials):
    print(f"trial {t_idx}")
    # test_method(batch_of_prompts, config, llm_vllm, random_seeds[t_idx])
    test_method(batch_of_prompts, config, llm_vllm, 10000+t_idx)

    # compute the time
    total_time = time.time() - start_time
    time_per_trial = total_time/(t_idx+1)
    time_per_question = time_per_trial/num_questions
    print(f"it takes {time_per_question:0.4f}s per question")
    print(f"it takes {time_per_trial:0.4f}s for this trial")

print(f"it takes {total_time:0.4f}s in total")

NameError: name 'data_by_levels' is not defined