In [45]:
import re
from time import perf_counter

import torch as t
from transformers import AutoTokenizer, AutoModelForCausalLM, StoppingCriteria, StoppingCriteriaList

In [2]:
# Load model and tokenizer
device = "cuda" if t.cuda.is_available() else "cpu"
# Keep the checkpoint id under its own name so that `model` only ever refers
# to the loaded network (it used to be a str first and the model afterwards).
MODEL_NAME = "deepseek-ai/DeepSeek-R1-Distill-Qwen-14B"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME).to(device)

Sliding Window Attention is enabled but not implemented for `sdpa`; unexpected results may be encountered.


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [91]:
def generate_text(prompt, trim=0,
                  max_new_tokens=5000, temperature=0.6,
                  top_p=0.9, top_k=50,
                  num_return_sequences=1, stop_on_eos=True,
                  stop_sequences=None):
    """
    Sample text from the notebook-level `model`, conditioned on `prompt`.

    Uses the module-level `model` and `tokenizer`; nothing is loaded here.

    Args:
        prompt (str or list[str]): The input text to condition the generation on.
            A list is tokenized as one padded batch.
        trim (int): The number of *tokens* to drop from the end of the tokenized
            prompt before generating (0 keeps the whole prompt). For a padded
            batch the same number of trailing columns is dropped from every row.
        max_new_tokens (int): Maximum number of tokens to generate after the prompt
        temperature (float): Higher values increase randomness in generation
        top_p (float): Nucleus sampling parameter (1.0 = no effect)
        top_k (int): Limit vocabulary to top k tokens (0 = no effect)
        num_return_sequences (int): Number of different sequences to generate
        stop_on_eos (bool): Whether to pass the tokenizer's EOS token as a stop token
        stop_sequences (list): List of strings to stop generation when encountered.
            Generation stops for the whole batch as soon as any row ends with
            one of them.

    Returns:
        list: Decoded sequences (prompt followed by the generated text), with
        special tokens stripped.
    """
    # A negative slice bound drops the last `trim` tokens; None keeps everything.
    keep = -trim if trim else None

    device = "cuda" if t.cuda.is_available() else "cpu"

    # Encode the prompt
    inputs = tokenizer(prompt, return_tensors="pt", padding=True).to(device)

    # Generate text
    generate_params = {
        "input_ids": inputs["input_ids"][..., :keep],
        # Pass the mask explicitly (trimmed the same way as the ids) so padding
        # positions in a batch are not attended to.
        "attention_mask": inputs["attention_mask"][..., :keep],
        "max_new_tokens": max_new_tokens,
        "temperature": temperature,
        "top_p": top_p,
        "top_k": top_k,
        "do_sample": True,
        "num_return_sequences": num_return_sequences,
        "pad_token_id": tokenizer.eos_token_id,
    }

    # Add EOS token stopping if enabled
    # NOTE(review): when stop_on_eos is False no eos_token_id is passed here;
    # generate() may still pick one up from the model's own generation config
    # -- confirm if truly unbounded generation is required.
    if stop_on_eos:
        generate_params["eos_token_id"] = tokenizer.eos_token_id

    # Add custom stopping criteria if provided
    if stop_sequences:
        class StopOnTokens(StoppingCriteria):
            """Stop once any row of the batch ends with one of the stop sequences."""

            def __init__(self, tokenizer, stop_sequences):
                self.tokenizer = tokenizer
                # Tokenize each (non-empty) stop sequence once, up front.
                self.stop_ids_list = [
                    tokenizer.encode(seq, add_special_tokens=False, return_tensors="pt")[0]
                    for seq in stop_sequences
                    if seq
                ]

            def __call__(self, input_ids, scores, **kwargs):
                for stop_ids in self.stop_ids_list:
                    stop_len = len(stop_ids)
                    if stop_len == 0 or input_ids.shape[1] < stop_len:
                        continue
                    # The stop ids were built on CPU; compare on the device
                    # the generated ids live on.
                    stop_ids = stop_ids.to(input_ids.device)
                    tail = input_ids[:, -stop_len:]
                    if t.any(t.all(tail == stop_ids, dim=-1)).item():
                        return True
                return False

        generate_params["stopping_criteria"] = StoppingCriteriaList([
            StopOnTokens(tokenizer, stop_sequences)
        ])

    with t.no_grad():
        outputs = model.generate(**generate_params)

    # Decode and return the generated text
    generated_texts = [tokenizer.decode(output, skip_special_tokens=True) for output in outputs]
    return generated_texts
    
# Smoke test with both sampling filters disabled (top_p=1.0, top_k=0); top_k is an integer count.
generate_text("User: Hi! What's your name? Agent: <think>\n", top_p=1.0, top_k=0)

['User: Hi! What\'s your name? Agent: <think>\nOkay, so I\'m trying to figure out how to respond to the user\'s message. They said, "Hi! What\'s your name?" and the agent replied with just "Hi! I\'m DeepSeek-R1." Now, the user is asking me to think through how I would approach this situation. \n\nFirst, I need to understand the context. It seems like the user is interacting with an AI agent, specifically DeepSeek-R1, and they\'re asking for the agent\'s name. The agent has already provided its name, so maybe the user is looking for a more detailed response or perhaps they want to know more about the agent\'s capabilities or background.\n\nI should consider what a typical response would look like in this scenario. The agent has already given its name, so perhaps the next step is to ask the user how they can assist them. Alternatively, the user might be testing the agent\'s responsiveness or checking if the agent is functioning correctly.\n\nI should also think about the tone and politen

Load the MMLU-pro dataset

In [4]:
from datasets import load_dataset

# Load the MMLU-Pro dataset; later cells index its 'test' split.
ds = load_dataset("TIGER-Lab/MMLU-Pro")

In [5]:
# Answer options of the first test question (index the row first so only that row is read).
ds['test'][0]['options']

['Safe practices, Fear, Jealousy, Trivial',
 'Unsafe practices, Distress, Joy, Trivial',
 'Safe practices, Wants, Jealousy, Trivial',
 'Safe practices, Distress, Fear, Trivial',
 'Unsafe practices, Wants, Jealousy, Serious',
 'Safe practices, Distress, Jealousy, Serious',
 'Safe practices, Wants, Fear, Serious',
 'Unsafe practices, Wants, Fear, Trivial',
 'Unsafe practices, Distress, Fear, Serious']

Let's benchmark how long it takes to answer a question.

In [2]:
from prompting import gen_prompt
# Preview the prompt built for the first test question.
# Needs `ds` from the dataset-loading cell to already exist in the kernel.
print(gen_prompt(ds['test'][0]))

NameError: name 'ds' is not defined

In [89]:
from time import time

In [67]:
# perf_counter is a monotonic, high-resolution clock meant for measuring elapsed time.
start = perf_counter()
response = generate_text(gen_prompt(ds['test'][1]))

end = perf_counter()

print(response)
print(f"Took {end - start}s to compute.")

['Answer the following multiple choice question, selecting from the answer A through to J. After thinking, reply directly with your answer. \n\nThe last character of your response must be the letter you have chosen.\n\nQuestion:\n\nManagers are entrusted to run the company in the best interest of ________. Specifically, they have a duty to act for the benefit of the company, as well as a duty of ________ and of _______.\n\nOptions:\n\nA: Shareholders, Diligence, Self-interest\nB: Shareholders, Self-interest, Care and Skill\nC: Stakeholders, Care and skill, Self-interest\nD: Stakeholders, Diligence, Care and Skill\nE: Customers, Care and Skill, Diligence\nF: Shareholders, Care and Skill, Diligence\nG: Shareholders, Self-interest, Diligence\nH: Employees, Care and Skill, Diligence\nI: Stakeholders, Self-interest, Diligence\nJ: Stakeholder, Care and Skill, Diligence\n\nAnswer:\n    <think>\nOkay, so I\'ve got this multiple-choice question about managers and their responsibilities. Let me 

Time how long it takes to reprocess an already generated response.

In [68]:
# Keep the full transcript from the previous cell; `response` is rebound below.
pregenerated = response[0]
# perf_counter is a monotonic, high-resolution clock meant for measuring elapsed time.
start = perf_counter()
# Drop the last 20 tokens of the transcript and regenerate at most 20 new ones.
response = generate_text(pregenerated, trim=20,
                         max_new_tokens=20)

end = perf_counter()

print(response)
print(f"Took {end - start}s to compute.")

['Answer the following multiple choice question, selecting from the answer A through to J. After thinking, reply directly with your answer. \n\nThe last character of your response must be the letter you have chosen.\n\nQuestion:\n\nManagers are entrusted to run the company in the best interest of ________. Specifically, they have a duty to act for the benefit of the company, as well as a duty of ________ and of _______.\n\nOptions:\n\nA: Shareholders, Diligence, Self-interest\nB: Shareholders, Self-interest, Care and Skill\nC: Stakeholders, Care and skill, Self-interest\nD: Stakeholders, Diligence, Care and Skill\nE: Customers, Care and Skill, Diligence\nF: Shareholders, Care and Skill, Diligence\nG: Shareholders, Self-interest, Diligence\nH: Employees, Care and Skill, Diligence\nI: Stakeholders, Self-interest, Diligence\nJ: Stakeholder, Care and Skill, Diligence\n\nAnswer:\n    <think>\nOkay, so I\'ve got this multiple-choice question about managers and their responsibilities. Let me 

# Side-tangent: How Deterministic is the chain of thought?

Let's progressively regenerate a chain of thought and see how far back we have to go until the answer stops being "F".

We can even intervene on the label in the prompt to see if the model is just copying the "F" from earlier in the question.

In [76]:
# Show the earlier transcript with the first "F" swapped for "Z".
print(pregenerated.replace("F", "Z", 1))
# Does the model still predict F, even when regenerating the whole answer from scratch?
# The same first-"F" swap is applied to a freshly built prompt for question 1.
print(generate_text(gen_prompt(ds['test'][1]).replace("F", "Z", 1)))

Answer the following multiple choice question, selecting from the answer A through to J. After thinking, reply directly with your answer. 

The last character of your response must be the letter you have chosen.

Question:

Managers are entrusted to run the company in the best interest of ________. Specifically, they have a duty to act for the benefit of the company, as well as a duty of ________ and of _______.

Options:

A: Shareholders, Diligence, Self-interest
B: Shareholders, Self-interest, Care and Skill
C: Stakeholders, Care and skill, Self-interest
D: Stakeholders, Diligence, Care and Skill
E: Customers, Care and Skill, Diligence
Z: Shareholders, Care and Skill, Diligence
G: Shareholders, Self-interest, Diligence
H: Employees, Care and Skill, Diligence
I: Stakeholders, Self-interest, Diligence
J: Stakeholder, Care and Skill, Diligence

Answer:
    <think>
Okay, so I've got this multiple-choice question about managers and their responsibilities. Let me try to break it down step 

In [73]:
def regenerate_cot(prompt, old_label="F", new_label="Z"):
    """
    Re-sample the tail of a transcript from progressively earlier cut points.

    The first occurrence of `old_label` in `prompt` is replaced with
    `new_label`. Then, for i = 1 .. (token count - 1), the last i tokens are
    trimmed off, up to i new tokens are generated with `generate_text`, and
    the final 100 characters of the result are printed.

    Args:
        prompt (str): A previously generated transcript (prompt + chain of thought).
        old_label (str): Text whose first occurrence is replaced (default "F").
        new_label (str): Replacement text (default "Z").

    Returns:
        None. Results are printed, one per cut point.
    """
    # The substitution does not depend on i, so do it once up front.
    intervened = prompt.replace(old_label, new_label, 1)
    # Count tokens of the text that is actually trimmed by generate_text.
    n_tokens = tokenizer(intervened, return_tensors="pt")['input_ids'].shape[-1]

    for i in range(1, n_tokens):
        regenerated = generate_text(intervened, trim=i, max_new_tokens=i)[0]

        print(regenerated[-100:])


# One generation per token of the transcript, so this is slow; the recorded run was interrupted by hand.
regenerate_cot(pregenerated)

stakeholders.
</think>

The correct answer is F: Shareholders, Care and Skill, Diligence.

Answer: F
stakeholders.
</think>

The correct answer is F: Shareholders, Care and Skill, Diligence.

Answer: F
stakeholders.
</think>

The correct answer is F: Shareholders, Care and Skill, Diligence.

Answer: F
stakeholders.
</think>

The correct answer is F: Shareholders, Care and Skill, Diligence.

Answer: F
stakeholders.
</think>

The correct answer is F: Shareholders, Care and Skill, Diligence.

Answer: F
stakeholders.
</think>

The correct answer is F: Shareholders, Care and Skill, Diligence.

Answer: F
stakeholders.
</think>

The correct answer is F: Shareholders, Care and Skill, Diligence.

Answer: F
stakeholders.
</think>

The correct answer is F: Shareholders, Care and Skill, Diligence.

Answer: F
stakeholders.
</think>

The correct answer is F: Shareholders, Care and Skill, Diligence.

Answer: F
stakeholders.
</think>

The correct answer is F: Shareholders, Care and Skill, Diligence.



KeyboardInterrupt: 

# Dataset generation

For now, because of the speedup, let's generate answers to all of the questions (shuffling labels appropriately)
and store them as plaintext.

We're going to shuffle labels. 

We don't need to keep track of which label is the correct one, because we ultimately just want to predict the answer
the model will give, rather than the correct answer.

The model is giving somewhat erratically formatted responses, so we might need to use an LLM with a restricted output space later on to extract the answer.

In [82]:
import random

def gen_prompt_shuffled(question, rng=None):
    """
    Build the multiple-choice prompt for `question` with its options in random order.

    Options are labelled "A", "B", ... in their shuffled order; the input
    options list is not modified.

    Args:
        question (dict): Needs a 'question' string and an 'options' list of strings.
        rng: Optional object with a `sample(population, k)` method (for example
            `random.Random(seed)`) used for the shuffle, so a run can be
            reproduced. Defaults to the global `random` module.

    Returns:
        str: The prompt text, ending with an opened `<think>` tag and a newline.
    """
    sampler = random if rng is None else rng
    shuffled = sampler.sample(question['options'], len(question['options']))
    options = "\n".join(
        f"{chr(ord('A') + i)}: {option}" for i, option in enumerate(shuffled)
    )

    # The trailing backslashes join the first three source lines into one line of output.
    return f"""Answer the following multiple choice question, selecting \
from the answer A through to J. After thinking, reply \
directly with your answer. \

Question:

{question['question']}

Options:

{options}

Answer:
    <think>\n"""


# Build the prompt twice for the same question to see the option order change between calls.
print(gen_prompt_shuffled(ds['test'][0]))
print(gen_prompt_shuffled(ds['test'][0]))

Answer the following multiple choice question, selecting from the answer A through to J. After thinking, reply directly with your answer. 
Question:

Typical advertising regulatory bodies suggest, for example that adverts must not: encourage _________, cause unnecessary ________ or _____, and must not cause _______ offence.

Options:

A: Safe practices, Distress, Fear, Trivial
B: Unsafe practices, Wants, Jealousy, Serious
C: Safe practices, Distress, Jealousy, Serious
D: Unsafe practices, Wants, Fear, Trivial
E: Safe practices, Wants, Jealousy, Trivial
F: Safe practices, Wants, Fear, Serious
G: Unsafe practices, Distress, Fear, Serious
H: Unsafe practices, Distress, Joy, Trivial
I: Safe practices, Fear, Jealousy, Trivial

Answer:
    <think>

Answer the following multiple choice question, selecting from the answer A through to J. After thinking, reply directly with your answer. 
Question:

Typical advertising regulatory bodies suggest, for example that adverts must not: encourage _____

In [83]:
# Show the test split's summary (column names and row count).
ds['test']

Dataset({
    features: ['question_id', 'question', 'options', 'answer', 'answer_index', 'cot_content', 'category', 'src'],
    num_rows: 12032
})

In [84]:
# Check that the tokenizer accepts a list of strings; without padding each row keeps its own length.
tokenizer(["asdfwsa", "asdfd"])

{'input_ids': [[151646, 76615, 8915, 64], [151646, 76615, 67]], 'attention_mask': [[1, 1, 1, 1], [1, 1, 1]]}

Try batching, see if it is any faster

In [92]:
# perf_counter is a monotonic, high-resolution clock meant for measuring elapsed time.
start = perf_counter()
# Generate for the first three test questions as a single padded batch.
response = generate_text([gen_prompt(ds['test'][i]) for i in range(3)])

end = perf_counter()

print(response)
print(f"Took {end - start}s to compute.")



['Answer the following multiple choice question, selecting from the answer A through to J. After thinking, reply directly with your answer. \n\nThe last character of your response must be the letter you have chosen.\n\nQuestion:\n\nTypical advertising regulatory bodies suggest, for example that adverts must not: encourage _________, cause unnecessary ________ or _____, and must not cause _______ offence.\n\nOptions:\n\nA: Safe practices, Fear, Jealousy, Trivial\nB: Unsafe practices, Distress, Joy, Trivial\nC: Safe practices, Wants, Jealousy, Trivial\nD: Safe practices, Distress, Fear, Trivial\nE: Unsafe practices, Wants, Jealousy, Serious\nF: Safe practices, Distress, Jealousy, Serious\nG: Safe practices, Wants, Fear, Serious\nH: Unsafe practices, Wants, Fear, Trivial\nI: Unsafe practices, Distress, Fear, Serious\n\nAnswer:\n    <think>\nOkay, I need to solve this multiple-choice question about advertising regulatory bodies. Let me read the question carefully.\n\nThe question is asking

239 seconds for a batch of three questions is much slower than 3× the time for a single question, so let's avoid batching.