In [7]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch.nn.functional as F

In [None]:
# --- STEP 1: INITIALIZATION ---
# Task: Load a small model for CPU/local GPU testing.
# Recommended: "google/gemma-2b" or "EleutherAI/pythia-70m"

model_id = "EleutherAI/pythia-70m" # Lightweight for local testing
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id)

# Layers whose output receives the latent perturbation.
# The paper uses [8, 16, 24, 30] for Llama-2-7B.
# Pythia-70m has only 6 layers, so a single layer is targeted here.
# NOTE(review): indices are treated as 1-based block numbers (4 -> layers[3]),
# matching how the hook is attached later in this notebook - confirm.
target_layers = [4]

# Fail fast if a different checkpoint is loaded and the layer choice no longer fits.
assert all(1 <= idx <= model.config.num_hidden_layers for idx in target_layers), (
    f"target_layers {target_layers} out of range for a model with "
    f"{model.config.num_hidden_layers} layers"
)

In [9]:
# --- STEP 2: THE LATENT HOOK ---
# Task: Create a hook that adds a perturbation to the layer's output.

class LATHook:
    """Forward hook that adds a perturbation (`delta`) to a layer's output.

    Register an instance with `module.register_forward_hook(...)`. While
    `delta` is None the hook is inactive and the layer output passes through
    untouched, so the model stays usable before/after an attack is set up.
    """

    def __init__(self):
        # delta will be the adversarial perturbation tensor; None disables the hook
        self.delta = None

    def __call__(self, module, input, output):
        """Return the layer output with `delta` added to the hidden states.

        Args:
            module: The hooked module (unused).
            input: The module's positional inputs (unused).
            output: The module's output: either the hidden-state tensor itself
                or a tuple whose first element is the hidden states.

        Returns:
            `output` unchanged when no delta is set; otherwise an object of the
            same structure with `delta` added to the hidden states. Any extra
            tuple elements are passed through as-is.
        """
        if self.delta is None:
            return output

        # Layers may return `(hidden_states, ...)` or just `hidden_states`;
        # indexing a bare tensor with [0] would silently select batch item 0.
        if isinstance(output, tuple):
            return (output[0] + self.delta,) + output[1:]
        return output + self.delta

# --- STEP 3: REGISTER HOOKS ---
# Task: Attach instances of LATHook to the model's residual stream.
# Documentation Hunt: Look up `model.named_modules()` to find the correct layer paths.

# Re-running this cell must not stack a second hook on the same layer (the
# perturbation would then be added twice), so detach handles from an earlier run.
for stale_handle in globals().get("hook_handles", {}).values():
    stale_handle.remove()

hooks = {idx: LATHook() for idx in target_layers}

# Convention: target index k is the k-th transformer block counted from 1,
# i.e. `model.gpt_neox.layers[k - 1]` (index 4 -> layers[3]).
num_blocks = len(model.gpt_neox.layers)
out_of_range = [idx for idx in hooks if not 1 <= idx <= num_blocks]
if out_of_range:
    raise IndexError(f"target layers {out_of_range} are outside 1..{num_blocks}")

# Keep the handles so the hooks can be detached later with `handle.remove()`.
hook_handles = {
    idx: model.gpt_neox.layers[idx - 1].register_forward_hook(hook)
    for idx, hook in hooks.items()
}

<torch.utils.hooks.RemovableHandle at 0x19e153ec3d0>

In [52]:
# --- STEP 4: PGD PROTOTYPE ---
# Task: Optimize a single 'delta' tensor to minimize loss on a target.

def smoke_test_pgd(prompt_text, target_text, steps=20, lr=0.01, epsilon=0.1, layer_idx=4):
    """Optimise a latent perturbation so the next token after the prompt becomes the target.

    Runs signed-gradient PGD on a single `delta` tensor, which the hook stored
    at `hooks[layer_idx]` adds to its layer's output on every forward pass.
    Prints the loss at each step and finally the model's top next-token
    prediction under the optimised perturbation.

    Args:
        prompt_text: Prompt whose next-token prediction is attacked.
        target_text: Desired continuation. Only its first token is used,
            tokenised with a leading space.
        steps: Number of PGD iterations.
        lr: Step size of each signed-gradient update.
        epsilon: Per-element bound on delta (radius of the L-infinity ball).
        layer_idx: Key into `hooks` selecting the hook that receives delta.

    Raises:
        ValueError: If `target_text` produces no tokens.

    Note:
        The optimised delta stays installed on the hook after this returns, so
        later forward passes through the model are still perturbed.
    """
    device = model.device

    # 1. Tokenize inputs, keeping them on the same device as the model and delta.
    tokens = tokenizer(prompt_text, return_tensors="pt").to(device)

    # The target does not change between steps, so encode it once up front.
    target_ids = tokenizer.encode(" " + target_text.strip(), add_special_tokens=False, return_tensors="pt")
    if target_ids.numel() == 0:
        raise ValueError("target_text must contain at least one token")
    target_id = target_ids[:, 0].to(device)  # Take only the first token ID

    # 2. Initialize delta: one perturbation vector per prompt position.
    seq_len = tokens["input_ids"].shape[1]
    delta = torch.zeros(
        (1, seq_len, model.config.hidden_size),
        device=device,
        dtype=model.dtype,
        requires_grad=True,
    )
    hooks[layer_idx].delta = delta

    # 3. Optimization loop
    for _ in range(steps):
        #    a. Forward pass (the hook injects delta)
        logits = model(**tokens).logits
        last_token_logits = logits[:, -1, :]  # only the next-token position matters

        #    b. Cross-entropy of the next-token distribution against the target token
        loss = F.cross_entropy(last_token_logits, target_id)

        #    c. Gradient w.r.t. delta only. loss.backward() would also accumulate
        #       gradients in every model parameter, which nothing here consumes.
        (delta_grad,) = torch.autograd.grad(loss, delta)

        with torch.no_grad():
            #    d. Signed-gradient descent step on delta
            delta.sub_(lr * delta_grad.sign())
            #    e. Project back onto the L-infinity ball of radius epsilon
            delta.clamp_(-epsilon, epsilon)
        print(loss.item())

    with torch.no_grad():
        # Final forward pass with the optimized delta
        final_logits = model(**tokens).logits[:, -1, :]

        # Top predicted word
        predicted_id = torch.argmax(final_logits, dim=-1)
        predicted_word = tokenizer.decode(predicted_id)

        print(f"Final Prediction: {predicted_word}")

# Smoke test: steer the completion of a factual prompt towards a wrong answer.
smoke_test_pgd(
    prompt_text="The capital of France is",
    target_text="Berlin",
)

8.990674018859863
7.920614242553711
6.929909706115723
6.012196063995361
5.149251937866211
4.3205084800720215
3.5313620567321777
2.785336971282959
2.0899229049682617
1.4594265222549438
0.9444214701652527
0.8326348662376404
0.7276362180709839
0.653099536895752
0.5828970670700073
0.5311400294303894
0.48197075724601746
0.44995179772377014
0.4190617799758911
0.3955107033252716
Final Prediction:  Berlin
