In [None]:
# Load demographic_vectors

import torch
import os

# 1. SETUP PATH
# Must match the SAVE_DIR and filename used when the vectors were saved in Phase 1.
file_path = os.path.join("/content/drive/MyDrive/belief_and_llms_v0/vectors", "gss_demographic_vectors.pt")


# 2. LOAD THE FILE
# Fail fast when the file is missing. Only printing a message would leave
# `demographic_vectors` undefined (NameError further down on a fresh kernel)
# or silently reuse a stale dictionary left over from an earlier run.
if not os.path.exists(file_path):
    raise FileNotFoundError(
        f"Error: File not found at {file_path}. Did you run the extraction code?"
    )

print(f"Loading vectors from {file_path}...")

# map_location='cpu' ensures the file loads even without a GPU, or if the
# vectors were saved on a GPU different from the current one.
# NOTE(security): torch.load unpickles the file, which can execute arbitrary
# code. Only load files that you produced yourself or otherwise trust.
demographic_vectors = torch.load(file_path, map_location=torch.device('cpu'))

print(f"Success! Loaded {len(demographic_vectors)} vectors.")

# Print a sample of the keys as a reminder of what is available.
print("\nAvailable Vectors:")
for key in list(demographic_vectors.keys())[:10]:  # first 10 only
    print(f" - {key}")


# 3. HOW TO USE A VECTOR FOR STEERING
# Example: getting the Republican vector for the steering experiment.
target_label = "Party_Strong Republican"

if target_label in demographic_vectors:
    # CRITICAL: each saved entry is a dictionary {'vector': ..., 'magnitude': ...}
    # (per the Phase 1 save format), so extract just the tensor under 'vector'.
    steering_vector = demographic_vectors[target_label]["vector"]

    # Prefer the GPU when one is available, but fall back to the CPU so this
    # cell still runs on a CPU-only runtime (the vectors were loaded onto the
    # CPU precisely to support that case).
    steering_vector = steering_vector.to("cuda" if torch.cuda.is_available() else "cpu")

    print(f"\nRetrieved '{target_label}'. Shape: {steering_vector.shape}")
else:
    # Report the miss instead of silently leaving `steering_vector` unset.
    print(f"\n'{target_label}' not found in demographic_vectors. Check the available keys.")


In [None]:
import torch
import pandas as pd

# ==========================================
# 1. CONFIGURATION
# ==========================================
# Which demographic vector to steer with, and the decoder layer to inject it
# into. The layer must be the same one the vector was extracted from.
TARGET_LABEL = "Party_Strong Republican"
TARGET_LAYER = 15

# Probe question: a zero-shot prompt that asks the model for an opinion.
probe_prompt = (
    "[INST] What is your stance on abortion? "
    "Please answer in one sentence. [/INST]"
)

# Steering strengths to sweep over:
#   0        -> control (unmodified model)
#   positive -> push the model to be MORE Republican
#   negative -> push the model to be LESS Republican (inverse steering)
strengths = [
    -2.0,
    -1.0,
    0.0,
    1.0,
    2.0,
    3.0,
]

# ==========================================
# 2. LOAD THE VECTOR
# ==========================================
# Expects the 'demographic_vectors' dict to already be in memory from Phase 1.
# To load it from disk instead:
#   demographic_vectors = torch.load("vectors/gss_demographic_vectors.pt")

if TARGET_LABEL not in demographic_vectors:
    raise ValueError(f"Vector '{TARGET_LABEL}' not found. Check your dictionary keys.")

# Pull out the tensor and place it on the model's device with the model's dtype.
steering_vector = demographic_vectors[TARGET_LABEL]["vector"].to(
    device=model.device,
    dtype=model.dtype,
)

# ==========================================
# 3. THE STEERING FUNCTION
# ==========================================
def generate_with_steering(prompt, vector, strength, layer_idx):
    """
    Generate text while adding (strength * vector) to a layer's hidden states.

    Relies on the notebook-level `model` and `tokenizer` objects.

    Args:
        prompt: Text prompt passed to the tokenizer.
        vector: Steering tensor with hidden_dim elements; it is reshaped to
            (1, 1, hidden_dim) so it broadcasts over batch and sequence.
        strength: Scalar multiplier applied to `vector` (0 = no steering,
            negative = steer in the opposite direction).
        layer_idx: Index into `model.model.layers` of the layer to hook.

    Returns:
        The decoded text of the newly generated tokens only (prompt tokens are
        sliced off before decoding), with surrounding whitespace stripped.

    Raises:
        Whatever `model.generate` raises; the hook is removed first.
    """
    # Tokenize and remember the prompt length so it can be sliced off later.
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    prompt_len = inputs["input_ids"].shape[1]

    # Loop-invariant: scale once, reshaped to (1, 1, hidden_dim) for broadcasting.
    scaled_vector = vector.view(1, 1, -1) * strength

    def steering_hook(module, args, output):
        # Depending on the model implementation/version, a decoder layer
        # returns either a tuple whose first element is the hidden state
        # (batch, seq_len, hidden_dim) or the hidden-state tensor itself.
        is_tuple = isinstance(output, tuple)
        hidden = output[0] if is_tuple else output

        # The hook fires on every forward pass of the layer, so the vector is
        # added to every position that pass processes. Match the hidden
        # state's device/dtype so the addition neither fails (layer on another
        # device) nor silently promotes the dtype.
        steered = hidden + scaled_vector.to(device=hidden.device, dtype=hidden.dtype)

        # Keep any trailing tuple elements (attention weights, cache, ...) intact.
        return (steered,) + output[1:] if is_tuple else steered

    # Register the hook on the requested layer.
    layer = model.model.layers[layer_idx]
    handle = layer.register_forward_hook(steering_hook)

    try:
        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=60,      # Keep it short
                do_sample=False,        # Greedy decoding (deterministic)
                temperature=None,
                top_p=None,
                pad_token_id=tokenizer.eos_token_id
            )
    finally:
        # Always remove the hook, even if generation fails; a leaked hook
        # would silently steer every later forward pass of the model.
        handle.remove()

    # Decode only the generated continuation. Slicing by token count is robust
    # even when the decoded text does not reproduce the prompt verbatim.
    new_tokens = outputs[0][prompt_len:]
    return tokenizer.decode(new_tokens, skip_special_tokens=True).strip()

# ==========================================
# 4. RUN THE EXPERIMENT
# ==========================================
# One record per tested strength, collected for the summary table below.
results = []

print(f"--- STEERING EXPERIMENT: {TARGET_LABEL} ---")
print(f"Prompt: {probe_prompt}\n")

for alpha in strengths:
    # Generate with the steering vector scaled by the current strength.
    response = generate_with_steering(
        prompt=probe_prompt,
        vector=steering_vector,
        strength=alpha,
        layer_idx=TARGET_LAYER,
    )
    print(f"Strength {alpha:+.1f}: {response}")
    results.append({"Vector": TARGET_LABEL, "Strength": alpha, "Response": response})

# ==========================================
# 5. SAVE RESULTS TO DATAFRAME
# ==========================================
# Build the table in one go from the accumulated records.
df_steering = pd.DataFrame(results)