In [1]:
import torch
import numpy as np
import os

In [2]:
# Load extraction_datasets.json
# Maps each group label to its list of prompt pairs.
# The encoding is pinned to UTF-8 so the read does not depend on the
# platform's default locale encoding.
import json
with open("data/gss_extraction_datasets.json", "r", encoding="utf-8") as f:
    extraction_datasets = json.load(f)

In [3]:
# Display the group labels available in the loaded dataset.
extraction_datasets.keys()

dict_keys(['Race_person of different race than Black or White', 'Race_White person', 'Race_Black person', 'PartyID_Independent leaning Republican', 'PartyID_Strong Democrat', 'PartyID_Democrat', 'PartyID_Political Independent', 'PartyID_Strong Republican', 'PartyID_Independent leaning Democrat', 'PartyID_Other', 'PartyID_Republican', 'Sex_person', 'Degree_high school graduate', 'Degree_person with less than high school education', "Degree_person with a bachelor's degree", 'Degree_person with some college education', 'Degree_person with a graduate degree', 'Religion_Catholic', 'Religion_Protestant', 'Religion_person of other religion', 'Religion_Jewish', 'PolViews_person with a slightly liberal political view', 'PolViews_person with a conservative political view', 'PolViews_person with a liberal political view', 'PolViews_person with a neutral political view', 'PolViews_person with a slightly conservative political view', 'PolViews_person with an extremely liberal political view', 'PolV

In [None]:
# Model Setup
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

model_name = "meta-llama/Meta-Llama-3-8B-Instruct"

# Load model in 4-bit to save memory.
# The request goes through an explicit BitsAndBytesConfig because passing
# `load_in_4bit=True` straight to `from_pretrained` is the deprecated spelling.
# No other quantization option is set, so the library defaults still apply.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=BitsAndBytesConfig(load_in_4bit=True),
    device_map="auto",
    torch_dtype=torch.float16
)

tokenizer = AutoTokenizer.from_pretrained(model_name)
# Use the EOS token as the padding token.
tokenizer.pad_token = tokenizer.eos_token

# Index of the decoder layer whose output activations are captured
TARGET_LAYER = 15
# Directory the extracted vectors are written to (created if missing)
SAVE_DIR = "vectors"
os.makedirs(SAVE_DIR, exist_ok=True)

# Dictionary to store the final results in RAM
# Structure: {'Race_Black person': {'vector': Tensor(shape=[hidden_dim]), 'magnitude': float}, ...}
demographic_vectors = {}

In [None]:
# Helper: Hook Function
def get_layer_activations(model, tokenizer, inputs_text, layer_idx):
    """
    Run text through the model and capture the hidden state of the last real
    (non-padding) token at the output of the specified layer.

    Args:
        model: Model exposing its decoder blocks as ``model.model.layers`` and
            its device as ``model.device``.
        tokenizer: Callable tokenizer, invoked with ``return_tensors="pt"``.
        inputs_text: A single string, or a list of strings processed as one batch.
        layer_idx: Index into ``model.model.layers`` of the layer to capture.

    Returns:
        CPU tensor of shape [batch_size, hidden_dim] (batch_size is 1 for a
        single string), or None if the layer's forward hook never fired.
    """
    # Padding is requested only for a batch, and only when the tokenizer has a
    # pad token; a single string is tokenized with exactly the original call.
    tok_kwargs = {"return_tensors": "pt"}
    is_batch = not isinstance(inputs_text, str)
    if is_batch and getattr(tokenizer, "pad_token", None) is not None:
        tok_kwargs["padding"] = True
    inputs = tokenizer(inputs_text, **tok_kwargs).to(model.device)

    # Position of the last attended token in each row. Multiplying the mask by
    # the position index and taking argmax works for left- and right-padded
    # rows alike; without padding it equals seq_len - 1 (the plain `-1` index).
    last_idx = None
    if "attention_mask" in inputs:
        mask = inputs["attention_mask"]
        positions = torch.arange(mask.shape[1], device=mask.device)
        last_idx = (mask.long() * positions).argmax(dim=1)

    # Placeholder filled in by the hook
    captured_hidden = None

    def hook_fn(module, input, output):
        nonlocal captured_hidden
        # The layer output is normally a tuple whose first item is the hidden
        # state [batch_size, seq_len, hidden_dim]; a bare tensor is accepted too.
        hidden = output[0] if isinstance(output, (tuple, list)) else output
        if last_idx is None:
            picked = hidden[:, -1, :]
        else:
            rows = torch.arange(hidden.shape[0], device=hidden.device)
            picked = hidden[rows, last_idx.to(hidden.device), :]
        captured_hidden = picked.detach().cpu()

    # Register hook
    handle = model.model.layers[layer_idx].register_forward_hook(hook_fn)

    try:
        # Forward pass
        with torch.no_grad():
            model(**inputs)
    finally:
        # Cleanup: always detach the hook, even when the forward pass raises,
        # so it cannot leak into later calls.
        handle.remove()

    return captured_hidden

In [None]:
# Extraction Loop (main)
print(f"Starting extraction on Layer {TARGET_LAYER}...")

# Iterate through every group created
for label, pairs in extraction_datasets.items():
    print(f"Processing: {label}...")

    # A group without pairs has nothing to average; skip it rather than let
    # torch.vstack fail on an empty list partway through the run.
    if not pairs:
        print(" -> Skipped (no prompt pairs).")
        continue

    # pairs is [(pos1, neg1), (pos2, neg2), ...]
    # 1-3. Run every positive text, then every negative text, through the model
    # one at a time and stack the results into [num_prompts, hidden_dim].
    pos_tensor = torch.vstack(
        [get_layer_activations(model, tokenizer, p[0], TARGET_LAYER) for p in pairs]
    )
    neg_tensor = torch.vstack(
        [get_layer_activations(model, tokenizer, p[1], TARGET_LAYER) for p in pairs]
    )

    # 4. Calculate the difference vector (mean(Pos) - mean(Neg))
    diff_vector = torch.mean(pos_tensor, dim=0) - torch.mean(neg_tensor, dim=0)

    # 5. Normalize (the norm is computed once and reused)
    norm = torch.norm(diff_vector)
    raw_magnitude = norm.item()
    if raw_magnitude == 0:
        # Identical means give an all-zero difference; dividing by the zero
        # norm would fill the stored vector with NaNs. Keep the zero vector
        # (its magnitude of 0.0 is stored next to it) and carry on.
        print(" -> Warning: zero difference vector; stored without normalization.")
        normalized_vector = diff_vector
    else:
        normalized_vector = diff_vector / norm

    # 6. Store result
    demographic_vectors[label] = {
        "vector": normalized_vector,
        "magnitude": raw_magnitude
    }

    print(f" -> Done. Magnitude: {raw_magnitude:.4f}")

# Save results
# NOTE(review): the file name is spelled "demogaphic"; left unchanged because
# other code may load this exact path — rename both sides together if desired.
save_path = os.path.join(SAVE_DIR, "gss_demogaphic_vectors.pt")
torch.save(demographic_vectors, save_path)
print(f"\nAll vectors saved to {save_path}.")

In [None]:
from sklearn.decomposition import PCA
import numpy as np

def get_activations(text, model, tokenizer, layer_idx):
    """
    Runs text through model and captures the hidden state at the LAST token of the specified layer.

    Args:
        text: Input passed to the tokenizer; only the first batch row is read.
        model: Model exposing its decoder blocks as ``model.model.layers`` and
            its device as ``model.device``.
        tokenizer: Callable tokenizer, invoked with ``return_tensors="pt"``.
        layer_idx: Index into ``model.model.layers`` of the layer to capture.

    Returns:
        1-D CPU tensor of shape [hidden_dim].

    Raises:
        KeyError: if the layer's forward hook never fired during the forward pass.
    """

    inputs = tokenizer(text, return_tensors="pt").to(model.device)

    # Placeholder for the activation
    activation = {}

    def hook_fn(module, input, output):
        # The layer output is normally a tuple whose first item is the hidden
        # state (batch, seq_len, hidden_dim); a bare tensor is accepted too.
        hidden = output[0] if isinstance(output, (tuple, list)) else output
        # Grab the last token's activation of the first row: [0, -1, :]
        activation['hidden'] = hidden[0, -1, :].detach().cpu()

    # Register the hook to the specific layer
    handle = model.model.layers[layer_idx].register_forward_hook(hook_fn)

    try:
        # Run the model (forward pass)
        with torch.no_grad():
            model(**inputs)
    finally:
        # Remove the hook even if the forward pass raises, so it doesn't mess
        # up future runs
        handle.remove()

    return activation['hidden']


def extract_steering_vector(positive_prompts, negative_prompts, model, tokenizer, layer_idx):
    """
    Calculates the direction: Mean(Pos)-Mean(Neg), scaled to unit length.
    (PCA will be applied for large data, but here I just compute the mean difference)

    Args:
        positive_prompts: Iterable of prompts for the positive side.
        negative_prompts: Iterable of prompts for the negative side.
        model, tokenizer, layer_idx: Forwarded unchanged to ``get_activations``.

    Returns:
        Unit-length tensor: the normalized difference of the mean activations.

    Raises:
        ValueError: if either prompt collection is empty, or if both sides have
            the same mean activation (the direction is then undefined).
    """
    positive_prompts = list(positive_prompts)
    negative_prompts = list(negative_prompts)
    # Fail early with a clear message instead of an opaque torch.stack error.
    if not positive_prompts or not negative_prompts:
        raise ValueError(
            "extract_steering_vector needs at least one positive and one negative prompt"
        )

    pos_acts = [get_activations(p, model, tokenizer, layer_idx) for p in positive_prompts]
    neg_acts = [get_activations(n, model, tokenizer, layer_idx) for n in negative_prompts]

    # Stack into tensors
    pos_tensor = torch.stack(pos_acts)
    neg_tensor = torch.stack(neg_acts)

    # Simple difference of means
    direction = torch.mean(pos_tensor, dim=0) - torch.mean(neg_tensor, dim=0)

    # Normalize the vector (Unit length) to scale it manually later.
    # A zero norm would turn the division into an all-NaN vector, so reject it.
    norm = torch.norm(direction)
    if norm.item() == 0:
        raise ValueError(
            "positive and negative mean activations are identical; steering direction is undefined"
        )

    return direction / norm

In [None]:
# Construct datasets for each target

# Political party
# Prompts for the Republican persona (currently a single entry).
# NOTE(review): "[INST]" looks like a Llama-2-style instruction tag and is never
# closed here, while the model named earlier in this notebook is a Llama-3
# Instruct checkpoint — confirm the intended prompt format.
republican_prompts = [
    "[INST] You are a Republican"

]