<a href="https://colab.research.google.com/github/yilmajung/belief_and_llms_v0/blob/main/1_extract_persona_vectors.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install -q -U bitsandbytes

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.4/59.4 MB[0m [31m37.3 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
import torch
import numpy as np
import os

In [3]:
# Attach Google Drive to this Colab runtime so files under
# /content/drive/MyDrive can be read.
from google.colab import drive

DRIVE_MOUNT_POINT = "/content/drive"
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [4]:
# Load the prompt pairs used for vector extraction.
# Presumed layout, inferred from how the extraction loop indexes it (verify
# against the dataset builder): {group_label: [[positive_text, negative_text], ...]}
import json

# Explicit UTF-8 so decoding does not depend on the runtime's locale default.
with open(
    "/content/drive/MyDrive/belief_and_llms_v0/gss_extraction_datasets.json",
    "r",
    encoding="utf-8",
) as f:
    extraction_datasets = json.load(f)

In [5]:
# Display the group labels present in the dataset; each key is later used as
# the label under which that group's vector is stored.
extraction_datasets.keys()

dict_keys(['Race_person of different race than Black or White', 'Race_White person', 'Race_Black person', 'PartyID_Independent leaning Republican', 'PartyID_Strong Democrat', 'PartyID_Democrat', 'PartyID_Political Independent', 'PartyID_Strong Republican', 'PartyID_Independent leaning Democrat', 'PartyID_Other', 'PartyID_Republican', 'Sex_person', 'Degree_high school graduate', 'Degree_person with less than high school education', "Degree_person with a bachelor's degree", 'Degree_person with some college education', 'Degree_person with a graduate degree', 'Religion_Catholic', 'Religion_Protestant', 'Religion_person of other religion', 'Religion_Jewish', 'PolViews_person with a slightly liberal political view', 'PolViews_person with a conservative political view', 'PolViews_person with a liberal political view', 'PolViews_person with a neutral political view', 'PolViews_person with a slightly conservative political view', 'PolViews_person with an extremely liberal political view', 'PolV

In [21]:
# Model Setup
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

model_name = "meta-llama/Meta-Llama-3-8B-Instruct"

# Load the model 4-bit quantized to save memory.
# Quantization options are passed through `quantization_config`: handing
# `load_in_4bit=True` straight to `from_pretrained` is deprecated. Only
# `load_in_4bit` is set, so every other bitsandbytes option keeps its default,
# matching what the bare flag requested.
quantization_config = BitsAndBytesConfig(load_in_4bit=True)

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=quantization_config,
    device_map="auto",
    torch_dtype=torch.float16,
)

tokenizer = AutoTokenizer.from_pretrained(model_name)
# Use the EOS token as the padding token.
tokenizer.pad_token = tokenizer.eos_token

# Index into `model.model.layers` of the decoder layer whose output is captured.
TARGET_LAYER = 15

# NOTE(review): relative path, resolved against the current working directory.
# Presumably this is the Colab runtime's local disk rather than the mounted
# Drive -- confirm the saved file is copied somewhere persistent.
SAVE_DIR = "vectors"
os.makedirs(SAVE_DIR, exist_ok=True)

# Dictionary to store the final vectors in RAM.
# Structure: {'Race_Black person': {"vector": <unit-norm 1-D tensor>,
#                                   "magnitude": <float, norm before normalization>}, ...}
demographic_vectors = {}

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [22]:
# Helper: Hook Function

def get_layer_activations(model, tokenizer, inputs_text, layer_idx):
    """
    Return the last-token output of one decoder layer for a single prompt.

    Tokenizes `inputs_text`, runs one forward pass under `torch.no_grad()`, and
    captures the output of `model.model.layers[layer_idx]` through a temporary
    forward hook. The hook is always removed, even if the forward pass raises.

    Args:
        model: Object exposing `.device` and `.model.layers`, callable with the
            tokenizer output unpacked as keyword arguments.
        tokenizer: Callable as `tokenizer(text, return_tensors="pt")`; its
            result must support `.to(device)` and `**` unpacking.
        inputs_text: Prompt to encode. Only batch element 0 is read, so pass a
            single string.
        layer_idx: Index into `model.model.layers`.

    Returns:
        Detached 1-D CPU tensor: the hooked layer's output at the last
        sequence position.

    Raises:
        ValueError: If the layer output is neither 2-D nor 3-D.
        RuntimeError: If the forward pass completes without the hook firing.
    """
    inputs = tokenizer(inputs_text, return_tensors="pt").to(model.device)
    captured_hidden = None

    def hook_fn(module, input, output):
        nonlocal captured_hidden

        # A layer may return a tuple whose first element is the hidden states
        # (e.g. alongside a cache); otherwise the output is the tensor itself.
        h_states = output[0] if isinstance(output, tuple) else output

        if h_states.dim() == 3:
            # [Batch, Seq, Hidden]: batch 0, last token, all features.
            captured_hidden = h_states[0, -1, :].detach().cpu()
        elif h_states.dim() == 2:
            # [Seq, Hidden] (batch dim missing): last token, all features.
            captured_hidden = h_states[-1, :].detach().cpu()
        else:
            raise ValueError(f"Unexpected tensor shape: {h_states.shape}")

    layer = model.model.layers[layer_idx]
    handle = layer.register_forward_hook(hook_fn)
    try:
        with torch.no_grad():
            model(**inputs)
    finally:
        # Without this, a failed forward pass would leave the hook attached and
        # every later call on this layer would stack another one on top.
        handle.remove()

    if captured_hidden is None:
        raise RuntimeError(
            f"Forward hook on layer {layer_idx} never fired; no activation captured."
        )
    return captured_hidden

In [23]:
# Extraction Loop (main)
# For every group: capture the last-token activation at TARGET_LAYER for each
# positive and negative prompt, take mean(positive) - mean(negative), and store
# the unit-normalized direction together with its pre-normalization norm.
print(f"Starting extraction on Layer {TARGET_LAYER}...")

for label, pairs in extraction_datasets.items():
    print(f"Processing: {label}...")

    if not pairs:
        # torch.vstack rejects an empty list; a group without prompts has no
        # direction to extract, so it gets no entry in demographic_vectors.
        print(" -> Skipped (no prompt pairs).")
        continue

    # Each pair is indexed as (positive_text, negative_text).
    # 1. Activations for X+
    pos_acts = [
        get_layer_activations(model, tokenizer, pair[0], TARGET_LAYER)
        for pair in pairs
    ]
    # 2. Activations for X-
    neg_acts = [
        get_layer_activations(model, tokenizer, pair[1], TARGET_LAYER)
        for pair in pairs
    ]

    # 3. Stack into [num_prompts, hidden_dim]
    pos_tensor = torch.vstack(pos_acts)
    neg_tensor = torch.vstack(neg_acts)

    # 4. Difference of means: mean(Pos) - mean(Neg)
    diff_vector = torch.mean(pos_tensor, dim=0) - torch.mean(neg_tensor, dim=0)

    # 5. Normalize to unit length
    diff_norm = torch.norm(diff_vector)
    raw_magnitude = diff_norm.item()
    if raw_magnitude == 0:
        # Both means coincide. Dividing by the zero norm would store an
        # all-NaN vector that silently poisons every later similarity, so
        # keep an explicit zero vector and flag the group instead.
        normalized_vector = torch.zeros_like(diff_vector)
        print(" -> Warning: zero difference vector; storing zeros (no direction).")
    else:
        normalized_vector = diff_vector / diff_norm

    # 6. Store result
    demographic_vectors[label] = {
        "vector": normalized_vector,
        "magnitude": raw_magnitude
    }

    print(f" -> Done. Magnitude: {raw_magnitude:.4f}")

# Save results
# NOTE: the file name is misspelled ("demogaphic"); it is left unchanged here
# because other code may load the file by this exact name.
save_path = os.path.join(SAVE_DIR, "gss_demogaphic_vectors.pt")
torch.save(demographic_vectors, save_path)
print(f"\nAll vectors saved to {save_path}.")

Starting extraction on Layer 15...
Processing: Race_person of different race than Black or White...
 -> Done. Magnitude: 2.2480
Processing: Race_White person...
 -> Done. Magnitude: 1.7090
Processing: Race_Black person...
 -> Done. Magnitude: 2.1680
Processing: PartyID_Independent leaning Republican...
 -> Done. Magnitude: 2.3145
Processing: PartyID_Strong Democrat...
 -> Done. Magnitude: 2.5156
Processing: PartyID_Democrat...
 -> Done. Magnitude: 2.1836
Processing: PartyID_Political Independent...
 -> Done. Magnitude: 2.3105
Processing: PartyID_Strong Republican...
 -> Done. Magnitude: 2.7695
Processing: PartyID_Independent leaning Democrat...
 -> Done. Magnitude: 2.1855
Processing: PartyID_Other...
 -> Done. Magnitude: 0.9360
Processing: PartyID_Republican...
 -> Done. Magnitude: 2.5371
Processing: Sex_person...
 -> Done. Magnitude: 0.0000
Processing: Degree_high school graduate...
 -> Done. Magnitude: 1.1113
Processing: Degree_person with less than high school education...
 -> Done.

In [25]:
# Verification
import torch.nn.functional as F

def check_similarity(label_a, label_b):
    """
    Cosine similarity between two stored group vectors.

    Args:
        label_a: Key of the first entry in `demographic_vectors`.
        label_b: Key of the second entry in `demographic_vectors`.

    Returns:
        float: Cosine similarity of the two entries' "vector" tensors. If
        either label is missing, a message naming the missing label(s) is
        printed and NaN is returned, so callers that format the result with
        `:.4f` print "nan" instead of raising TypeError on None.
    """
    missing = [
        label for label in (label_a, label_b) if label not in demographic_vectors
    ]
    if missing:
        print(f"Labels not found: {missing}")
        return float("nan")

    # Compare in float32 so the dot product and norms are not rounded to the
    # precision of whatever dtype the vectors were stored in.
    vec_a = demographic_vectors[label_a]["vector"].float()
    vec_b = demographic_vectors[label_b]["vector"].float()

    # cosine_similarity reduces over dim=1, hence the added batch dimension.
    sim = F.cosine_similarity(vec_a.unsqueeze(0), vec_b.unsqueeze(0))
    return sim.item()

print("\n--- SANITY CHECK ---")

# Check 1 - polarization: the two strongest opposing party identities are
# expected to give a negative cosine similarity.
dem_label, rep_label = "PartyID_Strong Democrat", "PartyID_Strong Republican"
sim_pol = check_similarity(dem_label, rep_label)
print(f"Democrat vs Republican Similarity: {sim_pol:.4f} (Expect Negative)")

# Check 2 - unrelated traits: a race vector against a party vector is expected
# to give a similarity near zero.
race_label = "Race_Black person"
sim_rand = check_similarity(race_label, rep_label)
print(f"Black vs Republican Similarity:    {sim_rand:.4f} (Expect Near Zero)")


--- SANITY CHECK ---
Democrat vs Republican Similarity: 0.8979 (Expect Negative)
Black vs Republican Similarity:    0.5493 (Expect Near Zero)
