In [1]:
!pip install -U transformers
!pip install bitsandbytes



In [2]:
import torch
torch.cuda.is_available(), torch.cuda.get_device_name(0)

(True, 'NVIDIA L4')

In [3]:
# Load model directly
from transformers import AutoTokenizer, AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained(
    "mistralai/Mistral-7B-v0.1",
    load_in_4bit=True,
    device_map="auto"
)

tokenizer = AutoTokenizer.from_pretrained(
    "mistralai/Mistral-7B-v0.1",
    use_fast=False
)

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [4]:
print(model.hf_device_map)

{'': 0}


# 1. Now we begin with adding ~short as a token into the vocab

## 1.1: Strategy for initialization: use unrelated real word embedding

## ⭐ Why Hewitt et used a real word embedding (not mean or random)
**1. Keeps the new token inside the embedding manifold**

Real embeddings already lie in a coherent, meaningful region of space

Random vectors often lie on weird outlier directions

Mean embeddings create synthetic but unrealistic centers

**2. Avoids bias toward target semantics**

If training for “short answers,” avoid initializing near “short.”

So they use a random unrelated adjective like “accurate”

**3. Speeds up convergence**

Starting in natural space reduces training instability

But avoids pre-biasing toward specific concept

**4. Ensures generalization for self-verbalization**

A real word embedding has rich semantic connections

This may be important for the “machine-only synonyms” phenomenon

## ⭐ So the actual correct method (Hewitt et al.) is:
**Step 1 — Choose a real word embedding not related to the target concept**

e.g., “accurate”, “single”, “object”, “standard”, “general”

**Step 2 — Duplicate that embedding into your new neologism token**

(no random noise needed, though you can add very slight noise if desired)

**Step 3 — Train ONLY the new token embedding**

(all other parameters frozen)

In [5]:
token = "accurate"
token_id = tokenizer.convert_tokens_to_ids(token)

print("Token:", token)
print("Token ID:", token_id)

if token_id == tokenizer.unk_token_id:
    print("❌ 'accurate' is NOT in the vocab!")
else:
    print("✔️ 'accurate' IS in the vocab.")

Token: accurate
Token ID: 0
❌ 'accurate' is NOT in the vocab!


Uh oh, we can't use exactly this because vocab doesn't have accurate

---



In [6]:
token = "general"
token_id = tokenizer.convert_tokens_to_ids(token)

print("Token:", token)
print("Token ID:", token_id)

if token_id == tokenizer.unk_token_id:
    print("❌ 'general' is NOT in the vocab!")
else:
    print("✔️ 'general' IS in the vocab.")

Token: general
Token ID: 18264
✔️ 'general' IS in the vocab.


In [7]:
import torch
embedding_layer = model.get_input_embeddings()
general_vec = embedding_layer.weight[token_id]

print("First 20 dims of 'general' embedding:")
print(general_vec[:20])

print("Vector stats:")
print(" Mean:", general_vec.mean().item())
print(" Std :", general_vec.std().item())
print(" Norm:", torch.norm(general_vec).item())

First 20 dims of 'general' embedding:
tensor([-1.3504e-03,  9.3460e-04, -1.2665e-03,  1.6556e-03,  2.9602e-03,
         3.4180e-03, -3.4943e-03, -3.4485e-03,  6.4087e-03, -1.2436e-03,
        -6.1646e-03,  2.3651e-03, -4.3869e-04, -5.7983e-04,  3.7193e-05,
        -1.8311e-03,  7.3242e-04,  6.9809e-04,  4.4441e-04,  3.7689e-03],
       device='cuda:0', dtype=torch.float16, grad_fn=<SliceBackward0>)
Vector stats:
 Mean: -5.2869319915771484e-05
 Std : 0.002655029296875
 Norm: 0.169921875


As a stopgap, we'll proceed with *general* for now.

## 1.2: Add ~short and initialize it with "general"'s embedding



In [8]:
new_token = "~short"

if new_token not in tokenizer.get_vocab():
    tokenizer.add_tokens([new_token])
    print(f"Added new token: {new_token}")
else:
    print("Token '~short' already exists.")

# Save new tokenizer
tokenizer.save_pretrained("my_tokenizer")
print(len(tokenizer))

# Get ID for the new token
new_id = tokenizer.convert_tokens_to_ids(new_token)
print("New token ID:", new_id)

Added new token: ~short
32001
New token ID: 32000


In [9]:
#Reinitialize tokenizer and model with new setup
tokenizer = AutoTokenizer.from_pretrained("my_tokenizer", use_fast=False)

# Resize embedding matrix and lm_head to match new vocab size
model.resize_token_embeddings(len(tokenizer))
  #expand model.model.embed_tokens
  #expand model.lm_head (if it exists and matches embed size)
  #initialize the new lm_head row with zeros (HF default)


The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`
The new lm_head weights will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`


Embedding(32001, 4096)

In [10]:
with torch.no_grad():
    embedding_layer.weight[new_id] = general_vec.clone()

print("✔️ Initialized '~short' with embedding of 'general'.")

✔️ Initialized '~short' with embedding of 'general'.


In [11]:
new_vec = embedding_layer.weight[new_id]

print("First 20 dims of '~short' embedding:")
print(new_vec[:20])

# Compare difference
diff = torch.norm(new_vec - general_vec).item()
print("Difference from 'general' embedding:", diff)

First 20 dims of '~short' embedding:
tensor([-1.3504e-03,  9.3460e-04, -1.2665e-03,  1.6556e-03,  2.9602e-03,
         3.4180e-03, -3.4943e-03, -3.4485e-03,  6.4087e-03, -1.2436e-03,
        -6.1646e-03,  2.3651e-03, -4.3869e-04, -5.7983e-04,  3.7193e-05,
        -1.8311e-03,  7.3242e-04,  6.9809e-04,  4.4441e-04,  3.7689e-03],
       device='cuda:0', dtype=torch.float16, grad_fn=<SliceBackward0>)
Difference from 'general' embedding: 0.0


# 2. Now we train our neologism on our sample data of short responses

## 2.1 Load the data

In [12]:
#TODO: get proper training data (100 -> 1000 examples)

import json

examples = []
with open("first_100.jsonl", "r") as f:
    for line in f:
        if line.strip():  # skip blank lines
            examples.append(json.loads(line))

print(len(examples))      # should be 100
print(examples[0])

100
{'prompt': 'Can brain cells move? By movement I mean long distance migration (preferably within the brain only). Give me a ~short answer.', 'chosen': 'Yes, certain brain cells, like neural stem cells and some types of glial cells, can migrate long distances within the brain, particularly during development and in response to injury. However, most mature neurons are largely static in their locations.', 'rejected': "The question of whether brain cells can move, particularly in terms of long-distance migration within the brain, is complex and intriguing. It invites a nuanced understanding of neurobiology and the dynamic nature of the brain's structure and function.\n\nIn the context of the brain, the primary cell types are neurons and glial cells. Neurons are the primary signaling cells responsible for transmitting information through electrical and chemical signals, while glial cells support and protect neurons. The migration of these cells is crucial during the early stages of brain

## 2.2 Verify that previous step worked by checking tokenizer and embedding matrix dimensions


In [13]:
print("Tokenizer vocab size:", len(tokenizer))

print("~short Token ID:", tokenizer.convert_tokens_to_ids("~short"))

Tokenizer vocab size: 32001
~short Token ID: 32000


In [14]:
emb = model.get_input_embeddings()
print("Embedding weight shape:", emb.weight.shape)

assert emb.weight.shape[0] == len(tokenizer), "❌ Vocab and embeddings mismatch!"
print("✔ Embedding matrix size matches vocab size.")

Embedding weight shape: torch.Size([32001, 4096])
✔ Embedding matrix size matches vocab size.


In [15]:
tid = tokenizer.convert_tokens_to_ids("~short")
print("New token embedding vector:", emb.weight[tid][:10])
print("Embedding norm:", emb.weight[tid].norm().item())

New token embedding vector: tensor([-0.0014,  0.0009, -0.0013,  0.0017,  0.0030,  0.0034, -0.0035, -0.0034,
         0.0064, -0.0012], device='cuda:0', dtype=torch.float16,
       grad_fn=<SliceBackward0>)
Embedding norm: 0.169921875


## 2.2 Before begin training, freeze all model weights and unfreeze only 32001th entry, ~short

In [16]:

# Freeze all other weights, unfreeze row with new token id
embed_weight = model.get_input_embeddings().weight
lm_head_weight = model.get_output_embeddings().weight

print("Embedding shape:", embed_weight.shape)
print("LM head shape:", lm_head_weight.shape)

# 1. freeze everything first
for name, param in model.named_parameters():
    param.requires_grad = False

# 2. Enable gradient only for the embedding + head matrices
embed_weight.requires_grad = True
lm_head_weight.requires_grad = True

Embedding shape: torch.Size([32001, 4096])
LM head shape: torch.Size([32001, 4096])


In [17]:
# 3. Apply masks for all other weigths other than those that correspond to ~short

def mask_embedding_grad(grad):
    mask = torch.zeros_like(grad)    # [vocab, hidden]
    mask[new_id] = 1.0               # Only this row receives grad
    return grad * mask

embed_weight.register_hook(mask_embedding_grad)

def mask_lm_head_grad(grad):
    mask = torch.zeros_like(grad)
    mask[new_id] = 1.0
    return grad * mask

lm_head_weight.register_hook(mask_lm_head_grad)

<torch.utils.hooks.RemovableHandle at 0x7bb397b7c980>

## 2.3 Define our optimizer and start training!

In [18]:
optimizer = torch.optim.AdamW(
    [embed_weight, lm_head_weight],
    lr=1e-3,
    weight_decay=0.0
)

In [19]:
def compute_log_prob(prompt, response):
    """
    Compute log p(response | prompt) for decoder-only models
    by concatenating prompt+response and masking prompt tokens.
    """

    # Tokenize prompt
    enc_prompt = tokenizer(prompt, return_tensors="pt").to("cuda")
    prompt_ids = enc_prompt["input_ids"]

    # Tokenize response
    enc_resp = tokenizer(response, return_tensors="pt").to("cuda")
    resp_ids = enc_resp["input_ids"]

    # Concatenate
    input_ids = torch.cat([prompt_ids, resp_ids], dim=1)

    # Build labels: mask prompt => -100, output response normally
    labels = torch.cat(
        [
            torch.full_like(prompt_ids, -100),
            resp_ids
        ],
        dim=1
    )

    # Forward pass
    outputs = model(input_ids=input_ids, labels=labels)

    # Compute log p(response | prompt)
    # outputs.loss = NLL averaged over non-masked tokens
    nll = outputs.loss
    num_response_tokens = resp_ids.numel()

    log_prob = -nll * num_response_tokens
    return log_prob

In [20]:
beta = 1.0   # DeepMind used β = 1 based on ablations

def apo_up_loss(prompt, chosen, rejected):
    log_pc = compute_log_prob(prompt, chosen)
    log_pr = compute_log_prob(prompt, rejected)

    # LLR = log p(c) - log p(r)
    llr = log_pc - log_pr

    # First APO-up term
    t1 = -torch.log(torch.sigmoid(beta * llr))

    # Second APO-up term (absolute chosen likelihood)
    t2 = -torch.log(torch.sigmoid(beta * log_pc))

    return t1 + t2

In [21]:
model.model.embed_tokens.num_embeddings = 32001
print("tokenizer size:", len(tokenizer))
print("model config vocab size:", model.config.vocab_size)
print("tokenizer vocab size:", tokenizer.vocab_size )
print("embed:", model.model.embed_tokens.weight.shape)
print("lm_head:", model.lm_head.weight.shape)

tokenizer size: 32001
model config vocab size: 32001
tokenizer vocab size: 32000
embed: torch.Size([32001, 4096])
lm_head: torch.Size([32001, 4096])


In [23]:
num_epochs = 5

model.train()

for epoch in range(num_epochs):
    total_loss = 0

    for ex in examples:   # list of 100 examples
        prompt = ex["prompt"]
        chosen = ex["chosen"]
        rejected = ex["rejected"]

        optimizer.zero_grad()

        loss = apo_up_loss(prompt, chosen, rejected)
        loss.backward()

        # gradient hooks mask everything except new row
        optimizer.step()

        total_loss += loss.item()

    print(f"Epoch {epoch+1} | Loss: {total_loss:.4f}")

RuntimeError: Function EmbeddingBackward0 returned an invalid gradient at index 0 - got [32001, 4096] but expected shape compatible with [32000, 4096]