In [None]:
# no need to restart kernel after code changes (useful to separate code to modules)
%load_ext autoreload
%autoreload 2

In [None]:
import gc

from time import time
from tqdm import tqdm
from datasets import load_dataset

import sys
sys.path.append('..')

In [None]:
import random
import numpy as np

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

from utils.general import compute_last_token_embedding_grad_emb, get_whole

In [None]:
def set_seed(seed: int = 8):
    """Seed every random number generator used in this notebook.

    Seeds Python's ``random``, NumPy and PyTorch (plus the CUDA generators
    when CUDA is available) with the same value, then switches cuDNN to
    deterministic mode with benchmarking disabled.

    Args:
        seed: Value passed to every generator.
    """
    # Host-side generators all take the same seed.
    for seeder in (random.seed, np.random.seed, torch.manual_seed):
        seeder(seed)

    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)  # covers the multi-GPU case

    # Deterministic cuDNN kernels (may impact performance).
    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False


## Torch Optimizers

In [None]:
def clean():
    """Run the garbage collector, then ask PyTorch to empty its CUDA cache."""
    gc.collect()
    torch.cuda.empty_cache()

# Identifier of the checkpoint passed to `from_pretrained` below.
model_id = "roneneldan/TinyStories-1M"
device = 'cuda' if torch.cuda.is_available() else 'cpu'
# NOTE(review): this unconditionally overrides the auto-detected device above,
# so the model is always placed on CPU — confirm this is intended.
device = "cpu"
load_in_8bit = False

# Drop a model left over from a previous run of this cell before loading a new
# one; on a fresh kernel `model` does not exist yet and the NameError is ignored.
try:
    del model
    clean()
except NameError:
    pass 

tokenizer = AutoTokenizer.from_pretrained(model_id)
# NOTE(review): float16 weights are requested while `device` is "cpu" —
# verify that every op used downstream supports half precision on CPU.
model = AutoModelForCausalLM.from_pretrained(model_id,
                                               torch_dtype=torch.float16,
                                               device_map=device,
                                               load_in_8bit=load_in_8bit,
                                               trust_remote_code=True)
# Reuse the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token

### Gradient based on projection

In [None]:
def find_token(
    token_idx,
    embedding_matrix,
    discovered_embeddings, discovered_ids,
    llm, layer_idx, h_target,
    optimizer_cls, lr
):
    """Recover one prompt token with a projected, gradient-guided vocabulary search.

    Starting from a random vocabulary row, each iteration:

    1. appends the current candidate embedding to ``discovered_embeddings`` and
       asks ``compute_last_token_embedding_grad_emb`` for a gradient and a loss
       against ``h_target[token_idx]``;
    2. stops if ``loss < 1e-5`` or the gradient norm is below ``1e-12``;
    3. otherwise lets the optimizer move a continuous embedding along that
       gradient, marks the current token as visited, and projects the
       continuous embedding onto the nearest not-yet-visited vocabulary row.

    At most one candidate per vocabulary row is tried.

    Progress reporting decodes with the notebook-level ``tokenizer``.

    Args:
        token_idx: Position of the token being recovered (row of ``h_target``).
        embedding_matrix: Vocabulary embedding matrix, one row per token id.
            It is not modified.
        discovered_embeddings: List of embeddings already recovered for the
            previous positions.
        discovered_ids: List of token ids already recovered (display only).
        llm: Model forwarded to ``compute_last_token_embedding_grad_emb``.
        layer_idx: Layer index forwarded to the same helper.
        h_target: Target hidden states, indexed by ``token_idx``.
        optimizer_cls: ``torch.optim`` optimizer class. Optimizers that require
            a closure in ``step`` (e.g. ``LBFGS``) are supported.
        lr: Learning rate handed to the optimizer.

    Returns:
        ``(token_id, embedding)``: the last candidate token id and a detached
        copy of its row in ``embedding_matrix``.
    """
    # Private copy of the vocabulary: visited rows are overwritten with +inf
    # below so they can never be selected again.
    copy_embedding_matrix = embedding_matrix.clone().detach().requires_grad_(False)

    token_id = torch.randint(0, embedding_matrix.size(0), (1,)).item()

    # `embedding` is the continuous variable moved by the optimizer;
    # `temp_embedding` is the actual vocabulary row evaluated by the model.
    embedding = copy_embedding_matrix[token_id].clone().requires_grad_(True)
    temp_embedding = copy_embedding_matrix[token_id].clone().detach()

    optimizer = optimizer_cls([embedding], lr=lr)

    bar = tqdm(
        range(embedding_matrix.size(0)),
        desc=f'Token [{token_idx + 1:2d}/{h_target.size(0):2d}]'
    )

    for _ in bar:
        input_embeddings = torch.stack(
            discovered_embeddings + [temp_embedding]
        ).unsqueeze(0)

        grad_oracle, loss = compute_last_token_embedding_grad_emb(
            embeddings=input_embeddings,
            llm=llm,
            layer_idx=layer_idx,
            h_target=h_target[token_idx],
        )

        grad_norm = grad_oracle.norm().item()
        print(f"Token ID: {token_id} - Grad norm: {grad_norm:.2e} - Loss: {loss:.2e}")
        string_so_far = tokenizer.decode(discovered_ids + [token_id], skip_special_tokens=True)
        bar.set_postfix_str(f"Loss: {loss:.2e} - Gradient norm: {grad_norm:.2e} - String: {string_so_far}")

        if loss < 1e-5 or grad_norm < 1e-12:
            break

        def closure():
            # Every torch optimizer accepts a closure in `step`; LBFGS requires
            # one. The gradient comes from the oracle evaluated at the current
            # vocabulary row, so re-evaluations return the same gradient/loss.
            embedding.grad = grad_oracle
            return loss

        optimizer.step(closure)

        # The projection is pure bookkeeping: no autograd graph is needed.
        with torch.no_grad():
            copy_embedding_matrix[token_id] = float('inf')
            distances = torch.norm(copy_embedding_matrix - embedding, dim=1)
            token_id = int(torch.argmin(distances))
            temp_embedding = copy_embedding_matrix[token_id].clone()

    # Return a fresh copy taken from the untouched matrix: this lets the
    # vocabulary-sized working copy be freed and never yields a masked (+inf)
    # row, even if every candidate has been visited.
    return token_id, embedding_matrix[token_id].detach().clone()


def find_prompt(
    llm, layer_idx, h_target,
    optimizer_cls, lr,
):
    """Recover a prompt token by token from its target hidden states.

    Calls ``find_token`` once per row of ``h_target``, feeding each call the
    embeddings and ids recovered so far, and measures the total wall-clock
    time of that loop. The recovered ids are decoded with the notebook-level
    ``tokenizer``.

    Args:
        llm: Model under attack; its input embedding matrix is the search space.
        layer_idx: Layer index forwarded to ``find_token``.
        h_target: Target hidden states. A 1-D tensor is treated as a single
            position and gets a leading dimension added.
        optimizer_cls: Optimizer class forwarded to ``find_token``.
        lr: Learning rate forwarded to ``find_token``.

    Returns:
        ``(elapsed_seconds, decoded_string)``.
    """
    # Take the vocabulary from the model that was passed in rather than from a
    # notebook global, so the function works for any `llm`.
    embedding_matrix = llm.get_input_embeddings().weight

    if h_target.dim() == 1:
        h_target = h_target.unsqueeze(0)

    discovered_embeddings = []
    discovered_ids = []

    start_time = time()
    for i in range(h_target.size(0)):
        next_token_id, next_token_embedding = find_token(
            i, embedding_matrix,
            discovered_embeddings, discovered_ids,
            llm, layer_idx, h_target,
            optimizer_cls, lr
        )

        discovered_embeddings.append(next_token_embedding)
        discovered_ids.append(next_token_id)

    end_time = time()

    final_string = tokenizer.decode(discovered_ids, skip_special_tokens=True)

    return end_time - start_time, final_string

# from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
def inversion_attack(
    prompt, llm, layer_idx,
    optimizer_cls, lr,
    seed=8
):
    """Run one inversion attack on ``prompt`` and print the outcome.

    Seeds the RNGs, obtains the target hidden states for ``prompt`` via
    ``get_whole`` (using the notebook-level ``tokenizer``), reconstructs a
    prompt from them with ``find_prompt`` and prints the original prompt, the
    reconstruction and the elapsed time.

    Args:
        prompt: Text whose hidden states are inverted.
        llm: Model used both to produce the targets and to run the attack.
        layer_idx: Layer index forwarded to ``get_whole`` and ``find_prompt``.
        optimizer_cls: Optimizer class forwarded to ``find_prompt``.
        lr: Learning rate forwarded to ``find_prompt``.
        seed: Seed passed to ``set_seed`` before anything else runs.
    """
    set_seed(seed)
    # Build the targets with the same model that is attacked (`llm`), not with
    # whatever notebook global happens to be called `model`.
    h_target = get_whole(prompt, llm, tokenizer, layer_idx)

    inversion_time, predicted_prompt = find_prompt(
        llm, layer_idx, h_target,
        optimizer_cls, lr
    )

    print(f'Orignial prompt : {prompt}')
    print(f'Predicted prompt: {predicted_prompt}')
    print(f'Invertion time  : {inversion_time:.2f} seconds')

# Smoke test on a deliberately noisy prompt.
# The backslash in the prompt is written as `\\` so the literal contains no
# invalid escape sequence; the runtime string is unchanged.
inversion_attack(
    # prompt=meaningful_df['50'][0],
    prompt='12autoZeinai ena~~ !poli, a1212kiro pr33-=ompt tao op"\\oio ;::/>elpizo na d1212isko1212leyt5646ei na ma77ntepsei to montelo',
    llm=model, layer_idx=7,
    # Alternatives tried: optimizer_cls=torch.optim.SGD, lr=1e-0 / layer_idx=8
    optimizer_cls=torch.optim.AdamW, lr=1e-1
)

    

# Dataset 

In [None]:
# Load the "train" split of the "bookcorpus" dataset; the bare name on the
# last line makes the notebook display the loaded object.
dataset = load_dataset("bookcorpus", split="train")
dataset

In [None]:
# Print the "text" field of the first five rows as a quick sanity check.
for i in range(5):
    sentence = dataset[i]["text"]
    print(f"{i= } ")
    print(f"Sentence: {sentence}")

In [None]:
# Draw a reproducible random sample of N sentences from the corpus.
seed = 42
set_seed(seed)
N = 100  # number of sentences to process
# Shuffle with `seed` (instead of a second hard-coded 42, which would silently
# drift if `seed` is changed) and keep the first N rows.
sentences = dataset.shuffle(seed=seed).select(range(N))['text']

In [None]:
import random
from datasets import Dataset
from typing import List, Dict
import numpy as np

# Take the first 1,000,000 rows of the corpus and keep only their "text" column.
# NOTE(review): slicing presumably materialises these rows in memory as a
# dict of columns — confirm this fits in RAM on the target machine.
subdataset = dataset[:1000000]
text_list = subdataset["text"]

# Wrap the column in a dict keyed by its name
text_dict = {"text": text_list}

# Rebuild a Dataset that contains only that column
from datasets import Dataset
dataset_text_only = Dataset.from_dict(text_dict)

def filter_by_token_length(
    tokenizer,
    token_lengths: List[int],
    dataset: Dataset,
    length: int,
    seed: int = 42
) -> Dict[int, Dataset]:
    """Sample, for each requested token length, `length` texts of that length.

    Every row of `dataset` is tokenized (special tokens included, no
    truncation) and annotated with a "token_length" column. For each entry of
    `token_lengths` the rows with exactly that many tokens are selected,
    shuffled with `seed` and cut down to the first `length` rows.

    Args:
        tokenizer: Callable returning a mapping with an "input_ids" entry.
        token_lengths: Token counts to build subsets for.
        dataset: Dataset with a "text" column.
        length: Number of rows wanted per token count.
        seed: Seed for `set_seed` and for the per-subset shuffle.

    Returns:
        Mapping from token count to its subset. Token counts with fewer than
        `length` matching rows are left out of the mapping.
    """
    set_seed(seed)

    def tokenize_length(example):
        # Count the ids produced for this row's text.
        encoded = tokenizer(example["text"], truncation=False, add_special_tokens=True)
        return {"token_length": len(encoded["input_ids"])}

    dataset_with_lengths = dataset.map(tokenize_length, desc="Tokenizing and calculating lengths")

    result_datasets = {}
    for wanted in token_lengths:
        matches = dataset_with_lengths.filter(
            lambda row: row["token_length"] == wanted,
            desc=f"Filtering for token length {wanted}"
        )

        # Not enough rows of this length: skip it entirely.
        if len(matches) < length:
            continue

        result_datasets[wanted] = matches.shuffle(seed=seed).select(range(length))

    return result_datasets

lengths_tokens = [10, 20, 30, 40, 50]  # Example token lengths to filter by
# Build one 100-sentence subset per token length from the text-only dataset.
# (The argument previously referenced the misspelled, undefined name
# `lenghts_tokens`, which raised a NameError.)
result_dataset = filter_by_token_length(
    tokenizer,
    token_lengths=lengths_tokens,
    dataset=dataset_text_only,
    length=100,
    seed=seed
)


In [None]:
# Show the first five texts of every per-length subset.
# The loop variable is deliberately NOT called `dataset`: that name holds the
# loaded corpus, and rebinding it here would break re-running earlier cells.
for token_length, length_subset in result_dataset.items():
    print(f"Token Length: {token_length}")
    print(f"Dataset: {length_subset['text'][:5]}")
    print("\n")

In [None]:
import pandas as pd
# One column per token length; wrapping each text list in a Series lets
# columns of different lengths coexist in a single frame.
texts_by_length = {}
for length_key, length_subset in result_dataset.items():
    texts_by_length[length_key] = pd.Series(length_subset['text'])

result_dataset_df = pd.DataFrame(texts_by_length)
result_dataset_df.to_csv('../data/token_length_filtered_dataset.csv', index=False)

In [None]:
# Display the first rows of the per-length table that was just written to CSV.
result_dataset_df.head()

In [None]:


def random_sentence_of_size(
    tokenizer,
    n: int,
    seed: int = 42,
    max_attempts: int = 100
) -> str:
    """
    Generate a random string that tokenizes to exactly n tokens whenever possible.

    Joining n random vocabulary tokens does not guarantee that the resulting
    string tokenizes back to n tokens (adjacent pieces can merge or split), so
    every candidate is re-tokenized and re-sampled until its length matches.
    Length is measured the same way as in `filter_by_token_length`
    (`truncation=False, add_special_tokens=True`). The first candidate is
    sampled from the freshly seeded generator; if it already has n tokens no
    further sampling happens.

    Args:
        tokenizer:    a HuggingFace tokenizer.
        n:            the exact number of tokens you want.
        seed:         random seed for reproducibility.
        max_attempts: maximum number of candidates to sample before giving up.

    Returns:
        A string which tokenizes to length n. If no candidate reaches exactly
        n tokens within `max_attempts`, a warning is issued and the candidate
        whose token count is closest to n is returned.

    Raises:
        ValueError: if the tokenizer's vocab doesn't have any valid tokens.
    """
    import warnings

    # 1) seed RNG
    set_seed(seed)

    # 2) build a list of "real" tokens (no special tokens, no "##" pieces);
    #    the id -> token round trip keeps one token per id
    vocab = tokenizer.get_vocab()  # dict: token -> id
    id_to_token = {idx: tok for tok, idx in vocab.items()}
    special = set(tokenizer.all_special_tokens)

    valid_tokens = [
        tok for tok in id_to_token.values()
        if tok not in special and not tok.startswith("##")
    ]

    if not valid_tokens:
        raise ValueError("No valid tokens available in tokenizer vocab.")

    def token_count(text: str) -> int:
        encoded = tokenizer(text, truncation=False, add_special_tokens=True)
        return len(encoded["input_ids"])

    best_text, best_gap = None, None
    for _ in range(max(1, max_attempts)):
        # 3) sample n tokens (with replacement so n can be large)
        sampled = random.choices(valid_tokens, k=n)

        # 4) join/clean up via the tokenizer's decoder
        candidate = tokenizer.convert_tokens_to_string(sampled)

        # 5) keep the candidate only if it really has n tokens
        gap = abs(token_count(candidate) - n)
        if gap == 0:
            return candidate
        if best_gap is None or gap < best_gap:
            best_text, best_gap = candidate, gap

    warnings.warn(
        f"No random sentence with exactly {n} tokens found in {max_attempts} attempts; "
        f"returning one that is off by {best_gap} token(s)."
    )
    return best_text

def random_sentence_list(
    tokenizer,
    n: int,
    num_sentences: int = 10,
    seed: int = 42
) -> List[str]:
    """
    Build `num_sentences` random strings by calling `random_sentence_of_size`.

    The i-th string (0-based) is generated with seed `seed + i`, so the list
    is reproducible and its entries use consecutive seeds.

    Args:
        tokenizer:       a HuggingFace tokenizer.
        n:               requested number of tokens per string.
        num_sentences:   how many strings to generate.
        seed:            seed used for the first string.

    Returns:
        The generated strings, in seed order.
    """
    sentences = []
    for offset in range(num_sentences):
        sentences.append(random_sentence_of_size(tokenizer, n, seed + offset))
    return sentences

In [None]:
list_random_sentence = random_sentence_list(
    tokenizer,
    n=10,
    num_sentences=100,
    seed=seed
)

# Build the random counterpart of the token-length table: one column of random
# sentences per entry of `lengths_tokens`, saved next to the other CSV.
NUM_RANDOM_SENTENCES = 100

# `random_sentence_list` uses the consecutive seeds base, base+1, ..., so the
# bases are spaced by more than NUM_RANDOM_SENTENCES. With overlapping seed
# ranges (e.g. base = seed + length) two columns would reuse the same seed,
# and the shorter sentence would be sampled as a prefix of the longer one.
dict_random = {
    f"random_{length}": random_sentence_list(
        tokenizer,
        n=length,
        num_sentences=NUM_RANDOM_SENTENCES,
        seed=seed + length * NUM_RANDOM_SENTENCES
    ) for length in lengths_tokens
}
result_random_dataset_df = pd.DataFrame(dict([(k, pd.Series(v)) for k, v in dict_random.items()]))
result_random_dataset_df.to_csv('../data/random_sentences_dataset.csv', index=False)
result_random_dataset_df.head()

# Experiment 8

<font color="#fc5a4e">

**TASK**: 
- **E8.1** show how the convergence rate changes when you **change the gradient step size**
- **E8.2** show how the convergence rate changes when you **change the first-order algorithm**
- **E8.3** show how the convergence rate changes when you **change the length of the token sequence**

In [None]:
# Step sizes (learning rates) to sweep, from largest to smallest
step_sizes = [1e-0, 1e-1, 1e-2, 1e-3, 1e-4, 1e-5]
# Optimizers to compare, keyed by a short name: SGD, Adam, AdamW, RMSprop, NAdam, L-BFGS
optimizers = {
    "sgd":torch.optim.SGD,
    "adam":torch.optim.Adam,
    "adamw":torch.optim.AdamW,
    "rmsprop":torch.optim.RMSprop,
    "nadam":torch.optim.NAdam,
    "lbfgs":torch.optim.LBFGS
}
# Token lengths to compare
lengths_tokens = [10, 20, 30, 40, 50]  

# Layer indices to attack
layer_indices = [1, 2, 3, 4, 5, 6, 7, 8]

# Random sentences, read back from the CSV written earlier in this notebook
random_df = pd.read_csv('../data/random_sentences_dataset.csv')
# Sentences filtered by token length, read back from the CSV written earlier in this notebook
meaningful_df = pd.read_csv('../data/token_length_filtered_dataset.csv')


In [None]:
# Pick the first sentence of the '10' column as a quick trial prompt and show
# exactly what was picked (the print used to show a row of the '50' column).
trial_prompt = meaningful_df['10'][0]
print(f"Trial prompt: {trial_prompt}")

In [None]:
# Attack layer 7 on the third sentence of the '50' column with L-BFGS,
# using the last (smallest) entry of `step_sizes` as the learning rate.
inversion_attack(
    prompt=meaningful_df['50'][2],
    llm=model, 
    layer_idx=7, 
    optimizer_cls= torch.optim.LBFGS, 
    lr= step_sizes[-1],
    seed=seed
)