In [1]:
# import from huggingface roneneldan/TinyStories-1M
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

  from .autonotebook import tqdm as notebook_tqdm


# Trying with TinyStories 

In [4]:
# Fetch the TinyStories-1M checkpoint and its matching tokenizer from the Hub
model = AutoModelForCausalLM.from_pretrained("roneneldan/TinyStories-1M")
tokenizer = AutoTokenizer.from_pretrained("roneneldan/TinyStories-1M")

In [19]:
# Sanity check: shape of the raw token embeddings (wte only) for a short prompt
prompt_trial = "Paris is the capital of "
inputs = tokenizer(prompt_trial, return_tensors="pt")
input_ids = inputs["input_ids"]
model.transformer.wte(input_ids).shape


torch.Size([1, 6, 64])

In [7]:
def get_embeddings_gpt(prompt):
    """
    Build the input embeddings (token + position) for `prompt`.

    Relies on the notebook-level `tokenizer` and `model` globals, so the
    result depends on whichever checkpoint was loaded most recently. The
    model must expose `transformer.wte` and `transformer.wpe`.

    Args:
        prompt: text passed straight to the tokenizer.

    Returns:
        Tensor of shape (batch, seq_len, emb_dim): `wte(input_ids)` plus
        `wpe(positions)`, computed without gradient tracking.
    """
    inputs = tokenizer(prompt, return_tensors="pt")
    # Keep the ids on the same device as the model, as get_embeddings_mistral does.
    input_ids = inputs.input_ids.to(model.device)  # (batch, seq_len)

    with torch.no_grad():
        token_emb = model.transformer.wte(input_ids)
        # Positions 0..seq_len-1, with a leading batch axis of size 1 that
        # broadcasts against token_emb in the sum below.
        position_ids = torch.arange(input_ids.size(1), device=input_ids.device).unsqueeze(0)
        pos_emb = model.transformer.wpe(position_ids)
        embeddings = token_emb + pos_emb

    # batch x sequence x embedding_dim
    return embeddings


In [8]:
def embeddings_to_texts_baseline(embeddings, model, tokenizer, skip_special_tokens=True):
    """
    Decode embeddings back to text by reading them out through the LM head.

    Args:
        embeddings: tensor of shape (batch, seq_len, emb_dim).
        model: object exposing an `lm_head` callable that maps embeddings
            to vocabulary logits.
        tokenizer: object exposing `batch_decode`.
        skip_special_tokens: forwarded unchanged to `tokenizer.batch_decode`.

    Returns:
        A `(logits, texts)` pair: logits of shape (batch, seq_len, vocab_size)
        and one decoded string per batch row.
    """
    # Score every position against the whole vocabulary.
    vocab_scores = model.lm_head(embeddings)
    # Keep only the best-scoring token id at each position.
    best_ids = vocab_scores.argmax(dim=-1)
    # Turn each row of ids into a string.
    decoded = tokenizer.batch_decode(best_ids, skip_special_tokens=skip_special_tokens)
    return vocab_scores, decoded

In [None]:
# trying to get embeddings from a prompt_trial 
# Token + position embeddings for the trial prompt, via the helper above
embeddings = get_embeddings_gpt(prompt=prompt_trial)
# Display the (batch, seq_len, emb_dim) shape as the cell output
embeddings.shape

torch.Size([1, 6, 64])

In [31]:
# Read the embeddings back out through the LM head and show the logits shape and decoded text
logits, text = embeddings_to_texts_baseline(embeddings=embeddings, model=model, tokenizer=tokenizer)
(logits.shape, text)

(torch.Size([1, 6, 50257]), ['umm is the capital of '])

In [None]:
prompt = "Once upon a time, in a land far away, there lived a"
# Embed the prompt and decode it straight back through the LM head.
# Only the decoded text is kept: index [1] picks it from the (logits, texts)
# pair, so the logits are never assigned to `_`, which would rebind
# IPython's "last output" variable.
text = embeddings_to_texts_baseline(get_embeddings_gpt(prompt), model, tokenizer)[1]
text

['Once upon a time, in a land far away, there lived a']

# Trying With Mistral

In [None]:
from huggingface_hub import login
from dotenv import load_dotenv
import os

# Load environment variables via python-dotenv, then authenticate with the
# Hugging Face Hub using the HUGGINGFACE_TOKEN variable.
load_dotenv()
login(token=os.environ.get("HUGGINGFACE_TOKEN"))

In [5]:
model_id = "mistralai/Mistral-7B-v0.1"
tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast=True)
# Quantization is requested through a BitsAndBytesConfig object: passing
# load_in_8bit / load_in_4bit directly to from_pretrained is deprecated.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",       # auto-slice layers across GPUs/CPU
    quantization_config=BitsAndBytesConfig(load_in_8bit=True),  # or load_in_4bit=True
    torch_dtype="auto",      # keep LayerNorm etc. in fp16/32
)


The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.
Loading checkpoint shards: 100%|██████████| 2/2 [00:08<00:00,  4.06s/it]


In [11]:
def get_embeddings_mistral(prompt):
    """
    Look up the raw token embeddings of `prompt` from `model.model.embed_tokens`.

    Relies on the notebook-level `tokenizer` and `model` globals, so the
    result depends on whichever checkpoint was loaded most recently. No
    position information is added here.

    Args:
        prompt: text passed straight to the tokenizer.

    Returns:
        Tensor of shape (batch, seq_len, emb_dim), computed without
        gradient tracking.
    """
    inputs = tokenizer(prompt, return_tensors="pt")
    # Only the ids are needed for an embedding lookup; move them to the model's device.
    input_ids = inputs.input_ids.to(model.device)  # (batch, seq_len)

    with torch.no_grad():
        # NOTE(review): some architectures rescale this lookup inside their
        # forward pass (Gemma is believed to multiply by sqrt(hidden_size));
        # confirm before treating this as the exact first-layer input.
        embeddings = model.model.embed_tokens(input_ids)

    # batch x sequence x embedding_dim
    return embeddings


In [52]:
prompt_trial = "Once upon a time, in a land far away, there lived a"
# Raw token embeddings from the Mistral embedding table
embeddings = get_embeddings_mistral(prompt=prompt_trial)
# Display the (batch, seq_len, emb_dim) shape as the cell output
embeddings.shape

torch.Size([1, 15, 4096])

In [54]:
# Read the Mistral embeddings back out through the LM head and show the decoded text
logits, text = embeddings_to_texts_baseline(embeddings=embeddings, model=model, tokenizer=tokenizer)
text

['ocker Groupisenerial候puislcerialuvudstock awaypuisafteronnaerial']

In [8]:
# Compares the shapes of the input embedding matrix and the LM head only.
# NOTE(review): equal shapes do not show that the two matrices hold the same
# values (i.e. are tied); compare the values themselves to confirm.
model.model.embed_tokens.weight.shape == model.lm_head.weight.shape

True

# Trying with GPT-2 (XL by default; set SMALL = True for GPT-2 Small)

In [5]:
# Pick the GPT-2 checkpoint: the XL variant unless SMALL is switched on
SMALL = False
model_id = "openai-community/gpt2" if SMALL else "openai-community/gpt2-xl"
model = AutoModelForCausalLM.from_pretrained(model_id)
tokenizer = AutoTokenizer.from_pretrained(model_id)

In [9]:
prompt_trial = "Once upon a time, in a land far away, there lived a"
# Token + position embeddings from the GPT-2 checkpoint loaded above
embeddings = get_embeddings_gpt(prompt=prompt_trial)
embeddings.shape

torch.Size([1, 14, 1600])

In [10]:
# Read the GPT-2 embeddings back out through the LM head and show the decoded text
logits, text = embeddings_to_texts_baseline(embeddings=embeddings, model=model, tokenizer=tokenizer)
text


['Once upon a time, in a land far away, there lived a']

In [11]:
# Display the shapes of the LM head and the token embedding matrix side by side.
model.lm_head.weight.shape, model.transformer.wte.weight.shape

(torch.Size([50257, 1600]), torch.Size([50257, 1600]))

In [12]:
# Compares shapes only.
# NOTE(review): equal shapes do not show that the LM head and the token
# embedding share values (i.e. are tied); compare the values to confirm.
model.lm_head.weight.shape == model.transformer.wte.weight.shape

True

# Trying With Gemma (gemma-2b)

In [9]:
model_id = "google/gemma-2b"

tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast=True)
# Quantization is requested through a BitsAndBytesConfig object: passing
# load_in_8bit / load_in_4bit directly to from_pretrained is deprecated.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",       # auto-slice layers across GPUs/CPU
    quantization_config=BitsAndBytesConfig(load_in_8bit=True),  # or load_in_4bit=True
    torch_dtype="auto",      # keep LayerNorm etc. in fp16/32
)
# Display the module tree as the cell output
model

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.
Downloading shards: 100%|██████████| 2/2 [01:47<00:00, 53.61s/it] 
`config.hidden_act` is ignored, you should use `config.hidden_activation` instead.
Gemma's activation function will be set to `gelu_pytorch_tanh`. Please, use
`config.hidden_activation` if you want to override this behaviour.
See https://github.com/huggingface/transformers/pull/29402 for more details.
Loading checkpoint shards: 100%|██████████| 2/2 [00:03<00:00,  1.54s/it]


GemmaForCausalLM(
  (model): GemmaModel(
    (embed_tokens): Embedding(256000, 2048, padding_idx=0)
    (layers): ModuleList(
      (0-17): 18 x GemmaDecoderLayer(
        (self_attn): GemmaSdpaAttention(
          (q_proj): Linear8bitLt(in_features=2048, out_features=2048, bias=False)
          (k_proj): Linear8bitLt(in_features=2048, out_features=256, bias=False)
          (v_proj): Linear8bitLt(in_features=2048, out_features=256, bias=False)
          (o_proj): Linear8bitLt(in_features=2048, out_features=2048, bias=False)
          (rotary_emb): GemmaRotaryEmbedding()
        )
        (mlp): GemmaMLP(
          (gate_proj): Linear8bitLt(in_features=2048, out_features=16384, bias=False)
          (up_proj): Linear8bitLt(in_features=2048, out_features=16384, bias=False)
          (down_proj): Linear8bitLt(in_features=16384, out_features=2048, bias=False)
          (act_fn): PytorchGELUTanh()
        )
        (input_layernorm): GemmaRMSNorm((2048,), eps=1e-06)
        (post_attention

In [19]:
prompt_trial = "Once upon a time, in a land far away, there lived a"
# Raw Gemma token embeddings (reuses the embed_tokens helper written for Mistral)
embeddings = get_embeddings_mistral(prompt=prompt_trial)
# The same embeddings passed through `model.model.norm`
normalized_embeddings = model.model.norm(embeddings)
# Display both shapes as the cell output
(embeddings.shape, normalized_embeddings.shape)

(torch.Size([1, 15, 2048]), torch.Size([1, 15, 2048]))

In [15]:
# Read the raw Gemma embeddings back out through the LM head and show the decoded text
logits, text = embeddings_to_texts_baseline(embeddings=embeddings, model=model, tokenizer=tokenizer)
text

[' increa increa increa increa increa increa increa increa increa increa increa increa increa increa']

In [20]:
# Same read-out, applied to the embeddings that went through `model.model.norm`
logits_normalized, text_normalized = embeddings_to_texts_baseline(
    embeddings=normalized_embeddings,
    model=model,
    tokenizer=tokenizer,
)
text_normalized


[' increa increa increa increa increa increa increa increa increa increa increa increa increa increa']

# Trying With GPT-J 