In [1]:
# Enable the autoreload extension so that edits to imported local modules are
# picked up without restarting the kernel (handy once code is moved out of the
# notebook into separate .py modules).
%load_ext autoreload
%autoreload 2

In [1]:
import gc

from time import time
from tqdm import tqdm
from datasets import load_dataset

import sys
sys.path.append('..')

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# `%pip` installs into the environment of the running kernel (`!pip` may pick
# up a different interpreter on PATH).
# NOTE(review): the original line installed "rogue"; this assumes the ROUGE
# metric package "rouge" was intended — confirm, and pin a version.
%pip install rouge

[0m

In [4]:
import random
import numpy as np

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

from utils.general import compute_last_token_embedding_grad_emb, get_whole


In [7]:
def set_seed(seed: int = 8):
    """Seed all random number generators used in this notebook.

    Seeds Python's ``random`` module, NumPy's global RNG and PyTorch's CPU
    generator; when CUDA is available the CUDA generators are seeded as well.
    Finally cuDNN is switched to deterministic mode with benchmarking off
    (this may cost some performance).

    Args:
        seed: Value passed to every seeding function. Defaults to 8.
    """
    # CPU-side generators: Python, NumPy, PyTorch.
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
        seed_fn(seed)

    # GPU-side generators: current device first, then every device (multi-GPU).
    if torch.cuda.is_available():
        for cuda_seed_fn in (torch.cuda.manual_seed, torch.cuda.manual_seed_all):
            cuda_seed_fn(seed)

    # Set unconditionally, exactly like the seeding above is unconditional.
    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False


## Torch Optimizers

In [5]:
def clean() -> None:
    """Free memory that is no longer referenced.

    Runs Python's garbage collector first and then asks PyTorch to empty its
    CUDA cache; the order matters because objects must be collected before
    the memory they held can be released from the cache.
    """
    gc.collect()
    torch.cuda.empty_cache()

# Checkpoint to load and the device to place it on (GPU when available).
model_id = "roneneldan/TinyStories-1M"
device = 'cuda' if torch.cuda.is_available() else 'cpu'
load_in_8bit = False

# When this cell is re-run, drop the previously loaded model and free its
# memory before loading again. On a fresh kernel `model` does not exist yet,
# so the resulting NameError is deliberately ignored.
try:
    del model
    clean()
except NameError:
    pass 

tokenizer = AutoTokenizer.from_pretrained(model_id)
# NOTE(review): trust_remote_code=True permits running code shipped with the
# checkpoint — confirm it is actually required for this model.
# NOTE(review): passing load_in_8bit directly may be deprecated in recent
# transformers releases in favour of a quantization config — verify.
model = AutoModelForCausalLM.from_pretrained(model_id,
                                               torch_dtype=torch.float32,
                                               device_map=device,
                                               load_in_8bit=load_in_8bit,
                                               trust_remote_code=True)
# Reuse the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token

### Gradient based on projection

In [8]:
def find_token(
    token_idx,
    embedding_matrix,
    discovered_embeddings, discovered_ids,
    llm, layer_idx, h_target,
    optimizer_cls, lr
):
    """Search the vocabulary for the token at position ``token_idx``.

    Starting from a random vocabulary entry, every iteration

    1. appends the current candidate embedding to the already discovered
       prefix and evaluates it with ``compute_last_token_embedding_grad_emb``
       (presumably the loss against ``h_target[token_idx]`` and its gradient
       w.r.t. the last input embedding — confirm against ``utils.general``),
    2. stops if ``loss < 1e-5`` or the gradient norm is below ``1e-12``,
    3. otherwise takes one optimizer step on a free, continuous embedding
       using that gradient, marks the candidate as tried, and picks the
       untried vocabulary row closest (L2 norm) to the continuous embedding
       as the next candidate.

    Each vocabulary row is tried at most once. If the whole vocabulary is
    exhausted without meeting the stopping criterion, the candidate with the
    lowest loss seen is returned (previously an all-``inf`` embedding with
    token id 0 was returned, which corrupted every later position).

    Progress is shown on a tqdm bar; the partially decoded string uses the
    module-level ``tokenizer``.

    Args:
        token_idx: Index of the position being recovered; selects the row of
            ``h_target`` to match.
        embedding_matrix: Input embedding table, one row per vocabulary id.
            It is not modified.
        discovered_embeddings: Embeddings of the tokens already recovered.
        discovered_ids: Token ids of the tokens already recovered.
        llm: Model forwarded to ``compute_last_token_embedding_grad_emb``.
        layer_idx: Layer index forwarded to the same helper.
        h_target: Targets, indexed along dim 0 by position.
        optimizer_cls: Optimizer class, called as ``optimizer_cls([param], lr=lr)``.
        lr: Learning rate for the optimizer.

    Returns:
        Tuple ``(token_id, embedding)``: the selected vocabulary id and a
        detached copy of its row in ``embedding_matrix``.
    """
    vocab_size = embedding_matrix.size(0)

    # Working copy: rows of already-tried tokens are overwritten with +inf so
    # the nearest-neighbour search below can never select them again.
    copy_embedding_matrix = embedding_matrix.detach().clone()

    token_id = torch.randint(0, vocab_size, (1,)).item()

    # `embedding` is the continuous point moved by the optimizer;
    # `temp_embedding` is the discrete candidate actually fed to the model.
    embedding = copy_embedding_matrix[token_id].clone().requires_grad_(True)
    temp_embedding = copy_embedding_matrix[token_id].clone()

    optimizer = optimizer_cls([embedding], lr=lr)

    # Fallback in case no candidate ever satisfies the stopping criterion.
    best_token_id, best_loss = token_id, float('inf')

    # The context manager closes the bar even when the loop exits via `break`.
    with tqdm(
        range(vocab_size),
        desc=f'Token [{token_idx + 1:2d}/{h_target.size(0):2d}]'
    ) as bar:
        for _ in bar:
            input_embeddings = torch.stack(
                discovered_embeddings + [temp_embedding]
            ).unsqueeze(0)

            grad_oracle, loss = compute_last_token_embedding_grad_emb(
                embeddings=input_embeddings,
                llm=llm,
                layer_idx=layer_idx,
                h_target=h_target[token_idx],
            )

            grad_norm = grad_oracle.norm().item()
            print(f"Token ID: {token_id} - Grad norm: {grad_norm:.2e} - Loss: {loss:.2e}")
            string_so_far = tokenizer.decode(discovered_ids + [token_id], skip_special_tokens=True)
            bar.set_postfix_str(f"Loss: {loss:.2e} - Gradient norm: {grad_norm:.2e} - String: {string_so_far}")

            loss_value = float(loss)
            if loss_value < best_loss:
                best_token_id, best_loss = token_id, loss_value

            if loss < 1e-5 or grad_norm < 1e-12:
                # Converged: the current candidate is the answer.
                break

            # The gradient was computed at the discrete candidate; it is used
            # as the gradient of the continuous embedding for one step.
            embedding.grad = grad_oracle
            optimizer.step()

            # Exclude the candidate just tried, then project the updated
            # embedding onto the nearest remaining vocabulary row. No autograd
            # graph is needed for this search.
            copy_embedding_matrix[token_id] = float('inf')
            with torch.no_grad():
                distances = torch.norm(copy_embedding_matrix - embedding, dim=1)
            token_id = int(torch.argmin(distances))
            temp_embedding = copy_embedding_matrix[token_id].clone()
        else:
            # Vocabulary exhausted: every row of the working copy is now +inf,
            # so fall back to the best candidate seen.
            token_id = best_token_id

    # Read from the untouched matrix so the result can never be a masked row.
    return token_id, embedding_matrix[token_id].detach().clone()


def find_prompt(
    llm, layer_idx, h_target,
    optimizer_cls, lr,
):
    """Recover a prompt token by token from its target representations.

    Calls ``find_token`` once per row of ``h_target``, feeding the tokens
    recovered so far back in as the prefix, and measures the wall-clock time
    of the whole search. The recovered ids are decoded with the module-level
    ``tokenizer``.

    Args:
        llm: Model to invert; its input embedding table is the search space.
        layer_idx: Layer index forwarded to ``find_token``.
        h_target: Targets, one row per position. A 1-D tensor is treated as a
            single position.
        optimizer_cls: Optimizer class forwarded to ``find_token``.
        lr: Learning rate forwarded to ``find_token``.

    Returns:
        Tuple ``(elapsed_seconds, decoded_string)``.
    """
    # Use the model that was passed in rather than the notebook-global
    # `model`, so the function also works for a differently named model.
    embedding_matrix = llm.get_input_embeddings().weight

    if h_target.dim() == 1:
        h_target = h_target.unsqueeze(0)

    discovered_embeddings = []
    discovered_ids        = []

    start_time = time()
    for i in range(h_target.size(0)):
        next_token_id, next_token_embedding = find_token(
            i, embedding_matrix,
            discovered_embeddings, discovered_ids,
            llm, layer_idx, h_target,
            optimizer_cls, lr
        )

        discovered_embeddings.append(next_token_embedding)
        discovered_ids.append(next_token_id)

    end_time = time()

    final_string = tokenizer.decode(discovered_ids, skip_special_tokens=True)

    return end_time - start_time, final_string

# from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
def inversion_attack(
    prompt, llm, layer_idx,
    optimizer_cls, lr,
    seed=8
):
    """Run the inversion on ``prompt`` and print the outcome.

    Seeds the RNGs, obtains the targets for ``prompt`` via ``get_whole``
    (presumably the hidden states at ``layer_idx`` for every prompt token —
    confirm against ``utils.general``), searches for a prompt reproducing
    them with ``find_prompt``, and prints the original prompt, the recovered
    prompt and the elapsed time. Uses the module-level ``tokenizer``.

    Args:
        prompt: Text to invert.
        llm: Model used both to produce the targets and to run the search.
        layer_idx: Layer index forwarded to ``get_whole`` and ``find_prompt``.
        optimizer_cls: Optimizer class forwarded to ``find_prompt``.
        lr: Learning rate forwarded to ``find_prompt``.
        seed: Seed passed to ``set_seed``. Defaults to 8.
    """
    set_seed(seed)

    # Targets must come from the same model that is searched, i.e. the `llm`
    # argument, not the notebook-global `model`.
    h_target = get_whole(prompt, llm, tokenizer, layer_idx)

    inversion_time, predicted_prompt = find_prompt(
        llm, layer_idx, h_target,
        optimizer_cls, lr
    )

    print(f'Orignial prompt : {prompt}')
    print(f'Predicted prompt: {predicted_prompt}')
    print(f'Invertion time  : {inversion_time:.2f} seconds')

inversion_attack(
    # prompt= meaningful_df['50'][0],
    # Raw string: the prompt contains a literal backslash followed by "o",
    # which is an invalid escape sequence in a normal string literal. The
    # string value is unchanged.
    prompt=r'12autoZeinai ena~~ !poli, a1212kiro pr33-=ompt tao op"\oio ;::/>elpizo na d1212isko1212leyt5646ei na ma77ntepsei to montelo',
    llm=model, layer_idx=7,
    # Alternative settings tried:
    # optimizer_cls=torch.optim.SGD, lr=1e-0
    # llm=model, layer_idx=8,
    optimizer_cls=torch.optim.AdamW, lr=1e-1
)

    

Token [ 1/58]:   0%|          | 11/50257 [00:00<17:38, 47.46it/s, Loss: 1.64e-14 - Gradient norm: 1.32e-07 - String: 12]       


Token ID: 17472 - Grad norm: 1.71e-01 - Loss: 3.26e-02
Token ID: 48292 - Grad norm: 8.35e-02 - Loss: 2.06e-02
Token ID: 17907 - Grad norm: 1.31e-01 - Loss: 3.72e-02
Token ID: 36064 - Grad norm: 1.79e-01 - Loss: 2.81e-02
Token ID: 33748 - Grad norm: 2.42e-01 - Loss: 2.86e-02
Token ID: 36627 - Grad norm: 1.65e-01 - Loss: 2.29e-02
Token ID: 11971 - Grad norm: 9.14e-02 - Loss: 2.50e-02
Token ID: 3550 - Grad norm: 1.99e-01 - Loss: 4.60e-02
Token ID: 11304 - Grad norm: 1.82e-01 - Loss: 2.47e-02
Token ID: 49508 - Grad norm: 2.11e-01 - Loss: 3.71e-02
Token ID: 45568 - Grad norm: 1.19e-01 - Loss: 1.94e-02
Token ID: 1065 - Grad norm: 1.32e-07 - Loss: 1.64e-14


Token [ 2/58]:   0%|          | 0/50257 [00:00<?, ?it/s, Loss: 3.93e-04 - Gradient norm: 2.18e-02 - String: 12 densely]

Token ID: 42255 - Grad norm: 2.18e-02 - Loss: 3.93e-04


Token [ 2/58]:   0%|          | 14/50257 [00:00<12:25, 67.42it/s, Loss: 6.91e-03 - Gradient norm: 7.47e-02 - String: 12breakers]  

Token ID: 23735 - Grad norm: 6.01e-02 - Loss: 9.74e-04
Token ID: 23074 - Grad norm: 1.04e-01 - Loss: 7.25e-03
Token ID: 28815 - Grad norm: 2.09e-02 - Loss: 2.39e-04
Token ID: 24415 - Grad norm: 1.87e-02 - Loss: 2.31e-04
Token ID: 45178 - Grad norm: 1.39e-01 - Loss: 1.25e-02
Token ID: 48270 - Grad norm: 5.60e-02 - Loss: 5.72e-03
Token ID: 19577 - Grad norm: 1.23e-01 - Loss: 1.01e-02
Token ID: 25687 - Grad norm: 1.27e-01 - Loss: 1.17e-02
Token ID: 7040 - Grad norm: 9.86e-02 - Loss: 1.21e-02
Token ID: 34114 - Grad norm: 7.63e-02 - Loss: 9.00e-03
Token ID: 37488 - Grad norm: 9.52e-02 - Loss: 1.19e-02
Token ID: 3174 - Grad norm: 1.21e-01 - Loss: 7.96e-03
Token ID: 49295 - Grad norm: 7.47e-02 - Loss: 6.91e-03


Token [ 2/58]:   0%|          | 14/50257 [00:00<12:25, 67.42it/s, Loss: 1.23e-02 - Gradient norm: 1.19e-01 - String: 12____]    

Token ID: 1427 - Grad norm: 1.19e-01 - Loss: 1.23e-02


Token [ 2/58]:   0%|          | 28/50257 [00:00<12:15, 68.27it/s, Loss: 7.91e-03 - Gradient norm: 9.63e-02 - String: 12Eh]        

Token ID: 35166 - Grad norm: 6.70e-02 - Loss: 6.02e-03
Token ID: 28126 - Grad norm: 1.15e-01 - Loss: 1.81e-02
Token ID: 47720 - Grad norm: 7.79e-02 - Loss: 8.00e-03
Token ID: 47768 - Grad norm: 1.29e-01 - Loss: 8.34e-03
Token ID: 37164 - Grad norm: 1.58e-01 - Loss: 1.47e-02
Token ID: 17047 - Grad norm: 7.15e-02 - Loss: 5.97e-03
Token ID: 35428 - Grad norm: 9.73e-02 - Loss: 1.09e-02
Token ID: 9268 - Grad norm: 1.22e-01 - Loss: 1.48e-02
Token ID: 48332 - Grad norm: 1.75e-01 - Loss: 1.93e-02
Token ID: 32109 - Grad norm: 1.51e-01 - Loss: 1.55e-02
Token ID: 22247 - Grad norm: 1.15e-01 - Loss: 1.03e-02
Token ID: 18478 - Grad norm: 1.24e-01 - Loss: 1.45e-02
Token ID: 43894 - Grad norm: 9.63e-02 - Loss: 7.91e-03


Token [ 2/58]:   0%|          | 28/50257 [00:00<12:15, 68.27it/s, Loss: 2.13e-02 - Gradient norm: 1.58e-01 - String: 12 SAR]

Token ID: 47341 - Grad norm: 1.58e-01 - Loss: 2.13e-02


Token [ 2/58]:   0%|          | 42/50257 [00:00<12:10, 68.77it/s, Loss: 9.49e-03 - Gradient norm: 1.40e-01 - String: 12forward]    

Token ID: 9888 - Grad norm: 7.83e-02 - Loss: 7.15e-03
Token ID: 20256 - Grad norm: 3.04e-01 - Loss: 3.46e-02
Token ID: 5109 - Grad norm: 2.02e-01 - Loss: 1.78e-02
Token ID: 41102 - Grad norm: 3.24e-01 - Loss: 5.12e-02
Token ID: 25437 - Grad norm: 2.75e-01 - Loss: 5.21e-02
Token ID: 29952 - Grad norm: 1.20e-01 - Loss: 1.27e-02
Token ID: 7829 - Grad norm: 9.19e-02 - Loss: 1.15e-02
Token ID: 25344 - Grad norm: 1.26e-01 - Loss: 1.34e-02
Token ID: 21325 - Grad norm: 2.10e-01 - Loss: 3.07e-02
Token ID: 4674 - Grad norm: 1.01e-01 - Loss: 1.33e-02
Token ID: 11971 - Grad norm: 9.37e-02 - Loss: 1.48e-02
Token ID: 24176 - Grad norm: 9.64e-02 - Loss: 9.33e-03
Token ID: 11813 - Grad norm: 1.40e-01 - Loss: 9.49e-03


Token [ 2/58]:   0%|          | 42/50257 [00:00<12:10, 68.77it/s, Loss: 1.45e-02 - Gradient norm: 1.21e-01 - String: 12 informative]

Token ID: 30304 - Grad norm: 1.21e-01 - Loss: 1.45e-02


Token [ 2/58]:   0%|          | 56/50257 [00:00<12:05, 69.15it/s, Loss: 1.33e-02 - Gradient norm: 1.48e-01 - String: 12circle]      

Token ID: 12480 - Grad norm: 1.10e-01 - Loss: 1.11e-02
Token ID: 49621 - Grad norm: 6.67e-02 - Loss: 7.91e-03
Token ID: 5374 - Grad norm: 1.27e-01 - Loss: 1.88e-02
Token ID: 18096 - Grad norm: 2.57e-01 - Loss: 3.78e-02
Token ID: 16796 - Grad norm: 1.25e-01 - Loss: 1.82e-02
Token ID: 23554 - Grad norm: 1.34e-01 - Loss: 1.67e-02
Token ID: 34717 - Grad norm: 4.71e-01 - Loss: 7.19e-02
Token ID: 30804 - Grad norm: 2.24e-01 - Loss: 2.52e-02
Token ID: 31168 - Grad norm: 2.27e-01 - Loss: 1.71e-02
Token ID: 7322 - Grad norm: 2.99e-01 - Loss: 3.47e-02
Token ID: 13237 - Grad norm: 7.36e-02 - Loss: 9.41e-03
Token ID: 30530 - Grad norm: 8.27e-02 - Loss: 1.07e-02
Token ID: 45597 - Grad norm: 1.48e-01 - Loss: 1.33e-02


Token [ 2/58]:   0%|          | 56/50257 [00:00<12:05, 69.15it/s, Loss: 2.32e-02 - Gradient norm: 2.40e-01 - String: 12 fuller]

Token ID: 14306 - Grad norm: 1.07e-01 - Loss: 1.15e-02
Token ID: 44757 - Grad norm: 2.40e-01 - Loss: 2.32e-02


Token [ 2/58]:   0%|          | 64/50257 [00:01<12:02, 69.44it/s, Loss: 1.07e-02 - Gradient norm: 1.27e-01 - String: 12small]  

Token ID: 7645 - Grad norm: 2.32e-01 - Loss: 3.54e-02
Token ID: 22805 - Grad norm: 1.38e-01 - Loss: 1.41e-02
Token ID: 5314 - Grad norm: 7.82e-02 - Loss: 1.10e-02
Token ID: 4775 - Grad norm: 1.05e-01 - Loss: 1.03e-02
Token ID: 7335 - Grad norm: 1.84e-01 - Loss: 2.60e-02
Token ID: 36022 - Grad norm: 1.07e-01 - Loss: 9.64e-03
Token ID: 17156 - Grad norm: 1.72e-01 - Loss: 2.45e-02
Token ID: 49508 - Grad norm: 2.74e-01 - Loss: 2.96e-02
Token ID: 24219 - Grad norm: 1.44e-01 - Loss: 2.68e-02
Token ID: 22562 - Grad norm: 2.11e-01 - Loss: 2.12e-02
Token ID: 23135 - Grad norm: 1.32e-01 - Loss: 1.50e-02
Token ID: 36546 - Grad norm: 1.40e-01 - Loss: 2.15e-02
Token ID: 17470 - Grad norm: 1.27e-01 - Loss: 1.07e-02


Token [ 2/58]:   0%|          | 72/50257 [00:01<11:59, 69.75it/s, Loss: 2.74e-02 - Gradient norm: 2.44e-01 - String: 12 halftime]

Token ID: 33287 - Grad norm: 2.27e-01 - Loss: 2.93e-02
Token ID: 35185 - Grad norm: 2.44e-01 - Loss: 2.74e-02


Token [ 2/58]:   0%|          | 79/50257 [00:01<11:58, 69.82it/s, Loss: 2.06e-02 - Gradient norm: 1.92e-01 - String: 12 Hungry]       

Token ID: 9937 - Grad norm: 1.60e-01 - Loss: 1.32e-02
Token ID: 45956 - Grad norm: 2.05e-01 - Loss: 2.48e-02
Token ID: 29337 - Grad norm: 7.71e-02 - Loss: 6.88e-03
Token ID: 40758 - Grad norm: 1.78e-01 - Loss: 1.72e-02
Token ID: 34154 - Grad norm: 2.30e-01 - Loss: 4.99e-02
Token ID: 10220 - Grad norm: 1.99e-01 - Loss: 2.24e-02
Token ID: 12438 - Grad norm: 1.15e-01 - Loss: 1.42e-02
Token ID: 10878 - Grad norm: 1.48e-01 - Loss: 1.67e-02
Token ID: 18763 - Grad norm: 1.35e-01 - Loss: 1.27e-02
Token ID: 17587 - Grad norm: 1.23e-01 - Loss: 2.31e-02
Token ID: 13894 - Grad norm: 1.05e-01 - Loss: 1.05e-02
Token ID: 47091 - Grad norm: 1.35e-01 - Loss: 1.55e-02
Token ID: 42939 - Grad norm: 1.92e-01 - Loss: 2.06e-02


Token [ 2/58]:   0%|          | 87/50257 [00:01<11:54, 70.18it/s, Loss: 1.94e-02 - Gradient norm: 1.67e-01 - String: 12 Him]   

Token ID: 11510 - Grad norm: 3.23e-01 - Loss: 3.99e-02
Token ID: 10978 - Grad norm: 1.67e-01 - Loss: 1.94e-02


Token [ 2/58]:   0%|          | 95/50257 [00:01<11:54, 70.22it/s, Loss: 5.25e-03 - Gradient norm: 5.26e-02 - String: 12winner]  

Token ID: 42767 - Grad norm: 1.76e-01 - Loss: 2.81e-02
Token ID: 29343 - Grad norm: 9.09e-02 - Loss: 1.32e-02
Token ID: 27291 - Grad norm: 2.13e-01 - Loss: 3.31e-02
Token ID: 13571 - Grad norm: 1.12e-01 - Loss: 9.59e-03
Token ID: 27787 - Grad norm: 3.26e-01 - Loss: 3.46e-02
Token ID: 1186 - Grad norm: 2.60e-01 - Loss: 2.73e-02
Token ID: 48426 - Grad norm: 1.95e-01 - Loss: 3.49e-02
Token ID: 17249 - Grad norm: 9.97e-02 - Loss: 1.32e-02
Token ID: 34167 - Grad norm: 1.88e-01 - Loss: 1.66e-02
Token ID: 4418 - Grad norm: 2.67e-01 - Loss: 2.72e-02
Token ID: 8325 - Grad norm: 1.66e-01 - Loss: 1.71e-02
Token ID: 42248 - Grad norm: 1.02e-01 - Loss: 1.25e-02
Token ID: 39791 - Grad norm: 5.26e-02 - Loss: 5.25e-03


Token [ 2/58]:   0%|          | 103/50257 [00:01<11:52, 70.43it/s, Loss: 1.28e-02 - Gradient norm: 1.47e-01 - String: 12square]

Token ID: 24875 - Grad norm: 7.74e-02 - Loss: 1.42e-02
Token ID: 23415 - Grad norm: 1.47e-01 - Loss: 1.28e-02


Token [ 2/58]:   0%|          | 111/50257 [00:01<11:51, 70.52it/s, Loss: 1.63e-02 - Gradient norm: 2.02e-01 - String: 12ay]         

Token ID: 8091 - Grad norm: 1.89e-01 - Loss: 3.33e-02
Token ID: 1744 - Grad norm: 1.40e-01 - Loss: 2.00e-02
Token ID: 19251 - Grad norm: 1.24e-01 - Loss: 1.47e-02
Token ID: 775 - Grad norm: 2.16e-01 - Loss: 4.08e-02
Token ID: 3423 - Grad norm: 1.82e-01 - Loss: 2.81e-02
Token ID: 38084 - Grad norm: 1.61e-01 - Loss: 1.43e-02
Token ID: 16760 - Grad norm: 1.41e-01 - Loss: 2.11e-02
Token ID: 18223 - Grad norm: 8.41e-02 - Loss: 8.43e-03
Token ID: 27594 - Grad norm: 2.63e-01 - Loss: 4.33e-02
Token ID: 5431 - Grad norm: 1.91e-01 - Loss: 2.46e-02
Token ID: 12805 - Grad norm: 1.14e-01 - Loss: 1.28e-02
Token ID: 6496 - Grad norm: 9.65e-02 - Loss: 8.87e-03
Token ID: 323 - Grad norm: 2.02e-01 - Loss: 1.63e-02


Token [ 2/58]:   0%|          | 111/50257 [00:01<11:51, 70.52it/s, Loss: 3.33e-02 - Gradient norm: 1.81e-01 - String: 12 If]   

Token ID: 19061 - Grad norm: 2.09e-01 - Loss: 2.67e-02
Token ID: 1002 - Grad norm: 1.81e-01 - Loss: 3.33e-02


Token [ 2/58]:   0%|          | 127/50257 [00:01<11:53, 70.30it/s, Loss: 2.47e-02 - Gradient norm: 1.67e-01 - String: 12 Helic]     

Token ID: 1840 - Grad norm: 1.24e-01 - Loss: 1.22e-02
Token ID: 21354 - Grad norm: 1.22e-01 - Loss: 1.61e-02
Token ID: 6784 - Grad norm: 1.50e-01 - Loss: 1.75e-02
Token ID: 6485 - Grad norm: 3.00e-01 - Loss: 3.58e-02
Token ID: 45011 - Grad norm: 6.53e-02 - Loss: 1.05e-02
Token ID: 36994 - Grad norm: 8.53e-02 - Loss: 1.15e-02
Token ID: 28741 - Grad norm: 6.31e-02 - Loss: 9.90e-03
Token ID: 6636 - Grad norm: 1.20e-01 - Loss: 1.21e-02
Token ID: 9579 - Grad norm: 1.61e-01 - Loss: 1.91e-02
Token ID: 36800 - Grad norm: 1.42e-01 - Loss: 1.20e-02
Token ID: 9947 - Grad norm: 7.50e-02 - Loss: 9.61e-03
Token ID: 11857 - Grad norm: 2.82e-01 - Loss: 4.09e-02
Token ID: 48574 - Grad norm: 1.67e-01 - Loss: 2.47e-02


Token [ 2/58]:   0%|          | 127/50257 [00:01<11:53, 70.30it/s, Loss: 1.72e-02 - Gradient norm: 1.75e-01 - String: 12head]     

Token ID: 16651 - Grad norm: 1.83e-01 - Loss: 2.58e-02
Token ID: 2256 - Grad norm: 1.75e-01 - Loss: 1.72e-02


Token [ 2/58]:   0%|          | 143/50257 [00:02<11:55, 70.05it/s, Loss: 8.07e-03 - Gradient norm: 6.62e-02 - String: 12orange]  

Token ID: 16115 - Grad norm: 3.61e-01 - Loss: 6.45e-02
Token ID: 35894 - Grad norm: 1.92e-01 - Loss: 1.97e-02
Token ID: 5062 - Grad norm: 2.01e-01 - Loss: 2.43e-02
Token ID: 11871 - Grad norm: 1.46e-01 - Loss: 1.06e-02
Token ID: 5996 - Grad norm: 3.50e-01 - Loss: 4.33e-02
Token ID: 8227 - Grad norm: 1.69e-01 - Loss: 2.63e-02
Token ID: 2978 - Grad norm: 1.92e-01 - Loss: 2.98e-02
Token ID: 43911 - Grad norm: 6.62e-02 - Loss: 7.78e-03
Token ID: 8097 - Grad norm: 1.32e-01 - Loss: 2.36e-02
Token ID: 24585 - Grad norm: 1.65e-01 - Loss: 1.84e-02
Token ID: 8873 - Grad norm: 2.20e-01 - Loss: 3.27e-02
Token ID: 20987 - Grad norm: 1.96e-01 - Loss: 2.44e-02
Token ID: 43745 - Grad norm: 6.62e-02 - Loss: 8.07e-03


Token [ 2/58]:   0%|          | 143/50257 [00:02<11:55, 70.05it/s, Loss: 1.16e-02 - Gradient norm: 8.66e-02 - String: 12 diagonal]

Token ID: 7471 - Grad norm: 2.73e-01 - Loss: 4.50e-02
Token ID: 40039 - Grad norm: 8.66e-02 - Loss: 1.16e-02


Token [ 2/58]:   0%|          | 159/50257 [00:02<11:54, 70.08it/s, Loss: 3.35e-02 - Gradient norm: 2.68e-01 - String: 12emaker]     

Token ID: 48311 - Grad norm: 1.54e-01 - Loss: 2.24e-02
Token ID: 46963 - Grad norm: 1.23e-01 - Loss: 1.28e-02
Token ID: 29759 - Grad norm: 1.73e-01 - Loss: 1.92e-02
Token ID: 47439 - Grad norm: 1.93e-01 - Loss: 2.83e-02
Token ID: 24008 - Grad norm: 1.11e-01 - Loss: 1.16e-02
Token ID: 881 - Grad norm: 2.73e-01 - Loss: 2.47e-02
Token ID: 1727 - Grad norm: 2.85e-01 - Loss: 3.89e-02
Token ID: 24787 - Grad norm: 2.17e-01 - Loss: 3.25e-02
Token ID: 22790 - Grad norm: 3.46e-01 - Loss: 4.02e-02
Token ID: 26926 - Grad norm: 2.42e-01 - Loss: 3.39e-02
Token ID: 44514 - Grad norm: 1.69e-01 - Loss: 1.70e-02
Token ID: 21362 - Grad norm: 1.08e-01 - Loss: 1.04e-02
Token ID: 32174 - Grad norm: 2.68e-01 - Loss: 3.35e-02


Token [ 2/58]:   0%|          | 159/50257 [00:02<11:54, 70.08it/s, Loss: 2.99e-02 - Gradient norm: 2.27e-01 - String: 12 Sometimes]

Token ID: 6591 - Grad norm: 1.79e-01 - Loss: 1.74e-02
Token ID: 8975 - Grad norm: 2.27e-01 - Loss: 2.99e-02


Token [ 2/58]:   0%|          | 175/50257 [00:02<11:53, 70.15it/s, Loss: 2.05e-02 - Gradient norm: 2.72e-01 - String: 12 boom]     

Token ID: 2141 - Grad norm: 2.16e-01 - Loss: 3.52e-02
Token ID: 16452 - Grad norm: 1.45e-01 - Loss: 1.35e-02
Token ID: 26966 - Grad norm: 1.70e-01 - Loss: 2.43e-02
Token ID: 39733 - Grad norm: 1.65e-01 - Loss: 2.13e-02
Token ID: 43856 - Grad norm: 1.41e-01 - Loss: 1.24e-02
Token ID: 7402 - Grad norm: 4.22e-01 - Loss: 7.28e-02
Token ID: 13513 - Grad norm: 1.10e-01 - Loss: 1.24e-02
Token ID: 36269 - Grad norm: 8.14e-02 - Loss: 8.94e-03
Token ID: 2471 - Grad norm: 1.52e-01 - Loss: 1.89e-02
Token ID: 12009 - Grad norm: 1.54e-01 - Loss: 2.02e-02
Token ID: 20022 - Grad norm: 2.59e-01 - Loss: 2.84e-02
Token ID: 8710 - Grad norm: 3.31e-01 - Loss: 5.64e-02
Token ID: 14166 - Grad norm: 2.72e-01 - Loss: 2.05e-02


Token [ 2/58]:   0%|          | 175/50257 [00:02<11:53, 70.15it/s, Loss: 3.28e-02 - Gradient norm: 2.08e-01 - String: 12 Lisa]

Token ID: 17415 - Grad norm: 1.89e-01 - Loss: 2.42e-02
Token ID: 15378 - Grad norm: 2.08e-01 - Loss: 3.28e-02


Token [ 2/58]:   0%|          | 191/50257 [00:02<11:53, 70.14it/s, Loss: 1.75e-02 - Gradient norm: 1.26e-01 - String: 12founded]  

Token ID: 10955 - Grad norm: 1.42e-01 - Loss: 1.43e-02
Token ID: 9271 - Grad norm: 5.79e-02 - Loss: 8.09e-03
Token ID: 24327 - Grad norm: 1.94e-01 - Loss: 2.48e-02
Token ID: 6159 - Grad norm: 1.33e-01 - Loss: 3.21e-02
Token ID: 22721 - Grad norm: 1.13e-01 - Loss: 1.37e-02
Token ID: 39580 - Grad norm: 1.79e-01 - Loss: 1.92e-02
Token ID: 45002 - Grad norm: 1.41e-01 - Loss: 1.82e-02
Token ID: 2567 - Grad norm: 1.56e-01 - Loss: 1.99e-02
Token ID: 6583 - Grad norm: 1.28e-01 - Loss: 9.68e-03
Token ID: 553 - Grad norm: 1.68e-01 - Loss: 5.21e-02
Token ID: 48744 - Grad norm: 6.40e-02 - Loss: 1.14e-02
Token ID: 23679 - Grad norm: 3.91e-02 - Loss: 3.74e-03
Token ID: 27060 - Grad norm: 1.26e-01 - Loss: 1.75e-02


Token [ 2/58]:   0%|          | 191/50257 [00:02<11:53, 70.14it/s, Loss: 2.57e-02 - Gradient norm: 2.36e-01 - String: 12hips]   

Token ID: 25534 - Grad norm: 2.56e-01 - Loss: 3.21e-02
Token ID: 5748 - Grad norm: 2.36e-01 - Loss: 2.57e-02


Token [ 2/58]:   0%|          | 199/50257 [00:02<11:54, 70.04it/s, Loss: 4.93e-02 - Gradient norm: 1.95e-01 - String: 12'."]      

Token ID: 25717 - Grad norm: 1.51e-01 - Loss: 2.18e-02
Token ID: 29443 - Grad norm: 1.94e-01 - Loss: 2.60e-02
Token ID: 34115 - Grad norm: 3.76e-01 - Loss: 4.05e-02
Token ID: 23160 - Grad norm: 2.28e-01 - Loss: 2.44e-02
Token ID: 9113 - Grad norm: 9.75e-02 - Loss: 2.24e-02
Token ID: 4448 - Grad norm: 1.10e-01 - Loss: 1.56e-02
Token ID: 37320 - Grad norm: 2.59e-01 - Loss: 3.15e-02
Token ID: 19401 - Grad norm: 2.51e-01 - Loss: 2.28e-02
Token ID: 40264 - Grad norm: 1.83e-01 - Loss: 5.17e-02
Token ID: 568 - Grad norm: 2.28e-01 - Loss: 2.95e-02
Token ID: 13929 - Grad norm: 2.24e-01 - Loss: 3.50e-02
Token ID: 11496 - Grad norm: 1.61e-01 - Loss: 4.71e-02
Token ID: 30827 - Grad norm: 1.95e-01 - Loss: 4.93e-02


Token [ 2/58]:   0%|          | 207/50257 [00:02<11:55, 69.98it/s, Loss: 3.53e-02 - Gradient norm: 1.89e-01 - String: 12 we]

Token ID: 10913 - Grad norm: 1.67e-01 - Loss: 1.85e-02
Token ID: 356 - Grad norm: 1.89e-01 - Loss: 3.53e-02


Token [ 2/58]:   0%|          | 214/50257 [00:03<11:55, 69.96it/s, Loss: 1.94e-02 - Gradient norm: 1.42e-01 - String: 12blems]          

Token ID: 44852 - Grad norm: 3.13e-01 - Loss: 2.28e-02
Token ID: 13288 - Grad norm: 1.34e-01 - Loss: 2.21e-02
Token ID: 49619 - Grad norm: 7.20e-02 - Loss: 8.18e-03
Token ID: 87 - Grad norm: 1.88e-01 - Loss: 2.26e-02
Token ID: 3989 - Grad norm: 2.06e-01 - Loss: 2.85e-02
Token ID: 8669 - Grad norm: 2.66e-01 - Loss: 2.64e-02
Token ID: 36715 - Grad norm: 2.37e-01 - Loss: 2.85e-02
Token ID: 913 - Grad norm: 1.54e-01 - Loss: 2.11e-02
Token ID: 45779 - Grad norm: 1.86e-01 - Loss: 2.70e-02
Token ID: 13210 - Grad norm: 1.17e-01 - Loss: 1.38e-02
Token ID: 36903 - Grad norm: 8.83e-02 - Loss: 1.19e-02
Token ID: 2676 - Grad norm: 1.82e-01 - Loss: 1.67e-02
Token ID: 22143 - Grad norm: 1.42e-01 - Loss: 1.94e-02


Token [ 2/58]:   0%|          | 222/50257 [00:03<11:53, 70.08it/s, Loss: 2.84e-02 - Gradient norm: 1.89e-01 - String: 12 you] 

Token ID: 43439 - Grad norm: 1.77e-01 - Loss: 3.44e-02
Token ID: 345 - Grad norm: 1.89e-01 - Loss: 2.84e-02


Token [ 2/58]:   0%|          | 230/50257 [00:03<11:55, 69.95it/s, Loss: 1.87e-02 - Gradient norm: 1.28e-01 - String: 12 CPR]      

Token ID: 24395 - Grad norm: 8.43e-02 - Loss: 1.24e-02
Token ID: 7596 - Grad norm: 1.41e-01 - Loss: 1.34e-02
Token ID: 3456 - Grad norm: 1.90e-01 - Loss: 1.93e-02
Token ID: 18150 - Grad norm: 1.37e-01 - Loss: 1.27e-02
Token ID: 38009 - Grad norm: 1.06e-01 - Loss: 8.86e-03
Token ID: 1732 - Grad norm: 1.11e-01 - Loss: 1.54e-02
Token ID: 44601 - Grad norm: 1.47e-01 - Loss: 1.53e-02
Token ID: 28000 - Grad norm: 2.25e-01 - Loss: 3.52e-02
Token ID: 8689 - Grad norm: 2.79e-01 - Loss: 3.79e-02
Token ID: 21978 - Grad norm: 1.31e-01 - Loss: 2.17e-02
Token ID: 12363 - Grad norm: 1.75e-01 - Loss: 1.65e-02
Token ID: 19376 - Grad norm: 2.63e-01 - Loss: 1.94e-02
Token ID: 42920 - Grad norm: 1.28e-01 - Loss: 1.87e-02


Token [ 2/58]:   0%|          | 238/50257 [00:03<11:53, 70.14it/s, Loss: 1.21e-02 - Gradient norm: 1.58e-01 - String: 12abilia]

Token ID: 3863 - Grad norm: 1.72e-01 - Loss: 3.40e-02
Token ID: 48249 - Grad norm: 1.58e-01 - Loss: 1.21e-02


Token [ 2/58]:   0%|          | 246/50257 [00:03<11:54, 69.95it/s, Loss: 2.11e-02 - Gradient norm: 1.69e-01 - String: 12cil]        

Token ID: 40252 - Grad norm: 1.32e-01 - Loss: 2.20e-02
Token ID: 11159 - Grad norm: 9.40e-02 - Loss: 1.05e-02
Token ID: 12431 - Grad norm: 1.63e-01 - Loss: 2.04e-02
Token ID: 33283 - Grad norm: 2.07e-01 - Loss: 4.61e-02
Token ID: 11375 - Grad norm: 9.37e-02 - Loss: 1.14e-02
Token ID: 42890 - Grad norm: 1.10e-01 - Loss: 1.49e-02
Token ID: 8563 - Grad norm: 1.33e-01 - Loss: 1.88e-02
Token ID: 908 - Grad norm: 1.03e-01 - Loss: 1.04e-02
Token ID: 5624 - Grad norm: 1.92e-01 - Loss: 2.53e-02
Token ID: 7063 - Grad norm: 2.91e-01 - Loss: 4.67e-02
Token ID: 36132 - Grad norm: 1.56e-01 - Loss: 2.11e-02
Token ID: 26699 - Grad norm: 2.13e-01 - Loss: 3.14e-02
Token ID: 2856 - Grad norm: 1.69e-01 - Loss: 2.11e-02


Token [ 2/58]:   0%|          | 246/50257 [00:03<11:54, 69.95it/s, Loss: 7.51e-03 - Gradient norm: 9.74e-02 - String: 12 romantic]

Token ID: 25774 - Grad norm: 1.89e-01 - Loss: 2.29e-02
Token ID: 14348 - Grad norm: 9.74e-02 - Loss: 7.51e-03


Token [ 2/58]:   1%|          | 262/50257 [00:03<11:54, 69.97it/s, Loss: 1.95e-02 - Gradient norm: 1.30e-01 - String: 12ogram]       

Token ID: 19423 - Grad norm: 1.24e-01 - Loss: 1.96e-02
Token ID: 35583 - Grad norm: 2.26e-01 - Loss: 2.62e-02
Token ID: 13769 - Grad norm: 2.23e-01 - Loss: 2.78e-02
Token ID: 746 - Grad norm: 1.99e-01 - Loss: 2.15e-02
Token ID: 41186 - Grad norm: 1.25e-01 - Loss: 1.25e-02
Token ID: 4490 - Grad norm: 2.59e-01 - Loss: 2.65e-02
Token ID: 13000 - Grad norm: 1.12e-01 - Loss: 1.30e-02
Token ID: 13621 - Grad norm: 2.43e-01 - Loss: 3.69e-02
Token ID: 11162 - Grad norm: 1.90e-01 - Loss: 2.06e-02
Token ID: 25580 - Grad norm: 1.49e-01 - Loss: 2.46e-02
Token ID: 35253 - Grad norm: 3.86e-01 - Loss: 3.31e-02
Token ID: 28776 - Grad norm: 1.62e-01 - Loss: 2.46e-02
Token ID: 21857 - Grad norm: 1.30e-01 - Loss: 1.95e-02


Token [ 2/58]:   1%|          | 262/50257 [00:03<11:54, 69.97it/s, Loss: 5.62e-02 - Gradient norm: 3.97e-01 - String: 12ci]   

Token ID: 11664 - Grad norm: 1.38e-01 - Loss: 2.07e-02
Token ID: 979 - Grad norm: 3.97e-01 - Loss: 5.62e-02


Token [ 2/58]:   1%|          | 276/50257 [00:04<11:55, 69.85it/s, Loss: 6.91e-02 - Gradient norm: 4.07e-01 - String: 12 av]        

Token ID: 9620 - Grad norm: 2.42e-01 - Loss: 2.67e-02
Token ID: 7427 - Grad norm: 2.59e-01 - Loss: 2.15e-02
Token ID: 1793 - Grad norm: 2.79e-01 - Loss: 2.82e-02
Token ID: 3360 - Grad norm: 2.40e-01 - Loss: 2.61e-02
Token ID: 38771 - Grad norm: 1.33e-01 - Loss: 2.56e-02
Token ID: 16241 - Grad norm: 1.78e-01 - Loss: 2.31e-02
Token ID: 8736 - Grad norm: 1.45e-01 - Loss: 2.34e-02
Token ID: 19002 - Grad norm: 2.50e-01 - Loss: 3.54e-02
Token ID: 18788 - Grad norm: 1.92e-01 - Loss: 1.94e-02
Token ID: 7153 - Grad norm: 1.82e-01 - Loss: 2.12e-02
Token ID: 40813 - Grad norm: 1.16e-01 - Loss: 1.42e-02
Token ID: 44801 - Grad norm: 2.19e-01 - Loss: 2.93e-02
Token ID: 1196 - Grad norm: 4.07e-01 - Loss: 6.91e-02


Token [ 2/58]:   1%|          | 276/50257 [00:04<11:55, 69.85it/s, Loss: 2.74e-02 - Gradient norm: 1.80e-01 - String: 12 no]

Token ID: 251 - Grad norm: 1.30e-01 - Loss: 4.60e-02
Token ID: 645 - Grad norm: 1.80e-01 - Loss: 2.74e-02


Token [ 2/58]:   1%|          | 290/50257 [00:04<11:55, 69.82it/s, Loss: 3.20e-02 - Gradient norm: 2.44e-01 - String: 12 that]      

Token ID: 8347 - Grad norm: 1.32e-01 - Loss: 2.69e-02
Token ID: 11254 - Grad norm: 2.39e-01 - Loss: 3.46e-02
Token ID: 26954 - Grad norm: 2.93e-01 - Loss: 3.81e-02
Token ID: 39297 - Grad norm: 1.24e-01 - Loss: 1.81e-02
Token ID: 340 - Grad norm: 1.64e-01 - Loss: 2.66e-02
Token ID: 16903 - Grad norm: 1.56e-01 - Loss: 2.98e-02
Token ID: 49890 - Grad norm: 1.81e-01 - Loss: 2.54e-02
Token ID: 42104 - Grad norm: 2.66e-01 - Loss: 4.18e-02
Token ID: 313 - Grad norm: 1.36e-01 - Loss: 1.24e-02
Token ID: 2835 - Grad norm: 1.58e-01 - Loss: 1.55e-02
Token ID: 6366 - Grad norm: 3.38e-01 - Loss: 4.86e-02
Token ID: 460 - Grad norm: 3.39e-01 - Loss: 4.51e-02
Token ID: 326 - Grad norm: 2.44e-01 - Loss: 3.20e-02


Token [ 2/58]:   1%|          | 298/50257 [00:04<11:53, 70.00it/s, Loss: 9.13e-03 - Gradient norm: 9.21e-02 - String: 12 dumb]

Token ID: 6422 - Grad norm: 2.27e-01 - Loss: 3.64e-02
Token ID: 13526 - Grad norm: 9.21e-02 - Loss: 9.13e-03


Token [ 2/58]:   1%|          | 305/50257 [00:04<11:54, 69.88it/s, Loss: 3.04e-02 - Gradient norm: 2.39e-01 - String: 12oner]      

Token ID: 35822 - Grad norm: 1.98e-01 - Loss: 2.83e-02
Token ID: 1979 - Grad norm: 1.57e-01 - Loss: 1.91e-02
Token ID: 12936 - Grad norm: 2.20e-01 - Loss: 1.76e-02
Token ID: 30553 - Grad norm: 1.14e-01 - Loss: 8.86e-03
Token ID: 44701 - Grad norm: 9.11e-02 - Loss: 1.20e-02
Token ID: 5353 - Grad norm: 1.08e-01 - Loss: 2.02e-02
Token ID: 22568 - Grad norm: 2.45e-01 - Loss: 3.52e-02
Token ID: 4818 - Grad norm: 1.90e-01 - Loss: 1.36e-02
Token ID: 26368 - Grad norm: 3.61e-01 - Loss: 4.49e-02
Token ID: 21874 - Grad norm: 2.15e-01 - Loss: 3.28e-02
Token ID: 890 - Grad norm: 2.00e-01 - Loss: 1.81e-02
Token ID: 39960 - Grad norm: 1.40e-01 - Loss: 2.33e-02
Token ID: 14491 - Grad norm: 2.39e-01 - Loss: 3.04e-02


Token [ 2/58]:   1%|          | 313/50257 [00:04<11:52, 70.07it/s, Loss: 1.19e-02 - Gradient norm: 1.00e-01 - String: 12cham]

Token ID: 5065 - Grad norm: 1.48e-01 - Loss: 1.39e-02
Token ID: 49869 - Grad norm: 1.00e-01 - Loss: 1.19e-02


Token [ 2/58]:   1%|          | 321/50257 [00:04<11:54, 69.91it/s, Loss: 1.46e-02 - Gradient norm: 1.17e-01 - String: 12cial]      

Token ID: 8151 - Grad norm: 1.92e-01 - Loss: 3.06e-02
Token ID: 3474 - Grad norm: 1.32e-01 - Loss: 1.94e-02
Token ID: 7183 - Grad norm: 2.66e-01 - Loss: 3.53e-02
Token ID: 1395 - Grad norm: 2.78e-01 - Loss: 2.94e-02
Token ID: 29302 - Grad norm: 8.21e-02 - Loss: 7.00e-03
Token ID: 514 - Grad norm: 1.87e-01 - Loss: 2.24e-02
Token ID: 8752 - Grad norm: 1.16e-01 - Loss: 1.66e-02
Token ID: 47277 - Grad norm: 1.40e-01 - Loss: 1.91e-02
Token ID: 493 - Grad norm: 1.51e-01 - Loss: 2.07e-02
Token ID: 5739 - Grad norm: 1.27e-01 - Loss: 2.18e-02
Token ID: 17207 - Grad norm: 1.64e-01 - Loss: 2.79e-02
Token ID: 2474 - Grad norm: 3.11e-01 - Loss: 5.85e-02
Token ID: 2413 - Grad norm: 1.17e-01 - Loss: 1.46e-02


Token [ 2/58]:   1%|          | 321/50257 [00:04<11:54, 69.91it/s, Loss: 3.24e-02 - Gradient norm: 2.98e-01 - String: 12 jets]

Token ID: 472 - Grad norm: 3.64e-01 - Loss: 5.40e-02
Token ID: 20792 - Grad norm: 2.98e-01 - Loss: 3.24e-02


Token [ 2/58]:   1%|          | 336/50257 [00:04<11:58, 69.49it/s, Loss: 3.97e-02 - Gradient norm: 2.48e-01 - String: 12itt]       

Token ID: 693 - Grad norm: 4.00e-01 - Loss: 6.03e-02
Token ID: 8706 - Grad norm: 1.24e-01 - Loss: 1.41e-02
Token ID: 526 - Grad norm: 1.56e-01 - Loss: 4.89e-02
Token ID: 674 - Grad norm: 2.83e-01 - Loss: 3.97e-02
Token ID: 481 - Grad norm: 2.42e-01 - Loss: 3.68e-02
Token ID: 10701 - Grad norm: 2.58e-01 - Loss: 2.17e-02
Token ID: 43063 - Grad norm: 2.39e-01 - Loss: 3.42e-02
Token ID: 494 - Grad norm: 1.06e-01 - Loss: 1.59e-02
Token ID: 40663 - Grad norm: 2.31e-01 - Loss: 2.18e-02
Token ID: 2939 - Grad norm: 1.08e-01 - Loss: 1.72e-02
Token ID: 3389 - Grad norm: 5.95e-02 - Loss: 1.03e-02
Token ID: 715 - Grad norm: 2.48e-01 - Loss: 3.97e-02


Token [ 2/58]:   1%|          | 336/50257 [00:04<11:58, 69.49it/s, Loss: 1.22e-02 - Gradient norm: 1.11e-01 - String: 12 fifth] 

Token ID: 44714 - Grad norm: 8.85e-02 - Loss: 1.44e-02
Token ID: 8150 - Grad norm: 1.11e-01 - Loss: 1.22e-02


Token [ 2/58]:   1%|          | 350/50257 [00:05<12:01, 69.17it/s, Loss: 1.76e-02 - Gradient norm: 1.55e-01 - String: 12 red]     

Token ID: 9975 - Grad norm: 2.08e-01 - Loss: 2.22e-02
Token ID: 10018 - Grad norm: 2.18e-01 - Loss: 3.23e-02
Token ID: 549 - Grad norm: 4.08e-01 - Loss: 4.99e-02
Token ID: 2642 - Grad norm: 9.18e-02 - Loss: 1.54e-02
Token ID: 45858 - Grad norm: 3.12e-01 - Loss: 3.69e-02
Token ID: 12294 - Grad norm: 7.26e-02 - Loss: 6.41e-03
Token ID: 18539 - Grad norm: 1.23e-01 - Loss: 1.39e-02
Token ID: 36960 - Grad norm: 3.86e-01 - Loss: 6.74e-02
Token ID: 2136 - Grad norm: 1.26e-01 - Loss: 1.33e-02
Token ID: 6122 - Grad norm: 3.58e-01 - Loss: 5.30e-02
Token ID: 1978 - Grad norm: 2.09e-01 - Loss: 2.82e-02
Token ID: 2266 - Grad norm: 1.55e-01 - Loss: 1.76e-02


Token [ 2/58]:   1%|          | 350/50257 [00:05<12:01, 69.17it/s, Loss: 4.71e-02 - Gradient norm: 2.71e-01 - String: 12 avail]

Token ID: 4087 - Grad norm: 1.78e-01 - Loss: 3.15e-02
Token ID: 29107 - Grad norm: 2.71e-01 - Loss: 4.71e-02


Token [ 2/58]:   1%|          | 364/50257 [00:05<12:02, 69.03it/s, Loss: 3.57e-02 - Gradient norm: 2.85e-01 - String: 12 aer]   

Token ID: 33327 - Grad norm: 1.55e-01 - Loss: 2.02e-02
Token ID: 12187 - Grad norm: 1.57e-01 - Loss: 2.29e-02
Token ID: 783 - Grad norm: 1.39e-01 - Loss: 2.45e-02
Token ID: 46071 - Grad norm: 1.39e-01 - Loss: 1.36e-02
Token ID: 31939 - Grad norm: 2.43e-01 - Loss: 3.41e-02
Token ID: 1701 - Grad norm: 1.41e-01 - Loss: 5.03e-02
Token ID: 6029 - Grad norm: 1.63e-01 - Loss: 1.72e-02
Token ID: 14543 - Grad norm: 1.19e-01 - Loss: 1.39e-02
Token ID: 1219 - Grad norm: 1.83e-01 - Loss: 2.15e-02
Token ID: 40804 - Grad norm: 2.08e-01 - Loss: 2.68e-02
Token ID: 1472 - Grad norm: 1.39e-01 - Loss: 2.29e-02
Token ID: 9551 - Grad norm: 2.85e-01 - Loss: 3.57e-02


Token [ 2/58]:   1%|          | 364/50257 [00:05<12:02, 69.03it/s, Loss: 1.29e-02 - Gradient norm: 1.17e-01 - String: 12apes]

Token ID: 31280 - Grad norm: 2.43e-01 - Loss: 2.18e-02
Token ID: 7916 - Grad norm: 1.17e-01 - Loss: 1.29e-02


Token [ 2/58]:   1%|          | 378/50257 [00:05<12:02, 69.04it/s, Loss: 7.90e-03 - Gradient norm: 7.86e-02 - String: 12geon]    

Token ID: 47809 - Grad norm: 7.34e-02 - Loss: 4.67e-03
Token ID: 6620 - Grad norm: 1.25e-01 - Loss: 1.61e-02
Token ID: 32232 - Grad norm: 1.15e-01 - Loss: 1.07e-02
Token ID: 700 - Grad norm: 1.90e-01 - Loss: 2.37e-02
Token ID: 11025 - Grad norm: 9.72e-02 - Loss: 8.01e-03
Token ID: 11551 - Grad norm: 7.19e-02 - Loss: 1.24e-02
Token ID: 24194 - Grad norm: 1.26e-01 - Loss: 1.21e-02
Token ID: 26522 - Grad norm: 6.58e-02 - Loss: 9.16e-03
Token ID: 37037 - Grad norm: 1.09e-01 - Loss: 1.67e-02
Token ID: 6260 - Grad norm: 7.36e-02 - Loss: 1.59e-02
Token ID: 25199 - Grad norm: 2.78e-01 - Loss: 2.40e-02
Token ID: 6281 - Grad norm: 7.86e-02 - Loss: 7.90e-03


Token [ 2/58]:   1%|          | 378/50257 [00:05<12:02, 69.04it/s, Loss: 1.33e-02 - Gradient norm: 1.13e-01 - String: 12OTS] 

Token ID: 14995 - Grad norm: 1.84e-01 - Loss: 3.06e-02
Token ID: 33472 - Grad norm: 1.13e-01 - Loss: 1.33e-02


Token [ 2/58]:   1%|          | 392/50257 [00:05<12:01, 69.15it/s, Loss: 1.67e-02 - Gradient norm: 1.87e-01 - String: 12iss]         

Token ID: 34443 - Grad norm: 9.87e-02 - Loss: 9.35e-03
Token ID: 29095 - Grad norm: 1.32e-01 - Loss: 2.11e-02
Token ID: 4508 - Grad norm: 4.14e-01 - Loss: 5.95e-02
Token ID: 4164 - Grad norm: 9.27e-02 - Loss: 1.43e-02
Token ID: 20865 - Grad norm: 2.82e-01 - Loss: 4.82e-02
Token ID: 3168 - Grad norm: 1.42e-01 - Loss: 1.34e-02
Token ID: 20471 - Grad norm: 1.35e-01 - Loss: 2.54e-02
Token ID: 8950 - Grad norm: 3.19e-01 - Loss: 3.65e-02
Token ID: 611 - Grad norm: 2.59e-01 - Loss: 4.00e-02
Token ID: 1690 - Grad norm: 1.40e-01 - Loss: 1.94e-02
Token ID: 1525 - Grad norm: 1.61e-01 - Loss: 1.68e-02
Token ID: 747 - Grad norm: 1.87e-01 - Loss: 1.67e-02


Token [ 2/58]:   1%|          | 392/50257 [00:05<12:01, 69.15it/s, Loss: 2.77e-02 - Gradient norm: 2.23e-01 - String: 12bo]  

Token ID: 31110 - Grad norm: 2.90e-01 - Loss: 4.84e-02
Token ID: 2127 - Grad norm: 2.23e-01 - Loss: 2.77e-02


Token [ 2/58]:   1%|          | 406/50257 [00:05<12:09, 68.32it/s, Loss: 2.38e-02 - Gradient norm: 2.13e-01 - String: 12iol]     

Token ID: 1327 - Grad norm: 1.14e-01 - Loss: 1.22e-02
Token ID: 4733 - Grad norm: 2.61e-01 - Loss: 4.35e-02
Token ID: 3340 - Grad norm: 1.05e-01 - Loss: 1.10e-02
Token ID: 1096 - Grad norm: 8.83e-02 - Loss: 1.14e-02
Token ID: 34567 - Grad norm: 1.88e-01 - Loss: 3.52e-02
Token ID: 41779 - Grad norm: 1.25e-01 - Loss: 1.29e-02
Token ID: 17280 - Grad norm: 2.45e-01 - Loss: 2.45e-02
Token ID: 1909 - Grad norm: 2.41e-01 - Loss: 2.38e-02
Token ID: 45928 - Grad norm: 1.25e-01 - Loss: 1.20e-02
Token ID: 41070 - Grad norm: 2.20e-01 - Loss: 2.90e-02
Token ID: 20293 - Grad norm: 3.04e-01 - Loss: 2.83e-02
Token ID: 1669 - Grad norm: 2.13e-01 - Loss: 2.38e-02


Token [ 2/58]:   1%|          | 406/50257 [00:05<12:09, 68.32it/s, Loss: 1.29e-02 - Gradient norm: 7.93e-02 - String: 12orah]

Token ID: 3153 - Grad norm: 1.30e-01 - Loss: 1.36e-02
Token ID: 40844 - Grad norm: 7.93e-02 - Loss: 1.29e-02


Token [ 2/58]:   1%|          | 420/50257 [00:06<12:09, 68.34it/s, Loss: 3.30e-02 - Gradient norm: 2.52e-01 - String: 12 Friends]   

Token ID: 3997 - Grad norm: 9.58e-02 - Loss: 8.38e-03
Token ID: 28860 - Grad norm: 1.26e-01 - Loss: 8.78e-03
Token ID: 810 - Grad norm: 3.01e-01 - Loss: 4.45e-02
Token ID: 4741 - Grad norm: 2.05e-01 - Loss: 3.57e-02
Token ID: 41632 - Grad norm: 9.94e-02 - Loss: 1.03e-02
Token ID: 25855 - Grad norm: 9.59e-02 - Loss: 1.19e-02
Token ID: 7812 - Grad norm: 1.29e-01 - Loss: 1.38e-02
Token ID: 34565 - Grad norm: 1.22e-01 - Loss: 1.14e-02
Token ID: 6071 - Grad norm: 3.40e-01 - Loss: 3.73e-02
Token ID: 7737 - Grad norm: 1.78e-01 - Loss: 1.79e-02
Token ID: 11125 - Grad norm: 1.30e-01 - Loss: 1.17e-02
Token ID: 14213 - Grad norm: 2.52e-01 - Loss: 3.30e-02


Token [ 2/58]:   1%|          | 420/50257 [00:06<12:09, 68.34it/s, Loss: 6.32e-02 - Gradient norm: 3.67e-01 - String: 12spir]    

Token ID: 13199 - Grad norm: 1.70e-01 - Loss: 3.31e-02
Token ID: 71 - Grad norm: 3.02e-01 - Loss: 2.94e-02
Token ID: 45564 - Grad norm: 3.67e-01 - Loss: 6.32e-02


Token [ 2/58]:   1%|          | 435/50257 [00:06<11:58, 69.31it/s, Loss: 3.69e-02 - Gradient norm: 2.02e-01 - String: 12 what]   

Token ID: 20826 - Grad norm: 1.90e-01 - Loss: 2.49e-02
Token ID: 48864 - Grad norm: 2.28e-01 - Loss: 3.04e-02
Token ID: 1818 - Grad norm: 1.39e-01 - Loss: 2.09e-02
Token ID: 39164 - Grad norm: 8.79e-02 - Loss: 1.57e-02
Token ID: 26361 - Grad norm: 5.69e-02 - Loss: 8.52e-03
Token ID: 33934 - Grad norm: 1.47e-01 - Loss: 1.33e-02
Token ID: 22009 - Grad norm: 1.95e-01 - Loss: 2.71e-02
Token ID: 1820 - Grad norm: 2.75e-01 - Loss: 2.83e-02
Token ID: 2651 - Grad norm: 1.42e-01 - Loss: 1.83e-02
Token ID: 27045 - Grad norm: 3.76e-01 - Loss: 6.09e-02
Token ID: 1029 - Grad norm: 1.65e-01 - Loss: 1.40e-02
Token ID: 644 - Grad norm: 2.02e-01 - Loss: 3.69e-02


Token [ 2/58]:   1%|          | 435/50257 [00:06<11:58, 69.31it/s, Loss: 2.59e-02 - Gradient norm: 2.17e-01 - String: 12 mine] 

Token ID: 32555 - Grad norm: 2.97e-01 - Loss: 3.60e-02
Token ID: 44166 - Grad norm: 1.13e-01 - Loss: 2.02e-02
Token ID: 6164 - Grad norm: 2.17e-01 - Loss: 2.59e-02


Token [ 2/58]:   1%|          | 450/50257 [00:06<11:55, 69.63it/s, Loss: 3.71e-02 - Gradient norm: 2.74e-01 - String: 12pp]        

Token ID: 314 - Grad norm: 1.81e-01 - Loss: 4.14e-02
Token ID: 12334 - Grad norm: 1.80e-01 - Loss: 2.46e-02
Token ID: 7926 - Grad norm: 2.32e-01 - Loss: 2.34e-02
Token ID: 16571 - Grad norm: 1.83e-01 - Loss: 2.42e-02
Token ID: 23404 - Grad norm: 1.45e-01 - Loss: 1.70e-02
Token ID: 21062 - Grad norm: 2.04e-01 - Loss: 2.20e-02
Token ID: 30846 - Grad norm: 1.24e-01 - Loss: 1.23e-02
Token ID: 25949 - Grad norm: 1.37e-01 - Loss: 2.31e-02
Token ID: 815 - Grad norm: 3.73e-01 - Loss: 5.25e-02
Token ID: 14688 - Grad norm: 2.69e-01 - Loss: 3.52e-02
Token ID: 18049 - Grad norm: 1.31e-01 - Loss: 1.10e-02
Token ID: 381 - Grad norm: 2.74e-01 - Loss: 3.71e-02


Token [ 2/58]:   1%|          | 450/50257 [00:06<11:55, 69.63it/s, Loss: 9.00e-02 - Gradient norm: 5.54e-01 - String: 12 trigger]

Token ID: 1995 - Grad norm: 1.41e-01 - Loss: 2.60e-02
Token ID: 16390 - Grad norm: 1.85e-01 - Loss: 2.42e-02
Token ID: 7616 - Grad norm: 5.54e-01 - Loss: 9.00e-02


Token [ 2/58]:   1%|          | 465/50257 [00:06<11:53, 69.79it/s, Loss: 2.20e-02 - Gradient norm: 1.97e-01 - String: 12 crafts] 

Token ID: 10580 - Grad norm: 1.37e-01 - Loss: 2.07e-02
Token ID: 15049 - Grad norm: 1.39e-01 - Loss: 1.23e-02
Token ID: 484 - Grad norm: 1.63e-01 - Loss: 3.53e-02
Token ID: 14208 - Grad norm: 2.14e-01 - Loss: 2.66e-02
Token ID: 5675 - Grad norm: 2.11e-01 - Loss: 2.66e-02
Token ID: 11205 - Grad norm: 1.60e-01 - Loss: 2.51e-02
Token ID: 867 - Grad norm: 2.56e-01 - Loss: 2.90e-02
Token ID: 17320 - Grad norm: 3.29e-01 - Loss: 5.31e-02
Token ID: 12095 - Grad norm: 1.03e-01 - Loss: 6.96e-03
Token ID: 25236 - Grad norm: 1.66e-01 - Loss: 2.20e-02
Token ID: 8236 - Grad norm: 1.10e-01 - Loss: 1.11e-02
Token ID: 28229 - Grad norm: 1.97e-01 - Loss: 2.20e-02


Token [ 2/58]:   1%|          | 465/50257 [00:06<11:53, 69.79it/s, Loss: 1.89e-02 - Gradient norm: 1.91e-01 - String: 12oes]    

Token ID: 2928 - Grad norm: 1.72e-01 - Loss: 2.81e-02
Token ID: 1521 - Grad norm: 2.23e-01 - Loss: 4.17e-02
Token ID: 3028 - Grad norm: 1.91e-01 - Loss: 1.89e-02


Token [ 2/58]:   1%|          | 481/50257 [00:06<11:50, 70.03it/s, Loss: 2.33e-02 - Gradient norm: 1.98e-01 - String: 12 wow]        

Token ID: 22665 - Grad norm: 1.36e-01 - Loss: 1.21e-02
Token ID: 9856 - Grad norm: 1.11e-01 - Loss: 1.38e-02
Token ID: 631 - Grad norm: 2.81e-01 - Loss: 3.49e-02
Token ID: 5875 - Grad norm: 2.33e-01 - Loss: 2.37e-02
Token ID: 2364 - Grad norm: 1.20e-01 - Loss: 1.21e-02
Token ID: 377 - Grad norm: 1.83e-01 - Loss: 2.09e-02
Token ID: 4464 - Grad norm: 2.11e-01 - Loss: 2.80e-02
Token ID: 597 - Grad norm: 1.56e-01 - Loss: 2.27e-02
Token ID: 14031 - Grad norm: 3.06e-01 - Loss: 3.62e-02
Token ID: 5861 - Grad norm: 1.46e-01 - Loss: 2.34e-02
Token ID: 6350 - Grad norm: 2.97e-01 - Loss: 4.67e-02
Token ID: 28796 - Grad norm: 1.98e-01 - Loss: 2.33e-02


Token [ 2/58]:   1%|          | 481/50257 [00:06<11:50, 70.03it/s, Loss: 2.69e-02 - Gradient norm: 1.89e-01 - String: 12ind]    

Token ID: 4171 - Grad norm: 1.47e-01 - Loss: 1.53e-02
Token ID: 3387 - Grad norm: 2.42e-01 - Loss: 3.09e-02
Token ID: 521 - Grad norm: 1.89e-01 - Loss: 2.69e-02


Token [ 2/58]:   1%|          | 497/50257 [00:07<11:51, 69.96it/s, Loss: 1.71e-02 - Gradient norm: 1.62e-01 - String: 12 abstract]   

Token ID: 2084 - Grad norm: 2.51e-01 - Loss: 4.83e-02
Token ID: 516 - Grad norm: 1.79e-01 - Loss: 1.85e-02
Token ID: 27337 - Grad norm: 9.52e-02 - Loss: 1.16e-02
Token ID: 20760 - Grad norm: 2.71e-01 - Loss: 4.46e-02
Token ID: 8957 - Grad norm: 1.64e-01 - Loss: 2.24e-02
Token ID: 30070 - Grad norm: 1.49e-01 - Loss: 2.08e-02
Token ID: 4107 - Grad norm: 7.53e-02 - Loss: 1.27e-02
Token ID: 3299 - Grad norm: 1.84e-01 - Loss: 2.03e-02
Token ID: 517 - Grad norm: 1.86e-01 - Loss: 2.34e-02
Token ID: 16122 - Grad norm: 2.00e-01 - Loss: 2.93e-02
Token ID: 12337 - Grad norm: 1.99e-01 - Loss: 1.21e-02
Token ID: 12531 - Grad norm: 1.62e-01 - Loss: 1.71e-02


Token [ 2/58]:   1%|          | 497/50257 [00:07<11:51, 69.96it/s, Loss: 3.22e-02 - Gradient norm: 2.51e-01 - String: 12 Sad]     

Token ID: 2571 - Grad norm: 2.04e-01 - Loss: 2.65e-02
Token ID: 15095 - Grad norm: 1.21e-01 - Loss: 1.77e-02
Token ID: 14668 - Grad norm: 2.51e-01 - Loss: 3.22e-02


Token [ 2/58]:   1%|          | 513/50257 [00:07<11:50, 70.05it/s, Loss: 3.16e-02 - Gradient norm: 1.80e-01 - String: 12brush]     

Token ID: 10490 - Grad norm: 2.17e-01 - Loss: 3.06e-02
Token ID: 18088 - Grad norm: 7.66e-02 - Loss: 9.56e-03
Token ID: 17608 - Grad norm: 1.93e-01 - Loss: 2.98e-02
Token ID: 19528 - Grad norm: 7.53e-02 - Loss: 1.07e-02
Token ID: 75 - Grad norm: 1.93e-01 - Loss: 2.16e-02
Token ID: 1257 - Grad norm: 1.05e-01 - Loss: 1.45e-02
Token ID: 7872 - Grad norm: 1.55e-01 - Loss: 1.68e-02
Token ID: 47534 - Grad norm: 2.88e-01 - Loss: 4.37e-02
Token ID: 8447 - Grad norm: 1.69e-01 - Loss: 2.13e-02
Token ID: 12644 - Grad norm: 1.96e-01 - Loss: 1.85e-02
Token ID: 35379 - Grad norm: 1.51e-01 - Loss: 4.59e-02
Token ID: 32680 - Grad norm: 1.80e-01 - Loss: 3.16e-02


Token [ 2/58]:   1%|          | 513/50257 [00:07<11:50, 70.05it/s, Loss: 1.46e-02 - Gradient norm: 1.28e-01 - String: 12 misunderstand]

Token ID: 49296 - Grad norm: 2.02e-01 - Loss: 4.83e-02
Token ID: 27089 - Grad norm: 1.69e-01 - Loss: 2.40e-02
Token ID: 39653 - Grad norm: 1.28e-01 - Loss: 1.46e-02


Token [ 2/58]:   1%|          | 529/50257 [00:07<11:47, 70.34it/s, Loss: 1.17e-02 - Gradient norm: 1.15e-01 - String: 12sounding]      

Token ID: 13165 - Grad norm: 7.24e-02 - Loss: 1.28e-02
Token ID: 15027 - Grad norm: 1.17e-01 - Loss: 2.07e-02
Token ID: 27232 - Grad norm: 1.96e-01 - Loss: 3.20e-02
Token ID: 5362 - Grad norm: 2.82e-01 - Loss: 4.66e-02
Token ID: 7373 - Grad norm: 1.97e-01 - Loss: 2.30e-02
Token ID: 3378 - Grad norm: 3.08e-01 - Loss: 2.83e-02
Token ID: 4325 - Grad norm: 1.73e-01 - Loss: 2.01e-02
Token ID: 220 - Grad norm: 3.18e-01 - Loss: 4.58e-02
Token ID: 39433 - Grad norm: 9.27e-02 - Loss: 1.86e-02
Token ID: 21130 - Grad norm: 2.65e-01 - Loss: 3.01e-02
Token ID: 37989 - Grad norm: 1.99e-01 - Loss: 3.11e-02
Token ID: 39686 - Grad norm: 1.15e-01 - Loss: 1.17e-02


Token [ 2/58]:   1%|          | 529/50257 [00:07<11:47, 70.34it/s, Loss: 2.74e-02 - Gradient norm: 1.71e-01 - String: 12 diary]  

Token ID: 19363 - Grad norm: 2.82e-01 - Loss: 3.71e-02
Token ID: 27069 - Grad norm: 1.21e-01 - Loss: 1.28e-02
Token ID: 26339 - Grad norm: 1.71e-01 - Loss: 2.74e-02


Token [ 2/58]:   1%|          | 537/50257 [00:07<11:45, 70.46it/s, Loss: 3.03e-02 - Gradient norm: 1.85e-01 - String: 12rier]     

Token ID: 13713 - Grad norm: 1.63e-01 - Loss: 2.46e-02
Token ID: 12356 - Grad norm: 1.13e-01 - Loss: 1.06e-02
Token ID: 1110 - Grad norm: 1.90e-01 - Loss: 3.01e-02
Token ID: 3020 - Grad norm: 1.65e-01 - Loss: 2.46e-02
Token ID: 1285 - Grad norm: 1.74e-01 - Loss: 2.32e-02
Token ID: 5233 - Grad norm: 1.22e-01 - Loss: 1.13e-02
Token ID: 4286 - Grad norm: 1.68e-01 - Loss: 2.14e-02
Token ID: 49415 - Grad norm: 1.66e-01 - Loss: 2.12e-02
Token ID: 8800 - Grad norm: 1.02e-01 - Loss: 1.34e-02
Token ID: 4082 - Grad norm: 1.49e-01 - Loss: 2.68e-02
Token ID: 12406 - Grad norm: 1.30e-01 - Loss: 2.10e-02
Token ID: 5277 - Grad norm: 1.85e-01 - Loss: 3.03e-02


Token [ 2/58]:   1%|          | 545/50257 [00:07<11:46, 70.34it/s, Loss: 2.80e-02 - Gradient norm: 1.73e-01 - String: 12 Look]  

Token ID: 48406 - Grad norm: 2.68e-01 - Loss: 4.56e-02
Token ID: 1576 - Grad norm: 1.11e-01 - Loss: 2.10e-02
Token ID: 6803 - Grad norm: 1.73e-01 - Loss: 2.80e-02


Token [ 2/58]:   1%|          | 553/50257 [00:08<11:46, 70.33it/s, Loss: 1.65e-02 - Gradient norm: 9.60e-02 - String: 12 Pizza]    

Token ID: 3923 - Grad norm: 1.23e-01 - Loss: 1.86e-02
Token ID: 777 - Grad norm: 1.38e-01 - Loss: 2.09e-02
Token ID: 9573 - Grad norm: 1.81e-01 - Loss: 1.90e-02
Token ID: 1477 - Grad norm: 1.95e-01 - Loss: 2.63e-02
Token ID: 717 - Grad norm: 2.11e-01 - Loss: 2.17e-02
Token ID: 5436 - Grad norm: 1.28e-01 - Loss: 2.64e-02
Token ID: 40754 - Grad norm: 2.04e-01 - Loss: 4.70e-02
Token ID: 44408 - Grad norm: 2.71e-01 - Loss: 3.75e-02
Token ID: 37259 - Grad norm: 3.02e-01 - Loss: 2.38e-02
Token ID: 1173 - Grad norm: 4.79e-01 - Loss: 9.02e-02
Token ID: 1600 - Grad norm: 2.56e-01 - Loss: 5.35e-02
Token ID: 20952 - Grad norm: 9.60e-02 - Loss: 1.65e-02


Token [ 2/58]:   1%|          | 561/50257 [00:08<11:46, 70.36it/s, Loss: 2.14e-02 - Gradient norm: 1.72e-01 - String: 12Ros]     

Token ID: 10105 - Grad norm: 1.43e-01 - Loss: 1.93e-02
Token ID: 4032 - Grad norm: 1.99e-01 - Loss: 4.67e-02
Token ID: 35740 - Grad norm: 1.72e-01 - Loss: 2.14e-02


Token [ 2/58]:   1%|          | 569/50257 [00:08<11:47, 70.23it/s, Loss: 1.55e-02 - Gradient norm: 2.36e-01 - String: 12ake]      

Token ID: 17703 - Grad norm: 1.96e-01 - Loss: 3.23e-02
Token ID: 8314 - Grad norm: 2.47e-01 - Loss: 4.09e-02
Token ID: 46591 - Grad norm: 9.23e-02 - Loss: 1.30e-02
Token ID: 420 - Grad norm: 3.84e-01 - Loss: 5.83e-02
Token ID: 12875 - Grad norm: 1.26e-01 - Loss: 1.13e-02
Token ID: 1680 - Grad norm: 2.33e-01 - Loss: 3.97e-02
Token ID: 9178 - Grad norm: 1.91e-01 - Loss: 2.03e-02
Token ID: 7983 - Grad norm: 1.33e-01 - Loss: 1.82e-02
Token ID: 375 - Grad norm: 2.66e-01 - Loss: 3.04e-02
Token ID: 27460 - Grad norm: 2.63e-01 - Loss: 3.14e-02
Token ID: 1944 - Grad norm: 1.92e-01 - Loss: 1.75e-02
Token ID: 539 - Grad norm: 2.36e-01 - Loss: 1.55e-02


Token [ 2/58]:   1%|          | 577/50257 [00:08<11:44, 70.53it/s, Loss: 3.75e-02 - Gradient norm: 2.90e-01 - String: 12 would]

Token ID: 22874 - Grad norm: 1.47e-01 - Loss: 2.14e-02
Token ID: 565 - Grad norm: 2.52e-01 - Loss: 3.30e-02
Token ID: 561 - Grad norm: 2.90e-01 - Loss: 3.75e-02


Token [ 2/58]:   1%|          | 585/50257 [00:08<11:45, 70.36it/s, Loss: 2.46e-02 - Gradient norm: 1.35e-01 - String: 12 Majesty]

Token ID: 40368 - Grad norm: 1.10e-01 - Loss: 1.32e-02
Token ID: 14235 - Grad norm: 2.52e-01 - Loss: 3.94e-02
Token ID: 14127 - Grad norm: 8.15e-02 - Loss: 8.28e-03
Token ID: 785 - Grad norm: 1.84e-01 - Loss: 2.92e-02
Token ID: 4279 - Grad norm: 1.11e-01 - Loss: 1.29e-02
Token ID: 27235 - Grad norm: 1.39e-01 - Loss: 1.75e-02
Token ID: 1123 - Grad norm: 1.83e-01 - Loss: 3.00e-02
Token ID: 20377 - Grad norm: 1.69e-01 - Loss: 1.80e-02
Token ID: 4035 - Grad norm: 4.11e-01 - Loss: 8.28e-02
Token ID: 15451 - Grad norm: 1.05e-01 - Loss: 1.19e-02
Token ID: 3379 - Grad norm: 8.63e-02 - Loss: 1.22e-02
Token ID: 25788 - Grad norm: 1.35e-01 - Loss: 2.46e-02


Token [ 2/58]:   1%|          | 585/50257 [00:08<11:45, 70.36it/s, Loss: 4.65e-02 - Gradient norm: 2.72e-01 - String: 12ott]     

Token ID: 36597 - Grad norm: 8.84e-02 - Loss: 1.08e-02
Token ID: 947 - Grad norm: 2.28e-01 - Loss: 2.48e-02
Token ID: 1252 - Grad norm: 2.72e-01 - Loss: 4.65e-02


Token [ 2/58]:   1%|          | 601/50257 [00:08<11:46, 70.24it/s, Loss: 2.79e-02 - Gradient norm: 2.08e-01 - String: 12Deal]        

Token ID: 45377 - Grad norm: 1.98e-01 - Loss: 2.84e-02
Token ID: 37830 - Grad norm: 1.41e-01 - Loss: 2.00e-02
Token ID: 11003 - Grad norm: 1.60e-01 - Loss: 2.04e-02
Token ID: 12177 - Grad norm: 6.94e-02 - Loss: 1.00e-02
Token ID: 46297 - Grad norm: 1.08e-01 - Loss: 1.62e-02
Token ID: 25907 - Grad norm: 1.40e-01 - Loss: 1.03e-02
Token ID: 12255 - Grad norm: 7.44e-02 - Loss: 5.49e-03
Token ID: 7639 - Grad norm: 2.22e-01 - Loss: 2.60e-02
Token ID: 48424 - Grad norm: 1.29e-01 - Loss: 1.66e-02
Token ID: 25007 - Grad norm: 1.53e-01 - Loss: 1.59e-02
Token ID: 23126 - Grad norm: 3.07e-01 - Loss: 4.68e-02
Token ID: 45776 - Grad norm: 2.08e-01 - Loss: 2.79e-02


Token [ 2/58]:   1%|          | 601/50257 [00:08<11:46, 70.24it/s, Loss: 1.49e-02 - Gradient norm: 1.53e-01 - String: 12ovy] 

Token ID: 37370 - Grad norm: 1.57e-01 - Loss: 2.37e-02
Token ID: 1849 - Grad norm: 2.05e-01 - Loss: 2.54e-02
Token ID: 27796 - Grad norm: 1.53e-01 - Loss: 1.49e-02


Token [ 2/58]:   1%|          | 617/50257 [00:08<11:46, 70.27it/s, Loss: 1.91e-02 - Gradient norm: 1.29e-01 - String: 12 worse]     

Token ID: 13445 - Grad norm: 5.18e-02 - Loss: 3.42e-03
Token ID: 32423 - Grad norm: 1.22e-01 - Loss: 1.88e-02
Token ID: 7277 - Grad norm: 1.44e-01 - Loss: 1.58e-02
Token ID: 2035 - Grad norm: 1.63e-01 - Loss: 2.31e-02
Token ID: 1917 - Grad norm: 1.31e-01 - Loss: 1.90e-02
Token ID: 12802 - Grad norm: 1.81e-01 - Loss: 2.23e-02
Token ID: 20793 - Grad norm: 2.03e-01 - Loss: 2.02e-02
Token ID: 13202 - Grad norm: 1.01e-01 - Loss: 1.69e-02
Token ID: 45636 - Grad norm: 3.86e-01 - Loss: 7.54e-02
Token ID: 7919 - Grad norm: 1.79e-01 - Loss: 1.73e-02
Token ID: 10647 - Grad norm: 1.52e-01 - Loss: 2.29e-02
Token ID: 4785 - Grad norm: 1.29e-01 - Loss: 1.91e-02


Token [ 2/58]:   1%|          | 617/50257 [00:08<11:46, 70.27it/s, Loss: 1.39e-02 - Gradient norm: 2.38e-01 - String: 12ipe]   

Token ID: 1612 - Grad norm: 9.96e-02 - Loss: 1.35e-02
Token ID: 2081 - Grad norm: 1.36e-01 - Loss: 1.41e-02
Token ID: 3757 - Grad norm: 2.38e-01 - Loss: 1.39e-02


Token [ 2/58]:   1%|▏         | 633/50257 [00:09<11:44, 70.49it/s, Loss: 1.85e-02 - Gradient norm: 1.93e-01 - String: 12rick]     

Token ID: 385 - Grad norm: 2.44e-01 - Loss: 2.66e-02
Token ID: 39921 - Grad norm: 2.12e-01 - Loss: 2.87e-02
Token ID: 13407 - Grad norm: 8.13e-02 - Loss: 5.47e-03
Token ID: 46810 - Grad norm: 9.06e-02 - Loss: 7.99e-03
Token ID: 40900 - Grad norm: 2.11e-01 - Loss: 2.58e-02
Token ID: 1031 - Grad norm: 2.52e-01 - Loss: 3.17e-02
Token ID: 788 - Grad norm: 2.26e-01 - Loss: 3.95e-02
Token ID: 28836 - Grad norm: 2.92e-01 - Loss: 3.89e-02
Token ID: 7923 - Grad norm: 2.30e-01 - Loss: 2.88e-02
Token ID: 10968 - Grad norm: 2.31e-01 - Loss: 2.33e-02
Token ID: 46510 - Grad norm: 7.49e-02 - Loss: 5.82e-03
Token ID: 5557 - Grad norm: 1.93e-01 - Loss: 1.85e-02


Token [ 2/58]:   1%|▏         | 633/50257 [00:09<11:44, 70.49it/s, Loss: 2.25e-02 - Gradient norm: 1.39e-01 - String: 12based] 

Token ID: 12571 - Grad norm: 1.02e-01 - Loss: 1.20e-02
Token ID: 21094 - Grad norm: 1.28e-01 - Loss: 1.26e-02
Token ID: 3106 - Grad norm: 1.39e-01 - Loss: 2.25e-02


Token [ 2/58]:   1%|▏         | 649/50257 [00:09<11:45, 70.29it/s, Loss: 1.54e-02 - Gradient norm: 9.49e-02 - String: 12 Salad]   

Token ID: 1941 - Grad norm: 3.11e-01 - Loss: 3.93e-02
Token ID: 38563 - Grad norm: 2.99e-01 - Loss: 3.26e-02
Token ID: 17478 - Grad norm: 8.16e-02 - Loss: 9.90e-03
Token ID: 11815 - Grad norm: 1.28e-01 - Loss: 1.84e-02
Token ID: 30477 - Grad norm: 2.46e-01 - Loss: 2.75e-02
Token ID: 3118 - Grad norm: 3.37e-01 - Loss: 5.41e-02
Token ID: 692 - Grad norm: 1.96e-01 - Loss: 3.52e-02
Token ID: 20698 - Grad norm: 2.09e-01 - Loss: 2.77e-02
Token ID: 16576 - Grad norm: 1.30e-01 - Loss: 1.45e-02
Token ID: 11722 - Grad norm: 1.06e-01 - Loss: 9.49e-03
Token ID: 22502 - Grad norm: 2.43e-01 - Loss: 3.15e-02
Token ID: 42706 - Grad norm: 9.49e-02 - Loss: 1.54e-02


Token [ 2/58]:   1%|▏         | 649/50257 [00:09<11:45, 70.29it/s, Loss: 1.48e-02 - Gradient norm: 8.80e-02 - String: 12aha]   

Token ID: 606 - Grad norm: 1.65e-01 - Loss: 2.43e-02
Token ID: 35461 - Grad norm: 1.15e-01 - Loss: 1.33e-02
Token ID: 12236 - Grad norm: 8.80e-02 - Loss: 1.48e-02


Token [ 2/58]:   1%|▏         | 657/50257 [00:09<11:45, 70.33it/s, Loss: 1.67e-02 - Gradient norm: 2.09e-01 - String: 12ken]        

Token ID: 21813 - Grad norm: 1.70e-01 - Loss: 1.26e-02
Token ID: 248 - Grad norm: 1.61e-01 - Loss: 1.50e-02
Token ID: 569 - Grad norm: 1.54e-01 - Loss: 2.50e-02
Token ID: 26404 - Grad norm: 1.35e-01 - Loss: 2.44e-02
Token ID: 12838 - Grad norm: 1.57e-01 - Loss: 1.88e-02
Token ID: 8756 - Grad norm: 3.06e-01 - Loss: 3.31e-02
Token ID: 19462 - Grad norm: 1.79e-01 - Loss: 2.43e-02
Token ID: 2302 - Grad norm: 1.75e-01 - Loss: 3.55e-02
Token ID: 13394 - Grad norm: 2.88e-01 - Loss: 2.83e-02
Token ID: 16627 - Grad norm: 2.50e-01 - Loss: 4.42e-02
Token ID: 3464 - Grad norm: 2.09e-01 - Loss: 1.67e-02


Token [ 2/58]:   1%|▏         | 665/50257 [00:09<11:59, 68.89it/s, Loss: 1.29e-02 - Gradient norm: 1.41e-01 - String: 12eport]

Token ID: 927 - Grad norm: 1.52e-01 - Loss: 1.47e-02
Token ID: 1930 - Grad norm: 5.61e-02 - Loss: 8.53e-03
Token ID: 45813 - Grad norm: 1.41e-01 - Loss: 1.29e-02


Token [ 2/58]:   1%|▏         | 672/50257 [00:09<12:21, 66.91it/s, Loss: 2.18e-02 - Gradient norm: 1.44e-01 - String: 12 rather]      

Token ID: 534 - Grad norm: 2.63e-01 - Loss: 3.38e-02
Token ID: 16413 - Grad norm: 2.51e-01 - Loss: 2.93e-02
Token ID: 16658 - Grad norm: 6.07e-02 - Loss: 5.36e-03
Token ID: 6658 - Grad norm: 2.29e-01 - Loss: 2.98e-02
Token ID: 47852 - Grad norm: 2.64e-01 - Loss: 3.57e-02
Token ID: 329 - Grad norm: 2.92e-01 - Loss: 3.95e-02
Token ID: 35303 - Grad norm: 2.30e-01 - Loss: 1.59e-02
Token ID: 428 - Grad norm: 2.26e-01 - Loss: 2.56e-02
Token ID: 4162 - Grad norm: 1.88e-01 - Loss: 3.70e-02
Token ID: 2138 - Grad norm: 1.44e-01 - Loss: 2.18e-02


Token [ 2/58]:   1%|▏         | 679/50257 [00:09<12:37, 65.45it/s, Loss: 4.25e-02 - Gradient norm: 3.00e-01 - String: 12 Lily]      

Token ID: 9955 - Grad norm: 1.51e-01 - Loss: 2.34e-02
Token ID: 15392 - Grad norm: 1.72e-01 - Loss: 2.01e-02
Token ID: 20037 - Grad norm: 3.00e-01 - Loss: 4.25e-02


Token [ 2/58]:   1%|▏         | 686/50257 [00:09<12:50, 64.35it/s, Loss: 1.61e-02 - Gradient norm: 9.67e-02 - String: 12 math]   

Token ID: 2088 - Grad norm: 2.04e-01 - Loss: 2.85e-02
Token ID: 756 - Grad norm: 2.45e-01 - Loss: 2.95e-02
Token ID: 2582 - Grad norm: 1.86e-01 - Loss: 2.66e-02
Token ID: 14178 - Grad norm: 9.41e-02 - Loss: 1.29e-02
Token ID: 1865 - Grad norm: 2.40e-01 - Loss: 2.89e-02
Token ID: 30510 - Grad norm: 1.59e-01 - Loss: 1.47e-02
Token ID: 384 - Grad norm: 3.10e-01 - Loss: 4.62e-02
Token ID: 2705 - Grad norm: 1.75e-01 - Loss: 1.52e-02
Token ID: 1771 - Grad norm: 1.85e-01 - Loss: 2.91e-02
Token ID: 10688 - Grad norm: 9.67e-02 - Loss: 1.61e-02


Token [ 2/58]:   1%|▏         | 686/50257 [00:09<12:50, 64.35it/s, Loss: 2.84e-02 - Gradient norm: 1.28e-01 - String: 12 how]  

Token ID: 34046 - Grad norm: 2.87e-01 - Loss: 4.38e-02
Token ID: 31888 - Grad norm: 3.14e-01 - Loss: 3.31e-02
Token ID: 703 - Grad norm: 1.28e-01 - Loss: 2.84e-02


Token [ 2/58]:   1%|▏         | 700/50257 [00:10<12:59, 63.58it/s, Loss: 2.52e-02 - Gradient norm: 1.88e-01 - String: 12 cent]   

Token ID: 284 - Grad norm: 1.56e-01 - Loss: 3.58e-02
Token ID: 16214 - Grad norm: 1.22e-01 - Loss: 1.93e-02
Token ID: 1881 - Grad norm: 2.11e-01 - Loss: 4.33e-02
Token ID: 24876 - Grad norm: 7.32e-02 - Loss: 7.04e-03
Token ID: 1557 - Grad norm: 2.44e-01 - Loss: 2.98e-02
Token ID: 19621 - Grad norm: 4.66e-01 - Loss: 8.73e-02
Token ID: 17352 - Grad norm: 2.11e-01 - Loss: 2.77e-02
Token ID: 1263 - Grad norm: 2.88e-01 - Loss: 2.55e-02
Token ID: 21398 - Grad norm: 9.64e-02 - Loss: 1.30e-02
Token ID: 1247 - Grad norm: 1.88e-01 - Loss: 2.52e-02


Token [ 2/58]:   1%|▏         | 700/50257 [00:10<12:59, 63.58it/s, Loss: 3.71e-02 - Gradient norm: 1.91e-01 - String: 12 anywhere]

Token ID: 12277 - Grad norm: 1.73e-01 - Loss: 2.07e-02
Token ID: 1997 - Grad norm: 1.73e-01 - Loss: 2.98e-02
Token ID: 6609 - Grad norm: 1.91e-01 - Loss: 3.71e-02


Token [ 2/58]:   1%|▏         | 714/50257 [00:10<12:39, 65.26it/s, Loss: 1.28e-02 - Gradient norm: 8.97e-02 - String: 12ette]      

Token ID: 19288 - Grad norm: 8.69e-02 - Loss: 1.32e-02
Token ID: 28527 - Grad norm: 1.97e-01 - Loss: 2.57e-02
Token ID: 20935 - Grad norm: 3.05e-01 - Loss: 4.11e-02
Token ID: 8116 - Grad norm: 1.21e-01 - Loss: 1.62e-02
Token ID: 30045 - Grad norm: 1.33e-01 - Loss: 1.73e-02
Token ID: 4222 - Grad norm: 1.78e-01 - Loss: 3.76e-02
Token ID: 32268 - Grad norm: 1.96e-01 - Loss: 2.77e-02
Token ID: 22788 - Grad norm: 1.71e-01 - Loss: 2.45e-02
Token ID: 11683 - Grad norm: 7.04e-02 - Loss: 8.71e-03
Token ID: 2927 - Grad norm: 2.65e-01 - Loss: 3.69e-02
Token ID: 5857 - Grad norm: 8.97e-02 - Loss: 1.28e-02


Token [ 2/58]:   1%|▏         | 714/50257 [00:10<12:39, 65.26it/s, Loss: 2.28e-02 - Gradient norm: 1.74e-01 - String: 12v]     

Token ID: 1239 - Grad norm: 1.82e-01 - Loss: 3.42e-02
Token ID: 16002 - Grad norm: 1.21e-01 - Loss: 1.34e-02
Token ID: 85 - Grad norm: 1.74e-01 - Loss: 2.28e-02


Token [ 2/58]:   1%|▏         | 728/50257 [00:10<12:18, 67.11it/s, Loss: 3.83e-02 - Gradient norm: 1.95e-01 - String: 12 seventeen]

Token ID: 7935 - Grad norm: 7.50e-02 - Loss: 1.14e-02
Token ID: 43123 - Grad norm: 9.62e-02 - Loss: 1.18e-02
Token ID: 7173 - Grad norm: 1.90e-01 - Loss: 2.24e-02
Token ID: 8984 - Grad norm: 1.42e-01 - Loss: 2.10e-02
Token ID: 9928 - Grad norm: 1.49e-01 - Loss: 1.21e-02
Token ID: 49556 - Grad norm: 2.24e-01 - Loss: 3.24e-02
Token ID: 38338 - Grad norm: 1.46e-01 - Loss: 2.07e-02
Token ID: 24280 - Grad norm: 2.01e-01 - Loss: 3.67e-02
Token ID: 20780 - Grad norm: 2.82e-01 - Loss: 3.68e-02
Token ID: 16496 - Grad norm: 7.83e-02 - Loss: 6.81e-03
Token ID: 38741 - Grad norm: 1.95e-01 - Loss: 3.83e-02


Token [ 2/58]:   1%|▏         | 728/50257 [00:10<12:18, 67.11it/s, Loss: 3.84e-02 - Gradient norm: 2.79e-01 - String: 12bek]       

Token ID: 15776 - Grad norm: 9.17e-02 - Loss: 1.10e-02
Token ID: 78 - Grad norm: 2.00e-01 - Loss: 1.87e-02
Token ID: 47853 - Grad norm: 7.99e-02 - Loss: 7.17e-03
Token ID: 47083 - Grad norm: 2.79e-01 - Loss: 3.84e-02


Token [ 2/58]:   1%|▏         | 742/50257 [00:10<12:03, 68.39it/s, Loss: 1.97e-02 - Gradient norm: 1.35e-01 - String: 12ipment]    

Token ID: 31680 - Grad norm: 7.20e-02 - Loss: 1.05e-02
Token ID: 15441 - Grad norm: 1.62e-01 - Loss: 1.88e-02
Token ID: 13408 - Grad norm: 2.15e-01 - Loss: 2.58e-02
Token ID: 19646 - Grad norm: 8.35e-02 - Loss: 1.07e-02
Token ID: 50042 - Grad norm: 1.03e-01 - Loss: 1.18e-02
Token ID: 5268 - Grad norm: 1.61e-01 - Loss: 4.00e-02
Token ID: 1755 - Grad norm: 3.42e-01 - Loss: 3.35e-02
Token ID: 14673 - Grad norm: 1.83e-01 - Loss: 2.31e-02
Token ID: 9979 - Grad norm: 1.55e-01 - Loss: 1.60e-02
Token ID: 13760 - Grad norm: 4.91e-02 - Loss: 5.36e-03
Token ID: 4667 - Grad norm: 1.35e-01 - Loss: 1.97e-02


Token [ 2/58]:   1%|▏         | 742/50257 [00:10<12:03, 68.39it/s, Loss: 1.80e-02 - Gradient norm: 1.44e-01 - String: 12grey]   

Token ID: 6802 - Grad norm: 5.18e-02 - Loss: 1.42e-02
Token ID: 3366 - Grad norm: 1.60e-01 - Loss: 2.00e-02
Token ID: 49502 - Grad norm: 1.44e-01 - Loss: 1.80e-02


Token [ 2/58]:   2%|▏         | 756/50257 [00:10<12:13, 67.51it/s, Loss: 6.81e-03 - Gradient norm: 8.53e-02 - String: 12iably]      

Token ID: 37615 - Grad norm: 2.71e-01 - Loss: 2.73e-02
Token ID: 41800 - Grad norm: 1.77e-01 - Loss: 2.34e-02
Token ID: 6051 - Grad norm: 1.44e-01 - Loss: 2.06e-02
Token ID: 12770 - Grad norm: 2.96e-01 - Loss: 3.28e-02
Token ID: 24757 - Grad norm: 7.26e-02 - Loss: 8.65e-03
Token ID: 20308 - Grad norm: 3.43e-01 - Loss: 2.91e-02
Token ID: 13099 - Grad norm: 2.02e-01 - Loss: 2.21e-02
Token ID: 39993 - Grad norm: 1.91e-01 - Loss: 2.03e-02
Token ID: 6016 - Grad norm: 2.48e-01 - Loss: 1.81e-02
Token ID: 27004 - Grad norm: 2.81e-01 - Loss: 2.99e-02
Token ID: 18745 - Grad norm: 8.53e-02 - Loss: 6.81e-03


Token [ 2/58]:   2%|▏         | 756/50257 [00:11<12:13, 67.51it/s, Loss: 1.27e-02 - Gradient norm: 1.65e-01 - String: 12onde] 

Token ID: 2637 - Grad norm: 1.47e-01 - Loss: 3.89e-02
Token ID: 21042 - Grad norm: 2.39e-01 - Loss: 2.32e-02
Token ID: 14378 - Grad norm: 1.65e-01 - Loss: 1.27e-02


Token [ 2/58]:   2%|▏         | 770/50257 [00:11<12:07, 68.02it/s, Loss: 2.09e-02 - Gradient norm: 1.35e-01 - String: 12 difference]

Token ID: 7267 - Grad norm: 2.31e-01 - Loss: 3.00e-02
Token ID: 5024 - Grad norm: 1.01e-01 - Loss: 8.47e-03
Token ID: 10989 - Grad norm: 1.12e-01 - Loss: 1.48e-02
Token ID: 50133 - Grad norm: 3.98e-01 - Loss: 6.43e-02
Token ID: 48343 - Grad norm: 1.01e-01 - Loss: 2.37e-02
Token ID: 1140 - Grad norm: 1.43e-01 - Loss: 1.90e-02
Token ID: 30188 - Grad norm: 7.78e-02 - Loss: 9.89e-03
Token ID: 9962 - Grad norm: 2.71e-01 - Loss: 5.13e-02
Token ID: 16416 - Grad norm: 2.30e-01 - Loss: 3.28e-02
Token ID: 5289 - Grad norm: 1.39e-01 - Loss: 2.47e-02
Token ID: 3580 - Grad norm: 1.35e-01 - Loss: 2.09e-02


Token [ 2/58]:   2%|▏         | 770/50257 [00:11<12:07, 68.02it/s, Loss: 2.48e-02 - Gradient norm: 2.18e-01 - String: 12 enthusi]   

Token ID: 9313 - Grad norm: 1.86e-01 - Loss: 4.81e-02
Token ID: 1127 - Grad norm: 3.41e-01 - Loss: 3.81e-02
Token ID: 11273 - Grad norm: 2.18e-01 - Loss: 2.48e-02


Token [ 2/58]:   2%|▏         | 784/50257 [00:11<12:00, 68.65it/s, Loss: 2.35e-02 - Gradient norm: 2.27e-01 - String: 12 twenty]     

Token ID: 4033 - Grad norm: 2.99e-01 - Loss: 5.56e-02
Token ID: 508 - Grad norm: 1.89e-01 - Loss: 3.65e-02
Token ID: 36684 - Grad norm: 2.63e-01 - Loss: 4.97e-02
Token ID: 10426 - Grad norm: 7.57e-02 - Loss: 9.64e-03
Token ID: 23225 - Grad norm: 1.33e-01 - Loss: 2.23e-02
Token ID: 11057 - Grad norm: 1.11e-01 - Loss: 1.57e-02
Token ID: 1653 - Grad norm: 1.23e-01 - Loss: 1.96e-02
Token ID: 804 - Grad norm: 3.94e-01 - Loss: 5.54e-02
Token ID: 36811 - Grad norm: 1.54e-01 - Loss: 3.16e-02
Token ID: 3493 - Grad norm: 1.17e-01 - Loss: 2.08e-02
Token ID: 49276 - Grad norm: 1.56e-01 - Loss: 1.93e-02
Token ID: 8208 - Grad norm: 2.27e-01 - Loss: 2.35e-02


Token [ 2/58]:   2%|▏         | 784/50257 [00:11<12:00, 68.65it/s, Loss: 2.55e-02 - Gradient norm: 2.07e-01 - String: 12ooters] 

Token ID: 20079 - Grad norm: 2.43e-01 - Loss: 3.33e-02
Token ID: 10872 - Grad norm: 2.07e-01 - Loss: 2.47e-02
Token ID: 48316 - Grad norm: 2.07e-01 - Loss: 2.55e-02


Token [ 2/58]:   2%|▏         | 793/50257 [00:11<11:55, 69.16it/s, Loss: 2.66e-02 - Gradient norm: 1.46e-01 - String: 12die]       


Token ID: 7868 - Grad norm: 6.14e-02 - Loss: 6.26e-03
Token ID: 11979 - Grad norm: 1.46e-01 - Loss: 2.66e-02


KeyboardInterrupt: 

# Dataset 

In [None]:
# Load the "train" split of the "bookcorpus" dataset; the bare `dataset`
# expression on the last line shows its summary as the cell output.
dataset = load_dataset("bookcorpus", split="train")
dataset

In [None]:
# Print the first five records of the corpus as a quick sanity check.
for i in range(5):
    sentence = dataset[i]["text"]
    # One print call, two lines of output: the index, then the sentence.
    print(f"{i= } ", f"Sentence: {sentence}", sep="\n")

In [None]:
seed = 42
set_seed(seed)
N = 100  # number of sentences to process
# Draw a reproducible random subset of N sentences. The shuffle reuses `seed`
# (instead of a second hard-coded 42) so that changing the seed above actually
# changes the sampled subset.
sentences = dataset.shuffle(seed=seed).select(range(N))['text']

In [None]:
import random
from datasets import Dataset
from typing import List, Dict
import numpy as np

# Take the first 1,000,000 rows of `dataset`; the slice is indexed by column name below.
subdataset = dataset[:1000000]
# Keep only the "text" column of that slice.
text_list = subdataset["text"]

# Wrap it in a dict
text_dict = {"text": text_list}

# Rebuild a Dataset with only that column
# (`Dataset` is also imported at the top of this cell; this import is redundant.)
from datasets import Dataset
dataset_text_only = Dataset.from_dict(text_dict)

def filter_by_token_length(
    tokenizer,
    token_lengths: List[int],
    dataset: Dataset,
    length: int,
    seed: int = 42
) -> Dict[int, Dataset]:
    """
    Build one fixed-size random subset of `dataset` per requested token length.

    Every row's "text" field is tokenized once (no truncation,
    `add_special_tokens=True`) and its token count is stored in a new
    "token_length" column. For each value in `token_lengths`, the rows with
    exactly that count are shuffled with `seed` and the first `length` rows kept.

    Args:
        tokenizer:     a HuggingFace tokenizer, called on the "text" column.
        token_lengths: the exact token counts to build subsets for.
        dataset:       a `datasets.Dataset` with a "text" column.
        length:        number of rows to keep per token count.
        seed:          seed passed to `set_seed` and to `Dataset.shuffle`.

    Returns:
        A dict mapping token count -> subset of `length` rows (with the extra
        "token_length" column). A token count with fewer than `length` matching
        rows is left out of the dict, and a warning is emitted for it.
    """
    import warnings

    set_seed(seed)

    # Tokenize in batches: same per-row token counts as a per-example map,
    # but far fewer tokenizer calls on large datasets.
    def tokenize_length(batch):
        tokens = tokenizer(batch["text"], truncation=False, add_special_tokens=True)
        return {"token_length": [len(ids) for ids in tokens["input_ids"]]}

    dataset_with_lengths = dataset.map(
        tokenize_length,
        batched=True,
        desc="Tokenizing and calculating lengths"
    )

    result_datasets = {}

    for token_length in token_lengths:
        # The lambda is consumed immediately by `filter`, so capturing the loop
        # variable here is safe.
        filtered = dataset_with_lengths.filter(
            lambda x: x["token_length"] == token_length,
            desc=f"Filtering for token length {token_length}"
        )

        # Shuffle and take the desired number of rows
        if len(filtered) >= length:
            filtered = filtered.shuffle(seed=seed).select(range(length))
            result_datasets[token_length] = filtered
        else:
            # Make the omission visible: callers that later index the result
            # by this token length would otherwise hit an unexplained KeyError.
            warnings.warn(
                f"Only {len(filtered)} rows have token length {token_length} "
                f"(need {length}); this token length is omitted from the result."
            )

    return result_datasets

lengths_tokens = [10, 20, 30, 40, 50]  # Example token lengths to filter by
# Build, for every token length above, a 100-sentence subset of the text-only dataset.
result_dataset = filter_by_token_length(
    tokenizer,
    token_lengths=lengths_tokens,  # was misspelled `lenghts_tokens` (NameError)
    dataset=dataset_text_only,
    length=100,
    seed=seed
)


In [None]:
# Preview the first five sentences of each per-length subset.
# The loop variable is deliberately not named `dataset`: that would overwrite the
# global corpus loaded earlier and break any re-run of the cells that slice it.
for token_length, filtered_subset in result_dataset.items():
    print(f"Token Length: {token_length}")
    print(f"Dataset: {filtered_subset['text'][:5]}")
    print("\n")

In [None]:
import pandas as pd

# One column per token length, one sentence per row; then persist the table as CSV.
result_dataset_df = pd.DataFrame(
    {
        token_length: pd.Series(subset['text'])
        for token_length, subset in result_dataset.items()
    }
)
result_dataset_df.to_csv('../data/token_length_filtered_dataset.csv', index=False)

In [None]:
# Show the first rows of the per-token-length sentence table as the cell output.
result_dataset_df.head()

In [None]:


def random_sentence_of_size(
    tokenizer,
    n: int,
    seed: int = 42,
    max_attempts: int = 100
) -> str:
    """
    Generate a random string that, when tokenized by `tokenizer`, has exactly n tokens.

    Tokens are sampled uniformly (with replacement) from the vocabulary and
    joined with `tokenizer.convert_tokens_to_string`. Joining arbitrary tokens
    does not guarantee that the text re-tokenizes into the same number of
    tokens, so each candidate is re-tokenized (without special tokens) and
    re-drawn until the count matches, for at most `max_attempts` draws.

    Args:
        tokenizer:    a HuggingFace tokenizer.
        n:            the exact number of tokens you want.
        seed:         random seed for reproducibility.
        max_attempts: maximum number of candidates to draw before giving up.

    Returns:
        A string which tokenizes to length n. If no candidate matched within
        `max_attempts` draws, the last candidate is returned and a warning is
        emitted.

    Raises:
        ValueError: if n is negative, or if the tokenizer's vocab doesn't have
                    enough valid tokens.
    """
    import warnings

    if n < 0:
        raise ValueError(f"n must be non-negative, got {n}.")

    # 1) seed RNG
    set_seed(seed)

    # 2) build a list of "real" tokens.
    # Sorted by token id so the sampling population has a fixed order: the
    # result then depends only on the seed and the vocabulary, not on the
    # iteration order of the dict returned by get_vocab().
    vocab = tokenizer.get_vocab()  # dict: token -> id
    special = set(tokenizer.all_special_tokens)

    # Tokens starting with "##" are skipped (presumably to drop WordPiece-style
    # continuation pieces; this has little effect on other vocab types).
    valid_tokens = [
        tok for tok, _ in sorted(vocab.items(), key=lambda item: (item[1], item[0]))
        if tok not in special and not tok.startswith("##")
    ]

    if len(valid_tokens) == 0:
        raise ValueError("No valid tokens available in tokenizer vocab.")

    # 3) sample n tokens (with replacement so n can be large), decode them via
    #    the tokenizer, and accept only if the text round-trips to n tokens.
    text = ""
    n_tokens = 0
    for _ in range(max(1, max_attempts)):
        sampled = random.choices(valid_tokens, k=n)
        text = tokenizer.convert_tokens_to_string(sampled)
        n_tokens = len(tokenizer(text, add_special_tokens=False)["input_ids"])
        if n_tokens == n:
            return text

    # 4) best effort: keep the historical behaviour of returning a string, but
    #    make the length mismatch visible instead of silently wrong.
    warnings.warn(
        f"Could not generate a string of exactly {n} tokens in "
        f"{max(1, max_attempts)} attempts (seed={seed}); "
        f"returning one with {n_tokens} tokens."
    )
    return text

def random_sentence_list(
    tokenizer,
    n: int,
    num_sentences: int = 10,
    seed: int = 42
) -> List[str]:
    """
    Build a list of random strings by calling `random_sentence_of_size` repeatedly.

    The i-th string (0-based) is generated with seed `seed + i`, so the list
    covers the seeds `seed` .. `seed + num_sentences - 1`.

    Args:
        tokenizer:       a HuggingFace tokenizer, forwarded unchanged.
        n:               the token count requested for every string.
        num_sentences:   how many strings to generate.
        seed:            base random seed for reproducibility.

    Returns:
        A list of `num_sentences` strings, in generation order.
    """
    sentences = []
    for offset in range(num_sentences):
        sentences.append(random_sentence_of_size(tokenizer, n, seed + offset))
    return sentences

In [None]:
list_random_sentence = random_sentence_list(
    tokenizer,
    n=10,
    num_sentences=100,
    seed=seed
)

# Build the random counterpart of the token-length table: one column per entry
# of lengths_tokens (random_10, random_20, ...), 100 random sentences each,
# saved next to the filtered dataset.
NUM_RANDOM_SENTENCES = 100
# random_sentence_list seeds sentence i with `seed + i`. Give every length its
# own disjoint block of seeds; the previous `seed + length` offset made blocks
# overlap, so the same seed was reused for different lengths and those sentences
# started from the same sampled tokens.
dict_random = {
    f"random_{length}": random_sentence_list(
        tokenizer,
        n=length,
        num_sentences=NUM_RANDOM_SENTENCES,
        seed=seed + block_idx * NUM_RANDOM_SENTENCES
    ) for block_idx, length in enumerate(lengths_tokens)
}
result_random_dataset_df = pd.DataFrame(dict([(k, pd.Series(v)) for k, v in dict_random.items()]))
result_random_dataset_df.to_csv('../data/random_sentences_dataset.csv', index=False)
result_random_dataset_df.head()

# Experiment 8

<font color="#fc5a4e">

**TASK**: 
- **E8.1** show how the convergence rate changes when you **change the gradient step size**
- **E8.2** show how the convergence rate changes when you **change the first-order algorithm**
- **E8.3** show how the convergence rate changes when you **change the length of the token sequence**

In [None]:
# Step sizes (learning rates) to sweep, from 1e-0 down to 1e-5.
step_sizes = [1e-0, 1e-1, 1e-2, 1e-3, 1e-4, 1e-5]
# Optimizer classes to compare, keyed by a short name:
# sgd, adam, adamw, rmsprop, nadam, lbfgs.
optimizers = {
    "sgd":torch.optim.SGD,
    "adam":torch.optim.Adam,
    "adamw":torch.optim.AdamW,
    "rmsprop":torch.optim.RMSprop,
    "nadam":torch.optim.NAdam,
    "lbfgs":torch.optim.LBFGS
}
# Prompt lengths (in tokens) to evaluate.
lengths_tokens = [10, 20, 30, 40, 50]  

# Layer indices to evaluate.
layer_indices = [1, 2, 3, 4, 5, 6, 7, 8]

# Random sentences, read back from the CSV written by an earlier cell
# (columns were written as "random_<length>").
random_df = pd.read_csv('../data/random_sentences_dataset.csv')
# Corpus sentences filtered by token length, read back from CSV
# (later cells index its columns with string keys such as '10' and '50').
meaningful_df = pd.read_csv('../data/token_length_filtered_dataset.csv')


In [None]:
# First sentence of the '10' column, kept as a quick trial prompt.
trial_prompt = meaningful_df['10'][0]
# Print the prompt that was actually selected (the message previously showed a
# sentence from the '50' column under the "Trial prompt" label).
print(f"Trial prompt: {trial_prompt}")

In [None]:
# Single trial run of `inversion_attack` (defined elsewhere in this notebook)
# on one sentence from the '50' column.
inversion_attack(
    prompt=meaningful_df['50'][2],  # row 2 of the '50' column
    llm=model, 
    layer_idx=7, 
    optimizer_cls= torch.optim.LBFGS, 
    lr= step_sizes[-1],  # last entry of step_sizes, i.e. 1e-5
    seed=seed
)