In [1]:
# Load checkpoints

# Root directories for model checkpoints, tokenizer files and tokenized datasets.
# NOTE(review): these are absolute, machine-specific paths; anyone re-running
# this notebook elsewhere has to edit them.
MODEL_DIR = "/home/wyf/orcd/pool/reverse-llm/models"
TOKENIZER_DIR = "/home/wyf/orcd/pool/reverse-llm/tokenizers"
DATA_DIR = "/home/wyf/orcd/pool/reverse-llm/data"

# Name of the run directory under MODEL_DIR; its entries are the checkpoints
# that the rest of the notebook lists and loads.
model_name = "reverse-gpt2-0.35B-fineweb-10BT-ctx-1024"

from transformers import PreTrainedTokenizerFast
from transformers import GPT2LMHeadModel
from datasets import Dataset
import torch
import numpy as np
from torch.utils.data import DataLoader

import os

def _checkpoint_sort_key(name):
    """Order ``<prefix>-<step>`` names by numeric step; anything else sorts last."""
    suffix = name.rsplit("-", 1)[-1]
    if suffix.isdigit():
        return (0, int(suffix), name)
    return (1, 0, name)


# A plain sorted() is lexicographic, which would place "checkpoint-10000"
# before "checkpoint-3500"; sort on the numeric step so checkpoints are listed
# (and later evaluated) in training order.
avail_checkpoints = sorted(
    os.listdir(f"{MODEL_DIR}/{model_name}"),
    key=_checkpoint_sort_key,
)
print("Available checkpoints:")
print("\n".join(avail_checkpoints))

Available checkpoints:
checkpoint-3500
checkpoint-4000
checkpoint-4500
checkpoint-5000
checkpoint-5500
checkpoint-6000
checkpoint-6500
checkpoint-7000
checkpoint-7500
checkpoint-8000
checkpoint-8500
checkpoint-9000


In [2]:
# Load dataset
# Read the pre-tokenized validation split from disk and print its shape
# as a quick sanity check that the load succeeded.
val_dataset = Dataset.load_from_disk(f"{DATA_DIR}/fineweb-10BT/tokenized_1024_valid")
print(val_dataset.shape)

(4614, 1)


In [3]:
from tqdm import tqdm

def get_loss(checkpoint, batch_size=32, device="cuda"):
    """Compute the mean validation loss of one saved checkpoint.

    Args:
        checkpoint: Directory name of the checkpoint under
            ``{MODEL_DIR}/{model_name}`` (e.g. ``"checkpoint-9000"``).
        batch_size: Number of validation sequences per forward pass.
        device: Torch device the model and batches are moved to.

    Returns:
        The loss averaged over all validation sequences, as a Python float.
    """
    model_dir = f"{MODEL_DIR}/{model_name}/{checkpoint}"
    model = GPT2LMHeadModel.from_pretrained(model_dir)
    model.to(device)
    model.eval()

    # Materialise the whole validation set as one tensor (once per call).
    # torch.tensor() requires every row of `input_ids` to have the same length.
    val_tensor = torch.tensor(val_dataset['input_ids']).long()
    dataloader = DataLoader(val_tensor, batch_size=batch_size, shuffle=False)

    weighted_loss = 0.0
    n_sequences = 0
    with torch.no_grad():
        for batch in tqdm(dataloader):
            batch = batch.to(device)
            loss = model(batch, labels=batch).loss
            # `loss` is already a mean over the batch, and the last batch is
            # usually smaller than the rest. Weight each batch by its size so
            # that the final short batch is not over-represented, which is
            # what a plain average of per-batch means would do.
            n_in_batch = batch.size(0)
            weighted_loss += loss.item() * n_in_batch
            n_sequences += n_in_batch

    return weighted_loss / n_sequences

# Walk the checkpoints in listing order and report one loss per checkpoint.
for checkpoint in avail_checkpoints:
    checkpoint_loss = get_loss(checkpoint)
    print(checkpoint, "\t", checkpoint_loss)

  0%|          | 0/145 [00:00<?, ?it/s]`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.
  3%|▎         | 5/145 [00:03<01:46,  1.31it/s]


KeyboardInterrupt: 

In [4]:
from transformers import pipeline

# Load the tokenizer from its serialized tokenizer file and register the
# special-token strings explicitly so that attributes such as
# `tokenizer.eos_token_id` are available to the cells below.
tokenizer = PreTrainedTokenizerFast(
    tokenizer_file=f"{TOKENIZER_DIR}/fineweb_bpe_200k.json",
    bos_token="<s>",
    eos_token="</s>",
    unk_token="<unk>",
    pad_token="<pad>",
    mask_token="<mask>",
)

# Load model
def generate_sample(checkpoint_no, input_text, **pipe_kwargs):
    """Generate text from one checkpoint through a ``text-generation`` pipeline.

    Args:
        checkpoint_no: Step number; the model is loaded from
            ``{MODEL_DIR}/{model_name}/checkpoint-{checkpoint_no}``.
        input_text: Prompt passed to the pipeline unchanged (callers are
            responsible for any reversal of the text).
        **pipe_kwargs: Extra keyword arguments forwarded to ``pipeline(...)``.

    Returns:
        The ``"generated_text"`` entry of the first pipeline result.
    """
    checkpoint_path = f"{MODEL_DIR}/{model_name}/checkpoint-{checkpoint_no}"

    # A fresh model and pipeline are built on every call, using the
    # module-level tokenizer.
    generator = pipeline(
        "text-generation",
        model=GPT2LMHeadModel.from_pretrained(checkpoint_path),
        tokenizer=tokenizer,
        clean_up_tokenization_spaces=False,
        **pipe_kwargs,
    )

    results = generator(input_text)
    return results[0]["generated_text"]

In [7]:
# The prompt is character-reversed before it is fed to the model and the
# generated text is reversed back for display, so the prompt ends up as the
# END of the printed passage (presumably because the model was trained on
# reversed text -- confirm against the training setup).
print(generate_sample(9000, "is the author of Harry Potter."[::-1])[::-1])

Device set to use cuda:0


that a special character does not appear in the beginning of the book.
The character that appears in the beginning of the book shows that a special character appears in the beginning of the book. The character that appears in the beginning of the book shows that there is a special character in this book.
The character that appears in the beginning of the book shows that there is a special character in the book. The character that appears in the beginning of the book also shows that there is a special character. It also shows that there is a special character in this book. There is a special character in the book, which can be found near the end of the book.
* Harry Potter is a special character in this book. The character that appears in the beginning of the novel appears in this book, but it also appears in a later chapter of his life. This not only proves to have an interesting and interesting character, but it also shows that he had had some success. Harry Potter must have had some 

In [22]:
# Test if model can generate EOS tokens at all
def test_eos_generation(checkpoint_no, prompt="The quick brown fox", max_new_tokens=512):
    """Sample from a checkpoint and report whether it emitted an EOS token.

    Args:
        checkpoint_no: Step number; the model is loaded from
            ``{MODEL_DIR}/{model_name}/checkpoint-{checkpoint_no}``.
        prompt: Forward-order prompt text; it is character-reversed before
            being encoded.
        max_new_tokens: Upper bound on the number of sampled tokens.

    Returns:
        1-D tensor holding only the newly generated token ids (the prompt
        tokens are stripped).
    """
    model = GPT2LMHeadModel.from_pretrained(f"{MODEL_DIR}/{model_name}/checkpoint-{checkpoint_no}")
    model.to("cuda")

    input_ids = tokenizer.encode(prompt[::-1], return_tensors="pt").to("cuda")

    # Generate with explicit EOS stopping. Because pad_token_id is set to the
    # EOS id, generate() cannot infer the attention mask from the input ids.
    # The prompt is a single unpadded sequence, so an all-ones mask is exact.
    output = model.generate(
        input_ids,
        attention_mask=torch.ones_like(input_ids),
        max_new_tokens=max_new_tokens,
        eos_token_id=tokenizer.eos_token_id,
        pad_token_id=tokenizer.eos_token_id,
        do_sample=True,
    )

    prompt_len = input_ids.shape[1]
    new_tokens = output[0][prompt_len:]
    eos_found = bool(tokenizer.eos_token_id in new_tokens)

    print(f"Generated {len(new_tokens)} tokens")
    print(f"EOS in output: {eos_found}")
    print(f"Stopped because: {'EOS generated' if eos_found else 'Hit max_new_tokens'}")

    return new_tokens

# Keep the newly generated token ids so a later cell can decode them.
tokens = test_eos_generation(9000)

Generated 289 tokens
EOS in output: True
Stopped because: EOS generated


In [23]:
# Strip the final token only when it really is EOS. If generation stopped at
# the token limit instead, the last token is real text and must be kept.
# The decoded string is reversed back for display.
ends_with_eos = len(tokens) > 0 and tokens[-1].item() == tokenizer.eos_token_id
print(tokenizer.decode(tokens[:-1] if ends_with_eos else tokens)[::-1])

Sound familiar? Well, we are owls!
As you can see, owls are a very large and diverse group of birds and are composed of only about 1,800 species of owls. In fact, they are found in a large part of South America with a range ranging all the way to North America and Europe. They are responsible for creating some of the most beautiful landscapes across the world. Most areas of the United States are home to a huge number of interesting birds and animals. In this article, we'll learn more about different types of birds and owls.
There are around 100 distinct owl species. Because of this, it is almost impossible to identify these owl species especially during the late spring and early winter. Here is some basic information about them.
- This owl can be identified by the thick long tail feathers and a long tail. This type of colour is used to mark their territory which is of interest to many of these birds. They are also active during the day or during the night.
- This owl can be seen throug