In [None]:
import wandb
import torch
import random
from tqdm import tqdm
import pretrain as pt
from datasets import load_dataset
from torch.nn import functional as F
from transformers import PreTrainedModel, AutoTokenizer ,AutoModelForCausalLM
from transformers import GPT2LMHeadModel, GPT2TokenizerFast
from torch.nn import CrossEntropyLoss

def compute_ppl(
    text,
    model: "AutoModelForCausalLM",
    tokenizer: "AutoTokenizer",
    stride: int = 512,
    device=None,
):
    """Compute the perplexity of ``text`` under a causal LM using a sliding window.

    The text is tokenized once and then scored in windows of at most the
    model's context length, each window starting ``stride`` tokens after the
    previous one. Tokens that were already scored by an earlier window are
    masked out, so every token contributes to the result at most once while
    still being predicted with as much left context as the window allows.

    Args:
        text: Input handed unchanged to ``tokenizer(...)``.
        model: Called as ``model(input_ids, attention_mask=...)``; the result
            must expose ``.logits``. The context length is read from
            ``model.config.n_positions`` (falling back to
            ``model.config.max_position_embeddings``).
        tokenizer: Called with ``return_tensors="pt"``; the result must expose
            ``input_ids`` and ``attention_mask``.
        stride: Number of tokens the window advances per step. Values larger
            than the context length are clamped to it so no token is skipped.
        device: ``"cpu"``, ``"cuda"``, ``"gpu"`` (alias for ``"cuda"``) or
            ``None`` to pick CUDA when available.

    Returns:
        A 0-dim tensor: ``exp`` of the mean negative log-likelihood per
        scored token.

    Raises:
        AssertionError: If ``device`` is not one of the accepted strings, or
            the tokenizer has no special token to reuse as pad token.
        ValueError: If ``stride`` is not positive or no token could be scored
            (e.g. the tokenized text has fewer than two tokens).
    """
    if device is not None:
        assert device in ["gpu", "cpu", "cuda"], "device should be either gpu or cpu."
        if device == "gpu":
            device = "cuda"
    else:
        device = "cuda" if torch.cuda.is_available() else "cpu"

    if stride < 1:
        raise ValueError(f"stride must be a positive integer, got {stride}")

    model = model.to(device)

    # Kept from the original: make sure the tokenizer has a pad token by
    # reusing an existing special token. The tokenizer call below does not
    # request padding, so this only matters for later users of the tokenizer.
    if tokenizer.pad_token is None and stride > 1:
        existing_special_tokens = list(tokenizer.special_tokens_map_extended.values())
        # check that the model already has at least one special token defined
        assert (
            len(existing_special_tokens) > 0
        ), "If batch_size > 1, model must have at least one special token to use for padding. Please use a different model or set batch_size=1."
        # assign one of the special tokens to also be the pad token
        tokenizer.add_special_tokens({"pad_token": existing_special_tokens[0]})

    encodings = tokenizer(
        text,
        truncation=False,
        return_tensors="pt",
        return_attention_mask=True,
    )

    # GPT-2 style configs expose `n_positions`; other configs use
    # `max_position_embeddings` for the same quantity.
    max_length = getattr(model.config, "n_positions", None) or model.config.max_position_embeddings
    # A stride larger than the window would leave gaps of unscored tokens.
    stride = min(stride, max_length)
    seq_len = encodings.input_ids.size(1)

    # reduction="none" yields one loss per position; positions labelled -100
    # (the default ignore_index) come back as 0.
    loss_fct = CrossEntropyLoss(reduction="none")

    total_nll = torch.zeros((), dtype=torch.float64, device=device)
    total_tokens = 0
    prev_end_loc = 0
    for begin_loc in tqdm(range(0, seq_len, stride)):
        end_loc = min(begin_loc + max_length, seq_len)
        # Tokens in this window that no previous window has scored yet
        # (may be different from stride on the first and last window).
        new_tokens_len = end_loc - prev_end_loc
        input_ids = encodings.input_ids[:, begin_loc:end_loc].to(device)
        attn_mask = encodings.attention_mask[:, begin_loc:end_loc].to(device)
        labels = input_ids.clone()
        # Everything before the new tokens is context only.
        labels[:, :-new_tokens_len] = -100

        with torch.no_grad():
            out_logits = model(input_ids, attention_mask=attn_mask).logits

        # Shift by 1 token: position i predicts token i + 1.
        shift_logits = out_logits[..., :-1, :].float().contiguous()
        shift_labels = labels[..., 1:].contiguous()
        shift_attention_mask = attn_mask[..., 1:].contiguous()

        # Flatten; take the vocabulary size from the logits themselves rather
        # than from the config so padded output layers still work.
        shift_logits = shift_logits.view(-1, shift_logits.size(-1))
        shift_labels = shift_labels.view(-1)
        shift_attention_mask = shift_attention_mask.view(-1)

        token_losses = loss_fct(shift_logits, shift_labels) * shift_attention_mask
        scored = (shift_labels != -100) & shift_attention_mask.bool()

        total_nll += token_losses.sum().double()
        total_tokens += int(scored.sum())

        prev_end_loc = end_loc
        if end_loc == seq_len:
            # The window reached the end of the text; further windows would
            # contain no unscored tokens.
            break

    if total_tokens == 0:
        raise ValueError("Cannot compute perplexity: no tokens to score (need at least two tokens).")

    return torch.exp(total_nll / total_tokens)


In [None]:
# Evaluate GPT-2 large on the WikiText-2 raw test split, on CPU.
model_id = "gpt2-large"
tokenizer = GPT2TokenizerFast.from_pretrained(model_id)
model = GPT2LMHeadModel.from_pretrained(model_id).to("cpu")

# The split is a sequence of text rows; join them into one long string
# separated by blank lines so it can be scored as a single document.
dataset = load_dataset("wikitext", "wikitext-2-raw-v1", split="test")
text = "\n\n".join(dataset["text"])

print(compute_ppl(text, model, tokenizer, stride=1024, device="cpu"))

In [16]:
# Assume the perplexity calculation is correct...

from collections import defaultdict
import os
import wandb
import pandas as pd
from dotenv import load_dotenv

load_dotenv()  # take environment variables from .env.


PROJECT="pretraining-leaderboard-data"
ENTITY="raofoundation"
# NOTE(review): `id` is not used anywhere below and shadows the builtin `id`;
# kept only because other cells may still read it.
id = "test-run"
token = os.getenv("WANDB_API_KEY")

# Log in and open a wandb run.
# NOTE(review): `id` is not passed to wandb.init, so presumably this starts a
# new run rather than continuing an existing one -- confirm intent.
wandb.login(key=token)
run = wandb.init(project=PROJECT, entity=ENTITY)

# Name -> object mappings to benchmark; fill these in before running.
datasets = {}
models = {}

try:
    # ppls[dataset_name] holds one perplexity per model, in `models` order,
    # so each dataset becomes a column and each model a row.
    ppls = defaultdict(list)
    for model_name, bench_model in models.items():
        for dataset_name, bench_text in datasets.items():
            # NOTE(review): `tokenizer` comes from an earlier cell and is
            # shared by every model here -- confirm that is intended.
            ppl = compute_ppl(bench_text, bench_model, tokenizer, 1024, "cpu")
            # Store a plain float rather than a tensor object.
            ppls[dataset_name].append(float(ppl))

    # 1 run per "run". Will then have to query the history to get old data.
    # NOTE(review): confirm wandb.Table keeps the DataFrame index (model
    # names); if not, the rows will be unlabeled in the logged table.
    table = wandb.Table(dataframe=pd.DataFrame(ppls, index=list(models)))
    wandb.log({"benchmarks": table})
finally:
    # Always close the run, even if a benchmark raised.
    run.finish()

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /Users/dwoods/.netrc
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




In [17]:
def get_latest_benchmarks(entity, project) -> pd.DataFrame:
    """Return the "benchmarks" table of the most recent run that has one.

    Runs of ``entity/project`` are visited newest first. For each run, the
    last artifact it logged is asked for an entry named "benchmarks"; the
    first truthy result is converted to a DataFrame and returned.

    Args:
        entity: wandb entity that owns the project.
        project: wandb project name.

    Returns:
        The DataFrame obtained from the table's ``get_dataframe()``.

    Raises:
        ValueError: If no visited run yields a "benchmarks" table.
    """
    api = wandb.Api(timeout=100)
    # Ask for newest-first explicitly instead of relying on the library's
    # default ordering. Iterating the result directly (rather than wrapping it
    # in list()) avoids materializing every run before the early return.
    runs = api.runs(
        f"{entity}/{project}",
        order="-created_at",
    )

    for run in runs:
        artifacts = list(run.logged_artifacts())
        if artifacts:
            # Only the last logged artifact of each run is inspected.
            table = artifacts[-1].get("benchmarks")
            if table:
                return table.get_dataframe()
    raise ValueError("No benchmarks found")

    

In [33]:
# Toy data to check the table layout: one row per model, one column per dataset.
models = dict.fromkeys(("chatgpt2", "my-model"))
ppls = {
    "wikitext": [1, 2],
    "falcon": [0.234, 2342],
}

# Row labels come from the model names, column labels from the dataset names.
df = pd.DataFrame(data=ppls, index=models.keys())
print(df)

        

          wikitext    falcon
chatgpt2         1     0.234
my-model         2  2342.000
