# Simple Perplexity Calculations

Given a specific dataset, calculate the perplexity of a number of different language models on it.

In [1]:
import random

import evaluate
import torch
from datasets import load_dataset

## Create the dataset

In [28]:
# Build `dataset`, the list of raw text samples every model is scored on.

# IMDB Dataset
# dataset : list[str] = load_dataset("imdb", split="test").shuffle(seed=42).select(range(1000))["text"] # (seed=42)

# C4 Dataset
C4_SHUFFLE_SEED = 42 * 42  # fixed seed so the same samples are drawn on every run
C4_NUM_SAMPLES = 10        # number of validation documents to score

# Stream the validation split and keep only the "text" column of the first
# C4_NUM_SAMPLES shuffled documents. The stream gets its own name so that
# `dataset` is only ever bound to a list of strings.
# NOTE(review): the bare "c4" id may be deprecated on the Hub in favour of
# "allenai/c4" — confirm against the installed `datasets` version.
c4_stream = (
    load_dataset("c4", "en", split="validation", streaming=True)
    .shuffle(seed=C4_SHUFFLE_SEED)
    .take(C4_NUM_SAMPLES)
    .select_columns("text")
)
dataset: list[str] = [row["text"] for row in c4_stream]

# Show the sample count and the first document as a quick sanity check.
len(dataset), dataset[0]

(10,
 'Cove at Dardenne Subdivision is located in St Charles County, Missouri.\nThe following school information for Cove at Dardenne Subdivision may or may not be up to date. School districts and area assignments can often change. For current information make sure to contact the school. Also note not all area private schools may be listed here.')

## Generate a list of models we will use

In [22]:
# Candidate checkpoints, grouped by model family; only uncommented entries are evaluated.
# Every additional leading "#" on a line reproduces one more level of the author's
# nested ("jagged") comments, which flag models too large to fit on their computer.
models = [
    # -- Cerebras-GPT --
    # "cerebras/Cerebras-GPT-111M", "cerebras/Cerebras-GPT-256M", "cerebras/Cerebras-GPT-590M",
    # # "cerebras/Cerebras-GPT-1.3B", "cerebras/Cerebras-GPT-2.7B",
    # # # "cerebras/Cerebras-GPT-6.7B",
    # # # # "cerebras/Cerebras-GPT-13.7B",

    # -- GPT-Neo / GPT-J / GPT-NeoX --
    # "EleutherAI/gpt-neo-125m", "EleutherAI/gpt-neo-1.3B", "EleutherAI/gpt-neo-2.7B",
    # # "EleutherAI/gpt-j-6b",
    # # # "EleutherAI/gpt-neox-20b",

    # -- Pythia --
    "EleutherAI/pythia-70m",
    # "EleutherAI/pythia-160m", "EleutherAI/pythia-410m",
    # # "EleutherAI/pythia-1b", "EleutherAI/pythia-1.4b", "EleutherAI/pythia-2.8b",
    # # # "EleutherAI/pythia-6.9b",
    # # # # "EleutherAI/pythia-12b",

    # -- Pythia (deduped) --
    # "EleutherAI/pythia-70m-deduped", "EleutherAI/pythia-160m-deduped", "EleutherAI/pythia-410m-deduped",
    # "EleutherAI/pythia-1b-deduped", "EleutherAI/pythia-1.4b-deduped", "EleutherAI/pythia-2.8b-deduped",
    # # "EleutherAI/pythia-6.9b-deduped",
    # # # "EleutherAI/pythia-12b-deduped",

    # -- MPT --
    # "mosaicml/mpt-7b",
    # # "mosaicml/mpt-30b",

    # -- Falcon --
    # "tiiuae/falcon-7b",
    # # "tiiuae/falcon-40b", "tiiuae/falcon-180b",

    # -- BLOOM --
    # "bigscience/bloom-560m", "bigscience/bloom-1b1", "bigscience/bloom-1b7", "bigscience/bloom-3b",
    # # "bigscience/bloom-7b1",
    # # # "bigscience/bloom",

    # -- OpenLLaMA --
    # "openlm-research/open_llama_3b",
    # # "openlm-research/open_llama_7b",
    # # # "openlm-research/open_llama_13b",

    # -- OpenLLaMA v2 --
    # "openlm-research/open_llama_3b_v2",
    # # "openlm-research/open_llama_7b_v2",
]

# Number of models that will actually be evaluated.
len(models)

1

## Run the test

In [24]:
# Load the "perplexity" module from the `evaluate` library, explicitly as a
# metric; the resulting object is reused for every model scored later on.
perplexity = evaluate.load(
    "perplexity",
    module_type="metric",
)

In [31]:
# Score every entry of `models` on `dataset` and collect the mean perplexity
# per model in `results` (model id -> mean perplexity).

# Choose the accelerator at run time instead of hard-coding it: use Apple's
# Metal (MPS) backend when this machine has one, otherwise fall back to the CPU
# so the cell still runs elsewhere.
# NOTE(review): confirm the installed perplexity metric accepts "mps" as a device.
DEVICE = "mps" if torch.backends.mps.is_available() else "cpu"
BATCH_SIZE = 16  # watch batch sizes to free up memory

perplexities = []
for model in models:  # author's timing note: CPU 40.0 vs GPU 30.8
    print(model)
    # REVIEW: tune arguments
    result = perplexity.compute(
        predictions=dataset,
        batch_size=BATCH_SIZE,
        model_id=model,
        add_start_token=False,
        device=DEVICE,
    )
    # Only the mean is kept here; the per-sample values in
    # result["perplexities"] could also be taken into account.
    perplexities.append(result["mean_perplexity"])
    print(result)

# Pair each model id with its mean perplexity, in evaluation order.
results = dict(zip(models, perplexities))
results

EleutherAI/pythia-70m


Using pad_token, but it is not set yet.


  0%|          | 0/2 [00:00<?, ?it/s]

{'perplexities': [50.6631965637207, 52.62197494506836, 27.31839370727539, 191.73825073242188, 37.18301010131836, 30.540359497070312, 70.19187927246094, 33.867462158203125, 73.44882202148438, 138.1171875], 'mean_perplexity': 70.56905364990234}


{'EleutherAI/pythia-70m': 70.56905364990234}

In [None]:
# Diagnostic cell: reports whether PyTorch can use Apple's Metal (MPS) backend
# on this machine, i.e. the device the perplexity run targets.
import torch

torch.backends.mps.is_available()