In [16]:
#@markdown # Run experiments

#@markdown Select `Runtime -> Change runtime type -> Hardware accelerator -> T4 GPU`

# Checkpoint name; passed to `from_pretrained` for both the model and the tokenizer.
model_id = 'gpt2' # @param ["gpt2", "gpt2-medium", "gpt2-large", "gpt2-xl"]

# Split name; used to build the dataset file name and to label the printed results row.
dataset_split = "test" #@param ["train", "test", "validation"]
#@markdown Now just run everything that follows. Press `Runtime -> Run all`, or `Ctrl+F9`.

#@markdown Once it's done, copy the line that looks like `| gpt2, train | 0.2 | 3.645 | 1.051 |` to the table below.

In [17]:
import re
from tqdm import tqdm

import numpy as np

from datasets import load_dataset

import torch
from torch.utils.data import DataLoader
from torch.nn import CrossEntropyLoss

from transformers import GPT2LMHeadModel, GPT2TokenizerFast

## Get model

[Perplexity of fixed-length models](https://huggingface.co/docs/transformers/perplexity)

Thank the stars that the Hugging Face tutorial has ready-made code!

In [18]:
# Prefer the GPU when one is available, but fall back to the CPU so the
# notebook still runs (slowly) on machines without CUDA instead of crashing.
device = "cuda" if torch.cuda.is_available() else "cpu"
# Model and tokenizer are loaded from the same checkpoint name so their vocabularies match.
model = GPT2LMHeadModel.from_pretrained(model_id).to(device)
tokenizer = GPT2TokenizerFast.from_pretrained(model_id)

In [19]:
# Presumably the alphabet of the pre-processed text (lowercase letters, space,
# period, comma); it is not used by the code visible in this notebook.
allowed_chars = set("abcdefghijklmnopqrstuvwxyz .,")

dataset_name = "wikitext-103-truncated"
dataset = {}
# Only the split chosen in the form cell is read. The previous loop over
# ["valid", "test"] never used its loop variable, so it read this same file
# twice into the same key; reading it once leaves `dataset` identical.
# NOTE(review): confirm the files on disk are named after the form values
# ("train" / "test" / "validation") rather than e.g. "valid".
with open(
    f"{dataset_name}/wiki_shannonfied.{dataset_split}.txt", "r", encoding="utf-8"
) as f:
    dataset[dataset_split] = f.read()

In [20]:
# Tokenize the whole split as one long sequence; `input_ids` has a leading
# batch dimension, which is why the length is read from `size(1)`.
encodings = tokenizer(dataset[dataset_split], return_tensors="pt")

seq_len = encodings.input_ids.size(1)
print(f"Sequence length = {seq_len}\n")
# Sanity check: decode the last 400 tokens back to text.
print(tokenizer.decode(encodings.input_ids[0][seq_len-400:]))

Token indices sequence length is longer than the specified maximum sequence length for this model (258499 > 1024). Running this sequence through the model will result in indexing errors


Sequence length = 258499

 lavishly praised for their performances and the film is summed up as not to be missed. paul newman reprised his role as fast eddie felson in the film the color of money, for which he won the academy award for best actor in a leading role. a number of observers and critics have suggested that this oscar was in belated recognition for his performance in the hustler. in, the library of congress selected the hustler for preservation in the united states national film registry as culturally, historically, or aesthetically significant. carroll and rossen s screenplay was selected by the writers guild of america in as the th best motion picture screenplay of all time. in june, afi released its ten top ten the best ten films in ten classic american film genres after polling over, people from the creative community. the hustler was acknowledged as the sixth best film in the sports genre. the hustler is credited with sparking a resurgence in the popularity of pool in t

In [21]:
import torch
from tqdm import tqdm

# Sliding-window evaluation: the token sequence is scored in overlapping windows
# of at most `max_length` tokens, with the window start advancing by `stride`.
max_length = model.config.n_positions
stride = 512
seq_len = encodings.input_ids.size(1)
encodings.input_ids = encodings.input_ids.to(device)
nlls = []  # one mean loss per window; a later cell averages these into the perplexity
prev_end_loc = 0
for begin_loc in tqdm(range(0, seq_len, stride)):
    end_loc = min(begin_loc + max_length, seq_len)
    trg_len = end_loc - prev_end_loc  # may be different from stride on last loop
    input_ids = encodings.input_ids[:, begin_loc:end_loc]
    target_ids = input_ids.clone()
    # Only the last `trg_len` tokens are new in this window; everything before
    # them was already scored and now serves purely as context. Setting a label
    # to -100 excludes it from the loss (Hugging Face labelling convention).
    target_ids[:, :-trg_len] = -100

    with torch.no_grad():
        outputs = model(input_ids, labels=target_ids)

        # loss is calculated using CrossEntropyLoss which averages over valid labels
        # N.B. the model only calculates loss over trg_len - 1 labels, because it internally shifts the labels
        # to the left by 1.
        neg_log_likelihood = outputs.loss

    # NOTE(review): each window's mean loss is appended with equal weight even
    # though the number of scored tokens differs between windows (the first
    # window masks nothing and the last may be shorter), so the later plain
    # mean over `nlls` is a slight approximation of the per-token average.
    nlls.append(neg_log_likelihood)

    prev_end_loc = end_loc
    # Once a window reaches the end of the sequence there is nothing new left to
    # score, so stop before the remaining `begin_loc` values are visited.
    if end_loc == seq_len:
        break

100%|███████████████████████████████████████████████████████████████████████████████▋| 503/505 [00:37<00:00, 13.40it/s]


In [29]:
def calculate_tokens_per_char(dataset, tokenizer, dataset_split="test"):
    """Return the average number of tokens per character of one dataset split.

    Args:
        dataset: Mapping from split name to the raw text of that split.
        tokenizer: Callable tokenizer; it is called as
            ``tokenizer(text, return_tensors="pt")`` and must return an object
            with an ``attention_mask`` tensor.
        dataset_split: Key of ``dataset`` to measure. Defaults to ``"test"``,
            which is the split the previous implementation always used.

    Returns:
        Token count divided by character count, as a Python float.

    Raises:
        KeyError: If ``dataset_split`` is not a key of ``dataset``.
    """
    text = dataset[dataset_split]
    total_chars = len(text)
    # Tokenize with the tokenizer that was passed in rather than reading a
    # notebook-level `encodings` global, so the result always matches the
    # arguments. Every position of the attention mask is one real token.
    encoded = tokenizer(text, return_tensors="pt")
    num_tokens = int(encoded.attention_mask.sum())
    tokens_per_char = num_tokens / total_chars

    print(f"Dataset has {total_chars:.2e} characters, which is tokenized to {num_tokens:.2e} tokens, giving {tokens_per_char:.3f} token/char.")
    return tokens_per_char

def calculate_bits_per_char(perplexity, tokens_per_char):
    """Convert a per-token perplexity into bits per character.

    Args:
        perplexity: Per-token perplexity, as a zero-dimensional tensor or a
            plain number.
        tokens_per_char: Average number of tokens per character.

    Returns:
        ``ln(perplexity) * tokens_per_char / ln(2)``.
    """
    nats_per_token = np.log(float(perplexity))
    bits_per_nat = 1.0 / np.log(2.0)  # exact conversion factor instead of a rounded 1.4427
    bits_per_char = nats_per_token * tokens_per_char * bits_per_nat
    print(f"{nats_per_token:.4} nat/token = {bits_per_char:.4} bit/char")
    return bits_per_char

def generate_table_row(model_id, dataset_name, dataset_split, dataset, tokenizer, perplexity):
    """Print one markdown table row summarising a run.

    The row has the form ``| model, split | token/char | nat/token | bit/char |``.
    ``dataset_name`` is accepted for call compatibility but is not used.

    Args:
        model_id: Model name shown in the first column.
        dataset_name: Unused.
        dataset_split: Split name; shown in the first column and used to pick
            the text out of ``dataset``.
        dataset: Mapping from split name to raw text.
        tokenizer: Tokenizer forwarded to ``calculate_tokens_per_char``.
        perplexity: Per-token perplexity (zero-dimensional tensor or number).
    """
    nats_per_token = np.log(float(perplexity))
    # Forward the selected split so that "train"/"validation" runs measure their
    # own text instead of failing on a hard-coded 'test' key.
    tokens_per_char = calculate_tokens_per_char(dataset, tokenizer, dataset_split)
    bits_per_char = calculate_bits_per_char(perplexity, tokens_per_char)
    print(f"| {model_id}, {dataset_split} | {tokens_per_char:.3} | {nats_per_token:.4} | {bits_per_char:.4} |")

In [30]:
# Perplexity = exp(mean of the per-window mean losses collected in `nlls`).
perplexity = torch.exp(torch.stack(nlls).mean())

# Prints the `| model, split | token/char | nat/token | bit/char |` row for the table below.
generate_table_row(model_id, dataset_name, dataset_split, dataset, tokenizer, perplexity)

Dataset has 1.22e+06 characters, which is tokenized to 2.58e+05 tokens, giving 0.213 token/char.
3.584 nat/token = 1.1 bit/char
| gpt2, test | 0.213 | 3.584 | 1.1 |


By default, we use `WikiText-2-raw-v1`.

| config                  | token/char | nat/token | bit/char |
| ----------------------- | ---------- | --------- | -------- |
| gpt2, train             | 0.2        | 3.645     | 1.051    |
| gpt2, validation        | 0.2        | 3.61      | 1.04     |
| gpt2, test              | 0.201      | 3.584     | 1.04     |
| gpt2-medium, train      | 0.2        | 3.279     | 0.9454   |
| gpt2-medium, test       | 0.201      | 3.22      | 0.9339   |
| gpt2-medium, validation | 0.199      | 3.247     | 0.9336   |
| gpt2-large, train       | 0.2        | 3.129     | 0.9022   |
| gpt2-large, validation  | 0.2        | 3.10      | 0.89     |
| gpt2-large, test        | 0.201      | 3.075     | 0.892    |
| gpt2-xl, train          | 0.2        | 3.021     | 0.8711   |
| gpt2-xl, validation     | 0.199      | 2.986     | 0.8585   |
| gpt2-xl, test           | 0.201      | 2.967     | 0.8607   |

### How is NLL computed?

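When the model is given a sequence of 1024 tokens $x_{1:1024}$, it computes output logits for the next tokens, shifted by one position: the logits at position $i$ predict $x_{i+1}$, so together they cover $x_{2:1025}$.

If it is also given `labels=x[1:1024]`, then it automatically sums the NLL of the tokens `x[2:1024]` (1023 predictions, because $x_1$ has no preceding context and $x_{1025}$ is not among the labels) and divides by 1023, i.e. it returns the mean NLL per predicted token.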

In [31]:
import torch
from transformers import AutoTokenizer, GPT2LMHeadModel
from torch.nn.functional import log_softmax, nll_loss

# Compute the same mean next-token NLL three ways and print each; the three
# numbers should agree.

# 1) The loss the model reports when it is given the inputs as labels.
inputs = tokenizer("Although the recipe for forward pass needs to be defined within this function.", return_tensors="pt").to(device)
outputs = model(**inputs, labels=inputs["input_ids"])
loss = outputs.loss
print(loss)

logits = outputs.logits

# 2) By hand: the probabilities at position i are scored against token i+1,
#    and the sum is divided by the number of predictions (sequence length - 1).
probs = torch.softmax(logits[0], dim=1)
ids = inputs.input_ids[0]
loss = 0
for i in range(probs.shape[0]-1):
    # Named `token_id` rather than `id`: at notebook top level `id` would
    # shadow the builtin `id()` for every cell that runs afterwards.
    token_id = ids[i+1]
    nll = -torch.log(probs[i, token_id])
    loss += nll
print(loss / (probs.shape[0]-1))

# 3) With the library loss: drop the last prediction and the first token so
#    predictions and targets line up.
log_probs = log_softmax(logits[0], dim=1)
print(nll_loss(log_probs[:-1], ids[1:], reduction='mean'))

tensor(5.1511, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(5.1511, device='cuda:0', grad_fn=<DivBackward0>)
tensor(5.1511, device='cuda:0', grad_fn=<NllLossBackward0>)


## Calculate logits

In [32]:
def logit_to_cdf(logit):
    """Turn a 1-D array of logits into a cumulative distribution with a leading 0.

    The logits are exponentiated after subtracting their maximum, accumulated,
    and scaled so the largest cumulative value is 1. The result has one more
    entry than the input: entries ``k`` and ``k + 1`` bracket symbol ``k``.
    """
    unnormalised = np.exp(logit - logit.max())
    running_total = np.cumsum(unnormalised)
    running_total /= running_total.max()
    return np.concatenate((np.zeros(1), running_total))

def logit_array_to_cdf(logit, axis, epsilon=1e-9):
    """Batched logits -> cumulative distribution along ``axis``, with a leading 0.

    Args:
        logit: Tensor of logits; a NumPy array is converted to a tensor first.
        axis: Dimension that enumerates the symbols.
        epsilon: Constant added to every normalised probability before
            accumulating, so that no symbol ends up with a zero-width interval.

    Returns:
        Tensor shaped like ``logit`` but one entry longer along ``axis``. Along
        that dimension it starts at 0 and ends at 1.
    """
    scores = torch.tensor(logit) if isinstance(logit, np.ndarray) else logit

    # Subtracting the per-slice maximum keeps exp() from overflowing.
    peak = scores.max(dim=axis, keepdim=True).values
    weights = torch.exp(scores - peak)
    weights = weights / weights.sum(dim=axis, keepdim=True) + epsilon

    # Adding epsilon pushes the total slightly above 1, so rescale by the
    # largest cumulative value to make the last entry exactly 1.
    running = torch.cumsum(weights, dim=axis)
    running = running / running.max(dim=axis, keepdim=True).values

    # Prepend a slice of zeros along `axis` so symbol k owns [cdf[k], cdf[k + 1]].
    zero_shape = list(running.shape)
    zero_shape[axis] = 1
    leading_zero = torch.zeros(zero_shape).to(running.device)
    return torch.cat((leading_zero, running), dim=axis)

def get_intervals(logits, symbols, epsilon=1e-9):
    """Look up the CDF interval that each observed symbol occupies.

    Args:
        logits: Array/tensor of shape ``(..., vocab)`` holding one row of
            logits per prediction.
        symbols: Integer symbol ids, one per row of ``logits`` (any shape with
            the matching number of elements).
        epsilon: Forwarded to ``logit_array_to_cdf``.

    Returns:
        CPU tensor of shape ``logits.shape[:-1] + (2,)`` in the default float
        dtype; the last dimension holds ``[lower_bound, upper_bound]``.
    """
    original_shape = tuple(logits.shape)
    logits = logits.reshape(-1, original_shape[-1])

    cdf = logit_array_to_cdf(logits, axis=1, epsilon=epsilon)

    # Gather both bounds for all symbols in one shot. The previous per-symbol
    # Python loop called `.item()` twice per token, which forces a host/device
    # synchronisation on every call when the CDF lives on the GPU.
    index = torch.as_tensor(symbols, device=cdf.device).reshape(-1, 1).long()
    lower_bound = cdf.gather(1, index)
    upper_bound = cdf.gather(1, index + 1)
    intervals_tensor = torch.cat((lower_bound, upper_bound), dim=1)

    # Same placement and dtype as building the tensor from Python floats did.
    intervals_tensor = intervals_tensor.to(device="cpu", dtype=torch.get_default_dtype())
    return intervals_tensor.reshape(original_shape[:-1] + (2,))

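Adding an epsilon of $10^{-7}$ to every probability means no interval can be narrower than about $10^{-7}$, which drops our probability precision from 32 bits to just $-\log_2(10^{-7}) \approx 23$ bits, but okay.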

In [43]:
# Quick check of the token tensor's shape: (1, number_of_tokens).
encodings.input_ids.shape

torch.Size([1, 258499])

In [34]:
import torch
from tqdm import tqdm

# Walk over the token sequence in overlapping windows and, for the last
# `stride` positions of each window, record the CDF interval of the token that
# actually came next. These intervals are the input to the arithmetic coder.
encodings.input_ids = encodings.input_ids.to(device)

max_length = model.config.n_positions
seq_len = encodings.input_ids.size(1)

# NOTE(review): `nlls` is reset here but never appended to in this cell, so
# this discards the list built by the perplexity loop above.
nlls = []
intervals_list = []

stride = 512
for begin_loc in tqdm(range(0, seq_len, stride)):
    end_loc = min(begin_loc + max_length, seq_len)
    if end_loc + 1 >= seq_len:
        break # Let's just throw away the tail. It's easier than dealing with the annoying tail.
    input_ids = encodings.input_ids[:, begin_loc:end_loc]
    with torch.no_grad():
        outputs = model(input_ids)

    # Keep the predictions made at the last `stride` positions of the window.
    logits = outputs.logits[0, -stride:]
    # The prediction made at absolute position p is for token p + 1, hence the
    # symbol slice is the logit slice shifted right by one.
    # NOTE(review): tokens before index `max_length + 1 - stride` are therefore
    # never given an interval (they only ever appear as context).
    symbols = encodings.input_ids[:, end_loc+1-stride:end_loc+1]
    intervals_list.append(get_intervals(logits, symbols, epsilon=1e-7))

100%|███████████████████████████████████████████████████████████████████████████████▋| 503/505 [01:10<00:00,  7.17it/s]


In [35]:
# Stack the per-window interval tensors into one (num_tokens, 2) tensor and
# show the narrowest interval as a precision sanity check.
intervals = torch.cat(intervals_list, dim=0)
print((intervals[:, 1] - intervals[:, 0]).min())

tensor(5.9605e-08)


In [36]:
# Mean NLL in nats per token, recomputed from the interval widths.
print(torch.log(intervals[:, 1] - intervals[:, 0]).sum().neg() / len(intervals))

tensor(3.5862)


## Now let's try Arithmetic coding.


In [37]:
import sys
import os
# Make the sibling `../arithmetic_coding` directory importable. The path is
# resolved relative to the current working directory, so the notebook must be
# run from its own folder for this to work.
current_dir = os.getcwd()  # Gets the current working directory
target_dir = os.path.join(current_dir, '..', 'arithmetic_coding')
absolute_target_dir = os.path.abspath(target_dir)
# Guard against adding the same entry again when the cell is re-run.
if absolute_target_dir not in sys.path:
    sys.path.append(absolute_target_dir)
from arithmetic_coding import ArithmeticCode

In [39]:
# NOTE(review): 32 is presumably the coder's working precision in bits —
# confirm against the `arithmetic_coding` module.
ae = ArithmeticCode(32)
# Encode the whole sequence of per-token [lower, upper) intervals into one code.
wiki_arithmetic_code = ae.encode_intervals(intervals)

In [52]:
# Each row of `intervals` is one encoded token, so `shape[0]` counts tokens
# (the label used to say "characters").
print(f"Originally {intervals.shape[0]} tokens.")
# Sum of -log2(interval width) is the information content of the sequence in bits.
print(f"The arithmetic interval has length {-torch.log2(intervals[:, 1] - intervals[:, 0]).sum():.2f} bits.")
# 2**23 bits = 2**20 bytes.
print(f"File compressed to {len(wiki_arithmetic_code)} bits, which is {len(wiki_arithmetic_code)/2**23:.2f} MB.")
print(f"Bit rate = {len(wiki_arithmetic_code)/intervals.shape[0]:.2f} bit/token")
# This line divides by the number of characters, so its unit is bit/char
# (the label used to say "bit/token").
# NOTE(review): the denominator is the character count of the full split, while
# only the tokens that received an interval were encoded, so this slightly
# underestimates the true bit/char rate.
print(f"         = {len(wiki_arithmetic_code)/len(dataset[dataset_split]):.2f} bit/char")

Originally 257536 characters.
The arithmetic interval has length 1332440.25 bits.
File compressed to 1332440 bits, which is 0.16 MB.
Bit rate = 5.17 bit/token
         = 1.10 bit/token


In [51]:
# Save the code in human-readable form.
# NOTE(review): `.bin` is presumably the code rendered as a string of '0'/'1'
# characters — confirm against the type returned by `encode_intervals`.
with open('wiki_arithmetic_code.txt', 'w', encoding="utf8") as f:
    f.write(wiki_arithmetic_code.bin)

In [50]:
# Save the code as raw bytes; this is the actual compressed file.
with open('wiki_arithmetic_code.bin','wb') as f:
    wiki_arithmetic_code.tofile(f)