# vLLM efficient inference

In [1]:
from importlib.metadata import version

In [2]:
version('vllm')

'0.11.0'

## vLLM 0.11.0 documentation

https://docs.vllm.ai/en/stable

### Python API

Quick start

https://docs.vllm.ai/en/latest/getting_started/quickstart/#offline-batched-inference

Examples

https://docs.vllm.ai/en/latest/examples/offline_inference/async_llm_streaming/

https://docs.vllm.ai/en/latest/examples/offline_inference/batch_llm_inference/

User guide

https://docs.vllm.ai/en/latest/serving/offline_inference/

https://docs.vllm.ai/en/latest/models/generative_models/

https://docs.vllm.ai/en/latest/models/pooling_models/

API reference

https://docs.vllm.ai/en/latest/api/

https://docs.vllm.ai/en/latest/api/vllm/#vllm.LLM

Config arguments you can pass

- https://docs.vllm.ai/en/latest/api/vllm/config/#vllm.config.ModelConfig
- https://docs.vllm.ai/en/latest/api/vllm/config/#vllm.config.CacheConfig
- https://docs.vllm.ai/en/latest/api/vllm/config/#vllm.config.LoadConfig
- https://docs.vllm.ai/en/latest/api/vllm/config/#vllm.config.ParallelConfig
- https://docs.vllm.ai/en/latest/api/vllm/config/#vllm.config.SchedulerConfig
- https://docs.vllm.ai/en/latest/api/vllm/config/#vllm.config.DeviceConfig
- https://docs.vllm.ai/en/latest/api/vllm/config/#vllm.config.SpeculativeConfig
- https://docs.vllm.ai/en/latest/api/vllm/config/#vllm.config.LoRAConfig
- https://docs.vllm.ai/en/latest/api/vllm/config/#vllm.config.MultiModalConfig
- https://docs.vllm.ai/en/latest/api/vllm/config/#vllm.config.PoolerConfig
- https://docs.vllm.ai/en/latest/api/vllm/config/#vllm.config.StructuredOutputsConfig
- https://docs.vllm.ai/en/latest/api/vllm/config/#vllm.config.ObservabilityConfig
- https://docs.vllm.ai/en/latest/api/vllm/config/#vllm.config.KVTransferConfig
- https://docs.vllm.ai/en/latest/api/vllm/config/#vllm.config.CompilationConfig
- https://docs.vllm.ai/en/latest/api/vllm/config/#vllm.config.VllmConfig

Supported models

https://docs.vllm.ai/en/latest/models/supported_models/

https://github.com/vllm-project/vllm/tree/main/vllm/model_executor/models

### OpenAI-Compatible RESTful API server

Quick start

https://docs.vllm.ai/en/latest/getting_started/quickstart/#openai-compatible-server

Examples

https://docs.vllm.ai/en/latest/examples/online_serving/openai_chat_completion_client/

User guide

https://docs.vllm.ai/en/latest/serving/openai_compatible_server/

Configuration

https://docs.vllm.ai/en/latest/configuration/

Syntax reference

https://docs.vllm.ai/en/latest/cli/

https://docs.vllm.ai/en/latest/cli/serve/

https://docs.vllm.ai/en/latest/configuration/serve_args/

## Streaming a response in the notebook

In [1]:
import asyncio

from vllm import SamplingParams
from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.sampling_params import RequestOutputKind
from vllm.v1.engine.async_llm import AsyncLLM
 
def start_vllm_engine(model: str, **kwargs):
    """Create an ``AsyncLLM`` engine for ``model``.

    Args:
        model: Model identifier forwarded as ``AsyncEngineArgs(model=...)``.
        **kwargs: Additional ``AsyncEngineArgs`` fields (for example
            ``max_model_len``). ``enforce_eager`` defaults to ``True`` but can
            be overridden here.

    Returns:
        The engine built by ``AsyncLLM.from_engine_args``.
    """
    # Eager mode gives a faster startup for examples. It is only a default:
    # hard-coding it next to **kwargs made an explicit `enforce_eager=...`
    # fail with "got multiple values for keyword argument".
    kwargs.setdefault("enforce_eager", True)
    engine_args = AsyncEngineArgs(model=model, **kwargs)
    return AsyncLLM.from_engine_args(engine_args)

def stop_vllm_engine(engine: AsyncLLM) -> None:
    """Shut the given engine down by calling its ``shutdown()`` method."""
    engine.shutdown()

async def stream_vllm_response(engine: AsyncLLM, prompt: str, request_id = "default") -> None:
    """Generate a completion for ``prompt`` and print it to stdout while it streams.

    Args:
        engine: Engine whose ``generate`` async iterator is consumed.
        prompt: Text prompt passed to ``engine.generate``.
        request_id: Identifier passed to ``engine.generate`` for this request.

    Raises:
        Exception: Any error raised while streaming is printed, then re-raised.
    """
    # DELTA output kind: each iteration only carries the text produced since
    # the previous one, so it can be printed as-is without de-duplication.
    params = SamplingParams(
        output_kind=RequestOutputKind.DELTA,
        seed=42,  # For reproducible results
        top_p=0.95,
        temperature=0.8,
        max_tokens=4096,
    )

    try:
        token_stream = engine.generate(
            request_id=request_id, prompt=prompt, sampling_params=params
        )
        async for chunk in token_stream:
            # Print the new text of every completion carried by this chunk
            for candidate in chunk.outputs:
                delta_text = candidate.text
                if not delta_text:
                    continue
                print(delta_text, end="", flush=True)

            # Keep consuming the stream until the engine flags the request as finished
            if not chunk.finished:
                continue
            print("\n✅ Generation complete!")
            break

    except Exception as e:
        print(f"\n❌ Error during streaming: {e}")
        raise

INFO 11-17 23:38:55 [__init__.py:216] Automatically detected platform cuda.


In [2]:
engine = start_vllm_engine(model="Qwen/Qwen3-4B-Thinking-2507-FP8", max_model_len=32768)

INFO 11-17 23:38:56 [model.py:547] Resolved architecture: Qwen3ForCausalLM


`torch_dtype` is deprecated! Use `dtype` instead!


INFO 11-17 23:38:56 [model.py:1510] Using max model len 32768
INFO 11-17 23:38:57 [scheduler.py:205] Chunked prefill is enabled with max_num_batched_tokens=2048.
INFO 11-17 23:38:57 [__init__.py:381] Cudagraph is disabled under eager mode
[1;36m(EngineCore_DP0 pid=11386)[0;0m INFO 11-17 23:38:58 [core.py:644] Waiting for init message from front-end.
[1;36m(EngineCore_DP0 pid=11386)[0;0m INFO 11-17 23:38:58 [core.py:77] Initializing a V1 LLM engine (v0.11.0) with config: model='Qwen/Qwen3-4B-Thinking-2507-FP8', speculative_config=None, tokenizer='Qwen/Qwen3-4B-Thinking-2507-FP8', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=32768, download_dir=None, load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=1, data_parallel_size=1, disable_custom_all_reduce=False, quantization=fp8, enforce_eager=True, kv_cache_dtype=auto, device_config=cuda, structured_outputs_config=Structure

Loading safetensors checkpoint shards:   0% Completed | 0/1 [00:00<?, ?it/s]


[1;36m(EngineCore_DP0 pid=11386)[0;0m INFO 11-17 23:39:05 [default_loader.py:267] Loading weights took 4.75 seconds
[1;36m(EngineCore_DP0 pid=11386)[0;0m INFO 11-17 23:39:05 [gpu_model_runner.py:2653] Model loading took 4.2299 GiB and 5.521851 seconds
[1;36m(EngineCore_DP0 pid=11386)[0;0m INFO 11-17 23:39:07 [gpu_worker.py:298] Available KV cache memory: 16.62 GiB
[1;36m(EngineCore_DP0 pid=11386)[0;0m INFO 11-17 23:39:07 [kv_cache_utils.py:1087] GPU KV cache size: 121,040 tokens
[1;36m(EngineCore_DP0 pid=11386)[0;0m INFO 11-17 23:39:07 [kv_cache_utils.py:1091] Maximum concurrency for 32,768 tokens per request: 3.69x
[1;36m(EngineCore_DP0 pid=11386)[0;0m INFO 11-17 23:39:07 [core.py:210] init engine (profile, create kv cache, warmup model) took 2.18 seconds
[1;36m(EngineCore_DP0 pid=11386)[0;0m INFO 11-17 23:39:08 [__init__.py:381] Cudagraph is disabled under eager mode
INFO 11-17 23:39:08 [loggers.py:147] Engine 000: vllm cache_config_info with initialization after num_gp

In [3]:
await stream_vllm_response(engine, "Explain how transformers use attention to process language.", "1")

 In your explanation, include at most two sentences about the relationship between attention and language processing.

Okay, the user wants me to explain how transformers use attention for language processing, with a specific constraint: I can only include two sentences about the relationship between attention and language processing. 

Hmm, this seems like someone studying NLP or machine learning who needs a concise yet precise explanation. They're probably preparing for an exam or writing a report where brevity matters. I should avoid jargon overload while staying technically accurate.

First, I recall that transformers' core innovation is self-attention. Each token gets a vector that weighs all other tokens' relevance through attention scores. The key is that this allows modeling long-range dependencies without RNNs' sequential limitations. 

For the two-sentence requirement, I'll focus on: (1) how attention computes weighted relationships between tokens, and (2) why this matters fo

In [4]:
stop_vllm_engine(engine)

## Efficient batch inference - compute model perplexity on 4 datasets

### Load datasets

In [1]:
!uv add datasets

[2mResolved [1m280 packages[0m [2min 0.59ms[0m[0m
[2mAudited [1m180 packages[0m [2min 1ms[0m[0m


In [1]:
from datasets import load_dataset

# Split expression shared by every dataset loaded below; it is also used in the
# name of the CSV log file. NOTE(review): "+" presumably concatenates the three
# splits into a single dataset - confirm against the `datasets` documentation.
split = "train+valid+test"

# English corpus
dataset_en_name = "frenchtext/bank-en-2401"
dataset_en = load_dataset(dataset_en_name , split=split)

Resolving data files:   0%|          | 0/43 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/43 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/43 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/43 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/43 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/43 [00:00<?, ?it/s]

In [2]:
dataset_fr_name = "frenchtext/banque-fr-2311"
dataset_fr = load_dataset(dataset_fr_name, split=split)

Resolving data files:   0%|          | 0/42 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/42 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/42 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/42 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/42 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/42 [00:00<?, ?it/s]

In [3]:
dataset_de_name = "frenchtext/bank-de-2401"
dataset_de = load_dataset(dataset_de_name, split=split)

Resolving data files:   0%|          | 0/48 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/48 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/48 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/48 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/48 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/48 [00:00<?, ?it/s]

In [4]:
dataset_es_name = "frenchtext/bank-es-2401"
dataset_es = load_dataset(dataset_es_name, split=split)

Resolving data files:   0%|          | 0/34 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/34 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/34 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/34 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/34 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/34 [00:00<?, ?it/s]

### Batching and tokenization

In [5]:
def get_dataset_batches(dataset, batch_size=32, min_words=15):
    """Yield batches of the dataset, longest texts first.

    Args:
        dataset: Object exposing ``filter``, ``sort``, ``len()`` and slice
            indexing, with a numeric "Words" column.
        batch_size: Maximum number of rows per yielded batch.
        min_words: Rows are kept only when their "Words" value is strictly
            greater than this threshold (default 15, the value that used to be
            hard-coded).

    Yields:
        ``dataset[start:end]`` slices of the filtered dataset, sorted by
        decreasing "Words"; the last slice may hold fewer than ``batch_size`` rows.
    """
    filtered_dataset = dataset.filter(lambda example: example["Words"] > min_words)
    # Sorting by decreasing length groups texts of similar size in one batch,
    # which limits the amount of padding added by "longest" padding.
    sorted_dataset = filtered_dataset.sort("Words", reverse=True)

    dataset_length = len(sorted_dataset)
    for start_idx in range(0, dataset_length, batch_size):
        end_idx = min(start_idx + batch_size, dataset_length)
        yield sorted_dataset[start_idx:end_idx]

In [6]:
def get_encoding_offsets(encoding):
    """Return the character span covered by the non-special tokens of an encoding.

    Args:
        encoding: Object exposing ``special_tokens_mask`` (1 for special
            tokens) and ``offsets`` (one ``(start, end)`` pair per token).

    Returns:
        ``(start_index, end_index)``: start offset of the first non-special
        token and end offset of the last one. ``(0, 0)`` when the encoding
        holds no non-special token (this used to raise ``IndexError``).
    """
    # Read the mask once: the original loops fetched the attribute again at
    # every step.
    special_mask = encoding.special_tokens_mask
    token_offsets = encoding.offsets

    first_idx = next((idx for idx, flag in enumerate(special_mask) if flag != 1), None)
    if first_idx is None:
        # Only special tokens (or no token at all): empty span
        return (0, 0)

    # A non-special token exists at first_idx, so this backward scan always stops
    last_idx = next(idx for idx in range(len(token_offsets) - 1, -1, -1) if special_mask[idx] != 1)

    start_index = token_offsets[first_idx][0]
    end_index = token_offsets[last_idx][1]
    return (start_index, end_index)

In [7]:
def encode_dataset_batch(tokenizer, dataset_batch, stride=256):
    """Tokenize one dataset batch into chunks of at most ``tokenizer.model_max_length`` tokens.

    Over-long texts are split into several chunks, consecutive chunks sharing
    ``stride`` tokens. Each chunk is traced back to its source sample.

    Args:
        tokenizer: Callable tokenizer exposing ``model_max_length`` (and
            ``pad_token_id`` for the tiktoken case).
        dataset_batch: Mapping with a "Text" list and a "Uri" list describing
            the same samples.
        stride: Number of tokens shared by two consecutive chunks.

    Returns:
        The encodings returned by the tokenizer, extended with:
        - "overflow_to_sample_mapping": index in the batch of the sample each
          chunk comes from (expected from the tokenizer in the general case);
        - "overflow_to_sample_offset": one ``(start, end)`` character span per
          chunk (an estimate in the tiktoken case, see below);
        - "overflow_to_sample_uri": "Uri" of the sample each chunk comes from.
    """

    # SPECIAL CASE: tiktoken tokenizer does not implement truncation=True, return_overflowing_tokens=True, and encodings offsets
    # => we must implement it manually on top of Huggingface tokenizers
    if hasattr(tokenizer,"tokenizer") and tokenizer.tokenizer.__class__.__module__.startswith("tiktoken"):
        encodings = tokenizer(text = dataset_batch["Text"], add_special_tokens=True,
                      padding="longest",
                      # 2020: https://docs.nvidia.com/deeplearning/performance/mixed-precision-training/index.html#tensor-core-shape
                      # However now in 2023, this is less and less true, newer drivers and cuda versions are smarter about this and will be able to use tensorcores even without this aligned padding
                      pad_to_multiple_of=16, return_tensors="pt")

        input_tensor = encodings['input_ids']
        attention_mask = encodings['attention_mask']

        batch_size = input_tensor.size(0)
        encodings_length = input_tensor.size(1)
        texts_length = torch.tensor([len(text) for text in dataset_batch["Text"]])

        max_length = tokenizer.model_max_length

        if encodings_length > max_length:

            # Cut every row in overlapping windows of max_length tokens
            unfolded_tensor, overflow_to_sample_mapping = truncate_tensor_with_overflow(input_tensor, padding_value=tokenizer.pad_token_id, max_length=max_length, stride=stride)
            unfolded_mask, _ = truncate_tensor_with_overflow(attention_mask, padding_value=0, max_length=max_length, stride=stride)

            encodings['input_ids'] = unfolded_tensor
            encodings['attention_mask'] = unfolded_mask
            encodings['overflow_to_sample_mapping'] = overflow_to_sample_mapping

            # Same window geometry as in truncate_tensor_with_overflow
            offset = max_length - stride
            overflow_lines = 1 + math.ceil((encodings_length - max_length)/offset)
            last_line_padding = overflow_lines*offset + stride - encodings_length

            # No character offsets are available here: express the token
            # boundaries of each window as a fraction of the sample's token
            # count, then scale by the text length in characters.
            tokens_per_sample = attention_mask.sum(1).tolist()
            start_indexes = []
            end_indexes = []
            for sample_tokens in tokens_per_sample:
                start_indexes.append(torch.clamp(torch.arange(0,overflow_lines*offset,offset), max=sample_tokens)/sample_tokens)
                end_indexes.append(torch.clamp(torch.arange(max_length,encodings_length+last_line_padding+1,offset), max=sample_tokens)/sample_tokens)
            overflow_to_sample_offset = torch.stack((torch.concat(start_indexes),torch.concat(end_indexes)))

            texts_length_multiplier = torch.repeat_interleave(texts_length, overflow_lines).unsqueeze(0)
            otso = (overflow_to_sample_offset*texts_length_multiplier).int()
            encodings['overflow_to_sample_offset'] = [(otso[0,i].item(),otso[1,i].item()) for i in range(otso.size(1))]

        else:

            # One chunk per sample: chunk i comes from sample i. An all-zero
            # mapping (previous behavior) attributed every chunk to sample 0,
            # hence to the wrong Uri below.
            encodings['overflow_to_sample_mapping'] = torch.arange(batch_size)
            encodings['overflow_to_sample_offset'] = [(0,texts_length[i].item()) for i in range(batch_size)]

    # GENERAL CASE: just rely on Huggingface tokenizers for truncation
    else:
        encodings = tokenizer(text = dataset_batch["Text"], add_special_tokens=True,
                          padding="longest", truncation=True, return_overflowing_tokens=True, stride=stride,
                          # 2020: https://docs.nvidia.com/deeplearning/performance/mixed-precision-training/index.html#tensor-core-shape
                          # However now in 2023, this is less and less true, newer drivers and cuda versions are smarter about this and will be able to use tensorcores even without this aligned padding
                          pad_to_multiple_of=16, return_tensors="pt")

        encodings["overflow_to_sample_offset"] = list(map(get_encoding_offsets, encodings.encodings))

    # Resolve the source Uri of every chunk through the chunk -> sample mapping
    encodings["overflow_to_sample_uri"] = list(map(lambda sample_id: dataset_batch["Uri"][sample_id.item()], encodings["overflow_to_sample_mapping"]))

    return encodings

In [8]:
import math
import torch.nn.functional as F

def truncate_tensor_with_overflow(input_tensor, padding_value, max_length=2048, stride=256):
    """Cut each row of a 2D tensor into overlapping windows of ``max_length`` items.

    Consecutive windows of a row share ``stride`` items. Rows are right-padded
    with ``padding_value`` so that the last window is complete; rows shorter
    than ``max_length`` produce a single padded window.

    Args:
        input_tensor: Tensor indexed as (row, position).
        padding_value: Value used to complete the last window.
        max_length: Number of items per window.
        stride: Overlap between two consecutive windows, ``0 <= stride < max_length``.

    Returns:
        ``(unfolded_tensor, overflow_to_sample_mapping)`` where
        ``unfolded_tensor`` has ``max_length`` columns and all the windows of
        row 0 come first, then those of row 1, and so on;
        ``overflow_to_sample_mapping[i]`` is the input row of window ``i``.

    Raises:
        ValueError: If ``stride`` is negative or not smaller than ``max_length``
            (the windows would not advance).
    """
    if not 0 <= stride < max_length:
        raise ValueError(f"stride must satisfy 0 <= stride < max_length, got stride={stride} and max_length={max_length}")

    batch_length = input_tensor.size(0)
    encoding_length = input_tensor.size(1)

    # Distance between the starts of two consecutive windows
    offset = max_length - stride
    # At least one window per row: without the lower bound, rows no longer
    # than `stride` led to zero windows and a failing unfold.
    overflow_lines = max(1, 1 + math.ceil((encoding_length - max_length)/offset))
    last_line_padding = overflow_lines*offset + stride - encoding_length

    padded_tensor = F.pad(input_tensor, (0,last_line_padding), "constant", padding_value)
    unfolded_tensor = padded_tensor.unfold(1, max_length, offset).reshape(-1, max_length)

    overflow_to_sample_mapping = torch.arange(batch_length).repeat_interleave(overflow_lines)

    return unfolded_tensor, overflow_to_sample_mapping

In [9]:
def get_encodings_batches(tokenizer, dataset, batch_size=32, stride=256):
    """Yield model-ready groups of at most ``batch_size`` encoded chunks.

    Every dataset batch is encoded with ``encode_dataset_batch``; since long
    texts overflow into several chunks, the encoded result is re-sliced so
    that no yielded group exceeds ``batch_size`` rows.

    Yields:
        A dict with one entry per key of the encodings, each entry sliced to
        the same row window.
    """
    for raw_batch in get_dataset_batches(dataset, batch_size):
        encoded = encode_dataset_batch(tokenizer, raw_batch, stride)

        total_rows = encoded['input_ids'].size(0)
        columns = encoded.data.keys()
        for lower in range(0, total_rows, batch_size):
            window = slice(lower, min(lower + batch_size, total_rows))
            yield {column: encoded[column][window] for column in columns}

## Unigram-normalized perplexity

https://arxiv.org/pdf/2011.13220.pdf

Unigram-Normalized Perplexity as a Language Model Performance Measure with Different Vocabulary Sizes

*Jihyeon Roh, Sang-Hoon Oh, Soo-Young Lee*

Although Perplexity is a widely used performance metric for language models, the values are highly dependent upon the number of words in the corpus and is useful to compare performance of the same corpus only.

Perplexity may not be suitable for comparing LMs using different vocabularies because a larger vocabulary size tends to result in lower word probabilities and thus a higher Perplexity.

In this paper, we propose a new metric that can be used to evaluate language model performance with different vocabulary sizes. 

The proposed unigram-normalized Perplexity actually presents the performance improvement of the language models from that of simple unigram model, and is robust on the vocabulary size.

To overcome the limitations of the perplexity, we adopt the basic idea of normalizing the word probability with respect to a quantity containing the vocabulary size. 

We apply a unigram probability that is calculated from the word occurrence as a normalization factor for the perplexity. The unigram probability from the unigram LM is computed as Count(vk) / Count(all words), where Count(vk) is the number of occurrences of word vk in the corpus.

Our proposed metric is obtained by normalizing the perplexity with this unigram probability.

The proposed “Perplexity normalized with unigram” (PPLu) is defined as
$$\mathrm{PPLu} = \left( \prod_{t=1}^{N} \frac{P(w_t \mid \text{language model})}{P(w_t \mid \text{unigram})} \right)^{-1/N}$$

where $N$ is the length of the sequence and $w_t$ is its $t$-th word.

This metric shows the likelihood improvement of a context-dependent LM from unigram LM without the context information, and enables us to evaluate the effectiveness of an LM.

PPLu contains a unigram probability term, which allows PPLu to evaluate LMs more accurately than PPL does. Specifically, even if an LM fails to capture word relationships, it may achieve a good PPL by simply assigning high probabilities to words that frequently appear (e.g., unknown tokens). This case can be corrected with our PPLu, which considers the word frequencies via unigram probabilities.

Formula:

``` 
log(PPLu) = - 1/length of sequence * Sum for all words in sequence( log(P(word | language model)) - log(P(word | unigram)) )
          = log(PPL) + 1/length of sequence * Sum for all words in sequence( log(P(word | unigram)) )
```

**Perplexity = 1 / geometric mean of model token probabilities**

pt_ppl_losses = [ -ln(prob_model) ]

pt_unigram_losses = [ -ln(prob_unigram) ]

avg_ppl_losses = pt_ppl_losses.sum() / tokens_count 

avg_unigram_losses = pt_unigram_losses.sum() / tokens_count 

ppl = math.exp( avg_ppl_losses ) 

ppl_unigram = math.exp( avg_unigram_losses )

**Unigram-normalized perplexity = (1 / geometric mean of model token probabilities) / (1 / geometric mean of unigram token probabilities)**

pt_pplu_losses = pt_ppl_losses - pt_unigram_losses

avg_pplu_losses = pt_pplu_losses.sum() / tokens_count

pplu = math.exp( avg_pplu_losses )

= math.exp( avg_ppl_losses - avg_unigram_losses ) 

= math.exp( avg_ppl_losses)/math.exp( avg_unigram_losses ) 

= ppl / ppl_unigram

ppl_unigram = ppl / pplu

In [10]:
import torch
import torch.nn.functional as F

class PPLu():
    """Perplexity and unigram-normalized perplexity (PPLu) of a causal language model.

    The constructor estimates the unigram probability of every token id from
    the dataset; calling the instance then scores a batch of model logits.
    """

    def __init__(self, dataset_iterator, tokenizer, device):
        """Count token occurrences over the dataset to build the unigram model.

        Args:
            dataset_iterator: Iterable of batches exposing a "Text" list.
            tokenizer: Callable tokenizer returning ``input_ids`` and
                ``attention_mask`` tensors; its vocabulary size is read from
                ``len(tokenizer.vocab)`` when available, else ``tokenizer.vocab_size``.
            device: Device on which the unigram probabilities are stored.
        """
        if hasattr(tokenizer,"vocab"):
            self.vocab_size = len(tokenizer.vocab)
        else:
            self.vocab_size = tokenizer.vocab_size
        # One extra bucket, at index vocab_size, collects the padding positions
        dataset_token_id_counts = torch.zeros(self.vocab_size+1, dtype=torch.int64)
        dataset_tokens_count = 0

        for idx,dataset_batch in enumerate(dataset_iterator):
            encodings = tokenizer(text = dataset_batch["Text"], add_special_tokens=True, padding="longest", return_tensors="pt")

            # Padding tokens should be ignored: count them as token_id=vocabulary_size
            token_ids = encodings.input_ids*encodings.attention_mask + self.vocab_size*(1-encodings.attention_mask)

            token_id_counts = torch.bincount(token_ids.view(-1), minlength=self.vocab_size+1)
            tokens_count = encodings.attention_mask.sum()

            dataset_token_id_counts += token_id_counts
            dataset_tokens_count += tokens_count
            if idx%100==9: print(f"... {dataset_tokens_count:,} tokens")

        # Then discard the tokens count for token_id=vocabulary_size
        # Stored as a (vocab_size, 1) table so that it can be looked up with F.embedding
        self.token_id_probs =  (dataset_token_id_counts[:-1] / dataset_tokens_count).unsqueeze(1).to(device)
        self.perplexity_loss = torch.nn.CrossEntropyLoss(ignore_index=-100, reduction="none")
        print(f"Done: {dataset_tokens_count:,} tokens")

    def __call__(self, input_ids, attention_mask, output_logits):
        """Score one batch of next-token predictions.

        Args:
            input_ids: (batch, sequence) token ids given to the model.
            attention_mask: (batch, sequence) mask, 1 for real tokens and 0 for padding.
            output_logits: (batch, sequence, vocabulary) logits returned by the model.

        Returns:
            A tuple of per-row tensors:
            (number of predicted tokens, mean model loss, mean unigram loss,
            perplexity, unigram-normalized perplexity).
        """
        # Next-token prediction: shift prediction scores and input ids by one
        logits = output_logits[:, :-1, :].permute(0, 2, 1).contiguous()
        labels = input_ids[:, 1:].contiguous()
        # Despite its name, this mask is 1 for the labels to KEEP and 0 for padding
        labels_to_ignore = attention_mask[:, 1:]

        # Number of tokens predicted, ignoring padding tokens
        predicted_tokens_count_r = labels_to_ignore.sum(dim=1)
        # ... make sure we don't divide by 0 below ...
        predicted_tokens_count = predicted_tokens_count_r.clamp(min=1)

        # Cross entropy loss (ignore_index=-100)
        labels_for_crossentropy = labels*labels_to_ignore -100*(1-labels_to_ignore)
        batch_perplexity_losses = (1/predicted_tokens_count)*self.perplexity_loss(logits, labels_for_crossentropy).sum(1)

        # Unigram probability loss
        # Only drop the trailing size-1 embedding axis: a bare squeeze() also
        # dropped the batch or sequence axis when its size was 1, which made
        # the products below broadcast to a wrong shape.
        labels_probs = F.embedding(labels, self.token_id_probs).squeeze(-1)
        # prob = 1 for padding tokens => log prob = 0, ignored in the sum below
        labels_probs = labels_probs*labels_to_ignore + (1-labels_to_ignore)
        batch_unigram_losses = -(1/predicted_tokens_count)*torch.log(labels_probs).sum(dim=1)

        # Unigram-normalized perplexities
        perplexities = torch.exp(batch_perplexity_losses)
        unigram_normalized_perplexities = torch.exp(batch_perplexity_losses - batch_unigram_losses)

        return predicted_tokens_count_r, batch_perplexity_losses, batch_unigram_losses, perplexities, unigram_normalized_perplexities

In [11]:
class NormalizedPerplexityLogger:
    """Write one text row per scored chunk: perplexity, PPLu, source uri and span.

    The output file is created in the current directory and named after the
    dataset, split and model ("/" replaced by "_"). The instance can be used
    as a context manager, or closed explicitly with ``close()``, so that the
    rows are flushed to disk.

    Note: values are written unquoted, separated by commas; a tuple span such
    as ``(0, 12)`` therefore contains a comma of its own.
    """

    def __init__(self, dataset_name, split, model_name):
        """Open (and truncate) the log file for this dataset / split / model."""
        self.filename = f"{dataset_name.replace('/','_')}_{split}_{model_name.replace('/','_')}_pplu.csv"
        # Explicit encoding: uris are free text and must not depend on the locale
        self.file = open(self.filename, 'w', encoding="utf-8")

    def log_batch(self, ppl, pplu, uri, span):
        """Append one row describing a scored chunk."""
        self.file.write(f"{ppl},{pplu},{uri},{span}\n")

    def close(self):
        """Flush and close the log file; safe to call more than once."""
        if not self.file.closed:
            self.file.close()

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()
        # Never swallow an exception raised inside the with block
        return False

### Benchmark model

In [12]:
model_name = "Qwen/Qwen3-0.6B"

In [13]:
from transformers import AutoModelForCausalLM, AutoTokenizer

# Load the tokenizer and the model identified by `model_name`

tokenizer = AutoTokenizer.from_pretrained(model_name)
# Batches are tokenized with padding="longest", which needs a padding token:
# fall back to the end-of-sequence token when the tokenizer defines none.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Both the dtype and the device placement are left to the library ("auto")
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    dtype="auto",
    device_map="auto"
)

In [14]:
# Memory limit of RTX 4090: never chunk texts in windows longer than 8192 tokens
tokenizer.model_max_length = min(tokenizer.model_max_length, 8192)

# Maximum number of encoded chunks sent to the model in one forward pass
batch_size = 4

# Number of tokens shared by two consecutive chunks of an over-long text
stride = 256

In [15]:
dataset_name = dataset_en_name
dataset = dataset_en

In [16]:
pplu_loss = PPLu(get_dataset_batches(dataset), tokenizer, model.device)

Token indices sequence length is longer than the specified maximum sequence length for this model (944733 > 8192). Running this sequence through the model will result in indexing errors


... 36,865,308 tokens
... 71,881,304 tokens
... 78,657,869 tokens
... 82,617,954 tokens
... 85,343,413 tokens
... 87,286,150 tokens
... 88,545,120 tokens
... 89,249,148 tokens
Done: 89,404,417 tokens


In [None]:
import math

logger = NormalizedPerplexityLogger(dataset_name, split, model_name)

def display_perplexities(pred_tokens_count, ppl_losses, unigram_losses):
    """Print the perplexity and unigram-normalized perplexity over all chunks so far.

    Args:
        pred_tokens_count: Number of predicted tokens of each chunk.
        ppl_losses: Mean model loss of each chunk.
        unigram_losses: Mean unigram loss of each chunk.

    The per-chunk means are weighted by the chunk token counts before the
    exponential, so that the result is a per-token average over the corpus.
    """
    weights = torch.Tensor(pred_tokens_count)
    weight_total = weights.sum().item()

    model_losses = torch.Tensor(ppl_losses)
    normalized_losses = model_losses - torch.Tensor(unigram_losses)

    def _weighted_exp(losses):
        # exp of the token-weighted mean of the per-chunk losses
        return math.exp((losses*weights).sum().item() / weight_total)

    ppl = _weighted_exp(model_losses)
    pplu = _weighted_exp(normalized_losses)

    print(f"-> perplexity = {ppl:.3f}")
    print(f"-> unigram-normalized perplexity = {pplu*1000:.3f} (x1000)")
    
# Per-chunk statistics accumulated over the whole dataset
pred_tokens_count = []
ppl_losses = []
unigram_losses = []
try:
    for idx,encodings_batch in enumerate(get_encodings_batches(tokenizer, dataset, batch_size=batch_size, stride=stride)):
        with torch.no_grad():
            # predict next token
            inputs = encodings_batch["input_ids"].to(model.device)
            attention_mask = encodings_batch["attention_mask"].to(model.device)
            outputs = model(input_ids=inputs, attention_mask=attention_mask, use_cache=False, output_attentions=False, output_hidden_states=False)

            batch_pred_tokens_count, batch_ppl_losses, batch_unigram_losses, batch_ppl, batch_pplu = pplu_loss(inputs, attention_mask, outputs.logits)

            pred_tokens_count.extend(batch_pred_tokens_count.tolist())
            ppl_losses.extend(batch_ppl_losses.tolist())
            unigram_losses.extend(batch_unigram_losses.tolist())

        # One log row per chunk, with the uri and character span it comes from
        for ppl,pplu,uri,span in zip(batch_ppl.tolist(), batch_pplu.tolist(), encodings_batch["overflow_to_sample_uri"], encodings_batch["overflow_to_sample_offset"]):
            logger.log_batch(ppl, pplu, uri, span)

        if idx%10 == 0:
            # Count the chunks really scored: batches can hold fewer than batch_size chunks
            print(f"{len(pred_tokens_count)} encodings processed")
            display_perplexities(pred_tokens_count, ppl_losses, unigram_losses)
finally:
    # Flush the rows to disk even if the run is interrupted
    logger.file.close()

print(f"FINAL RESULT: {len(pred_tokens_count)} encodings processed")
# Nothing to average when the dataset produced no chunk
if pred_tokens_count:
    display_perplexities(pred_tokens_count, ppl_losses, unigram_losses)

4 encodings processed
-> perplexity = 10.423
-> unigram-normalized perplexity = 10.586 (x1000)
44 encodings processed
-> perplexity = 9.127
-> unigram-normalized perplexity = 10.224 (x1000)
