In [4]:
import gc
import time

import torch
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoModelForSeq2SeqLM, AutoTokenizer

# GPT-J

In [2]:
# Load GPT-J 6B in half precision; device_map="auto" lets the library decide
# where the weights are placed.
model = AutoModelForCausalLM.from_pretrained("EleutherAI/gpt-j-6B", torch_dtype=torch.float16, device_map="auto")

# GPT-J is a decoder-only model: generation continues from the LAST position of
# each row, so batched prompts must be padded on the left. With right padding
# the model would be asked to continue from pad tokens whenever a batch holds
# prompts of different lengths.
tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-j-6B", padding_side="left")

# Reuse EOS as the padding token so that `padding=True` works in batched
# tokenizer calls.
tokenizer.pad_token = tokenizer.eos_token

2025-06-02 02:48:35.952932: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-06-02 02:48:35.966106: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1748832515.981849 3713085 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1748832515.986606 3713085 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1748832515.998639 3713085 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking 

In [10]:
# Benchmark tokenization and greedy generation for a single Dolly-15k prompt.
dataset = load_dataset("databricks/databricks-dolly-15k", split="train")

# Pick a prompt from the dataset: the instruction, followed by the context on
# its own line when the record has a non-empty context.
example = dataset[0]
if example["context"]:
    prompt = example["instruction"] + "\n" + example["context"]
else:
    prompt = example["instruction"]
print(prompt)

max_new_tokens = 100
# The prompt and the generated tokens share one 2048-token budget, so the
# prompt is truncated early enough to leave room for `max_new_tokens`.
max_prompt_tokens = 2048 - max_new_tokens
# Only synchronize when a CUDA device exists; the call raises otherwise.
use_cuda = torch.cuda.is_available()

# Time tokenization (includes moving the tensors to the model's device).
# perf_counter() is monotonic and high-resolution, unlike time.time().
start = time.perf_counter()
inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=max_prompt_tokens)
inputs = {k: v.to(model.device) for k, v in inputs.items()}
if use_cuda:
    # Drain pending GPU work so it is not billed to the generation timer below.
    torch.cuda.synchronize()
tokenization_time = time.perf_counter() - start
print(f"[Tokenization] {tokenization_time:.3f} seconds")

# Time generation (greedy decoding, no gradients).
start = time.perf_counter()
with torch.no_grad():
    output = model.generate(
        **inputs,
        max_new_tokens=max_new_tokens,
        do_sample=False
    )
if use_cuda:
    # CUDA kernels run asynchronously; wait for them before stopping the clock.
    torch.cuda.synchronize()
generation_time = time.perf_counter() - start
print(f"[Generation] {generation_time:.3f} seconds")

# Decode output (kept in `result` for inspection; not printed by default).
result = tokenizer.decode(output[0], skip_special_tokens=True)

Generating train split: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 15011/15011 [00:00<00:00, 334566.32 examples/s]
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


When did Virgin Australia start operating?
Virgin Australia, the trading name of Virgin Australia Airlines Pty Ltd, is an Australian-based airline. It is the largest airline by fleet size to use the Virgin brand. It commenced services on 31 August 2000 as Virgin Blue, with two aircraft on a single route. It suddenly found itself as a major airline in Australia's domestic market after the collapse of Ansett Australia in September 2001. The airline has since grown to directly serve 32 cities in Australia, from hubs in Brisbane, Melbourne and Sydney.
[Tokenization] 0.002 seconds
[Generation] 3.047 seconds


In [6]:
# Benchmark tokenization and greedy generation for a batch of Dolly-15k prompts.
dataset = load_dataset("databricks/databricks-dolly-15k", split="train")
batch_size = 1
max_new_tokens = 100
# The prompt and the generated tokens share one 2048-token budget, so prompts
# are truncated early enough to leave room for `max_new_tokens`.
max_prompt_tokens = 2048 - max_new_tokens
# Only synchronize when a CUDA device exists; the call raises otherwise.
use_cuda = torch.cuda.is_available()

# Build one prompt per record: the instruction, followed by the context on its
# own line when the record has a non-empty context.
batch = []
for i in range(batch_size):
    example = dataset[i]
    context = example["context"]
    batch.append(example["instruction"] + "\n" + context if context else example["instruction"])

print(f"Running batch inference on {len(batch)} prompts...")

# Time tokenization (includes moving the tensors to the model's device).
# NOTE: with batch_size > 1, `padding=True` pads shorter prompts. A decoder-only
# model needs that padding on the LEFT, so check `tokenizer.padding_side`
# before raising batch_size.
start = time.perf_counter()
inputs = tokenizer(batch, return_tensors="pt", padding=True, truncation=True, max_length=max_prompt_tokens)
inputs = {k: v.to(model.device) for k, v in inputs.items()}
if use_cuda:
    # Drain pending GPU work so it is not billed to the generation timer below.
    torch.cuda.synchronize()
tokenization_time = time.perf_counter() - start
print(f"[Tokenization] {tokenization_time:.3f} seconds")

# Time generation (greedy decoding, no gradients).
start = time.perf_counter()
with torch.no_grad():
    outputs = model.generate(
        input_ids=inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        max_new_tokens=max_new_tokens,
        do_sample=False
    )
if use_cuda:
    # CUDA kernels run asynchronously; wait for them before stopping the clock.
    torch.cuda.synchronize()
generation_time = time.perf_counter() - start
print(f"[Generation] {generation_time:.3f} seconds")


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Running batch inference on 1 prompts...
[Tokenization] 0.001 seconds
[Generation] 3.070 seconds


# Flan-T5

In [5]:
model_id = "google/flan-t5-xl"  # try flan-t5-large or flan-t5-base if you hit OOM

# This notebook reuses the names `model` / `tokenizer` for a second checkpoint.
# Without the reset below, the previous model stays referenced until the new
# one has finished loading, so both are resident at peak. Drop the old
# references first and return cached GPU blocks to the driver.
# NOTE(review): with device_map="auto" a crowded GPU presumably leads to layers
# being placed off-GPU, which would distort the timings below. Confirm.
model = None
tokenizer = None
gc.collect()
if torch.cuda.is_available():
    torch.cuda.empty_cache()

# Load the seq2seq checkpoint in half precision.
model = AutoModelForSeq2SeqLM.from_pretrained(model_id, torch_dtype=torch.float16, device_map="auto")
tokenizer = AutoTokenizer.from_pretrained(model_id)

2025-06-02 02:53:08.867779: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-06-02 02:53:08.880969: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1748832788.896925 3713951 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1748832788.901768 3713951 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1748832788.913907 3713951 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking 

In [10]:
# Dolly-15k train split. The dataset-driven prompt below is switched off for
# this section (a synthetic prompt is used instead) and is kept for reference.
dataset = load_dataset(
    path="databricks/databricks-dolly-15k",
    split="train",
)

# example = dataset[0]
# prompt = example["instruction"]
# if example["context"]:
#     prompt += "\n" + example["context"]


# Alternative short, open-ended prompt. It has its own name so the long prompt
# built below does not silently overwrite it; assign `prompt = short_prompt` to
# benchmark with it instead.
short_prompt = (
    "Write a detailed scientific explanation of how general relativity and quantum field theory could be unified "
    "into a theory of quantum gravity. Cover mathematical foundations, known challenges, and current research directions."
)

# Long synthetic prompt: one sentence repeated `repeat_count` times to scale
# the input length.
# NOTE(review): the tokenizer call further down truncates to max_length=1024,
# so very large repeat counts presumably grow tokenization work rather than
# model input. Confirm.
base_sentence = "Describe the process of star formation in high-resolution astrophysical detail. "
repeat_count = 100  # try 10,000 for real scaling
prompt = base_sentence * repeat_count
print(f"Prompt length: {len(prompt)} characters")

In [11]:
# Benchmark tokenization and greedy generation for the prompt defined above.
# Only synchronize when a CUDA device exists; the call raises otherwise.
use_cuda = torch.cuda.is_available()

# Time tokenization (includes moving the tensors to the model's device).
# perf_counter() is monotonic and high-resolution, unlike time.time().
start = time.perf_counter()
inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=1024)
inputs = {k: v.to(model.device) for k, v in inputs.items()}
if use_cuda:
    # Drain pending GPU work so it is not billed to the generation timer below.
    torch.cuda.synchronize()
tokenization_time = time.perf_counter() - start
print(f"[Tokenization] {tokenization_time:.3f} s")

# Generate long output
start = time.perf_counter()
with torch.no_grad():
    output = model.generate(
        input_ids=inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        max_new_tokens=1024,       # Increase for longer generation time
        do_sample=False            # Optional: set to True for diverse outputs
    )
if use_cuda:
    # CUDA kernels run asynchronously; wait for them before stopping the clock.
    torch.cuda.synchronize()
generation_time = time.perf_counter() - start
print(f"[Generation] {generation_time:.3f} s")
# NOTE(review): this is the length of the returned sequence. For a seq2seq
# model it presumably includes the decoder start token, so the number of newly
# generated tokens may be one less. Confirm.
print(f"Generated {output.shape[-1]} tokens")


[Tokenization] 0.002 s
[Generation] 8.236 s
Generated 316 tokens
