In [1]:
# pip install tokenizers safetensors

import os, sys, math, random, textwrap
import numpy as np
from dataclasses import dataclass
from huggingface_hub import hf_hub_download
from transformers import AutoTokenizer
from yaml import safe_load, Loader
from pathlib import Path

sys.path.append(f"{os.environ['TT_METAL_HOME']}/tt-train/sources/ttml")
import ttml

def set_seed(seed: int = 42):
    """Seed Python's ``random`` module and NumPy's global RNG with one value.

    Args:
        seed: Integer handed to both ``random.seed`` and ``np.random.seed``
            (defaults to 42).
    """
    for seed_fn in (random.seed, np.random.seed):
        seed_fn(seed)

# Seed the stdlib and NumPy random generators with the default seed.
set_seed()

# Work from the directory named by TT_METAL_HOME so that the relative
# paths used in later cells (e.g. "tt-train/configs/...") resolve.
os.chdir(os.environ["TT_METAL_HOME"])

# Number of iterations of the token-generation loop further down.
OUTPUT_TOKENS = 256

@dataclass
class TransformerConfig:
    """Plain container of transformer hyperparameter defaults.

    NOTE(review): nothing in the visible cells instantiates this class; the
    model below is configured from the YAML file instead. Confirm whether it
    is still needed.
    """

    n_head: int = 12
    embed_dim: int = 768
    dropout: float = 0.2
    n_blocks: int = 12
    vocab_size: int = 96
    max_seq_len: int = 1024
    runner_type: str = "memory_efficient"
    weight_tying: str = "enabled"

In [2]:
# Tokenizer for the "gpt2" checkpoint.
tokenizer = AutoTokenizer.from_pretrained("gpt2")

# Transformer section of the training config. The path is relative to the
# working directory set in the first cell. The file is opened in a context
# manager so the handle is closed as soon as the YAML has been parsed
# (the previous bare open() was never closed).
with open("tt-train/configs/training_shakespeare_gpt2s.yaml", "r") as config_file:
    transformer_cfg = safe_load(config_file)["training_config"]["transformer_config"]

In [3]:
# Fetch the GPT-2 safetensors file from the Hugging Face Hub; the call
# returns the path of the local copy.
weights_file = hf_hub_download(repo_id="gpt2", filename="model.safetensors")

# The model loader is given the containing directory, not the file itself.
# Derive it with os.path instead of str.replace, which would also strip the
# substring "model.safetensors" from any other part of the path. The trailing
# separator is kept so the string has the same form as before.
local_path = os.path.join(os.path.dirname(weights_file), "")

In [4]:
def build_causal_mask(T: int) -> ttml.autograd.Tensor:
    """Build a lower-triangular (causal) mask for a sequence of length ``T``.

    The NumPy source array is float32 with shape [1, 1, T, T]: 1.0 where the
    row index is >= the column index, 0.0 elsewhere. It is handed to
    ``Tensor.from_numpy`` with TILE layout and BFLOAT16 as the requested
    data type.

    Args:
        T: Sequence length (side of the square mask).

    Returns:
        The ttml tensor produced by ``Tensor.from_numpy``.
    """
    lower_tri = np.tri(T, T, dtype=np.float32)
    mask_4d = lower_tri.reshape(1, 1, T, T)
    return ttml.autograd.Tensor.from_numpy(
        mask_4d,
        ttml.Layout.TILE,
        ttml.autograd.DataType.BFLOAT16,
    )

def build_logits_mask(vocab_size: int, padded_vocab_size: int) -> ttml.autograd.Tensor:
    """Build a mask that marks the padded tail of the vocabulary axis.

    The NumPy source array is float32 with shape
    [1, 1, 1, padded_vocab_size]: 0 for indices below ``vocab_size`` and 1e4
    for indices from ``vocab_size`` up to ``padded_vocab_size``. It is handed
    to ``Tensor.from_numpy`` with TILE layout and BFLOAT16 as the requested
    data type.

    NOTE(review): presumably the sampling op uses the large values to keep
    the padding ids from being selected; confirm against sample_op.

    Args:
        vocab_size: Number of real token ids.
        padded_vocab_size: Size of the vocabulary axis after padding.

    Returns:
        The ttml tensor produced by ``Tensor.from_numpy``.
    """
    pad_flags = np.zeros(padded_vocab_size, dtype=np.float32)
    pad_flags[vocab_size:] = 1e4
    return ttml.autograd.Tensor.from_numpy(
        pad_flags.reshape(1, 1, 1, padded_vocab_size),
        ttml.Layout.TILE,
        ttml.autograd.DataType.BFLOAT16,
    )

In [5]:
def create_model(cfg, vocab_size: int, seq_len: int):
    """Create a ttml GPT-2 model and load the downloaded safetensors weights.

    Args:
        cfg: Mapping with the keys "num_heads", "embedding_dim", "num_blocks",
            "dropout_prob" and "weight_tying" (the transformer section of the
            YAML training config).
        vocab_size: Vocabulary size given to the model config (the caller
            passes the padded size).
        seq_len: Maximum sequence length given to the model config.

    Returns:
        The model returned by ``ttml.models.gpt2.create_gpt2_model`` after
        ``load_from_safetensors`` has been called on it.

    Note:
        Reads the module-level ``local_path`` (directory holding
        model.safetensors) set by the download cell.
    """
    gcfg = ttml.models.gpt2.GPT2TransformerConfig()
    gcfg.num_heads = cfg["num_heads"]
    gcfg.embedding_dim = cfg["embedding_dim"]
    gcfg.num_blocks = cfg["num_blocks"]
    gcfg.vocab_size = int(vocab_size)
    gcfg.max_sequence_length = seq_len
    gcfg.dropout_prob = cfg["dropout_prob"]

    # Both branches now use the same enum path and member casing. Previously
    # the fallback referenced ttml.models.gpt2.WeightTyingType.DISABLED, which
    # disagreed with the branch that is known to work.
    # NOTE(review): `Disabled` is inferred from the `Enabled` spelling;
    # confirm against the ttml bindings.
    gcfg.weight_tying = (
        ttml.models.WeightTyingType.Enabled
        if cfg["weight_tying"] == "enabled"
        else ttml.models.WeightTyingType.Disabled
    )

    # Other optional settings (runner_type, positional_embedding_type,
    # experimental, ...) are left at their defaults for this minimal demo.

    model = ttml.models.gpt2.create_gpt2_model(gcfg)
    model.load_from_safetensors(Path(local_path))
    return model

vocab_size = tokenizer.vocab_size

# Round the vocabulary size up to the next multiple of 32 (a no-op when it is
# already aligned); per the warning text this is needed for tilizing.
padded_vocab_size = -(-vocab_size // 32) * 32
if padded_vocab_size != vocab_size:
    print(f"Warning: vocab size {vocab_size} is not multiple of 32, padding for tilizing.")

# The model is created with the padded vocabulary size and the maximum
# sequence length from the YAML config.
model = create_model(transformer_cfg, padded_vocab_size, transformer_cfg["max_sequence_length"])
model


Transformer configuration:
    Vocab size: 50272
    Max sequence length: 1024
    Embedding dim: 768
    Num heads: 12
    Dropout probability: 0.2
    Num blocks: 12
    Positional embedding type: Trainable
    Runner type: Default
    Composite layernorm: false
    Weight tying: Enabled
2025-09-30 20:51:40.246 | info     |          Device | Opening user mode device driver (tt_cluster.cpp:188)
2025-09-30 20:51:40.299 | info     |   SiliconDriver | Harvesting mask for chip 0 is 0x80 (NOC0: 0x80, simulated harvesting mask: 0x0). (cluster.cpp:400)
2025-09-30 20:51:40.352 | info     |   SiliconDriver | Opening local chip ids/PCIe ids: {0}/[0] and remote chip ids {} (cluster.cpp:249)
2025-09-30 20:51:40.352 | info     |   SiliconDriver | All devices in cluster running firmware version: 18.10.0 (cluster.cpp:229)
2025-09-30 20:51:40.352 | info     |   SiliconDriver | IOMMU: disabled (cluster.cpp:173)
2025-09-30 20:51:40.352 | info     |   SiliconDriver | KMD version: 2.4.0 (cluster.cpp:176)

<_ttml.models.gpt2.GPT2Transformer at 0x7fc872b7ba30>

In [6]:
# Inference only: switch the model to eval mode and turn gradient mode off.
model.eval()
ttml.autograd.AutoContext.get_instance().set_gradient_mode(ttml.autograd.GradMode.DISABLED)

# A logits mask is only needed when the vocabulary axis was padded.
if padded_vocab_size != vocab_size:
    logits_mask_tensor = build_logits_mask(vocab_size, padded_vocab_size)
else:
    logits_mask_tensor = None

prompt_str = "The difference between cats and dogs is:"
prompt_tokens = tokenizer.encode(prompt_str)

max_seq_len = transformer_cfg["max_sequence_length"]
causal_mask = build_causal_mask(max_seq_len)  # [1,1,max_seq_len,max_seq_len]

# Fixed-size input buffer, reused every step. The loop resets it to 0 before
# each fill, so the initial fill value is never seen by the model.
padded_prompt_tokens = np.full((1, 1, 1, max_seq_len),
                               tokenizer.eos_token_id,
                               dtype=np.uint32)

for _ in range(OUTPUT_TOKENS):
    # Context window: at most the last max_seq_len tokens, written at the
    # start of the buffer. The previous slice arithmetic
    # (start_idx:len(prompt_tokens)-start_idx) produced a target of the wrong
    # length as soon as the sequence outgrew the window.
    window_tokens = prompt_tokens[-max_seq_len:]

    padded_prompt_tokens[0, 0, 0, :] = 0
    padded_prompt_tokens[0, 0, 0, :len(window_tokens)] = window_tokens
    padded_prompt_tensor = ttml.autograd.Tensor.from_numpy(
        padded_prompt_tokens,
        ttml.Layout.ROW_MAJOR,
        ttml.autograd.DataType.UINT32
    )  # [1,1,1,max_seq_len], uint32

    # NOTE(review): presumably one row of logits per input position; confirm.
    logits = model(padded_prompt_tensor, causal_mask)

    # Second argument is 0 (presumably the sampling temperature; confirm),
    # third is a random integer seed drawn from NumPy's global RNG.
    next_token_tensor = ttml.ops.sample.sample_op(
        logits, 0, np.random.randint(low=1_000_000), logits_mask_tensor
    )

    # The prediction for the next token is read at the position of the last
    # token currently in the window.
    next_token_idx = len(window_tokens) - 1
    next_token = int(next_token_tensor.to_numpy().flatten()[next_token_idx])

    output = tokenizer.decode(next_token)

    prompt_tokens.append(next_token)

    print(output, end='', flush=True)

2025-09-30 20:52:21.608 | info     |            Test | Small moreh_layer_norm algorithm is selected. (moreh_layer_norm_program_factory.cpp:168)
 treelyly Bengal Canad Canad Canad Canad Canad Beaver Beaver Beaver Beaver Beaver Dawson Dawson Dawson Dawson Dawson Dawson Dawson Dawson Dawson Sky Sky Sky Sky Sky Sky Sky Bengal Canad Canad Canad Canad Canad Canad Canad Canad Canad Canad Canad Canad Canad Canad Canad Canad Canad Canad Canad Canad CanaddreamGalleryGalleryGalleryGalleryGalleryGalleryGalleryGalleryGalleryGalleryGalleryGalleryGalleryGalleryGallery fun fun fun fun fun fun fun fun fun fun fun fun fun fun fun fun fun fun fun fun fun fun fun fun fun fun fun fun fun fun fun fun fun fun fun fun fun fun fun fun fun fun fun fun fun fun fun fun fun fun fun fun fun fun fun fun fun fun fun fun fun fun fun fun fun fun fun fun fun fun fun fun fun fun fun fun fun fun fun fun fun fun fun fun fun fun fun fun fun fun fun fun fun fun fun fun fun fun fun fun fun fun fun fun fun fun fun fun fun fun 

In [None]:
import torch
from transformers import AutoModelForCausalLM

# Reference run of the same prompt through the Hugging Face PyTorch GPT-2,
# for comparison with the ttml output above.
torch_model = AutoModelForCausalLM.from_pretrained("gpt2")
torch_model.eval()

prompt = "The difference between cats and dogs is:"
# Stored under its own name so the token list built by the ttml generation
# cell is not replaced by a tensor.
torch_prompt_ids = tokenizer.encode(prompt, return_tensors="pt")

outputs = torch_model.generate(
    torch_prompt_ids,
    max_new_tokens=OUTPUT_TOKENS,  # same token budget as the ttml loop
    do_sample=False,  # sampling disabled: decoding is deterministic
    num_beams=1,  # single beam; together with do_sample=False this is greedy decoding
)

generated_text = tokenizer.batch_decode(outputs, skip_special_tokens=True)

print(generated_text)
