In [1]:
# Point the TT-Metal tooling at the local checkout; this cell runs before `ttml`/`ttnn` are imported.
# NOTE(review): absolute, machine-specific paths — adjust to your own tt-metal location.
%env TT_METAL_HOME=/home/ubuntu/tt-metal
%env TT_METAL_RUNTIME_ROOT=/home/ubuntu/tt-metal

env: TT_METAL_HOME=/home/ubuntu/tt-metal
env: TT_METAL_RUNTIME_ROOT=/home/ubuntu/tt-metal


# Optional verbose-logging switches for the TT stack.
# NOTE(review): no execution output is recorded for these lines, and the ttnn config dump printed
# when `ttml` is imported reports enable_logging=false — confirm whether they take effect.
%env TT_LOGGER_LEVEL=debug
%env TT_LOGGER_TYPES=Op
%env TTNN_ENABLE_LOGGING=True

In [2]:
import ttml
from time import time

2026-01-18 06:20:20.319 | DEBUG    | ttnn:<module>:77 - Initial ttnn.CONFIG:
Config{cache_path=/home/ubuntu/.cache/ttnn,model_cache_path=/home/ubuntu/.cache/ttnn/models,tmp_dir=/tmp/ttnn,enable_model_cache=false,enable_fast_runtime_mode=true,throw_exception_on_fallback=false,enable_logging=false,enable_graph_report=false,enable_detailed_buffer_report=false,enable_detailed_tensor_report=false,enable_comparison_mode=false,comparison_mode_should_raise_exception=false,comparison_mode_pcc=0.9999,root_report_path=generated/ttnn/reports,report_name=std::nullopt,std::nullopt}


In [3]:
from ttnn import Layout, DataType

In [4]:
# Generation settings read (as globals) by the generate_* helpers in this notebook.
OUTPUT_TOKENS = 100   # number of new tokens requested per generation run
WITH_SAMPLING = True  # read by generate_with_pytorch; only has an effect when TEMPERATURE > 0
TEMPERATURE = 0.0     # 0.0 makes generate_with_pytorch take its greedy (argmax) branch
SEED = 42             # passed to torch.manual_seed before the reference model is loaded

# Known pairs of Hugging Face model id -> ttml training YAML.
# Previously these were six consecutive reassignments of model_id/CONFIG, where only the
# last pair took effect; the table keeps every option visible without dead assignments.
MODEL_CONFIGS = {
    "meta-llama/Llama-3.2-1B-Instruct": "training_shakespeare_llama3_2_1B_fixed.yaml",
    "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T": "training_shakespeare_tinyllama.yaml",
    "meta-llama/Llama-3.2-3B-Instruct": "training_shakespeare_llama3_2_3B.yaml",
    # OOM on 12 GB, successfully loaded weights
    "meta-llama/Llama-3.1-8B-Instruct": "training_shakespeare_llama3_8B_tp.yaml",
    "Qwen/Qwen3-1.7B": "training_shakespeare_qwen3_1_7B.yaml",
    "Qwen/Qwen3-4B": "training_shakespeare_qwen3_4B.yaml",
}

# Same effective selection as the old chain of assignments (its last pair).
# A later cell may still override model_id / CONFIG.
model_id = "Qwen/Qwen3-4B"
CONFIG = MODEL_CONFIGS[model_id]

In [5]:
# Final model selection for this run: overrides any model_id / CONFIG pair set in the settings cell.
model_id = "Qwen/Qwen3-0.6B" 
CONFIG = "training_shakespeare_qwen3_0_6B.yaml"

In [6]:
import os, sys, random
import numpy as np  # For numpy arrays
from dataclasses import dataclass # For configuration classes
from huggingface_hub import hf_hub_download # To download safetensors from Hugging Face
from transformers import AutoTokenizer
from yaml import safe_load # To read YAML configs
from pathlib import Path

import ttml
from ttml.common.config import get_training_config, load_config, TransformerConfig
from ttml.common.utils import set_seed, round_up_to_tile
from ttml.common.model_factory import TransformerModelFactory


In [7]:
from tqdm.auto import tqdm

In [8]:
# Sanity check that the selected YAML loads; the returned TrainingConfig is only displayed here
# (the following cell calls get_training_config again and keeps the result).
get_training_config(CONFIG)

<ttml.common.config.TrainingConfig at 0x7f99ac5d9330>

In [9]:
# Load the tokenizer from Hugging Face and the transformer config from YAML
tokenizer = AutoTokenizer.from_pretrained(model_id)
training_config = get_training_config(CONFIG)
# configs_root is set to three directories above the current working directory, so this
# cell depends on where the kernel was started.
model_yaml = load_config(training_config.model_config, configs_root=os.getcwd() + '/../../..')

In [10]:
# Fetch config.json for the selected model and keep the directory part of the returned path
# (with a trailing separator); that directory is later passed to tt_model.load_from_safetensors.
# NOTE(review): only config.json is requested here — presumably the weight files end up in the
# same snapshot directory when AutoModelForCausalLM.from_pretrained runs; confirm.
# NOTE(review): str.replace strips every "config.json" occurrence in the path, not just the
# file name at the end.
safetensors_path = hf_hub_download(repo_id=model_id, filename="config.json")
safetensors_path = safetensors_path.replace("config.json","")

In [11]:
import torch
from transformers import AutoModelForCausalLM
# Seed torch's RNG, then load the Hugging Face reference model, requesting bfloat16 weights.
torch.manual_seed(SEED)
torch_model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16)

In [12]:
# Compare the tokenizer's vocab size with the embedding table's row count and the model's vocab_size.
tokenizer.vocab_size, torch_model.state_dict()['model.embed_tokens.weight'].shape[0], torch_model.vocab_size

(151643, 151936, 151936)

In [13]:
# Number of entries in the reference model's state dict.
len(torch_model.state_dict())

311

In [14]:
# Take the vocab size from the Hugging Face model (not the tokenizer) and use it for the TT model.
orig_vocab_size = torch_model.vocab_size
print(orig_vocab_size)
tt_model_factory = TransformerModelFactory(model_yaml)
# Override whatever vocab_size the YAML specified with the reference model's value.
tt_model_factory.transformer_config.vocab_size = orig_vocab_size

# Configured sequence length; generate_with_tt pads every input to this length.
max_sequence_length = tt_model_factory.transformer_config.max_sequence_length

151936


In [16]:
# Display the parsed model YAML for reference.
model_yaml

{'transformer_config': {'model_type': 'qwen3',
  'num_heads': 16,
  'num_groups': 8,
  'embedding_dim': 1024,
  'head_dim': 128,
  'intermediate_dim': 3072,
  'dropout_prob': 0.0,
  'num_blocks': 28,
  'weight_tying': 'enabled',
  'vocab_size': 151936,
  'max_sequence_length': 2048,
  'runner_type': 'memory_efficient',
  'theta': 1000000.0,
  'rms_norm_eps': 1e-06}}

In [18]:
# Build the TT model from the factory and print the wall-clock construction time in seconds.
start_time = time()
tt_model = tt_model_factory.create_model()
print(f"Model created: {time() - start_time}" )

Qwen3 configuration:
    Vocab size: 151936
    Max sequence length: 2048
    Embedding dim (hidden_size): 1024
    Head dim: 128
    Attention output dim: 2048
    Intermediate dim: 3072
    Num heads: 16
    Num groups (KV heads): 8
    Dropout probability: 0
    Num blocks: 28
    Positional embedding type: RoPE
    Runner type: Memory efficient
    Weight tying: Enabled
    Theta: 1000000
    RMSNorm epsilon: 1e-06
2026-01-18 06:20:28.652 | info     |             UMD | Established firmware bundle version: 19.4.2 (topology_discovery.cpp:368)
2026-01-18 06:20:28.652 | info     |             UMD | Firmware bundle version 19.4.2 on the system is newer than the latest fully tested version 19.4.0 for wormhole_b0 architecture. Newest features may not be supported. (topology_discovery.cpp:394)
2026-01-18 06:20:28.653 | info     |          Device | Opening user mode device driver (tt_cluster.cpp:223)
2026-01-18 06:20:28.656 | info     |             UMD | Established firmware bundle version:

In [21]:
# Load weights into the TT model from the local Hugging Face snapshot directory and print
# the wall-clock load time in seconds.
start_time = time()
tt_model.load_from_safetensors(safetensors_path)
print(f"Model loaded: {time() - start_time}" )

Model loaded: 5.945973634719849


In [22]:
# Vocab size after round_up_to_tile(..., 32) — presumably the next multiple of 32; confirm against
# ttml.common.utils. When it differs from the original, the generate_* helpers build a logits
# mask covering the extra entries.
padded_vocab_size = round_up_to_tile(orig_vocab_size, 32)
if orig_vocab_size != padded_vocab_size:
    print(f"Padding vocab size for tilization: original {orig_vocab_size} -> padded {padded_vocab_size}")

In [23]:
def build_causal_mask(T: int) -> ttml.autograd.Tensor:
    """Return a lower-triangular causal mask shaped [1, 1, T, T].

    Entry (i, j) is 1 when j <= i and 0 otherwise. The mask is built on the
    host as float32 and handed to ttml as a TILE-layout BFLOAT16 tensor.
    """
    lower_triangle = np.tri(T, T, dtype=np.float32)
    mask_4d = lower_triangle.reshape(1, 1, T, T)
    return ttml.autograd.Tensor.from_numpy(mask_4d, Layout.TILE, DataType.BFLOAT16)

def build_logits_mask(vocab_size: int, padded_vocab_size: int) -> ttml.autograd.Tensor:
    """Return a [1, 1, 1, padded_vocab_size] mask flagging the padded vocabulary tail.

    Positions below ``vocab_size`` hold 0 and positions from ``vocab_size``
    onward hold 1e4. The result is a TILE-layout BFLOAT16 tensor.
    NOTE(review): how the sampling op combines this mask with the logits is not
    visible here — confirm against ttml.ops.sample.sample_op.
    """
    tail_flags = np.zeros(padded_vocab_size, dtype=np.float32)
    tail_flags[vocab_size:] = 1e4
    return ttml.autograd.Tensor.from_numpy(
        tail_flags.reshape(1, 1, 1, padded_vocab_size),
        Layout.TILE,
        DataType.BFLOAT16,
    )

In [24]:
TILE_SIZE = 32  # rounding granularity used by the KV-cache helpers


def round_up(value: int) -> int:
    """Return the smallest multiple of TILE_SIZE that is greater than or equal to ``value``."""
    full_tiles, remainder = divmod(value, TILE_SIZE)
    if remainder:
        # A partial tile still occupies a whole one.
        full_tiles += 1
    return full_tiles * TILE_SIZE

def create_causal_mask_kv(query_len: int, prompt_len: int = 0) -> ttml.autograd.Tensor:
    """Build the causal attention mask for ``query_len`` new tokens that follow ``prompt_len`` cached ones.

    Query row ``i`` may attend to columns ``0 .. prompt_len + i`` (all cached
    positions plus the new tokens up to and including itself). Both dimensions
    are padded up to a multiple of TILE_SIZE; padded rows and columns stay 0.

    Args:
        query_len: Number of tokens fed to the model in this step.
        prompt_len: Number of tokens already processed before this step.

    Returns:
        A TILE-layout BFLOAT16 tensor shaped [1, 1, round_up(query_len),
        round_up(prompt_len + query_len)].
    """
    whole_len = prompt_len + query_len
    padded_q = round_up(query_len)
    padded_w = round_up(whole_len)
    mask = np.zeros((padded_q, padded_w), dtype=np.float32)
    # np.tril with k=prompt_len keeps exactly the entries with j <= i + prompt_len, which is
    # what the former per-element Python loops wrote (millions of iterations for a long prefill).
    mask[:query_len, :whole_len] = np.tril(
        np.ones((query_len, whole_len), dtype=np.float32), k=prompt_len
    )
    return ttml.autograd.Tensor.from_numpy(mask.reshape(1, 1, padded_q, padded_w), Layout.TILE, DataType.BFLOAT16)

def tokens_to_tensor_kv(tokens: list) -> ttml.autograd.Tensor:
    """Pack token ids into a zero-padded [1, 1, 1, N] UINT32 row-major tensor.

    ``N`` is ``len(tokens)`` rounded up to a multiple of TILE_SIZE; positions
    past the real tokens are filled with 0.
    """
    n_tokens = len(tokens)
    padded_len = round_up(n_tokens)
    token_buffer = np.zeros((1, 1, 1, padded_len), dtype=np.uint32)
    token_buffer[0, 0, 0, :n_tokens] = tokens
    return ttml.autograd.Tensor.from_numpy(token_buffer, Layout.ROW_MAJOR, DataType.UINT32)

def generate_with_tt_kv_cache(model, prompt_tokens, transformer_config):
    """Generate tokens with the TT model using a KV cache, streaming decoded text to stdout.

    The first step feeds the whole prompt (cache position 0); every later step
    feeds only the most recently generated token. Generation stops after
    OUTPUT_TOKENS tokens or when the configured max_sequence_length would be
    exceeded, whichever comes first. The summary reports the number of tokens
    actually generated.

    Reads the notebook globals OUTPUT_TOKENS, TEMPERATURE, orig_vocab_size,
    padded_vocab_size, tokenizer and tqdm.

    Args:
        model: TT model; called as model(tokens, mask, kv_cache=..., new_tokens=...).
        prompt_tokens: Sequence of prompt token ids. It is copied, not modified.
        transformer_config: Config providing num_blocks, num_groups,
            max_sequence_length, embedding_dim, num_heads and optionally head_dim.

    Returns:
        None. Output is printed.
    """
    # Local module import: at notebook level the name `time` is bound to the function time.time.
    import time

    ttml.autograd.AutoContext.get_instance().set_gradient_mode(ttml.autograd.GradMode.DISABLED)
    model.eval()

    # The mask is only needed when the vocabulary was padded.
    logits_mask_tensor = (
        build_logits_mask(orig_vocab_size, padded_vocab_size)
        if padded_vocab_size != orig_vocab_size
        else None
    )

    # Prefer an explicit head_dim; otherwise derive it from embedding_dim / num_heads.
    head_dim = getattr(transformer_config, 'head_dim', None) or (transformer_config.embedding_dim // transformer_config.num_heads)
    # NOTE(review): the second argument (1) is presumably the batch size — confirm against KvCache.
    kv_cache = ttml.models.KvCache(
        transformer_config.num_blocks, 1, transformer_config.num_groups,
        transformer_config.max_sequence_length, head_dim
    )
    kv_cache.reset()

    generated = list(prompt_tokens)
    # Never generate past the model's context window.
    max_new_tokens = max(0, min(OUTPUT_TOKENS, transformer_config.max_sequence_length - len(generated)))
    num_generated = 0
    print("************************************")
    start_time = time.time()

    for _ in tqdm(range(max_new_tokens)):
        if kv_cache.get_cache_position() == 0:
            # Prefill: nothing cached yet, so feed everything we have.
            input_tokens = generated
            processed = 0
        else:
            # Decode: earlier positions are cached; feed only the newest token.
            input_tokens = [generated[-1]]
            processed = len(generated) - 1

        token_tensor = tokens_to_tensor_kv(input_tokens)
        mask = create_causal_mask_kv(len(input_tokens), processed)
        logits = model(token_tensor, mask, kv_cache=kv_cache, new_tokens=len(input_tokens))

        # NOTE(review): the third argument is a fresh random integer on every step; numpy's RNG
        # is not seeded in the visible cells, so it differs between runs.
        next_token_tensor = ttml.ops.sample.sample_op(logits, TEMPERATURE, np.random.randint(low=1e7), logits_mask_tensor)
        # The prediction for the next token sits at the last real (non-padded) input position.
        next_token = int(next_token_tensor.to_numpy().flatten()[len(input_tokens) - 1])
        generated.append(next_token)
        num_generated += 1
        print(tokenizer.decode([next_token]), end='', flush=True)

    elapsed_time = time.time() - start_time
    # Report what was actually produced; the loop may have been capped by the context window.
    tokens_per_second = num_generated / elapsed_time if elapsed_time > 0 else float("inf")
    print("\n************************************")
    print(f"Generated {num_generated} tokens in {elapsed_time:.2f} seconds")
    print(f"Performance: {tokens_per_second:.2f} tokens/second")
    print("************************************\n")


`generate_with_tt()` generates output from the chosen LLM on TT hardware without a KV cache: every step re-runs the model over the whole sequence, padded to `max_sequence_length`.

In [25]:
def generate_with_tt(model, prompt_tokens):
    """Generate OUTPUT_TOKENS tokens with the TT model without a KV cache, streaming text to stdout.

    Every step runs the model over a buffer of max_sequence_length token ids
    (the most recent tokens, zero-padded on the right) and reads the sampled
    token at the last real position.

    Reads the notebook globals OUTPUT_TOKENS, TEMPERATURE, max_sequence_length,
    orig_vocab_size, padded_vocab_size, tokenizer and tqdm.

    Args:
        model: TT model; called as model(tokens, causal_mask).
        prompt_tokens: Sequence of prompt token ids. It is copied, so the
            caller's list is not extended by the generated tokens.

    Returns:
        None. Output is printed.
    """
    # Local module import: at notebook level the name `time` is bound to the function time.time.
    import time

    ttml.autograd.AutoContext.get_instance().set_gradient_mode(ttml.autograd.GradMode.DISABLED)
    model.eval()

    # The mask is only needed when the vocabulary was padded.
    logits_mask_tensor = None
    if padded_vocab_size != orig_vocab_size:
        logits_mask_tensor = build_logits_mask(orig_vocab_size, padded_vocab_size)

    # Private copy of plain ints: generation must not mutate the caller's list
    # (generate_with_tt_kv_cache copies its prompt as well).
    tokens = [int(token) for token in prompt_tokens]

    padded_prompt_tokens = np.zeros((1, 1, 1, max_sequence_length), dtype=np.uint32)

    print("************************************")
    start_time = time.time()
    # Built once and reused for every step: [1,1,max_seq_len,max_seq_len], bfloat16.
    causal_mask = build_causal_mask(max_sequence_length)

    for _ in tqdm(range(OUTPUT_TOKENS)):
        # Keep only the most recent max_sequence_length tokens once the context is full.
        window = tokens[-max_sequence_length:]
        padded_prompt_tokens[0, 0, 0, :len(window)] = window
        padded_prompt_tensor = ttml.autograd.Tensor.from_numpy(
            padded_prompt_tokens,
            Layout.ROW_MAJOR,
            DataType.UINT32)  # [1,1,1, max_seq_len], uint32

        logits = model(padded_prompt_tensor, causal_mask)  # out=[1,1,seq_len, vocab_size], bf16

        # NOTE(review): the third argument is a fresh random integer on every step; numpy's RNG
        # is not seeded in the visible cells, so it differs between runs.
        next_token_tensor = ttml.ops.sample.sample_op(logits, TEMPERATURE, np.random.randint(low=1e7), logits_mask_tensor)  # out=[1,1,seq_len,1], uint32

        # The prediction for the next token sits at the last real (non-padded) position.
        next_token = int(next_token_tensor.to_numpy().flatten()[len(window) - 1])

        tokens.append(next_token)
        print(tokenizer.decode(next_token), end='', flush=True)

    elapsed_time = time.time() - start_time
    tokens_per_second = OUTPUT_TOKENS / elapsed_time

    print("\n************************************")
    print(f"Generated {OUTPUT_TOKENS} tokens in {elapsed_time:.2f} seconds")
    print(f"Performance: {tokens_per_second:.2f} tokens/second")
    print("************************************\n\n")

In [26]:
def generate_with_pytorch(torch_model, prompt_tokens):
    """Generate OUTPUT_TOKENS tokens with the Hugging Face model, streaming decoded text to stdout.

    Uses a DynamicCache so that, after the first step, only the newly chosen
    token is fed back into the model. Tokens are sampled from the
    temperature-scaled softmax when WITH_SAMPLING is set and TEMPERATURE > 0;
    otherwise the argmax token is taken.

    Reads the notebook globals OUTPUT_TOKENS, WITH_SAMPLING, TEMPERATURE,
    tokenizer, torch and tqdm.

    Args:
        torch_model: Causal LM called with input_ids / past_key_values / use_cache.
        prompt_tokens: Prompt as a list of token ids (wrapped into a one-row
            tensor) or an already-built tensor.

    Returns:
        None. Output is printed.
    """
    import time
    import torch.nn.functional as F
    from transformers import DynamicCache

    torch_model.eval()

    print("************************************")
    # A plain list becomes a batch of one sequence.
    if isinstance(prompt_tokens, list):
        prompt_tokens = torch.tensor([prompt_tokens])

    started_at = time.time()

    kv_state = DynamicCache()
    step_input = prompt_tokens
    sample_stochastically = WITH_SAMPLING and TEMPERATURE > 0

    with torch.no_grad():
        for _ in tqdm(range(OUTPUT_TOKENS)):
            model_out = torch_model(
                input_ids=step_input,
                past_key_values=kv_state,
                use_cache=True
            )
            kv_state = model_out.past_key_values

            # Only the final position predicts the next token.
            last_logits = model_out.logits[:, -1, :]

            if sample_stochastically:
                probs = F.softmax(last_logits / TEMPERATURE, dim=-1)
                next_token = torch.multinomial(probs, num_samples=1)
            else:
                # Greedy choice
                next_token = torch.argmax(last_logits, dim=-1, keepdim=True)

            print(tokenizer.decode(next_token[0]), end='', flush=True)

            # The cache already holds everything earlier; feed just the new token next.
            step_input = next_token

    elapsed_time = time.time() - started_at
    tokens_per_second = OUTPUT_TOKENS / elapsed_time

    print(f"\n************************************")
    print(f"Generated {OUTPUT_TOKENS} tokens in {elapsed_time:.2f} seconds")
    print(f"Performance: {tokens_per_second:.2f} tokens/second")
    print("************************************\n\n")

In [27]:
# Reference run: encode the prompt and generate with the Hugging Face PyTorch model.
prompt_str = "Generating with pyTorch vs ttML:"

prompt_tokens = tokenizer.encode(prompt_str)
print("Generating with torch:")
generate_with_pytorch(torch_model, prompt_tokens)

Generating with torch:
************************************


  0%|          | 0/100 [00:00<?, ?it/s]

 Which is better?

I'm trying to generate a dataset for a machine learning model. I have a dataset that I can generate with PyTorch, but I'm not sure if I should use PyTorch or ttML. I'm not sure if there are any advantages or disadvantages of each. I'm not sure if I should use PyTorch or ttML for generating the data. I'm not sure if there are any advantages or disadvantages of each. I'm not sure if there are
************************************
Generated 100 tokens in 8.76 seconds
Performance: 11.42 tokens/second
************************************




In [30]:
# TT run with KV cache: re-encode the same prompt so this cell starts from a fresh token list.
prompt_str = "Generating with pyTorch vs ttML:"
prompt_tokens = tokenizer.encode(prompt_str)
print("Generating with TTML (KV Cache):")
generate_with_tt_kv_cache(tt_model, prompt_tokens, tt_model_factory.transformer_config)


Generating with TTML (KV Cache):
************************************


  0%|          | 0/100 [00:00<?, ?it/s]

 Which is better?

I'm trying to generate a dataset for a machine learning model. I have a dataset that is already generated, but I need to generate more data for training. I'm using PyTorch and I want to generate more data. I'm not sure which approach is better between using PyTorch and ttML. I'm not sure if there are any differences between the two. I'm not sure if there are any differences in the performance of the two approaches. I'm not
************************************
Generated 100 tokens in 7.89 seconds
Performance: 12.68 tokens/second
************************************



In [31]:
# TT run without KV cache: re-encode the same prompt so this cell starts from a fresh token list.
prompt_str = "Generating with pyTorch vs ttML:"
prompt_tokens = tokenizer.encode(prompt_str)
print("Generating with TT (No KV Cache, just max_tokens every time):")
generate_with_tt(tt_model, prompt_tokens)

Generating with TT (No KV Cache, just max_tokens every time):
************************************


  0%|          | 0/100 [00:00<?, ?it/s]

 Which is better?

I'm trying to generate a dataset for a machine learning model. I have a dataset that I can generate with PyTorch, but I'm not sure if I should use PyTorch or ttML. I'm not sure if there are any advantages or disadvantages of each. I'm not sure if I should use PyTorch or ttML for the same purpose. I'm not sure if there are any advantages or disadvantages of each. I'm not sure if there are
************************************
Generated 100 tokens in 79.20 seconds
Performance: 1.26 tokens/second
************************************


