In [1]:
# pip install tokenizers safetensors

import os, sys, math, random, textwrap
import numpy as np
from dataclasses import dataclass
from huggingface_hub import hf_hub_download
from transformers import GPT2Tokenizer
from yaml import safe_load, Loader
from pathlib import Path

sys.path.append(f"{os.environ['TT_METAL_HOME']}/tt-train/sources/ttml")
import ttml

def set_seed(seed: int = 42):
    """Seed Python's ``random`` module and NumPy's global RNG.

    Args:
        seed: Value passed to both generators (defaults to 42).
    """
    # Same order as before: the stdlib generator first, then NumPy.
    for seed_fn in (random.seed, np.random.seed):
        seed_fn(seed)

# Seed the RNGs with the default seed (42) before any sampling happens.
set_seed()
# Change working directory to TT_METAL_HOME
# (the YAML config opened later in this notebook uses a path relative to it).
os.chdir(os.environ['TT_METAL_HOME'])

# Number of tokens the generation loop samples after the prompt.
OUTPUT_TOKENS = 256

@dataclass
class TransformerConfig:
    """Container of transformer hyperparameters with default values.

    NOTE(review): the cells visible in this notebook take the model settings
    from the YAML training config rather than from this class; confirm
    whether it is still needed.
    """
    # Field meanings below are read off the field names; verify against the
    # code that consumes this config.
    n_head: int = 12          # attention head count
    embed_dim: int = 768      # embedding width
    dropout: float = 0.2      # dropout probability
    n_blocks : int = 12       # number of transformer blocks
    vocab_size: int = 96      # vocabulary size
    max_seq_len: int = 1024   # maximum sequence length
    # String-valued options; the set of accepted values is not shown here.
    runner_type: str = "memory_efficient"
    weight_tying: str = "enabled"

In [2]:
# Load the pretrained "gpt2" tokenizer.
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
print(os.getcwd())
# Keep only the transformer section of the training config. The path is
# relative to the working directory printed above. The `with` block closes
# the file handle as soon as the YAML has been parsed.
with open("tt-train/configs/training_shakespeare_gpt2s.yaml", "r") as config_file:
    transformer_cfg = safe_load(config_file)["training_config"]["transformer_config"]

/home/ubuntu/tt-metal


In [11]:
# Get safetensors: resolve the local path of the GPT-2 weights file.
local_path = hf_hub_download(repo_id="gpt2", filename="model.safetensors")
# Keep only the containing directory, with a trailing separator as before.
# os.path.dirname removes just the final path component, whereas str.replace
# would also rewrite any earlier "model.safetensors" substring in the path.
local_path = os.path.join(os.path.dirname(local_path), "")

In [26]:
def build_causal_mask(T: int) -> ttml.autograd.Tensor:
    """Build the lower-triangular attention mask for sequence length ``T``.

    The NumPy array is float32 with shape [1, 1, T, T]: entry (i, j) is 1
    when i >= j (position i may attend to position j) and 0 otherwise. It is
    passed to ``Tensor.from_numpy`` with TILE layout and BFLOAT16 as the
    requested data type.

    Args:
        T: Sequence length (side of the square mask).

    Returns:
        The ttml tensor created from the mask array.
    """
    lower_triangle = np.tri(T, dtype=np.float32)
    mask_4d = lower_triangle.reshape(1, 1, T, T)
    return ttml.autograd.Tensor.from_numpy(
        mask_4d,
        ttml.Layout.TILE,
        ttml.autograd.DataType.BFLOAT16,
    )

def build_logits_mask(vocab_size: int, padded_vocab_size: int) -> ttml.autograd.Tensor:
    """Build a mask marking the vocabulary slots at index ``vocab_size`` and above.

    The NumPy array is float32 with shape [1, 1, 1, padded_vocab_size]. It
    holds 0 for indices below ``vocab_size`` and 1e4 from ``vocab_size``
    onwards. It is passed to ``Tensor.from_numpy`` with TILE layout and
    BFLOAT16 as the requested data type.

    Args:
        vocab_size: Number of real vocabulary entries.
        padded_vocab_size: Vocabulary size after padding.

    Returns:
        The ttml tensor created from the mask array.
    """
    mask_row = np.zeros(padded_vocab_size, dtype=np.float32)
    mask_row[vocab_size:] = 1e4
    mask_4d = mask_row.reshape(1, 1, 1, padded_vocab_size)
    return ttml.autograd.Tensor.from_numpy(
        mask_4d,
        ttml.Layout.TILE,
        ttml.autograd.DataType.BFLOAT16,
    )

In [12]:
def create_model(cfg, vocab_size: int, seq_len: int):
    """Create a ttml GPT-2 model and load weights from safetensors.

    Reads the module-level ``local_path`` (set by the download cell) as the
    location handed to ``load_from_safetensors``.

    Args:
        cfg: Mapping providing "num_heads", "embedding_dim", "num_blocks" and
            "dropout_prob".
        vocab_size: Vocabulary size for the model; converted with ``int``.
        seq_len: Value assigned to ``max_sequence_length``.

    Returns:
        The model returned by ``create_gpt2_model`` after loading weights.

    NOTE(review): only the fields below are set; other options
    (runner_type, weight_tying, positional_embedding_type, ...) keep their
    defaults. The configuration printed when this notebook ran reports
    "Weight tying: Disabled" -- confirm that this is compatible with the
    loaded checkpoint, since the sampled text further down looks wrong.
    """
    gpt2_config = ttml.models.gpt2.GPT2TransformerConfig()

    # Attribute name -> value, applied in the same order as before.
    settings = {
        "num_heads": cfg["num_heads"],
        "embedding_dim": cfg["embedding_dim"],
        "num_blocks": cfg["num_blocks"],
        "vocab_size": int(vocab_size),
        "max_sequence_length": seq_len,
        "dropout_prob": cfg["dropout_prob"],
    }
    for attr_name, attr_value in settings.items():
        setattr(gpt2_config, attr_name, attr_value)

    gpt2_model = ttml.models.gpt2.create_gpt2_model(gpt2_config)
    gpt2_model.load_from_safetensors(Path(local_path))
    return gpt2_model

# Round the tokenizer vocabulary up to the next multiple of 32 ("padding for
# tilizing"); a size that is already a multiple of 32 is left unchanged.
vocab_size = tokenizer.vocab_size
padded_vocab_size = ((vocab_size + 31) // 32) * 32

if padded_vocab_size != vocab_size:
    print(f"Warning: vocab size {vocab_size} is not multiple of 32, padding for tilizing.")

# Build the model with the padded vocabulary and the configured context length.
model = create_model(transformer_cfg, padded_vocab_size, transformer_cfg["max_sequence_length"])
model


Transformer configuration:
    Vocab size: 50272
    Max sequence length: 1024
    Embedding dim: 768
    Num heads: 12
    Dropout probability: 0.2
    Num blocks: 12
    Positional embedding type: Trainable
    Runner type: Default
    Composite layernorm: false
    Weight tying: Disabled
Loading model from: /home/ubuntu/.cache/huggingface/hub/models--gpt2/snapshots/607a30d783dfa663caf39e06633721c8d4cfcd7e/model.safetensors
parameter name: transformer/gpt_block_9/mlp/fc2/bias
parameter name: transformer/gpt_block_9/mlp/fc1/weight
parameter name: transformer/gpt_block_9/attention/qkv_linear/bias
parameter name: transformer/gpt_block_9/attention/out_linear/bias
parameter name: transformer/gpt_block_8/mlp/fc2/weight
parameter name: transformer/gpt_block_8/mlp/fc1/weight
parameter name: transformer/gpt_block_8/attention/qkv_linear/bias
parameter name: transformer/gpt_block_7/mlp/fc2/weight
parameter name: transformer/gpt_block_9/attention/out_linear/weight
parameter name: transformer/gpt

<_ttml.models.gpt2.GPT2Transformer at 0x7f8edc3a0030>

/attention/out_linear/bias
parameter name: transformer/gpt_block_11/mlp/fc1/weight
parameter name: transformer/gpt_block_1/mlp/fc2/weight
parameter name: transformer/gpt_block_2/mlp/fc1/weight
parameter name: transformer/gpt_block_1/attention/qkv_linear/bias
parameter name: transformer/gpt_block_10/mlp/fc1/weight
parameter name: transformer/gpt_block_10/attention/out_linear/bias
parameter name: transformer/gpt_block_11/attention/out_linear/bias
parameter name: transformer/gpt_block_8/attention/qkv_linear/weight
parameter name: transformer/gpt_block_2/ln1/beta
parameter name: transformer/gpt_block_10/attention/out_linear/weight
parameter name: transformer/gpt_block_5/ln2/beta
parameter name: transformer/gpt_block_11/mlp/fc2/weight
parameter name: transformer/gpt_block_10/attention/qkv_linear/bias
parameter name: transformer/gpt_block_10/mlp/fc2/bias
parameter name: transformer/gpt_block_4/mlp/fc1/bias
parameter name: transformer/gpt_block_11/attention/out_linear/weight
parameter name: t

In [28]:
# Inference only: switch the model to eval mode and disable gradient tracking.
model.eval()
ttml.autograd.AutoContext.get_instance().set_gradient_mode(ttml.autograd.GradMode.DISABLED)

# A mask over the padded vocabulary slots is only needed when padding was added.
if padded_vocab_size != vocab_size:
    logits_mask_tensor = build_logits_mask(vocab_size, padded_vocab_size)
else:
    logits_mask_tensor = None

prompt_str = "The difference between cats and dogs is:"
prompt_tokens = tokenizer.encode(prompt_str)

max_seq_len = transformer_cfg["max_sequence_length"]

# Built from a [1,1,seq_len,seq_len] float32 array, requested as BFLOAT16.
causal_mask = build_causal_mask(max_seq_len)
# Fixed-size input buffer, reused on every step and pre-filled with EOS.
padded_prompt_tokens = np.full((1, 1, 1, max_seq_len),
                               tokenizer.eos_token_id,
                               dtype=np.uint32)


start_idx = 0
for token_idx in range(OUTPUT_TOKENS):

    # Sliding window: once the running sequence is longer than the context,
    # drop the oldest tokens so that only the last max_seq_len are fed in.
    start_idx = max(0, len(prompt_tokens) - max_seq_len)
    context_tokens = prompt_tokens[start_idx:]

    # Reset to EOS padding, then copy the window to the FRONT of the buffer.
    # (Writing at offset start_idx would be clipped by the buffer end and
    # fail to broadcast once the sequence exceeds max_seq_len.)
    padded_prompt_tokens.fill(tokenizer.eos_token_id)
    padded_prompt_tokens[0, 0, 0, :len(context_tokens)] = context_tokens
    padded_prompt_tensor = ttml.autograd.Tensor.from_numpy(
        padded_prompt_tokens,
        ttml.Layout.ROW_MAJOR,
        ttml.autograd.DataType.UINT32
    )  # [1,1,1, max_seq_len], uint32

    # presumably [1,1,seq_len,padded_vocab_size] -- TODO confirm
    logits = model(padded_prompt_tensor, causal_mask)
    # Arguments: logits, 1.0 (presumably the temperature -- TODO confirm),
    # a fresh random seed drawn from NumPy's seeded global RNG, and the
    # optional mask for padded vocabulary slots.
    # The result is indexed by sequence position below, so it presumably
    # holds one sampled token id per position -- TODO confirm shape/dtype.
    next_token_tensor = ttml.ops.sample.sample_op(logits, 1.0, np.random.randint(low=1e6), logits_mask_tensor)

    # The prediction for the next token is read at the last real context position.
    next_token_idx = len(context_tokens) - 1
    # Convert to a plain int so prompt_tokens stays a homogeneous list of ints.
    next_token = int(next_token_tensor.to_numpy().flatten()[next_token_idx])

    output = tokenizer.decode(next_token)

    prompt_tokens.append(next_token)

    print(output, end='', flush=True)

 ( ( ( ( ( ( ( ( ( ( ( ( ( the ( ( " ( ( ( ( ( ( ( ( ( ( ( ( ( ( ( ( ( ( ( (
 ( ( ( ( ( and ( ( and (
 35 and
 F the and and ( 35 ( 34 34 35 F 35
 35
 ( 35<|endoftext|> 35 ( 35 34 35 36 39 54<|endoftext|> 35 45 35 35 35<|endoftext|> 35
 35 35 I 35 ( 35
 35<|endoftext|> 35 I 35 35 34 35 I 35
 35 35 35 35 I<|endoftext|> 35<|endoftext|><|endoftext|> I 35 35 35
 35<|endoftext|> 31 35 I 35 35 35 45 I 35 35 36 35
 35<|endoftext|> 35<|endoftext|> 36 A 31 I<|endoftext|> 31 35 F 35 35 35 ( 35
 I A 35 I<|endoftext|>
<|endoftext|><|endoftext|>

 31 35 I 35 The<|endoftext|> (<|endoftext|> 45 36 35<|endoftext|> 35
 36 18 36 A The<|endoftext|> 34
 35 I
 35 35 35<|endoftext|> 34
 35<|endoftext|> I 45<|endoftext|> 36 35 45 The 35 55
 35<|endoftext|> 36<|endoftext|> 35 35 I<|endoftext|> 36
<|endoftext|><|endoftext|> ( 35 I H<|endoftext|>
 34 I<|endoftext|> A I 35<|endoftext|><|endoftext|> 35<|endoftext|> F<|endoftext|>

<|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|> I 35