### LLM Inference Example

This notebook contains a basic inference example for using our `ttml` Python API to build, load, and run a large language model from Hugging Face on our TT hardware. By default, it is set to create and load a GPT2 model, but this notebook can quickly and easily be edited to use any of the LLMs that the tt-train project currently supports. 

Below, in the first cell, we have our imports and basic directory housekeeping.

In [None]:
import os, sys, random
import numpy as np  # For numpy arrays
from dataclasses import dataclass # For configuration classes
from huggingface_hub import hf_hub_download # To download safetensors from Hugging Face
from transformers import AutoTokenizer
from yaml import safe_load # To read YAML configs
from pathlib import Path

sys.path.append(f"{os.environ['TT_METAL_HOME']}/tt-train/sources/ttml")
import ttml
from ttml.common.config import get_config, TransformerConfig

def set_seed(seed: int = 42):
    """Seed the random number generators used in this notebook.

    The same ``seed`` is applied to Python's ``random`` module, to NumPy's
    global generator and to the ttml autograd context, so repeated runs can
    be made reproducible.
    """
    # Host-side generators first, then the ttml context.
    for seed_host_rng in (random.seed, np.random.seed):
        seed_host_rng(seed)
    autograd_context = ttml.autograd.AutoContext.get_instance()
    autograd_context.set_seed(seed)

# Change working directory to tt-train
os.chdir(f"{os.environ['TT_METAL_HOME']}/tt-train")


Use the cell below to change global parameters in this notebook. 

`OUTPUT_TOKENS` : the length of the generated text in tokens (not characters!)

`WITH_SAMPLING` : enable or disable output token sampling (only used by `generate_with_pytorch()`)

`TEMPERATURE`   : sampling temperature; set to 0 to disable sampling in `generate_with_tt()`

`SEED`          : randomization seed (for reproducibility)

In [None]:
# Number of new tokens generated per prompt (tokens, not characters).
OUTPUT_TOKENS = 256
# Passed as `do_sample` to `generate()` in generate_with_pytorch().
WITH_SAMPLING = True
# Sampling temperature forwarded to both generation functions.
TEMPERATURE = 0.8
# Seed applied by set_seed() below and by torch.manual_seed() in generate_with_pytorch().
SEED = 42
# YAML file handed to get_config(); the transformer settings are read from it.
CONFIG = "training_shakespeare_gpt2s.yaml"

# Seed the random number generators before any model is built or any token is sampled.
set_seed(SEED)

While the notebook is currently configured for GPT2, you can quickly change the tokenizer you want to use by changing the input to `from_pretrained()` below.

In [None]:
# Load the tokenizer from Hugging Face and the transformer config from YAML.
# "gpt2" is the Hugging Face model id; change it to use a different tokenizer.
tokenizer = AutoTokenizer.from_pretrained("gpt2")
# A missing "training_config" or "transformer_config" section falls back to an empty dict.
transformer_config = TransformerConfig(get_config(CONFIG).get("training_config", {}).get("transformer_config",{}))

As with the tokenizer above, the call to `hf_hub_download()` below downloads (or otherwise finds on your local system) the SafeTensors model weight file for GPT2; change its arguments to download the SafeTensors file of a different model.

In [None]:
# Fetch the GPT2 SafeTensors weight file from Hugging Face.
local_path = hf_hub_download(repo_id="gpt2", filename="model.safetensors")
# create_model() loads from the containing directory, so drop the file name.
# Joining with "" keeps the trailing path separator the notebook has always printed.
local_path = os.path.join(os.path.dirname(local_path), "")

print(f"Safetensors path: {local_path}")


In [None]:
def build_causal_mask(T: int) -> ttml.autograd.Tensor:
    """Build a lower-triangular attention mask of shape [1, 1, T, T].

    Entry (i, j) is 1 when i >= j (position j is visible from position i)
    and 0 otherwise. The mask is assembled as a float32 NumPy array and then
    handed to ttml as a TILE-layout BFLOAT16 tensor.
    """
    row_idx = np.arange(T).reshape(T, 1)
    col_idx = np.arange(T).reshape(1, T)
    allowed = (row_idx >= col_idx).astype(np.float32)
    return ttml.autograd.Tensor.from_numpy(
        allowed.reshape(1, 1, T, T),
        ttml.Layout.TILE,
        ttml.autograd.DataType.BFLOAT16,
    )

def build_logits_mask(vocab_size: int, padded_vocab_size: int) -> ttml.autograd.Tensor:
    """Build a [1, 1, 1, padded_vocab_size] mask flagging the padding vocabulary slots.

    Indices below ``vocab_size`` hold 0 and indices from ``vocab_size`` up to
    ``padded_vocab_size`` hold 1e4. The array is built as float32 and handed
    to ttml as a TILE-layout BFLOAT16 tensor.
    """
    # NOTE(review): presumably sample_op uses the 1e4 entries to exclude the
    # padding slots from sampling - confirm against the op's implementation.
    penalties = np.zeros(padded_vocab_size, dtype=np.float32)
    penalties[vocab_size:] = 1e4
    return ttml.autograd.Tensor.from_numpy(
        penalties.reshape(1, 1, 1, padded_vocab_size),
        ttml.Layout.TILE,
        ttml.autograd.DataType.BFLOAT16,
    )

In [None]:
def create_model(cfg : TransformerConfig):
    """Create a ttml GPT2 model from ``cfg`` and load its pretrained weights.

    Args:
        cfg: transformer settings read from the YAML config; its fields are
            copied one-to-one onto a ``GPT2TransformerConfig``.

    Returns:
        The model produced by ``create_gpt2_model()`` after
        ``load_from_safetensors()`` has been called on it.

    Note:
        The weights are loaded from the module-level ``local_path`` directory
        set by the download cell, so that cell must run first.
    """
    # GPT2 config via the ttml bindings
    gcfg = ttml.models.gpt2.GPT2TransformerConfig()
    gcfg.num_heads = cfg.num_heads
    gcfg.embedding_dim = cfg.embedding_dim
    gcfg.num_blocks = cfg.num_blocks
    gcfg.vocab_size = cfg.vocab_size
    gcfg.max_sequence_length = cfg.max_sequence_length
    gcfg.dropout_prob = cfg.dropout_prob
    # Both enum members come from the same `ttml.models.WeightTyingType` enum,
    # matching how `RunnerType` is referenced below. The previous fallback used
    # a different namespace and casing (`ttml.models.gpt2...DISABLED`).
    gcfg.weight_tying = (
        ttml.models.WeightTyingType.Enabled
        if cfg.weight_tying == "enabled"
        else ttml.models.WeightTyingType.Disabled
    )
    gcfg.runner_type = ttml.models.RunnerType.Default

    model = ttml.models.gpt2.create_gpt2_model(gcfg)
    model.load_from_safetensors(Path(local_path))
    return model

vocab_size = tokenizer.vocab_size

# Round the vocabulary size up to the next multiple of 32 so it can be tilized;
# the rounding is a no-op when the size is already aligned.
padded_vocab_size = -(-vocab_size // 32) * 32
if padded_vocab_size != vocab_size:
    print(f"Warning: vocab size {vocab_size} is not multiple of 32, padding for tilizing.")

# The model is built with the padded size.
transformer_config.vocab_size = padded_vocab_size
tt_model = create_model(transformer_config)
tt_model


`generate_with_tt()` uses TT hardware acceleration to generate output from the chosen LLM.

In [None]:
def generate_with_tt(model, prompt_tokens, max_new_tokens=None, temperature=None):
    """Generate text with the ttml model, printing each token as it is sampled.

    Args:
        model: ttml model, e.g. the one returned by ``create_model()``.
        prompt_tokens: non-empty list of prompt token ids. Every generated
            token id is appended to this list in place, so pass a copy if the
            original prompt must stay untouched.
        max_new_tokens: number of tokens to generate; defaults to the
            notebook-level ``OUTPUT_TOKENS``.
        temperature: sampling temperature handed to ``sample_op``; defaults to
            the notebook-level ``TEMPERATURE``.

    Raises:
        ValueError: if ``prompt_tokens`` is empty. Without a prompt there is
            no position to read the next token from.
    """
    if len(prompt_tokens) == 0:
        raise ValueError("prompt_tokens must contain at least one token")
    if max_new_tokens is None:
        max_new_tokens = OUTPUT_TOKENS
    if temperature is None:
        temperature = TEMPERATURE

    ttml.autograd.AutoContext.get_instance().set_gradient_mode(ttml.autograd.GradMode.DISABLED)
    model.eval()

    max_seq_len = transformer_config.max_sequence_length

    # The logits mask is only needed when the vocabulary was padded.
    if padded_vocab_size != vocab_size:
        logits_mask_tensor = build_logits_mask(vocab_size, padded_vocab_size)
    else:
        logits_mask_tensor = None

    causal_mask = build_causal_mask(max_seq_len)  # [1,1,max_seq_len,max_seq_len], bfloat16
    # Reused input buffer; positions past the current prompt stay zero.
    padded_prompt_tokens = np.zeros((1, 1, 1, max_seq_len), dtype=np.uint32)

    print("************************************")
    for _ in range(max_new_tokens):
        # Only the most recent max_seq_len tokens fit in the input buffer.
        window = prompt_tokens[-max_seq_len:]
        padded_prompt_tokens[0, 0, 0, :len(window)] = window
        padded_prompt_tensor = ttml.autograd.Tensor.from_numpy(
            padded_prompt_tokens,
            ttml.Layout.ROW_MAJOR,
            ttml.autograd.DataType.UINT32)  # [1,1,1,max_seq_len], uint32

        logits = model(padded_prompt_tensor, causal_mask)  # out=[1,1,seq_len,vocab_size], bf16

        # A fresh integer seed per step, drawn from NumPy's (seeded) global generator.
        next_token_tensor = ttml.ops.sample.sample_op(
            logits,
            temperature,
            np.random.randint(low=10_000_000),
            logits_mask_tensor)  # out=[1,1,seq_len,1], uint32

        # The prediction for the next token sits at the last occupied position.
        # Convert to a plain int so the list keeps a uniform element type.
        next_token = int(next_token_tensor.to_numpy().flatten()[len(window) - 1])

        prompt_tokens.append(next_token)
        print(tokenizer.decode(next_token), end='', flush=True)

    print("\n************************************\n\n")

In [None]:
def generate_with_pytorch(prompt_tokens):
    """Generate text with the Hugging Face PyTorch GPT2 model and print it.

    Args:
        prompt_tokens: prompt token ids, either a flat list as returned by
            ``tokenizer.encode()`` or an already batched ``[batch, seq_len]``
            tensor.

    Generation length, sampling and temperature come from the notebook-level
    ``OUTPUT_TOKENS``, ``WITH_SAMPLING`` and ``TEMPERATURE``; the torch RNG is
    seeded with ``SEED`` on every call.
    """
    import torch
    from transformers import AutoModelForCausalLM

    torch.manual_seed(SEED)

    # generate() expects a [batch, seq_len] tensor of token ids, so promote a
    # flat list of ids to a batch of one.
    input_ids = torch.as_tensor(prompt_tokens, dtype=torch.long)
    if input_ids.dim() == 1:
        input_ids = input_ids.unsqueeze(0)

    generation_kwargs = {
        "max_new_tokens": OUTPUT_TOKENS,
        "do_sample": WITH_SAMPLING,  # Enable sampling
        "num_beams": 1,  # Use multinomial sampling (standard sampling)
    }
    if WITH_SAMPLING:
        # Temperature only applies when sampling; omit it for greedy decoding.
        generation_kwargs["temperature"] = TEMPERATURE

    torch_model = AutoModelForCausalLM.from_pretrained("gpt2", dtype=torch.bfloat16)
    torch_model.eval()
    print("************************************")
    with torch.no_grad():
        outputs = torch_model.generate(input_ids, **generation_kwargs)

    generated_text = tokenizer.batch_decode(outputs, skip_special_tokens=True)

    for t in generated_text:
        print(t)

    print("\n************************************\n\n")

In [None]:
# Tokenize a fixed example prompt and stream the TT model's continuation.
prompt_str = "The difference between cats and dogs is:"
prompt_tokens = tokenizer.encode(prompt_str)
print("Generating with TT:")
# Pass a copy: generate_with_tt() appends the generated tokens to the list it receives.
generate_with_tt(tt_model, prompt_tokens.copy())

In [None]:
# Tokenize a second example prompt and stream the TT model's continuation.
prompt_str = "Compared to spoons, forks are meant to:"
prompt_tokens = tokenizer.encode(prompt_str)
print("Generating with TT:")
# Pass a copy: generate_with_tt() appends the generated tokens to the list it receives.
generate_with_tt(tt_model, prompt_tokens.copy())

In [None]:
# Tokenize a third example prompt and stream the TT model's continuation.
prompt_str = "Bees are similar to:"
prompt_tokens = tokenizer.encode(prompt_str)
print("Generating with TT:")
# Pass a copy: generate_with_tt() appends the generated tokens to the list it receives.
generate_with_tt(tt_model, prompt_tokens.copy())

Now try your own prompt!

In [None]:
# Read a prompt interactively and stream the TT model's continuation.
prompt_str = input("Enter your prompt: ")

prompt_tokens = tokenizer.encode(prompt_str)
if not prompt_tokens:
    # Nothing was tokenized (e.g. an empty input), so there is nothing to continue from.
    print("Empty prompt, nothing to generate.")
else:
    print("Generating with TT model:")
    # Pass a copy, as the other cells do: generate_with_tt() appends to the list it receives.
    generate_with_tt(tt_model, prompt_tokens.copy())