In [None]:
# Show which Python interpreter is backing this kernel
import sys
print(sys.executable)

In [None]:
# Confirm the kernel runs in the expected environment (this notebook expects pure arm64)
import platform

arch = platform.machine()
# Deliberately not named `sys`: that would rebind the `sys` module imported earlier to a string
system_name = platform.system()
processor = platform.processor()
print(f"{arch}\n{system_name}\n{processor}")

In [None]:
# Necessary tools that we need
import onnxruntime as ort
import os
import numpy as np
import time

from pathlib import Path
from tokenizers import Tokenizer

In [None]:
# Use the directory two levels above the current working directory as the project root;
# the model, tokenizer and config paths are built relative to it
root_dir = Path.cwd().parent.parent
root_dir

In [None]:
# Directory of the installed onnxruntime package (presumably where the Hexagon driver
# files are expected to live — TODO confirm; nothing else in view reads this path)
onnx_root = Path(ort.__file__).parent
onnx_root

In [None]:
# Sub-directory (under the models folder) where the .onnx file and its companion files are located
model_subdirectory = "cpu-deepseek-r1-distill-qwen-7b"

# File name of the ONNX graph loaded into the inference session (use netron to visualize it)
model_name = "deepseek-r1-distill-qwen-7b-cpu-int4-rtn-block-32-acc-level-4.onnx"

# File name of the GenAI configuration
configuration_json = "genai_config.json"

# File name of the tokenizer definition
tokenizer_json = "tokenizer.json"

In [None]:
# Resolve the location of each artifact under <root_dir>/models/<model_subdirectory>
model_path = root_dir.joinpath("models", model_subdirectory, model_name)
tokenizer_path = root_dir.joinpath("models", model_subdirectory, tokenizer_json)
config_path = root_dir.joinpath("models", model_subdirectory, configuration_json)

In [None]:
# Display the resolved model path as a sanity check
model_path

In [None]:
# Session-level options. They are left at their defaults, but they are passed to the
# session explicitly so that any tuning added here actually takes effect.
session_options = ort.SessionOptions()

# Create the inference session for the model graph. The path is converted to `str`
# because not every onnxruntime release accepts `pathlib.Path` objects.
session = ort.InferenceSession(str(model_path), sess_options=session_options)

# Execution providers the session ended up using
session.get_providers()

In [None]:
# Collect the graph's input/output descriptors and keep the first of each for inspection
inputs, outputs = session.get_inputs(), session.get_outputs()
input_0, output_0 = inputs[0], outputs[0]

In [None]:
# Summarize the first graph input (shape, element type, name)
print(
    f"Expected Input Shape: {input_0.shape}",
    f"Expected Input Type: {input_0.type}",
    f"Expected Input Name: {input_0.name}",
    sep="\n",
)

In [None]:
# Summarize the first graph output (shape, element type, name)
print(
    f"Expected Output Shape: {output_0.shape}",
    f"Expected Output Type: {output_0.type}",
    f"Expected Output Name: {output_0.name}",
    sep="\n",
)

In [None]:
# Describe every graph input, each followed by a ruler line
for layer in inputs:
    print(
        f"Name: {layer.name}\n\tExpected Input Shape: {layer.shape}\n\tExpected Input Type: {layer.type}",
        "*"*100,
        sep="\n",
    )

In [None]:
# Describe every graph output, each followed by a ruler line.
# The labels say "Output" (not "Input") because these are the graph's outputs.
for layer in outputs:
    print(f"Name: {layer.name}\n\tExpected Output Shape: {layer.shape}\n\tExpected Output Type: {layer.type}")
    print("*"*100)

In [None]:
# Build the tokenizer from the tokenizer file resolved earlier (the loader is given a plain string path)
tokenizer = Tokenizer.from_file(os.fspath(tokenizer_path))

In [None]:
# Ready-made prompts keyed by topic. Each one is wrapped in the model's chat tags.
# Long prompts are split with implicit string concatenation rather than a backslash
# continuation inside the literal, which would embed the next line's indentation
# (a long run of spaces) in the prompt text.
query_bank = {
    "security": "<｜User｜>\nImagine you are a cyber security professional. Provide step by step reasons why AI models should be ran locally. Please consider all aspects of data privacy and cyber security\n<｜Assistant｜><think>\n",
    "cooking": "<｜User｜>\nGive me a step-by-step baked chicken recipe, including ingredients, cook time, and sides.\n<｜Assistant｜>\n",
    "therapist": (
        "<｜User｜>\nImagine you are a therapist with a background in cyber security. "
        "I'm am currently very anxious about my data being stolen "
        "can you provide me remedies to help with my depression and anxiety\n<｜Assistant｜><think>\n"
    ),
}

In [None]:
# Initial prompt: the user turn, followed by the assistant tag and an opening <think> tag
init_query = "<｜User｜>\nYou are an expert computer scientist. Why does running AI models on NPU perform better than on CPU?\n<｜Assistant｜><think>\n"
# Tokenize the prompt; the resulting encoding exposes `.ids` and `.tokens`
encoding = tokenizer.encode(init_query)

In [None]:
# Show the prompt both as integer token ids and as the token strings they stand for
print("Token IDs:", encoding.ids)
print("Tokens:", encoding.tokens)

In [None]:
# Keep the prompt's token ids; they are converted to a model input array further down
input_ids = encoding.ids
input_ids

In [None]:
# Shape parameters used to build the inputs for the first (prompt) forward pass

# How many sequences are processed at once
batch_size = 1

# Token count of the initial prompt
seq_len = len(input_ids)

# Attention heads per transformer layer
num_heads = 28

# Width of a single attention head (hidden_size // num_heads)
attn_head_size = 128

# Count of transformer layers
num_layers = 28

# Not the model's global context window (131072): this is the number of tokens
# handed to the first forward pass, i.e. the prompt length
max_seq_len = seq_len

# Temperature used to scale logits before the softmax when sampling
temp = 0.9

# Key/value heads (each one is shared amongst several attention heads)
num_key_value_heads = 4

In [None]:
# Number of tokens in the prompt
len(input_ids)

In [None]:
# Convert the token ids into the (batch, sequence) int64 array that is fed to the graph.
# `reshape` is used instead of wrapping the ids in another list so the cell is idempotent:
# re-running it on the already-converted array keeps the shape at (batch_size, seq_len)
# rather than adding one more leading axis on every run.
input_ids = np.asarray(input_ids, dtype=np.int64).reshape(batch_size, -1)
input_ids.shape

In [None]:
# Every prompt position is a real token, so the mask is filled with ones
attention_mask = np.full((batch_size, max_seq_len), 1, dtype=np.int64)
attention_mask

In [None]:
# Sanity check: expected to be (batch_size, max_seq_len)
attention_mask.shape

In [None]:
# Initialize the KV cache that is fed to the first forward pass, for every transformer layer.
# Nothing has been processed yet, so the cached-sequence axis has length 0. Allocating
# max_seq_len zero-filled positions instead would present the graph with a "past" that
# the attention mask (which only covers the prompt tokens) does not account for.
past_shape = (batch_size, num_key_value_heads, 0, attn_head_size)

empty_kv = {}
for i in range(num_layers):
    # Past keys for layer i (cached so attention does not recompute earlier positions)
    empty_kv[f"past_key_values.{i}.key"] = np.zeros(past_shape, dtype=np.float32)

    # Past values for layer i
    empty_kv[f"past_key_values.{i}.value"] = np.zeros(past_shape, dtype=np.float32)

# Two entries (key and value) per layer
len(empty_kv)

In [None]:
# Names of the cache inputs: one key entry and one value entry per layer
empty_kv.keys()

In [None]:
# Assemble the feed for the prompt pass: token ids and mask first, then every cache tensor
init_prompt_inputs = {"input_ids": input_ids, "attention_mask": attention_mask}
init_prompt_inputs.update(empty_kv)
init_prompt_inputs

In [None]:
# Shape of the first layer's cached-key input
init_prompt_inputs.get("past_key_values.0.key").shape

In [None]:
# Run the first forward pass over the whole prompt; passing None as the output list
# asks onnxruntime to return every graph output
session_output = session.run(None, init_prompt_inputs)
# Shape of the first output, which is used as the logits later on
session_output[0].shape

In [None]:
# Shape of the logits returned by the first forward pass, with the axis meaning printed above it
print("Logits:\n(batch, sequence length, vocab size)")
session_output[0].shape

In [None]:
# Shape of the second output, which is used as the first layer's key cache later on
print("KV Cache:\n(batch, num_kv_heads, sequence length, attn_head_size)")
session_output[1].shape

### To build a longer initial context, run the session over multiple prompts, feeding the updated key/value cache from each run into the next

In [None]:
# Re-key the cache tensors returned by the session as "past" inputs for the next pass.
# Output 0 is the logits; after it, every layer contributes a (key, value) pair.
present_kv = {}
for i in range(num_layers):
    present_kv[f"past_key_values.{i}.key"] = session_output[2 * i + 1]
for i in range(num_layers):
    present_kv[f"past_key_values.{i}.value"] = session_output[2 * i + 2]
present_kv

In [None]:
# Names under which the updated cache tensors will be fed back in
present_kv.keys()

In [None]:
# Dimension check: shape of the first layer's updated key cache
present_kv["past_key_values.0.key"].shape

In [None]:
# Dimension check: shape of the last layer's updated value cache (index 27 = num_layers - 1)
present_kv["past_key_values.27.value"].shape

In [None]:
# The first session output holds the logits for every prompt position
logits = session_output[0]
logits

In [None]:
# Shape of the logits at the last position of the first batch element
logits[0,-1].shape

In [None]:
def softmax_numpy(x: np.ndarray, temperature: float=1) -> np.ndarray:
    """Temperature-scaled softmax over the last axis of ``x``.

    Args:
        x: Logits. A 1-D vector or any batched array whose last axis holds the classes.
        temperature: Positive scaling factor; values below 1 sharpen the distribution,
            values above 1 flatten it.

    Returns:
        Array with the same shape as ``x`` whose entries along the last axis sum to 1.

    Raises:
        ValueError: If ``temperature`` is not strictly positive.
    """
    if temperature <= 0:
        raise ValueError("temperature must be positive")

    x = np.asarray(x)

    # Stabilize against large values by subtracting the maximum of each row
    # (keepdims so the result broadcasts against batched input)
    x = x - np.max(x, axis=-1, keepdims=True)

    # Apply temperature
    x = x / temperature

    # Apply softmax, computing the exponentials only once
    exps = np.exp(x)
    return exps / np.sum(exps, axis=-1, keepdims=True)

def top_k_probas(probas: np.ndarray, k: int=5) -> tuple:
    """Keep the ``k`` most likely entries of a 1-D distribution and renormalize them.

    Args:
        probas: 1-D array-like of non-negative weights (they need not sum to 1).
            The caller's array is never modified.
        k: Number of entries to keep; if it exceeds the length, all entries are kept.

    Returns:
        A ``(indices, probabilities)`` tuple: the indices of the kept entries in
        descending order of probability, and their probabilities rescaled to sum
        to 1 (as required by ``np.random.choice``).

    Raises:
        ValueError: If ``k`` is smaller than 1.
    """
    if k < 1:
        raise ValueError("k must be at least 1")

    # Copy so the in-place operations below don't touch the caller's array, and
    # promote non-float input so the in-place division is valid
    probas = np.array(probas, copy=True)
    if not np.issubdtype(probas.dtype, np.floating):
        probas = probas.astype(np.float64)

    # Normalize probabilities
    probas /= np.sum(probas)

    # Sort on -probas to get descending order; a stable sort makes ties resolve
    # deterministically in favor of the lower index
    top_indices_sorted = np.argsort(-probas, kind="stable")[:k]
    top_probas = probas[top_indices_sorted]

    # Renormalize the kept probabilities so they sum to 1
    top_probas /= np.sum(top_probas)

    return top_indices_sorted, top_probas

def apply_repetition_penalty(logits, generated_ids, penalty=1.1):
    """Make already-generated tokens less likely to be sampled again.

    Positive logits are divided by ``penalty`` and negative logits are multiplied
    by it, so with ``penalty > 1`` the score always moves down. Dividing a
    negative logit would move it towards zero and make the token more likely.

    Args:
        logits: 1-D mutable sequence of logits indexed by token id. It is modified in place.
        generated_ids: Token ids produced so far; duplicates are penalized only once.
        penalty: Penalty factor; 1.0 leaves the logits unchanged.

    Returns:
        The same ``logits`` object, after modification.
    """
    for token_id in set(generated_ids):
        if logits[token_id] < 0:
            logits[token_id] *= penalty
        else:
            logits[token_id] /= penalty
    return logits

In [None]:
# Softmax with temperature scaling
def softmax(x, temperature=1):
    """Temperature-scaled softmax over the last axis of ``x``.

    Args:
        x: Logits. A 1-D vector or a batched array whose last axis holds the classes.
        temperature: Scaling factor applied to the shifted logits before exponentiation.

    Returns:
        Array shaped like ``x`` whose entries along the last axis sum to 1.
    """
    x = np.asarray(x)
    # Subtracting the per-row maximum keeps np.exp from overflowing on large logits;
    # keepdims lets the same expression work for batched input
    exps = np.exp((x - np.max(x, axis=-1, keepdims=True)) / temperature)
    return exps / np.sum(exps, axis=-1, keepdims=True)

In [None]:
# Probabilities for the last position's logits via `softmax_numpy` (default temperature)
softmax_numpy(logits[0,-1])

In [None]:
# The same computation via `softmax`, for comparison with the cell above
softmax(logits[0,-1])

In [None]:
# Sample the first generated token from the distribution over the last prompt position.
# Note that this rebinds `temp`, so 0.6 is the temperature used from here on.
temp = 0.6
probas = softmax(logits[0, -1], temperature=temp)
next_token_id = int(np.random.choice(probas.shape[0], p=probas))
next_token_id

In [None]:
# Sanity check: the probabilities should sum to (approximately) 1
np.sum(probas)

In [None]:
# Text of the token that was just sampled
tokenizer.decode([next_token_id])

In [None]:
# Shape of the prompt-pass logits; its second axis seeds the position counter of the decode loop
logits.shape

In [None]:
# Autoregressive decoding: feed one token per step and reuse the KV cache of the previous step
start = time.time()
max_tokens = 1000
top_k = 5
generated_ids = [next_token_id]
# Number of positions already held in the KV cache (the prompt length after the first pass)
prev_seq_len = logits.shape[1]

# Resolve the end-of-sequence id once instead of on every step. The token is spelled
# with the same full-width bars as the <｜User｜>/<｜Assistant｜> tags. `token_to_id`
# returns None for a string that is not in the vocabulary, in which case the loop
# simply runs for max_tokens steps.
eos_token_id = tokenizer.token_to_id("<｜end▁of▁sentence｜>")

print("\nInitial Query:\n", init_query)
print("Generated:")
for _ in range(max_tokens):
    input_ids = np.array([[next_token_id]], dtype=np.int64)
    print(tokenizer.decode([next_token_id], skip_special_tokens=True),end="")

    # The mask must cover the cached positions plus the token being fed now, so it is
    # grown before the run rather than after it
    prev_seq_len += 1
    attention_mask = np.ones((batch_size, prev_seq_len), dtype=np.int64)

    iter_inputs = {
        "input_ids": input_ids,
        "attention_mask": attention_mask,
        **present_kv,
    }

    session_output = session.run(None, iter_inputs)

    # Update the KV cache: output 0 is the logits, then a (key, value) pair per layer
    present_kv = {f"past_key_values.{i}.key": session_output[1 + i * 2] for i in range(num_layers)}
    present_kv.update({f"past_key_values.{i}.value": session_output[1 + i * 2 + 1] for i in range(num_layers)})
    logits = session_output[0]

    # Penalize tokens that were already produced, then sample among the top-k candidates
    token_logits = logits[0,-1]
    token_logits = apply_repetition_penalty(token_logits, generated_ids, penalty=1.1)
    probas = softmax(token_logits, temperature=temp)
    top_indices, top_probas = top_k_probas(probas, k=top_k)
    next_token_id = int(np.random.choice(top_indices, p=top_probas))
    generated_ids.append(next_token_id)

    if next_token_id == eos_token_id:
        break
end = time.time()
elapsed = end - start
# Throughput is based on the tokens actually produced inside the timed loop; the first
# id in generated_ids was sampled before the timer started, and the loop may stop early
tokens_in_loop = len(generated_ids) - 1
tps = np.round((tokens_in_loop / elapsed), 2)
print(f"\nTokens Per Second: {tps}")
output_text = tokenizer.decode(generated_ids, skip_special_tokens=True)
