In [1]:
import sys
print(sys.executable)

C:\Users\DFS\Desktop\gitrepo\env_arm64\Scripts\python.exe


In [2]:
# Let's check to ensure we are in the proper environment, remember we need to be in pure arm64
import platform

arch = platform.machine()
sys = platform.system()
processor = platform.processor()
print(f"{arch}\n{sys}\n{processor}")

ARM64
Windows
ARMv8 (64-bit) Family 8 Model 1 Revision 201, Qualcomm Technologies Inc


In [3]:
# Necessary tools that we need
import onnxruntime as ort
import os
import numpy as np
import time

from pathlib import Path
from tokenizers import Tokenizer

In [5]:
# Grab the root directory as a reference
root_dir = Path.cwd().parent.parent
root_dir

WindowsPath('C:/Users/DFS/Desktop/gitrepo/qnn_sample_apps')

In [6]:
# Grab the path to onnxruntime therefore we can grab hexagon driver
onnx_root = Path(ort.__file__).parent
onnx_root

WindowsPath('C:/Users/DFS/Desktop/gitrepo/env_arm64/Lib/site-packages/onnxruntime')

In [7]:
# Subdirectory where all .onnx dependencies are located
model_subdirectory = "cpu-deepseek-r1-distill-qwen-7b"

# The embeddings model is entry point, use netron to visualize
model_name = "deepseek-r1-distill-qwen-7b-cpu-int4-rtn-block-32-acc-level-4.onnx"

# Genai configuration path
configuration_json = "genai_config.json"

# Tokenizer
tokenizer_json = "tokenizer.json"

In [8]:
# Solidifying all paths

model_path = root_dir/"models"/model_subdirectory/model_name
tokenizer_path = root_dir/"models"/model_subdirectory/tokenizer_json
config_path = root_dir/"models"/model_subdirectory/configuration_json

In [9]:
model_path

WindowsPath('C:/Users/DFS/Desktop/gitrepo/qnn_sample_apps/models/cpu-deepseek-r1-distill-qwen-7b/deepseek-r1-distill-qwen-7b-cpu-int4-rtn-block-32-acc-level-4.onnx')

In [10]:
session_options = ort.SessionOptions()

# Creating an inference session for the embedding graph
session = ort.InferenceSession(model_path)

session.get_providers()

['CPUExecutionProvider']

In [11]:
inputs = session.get_inputs()
outputs = session.get_outputs()
input_0 = inputs[0]
output_0 = outputs[0]

In [12]:
print(f"Expected Input Shape: {input_0.shape}")
print(f"Expected Input Type: {input_0.type}")
print(f"Expected Input Name: {input_0.name}")

Expected Input Shape: ['batch_size', 'sequence_length']
Expected Input Type: tensor(int64)
Expected Input Name: input_ids


In [13]:
print(f"Expected Output Shape: {output_0.shape}")
print(f"Expected Output Type: {output_0.type}")
print(f"Expected Output Name: {output_0.name}")

Expected Output Shape: ['batch_size', 'sequence_length', 152064]
Expected Output Type: tensor(float)
Expected Output Name: logits


In [14]:
for layer in inputs:
    print(f"Name: {layer.name}\n\tExpected Input Shape: {layer.shape}\n\tExpected Input Type: {layer.type}")
    print("*"*100)

Name: input_ids
	Expected Input Shape: ['batch_size', 'sequence_length']
	Expected Input Type: tensor(int64)
****************************************************************************************************
Name: attention_mask
	Expected Input Shape: ['batch_size', 'total_sequence_length']
	Expected Input Type: tensor(int64)
****************************************************************************************************
Name: past_key_values.0.key
	Expected Input Shape: ['batch_size', 4, 'past_sequence_length', 128]
	Expected Input Type: tensor(float)
****************************************************************************************************
Name: past_key_values.0.value
	Expected Input Shape: ['batch_size', 4, 'past_sequence_length', 128]
	Expected Input Type: tensor(float)
****************************************************************************************************
Name: past_key_values.1.key
	Expected Input Shape: ['batch_size', 4, 'past_sequence_length', 128]

In [15]:
for layer in outputs:
    print(f"Name: {layer.name}\n\tExpected Input Shape: {layer.shape}\n\tExpected Input Type: {layer.type}")
    print("*"*100)

Name: logits
	Expected Input Shape: ['batch_size', 'sequence_length', 152064]
	Expected Input Type: tensor(float)
****************************************************************************************************
Name: present.0.key
	Expected Input Shape: ['batch_size', 4, 'total_sequence_length', 128]
	Expected Input Type: tensor(float)
****************************************************************************************************
Name: present.0.value
	Expected Input Shape: ['batch_size', 4, 'total_sequence_length', 128]
	Expected Input Type: tensor(float)
****************************************************************************************************
Name: present.1.key
	Expected Input Shape: ['batch_size', 4, 'total_sequence_length', 128]
	Expected Input Type: tensor(float)
****************************************************************************************************
Name: present.1.value
	Expected Input Shape: ['batch_size', 4, 'total_sequence_length', 128]
	Expec

In [16]:
# Load in tokenizer using tokenizer path above
tokenizer = Tokenizer.from_file(str(tokenizer_path))

In [17]:
query_bank = {"security":"<｜User｜>\nImagine you are a cyber security professional. Provide step by step reasons why AI models should be ran locally. Please consider all aspects of data privacy and cyber security\n<｜Assistant｜><think>\n",
              "cooking":"<｜User｜>\nGive me a step-by-step baked chicken recipe, including ingredients, cook time, and sides.\n<｜Assistant｜>\n",
              "therapist":"<｜User｜>\nImagine you are a therapist with a background in cyber security. I'm am currently very anxious about my data being stolen\
              can you provide me remedies to help with my depression and anxiety\n<｜Assistant｜><think>\n"
             }

In [18]:
# An initial query
init_query = query_bank["therapist"]
encoding = tokenizer.encode(init_query)

In [19]:
print("Token IDs:", encoding.ids)
print("Tokens:", encoding.tokens)

Token IDs: [151646, 151644, 198, 51057, 498, 525, 264, 41763, 448, 264, 4004, 304, 20847, 4763, 13, 358, 2776, 1079, 5023, 1602, 37000, 911, 847, 821, 1660, 22329, 1060, 646, 498, 3410, 752, 50136, 311, 1492, 448, 847, 18210, 323, 18056, 198, 151645, 151648, 198]
Tokens: ['<｜begin▁of▁sentence｜>', '<｜User｜>', 'Ċ', 'Imagine', 'Ġyou', 'Ġare', 'Ġa', 'Ġtherapist', 'Ġwith', 'Ġa', 'Ġbackground', 'Ġin', 'Ġcyber', 'Ġsecurity', '.', 'ĠI', "'m", 'Ġam', 'Ġcurrently', 'Ġvery', 'Ġanxious', 'Ġabout', 'Ġmy', 'Ġdata', 'Ġbeing', 'Ġstolen', 'ĠĠĠĠĠĠĠĠĠĠĠĠĠ', 'Ġcan', 'Ġyou', 'Ġprovide', 'Ġme', 'Ġremedies', 'Ġto', 'Ġhelp', 'Ġwith', 'Ġmy', 'Ġdepression', 'Ġand', 'Ġanxiety', 'Ċ', '<｜Assistant｜>', '<think>', 'Ċ']


In [20]:
input_ids = encoding.ids
input_ids

[151646,
 151644,
 198,
 51057,
 498,
 525,
 264,
 41763,
 448,
 264,
 4004,
 304,
 20847,
 4763,
 13,
 358,
 2776,
 1079,
 5023,
 1602,
 37000,
 911,
 847,
 821,
 1660,
 22329,
 1060,
 646,
 498,
 3410,
 752,
 50136,
 311,
 1492,
 448,
 847,
 18210,
 323,
 18056,
 198,
 151645,
 151648,
 198]

In [21]:
# Preparing inputs for prompt

# Number of input sequences processed simultaneously
batch_size = 1

# Current sequence length for initial prompt (number of tokens in current sequence)
seq_len = len(input_ids)#.shape[2]

# Dimensionality of each token embedding vector
# hidden_size = embedding_output.shape[2]

# Number of attention heads in each transformer layer
num_heads = 28

# Size of each attention head (should be hidden_size // num_heads
attn_head_size = 128 #hidden_size//num_heads # ex. 1536/12 = 128

# Total number of transformer layers
num_layers = 28

# This is not the model's global context window (131072), this is the max number of tokens passed in the first forward pass
max_seq_len = len(input_ids)

# Sampling temperature for softmax-based logit scaling
temp = 0.9

# Number of key/value heads (key/value heads are shared amongst attention heads)
num_key_value_heads = 4

In [22]:
len(input_ids)

43

In [23]:
# pad the inputs to expected size of seq_len of 64
# target_seq_len = 64
# input_ids += [pad_token_id] * (target_seq_len - len(input_ids))
input_ids = np.array([input_ids], dtype=np.int64)
input_ids.shape

(1, 43)

In [24]:
attention_mask = np.ones((batch_size, max_seq_len), dtype=np.int64)
attention_mask

array([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]],
      dtype=int64)

In [25]:
attention_mask.shape

(1, 43)

In [26]:
# Let's initialize our KV cache for all transformer layers
empty_kv = {}
for i in range(num_layers):
    # Shape of key and value tensors for each transformer layer
    past_shape = (batch_size, num_key_value_heads, max_seq_len, attn_head_size)

    # Initialize past keys for layer i (used in attention mechanism to avoid recomputation
    empty_kv[f"past_key_values.{i}.key"] = np.zeros(past_shape, dtype=np.float32)

    # Initialize past values for layer i
    empty_kv[f"past_key_values.{i}.value"] = np.zeros(past_shape, dtype=np.float32)

len(empty_kv)

56

In [27]:
empty_kv.keys()

dict_keys(['past_key_values.0.key', 'past_key_values.0.value', 'past_key_values.1.key', 'past_key_values.1.value', 'past_key_values.2.key', 'past_key_values.2.value', 'past_key_values.3.key', 'past_key_values.3.value', 'past_key_values.4.key', 'past_key_values.4.value', 'past_key_values.5.key', 'past_key_values.5.value', 'past_key_values.6.key', 'past_key_values.6.value', 'past_key_values.7.key', 'past_key_values.7.value', 'past_key_values.8.key', 'past_key_values.8.value', 'past_key_values.9.key', 'past_key_values.9.value', 'past_key_values.10.key', 'past_key_values.10.value', 'past_key_values.11.key', 'past_key_values.11.value', 'past_key_values.12.key', 'past_key_values.12.value', 'past_key_values.13.key', 'past_key_values.13.value', 'past_key_values.14.key', 'past_key_values.14.value', 'past_key_values.15.key', 'past_key_values.15.value', 'past_key_values.16.key', 'past_key_values.16.value', 'past_key_values.17.key', 'past_key_values.17.value', 'past_key_values.18.key', 'past_key_v

In [28]:
init_prompt_inputs = {
    "input_ids": input_ids,
    "attention_mask":attention_mask,
    **empty_kv,
}
init_prompt_inputs

{'input_ids': array([[151646, 151644,    198,  51057,    498,    525,    264,  41763,
            448,    264,   4004,    304,  20847,   4763,     13,    358,
           2776,   1079,   5023,   1602,  37000,    911,    847,    821,
           1660,  22329,   1060,    646,    498,   3410,    752,  50136,
            311,   1492,    448,    847,  18210,    323,  18056,    198,
         151645, 151648,    198]], dtype=int64),
 'attention_mask': array([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]],
       dtype=int64),
 'past_key_values.0.key': array([[[[0., 0., 0., ..., 0., 0., 0.],
          [0., 0., 0., ..., 0., 0., 0.],
          [0., 0., 0., ..., 0., 0., 0.],
          ...,
          [0., 0., 0., ..., 0., 0., 0.],
          [0., 0., 0., ..., 0., 0., 0.],
          [0., 0., 0., ..., 0., 0., 0.]],
 
         [[0., 0., 0., ..., 0., 0., 0.],
          [0., 0., 0., ..., 0., 0., 0.],
          [0.,

In [29]:
init_prompt_inputs.get("past_key_values.0.key").shape

(1, 4, 43, 128)

In [30]:
# Run embedding session first
session_output = session.run(None, init_prompt_inputs)
# print("Logits:\n(batch, sequence length, vocab size)")
session_output[0].shape

(1, 43, 152064)

In [31]:
print("Logits:\n(batch, sequence length, vocab size)")
session_output[0].shape

Logits:
(batch, sequence length, vocab size)


(1, 43, 152064)

In [32]:
print("KV Cache:\n(batch, num_kv_heads, sequence length, attn_head_size)")
session_output[1].shape

KV Cache:
(batch, num_kv_heads, sequence length, attn_head_size)


(1, 4, 43, 128)

### To get longer initial context run ctx session over multiple prompts BUT use updated key/values after each prompt

In [33]:
# Update kv cache
present_kv = {f"past_key_values.{i}.key": session_output[1 + i * 2] for i in range(num_layers)}
present_kv.update({f"past_key_values.{i}.value": session_output[1 + i * 2 + 1] for i in range(num_layers)})
present_kv

{'past_key_values.0.key': array([[[[-4.48782444e-01,  1.14478230e+00,  5.70685804e-01, ...,
           -1.24165062e+02, -1.70365280e+02, -1.21685257e+02],
          [ 2.88436556e+00,  1.15511298e+00,  9.66174722e-01, ...,
           -1.29333374e+02, -1.66707245e+02, -1.25292580e+02],
          [ 4.03078413e+00,  1.19743574e+00, -9.78909612e-01, ...,
           -1.23562599e+02, -1.70036957e+02, -1.22907654e+02],
          ...,
          [ 3.33477879e+00, -1.34558225e+00,  4.99185205e-01, ...,
           -1.22956001e+02, -1.70119415e+02, -1.22070206e+02],
          [ 1.58774853e-01, -1.39673090e+00,  1.41834259e+00, ...,
           -1.22959122e+02, -1.70104172e+02, -1.22572212e+02],
          [-2.98513913e+00, -1.19274795e+00,  1.82989013e+00, ...,
           -1.22720238e+02, -1.70640137e+02, -1.22084038e+02]],
 
         [[ 5.33233881e+00, -2.57631731e+00,  3.04361773e+00, ...,
           -4.18385963e+01, -1.35961594e+02,  1.10642700e+02],
          [ 4.71969938e+00, -6.82026505e-01,  2

In [34]:
present_kv.keys()

dict_keys(['past_key_values.0.key', 'past_key_values.1.key', 'past_key_values.2.key', 'past_key_values.3.key', 'past_key_values.4.key', 'past_key_values.5.key', 'past_key_values.6.key', 'past_key_values.7.key', 'past_key_values.8.key', 'past_key_values.9.key', 'past_key_values.10.key', 'past_key_values.11.key', 'past_key_values.12.key', 'past_key_values.13.key', 'past_key_values.14.key', 'past_key_values.15.key', 'past_key_values.16.key', 'past_key_values.17.key', 'past_key_values.18.key', 'past_key_values.19.key', 'past_key_values.20.key', 'past_key_values.21.key', 'past_key_values.22.key', 'past_key_values.23.key', 'past_key_values.24.key', 'past_key_values.25.key', 'past_key_values.26.key', 'past_key_values.27.key', 'past_key_values.0.value', 'past_key_values.1.value', 'past_key_values.2.value', 'past_key_values.3.value', 'past_key_values.4.value', 'past_key_values.5.value', 'past_key_values.6.value', 'past_key_values.7.value', 'past_key_values.8.value', 'past_key_values.9.value', '

In [35]:
# Dimension checks
present_kv["past_key_values.0.key"].shape

(1, 4, 43, 128)

In [36]:
present_kv["past_key_values.27.value"].shape

(1, 4, 43, 128)

In [37]:
logits = session_output[0]
logits

array([[[-0.47242367,  0.8673216 ,  0.4354167 , ..., -2.8407269 ,
         -2.8418145 , -2.8410952 ],
        [-0.46667972,  0.20263086,  0.06269946, ..., -2.512344  ,
         -2.5134845 , -2.5127416 ],
        [ 1.3548192 ,  2.6115346 ,  0.04399791, ..., -3.9077337 ,
         -3.9075003 , -3.9095461 ],
        ...,
        [ 0.6111367 , -4.7736187 , -0.4854146 , ...,  1.5674965 ,
          1.5734429 ,  1.5607604 ],
        [ 7.62777   ,  2.2104542 ,  5.5720787 , ...,  2.047556  ,
          2.0518618 ,  2.0492883 ],
        [ 4.918423  ,  8.622754  , 10.432061  , ...,  4.655695  ,
          4.6564503 ,  4.6553245 ]]], dtype=float32)

In [38]:
logits[0,-1].shape

(152064,)

In [39]:
def softmax_numpy(x: np.array, temperature: float=1) -> np.array:
    # stabilize x in case of large numbers 
    x = x - np.max(x)

    # Apply temperature
    x = x/temperature

    # Apply Softmax
    return np.exp(x)/np.sum(np.exp(x), axis=-1)

def top_k_probas(probas: np.array, k: int=5) -> np.array:
    # Copy probas so in-place operations don't work on original variable
    probas = probas.copy()
    # Normalize probabilities
    probas /= np.sum(probas)
    # Using -probas to get in descending order
    top_indices_sorted = np.argsort(-probas)[:k]
    top_k_probas = probas[top_indices_sorted]

    # Renormalize top-k probabilites to sum to 1 (probabilites must sum to 1 to use np.random.choice
    top_k_probas /= np.sum(top_k_probas)

    # Return top k probabilities
    return top_indices_sorted, top_k_probas

def apply_repetition_penalty(logits, generated_ids, penalty=1.1):
    for token_id in set(generated_ids):
        logits[token_id] /= penalty
    return logits

In [40]:
# Softmax implemented
# x-np.max(x) => for stability in case of large numbers
softmax = lambda x, temperature=1: np.exp((x-np.max(x))/temperature)/np.sum(np.exp((x-np.max(x))/temperature), axis=-1)

In [41]:
softmax_numpy(logits[0,-1])

array([1.2490824e-11, 5.0741372e-10, 3.0983771e-09, ..., 9.6048352e-12,
       9.6120924e-12, 9.6012807e-12], dtype=float32)

In [42]:
softmax(logits[0,-1])

array([1.2490824e-11, 5.0741372e-10, 3.0983771e-09, ..., 9.6048352e-12,
       9.6120924e-12, 9.6012807e-12], dtype=float32)

In [43]:
# Grabs last tokens logits
temp = 0.6
probas = softmax(logits[0,-1], temperature=temp)
# probas = probas / probas.sum()
next_token_id = int(np.random.choice(len(probas), p=probas)) #int(np.argmax(probas))
next_token_id

32313

In [44]:
np.sum(probas)

0.99999994

In [45]:
tokenizer.decode([next_token_id])

'Okay'

In [46]:
logits.shape

(1, 43, 152064)

In [47]:
# temp = 0.6
start = time.time()
max_tokens = 1000
top_k = 5
generated_ids = [next_token_id]
prev_seq_len = logits.shape[1]
# print(prev_seq_len)
# print(attention_mask.shape)
print("\nInitial Query:\n", init_query)
print("Generated:")
for _ in range(max_tokens):
    input_ids = np.array([[next_token_id]], dtype=np.int64)
    # print(tokenizer.decode(generated_ids, skip_special_tokens=True))
    print(tokenizer.decode([next_token_id], skip_special_tokens=True),end="")
    
    iter_inputs = {
    "input_ids": input_ids,
    "attention_mask": attention_mask,
    **present_kv,
    }

    session_output = session.run(None, iter_inputs)
    prev_seq_len += 1
    # Update attention mask
    attention_mask = np.ones((batch_size, prev_seq_len), dtype=np.int64)
    # Update KV Cache
    present_kv = {f"past_key_values.{i}.key": session_output[1 + i * 2] for i in range(num_layers)}
    present_kv.update({f"past_key_values.{i}.value": session_output[1 + i * 2 + 1] for i in range(num_layers)})
    # print(prev_seq_len)
    # print(present_kv.get("past_key_values.0.key").shape)
    # print(len(attention_mask))
    logits = session_output[0]

    token_logits = logits[0,-1]
    token_logits = apply_repetition_penalty(token_logits, generated_ids, penalty=1.1)
#     # Get probabilities
    probas = softmax(token_logits, temperature=temp)
    top_indices, top_probas = top_k_probas(probas, k=top_k) 
    next_token_id = int(np.random.choice(top_indices, p=top_probas)) #int(np.argmax(probas))
    generated_ids.append(next_token_id)


    if next_token_id == tokenizer.token_to_id("< | end_of_sentence | >"):
        break
end = time.time()
elapsed = end - start
tps = np.round((max_tokens / elapsed), 2)
print(f"\nTokens Per Second: {tps}")
output_text = tokenizer.decode(generated_ids, skip_special_tokens=True)



Initial Query:
 <｜User｜>
Imagine you are a therapist with a background in cyber security. I'm am currently very anxious about my data being stolen              can you provide me remedies to help with my depression and anxiety
<｜Assistant｜><think>

Generated:
Okay, so I'm really anxious about my data being stolen. I've been feeling really down lately because of this worry. I guess it's like, if someone can take my private information, they could ruin everything for me. My credit score is probably going to drop because of identity theft, right? That makes me feel so scared and overwhelmed.

I know that anxiety often comes from overthinking things, but in this case, the fear itself might be more real than usual. Maybe I should start by understanding why exactly I'm worried. Is it just a random thought, or does it make me sleep at night? If it's making me sleep poorly, that's another sign that it's significant enough to address.

I remember reading somewhere that when you're anxious, you