In [1]:
import sys
print(sys.executable)

C:\Users\DFS\Desktop\gitrepo\env_arm64\Scripts\python.exe


In [2]:
# Let's check to ensure we are in the proper environment, remember we need to be in pure arm64
import platform

arch = platform.machine()
sys = platform.system()
processor = platform.processor()
print(f"{arch}\n{sys}\n{processor}")

ARM64
Windows
ARMv8 (64-bit) Family 8 Model 1 Revision 201, Qualcomm Technologies Inc


In [3]:
# Necessary tools that we need
import onnxruntime as ort
import os
import numpy as np
import time
import gc
import psutil

from pathlib import Path
from tokenizers import Tokenizer

In [4]:
gc.collect()

0

In [5]:
# try:
#     if 'ctx_itr_session' in globals():
#         del ctx_itr_session
#     if 'ctx_session' in globals():
#         del ctx_session
#     if 'embedding_session' in globals():
#         del embedding_session
#     gc.collect()
# except:
#     pass

In [6]:
# # Preparing inputs for prompt
# batch_size = 1
# seq_len = embedding_output.shape[1]
# hidden_size = embedding_output.shape[2]
# num_heads = 2
# attn_head_size = 128 #hidden_size // num_heads
# num_layers = 28
# max_seq_len = 64
# temp = 0.6

In [7]:
# Grab the root directory as a reference
root_dir = Path.cwd().parent.parent
root_dir

WindowsPath('C:/Users/DFS/Desktop/gitrepo/qnn_sample_apps')

In [8]:
# Grab the path to onnxruntime therefore we can grab hexagon driver
onnx_root = Path(ort.__file__).parent
onnx_root

WindowsPath('C:/Users/DFS/Desktop/gitrepo/env_arm64/Lib/site-packages/onnxruntime')

In [9]:
# Subdirectory where all .onnx dependencies are located
model_subdirectory = "qnn-deepseek-r1-distill-qwen-7b"

# The embeddings model is entry point, use netron to visualize
model_name = "deepseek_r1_7b_embeddings_quant_v1.0.onnx"

# This graph is used to process initial prompt, we can pass up to 64 tokens
context_model = "deepseek_r1_7b_ctx_v1.0.onnx_ctx.onnx"

# This graph is used to perform next word inference after the initial prompt
context_model_iter = "deepseek_r1_7b_iter_v1.0.onnx_ctx.onnx"

# This graph allows us to take hidden states and return logits
head_model = "deepseek_r1_7b_head_quant_v1.0.onnx"

# Genai configuration path
configuration_json = "genai_config.json"

# Tokenizer
tokenizer_json = "tokenizer.json"

In [10]:
# Solidifying all paths

model_path = root_dir/"models"/model_subdirectory/model_name
ctx_path = root_dir/"models"/model_subdirectory/context_model
ctx_path_itr = root_dir/"models"/model_subdirectory/context_model_iter
head_path = root_dir/"models"/model_subdirectory/head_model
tokenizer_path = root_dir/"models"/model_subdirectory/tokenizer_json
config_path = root_dir/"models"/model_subdirectory/configuration_json
hexagon_driver = onnx_root/"capi"/"QnnHtp.dll"

In [11]:
model_path

WindowsPath('C:/Users/DFS/Desktop/gitrepo/qnn_sample_apps/models/qnn-deepseek-r1-distill-qwen-7b/deepseek_r1_7b_embeddings_quant_v1.0.onnx')

In [12]:
hexagon_driver

WindowsPath('C:/Users/DFS/Desktop/gitrepo/env_arm64/Lib/site-packages/onnxruntime/capi/QnnHtp.dll')

In [13]:
session_options = ort.SessionOptions()

qnn_provider_options = {
    # Path to the backend driver "Hexagon"
    "backend_path": hexagon_driver,
    # https://onnxruntime.ai/docs/execution-providers/QNN-ExecutionProvider.html#configuration-options
    "htp_performance_mode": "burst",
    "soc_model": "60",
    # "enable_htp_context_cache": "0",
    "profiling_level": "detailed",
    "profiling_file_path": root_dir/"models"/model_subdirectory/"profiling_deepseek_7b.csv",
    # Enabling graph optimization causes problems, need to look into this
    "htp_graph_finalization_optimization_mode": "3",
}

# Creating an inference session for the embedding graph
embedding_session = ort.InferenceSession(model_path,
                                providers= [("QNNExecutionProvider",qnn_provider_options)],
                               sess_options= session_options
                              )
# Creating an inference session for the single prediction context graph (iter_ctx)
ctx_itr_session = ort.InferenceSession(ctx_path_itr,
                                         providers=[("QNNExecutionProvider",qnn_provider_options)],
                                         sess_options= session_options
                                      )

# Creating an inference session for the initial context graph
ctx_session = ort.InferenceSession(ctx_path,
                                    providers=[("QNNExecutionProvider",qnn_provider_options)],
                                    sess_options= session_options
                                        )



# Creating an inference session for the head session which will provide logits from hidden states
head_session = ort.InferenceSession(head_path,
                                providers= [("QNNExecutionProvider",qnn_provider_options)],
                               sess_options= session_options
                              )

embedding_session.get_providers()

['QNNExecutionProvider', 'CPUExecutionProvider']

In [14]:
gc.collect()

0

In [15]:
psutil.virtual_memory().percent

64.4

In [16]:
inputs = embedding_session.get_inputs()
outputs = embedding_session.get_outputs()
input_0 = inputs[0]
output_0 = outputs[0]

In [17]:
print(f"Expected Input Shape: {input_0.shape}")
print(f"Expected Input Type: {input_0.type}")
print(f"Expected Input Name: {input_0.name}")

Expected Input Shape: [1, 'seq_len']
Expected Input Type: tensor(int64)
Expected Input Name: input_ids


In [18]:
print(f"Expected Output Shape: {output_0.shape}")
print(f"Expected Output Type: {output_0.type}")
print(f"Expected Output Name: {output_0.name}")

Expected Output Shape: [1, 'seq_len', 3584]
Expected Output Type: tensor(float)
Expected Output Name: input_hidden_states


In [19]:
inputs_ctx = ctx_session.get_inputs()
outputs_ctx = ctx_session.get_outputs()
input_0_ctx = inputs_ctx[0]
output_0_ctx = outputs_ctx[0]

In [20]:
print(f"Expected Input Shape: {input_0_ctx.shape}")
print(f"Expected Input Type: {input_0_ctx.type}")
print(f"Expected Input Name: {input_0_ctx.name}")

Expected Input Shape: [1, 4, 'max_seq_len', 128]
Expected Input Type: tensor(float)
Expected Input Name: past_keys_0


In [21]:
print(f"Expected Output Shape: {output_0_ctx.shape}")
print(f"Expected Output Type: {output_0_ctx.type}")
print(f"Expected Output Name: {output_0_ctx.name}")

Expected Output Shape: [1, 64, 3584]
Expected Output Type: tensor(float)
Expected Output Name: output_hidden_states


In [22]:
inputs_ctx_itr = ctx_itr_session.get_inputs()
outputs_ctx_itr = ctx_itr_session.get_outputs()
input_0_ctx_itr = inputs_ctx_itr[0]
output_0_ctx_itr = outputs_ctx_itr[0]

In [23]:
print(f"Expected Input Shape: {input_0_ctx_itr.shape}")
print(f"Expected Input Type: {input_0_ctx_itr.type}")
print(f"Expected Input Name: {input_0_ctx_itr.name}")

Expected Input Shape: [1, 1, 3584]
Expected Input Type: tensor(float)
Expected Input Name: input_hidden_states


In [24]:
print(f"Expected Output Shape: {output_0_ctx_itr.shape}")
print(f"Expected Output Type: {output_0_ctx_itr.type}")
print(f"Expected Output Name: {output_0_ctx_itr.name}")

Expected Output Shape: [1, 1, 3584]
Expected Output Type: tensor(float)
Expected Output Name: output_hidden_states


In [25]:
inputs_head = head_session.get_inputs()
outputs_head = head_session.get_outputs()
input_0_head = inputs_head[0]
output_0_head = outputs_head[0]

In [26]:
print(f"Expected Input Name: {input_0_head.name}")
print(f"Expected Input Shape: {input_0_head.shape}")
print(f"Expected Input Type: {input_0_head.type}")

Expected Input Name: output_hidden_states
Expected Input Shape: [1, 'seq_len', 3584]
Expected Input Type: tensor(float)


In [27]:
print(f"Expected Output Name: {output_0_head.name}")
print(f"Expected Output Shape: {output_0_head.shape}")
print(f"Expected Output Type: {output_0_head.type}")

Expected Output Name: logits
Expected Output Shape: [1, 'seq_len', 152064]
Expected Output Type: tensor(float)


In [28]:
# Load in tokenizer using tokenizer path above
tokenizer = Tokenizer.from_file(str(tokenizer_path))

In [29]:
query_bank = {"security":"<｜User｜>\nImagine you are a cyber security professional. Provide step by step reasons why AI models should be ran locally. Please consider all aspects of data privacy and cyber security\n<｜Assistant｜><think>\n",
              "cooking":"<｜User｜>\nGive me a step-by-step baked chicken recipe, including ingredients, cook time, and sides.\n<｜Assistant｜>\n",
              "therapist":"<｜User｜>\nImagine you are a therapist with a background in cyber security. I'm am currently very anxious about my data being stolen\
              can you provide me remedies to help with my depression and anxiety\n<｜Assistant｜><think>\n"
             }

In [30]:
# An initial query
init_query = query_bank["therapist"]
encoding = tokenizer.encode(init_query)

In [31]:
print("Token IDs:", encoding.ids)
print("Tokens:", encoding.tokens)

Token IDs: [151646, 151644, 198, 51057, 498, 525, 264, 41763, 448, 264, 4004, 304, 20847, 4763, 13, 358, 2776, 1079, 5023, 1602, 37000, 911, 847, 821, 1660, 22329, 1060, 646, 498, 3410, 752, 50136, 311, 1492, 448, 847, 18210, 323, 18056, 198, 151645, 151648, 198]
Tokens: ['<｜begin▁of▁sentence｜>', '<｜User｜>', 'Ċ', 'Imagine', 'Ġyou', 'Ġare', 'Ġa', 'Ġtherapist', 'Ġwith', 'Ġa', 'Ġbackground', 'Ġin', 'Ġcyber', 'Ġsecurity', '.', 'ĠI', "'m", 'Ġam', 'Ġcurrently', 'Ġvery', 'Ġanxious', 'Ġabout', 'Ġmy', 'Ġdata', 'Ġbeing', 'Ġstolen', 'ĠĠĠĠĠĠĠĠĠĠĠĠĠ', 'Ġcan', 'Ġyou', 'Ġprovide', 'Ġme', 'Ġremedies', 'Ġto', 'Ġhelp', 'Ġwith', 'Ġmy', 'Ġdepression', 'Ġand', 'Ġanxiety', 'Ċ', '<｜Assistant｜>', '<think>', 'Ċ']


In [32]:
input_ids = encoding.ids
input_ids

[151646,
 151644,
 198,
 51057,
 498,
 525,
 264,
 41763,
 448,
 264,
 4004,
 304,
 20847,
 4763,
 13,
 358,
 2776,
 1079,
 5023,
 1602,
 37000,
 911,
 847,
 821,
 1660,
 22329,
 1060,
 646,
 498,
 3410,
 752,
 50136,
 311,
 1492,
 448,
 847,
 18210,
 323,
 18056,
 198,
 151645,
 151648,
 198]

In [33]:
# pad the inputs to expected size of seq_len of 64
# target_seq_len = 64
# input_ids += [pad_token_id] * (target_seq_len - len(input_ids))
input_ids = np.array([input_ids], dtype=np.int64)
input_ids.shape

(1, 43)

In [34]:
# Run embedding session first
embedding_output = embedding_session.run(None, {"input_ids":input_ids})[0]
print("(batch, sequence length, embedding dimension)")
embedding_output.shape

(batch, sequence length, embedding dimension)


(1, 43, 3584)

In [35]:
# Preparing inputs for prompt

# Number of input sequences processed simultaneously
batch_size = 1

# Current sequence length for initial prompt (number of tokens in current sequence)
seq_len = embedding_output.shape[1]

# Dimensionality of each token embedding vector
hidden_size = embedding_output.shape[2]

# Number of attention heads in each transformer layer
num_heads = 28

# Size of each attention head (should be hidden_size // num_heads
attn_head_size = 128 #hidden_size//num_heads # ex. 1536/12 = 128

# Total number of transformer layers
num_layers = 28

# This is not the model's global context window (131072), this is the max number of tokens passed in the first forward pass
max_seq_len = 64

# Sampling temperature for softmax-based logit scaling
temp = 0.7

# Number of key/value heads (key/value heads are shared amongst attention heads)
num_key_value_heads = 4

In [36]:
attn_head_size

128

In [37]:
hidden_size

3584

In [38]:
# Let's initialize our KV cache for all transformer layers
empty_kv = {}
for i in range(num_layers):
    # Shape of key and value tensors for each transformer layer
    past_shape = (batch_size, num_key_value_heads, max_seq_len, attn_head_size)

    # Initialize past keys for layer i (used in attention mechanism to avoid recomputation
    empty_kv[f"past_keys_{i}"] = np.zeros(past_shape, dtype=np.float32)

    # Initialize past values for layer i
    empty_kv[f"past_values_{i}"] = np.zeros(past_shape, dtype=np.float32)

len(empty_kv)

56

In [39]:
empty_kv.keys()

dict_keys(['past_keys_0', 'past_values_0', 'past_keys_1', 'past_values_1', 'past_keys_2', 'past_values_2', 'past_keys_3', 'past_values_3', 'past_keys_4', 'past_values_4', 'past_keys_5', 'past_values_5', 'past_keys_6', 'past_values_6', 'past_keys_7', 'past_values_7', 'past_keys_8', 'past_values_8', 'past_keys_9', 'past_values_9', 'past_keys_10', 'past_values_10', 'past_keys_11', 'past_values_11', 'past_keys_12', 'past_values_12', 'past_keys_13', 'past_values_13', 'past_keys_14', 'past_values_14', 'past_keys_15', 'past_values_15', 'past_keys_16', 'past_values_16', 'past_keys_17', 'past_values_17', 'past_keys_18', 'past_values_18', 'past_keys_19', 'past_values_19', 'past_keys_20', 'past_values_20', 'past_keys_21', 'past_values_21', 'past_keys_22', 'past_values_22', 'past_keys_23', 'past_values_23', 'past_keys_24', 'past_values_24', 'past_keys_25', 'past_values_25', 'past_keys_26', 'past_values_26', 'past_keys_27', 'past_values_27'])

In [40]:
embedding_output.shape

(1, 43, 3584)

In [41]:
# Subtract 1 to get the index of the last token in the sequence (since indexing is 0-based)
init_sequence_length = np.array(embedding_output.shape[1]-1, dtype=np.int32).reshape(1,1)

# Set the maximum sequence length for the model's current forward pass
max_seq_length = np.array([max_seq_len], dtype=np.int32)

In [42]:
seq_lens = {
    "past_seq_len": init_sequence_length,
    "total_seq_len": max_seq_length 
}
seq_lens

{'past_seq_len': array([[42]]), 'total_seq_len': array([64])}

In [43]:
max_seq_length

array([64])

In [44]:
# pad the inputs to expected size of seq_len of 64
batch_size, seq_len, embed_dim = embedding_output.shape
padding_id = 151643
padded_embedding = np.full((batch_size, max_seq_length[0], embed_dim), padding_id, dtype=embedding_output.dtype) #np.zeros((batch_size, target_seq_len, embed_dim), dtype=embedding_output.dtype)

padded_embedding[:, :seq_len, :] = embedding_output
padded_embedding.shape

(1, 64, 3584)

In [45]:
# Check to ensure padding vectors were added
padded_embedding[:,:seq_len+1,:]

array([[[-3.0272333e-03,  3.7840416e-03, -1.5136166e-03, ...,
          4.5743864e-03,  7.6239771e-04, -5.3367838e-03],
        [-4.2586653e-03,  2.8391103e-03,  5.6782207e-03, ...,
          3.0445447e-03,  4.5668171e-03,  2.2834085e-03],
        [ 1.7522871e-02,  1.7522871e-02,  4.6727657e-02, ...,
         -6.5324926e-03,  0.0000000e+00,  3.9194956e-02],
        ...,
        [ 3.1730570e-03, -2.1153714e-03,  8.4614856e-03, ...,
         -2.6563108e-03,  1.7708738e-03, -5.3126216e-03],
        [ 1.7522871e-02,  1.7522871e-02,  4.6727657e-02, ...,
         -6.5324926e-03,  0.0000000e+00,  3.9194956e-02],
        [ 1.5164300e+05,  1.5164300e+05,  1.5164300e+05, ...,
          1.5164300e+05,  1.5164300e+05,  1.5164300e+05]]], dtype=float32)

In [46]:
empty_kv['past_keys_0'].shape

(1, 4, 64, 128)

In [47]:
init_prompt_inputs = {
    **empty_kv,
    **seq_lens,
    "input_hidden_states": padded_embedding,
}
init_prompt_inputs

{'past_keys_0': array([[[[0., 0., 0., ..., 0., 0., 0.],
          [0., 0., 0., ..., 0., 0., 0.],
          [0., 0., 0., ..., 0., 0., 0.],
          ...,
          [0., 0., 0., ..., 0., 0., 0.],
          [0., 0., 0., ..., 0., 0., 0.],
          [0., 0., 0., ..., 0., 0., 0.]],
 
         [[0., 0., 0., ..., 0., 0., 0.],
          [0., 0., 0., ..., 0., 0., 0.],
          [0., 0., 0., ..., 0., 0., 0.],
          ...,
          [0., 0., 0., ..., 0., 0., 0.],
          [0., 0., 0., ..., 0., 0., 0.],
          [0., 0., 0., ..., 0., 0., 0.]],
 
         [[0., 0., 0., ..., 0., 0., 0.],
          [0., 0., 0., ..., 0., 0., 0.],
          [0., 0., 0., ..., 0., 0., 0.],
          ...,
          [0., 0., 0., ..., 0., 0., 0.],
          [0., 0., 0., ..., 0., 0., 0.],
          [0., 0., 0., ..., 0., 0., 0.]],
 
         [[0., 0., 0., ..., 0., 0., 0.],
          [0., 0., 0., ..., 0., 0., 0.],
          [0., 0., 0., ..., 0., 0., 0.],
          ...,
          [0., 0., 0., ..., 0., 0., 0.],
          [0.,

In [48]:
init_prompt_inputs.keys()

dict_keys(['past_keys_0', 'past_values_0', 'past_keys_1', 'past_values_1', 'past_keys_2', 'past_values_2', 'past_keys_3', 'past_values_3', 'past_keys_4', 'past_values_4', 'past_keys_5', 'past_values_5', 'past_keys_6', 'past_values_6', 'past_keys_7', 'past_values_7', 'past_keys_8', 'past_values_8', 'past_keys_9', 'past_values_9', 'past_keys_10', 'past_values_10', 'past_keys_11', 'past_values_11', 'past_keys_12', 'past_values_12', 'past_keys_13', 'past_values_13', 'past_keys_14', 'past_values_14', 'past_keys_15', 'past_values_15', 'past_keys_16', 'past_values_16', 'past_keys_17', 'past_values_17', 'past_keys_18', 'past_values_18', 'past_keys_19', 'past_values_19', 'past_keys_20', 'past_values_20', 'past_keys_21', 'past_values_21', 'past_keys_22', 'past_values_22', 'past_keys_23', 'past_values_23', 'past_keys_24', 'past_values_24', 'past_keys_25', 'past_values_25', 'past_keys_26', 'past_values_26', 'past_keys_27', 'past_values_27', 'past_seq_len', 'total_seq_len', 'input_hidden_states'])

In [49]:
init_prompt_inputs['past_keys_0'].shape

(1, 4, 64, 128)

In [50]:
prompt_outputs = ctx_session.run(None, init_prompt_inputs)
len(prompt_outputs)

57

In [51]:
prompt_outputs[0].shape

(1, 64, 3584)

In [52]:
# Extract final hidden states and present_keys/values
print("Batch, prompt length (up to max 64 tokens), embedding size")
output_hidden_states = prompt_outputs[0]
output_hidden_states.shape

Batch, prompt length (up to max 64 tokens), embedding size


(1, 64, 3584)

In [53]:
print("Batch, key/value heads, prompt length (up to max 64 tokens), head dimension (size of projection for each head)")
print("Note: Total embedding size is 1536, this is split amongst 12 attention heads")
prompt_outputs[1].shape

Batch, key/value heads, prompt length (up to max 64 tokens), head dimension (size of projection for each head)
Note: Total embedding size is 1536, this is split amongst 12 attention heads


(1, 4, 64, 128)

In [54]:
prompt_outputs[1][0].shape

(4, 64, 128)

In [55]:
print("Prompt Length x Head Dimension (Embedding Window)")
prompt_outputs[1][0][0].shape

Prompt Length x Head Dimension (Embedding Window)


(64, 128)

### To get longer initial context run ctx session over multiple prompts BUT use updated key/values after each prompt

In [56]:
# Populate initial past key/values
# Must start with index==1 because index==0 is output_hidden_states (see genai_config.json)
present_kv = {f"past_keys_{i}": prompt_outputs[1 + i * 2] for i in range(num_layers)}
present_kv.update({f"past_values_{i}": prompt_outputs[1 + i * 2 + 1] for i in range(num_layers)})
present_kv

{'past_keys_0': array([[[[-4.44035530e-01,  1.09671426e+00,  5.83131015e-01, ...,
           -1.24190849e+02, -1.70338440e+02, -1.21622932e+02],
          [ 2.84841299e+00,  1.10430491e+00,  1.02552092e+00, ...,
           -1.29574661e+02, -1.66587555e+02, -1.25267395e+02],
          [ 4.00193834e+00,  1.21116161e+00, -9.55790341e-01, ...,
           -1.23581772e+02, -1.70079849e+02, -1.22806931e+02],
          ...,
          [-4.60026121e+00, -3.31813395e-01, -7.02894211e-01, ...,
           -1.22131866e+02, -1.70724197e+02, -1.21174057e+02],
          [-3.71072245e+00, -6.01452649e-01, -1.10069454e-01, ...,
           -1.22110649e+02, -1.70739120e+02, -1.21153412e+02],
          [ 5.90437412e-01, -4.47555721e-01,  5.41807830e-01, ...,
           -1.22089439e+02, -1.70754013e+02, -1.21132751e+02]],
 
         [[ 5.37657452e+00, -2.61071491e+00,  3.06544995e+00, ...,
           -4.18730850e+01, -1.35976517e+02,  1.10457848e+02],
          [ 4.77323914e+00, -6.04660511e-01,  3.04412961e

In [57]:
present_kv.keys()

dict_keys(['past_keys_0', 'past_keys_1', 'past_keys_2', 'past_keys_3', 'past_keys_4', 'past_keys_5', 'past_keys_6', 'past_keys_7', 'past_keys_8', 'past_keys_9', 'past_keys_10', 'past_keys_11', 'past_keys_12', 'past_keys_13', 'past_keys_14', 'past_keys_15', 'past_keys_16', 'past_keys_17', 'past_keys_18', 'past_keys_19', 'past_keys_20', 'past_keys_21', 'past_keys_22', 'past_keys_23', 'past_keys_24', 'past_keys_25', 'past_keys_26', 'past_keys_27', 'past_values_0', 'past_values_1', 'past_values_2', 'past_values_3', 'past_values_4', 'past_values_5', 'past_values_6', 'past_values_7', 'past_values_8', 'past_values_9', 'past_values_10', 'past_values_11', 'past_values_12', 'past_values_13', 'past_values_14', 'past_values_15', 'past_values_16', 'past_values_17', 'past_values_18', 'past_values_19', 'past_values_20', 'past_values_21', 'past_values_22', 'past_values_23', 'past_values_24', 'past_values_25', 'past_values_26', 'past_values_27'])

In [58]:
# Dimension checks
present_kv["past_keys_0"].shape

(1, 4, 64, 128)

In [59]:
present_kv["past_keys_27"].shape

(1, 4, 64, 128)

In [60]:
output_hidden_states.shape

(1, 64, 3584)

In [61]:
logits = head_session.run(None, {"output_hidden_states": output_hidden_states})[0]
logits

array([[[ 0.4983644 ,  1.6230613 ,  1.5614158 , ..., -1.0062782 ,
         -1.0203204 , -0.99798524],
        [-0.25778225,  0.52294326,  0.76917446, ..., -1.2872347 ,
         -1.3040742 , -1.2777851 ],
        [-3.3893716 , -2.3362472 ,  0.7103773 , ..., -5.6387787 ,
         -5.6436152 , -5.635117  ],
        ...,
        [ 7.9736085 ,  5.0458326 ,  6.4131646 , ...,  0.9678602 ,
          0.9553711 ,  0.9639133 ],
        [12.555673  ,  0.979866  ,  5.515235  , ...,  0.7585329 ,
          0.74953336,  0.7536065 ],
        [ 7.2622547 ,  3.6447115 ,  2.5632672 , ..., -2.0279808 ,
         -2.028657  , -2.0313673 ]]], dtype=float32)

In [62]:
logits.shape

(1, 64, 152064)

In [63]:
logits[0,-1].shape

(152064,)

In [64]:
def softmax_numpy(x: np.array, temperature: float=1) -> np.array:
    # stabilize x in case of large numbers 
    x = x - np.max(x)

    # Apply temperature
    x = x/temperature

    # Apply Softmax
    return np.exp(x)/np.sum(np.exp(x), axis=-1)

def top_k_probas(probas: np.array, k: int=5) -> np.array:
    # Copy probas so in-place operations don't work on original variable
    probas = probas.copy()
    # Normalize probabilities
    probas /= np.sum(probas)
    # Using -probas to get in descending order
    top_indices_sorted = np.argsort(-probas)[:k]
    top_k_probas = probas[top_indices_sorted]

    # Renormalize top-k probabilites to sum to 1 (probabilites must sum to 1 to use np.random.choice
    top_k_probas /= np.sum(top_k_probas)

    # Return top k probabilities
    return top_indices_sorted, top_k_probas

def apply_repetition_penalty(logits, generated_ids, penalty=1.1):
    for token_id in set(generated_ids):
        logits[token_id] /= penalty
    return logits

In [65]:
# Softmax implemented
# x-np.max(x) => for stability in case of large numbers
softmax = lambda x, temperature=1: np.exp((x-np.max(x))/temperature)/np.sum(np.exp((x-np.max(x))/temperature), axis=-1)

In [66]:
softmax_numpy(logits[0,-1])

array([4.2173695e-05, 1.1323028e-06, 3.8396985e-07, ..., 3.8935313e-09,
       3.8909032e-09, 3.8803640e-09], dtype=float32)

In [67]:
softmax(logits[0,-1])

array([4.2173695e-05, 1.1323028e-06, 3.8396985e-07, ..., 3.8935313e-09,
       3.8909032e-09, 3.8803640e-09], dtype=float32)

In [68]:
# Grabs last tokens logits
temp = 0.6
probas = softmax(logits[0,-1], temperature=temp)
# probas = probas / probas.sum()
next_token_id = int(np.random.choice(len(probas), p=probas)) #int(np.argmax(probas))
next_token_id

911

In [69]:
np.sum(probas)

0.9999999

In [70]:
tokenizer.decode([next_token_id])

' about'

In [71]:
present_kv.keys()

dict_keys(['past_keys_0', 'past_keys_1', 'past_keys_2', 'past_keys_3', 'past_keys_4', 'past_keys_5', 'past_keys_6', 'past_keys_7', 'past_keys_8', 'past_keys_9', 'past_keys_10', 'past_keys_11', 'past_keys_12', 'past_keys_13', 'past_keys_14', 'past_keys_15', 'past_keys_16', 'past_keys_17', 'past_keys_18', 'past_keys_19', 'past_keys_20', 'past_keys_21', 'past_keys_22', 'past_keys_23', 'past_keys_24', 'past_keys_25', 'past_keys_26', 'past_keys_27', 'past_values_0', 'past_values_1', 'past_values_2', 'past_values_3', 'past_values_4', 'past_values_5', 'past_values_6', 'past_values_7', 'past_values_8', 'past_values_9', 'past_values_10', 'past_values_11', 'past_values_12', 'past_values_13', 'past_values_14', 'past_values_15', 'past_values_16', 'past_values_17', 'past_values_18', 'past_values_19', 'past_values_20', 'past_values_21', 'past_values_22', 'past_values_23', 'past_values_24', 'past_values_25', 'past_values_26', 'past_values_27'])

In [72]:
present_kv['past_keys_0'].shape

(1, 4, 64, 128)

In [73]:
# temp = 0.6
start = time.time()
max_tokens = 100
top_k = 5
generated_ids = [next_token_id]
prev_seq_len = 64

print("\nInitial Query:\n", init_query)
print("Generated:")
for _ in range(max_tokens):
    input_ids = np.array([[next_token_id]], dtype=np.int64)
    # print(tokenizer.decode(generated_ids, skip_special_tokens=True))
    print(tokenizer.decode([next_token_id], skip_special_tokens=True),end="")
    embedding_output = embedding_session.run(None, {"input_ids": input_ids})[0]

    # print(embedding_output.shape)

    lengths = {
    "past_seq_len": np.array([[prev_seq_len]], dtype=np.int32),
    "total_seq_len": np.array([prev_seq_len + 1], dtype=np.int32)
    }

    iter_inputs = {
    "input_hidden_states": embedding_output,
    **present_kv,
    **lengths,
    }

    iter_outputs = ctx_itr_session.run(None, iter_inputs)

    # Hidden states are stored in last index of iter outputs
    output_hidden_states = iter_outputs[0]

    # For output tensor update key/value layers start at index = 0 
    # NOTE: Remember output of ctx_itr_session has output_hidden_states at 0th index, start with 1
    present_kv = {f"past_keys_{i}": iter_outputs[1 + i * 2] for i in range(num_layers)}
    present_kv.update({f"past_values_{i}":iter_outputs[1 + i * 2 + 1] for i in range(num_layers)})
    logits = head_session.run(None, {"output_hidden_states": output_hidden_states})[0]

    token_logits = logits[0,-1]
    token_logits = apply_repetition_penalty(token_logits, generated_ids, penalty=1.1)
    # Get probabilities
    probas = softmax(token_logits, temperature=temp)
    top_indices, top_probas = top_k_probas(probas, k=top_k) 
    next_token_id = int(np.random.choice(top_indices, p=top_probas)) #int(np.argmax(probas))
    generated_ids.append(next_token_id)
    prev_seq_len += 1

    if next_token_id == tokenizer.token_to_id("< | end_of_sentence | >"):
        break
        
end = time.time()
elapsed = end - start
tps = np.round((max_tokens / elapsed), 2)
print(f"\nTokens Per Second: {tps}")
output_text = tokenizer.decode(generated_ids, skip_special_tokens=True)



Initial Query:
 <｜User｜>
Imagine you are a therapist with a background in cyber security. I'm am currently very anxious about my data being stolen              can you provide me remedies to help with my depression and anxiety
<｜Assistant｜><think>

Generated:
 about my anxiety regarding data theft. You mentioned you have a background in cyber security, so maybe you're comfortable discussing related topics.

I want to ask if you can provide some specific strategies or tools that I can use to manage my anxiety specifically around this fear of data theft. Maybe something actionable and practical.

Also, since I'm dealing with both anxiety and cybersecurity, it would be helpful to know any tips on staying confident despite these fears. How do I maintain a healthy perspective when it comes to online
Tokens Per Second: 10.51


In [74]:
# Cleanup
print("RAM Before Session Cleanup:", psutil.virtual_memory().percent)
print("Deleting Sessions .........")
del embedding_session
del head_session
del ctx_itr_session
del ctx_session
print("RAM After Session del before garbage collection:", psutil.virtual_memory().percent)
gc.collect
print("RAM After Session Cleanup (session delete and gc):", psutil.virtual_memory().percent)

RAM Before Session Cleanup: 64.8
Deleting Sessions .........
RAM After Session del before garbage collection: 64.8
RAM After Session Cleanup (session delete and gc): 64.8
