# QNN Model Execution on Windows Snapdragon

This guide describes how to execute the Llama model on Snapdragon X Elite using the Qualcomm AI Engine Direct SDK.

This document uses the terms Qualcomm Neural Network (QNN) and Qualcomm AI Engine Direct SDK interchangeably.

# Prerequisites
1. Qualcomm AI Engine Direct SDK (with Windows arm64 support)
2. QNN context binary generated by qnn_model_prepare.ipynb
3. Snapdragon X Elite Windows device
4. A Jupyter environment as mentioned in qnn_model_prepare.ipynb and qnn_model_execution.ipynb
5. Model preparation completed as specified in qnn_model_prepare.ipynb

# Platform requirements
- Platform: ARM64
- Python 3.10

# Workflow
1. Configure a Qualcomm Windows device (preferably with a Snapdragon X Elite)
2. Copy the prepared model files (generated from the qnn_model_prepare_on_linux.ipynb notebook) and libraries (from QNN SDK) to the Windows device
3. Run the models in a Llama pipeline. Given a user prompt, execute the models as a Llama pipeline on QNN HTP on the Windows platform to produce a human-like response

# Setup

Create and activate the Python 3.10 virtual environment.

### Install the required python packages

In [None]:
from IPython import get_ipython

import json
import os
import sys
import shutil
import subprocess

## Configure a Qualcomm Snapdragon Windows device

In [None]:
# Directory used as the genie-t2t-run working folder; created on first use
target_genai_t2t_run_path = "target_genai_t2t_run"
os.makedirs(name=target_genai_t2t_run_path, exist_ok=True)

In [None]:
# Step 1: set QNN_SDK to the root folder of the QNN SDK installation.
# For example:
#   QNN_SDK = os.path.join("C:\\Qualcomm\\AIStack\\QAIRT\\2.28.0.241029")
QNN_SDK = os.path.join("<QNN SDK Path>")

# Optional sanity check; enable it once the placeholder above has been replaced:
# assert os.path.exists(QNN_SDK), "Please enter the correct location of QNN SDK Root"

## Set up NSP target

In [None]:
# Extend the module search path so that the nsptargets helper can be imported
sys.path.extend([
    '../../../',
    '../../../common/',
    '../../../common/utilities',
])

from nsptargets import NspTargets

# NSP target used for the rest of this notebook;
# common/utilities/nsptargets.py lists the other supported targets
nsp_target = NspTargets.Windows.GEN2

## Workflow for genie-t2t-run

1. Push the genie-t2t-run dependency files, which include the following:
    - genie-t2t-run.exe
    - QnnHtp.dll, QnnHtp.lib, QnnHtpNetRunExtensions.dll, QnnSystem.dll
    - QnnHtpV73Stub.dll, libQnnHtpV73Skel.so, libQnnHtpV73.so, libqnnhtpv73.cat (v73 refers to the Windows Gen2 NSP target)
    - Genie.dll
    - tokenizer_llama32.json
2. Generate htp_model_config_data and htp_backend_config_data configs
3. Run script on device that executes genie-t2t-run

In [None]:
# Tokenizer file name: copied from the current directory into the run directory
# and written into the model config as the tokenizer path.
tokenizer_file = "tokenizer_llama32.json"
# HTP backend extension config file name: loaded and re-written into the config folder.
backend_config_file = "htp_backend_ext_config.json"
# Model config file name: loaded, updated, written into the config folder and
# passed to genie-t2t-run through the -c option.
model_config_file = "llama32-3b-htp-model-config.json"

# push needed files
def copy_t2t_dependency_files():
    """Copy every file genie-t2t-run needs into ``target_genai_t2t_run_path``.

    Each file keeps its original name. The copy order is: the QNN HTP
    libraries from the SDK's aarch64-windows-msvc lib folder, the Skel .so,
    the .so and the .cat files from the SDK's hexagon "unsigned" folder,
    Genie.dll, the tokenizer from the current directory and finally
    genie-t2t-run.exe from the SDK's aarch64-windows-msvc bin folder.
    """
    def stage(src_dir, filename):
        # Copy one file from src_dir into the run directory under the same name
        shutil.copyfile(
            os.path.join(src_dir, filename),
            os.path.join(target_genai_t2t_run_path, filename),
        )

    # QNN SDK libraries; the Stub library name depends on the selected NSP target
    htp_libs = [
        "QnnHtp.dll",
        "QnnHtp.lib",
        "QnnHtpNetRunExtensions.dll",
        "QnnSystem.dll",
        nsp_target.qnn_htp_lib_name + "Stub.dll",
    ]
    windows_lib_dir = os.path.join(QNN_SDK, "lib", "aarch64-windows-msvc")
    for lib_name in htp_libs:
        stage(windows_lib_dir, lib_name)

    # Hexagon-side files for the selected NSP architecture
    hexagon_dir = os.path.join(QNN_SDK, "lib", f"hexagon-{nsp_target.dsp_arch}", "unsigned")
    stage(hexagon_dir, f"lib{nsp_target.qnn_htp_lib_name}Skel.so")
    stage(hexagon_dir, f"lib{nsp_target.qnn_htp_lib_name}.so")
    stage(hexagon_dir, f"lib{nsp_target.qnn_htp_lib_name.lower()}.cat")

    # QNN Genie library
    stage(windows_lib_dir, "Genie.dll")

    # Tokenizer, taken from the current directory
    stage("./", tokenizer_file)

    # The genie-t2t-run executable itself
    stage(os.path.join(QNN_SDK, "bin", "aarch64-windows-msvc"), "genie-t2t-run.exe")

# Inference script
def create_t2t_runtime_script(context_binary_paths, config_folder):
    """Write the backend and model config files used by genie-t2t-run.

    Both configs are loaded from ``backend_config_file`` and
    ``model_config_file`` and saved under the same file names inside
    ``config_folder`` with 4-space indentation.

    Args:
        context_binary_paths: Value stored as the model's "ctx-bins" entry.
        config_folder: Folder that receives the two generated config files.
    """
    # Backend config: written back unchanged. Adjust backend_cfg here if needed.
    with open(backend_config_file, 'r') as backend_src:
        backend_cfg = json.load(backend_src)
    with open(os.path.join(config_folder, backend_config_file), 'w') as backend_dst:
        backend_dst.write(json.dumps(backend_cfg, indent=4))

    # Model config: point it at the tokenizer and the context binaries.
    with open(model_config_file, 'r') as model_src:
        model_cfg = json.load(model_src)
    dialog = model_cfg["dialog"]
    dialog["tokenizer"]["path"] = tokenizer_file
    dialog["engine"]["model"]["binary"]["ctx-bins"] = context_binary_paths
    # The context size is set to 4096 here; change it to match the context
    # length that was used when the model was prepared.
    dialog["context"]["size"] = 4096
    with open(os.path.join(config_folder, model_config_file), 'w') as model_dst:
        model_dst.write(json.dumps(model_cfg, indent=4))

def run_t2t_on_target(prompt, target_genai_t2t_run_path):
    """Run genie-t2t-run.exe through PowerShell and print what it produced.

    Args:
        prompt: Prompt text handed to the ``-p`` option, wrapped in double quotes.
        target_genai_t2t_run_path: Working directory of the launched process;
            the executable and the model config file are resolved relative to it.
    """
    command = [
        # The backslash is escaped explicitly: "\g" is an invalid escape sequence
        "powershell.exe", ".\\genie-t2t-run.exe",
        # "-b", "QnnHtp.dll", # uncomment this line if the QNN version is <= 2.25
        "-c", model_config_file,
        "-p", f"\"{prompt}\"",
    ]
    result = subprocess.run(command, capture_output=True, cwd=target_genai_t2t_run_path)

    # errors="replace" keeps the report printable even when the tool emits
    # bytes that are not valid UTF-8
    print("Output:", result.stdout.decode(errors="replace"))
    print("Error:", result.stderr.decode(errors="replace"))

### Run llama with genie-t2t-run

Note: Copy the context binaries to "target_genai_t2t_run_path".

In [None]:
# 2. Replace the below names with the absolute paths for all the 3 serialized binaries
WEIGHT_SHARE_CONTEXT_BINARY_PATH = [
    "<ctx_bin_1_of_3>.bin",
    "<ctx_bin_2_of_3>.bin",
    "<ctx_bin_3_of_3>.bin"
]

prompt_template = (
    "<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\n"
    "{system_prompt}<|eot_id|><|start_header_id|>user<|end_header_id|>\n\n"
    "{instruction}<|eot_id|><|start_header_id|>assistant<|end_header_id|>"
)

# 3. Update the system and user prompts to be used for the execution.
sys_prompt = "You are an helpful assistant."
user_prompt = "Plan a three day trip to San Diego"
prompt = prompt_template.format(system_prompt=sys_prompt, instruction=user_prompt)

# Check every context binary up front so that all wrong paths are reported in one run
missing_context_binaries = [
    binary_path
    for binary_path in WEIGHT_SHARE_CONTEXT_BINARY_PATH
    if not os.path.exists(binary_path)
]
for missing_path in missing_context_binaries:
    print(f"Context binary: ({os.path.basename(missing_path)}) does not exist at this path {missing_path}.")
isFailed = bool(missing_context_binaries)

# Create the configs, copy the dependencies and run the model
# only when every context binary was found
if not isFailed:
    create_t2t_runtime_script(WEIGHT_SHARE_CONTEXT_BINARY_PATH, target_genai_t2t_run_path)
    copy_t2t_dependency_files()
    run_t2t_on_target(prompt, target_genai_t2t_run_path)

Copyright (c) 2024 Qualcomm Technologies, Inc. and/or its subsidiaries.