In [2]:
import os
import time

import numpy as np
import torch
import argparse

from datasets import load_dataset
from transformers import (AutoModelForCausalLM,
                          AutoTokenizer, 
                          Phi3ForCausalLM,
                          Phi3Config,
                          pipeline,
                          )
from transformers.pipelines.pt_utils import KeyDataset
from huggingface_hub import hf_hub_download
import torch_tensorrt



In [2]:
def _build_arg_parser():
    """Create the notebook's argument parser.

    Each option is declared once in a (flag, type, default) table and
    registered in that order.
    """
    option_table = (
        ("--model_path", str, "models/"),
        ("--model_name", str, "Phi-3-medium-4k-instruct"),
        ("--data_path", str, "data/"),
        ("--dataset", str, "test_dataset.jsonl"),
        ("--seed", int, 0),
        ("--batch_size", int, 1),
        ("--dtype", str, "auto"),
        ("--n", int, 1),
        ("--temperature", float, 0.0),
    )
    built = argparse.ArgumentParser(description="TensorRT")
    for flag, value_type, default_value in option_table:
        built.add_argument(flag, type=value_type, default=default_value)
    return built


parser = _build_arg_parser()

# An empty argv is parsed on purpose: inside a notebook there is no command
# line, so every option takes its declared default.
config = parser.parse_args([])

# Use the GPU when one is visible to torch, otherwise fall back to the CPU.
if torch.cuda.is_available():
    DEVICE = torch.device("cuda")
else:
    DEVICE = torch.device("cpu")

In [4]:
# Shell escape: run the `trtexec` CLI on the fp16 ONNX export and ask it to
# save the resulting engine as phi3.trt.
# The recorded output of this cell is "trtexec: command not found", i.e. the
# binary was not on PATH in the environment where the notebook was last run.
!trtexec --onnx=./cuda-fp16/phi3-medium-4k-instruct-cuda-fp16.onnx --saveEngine=phi3.trt

/bin/bash: line 1: trtexec: command not found


In [7]:
import onnxruntime as ort
from optimum.onnxruntime import ORTModelForCausalLM

# Path of the fp16 ONNX export that is loaded below.
model_path = './cuda-fp16/phi3-medium-4k-instruct-cuda-fp16.onnx'

# Session configuration: 15 intra-op threads and the ORT_ENABLE_ALL graph
# optimization level.
sess_options = ort.SessionOptions()
sess_options.intra_op_num_threads = 15
sess_options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL

# Load the ONNX file with the session options configured above.
onnx_model = ORTModelForCausalLM.load_model(model_path, session_options=sess_options)

# The tokenizer is read from the directory that holds the ONNX export.
tokenizer = AutoTokenizer.from_pretrained('./cuda-fp16/')

  @torch.library.impl_abstract("xformers_flash::flash_fwd")

  @torch.library.impl_abstract("xformers_flash::flash_bwd")



In [3]:
# Checkpoint directory = <model_path>/<model_name> from the parsed config.
model_id = os.path.join(config.model_path, config.model_name)

# Keyword arguments forwarded to from_pretrained.
# device_map="cuda" stays disabled, exactly as in the original call.
_load_kwargs = dict(
    torch_dtype=torch.half,
    trust_remote_code=True,
    torchscript=True,
    attn_implementation="eager",
    use_cache=False,
)

# Load the checkpoint with gradient tracking disabled and put the module
# into eval mode.
with torch.no_grad():
    model = AutoModelForCausalLM.from_pretrained(model_id, **_load_kwargs).eval()

# Tokenizer that belongs to the same checkpoint directory.
tokenizer = AutoTokenizer.from_pretrained(model_id)

Loading checkpoint shards:   0%|          | 0/6 [00:00<?, ?it/s]

In [12]:
# Fixed three-turn conversation (user, assistant, user) used as example input.
_dummy_turns = (
    ("user", "Can you provide ways to eat combinations of bananas and dragonfruits?"),
    ("assistant", "Sure! Here are some ways to eat bananas and dragonfruits together: 1. Banana and dragonfruit smoothie: Blend bananas and dragonfruits together with some milk and honey. 2. Banana and dragonfruit salad: Mix sliced bananas and dragonfruits together with some lemon juice and honey."),
    ("user", "What about solving an 2x + 3 = 7 equation?"),
)
dummy_messages = [{"role": role, "content": content} for role, content in _dummy_turns]

# Render the conversation through the tokenizer's chat template, appending the
# generation prompt, and tokenize it straight to PyTorch tensors.
_template_kwargs = {
    "add_generation_prompt": True,
    "tokenize": True,
    "return_tensors": "pt",
    "padding": True,
}
input_ids = tokenizer.apply_chat_template(dummy_messages, **_template_kwargs)

# dummy_message = "Can you provide ways to eat combinations of bananas and dragonfruits?"
# model_inputs = tokenizer(dummy_message, return_tensors="pt", padding=True)
# input_ids = model_inputs.input_ids

In [None]:
# Export the PyTorch checkpoint to ONNX with the TorchDynamo-based exporter.
# dynamo_export takes a torch model (or callable) followed by example inputs.
# The previous call passed `onnx_model`, which is what the ONNX Runtime loader
# returned for an already-exported .onnx file, and supplied no example inputs,
# so there was nothing for the exporter to trace.
# NOTE(review): assumes re-exporting `model` with the chat-template
# `input_ids` is the intent of this cell -- confirm.
onnx_program = torch.onnx.dynamo_export(model, input_ids)
onnx_program

In [13]:
# Trace the eager model with TorchScript, using the chat-template `input_ids`
# as the example input. The result is not bound to a name, so the traced
# module is only available as this cell's displayed output.
# NOTE(review): the `if ...` lines in the recorded output look like tracer
# warnings about Python-level branches in the model code -- confirm.
torch.jit.trace(model, input_ids)


  if input_shape[-1] > 1 or self.sliding_window is not None:

  if past_key_values_length > 0:

  if attn_weights.size() != (bsz, self.num_heads, q_len, kv_seq_len):

  if attention_mask.size() != (bsz, 1, q_len, kv_seq_len):

  if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim):



: 

: 

In [None]:
# torch_tensorrt.dynamo.compile takes a torch.export ExportedProgram as its
# first argument. The previous call passed `onnx_model` (the object returned
# by the ONNX Runtime loader), which is not an exported torch program, so the
# PyTorch `model` is exported first and the result handed to the compiler.
# NOTE(review): neither `model` nor `input_ids` is moved to DEVICE in the
# cells that create them -- confirm device placement before running this.
exported_program = torch.export.export(model, (input_ids,))
trt_gm = torch_tensorrt.dynamo.compile(
    exported_program,
    inputs=[input_ids],
)

In [20]:
# Options handed to the "torch_tensorrt" backend of torch.compile.
_trt_backend_options = {
    "truncate_long_and_double": True,
    "enabled_precisions": {torch.half},
    "workspace_size": 20 << 30,  # 20 * 2**30
}

# Wrap the eager model with torch.compile using the Torch-TensorRT backend,
# with dynamic shapes switched off.
trt_model = torch.compile(
    model,
    backend="torch_tensorrt",
    options=_trt_backend_options,
    dynamic=False,
)

# Generation settings, left disabled as in the original cell:
# generation_args = {
#     "max_new_tokens": 200,
#     "temperature": 0.0,
#     "do_sample": False,
# }

# Instruction text and the prompt string built from it.
system_message = "You are a helpful AI Assistant. Help users by replying to their queries and make sure the responses are polite. Do not hallucinate."
PROMPT = "<|assistant|>\n{}<|end|>".format(system_message)

AssertionError: 

In [29]:
# Try to serialize `trt_model` to the file 'trt.ts' with torch.jit.save.
# The recorded output of this cell is
#   AttributeError: 'Phi3ForCausalLM' object has no attribute 'save'
# NOTE(review): in this notebook `trt_model` is produced by torch.compile;
# presumably torch.jit.save needs a scripted/traced module instead -- confirm
# and switch to a save path that supports the compiled object.
torch.jit.save(trt_model, 'trt.ts')

AttributeError: 'Phi3ForCausalLM' object has no attribute 'save'

In [42]:
# Single plain-text prompt used as the tracing example.
dummy_message = "Can you provide ways to eat combinations of bananas and dragonfruits?"

# Tokenize the prompt to PyTorch tensors, then move the encoding to DEVICE.
# Note that `input_ids` holds the whole tokenizer output here, not just ids.
input_ids = tokenizer(dummy_message, return_tensors="pt")
input_ids = input_ids.to(DEVICE)

# Trace the model with (input_ids, attention_mask) as positional example inputs.
_trace_example = (input_ids["input_ids"], input_ids["attention_mask"])
traced_model = torch.jit.trace(model, _trace_example)


  if (input_shape[-1] > 1 or self.sliding_window is not None) and self.is_causal:

  if past_key_values_length > 0:



RuntimeError: Attempting to use FunctionalTensor on its own. Instead, please use it with a corresponding FunctionalTensorMode()

In [None]:
# Timed evaluation run: load the dataset, render the prompts, generate with
# `trt_model`, decode, and keep the last line of every generated answer.
#
# Fixes relative to the previous version of this cell:
#   * `toekn_ids` was a typo for `token_ids` (NameError).
#   * `generation_args` only existed as commented-out code (NameError).
#   * `inputs` is a plain dict, so `inputs.input_ids` raised AttributeError.
#   * PROMPT was inserted into the raw `messages` list before templating and
#     then once more into the rendered prompts; only the second insert is kept.
#   * The attention mask is now passed to generate(), since the batch is padded.
#   * torch.cuda.synchronize() is only called when CUDA is available, matching
#     the CPU fallback used for DEVICE.

# Drain pending GPU work before starting the clock.
if torch.cuda.is_available():
    torch.cuda.synchronize()
start = time.perf_counter()

data = load_dataset("json", data_files="./data/test_dataset.jsonl")['train']
# assumes every entry of the 'message' column is a chat-format conversation
# (list of {"role", "content"} dicts) -- TODO confirm against the dataset
messages = list(data['message'])

# Greedy decoding. `temperature` is omitted because it has no effect when
# do_sample is False.
generation_args = {
    "max_new_tokens": 200,
    "do_sample": False,
}

# tokenize=False: despite its name, `token_ids` holds rendered prompt strings.
token_ids = tokenizer.apply_chat_template(messages,
                                        add_generation_prompt=True,
                                        tokenize=False,)

# PROMPT becomes batch element 0; its output is skipped below via [1:].
token_ids = [PROMPT, *token_ids]

inputs = tokenizer(token_ids, return_tensors="pt", padding=True)
inputs = {k: v.type(torch.int32).to(DEVICE) for k, v in inputs.items()}

with torch.inference_mode():
    outs = trt_model.generate(
        input_ids=inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        **generation_args
    )

generated_texts = tokenizer.batch_decode(outs, skip_special_tokens=True, clean_up_tokenization_spaces=False)
# Keep only the text after the last newline of each decoded sequence, wrapped
# as [{"generated_text": ...}] per sample; element 0 (the PROMPT row) is dropped.
results = [
    [{"generated_text": text.rpartition('\n')[2]}]
    for text in generated_texts[1:]
]

# Make sure all GPU work has finished before stopping the clock.
if torch.cuda.is_available():
    torch.cuda.synchronize()
end = time.perf_counter()

In [None]:
# Print every generated answer and count exact matches against the dataset's
# "answer" field.
print("===== Answers =====")
correct = 0
for idx, candidates in enumerate(results):
    expected = data[idx]["answer"]
    # Strip leading whitespace and remove newlines before comparing.
    predicted = candidates[0]["generated_text"].lstrip().replace("\n", "")
    correct += int(predicted == expected)
    print(predicted)

# Summary: elapsed seconds between `start` and `end`, and the match count.
print("===== Perf result =====")
print("Elapsed_time: ", end - start)
print("Correctness: {}/{}".format(correct, len(data)))