In [1]:
import os
import time

import numpy as np
import torch
import argparse

from datasets import load_dataset
from transformers import (AutoModelForCausalLM,
                          AutoTokenizer, 
                          Phi3ForCausalLM,
                          Phi3Config,
                          pipeline,
                          )
from transformers.pipelines.pt_utils import KeyDataset
from huggingface_hub import hf_hub_download
import torch_tensorrt



In [2]:
parser = argparse.ArgumentParser(description="TensorRT")
parser.add_argument("--model_path", type=str, default="models/")
parser.add_argument("--model_name", type=str, default="Phi-3-medium-4k-instruct")
parser.add_argument("--data_path", type=str, default="data/")
parser.add_argument("--dataset", type=str, default="test_dataset.jsonl")
parser.add_argument('--seed',type=int, default=0)
parser.add_argument("--batch_size", type=int, default=1)
parser.add_argument('--dtype',type=str, default="auto")
parser.add_argument('--n', type=int, default=1)
parser.add_argument('--temperature', type=float, default=0.0)

config = parser.parse_args([])

DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [3]:
model_id = os.path.join(config.model_path, config.model_name)

model = Phi3ForCausalLM.from_pretrained(
    model_id,
    device_map="auto",
    torch_dtype="auto",
    trust_remote_code=True,
    torchscript=True,
    attn_implementation="eager"
).to(DEVICE)

model.eval()


tokenizer = AutoTokenizer.from_pretrained(model_id)

The argument `trust_remote_code` is to be used with Auto classes. It has no effect here and is ignored.
INFO:accelerate.utils.modeling:We will use 90% of the memory on device 0 for storing the model, and 10% for the buffer to avoid OOM. You can set `max_memory` in to a higher value to use more memory (at your own risk).


Loading checkpoint shards:   0%|          | 0/6 [00:00<?, ?it/s]

In [None]:
dummy_text = "Can you provide ways to eat combinations of bananas and dragonfruits?"
inputs = tokenizer(dummy_text, return_tensors="pt")

traced_model = torch.jit.trace(model, (inputs['input_ids'], inputs['attention_mask']))

output = traced_model(inputs['input_ids'], inputs['attention_mask'], strict=False)
print(output)

# preds = tokenizer.batch_decode(summaries, skip_special_tokens=True, clean_up_tokenization_spaces=False)

In [None]:
enabled_precisions = {torch.half} # Run with FP16
debug = True
# workspace_size = 20 << 30
min_block_size = 7  # Lower value allows more graph segmentation
torch_executed_ops = {}

compilation_args = {
    "enabled_precisions": enabled_precisions,
    "debug": debug,
    # "workspace_size": workspace_size,
    "min_block_size": min_block_size,
    "torch_executed_ops": torch_executed_ops,
}


trt_model = torch_tensorrt.dynamo.compile(
                        model, 
                        # mode="max-autotune", 
                        backend="torch_tensorrt",
                        dynamic=False,
                        fullgraph=True,
                        options=compilation_args)

In [10]:
trt_model = torch.compile(model, 
                        backend="torch_tensorrt",
                        options={
                            "truncate_long_and_double": True,
                            "enabled_precisions": {torch.float16}
                        },
                        dynamic=False)

pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    batch_size=config.batch_size,
)

generation_args = {
    "max_new_tokens": 200,
    "temperature": 0.0,
    "do_sample": False,
}

system_message = "You are a helpful AI Assistant. Help users by replying to their queries and make sure the responses are polite. Do not hallucinate."
PROMPT = f"<|system|>\n{system_message}<|end|>"

In [13]:
torch.cuda.synchronize()
start = time.perf_counter()

data = load_dataset("json", data_files="./data/test_dataset.jsonl")['train']
messages = list(KeyDataset(data,'message'))

token_ids = tokenizer.apply_chat_template(messages, 
                                        add_generation_prompt=True, 
                                        tokenize=False,)
                                        
token_ids.insert(0, PROMPT)

inputs = tokenizer(token_ids, return_tensors="pt", padding=True)
inputs = {k: v.type(torch.int32).to(DEVICE) for k, v in inputs.items()}

with torch.no_grad():
    outs = trt_model.generate(
        input_ids=inputs["input_ids"],
        attention_mask=inputs.get("attention_mask"),
        **generation_args
    )

generated_texts = tokenizer.batch_decode(outs, skip_special_tokens=True, clean_up_tokenization_spaces=False)
processed_outs = [
    [{"generated_text": text.rpartition('\n')[2]}]
    for text in generated_texts[1:]
]

torch.cuda.synchronize()
end = time.perf_counter()




In [14]:
print("===== Answers =====")
correct = 0
for i, out in enumerate(processed_outs):
    correct_answer = data[i]["answer"]
    answer = out[0]["generated_text"].lstrip().replace("\n","")
    if answer == correct_answer:
        correct += 1
    print(answer)
 
print("===== Perf result =====")
print("Elapsed_time: ", end-start)
print(f"Correctness: {correct}/{len(data)}")

===== Answers =====
Deep sea animals
is standard weight and size
they are genetically called to
south
An aircraft taking a trip
protozoa
Green house
it unfreezes, because it is cold-blooded
It holds 500 mL of water
the air becomes arid
July
speaking with a witness
shell

particles of iron
H2O haze
constellations to appear in one place in spring and another in fall
glucose
help prevent the effects of erosion
wind
salvage plastic bottles instead of throwing them away
less energy used by the water heater
people driving cars might be unaware the animal is close by
light from our closest star
the darkness is greatest
Water
clothing
a cut peony
an alligator's habitat
has seeds outside the flesh, unlike the blueberry
===== Perf result =====
Elapsed_time:  2.990051779896021
Correctness: 23/30
