In [1]:
import os
import time
import random
import argparse

import numpy as np
import torch


from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers.pipelines.pt_utils import KeyDataset
import torch_tensorrt

# Prefer the GPU whenever CUDA is usable; otherwise fall back to the CPU.
if torch.cuda.is_available():
    DEVICE = torch.device("cuda")
else:
    DEVICE = torch.device("cpu")

  scaled_e4m3_abstract = torch.library.impl_abstract("trt::quantize_fp8")(
  @torch.library.impl_abstract("xformers_flash::flash_fwd")


In [2]:
import onnxruntime_genai as og

# Directory of the exported ONNX Runtime GenAI model. The same path is reused
# elsewhere in the notebook to load the Hugging Face tokenizer.
model_id = './cuda-fp16/'

# Load the model, then build its tokenizer and a stream object that is used
# later to decode generated tokens one at a time.
model = og.Model(model_id)
tokenizer = og.Tokenizer(model)
tokenizer_stream = tokenizer.create_stream()

# Search settings, attached to a parameter object bound to the model.
search_options = dict(max_length=1046, temperature=0.0)
params = og.GeneratorParams(model)
params.set_search_options(**search_options)

In [45]:
from llmlingua import PromptCompressor

# LLMLingua-2 prompt compressor; device placement is delegated to
# device_map="auto".
llm_lingua = PromptCompressor(
    "microsoft/llmlingua-2-xlm-roberta-large-meetingbank",
    use_llmlingua2=True,
    device_map="auto",
)

# Hugging Face tokenizer loaded from the same directory as the ONNX model.
hf_tokenizer = AutoTokenizer.from_pretrained(model_id)

# Read the JSON-lines test set, select its 'train' split and pull out the
# 'message' column.
data = load_dataset(
    "json",
    data_files="./data/test_dataset.jsonl",
)['train']
messages = data['message']

# system_message = "You are a helpful AI Assistant. Help users by replying to their queries and make sure the responses are polite. Do not hallucinate."
# PROMPT = f"<|system|>\n{system_message}<|end|>"                                        
# token_ids.insert(0, PROMPT)

INFO:accelerate.utils.modeling:We will use 90% of the memory on device 0 for storing the model, and 10% for the buffer to avoid OOM. You can set `max_memory` in to a higher value to use more memory (at your own risk).
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [69]:
import json
import re

def compress(content):
    """Compress the context part of a prompt while keeping its choices verbatim.

    The prompt is split at the *last* occurrence of ``'choices:'``. Everything
    before it is passed to ``llm_lingua.compress_prompt`` (rate 0.7); everything
    after it is re-attached unchanged.

    Args:
        content: Prompt text, normally of the form ``<context>choices:<options>``.

    Returns:
        ``"<compressed context>\\nchoices:<options>\\n"`` when the marker is
        present. When the marker is missing, the whole prompt is treated as
        context and only the compressed text is returned, so no ``choices:``
        section is invented.
    """
    context, marker, choices = content.rpartition('choices:')

    if not marker:
        # rpartition returns ('', '', content) when the marker is absent; the
        # original code then compressed an empty string and labelled the whole
        # prompt as "choices". Treat the whole prompt as context instead.
        context, choices = content, None

    if context.strip():
        comp_dict = llm_lingua.compress_prompt(
            context=context,
            rate=0.7,
        )
        comp = comp_dict['compressed_prompt']
    else:
        # Nothing to compress; skip the model call on empty/whitespace input.
        comp = context

    if choices is None:
        return comp
    return f"{comp}\nchoices:{choices}\n"
    
def compressed_jsonl(input_file_path, output_file_path):
    """Write a copy of a JSON-lines dataset with the first message compressed.

    Each non-blank input line is parsed as JSON, the ``content`` of
    ``record["message"][0]`` is replaced by ``compress(content)``, and the
    record is written back as one JSON object per line.

    Args:
        input_file_path: Path of the source ``.jsonl`` file.
        output_file_path: Path of the file to create. It is opened in ``'w'``
            mode, so it must not be the same file as ``input_file_path``.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError / IndexError: If a record has no ``message[0]["content"]``.
    """
    # Explicit UTF-8 keeps the result independent of the platform's default
    # locale encoding.
    with open(input_file_path, 'r', encoding='utf-8') as reader, \
            open(output_file_path, 'w', encoding='utf-8') as writer:
        for raw_line in reader:
            # Tolerate blank lines (e.g. a trailing newline at end of file),
            # which would otherwise make json.loads raise.
            if not raw_line.strip():
                continue

            record = json.loads(raw_line)
            first_message = record["message"][0]
            first_message["content"] = compress(first_message["content"])

            json.dump(record, writer)
            writer.write("\n")

# Produce the compressed copy of the test set alongside the original file.
compressed_jsonl(
    input_file_path='./data/test_dataset.jsonl',
    output_file_path='./data/compressed.jsonl',
)

In [72]:
# Wall-clock timer for the whole cell: besides generation it also covers
# tokenizer/dataset loading and chat-template rendering.
start = time.perf_counter()

hf_tokenizer = AutoTokenizer.from_pretrained(model_id)
data = load_dataset("json", data_files="./data/test_dataset.jsonl")['train']
messages = data['message']

# tokenize=False is requested, so despite its name `token_ids` holds rendered
# prompt text (the code below treats it as a list of prompts), not token ids.
token_ids = hf_tokenizer.apply_chat_template(messages,
                                        add_generation_prompt=True,
                                        tokenize=False,)

system_message = "You are a helpful AI Assistant. Help users by replying to their queries and make sure the responses are polite. Do not hallucinate."
PROMPT = f"<|system|>\n{system_message}<|end|>"
# NOTE(review): PROMPT is inserted at index 0 and then skipped by the `[1:]`
# slice below, so the system message never reaches the model. Confirm whether
# it was meant to be prepended to every prompt instead.
token_ids.insert(0, PROMPT)

outs = []
for token_id in token_ids[1:]:
    input_tokens = tokenizer.encode(token_id)

    # A new GeneratorParams starts without the notebook's search options, so
    # they have to be re-applied for every prompt; previously `search_options`
    # (max_length / temperature) was silently ignored here.
    params = og.GeneratorParams(model)
    params.set_search_options(**search_options)
    params.input_ids = input_tokens
    generator = og.Generator(model, params)

    # Use a fresh decoding stream per prompt so that any state the stream keeps
    # cannot carry over between independent generations.
    stream = tokenizer.create_stream()

    pieces = []
    while not generator.is_done():
        generator.compute_logits()
        generator.generate_next_token()

        new_token = generator.get_next_tokens()[0]
        pieces.append(stream.decode(new_token))

    text = ''.join(pieces)
    outs.append(
        [{'generated_text': text}]
        )

end = time.perf_counter()

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [73]:
# Compare every generated answer with the reference answer by exact string
# match (after dropping leading whitespace and all newlines) and print both.
print("===== Answers =====")
correct = 0
for idx, generation in enumerate(outs):
    expected = data[idx]["answer"]
    raw_text = generation[0]["generated_text"]
    predicted = raw_text.lstrip().replace("\n", "")

    print(f"Correct Answer: {expected}")
    print(f"Generated Answer: {predicted}")
    correct += int(predicted == expected)

# Summary: elapsed wall-clock time of the generation cell and the match count.
print("===== Perf result =====")
print("Elapsed_time: ", end - start)
print(f"Correctness: {correct}/{len(data)}")

===== Answers =====
Correct Answer: Deep sea animals
Generated Answer: Deep sea animals
Correct Answer: uses what it needs
Generated Answer: is standard weight andsize
Correct Answer: they are genetically called to
Generated Answer: they are genetically called to
Correct Answer: south
Generated Answer: south
Correct Answer: An aircraft taking a trip
Generated Answer: An aircraft taking a trip
Correct Answer: protozoa
Generated Answer: protozo
Correct Answer: Green house
Generated Answer: Green house
Correct Answer: it unfreezes, because it is cold-blooded
Generated Answer: it unfreezes, because it is cold-
Correct Answer: It holds 500 mL of water
Generated Answer: It holds mL of water
Correct Answer: fluid spreads from pores
Generated Answer: the air becomes arid
Correct Answer: July
Generated Answer: July
Correct Answer: speaking with a witness
Generated Answer: speaking with a
Correct Answer: shell
Generated Answer: shell
Correct Answer: the final barrel is gone, there supply is fini