In [1]:
import time
import torch
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
from transformers.pipelines.pt_utils import KeyDataset

# from vllm import LLM, SamplingParams
import torch_tensorrt

import re

In [2]:
# my local models
MODELZOO = {
    "phi3-14b" : "./models/Phi-3-medium-4k-instruct",
    "phi3-3.8b" : "./models/Phi-3-mini-4k-instruct",
    "bloom-560m": "./models/bloom-560m",
}

In [3]:
# llm = LLM(
#     model=MODELZOO["phi3-14b"],
#     tensor_parallel_size=1,
#     speculative_model=MODELZOO["phi3-3.8b"],
#     num_speculative_tokens=3,
#     use_v2_block_manager=True,
#     max_model_len=100,  # Decrease this value to match the cache limit
# )
llm = LLM(
    model=MODELZOO["phi3-14b"]
    # tensor_parallel_size=1,
    # speculative_model=MODELZOO["phi3-3.8b"],
    # num_speculative_tokens=3,
    # use_v2_block_manager=True,
    # max_model_len=100,  # Decrease this value to match the cache limit
)

INFO 09-16 08:20:26 llm_engine.py:213] Initializing an LLM engine (v0.6.0) with config: model='./models/Phi-3-medium-4k-instruct', speculative_config=None, tokenizer='./models/Phi-3-medium-4k-instruct', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, rope_scaling=None, rope_theta=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=4096, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, quantization_param_path=None, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='outlines'), observability_config=ObservabilityConfig(otlp_traces_endpoint=None, collect_model_forward_time=False, collect_model_execute_time=False), seed=0, served_model_name=./models/Phi-3-medium-4k-instruct, use_v2_block_manager=False, num_scheduler_steps=1, enable_prefix_cachin

INFO 09-16 08:20:27 selector.py:240] Cannot use FlashAttention-2 backend due to sliding window.
INFO 09-16 08:20:27 selector.py:116] Using XFormers backend.


  @torch.library.impl_abstract("xformers_flash::flash_fwd")
  @torch.library.impl_abstract("xformers_flash::flash_bwd")


INFO 09-16 08:20:28 model_runner.py:915] Starting to load model ./models/Phi-3-medium-4k-instruct...
INFO 09-16 08:20:28 selector.py:240] Cannot use FlashAttention-2 backend due to sliding window.
INFO 09-16 08:20:28 selector.py:116] Using XFormers backend.


Loading safetensors checkpoint shards:   0% Completed | 0/6 [00:00<?, ?it/s]


INFO 09-16 08:20:51 model_runner.py:926] Loading model weights took 26.0838 GB
INFO 09-16 08:20:53 gpu_executor.py:122] # GPU blocks: 2800, # CPU blocks: 1310
INFO 09-16 08:20:55 model_runner.py:1217] Capturing the model for CUDA graphs. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI.
INFO 09-16 08:20:55 model_runner.py:1221] CUDA graphs can take additional 1~3 GiB memory per GPU. If you are running out of memory, consider decreasing `gpu_memory_utilization` or enforcing eager mode. You can also reduce the `max_num_seqs` as needed to decrease memory usage.
INFO 09-16 08:21:09 model_runner.py:1335] Graph capturing finished in 14 secs.


In [9]:
sampling_params = SamplingParams(temperature=0.7, top_p=0.95)
tokenizer = AutoTokenizer.from_pretrained(MODELZOO["phi3-14b"])

In [10]:
data = load_dataset("json", data_files="./data/test_dataset.jsonl")['train']
tokenized_chat = tokenizer.apply_chat_template(data, tokenize=False, add_generation_prompt=True)
print(tokenized_chat)




In [11]:
def apply_chat_template_with_system(messages, system_message="<|system|> This is a system message.\n<|end|>\n"):
    token_ids = []
    for message in messages:
        chat_message = (
            system_message  # 시스템 메시지 추가
            + "<|user|>\n" + message  # 유저 메시지 추가
            + "<|end|>\n<|assistant|>\n"  # 엔딩 토큰 및 어시스턴트 토큰 추가
        )
        token_ids.append(chat_message)
    return token_ids

In [28]:
####### Section 3. Load data and Inference -> Performance evaluation part #######
start = time.perf_counter()
data = load_dataset("json", data_files="./data/test_dataset.jsonl")['train']
messages = KeyDataset(data,'message')

sys = "You are a helpful AI Assistant. Help users by replying to their queries and make sure the responses are polite. Do not hallucinate."

messages = list(messages)
token_ids = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
token_ids.insert(0, "<|system|>" + sys + "<|end|>")
print(token_ids)

outs = llm.generate(token_ids, sampling_params)

end = time.perf_counter()

['<|system|>You are a helpful AI Assistant. Help users by replying to their queries and make sure the responses are polite. Do not hallucinate.<|end|>', "<|user|>\n\nRead the question and answer the following sentence in given multiple choice.\nAnswer only the sentence you chose. Never include a question and other word in your answer.\n\nquestion: Frilled sharks and angler fish live far beneath the surface of the ocean, which is why they are known as\nchoices: ['Deep sea animals', 'fish', 'Long Sea Fish', 'Far Sea Animals']\n<|end|>\n<|assistant|>\n", "<|user|>\n\nRead the question and answer the following sentence in given multiple choice.\nAnswer only the sentence you chose. Never include a question and other word in your answer.\n\nquestion: Gas can fill any container it is given, and liquid\nchoices: ['is standard weight and size', 'is the opposite of variable', 'only needs a few', 'uses what it needs']\n<|end|>\n<|assistant|>\n", "<|user|>\n\nRead the question and answer the follo

Processed prompts: 100%|██████████| 31/31 [00:01<00:00, 22.59it/s, est. speed input: 1867.51 toks/s, output: 142.87 toks/s]


In [29]:
####### Section 4. Accuracy (Just for leasderboard) #######
print("===== Answers =====")
correct = 0
for i, out in enumerate(outs[1:]):
    correct_answer = data[i]["answer"]
    # 생성된 답변에서 불필요한 텍스트 제거
    # answer = out.outputs[0].text
    # cleaned_answer = re.sub(r"A:\n\n\n### response ###\n\n|\n### response ###\n\n|A: |\nB:", "", answer).lstrip().replace("\n","")
    # cleaned_answer = cleaned_answer.replace("answer: ','")
    answer = out.outputs[0].text.lstrip().replace("\n","")

    # 정답과 출력된 답변을 비교
    if answer != correct_answer:
        print(f"Correct Answer: {correct_answer}")
        print(f"Generated Answer: {answer}")
    if answer == correct_answer:
        correct += 1
 
print("===== Perf result =====")
print("Elapsed_time: ", end-start)
print(f"Correctness: {correct}/{len(data)}")

===== Answers =====
Correct Answer: uses what it needs
Generated Answer: is standard weight and size
Correct Answer: It holds 500 mL of water
Generated Answer: It is a sphere.
Correct Answer: fluid spreads from pores
Generated Answer: the air becomes arid
Correct Answer: lead to less impacted soil
Generated Answer: help prevent the effects of erosion
Correct Answer: storms
Generated Answer: wind
Correct Answer: Kool-Aid
Generated Answer: Water
Correct Answer: rosebuds
Generated Answer: clothing
===== Perf result =====
Elapsed_time:  2.1028496958315372
Correctness: 23/30
