In [1]:
import time
import torch
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
from transformers.pipelines.pt_utils import KeyDataset

from vllm import LLM, SamplingParams

import re

In [2]:
# my local models
MODELZOO = {
    "phi3-14b" : "./models/Phi-3-medium-4k-instruct",
    "phi3-3.8b" : "./models/Phi-3-mini-4k-instruct",
    "bloom-560m": "./models/bloom-560m",
}

In [3]:
llm = LLM(
    model=MODELZOO["phi3-14b"],
    tensor_parallel_size=1,
    speculative_model=MODELZOO["phi3-3.8b"],
    num_speculative_tokens=7,
    use_v2_block_manager=True,
    max_model_len=100,  # Decrease this value to match the cache limit
)

INFO 09-10 11:25:04 llm_engine.py:213] Initializing an LLM engine (v0.6.0) with config: model='./models/Phi-3-medium-4k-instruct', speculative_config=SpeculativeConfig(draft_model='./models/Phi-3-mini-4k-instruct', num_spec_tokens=7), tokenizer='./models/Phi-3-medium-4k-instruct', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, rope_scaling=None, rope_theta=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=100, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, quantization_param_path=None, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='outlines'), observability_config=ObservabilityConfig(otlp_traces_endpoint=None, collect_model_forward_time=False, collect_model_execute_time=False), seed=0, served_model_name=./models/Phi-3-medium-4k-ins

  @torch.library.impl_abstract("xformers_flash::flash_fwd")
  @torch.library.impl_abstract("xformers_flash::flash_bwd")


INFO 09-10 11:25:06 selector.py:240] Cannot use FlashAttention-2 backend due to sliding window.
INFO 09-10 11:25:06 selector.py:116] Using XFormers backend.
INFO 09-10 11:25:06 spec_decode_worker.py:162] Configuring SpecDecodeWorker with proposer=<class 'vllm.spec_decode.multi_step_worker.MultiStepWorker'>
INFO 09-10 11:25:06 rejection_sampler.py:64] Use pytorch for rejection sampling.
INFO 09-10 11:25:06 spec_decode_worker.py:176] Configuring SpecDecodeWorker with sampler=<class 'vllm.model_executor.layers.rejection_sampler.RejectionSampler'>
INFO 09-10 11:25:06 model_runner.py:915] Starting to load model ./models/Phi-3-medium-4k-instruct...
INFO 09-10 11:25:07 selector.py:240] Cannot use FlashAttention-2 backend due to sliding window.
INFO 09-10 11:25:07 selector.py:116] Using XFormers backend.


Loading safetensors checkpoint shards:   0% Completed | 0/6 [00:00<?, ?it/s]


INFO 09-10 11:26:15 model_runner.py:926] Loading model weights took 26.0838 GB
INFO 09-10 11:26:15 model_runner.py:915] Starting to load model ./models/Phi-3-mini-4k-instruct...
INFO 09-10 11:26:15 selector.py:240] Cannot use FlashAttention-2 backend due to sliding window.
INFO 09-10 11:26:15 selector.py:116] Using XFormers backend.


Loading safetensors checkpoint shards:   0% Completed | 0/2 [00:00<?, ?it/s]


INFO 09-10 11:26:32 model_runner.py:926] Loading model weights took 7.1183 GB
INFO 09-10 11:26:35 gpu_executor.py:122] # GPU blocks: 182, # CPU blocks: 1310
INFO 09-10 11:26:38 model_runner.py:1217] Capturing the model for CUDA graphs. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI.
INFO 09-10 11:26:38 model_runner.py:1221] CUDA graphs can take additional 1~3 GiB memory per GPU. If you are running out of memory, consider decreasing `gpu_memory_utilization` or enforcing eager mode. You can also reduce the `max_num_seqs` as needed to decrease memory usage.
INFO 09-10 11:26:54 model_runner.py:1335] Graph capturing finished in 16 secs.
INFO 09-10 11:26:59 model_runner.py:1217] Capturing the model for CUDA graphs. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI.
INF

In [4]:
def data_preprocessing(data):
    new_set = []
    for i in range(len(data)):
        temp = data[i]['message'][0]['content']
        new_set.append(temp)
    return new_set

In [7]:
####### Section 3. Load data and Inference -> Performance evaluation part #######
start = time.perf_counter()
data = load_dataset("json", data_files="./data/test_dataset.jsonl")['train']
data_list = data_preprocessing(data)
outs = llm.generate(data_list, sampling_params)
end = time.perf_counter()

Processed prompts:   0%|          | 0/30 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]



Processed prompts: 100%|██████████| 30/30 [00:04<00:00,  7.15it/s, est. speed input: 595.19 toks/s, output: 87.44 toks/s]


In [8]:
sampling_params = SamplingParams(temperature=0.0, top_p=0.95)

outputs = llm.generate(data_list, sampling_params)

for output in outputs:
    prompt = output.prompt
    generated_text = output.outputs[0].text
    print(f"Generated text: {generated_text!r}")

Processed prompts:   0%|          | 0/30 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]



Processed prompts: 100%|██████████| 30/30 [00:02<00:00, 13.81it/s, est. speed input: 1150.22 toks/s, output: 168.99 toks/s]

Generated text: 'Answer: Deep sea animals\n\nQuestion: \n\n'
Generated text: 'answer: Gas can fill any container it is given, and liquid is the opposite'
Generated text: '\nanswer: they are genetically called to\n\n'
Generated text: 'answer: south\n\nanswer: south'
Generated text: '\nanswer: An aircraft taking a trip\n\nexplanation: Kinetic'
Generated text: 'answer: protozoa\n\n\nquestion: What is the name of the'
Generated text: 'answer: Green house\n\n\nquestion: What is the name of the process'
Generated text: ''
Generated text: '\nanswer: It holds 500 mL of water.\n\n'
Generated text: 'answer: the air becomes arid\n\nexplanation=The air becomes'
Generated text: 'answer: July\n\nexplanation: July'
Generated text: '\nanswer: speaking with a witness\n\n- response: speaking with a witness'
Generated text: 'answer: shell\n\nexplanation=shell'
Generated text: 'Answer: the final barrel is'
Generated text: 'answer: particles of iron\n\n\nquestion: The first step in the process'
Generated t




In [9]:
####### Section 4. Accuracy (Just for leasderboard) #######
print("===== Answers =====")
correct = 0
for i, out in enumerate(outs):
    correct_answer = data[i]["answer"]
    # 생성된 답변에서 불필요한 텍스트 제거
    answer = out.outputs[0].text
    cleaned_answer = re.sub(r"A:\n\n\n### response ###\n\n|\n### response ###\n\n|A: |\nB:", "", answer).lstrip().replace("\n","")
    cleaned_answer = cleaned_answer.replace("answer: ","")
    
    # 정답과 출력된 답변을 비교
    print(f"Correct Answer: {correct_answer}")
    print(f"Generated Answer: {cleaned_answer}")
    if answer == cleaned_answer:
        correct += 1
        print(answer,"correct!!")
 
print("===== Perf result =====")
print("Elapsed_time: ", end-start)
print(f"Correctness: {correct}/{len(data)}")

===== Answers =====
Correct Answer: Deep sea animals
Generated Answer: Answer: Deep sea animalsQuestion: 
Correct Answer: uses what it needs
Generated Answer: Gas can fill any container it is given, and liquid is the opposite
Correct Answer: they are genetically called to
Generated Answer: they are genetically called to
Correct Answer: south
Generated Answer: southsouth
Correct Answer: An aircraft taking a trip
Generated Answer: An aircraft taking a tripexplanation: Kinetic
Correct Answer: protozoa
Generated Answer: protozoaquestion: What is the name of the
Correct Answer: Green house
Generated Answer: Green housequestion: What is the name of the process
Correct Answer: it unfreezes, because it is cold-blooded
Generated Answer: 
 correct!!
Correct Answer: It holds 500 mL of water
Generated Answer: It holds 500 mL of water.
Correct Answer: fluid spreads from pores
Generated Answer: the air becomes aridexplanation=The air becomes
Correct Answer: July
Generated Answer: Julyexplanation: Ju