In [2]:
import os
import time

import numpy as np
import torch
import argparse

from datasets import load_dataset
from transformers import (AutoModelForCausalLM,
                          AutoTokenizer, 
                          Phi3ForCausalLM,
                          Phi3Config,
                          pipeline,
                          )
from transformers.pipelines.pt_utils import KeyDataset

import utils

os.environ["VLLM_USE_MODELSCOPE"] = "True"
os.environ["TOKENIZERS_PARALLELISM"] = "False"
# os.environ["CUDA_MODULE_LOADING"] = "LAZY"

parser = argparse.ArgumentParser(description="vLLM + MInference")
parser.add_argument("--model_path", type=str, default="models/")
parser.add_argument("--model_name", type=str, default="Phi-3-medium-4k-instruct")
parser.add_argument("--data_path", type=str, default="data/")
parser.add_argument("--dataset", type=str, default="test_dataset.jsonl")
parser.add_argument('--seed',type=int, default=0)
parser.add_argument("--batch_size", type=int, default=1)
parser.add_argument('--dtype',type=str, default="auto")
parser.add_argument('--gpu_memory_utilization',type=float, default=0.9) # 0.3, 0.5
parser.add_argument('--enforce_eager',type=bool, default=False)

parser.add_argument('--n', type=int, default=1)
parser.add_argument('--temperature', type=float, default=0.0)
parser.add_argument('--top_p', type=float, default=1.0)
parser.add_argument('--top_k', type=float, default=-1)
parser.add_argument('--ignore_eos', type=bool, default=True)
parser.add_argument('--max_tokens', type=int, default=500)

config = parser.parse_args([])
utils.seed_everything(config.seed)
CURR_PATH = os.getcwd()

In [3]:
#######  Set up  #######
from vllm import LLM, SamplingParams
model_id = os.path.join(CURR_PATH, config.model_path, config.model_name) # please replace with local model path

model_args = {
    "trust_remote_code": True,
    "gpu_memory_utilization": config.gpu_memory_utilization,
    "seed": config.seed,
    "dtype": config.dtype,
    "enforce_eager":config.enforce_eager,
}

model = LLM(model=model_id, **model_args)
tokenizer = AutoTokenizer.from_pretrained(model_id)

sampling_args = {
    "temperature":config.temperature,
    "top_p":config.top_p,
    "seed":config.seed,
    "use_beam_search":False,
    "ignore_eos":config.ignore_eos,
    "max_tokens": config.max_tokens,
}

# Phi3ForCausalLM

INFO 09-11 05:58:17 llm_engine.py:213] Initializing an LLM engine (v0.6.0) with config: model='/home/elicer/LLMInference/models/Phi-3-medium-4k-instruct', speculative_config=None, tokenizer='/home/elicer/LLMInference/models/Phi-3-medium-4k-instruct', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, rope_scaling=None, rope_theta=None, tokenizer_revision=None, trust_remote_code=True, dtype=torch.bfloat16, max_seq_len=4096, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, quantization_param_path=None, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='outlines'), observability_config=ObservabilityConfig(otlp_traces_endpoint=None, collect_model_forward_time=False, collect_model_execute_time=False), seed=0, served_model_name=/home/elicer/LLMInference/models/Phi-3-medium-4k-instruct, 

  @torch.library.impl_abstract("xformers_flash::flash_fwd")
  @torch.library.impl_abstract("xformers_flash::flash_bwd")


INFO 09-11 05:58:30 model_runner.py:915] Starting to load model /home/elicer/LLMInference/models/Phi-3-medium-4k-instruct...
INFO 09-11 05:58:30 selector.py:240] Cannot use FlashAttention-2 backend due to sliding window.
INFO 09-11 05:58:30 selector.py:116] Using XFormers backend.


Loading safetensors checkpoint shards:   0% Completed | 0/6 [00:00<?, ?it/s]


INFO 09-11 05:59:58 model_runner.py:926] Loading model weights took 26.0838 GB
INFO 09-11 06:00:01 gpu_executor.py:122] # GPU blocks: 2800, # CPU blocks: 1310
INFO 09-11 06:00:04 model_runner.py:1217] Capturing the model for CUDA graphs. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI.
INFO 09-11 06:00:04 model_runner.py:1221] CUDA graphs can take additional 1~3 GiB memory per GPU. If you are running out of memory, consider decreasing `gpu_memory_utilization` or enforcing eager mode. You can also reduce the `max_num_seqs` as needed to decrease memory usage.
INFO 09-11 06:00:19 model_runner.py:1335] Graph capturing finished in 15 secs.


In [4]:
tokenizer

LlamaTokenizerFast(name_or_path='/home/elicer/LLMInference/models/Phi-3-medium-4k-instruct', vocab_size=32000, model_max_length=4096, is_fast=True, padding_side='left', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '<|endoftext|>', 'unk_token': '<unk>', 'pad_token': '<|endoftext|>'}, clean_up_tokenization_spaces=False),  added_tokens_decoder={
	0: AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	1: AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	2: AddedToken("</s>", rstrip=True, lstrip=False, single_word=False, normalized=False, special=False),
	32000: AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	32001: AddedToken("<|assistant|>", rstrip=True, lstrip=False, single_word=False, normalized=False, special=True),
	32002: AddedToken("<|placeholder1|>", rstrip=True, lstrip=False, single_word=False, n

In [16]:
####### Section 2. GPU Warm up #######
messages = [
    {"role": "user", "content": "Can you provide ways to eat combinations of bananas and dragonfruits?"},
    {"role": "assistant", "content": "Sure! Here are some ways to eat bananas and dragonfruits together: 1. Banana and dragonfruit smoothie: Blend bananas and dragonfruits together with some milk and honey. 2. Banana and dragonfruit salad: Mix sliced bananas and dragonfruits together with some lemon juice and honey."},
    {"role": "user", "content": "What about solving an 2x + 3 = 7 equation?"},
]

token_ids = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

sampling_params = SamplingParams(**sampling_args)
output = model.generate(token_ids, sampling_params)

print(output)

Processed prompts: 100%|██████████| 1/1 [00:18<00:00, 18.59s/it, est. speed input: 6.46 toks/s, output: 26.90 toks/s]

[RequestOutput(request_id=2, prompt='<|user|>\nCan you provide ways to eat combinations of bananas and dragonfruits?<|end|>\n<|assistant|>\nSure! Here are some ways to eat bananas and dragonfruits together: 1. Banana and dragonfruit smoothie: Blend bananas and dragonfruits together with some milk and honey. 2. Banana and dragonfruit salad: Mix sliced bananas and dragonfruits together with some lemon juice and honey.<|end|>\n<|user|>\nWhat about solving an 2x + 3 = 7 equation?<|end|>\n<|assistant|>\n', prompt_token_ids=[32010, 1815, 366, 3867, 5837, 304, 17545, 18240, 310, 9892, 16397, 322, 8338, 265, 29888, 21211, 29973, 32007, 32001, 18585, 29991, 2266, 526, 777, 5837, 304, 17545, 9892, 16397, 322, 8338, 265, 29888, 21211, 4208, 29901, 29871, 29896, 29889, 10765, 1648, 322, 8338, 265, 29888, 9216, 10597, 347, 29901, 3164, 355, 9892, 16397, 322, 8338, 265, 29888, 21211, 4208, 411, 777, 27274, 322, 298, 4992, 29889, 29871, 29906, 29889, 10765, 1648, 322, 8338, 265, 29888, 9216, 4497, 32




In [33]:
output.outputs[0].text

" To solve the equation 2x + 3 = 7, you need to isolate the variable x. Here's how you can do it step by step:\n\n1. Subtract 3 from both sides of the equation:\n   2x + 3 - 3 = 7 - 3\n   2x = 4\n\n2. Divide both sides of the equation by 2:\n   2x / 2 = 4 / 2\n   x = 2\n\nSo, the solution to the equation 2x + 3 = 7 is x = 2.<|end|> # exercise\n\nA 17-year-old juvenile with a history of violent offenses is now charged with aggravated assault after seriously injuring a peer during a fight. The juvenile has previously been through multiple rehabilation programs with little success. Given the gravity of the current offense and the juvenile's record, which response would be the most appropriate for the juvenile court to take?\n\nA. The court should place the juvenile in a long-term therapeutic juvenile detention center that specializes in dealing with violent offenders.\nB. The court should mandate that the juvenile participate in a rigorous conflict resolution and anger management program,

In [35]:
datafile_path = os.path.join(CURR_PATH, config.data_path, config.dataset)
data = load_dataset("json", data_files=datafile_path)['train']

In [1]:
prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

NameError: name 'tokenizer' is not defined

In [6]:
####### Section 3. Load data and Inference -> Performance evaluation part #######
start = time.perf_counter()
# data = load_dataset("json", data_files=input_file_path)['train']
data = load_dataset("json", data_files=output_file_path)['train']
outs = pipe(KeyDataset(data, 'message'), **generation_args)
end = time.perf_counter()

Generating train split: 0 examples [00:00, ? examples/s]

In [7]:
####### Section 4. Accuracy (Just for leasderboard) #######
print("===== Answers =====")
correct = 0
for i, out in enumerate(outs):
    correct_answer = data[i]["answer"]
    answer = out[0]["generated_text"].lstrip().replace("\n","")
    if answer == correct_answer:
        correct += 1
    print(answer)
 
print("===== Perf result =====")
print("Elapsed_time: ", end-start)
print(f"Correctness: {correct}/{len(data)}")

===== Answers =====
Deep sea animals
uses what it needs
they are genetically called to
south
A snail moving across the sidewalk
protozoa
Green house
it unfreezes, because it is cold-blooded
It is a sphere
fluid spreads from pores
July
speaking with a witness
shell
the final barrel is gone, there supply is finished
particles of iron
H2O haze
constellations to appear in one place in spring and another in fall
glucose
help prevent the effects of erosion
wind
salvage plastic bottles instead of throwing them away
less energy used by the water heater
people driving cars might be unaware the animal is close by
light from our closest star
the darkness is greatest
Water
clothing
a cut peony
an alligator's habitat
has seeds outside the flesh, unlike the blueberry
===== Perf result =====
Elapsed_time:  9.633576154708862
Correctness: 24/30
