In [3]:
import time
import torch
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
from transformers.pipelines.pt_utils import KeyDataset

from vllm import LLM, SamplingParams
from openai import OpenAI



In [None]:
# Local checkpoint directories, keyed by a short model alias.
MODELZOO = {
    "phi3-14b": "./Phi-3-medium-4k-instruct",
    "phi3-3.8b": "./Phi-3-mini-4k-instruct",
    "bloom-560m": "./bloom-560m",
}

# Engine settings: the 14b checkpoint is the model being served, and the
# 3.8b checkpoint is handed to vLLM as the speculative model with
# 5 speculative tokens per step.
engine_kwargs = {
    "model": MODELZOO["phi3-14b"],
    "tensor_parallel_size": 1,
    "speculative_model": MODELZOO["phi3-3.8b"],
    "num_speculative_tokens": 5,
    "use_v2_block_manager": True,
}

llm = LLM(**engine_kwargs)

In [None]:
####### Section 2. GPU Warm up #######
# Run one short generation before the timed section. `sampling_params`
# is defined here and reused by the evaluation cell further down.
prompts = ["The future of AI is"]
sampling_params = SamplingParams(temperature=0.8, top_p=0.95)

outputs = llm.generate(prompts, sampling_params)

# Echo each prompt next to the first completion returned for it.
for result in outputs:
    print(f"Prompt: {result.prompt!r}, Generated text: {result.outputs[0].text!r}")

In [None]:
####### Section 3. Load data and Inference -> Performance evaluation part #######
# The timed region covers both dataset loading and generation, as the section
# title indicates. `start`, `end`, `data` and `outs` are read by the
# accuracy/perf reporting cell.
start = time.time()
data = load_dataset("json", data_files="test_dataset.jsonl")['train']
# vLLM's generate() is documented to take a prompt or a list of prompts.
# KeyDataset is a transformers-pipeline helper, so hand vLLM a plain list of
# the 'message' column instead of relying on duck typing.
# NOTE(review): assumes every 'message' entry is a plain prompt string -- confirm.
eval_prompts = list(data['message'])
outs = llm.generate(eval_prompts, sampling_params)
end = time.time()

In [None]:
####### Section 4. Accuracy (Just for leaderboard) #######
# Exact-match scoring: each generated answer (leading whitespace stripped,
# newlines removed) is compared against the dataset's "answer" field for the
# row at the same index.
print("===== Answers =====")
correct = 0
for i, out in enumerate(outs):
    correct_answer = data[i]["answer"]
    # `outs` comes from llm.generate(), so the text lives on
    # out.outputs[0].text (same access as the warm-up cell). The previous
    # out[0]["generated_text"] is the Hugging Face pipeline result shape and
    # does not apply to vLLM results.
    answer = out.outputs[0].text.lstrip().replace("\n","")
    if answer == correct_answer:
        correct += 1
    print(answer)

print("===== Perf result =====")
print("Elapsed_time: ", end-start)
print(f"Correctness: {correct}/{len(data)}")