In [1]:
###  Import Library  ###
import os
import time
import random
import argparse

import numpy as np
import torch

from datasets import load_dataset
from huggingface_hub import hf_hub_download

from llama_cpp import Llama
from llama_cpp.llama_speculative import LlamaPromptLookupDecoding

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# !CMAKE_ARGS="-DGGML_CUDA=on -DLLAVA_BUILD=off" pip install -U llama-cpp-python --force-reinstall --no-cache-dir

In [None]:
# !CMAKE_ARGS="-DGGML_CUDA=on" FORCE_CMAKE=1 pip install llama-cpp-python --force-reinstall --upgrade --no-cache-dir

In [2]:
###  Config  ###
# All tunable options live on one argparse namespace; every flag has a
# default, so the notebook runs without any arguments being supplied.
parser = argparse.ArgumentParser(description="llama.cpp")

# Filesystem locations.
parser.add_argument("--cache_dir", default="./models/", type=str)
parser.add_argument("--data_dir", default="./data/", type=str)
parser.add_argument("--data_name", default="test_dataset.jsonl", type=str)

# Model loading / decoding options.
parser.add_argument("--n_gpu_layers", default=30, type=int)
parser.add_argument(
    "--num_pred_tokens",
    default=10,
    type=int,
    help="speculative decoding",
)
parser.add_argument("--seed", default=0, type=int)
parser.add_argument("--temperature", default=0.0, type=float)

# Parse an explicit empty argv so argparse does not read sys.argv;
# the result is simply the defaults declared above.
config = parser.parse_args([])

# Prefer CUDA when torch reports it as available, otherwise fall back to CPU.
_device_name = "cuda" if torch.cuda.is_available() else "cpu"
DEVICE = torch.device(_device_name)

In [3]:
def _seed_everything(seed):
    """Seed every random number generator used in this notebook.

    Covers the ``PYTHONHASHSEED`` environment variable, Python's ``random``
    module, NumPy's global generator and PyTorch (CPU plus CUDA devices),
    and configures cuDNN for deterministic behaviour.

    Args:
        seed: Integer seed applied to every generator.
    """
    # Stored as a string because environment values must be text.
    os.environ["PYTHONHASHSEED"] = str(seed)
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # cuDNN autotuning can select a different kernel from one run to the
    # next, which undermines the deterministic flag above, so keep it off.
    torch.backends.cudnn.benchmark = False

# Seed all generators with the configured seed before anything else runs.
_seed_everything(config.seed)

In [11]:
###  Load Model  ###
# Prompt-lookup decoding acts as the draft model for speculative decoding;
# the number of predicted tokens comes from the config.
_prompt_lookup_draft = LlamaPromptLookupDecoding(
    num_pred_tokens=config.num_pred_tokens,
)

# NOTE(review): n_gpu_layers is fixed to -1 here, so config.n_gpu_layers is
# not used by this cell - confirm that is intended.
# cache_dir=config.cache_dir is intentionally left disabled, as before.
model = Llama.from_pretrained(
    repo_id="watchstep/Phi-3-medium-4k-instruct-fP32-gguf",
    filename="Phi-3-medium-4k-instruct-fp32.gguf",
    n_gpu_layers=-1,
    verbose=True,
    seed=config.seed,
    draft_model=_prompt_lookup_draft,
)

llama_model_loader: loaded meta data with 35 key-value pairs and 243 tensors from /home/elicer/.cache/huggingface/hub/models--watchstep--Phi-3-medium-4k-instruct-fP32-gguf/snapshots/fdd8127a38f301c07b1d1eca6f7ed716f515feb0/./Phi-3-medium-4k-instruct-fp32.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = phi3
llama_model_loader: - kv   1:                               general.type str              = model
llama_model_loader: - kv   2:                               general.name str              = Phi 3 Medium 4k Instruct
llama_model_loader: - kv   3:                           general.finetune str              = 4k-instruct
llama_model_loader: - kv   4:                           general.basename str              = Phi-3
llama_model_loader: - kv   5:                         general.size_label str              = medium
ll

In [12]:
def apply_chat_template(messages):
    """Render conversations into prompt strings using chat tags.

    Only entries that are dicts with role "user" are rendered; every other
    role and every non-dict entry is skipped. Each user turn is wrapped in
    the user / end tags, and every prompt finishes with an opening
    assistant tag so generation continues as the assistant.

    Args:
        messages: Iterable of conversations, where each conversation is an
            iterable of message entries (dicts read through their "role"
            and "content" keys).

    Returns:
        A list of strings, one prompt per conversation, in input order.
        A conversation without user turns yields just the assistant tag.
    """
    formatted_messages = []

    for conversation in messages:
        parts = []
        for msg in conversation:
            # Filter first so content of ignored roles is never touched.
            if not isinstance(msg, dict) or msg.get("role") != "user":
                continue

            # The "content" key can be present with a None value (for
            # example a null in the JSON record); treat that as empty text
            # instead of failing on None.strip().
            content = (msg.get("content") or "").strip()
            parts.append(f"<|user|>\n\n{content}\n<|end|>\n")

        parts.append("<|assistant|>\n")
        formatted_messages.append("".join(parts))

    return formatted_messages

In [13]:
###  Warm up ###
# A single short generation whose text is only printed, not stored in results.
# NOTE(review): the system message is wrapped in an <|assistant|> tag rather
# than a system/user tag - confirm this is the intended warm-up prompt.
system_message = "You are a helpful AI Assistant. Help users by replying to their queries and make sure the responses are polite. Do not hallucinate."
prompt = f"<|assistant|>\n{system_message}<|end|>"

# Cap the generation at 32 tokens and do not echo the prompt back.
output = model(prompt, max_tokens=32, echo=False)

warmup_text = output["choices"][0]["text"]
print(warmup_text)

llama_perf_context_print:        load time =      65.00 ms
llama_perf_context_print: prompt eval time =       0.00 ms /   109 tokens (    0.00 ms per token,      inf tokens per second)
llama_perf_context_print:        eval time =       0.00 ms /    23 runs   (    0.00 ms per token,      inf tokens per second)
llama_perf_context_print:       total time =    1615.56 ms /   132 tokens


 Absolutely! I'm here to assist you with any questions or concerns you might have. Please feel free to ask anything, and I'll


In [None]:
### Load data and Inference ###
# The timer deliberately spans dataset loading, prompt formatting and
# generation, matching the title of this cell.
start = time.perf_counter()

# Build the dataset path from the config rather than repeating it as a
# literal; with the default config this is "./data/test_dataset.jsonl".
data_path = os.path.join(config.data_dir, config.data_name)
data = load_dataset("json", data_files=data_path)['train']
messages = data['message']
# Despite the name, these are formatted prompt strings, not token ids.
token_ids = apply_chat_template(messages)

# `model` is a llama_cpp.Llama object, not a torch module, so the former
# torch.inference_mode() / torch.cuda.amp.autocast() wrappers did not
# influence generation and were removed (the latter is also deprecated).
outs = []
for prompt_text in token_ids:
    output = model(
        prompt_text,
        temperature=config.temperature,
        echo=False,
    )

    out = output['choices'][0]['text']

    # One single-element list per prompt; the benchmark cell reads
    # out[0]["generated_text"].
    outs.append([{
        'generated_text': out
    }])

end = time.perf_counter()

In [None]:
#### Benchmark ###
# Exact-match scoring: a generation counts as correct when, after dropping
# leading whitespace and all newlines, it equals the dataset's "answer".
# NOTE(review): the "Answers" header is printed but no answers follow it.
print("===== Answers =====")
correct = 0
for idx, generation in enumerate(outs):
    expected = data[idx]["answer"]
    predicted = generation[0]["generated_text"].lstrip().replace("\n", "")
    if predicted == expected:
        correct += 1

elapsed = end - start
print("===== Perf result =====")
print("Elapsed_time: ", elapsed)
print(f"Correctness: {correct}/{len(data)}")