In [1]:
###  Import Library  ###
import os
import time
import random
import argparse

import numpy as np
import torch

from datasets import load_dataset
from huggingface_hub import hf_hub_download

from llama_cpp import Llama
from llama_cpp.llama_speculative import LlamaPromptLookupDecoding
import json

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
###  Config  ###
parser = argparse.ArgumentParser(description="llama.cpp")
parser.add_argument("--cache_dir", type=str, default=" ~/.cache/huggingface/hub/")
parser.add_argument("--data_dir", type=str, default="./data/")
parser.add_argument("--data_name", type=str, default="test_dataset.jsonl")
parser.add_argument('--seed',type=int, default=0)
parser.add_argument('--temperature', type=float, default=0.0)
parser.add_argument('--num_pred_tokens', type=int, default=10)

config = parser.parse_args([])

DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def _seed_everything(seed):
    os.environ["PYTHONHASHSEED"] = str(seed)
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = True

_seed_everything(config.seed)

In [3]:
###  Load Model  ###
model = Llama.from_pretrained(
	repo_id="watchstep/Phi-3-medium-4k-instruct-fp32-gguf",
	filename="Phi-3-medium-4k-instruct-fp32.gguf",
	n_gpu_layers=-1,
	n_ctx=1024,
	n_batch=1024,
	verbose=True,
    seed=config.seed,
	draft_model=LlamaPromptLookupDecoding(num_pred_tokens=10),
    # cache_dir=config.cache_dir,
)

llama_model_loader: loaded meta data with 35 key-value pairs and 243 tensors from /home/elicer/.cache/huggingface/hub/models--watchstep--Phi-3-medium-4k-instruct-fp32-gguf/snapshots/fdd8127a38f301c07b1d1eca6f7ed716f515feb0/./Phi-3-medium-4k-instruct-fp32.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = phi3
llama_model_loader: - kv   1:                               general.type str              = model
llama_model_loader: - kv   2:                               general.name str              = Phi 3 Medium 4k Instruct
llama_model_loader: - kv   3:                           general.finetune str              = 4k-instruct
llama_model_loader: - kv   4:                           general.basename str              = Phi-3
llama_model_loader: - kv   5:                         general.size_label str              = medium
ll

In [4]:
#prompt 후보들 
system_message={"hj":"You are a highly helpful and knowledgeable AI assistant specializing in answering user queries accurately and politely.",
                "hr":"You are a helpful and precise AI assistant focused on providing concise and accurate answers. Answer concisely by selecting the correct option from the provided choices in [] and output only the selected answer without any additional explanation or formatting.",
                "ji":"You are a helpful AI Assistant. Help users by replying to their queries and make sure the responses are polite. Do not hallucinate.",
}
prompt = f"<|system|>\n{system_message['hj']}\n"

print(prompt)


<|system|>
You are a highly helpful and knowledgeable AI assistant specializing in answering user queries accurately and politely.



In [5]:
def apply_chat_template(messages):
    formatted_messages = []
    
    formatted_messages.extend(
        ''.join(
            [f"{prompt}<|user|>\n{msg.get('content', '').strip()}\n<|end|>\n\n" if msg.get('role') == 'user' else '' for msg in message]
        ) + "<|assistant|>\n" for message in messages
    )
    
    return formatted_messages

In [6]:
data = load_dataset("json", data_files="./data/test_dataset.jsonl")['train']
messages = data['message']
token_ids = apply_chat_template(messages)

In [7]:
def save_to_jsonl(token_ids, filename):
    with open(filename, 'w', encoding='utf-8') as file:
        for token_id in token_ids:
            json.dump({"token_id": token_id}, file)
            file.write('\n')  

def load_from_jsonl(filename):
    token_ids = []
    with open(filename, 'r', encoding='utf-8') as file:
        for line in file:
            data = json.loads(line)
            token_ids.append(data['token_id'])  
    return token_ids

In [8]:
save_to_jsonl(token_ids,"processed_data.jsonl")

In [9]:
dummy = [
    {"role": "user", "content": "Can you provide ways to eat combinations of bananas and dragonfruits?"},
    {"role": "assistant", "content": "Sure! Here are some ways to eat bananas and dragonfruits together: 1. Banana and dragonfruit smoothie: Blend bananas and dragonfruits together with some milk and honey. 2. Banana and dragonfruit salad: Mix sliced bananas and dragonfruits together with some lemon juice and honey."},
    {"role": "user", "content": "What about solving an 2x + 3 = 7 equation?"},
]

output = model.create_chat_completion(
      messages=dummy
)

# print(output['choices'][0]['text'])
print(output["choices"][0]['message']['content'])

llama_perf_context_print:        load time =     202.52 ms
llama_perf_context_print: prompt eval time =       0.00 ms /   659 tokens (    0.00 ms per token,      inf tokens per second)
llama_perf_context_print:        eval time =       0.00 ms /    32 runs   (    0.00 ms per token,      inf tokens per second)
llama_perf_context_print:       total time =    4136.78 ms /   691 tokens


Certainly! To solve the equation 2x + 3 = 7, you would follow these steps:

1. Subtract 3 from both sides: 2x + 3 - 3 = 7 - 3, which simplifies to 2x = 4.
2. Divide both sides by 2: (2x)/2 = 4/2, which simplifies to x = 2.

So, the solution to the equation 2x + 3 = 7 is x = 2.


In [10]:
### Load data and Inference ### 
start = time.perf_counter()
token_ids = load_from_jsonl("processed_data.jsonl")
outs = []
for token_id in token_ids:
    with torch.inference_mode(), torch.autocast(device_type="cuda"):
        output = model(token_id,
        temperature=config.temperature,
        echo=False)

    out = output['choices'][0]['text']

    outs.append([{
        'generated_text': out
    }])

end = time.perf_counter()

llama_perf_context_print:        load time =     202.52 ms
llama_perf_context_print: prompt eval time =       0.00 ms /   135 tokens (    0.00 ms per token,      inf tokens per second)
llama_perf_context_print:        eval time =       0.00 ms /     1 runs   (    0.00 ms per token,      inf tokens per second)
llama_perf_context_print:       total time =     121.49 ms /   136 tokens
Llama.generate: 66 prefix-match hit, remaining 49 prompt tokens to eval
llama_perf_context_print:        load time =     202.52 ms
llama_perf_context_print: prompt eval time =       0.00 ms /    60 tokens (    0.00 ms per token,      inf tokens per second)
llama_perf_context_print:        eval time =       0.00 ms /     1 runs   (    0.00 ms per token,      inf tokens per second)
llama_perf_context_print:       total time =      98.50 ms /    61 tokens
Llama.generate: 66 prefix-match hit, remaining 58 prompt tokens to eval
llama_perf_context_print:        load time =     202.52 ms
llama_perf_context_print: p

In [11]:
outs

[[{'generated_text': 'Deep sea animals'}],
 [{'generated_text': 'is standard weight and size'}],
 [{'generated_text': 'they are genetically called to\n\n'}],
 [{'generated_text': 'north'}],
 [{'generated_text': 'An aircraft taking a trip'}],
 [{'generated_text': 'protozoa'}],
 [{'generated_text': 'Green house'}],
 [{'generated_text': 'it unfreezes, because it is cold-blooded'}],
 [{'generated_text': 'It holds 500 mL of water\n\nExplanation:'}],
 [{'generated_text': 'the air becomes arid'}],
 [{'generated_text': 'July'}],
 [{'generated_text': 'speaking with a witness'}],
 [{'generated_text': 'shell'}],
 [{'generated_text': 'the final barrel is gone, there supply is finished'}],
 [{'generated_text': 'particles of iron'}],
 [{'generated_text': 'H2O haze'}],
 [{'generated_text': 'constellations to appear in one place in spring and another in fall'}],
 [{'generated_text': 'glucose'}],
 [{'generated_text': 'help prevent the effects of erosion'}],
 [{'generated_text': 'wind'}],
 [{'generated_

In [12]:
#### Benchmark ###
print("===== Answers =====")
correct = 0

for i, out in enumerate(outs):
    correct_answer = data[i]["answer"]
    answer = out[0]["generated_text"].lstrip().replace("\n","")
    if answer == correct_answer:
        correct += 1
 
print("===== Perf result =====")
print("Elapsed_time: ", end-start)
print(f"Correctness: {correct}/{len(data)}")

===== Answers =====
===== Perf result =====
Elapsed_time:  4.590992929064669
Correctness: 21/31
