In [1]:
import re
import time

import torch
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
from transformers.pipelines.pt_utils import KeyDataset

In [2]:
####### Section 1. Set up #######
# Fix the torch RNG seed so repeated runs start from the same state.
torch.manual_seed(0)

model_id = "./models/Phi-3-medium-4k-instruct" # please replace with local model path

# Tokenizer and model are both loaded from the same checkpoint directory.
tokenizer = AutoTokenizer.from_pretrained(model_id)

# NOTE: trust_remote_code=True permits running model code shipped with the
# checkpoint, so only point model_id at a checkpoint you trust.
model_load_options = {
    "device_map": "auto",
    "torch_dtype": "auto",
    "trust_remote_code": True,
}
model = AutoModelForCausalLM.from_pretrained(model_id, **model_load_options)

# Wrap the model and tokenizer in a text-generation pipeline used for inference below.
pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer)

Loading checkpoint shards:   0%|          | 0/6 [00:00<?, ?it/s]

In [3]:
# Keyword arguments forwarded to the pipeline call at inference time.
generation_args = dict(
    max_new_tokens=20,       # upper bound on generated tokens per prompt
    return_full_text=False,  # return only the continuation, not the prompt
    temperature=0.0,         # sampling temperature; only relevant when do_sample is True
    do_sample=False,         # sampling disabled
    batch_size=100,          # number of prompts processed per batch
)

In [4]:
# Clear PyTorch's CUDA cache before the timed inference cell runs.
torch.cuda.empty_cache() # clear the cache

In [5]:
####### Section 3. Load data and Inference -> Performance evaluation part #######
# The timed window deliberately covers both dataset loading and generation.
start = time.time()

dataset_splits = load_dataset("json", data_files="./data/test_dataset.jsonl")
data = dataset_splits['train']

# Feed only the 'message' field of each record to the pipeline.
prompts = KeyDataset(data, 'message')
outs = pipe(prompts, **generation_args)

end = time.time()

The `seen_tokens` attribute is deprecated and will be removed in v4.41. Use the `cache_position` model input instead.
You are not running the flash-attention implementation, expect numerical differences.


In [6]:
# Display the raw results of the pipeline call for inspection.
outs

[[{'generated_text': ' Deep sea animals'}],
 [{'generated_text': ' is standard weight and size'}],
 [{'generated_text': ' they are genetically called to'}],
 [{'generated_text': ' south'}],
 [{'generated_text': ' An aircraft taking a trip'}],
 [{'generated_text': ' protozoa'}],
 [{'generated_text': ' Green house'}],
 [{'generated_text': ' it unfreezes, because it is cold-blooded'}],
 [{'generated_text': ' It holds 500 mL of water'}],
 [{'generated_text': ' the air becomes arid'}],
 [{'generated_text': ' July'}],
 [{'generated_text': ' speaking with a witness'}],
 [{'generated_text': ' shell'}],
 [{'generated_text': ' the final barrel is gone, there supply is finished\n'}],
 [{'generated_text': ' particles of iron'}],
 [{'generated_text': ' H2O haze'}],
 [{'generated_text': ' constellations to appear in one place in spring and another in fall'}],
 [{'generated_text': ' glucose'}],
 [{'generated_text': ' help prevent the effects of erosion'}],
 [{'generated_text': ' wind'}],
 [{'generate

In [None]:
####### Section 4. Accuracy (just for the leaderboard) #######
# Exact-match scoring: the first generation for each prompt is left-stripped,
# has its newlines removed, and is compared with the reference answer.
print("===== Answers =====")
correct = 0
for idx, generations in enumerate(outs):
    expected = data[idx]["answer"]
    predicted = generations[0]["generated_text"].lstrip()
    predicted = predicted.replace("\n", "")
    if predicted != expected:
        continue
    correct += 1

elapsed = end - start
print("===== Perf result =====")
print("Elapsed_time: ", elapsed)
print(f"Correctness: {correct}/{len(data)}")

In [None]:
####### Section 4. Accuracy with answer clean-up (just for the leaderboard) #######
# Same scoring as the exact-match cell, but the generated text is first
# stripped of unwanted markers before being compared with the reference.
print("===== Answers =====")

# Markers removed from each generation; compiled once, outside the loop.
unwanted_text = re.compile(r"A:\n\n\n### response ###\n\n|\n### response ###\n\n|A: |\nB:")

correct = 0
for i, out in enumerate(outs):
    correct_answer = data[i]["answer"]

    # `outs` comes from the transformers pipeline: each element is a list of
    # {"generated_text": ...} dicts, so read the first generation from it
    # (not the vLLM-style `out.outputs[0].text`).
    answer = out[0]["generated_text"]

    # Remove unwanted text from the generated answer.
    cleaned_answer = unwanted_text.sub("", answer).lstrip().replace("\n", "")
    cleaned_answer = cleaned_answer.replace("answer: ", "")

    # Compare the cleaned generation with the reference answer.
    print(f"Correct Answer: {correct_answer}")
    print(f"Generated Answer: {cleaned_answer}")
    if cleaned_answer == correct_answer:
        correct += 1
        print(cleaned_answer, "correct!!")

print("===== Perf result =====")
print("Elapsed_time: ", end-start)
print(f"Correctness: {correct}/{len(data)}")