## 3 Evaluating locally deployed models

### 3.1 Load the (Quantized) model to a single GPU

In [1]:
import accelerate, bitsandbytes
import torch, os
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

from transformers import LlamaTokenizerFast

# Local checkpoint directory; uncomment the second line to evaluate the other model instead.
model_path = '/share/model/llama-2-7b-chat-hf/'
# model_path = '/ssdshare/LLMs/llama3-Chinese-chat-8b/'

# Tokenizer: pad on the left so that, in a padded batch, every prompt ends
# right where generation starts. EOS is reused as the padding token
# (both the token string and its id are set explicitly).
tokenizer = LlamaTokenizerFast.from_pretrained(model_path, padding_side='left')
tokenizer.pad_token = tokenizer.eos_token
tokenizer.pad_token_id = tokenizer.eos_token_id

# Model: 8-bit bitsandbytes quantization, placed entirely on the first GPU.
qconfig = BitsAndBytesConfig(load_in_8bit=True)
model = AutoModelForCausalLM.from_pretrained(
    model_path,
    quantization_config=qconfig,
    device_map="cuda:0",
)

Verify that the model has been loaded onto the GPU (check the memory usage reported below).

In [2]:
# IPython shell escape: run `nvidia-smi` to show the GPU status table (memory in use, running processes).
!nvidia-smi

### 3.2 Generate responses locally

In [3]:
def chat_resp(model, tokenizer, question_list):
    """Generate one sampled completion for every prompt in ``question_list``.

    Args:
        model: causal LM exposing ``generate`` (and normally ``device``).
        tokenizer: tokenizer matching ``model``; it is called on the whole
            list with padding enabled, so it must have a pad token configured.
        question_list: list of prompt strings, processed as a single batch.

    Returns:
        List of decoded strings, one per prompt, with special tokens removed.
        Each string is the decoding of the full generated sequence (prompt
        tokens followed by up to 512 new tokens).
    """
    encoded = tokenizer(
        question_list,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=4096,
    )

    # Follow the model's own device instead of assuming a bare "cuda";
    # fall back to the previous behaviour for objects without `.device`.
    device = getattr(model, "device", "cuda")
    input_ids = encoded["input_ids"].to(device)
    # The attention mask must be forwarded: in a padded batch it is the only
    # thing that tells generate() which positions are padding. Because the pad
    # token here is the EOS token, it cannot be inferred from the ids.
    attention_mask = encoded["attention_mask"].to(device)

    outputs = model.generate(
        input_ids=input_ids,
        attention_mask=attention_mask,
        pad_token_id=tokenizer.eos_token_id,
        max_new_tokens=512,
        do_sample=True,
        temperature=0.7,
    )
    return tokenizer.batch_decode(outputs, skip_special_tokens=True)

def chat_resp_batched(model, tokenizer, question_list, batch_size=4):
    """Run ``chat_resp`` over ``question_list`` in consecutive batches.

    Args:
        model: model forwarded unchanged to ``chat_resp``.
        tokenizer: tokenizer forwarded unchanged to ``chat_resp``.
        question_list: sequence of prompt strings.
        batch_size: maximum number of prompts per ``chat_resp`` call; must be
            at least 1. The last batch may be smaller.

    Returns:
        Flat list with the responses of every batch, in input order.

    Raises:
        ValueError: if ``batch_size`` is smaller than 1 (previously a negative
            value silently produced an empty result).
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer, got %r" % (batch_size,))

    all_responses = []
    for start in range(0, len(question_list), batch_size):
        batch = question_list[start:start + batch_size]
        # Wrap the batch in a 1-tuple so that a tuple-typed batch is printed
        # as a whole instead of being unpacked as %-format arguments.
        print("processing batch: %s " % (batch,))
        all_responses.extend(chat_resp(model, tokenizer, batch))
    return all_responses

In [4]:
def gsm8k_prompt(question):
    """Render a GSM8K question as a chat-formatted prompt string.

    A two-message conversation is built (a fixed system instruction followed
    by the question as the user turn) and rendered to text, not token ids,
    with the chat template of the notebook-level ``tokenizer``.

    NOTE(review): the rendered text may already contain the template's special
    tokens; confirm it is not given a second BOS when it is tokenized later.
    """
    system_instruction = """Please solve the given math problem by providing a detailed, step-by-step explanation. Begin by outlining each step involved in your solution, ensuring clarity and precision in your calculations. After you have worked through the problem, conclude your response by summarizing the solution and stating the final answer as a single exact numerical value on the last line. """
    messages = [
        dict(role="system", content=system_instruction),
        dict(role="user", content="Question: " + question),
    ]
    return tokenizer.apply_chat_template(messages, tokenize=False)

In [5]:
## Smoke test: send a single GSM8K-style question through the full prompt -> generate path

# chat_resp expects a list of prompts, so the single prompt is wrapped in a list.
p = [
    gsm8k_prompt(
        "Natalia sold clips to 48 of her friends in April, and then she sold half as many clips in May. How many clips did Natalia sell altogether in April and May?"
    )
]
resp = chat_resp(model, tokenizer, p)
print(resp[0])

### 3.3 Prepare the evaluation datasets

In [6]:
# Route Hugging Face downloads through a proxy.
#
# SECURITY: proxy URLs embed a username and password, so they must never be
# written into the notebook. Export them before launching Jupyter, e.g.
#   export EVAL_HTTP_PROXY="http://<user>:<password>@<host>:<port>"
#   export EVAL_SOCKS_PROXY="socks5://<user>:<password>@<host>:<port>"
# The credentials that used to be committed in this cell should be rotated.
_proxy_sources = {
    'HTTP_PROXY': 'EVAL_HTTP_PROXY',
    'HTTPS_PROXY': 'EVAL_HTTP_PROXY',
    'ALL_PROXY': 'EVAL_SOCKS_PROXY',
}
for _target, _source in _proxy_sources.items():
    _proxy_url = os.environ.get(_source)
    # When the source variable is unset, leave whatever proxy configuration
    # the environment already has untouched.
    if _proxy_url:
        os.environ[_target] = _proxy_url

In [7]:
from datasets import load_dataset

# Fetch the "main" configuration of GSM8K.
dataset = load_dataset("gsm8k", "main")

# Evaluating the whole test split is slow, so only rows 5..29 (25 problems) are kept.
subset = dataset['test'][5:30]
questions, answers = subset['question'], subset['answer']

# Display the dataset summary as the cell output.
dataset

In [8]:
# Only the final numeric answer of each reference solution is used for evaluation
# (the step-by-step reasoning is ignored, which may be a questionable choice).

def get_exact_answer(x):
    """Return the final answer that follows the first '####' marker in ``x``.

    Everything after the marker is taken, surrounding whitespace is removed
    (so the result does not depend on exactly one space following the marker),
    and thousands separators are dropped so that "1,250" becomes "1250".

    Raises:
        ValueError: if ``x`` contains no '####' marker.
    """
    _, marker, tail = x.partition('####')
    if not marker:
        raise ValueError("no '####' answer marker found in: %r" % (x,))
    return tail.strip().replace(',', '')

# Reference answers reduced to their final value, in the same order as `answers`.
num_answers = [get_exact_answer(answer) for answer in answers]
print(num_answers)


In [9]:
# Heuristic answer extraction: the last number on the last line that contains
# one is treated as the model's answer. Still a heuristic; a response that
# ends with something other than its answer will be misread.

import re

# Optional minus sign (accepted only when the '-' is not glued to a preceding
# word character or ')', so "10-5" is not read as -5), then either a
# comma-grouped integer such as 1,000,000 or a plain run of digits, then an
# optional decimal fraction.
_NUMBER_RE = re.compile(r'(?:(?<![\w)])-)?(?:\d{1,3}(?:,\d{3})+(?!\d)|\d+)(?:\.\d+)?')


def get_numbers(s):
    """Extract the answer-like number from a model response.

    Lines are scanned from the last to the first (the first line included);
    the last number on the first line that contains any number is returned.

    Args:
        s: response text.

    Returns:
        The number as a string with thousands separators removed
        (e.g. "1,000" -> "1000"), or the sentinel '-9999' when the text
        contains no number at all.
    """
    for line in reversed(s.split('\n')):
        found = _NUMBER_RE.findall(line)
        if found:
            return found[-1].replace(',', '')
    return '-9999'

In [10]:
# Sample model output used to try out get_numbers. It is a raw string because
# the text contains literal "\*" sequences, which are invalid escape sequences
# in a normal string literal; the resulting text is unchanged.
t = r"""
Toulouse has twice as many sheep as Charleston. Charleston has 4 times as many sheep as Seattle. How many sheep do Toulouse, Charleston, and Seattle have together if Seattle has 20 sheep?

Solution:
Let's start by using the information we know:

Toulouse has twice as many sheep as Charleston, so Toulouse has 2x = 2 \* 4 = 8 sheep.
Charleston has 4 times as many sheep as Seattle, so Charleston has 4 \* 20 = 80 sheep.
So, Toulouse has 8 sheep, Charleston has 80 sheep, and Seattle has 20 sheep.
Together, they have 8 + 80 + 20 = 128 sheep.


"""

get_numbers(t)

### 3.4 Evaluate!

In [11]:
# Build one chat prompt per question and generate the answers five prompts at a time.
question_prompts = list(map(gsm8k_prompt, questions))
resps = chat_resp_batched(model, tokenizer, question_prompts, batch_size=5)

llm_answers = []

# For every response: show the raw text, the number extracted from it,
# and the running list of extracted answers.
for resp in resps:
    print("--------", resp, "--------", sep="\n")
    num = get_numbers(resp)
    llm_answers.append(num)
    print(num, "---------", llm_answers, sep="\n")

In [12]:
# Predictions on the first line, references on the second, for a quick visual comparison.
print(llm_answers, num_answers, sep="\n")

In [13]:
## manual way to compute the correct rate

# Predictions and references are compared position by position, so the two
# lists have to line up exactly.
if len(llm_answers) != len(num_answers):
    raise ValueError(
        "got %d predictions for %d reference answers" % (len(llm_answers), len(num_answers))
    )

error = sum(pred != ref for pred, ref in zip(llm_answers, num_answers))
total = len(llm_answers)
# The rate is undefined when nothing was evaluated; report NaN instead of dividing by zero.
correct_rate = 1 - error / total if total else float('nan')
print("number of errors: %s \n correct rate: %s" % (error, correct_rate))

In [14]:
## Same comparison, done with the Hugging Face `evaluate` library

import evaluate

# Load the "exact_match" metric and score the extracted answers against the references.
exact_match = evaluate.load("exact_match")
results = exact_match.compute(references=num_answers, predictions=llm_answers)
print(results)