# Lab 6 Performance Evaluation

First, we need to install a few more dependencies. 

In [1]:
!pip3 install -r requirements.txt
# !pip3 install -r requirements.txt -i http://mirrors.aliyun.com/pypi/simple/ --trusted-host mirrors.aliyun.com

In [2]:
import os
import requests
import threading
import evaluate
import numpy as np
import matplotlib.pyplot as plt
import re
import json
import warnings
import threading
import pandas as pd
import random as rd
from tqdm import tqdm

In [3]:
# add proxy to access openai ...
# NOTE(review): the proxy URLs below embed a username and password in plain
# text, so they are visible to anyone who can read this notebook. Consider
# moving them to the environment / a .env file and rotating the credentials.
# These assignments overwrite any HTTP_PROXY / HTTPS_PROXY / ALL_PROXY values
# already present in the kernel's environment.
import os
os.environ['HTTP_PROXY']="http://Clash:QOAF8Rmd@10.1.0.213:7890"
os.environ['HTTPS_PROXY']="http://Clash:QOAF8Rmd@10.1.0.213:7890"
os.environ['ALL_PROXY']="socks5://Clash:QOAF8Rmd@10.1.0.213:7893"

# 1. NLP related Metrics
You can try to change the `prediction` and `reference` in the following sample code and see the range of the metrics.

## 1.1 Accuracy
Accuracy is the proportion of correct predictions among the total number of cases processed. 

It can be computed with:

$$\text{Accuracy} = \frac{TP + TN}{TP + TN + FP + FN}$$

where TP = true positives, TN = true negatives, FP = false positives, and FN = false negatives.

The metric ranges from 0 to 1 and a higher score is better.

In [4]:
# Load the `accuracy` metric via `evaluate.load`; the object is reused by the
# cells below and by the C-Eval evaluation later in the notebook.
accuracy = evaluate.load('accuracy')

In [5]:
# Binary labels: three of the four predictions match their reference.
accuracy.compute(references=[0,1,0,1], predictions=[0,1,0,0])

In [6]:
# Multi-class labels (0, 1, 2): three of the five predictions match their reference.
accuracy.compute(references=[0,1,2,1,1], predictions=[1,2,2,1,1])

## 1.2 BLEU
BLEU, or the Bilingual Evaluation Understudy, is a score for comparing a candidate translation of text to one or more reference translations. The higher the BLEU score, the more similar the generated text is to the reference text. Its value ranges from 0 to 1.

This metric compares the n-gram overlap between the machine translation result and the reference translation where an n-gram is a sequence of consecutive n words. 

The metric ranges from 0 to 1 and a higher score is better.


### Single sentence score
Observe how the score varies as the input changes.

In [7]:
# Load the BLEU metric; the same object is reused by all BLEU cells below.
bleu = evaluate.load("bleu")

In [8]:
# The prediction differs from the reference by a single word ("is" vs "sat").
prediction1 = 'the cat is on the yoga mat'
reference1 = 'the cat sat on the yoga mat'
bleu_score = bleu.compute(predictions=[prediction1], references=[reference1])
# Display only the 'bleu' entry of the returned result.
bleu_score['bleu']

In [9]:
# Same sentence structure as the reference, with several words substituted.
prediction2 = 'the value of life lies in what you create for others not in what you possess'
reference2 = 'the meaning of life lies in what you give to others not in what you receive'
bleu_score = bleu.compute(predictions=[prediction2], references=[reference2])
# Display the complete result returned by compute(), not only the 'bleu' entry.
bleu_score

In [10]:
# The prediction is longer than the reference and orders the shared phrases differently.
prediction3 = 'the adversary abusing Sybil accounts imposes a critical threat to establishing trust and integrity in web services'
reference3 = 'establishing trust in web services is threatened by the adversary'
bleu_score = bleu.compute(predictions=[prediction3], references=[reference3])
bleu_score['bleu']

### Multiple sentence score

In [11]:
# Score the three (prediction, reference) pairs defined above in a single call;
# predictions and references are matched by position.
bleu_score = bleu.compute(predictions=[prediction1, prediction2, prediction3], 
                          references=[reference1, reference2, reference3])
bleu_score['bleu']

### Incremental adding predictions

In [12]:
# Feed the (prediction, reference) pairs one at a time ...
bleu.add(predictions=prediction1, references=reference1)
bleu.add(predictions=prediction2, references=reference2)
bleu.add(predictions=prediction3, references=reference3)
# ... then call compute() without arguments to score what was added.
bleu_score = bleu.compute()
bleu_score['bleu']

## 1.3 ROUGE

ROUGE, or Recall-Oriented Understudy for Gisting Evaluation, is a set of metrics and a software package used for evaluating automatic summarization and machine translation software in natural language processing. 

The metrics compare an automatically produced summary or translation against a reference or a set of references (human-produced) summary or translation. Rouge-1 considers 1-gram, Rouge-2 2-gram and so on. Rouge-L considers the longest common subsequence.

Note that ROUGE is case insensitive, meaning that upper case letters are treated the same way as lower case letters.

The metric ranges from 0 to 1 and a higher score is better.


In [13]:
# Load the ROUGE metric; the same object is reused by the ROUGE cells below.
rouge = evaluate.load("rouge")

In [14]:
# Same sentence pair as the first BLEU example, re-defined so this cell does
# not depend on the BLEU cell having been run.
prediction1 = 'the cat is on the yoga mat'
reference1 = 'the cat sat on the yoga mat'

rouge_scores = rouge.compute(predictions=[prediction1], references=[reference1])
rouge_scores

In [15]:
# Same sentence pair as the second BLEU example.
prediction2 = 'the value of life lies in what you create for others not in what you possess'
reference2 = 'the meaning of life lies in what you give to others not in what you receive'

rouge_scores = rouge.compute(predictions=[prediction2], references=[reference2])
rouge_scores

In [16]:
# Same sentence pair as the third BLEU example.
prediction3 = 'the adversary abusing Sybil accounts imposes a critical threat to establishing trust and integrity in web services'
reference3 = 'establishing trust in web services is threatened by the adversary'

rouge_scores = rouge.compute(predictions=[prediction3], references=[reference3])
rouge_scores

ROUGE also supports multi-sentence scoring and incremental computation. You can try them below if you are interested.

## 1.4 Perplexity

Perplexity measures the uncertainty of a language model's predictions. 

Given a model and an input text sequence, perplexity measures how likely the model is to generate the input text sequence.  

- Lower perplexity is better - it means the model is more confident and accurate in its predictions
- A perplexity of 1 would be perfect (but unrealistic), meaning the model perfectly predicts every token
- The higher the perplexity, the more "surprised" or "confused" the model is by the text

The range of this metric is [1, inf): a model that predicts every token perfectly has a perplexity of 1. A lower score is better.

In [40]:
# Load the perplexity module, explicitly requesting the "metric" module type.
perplexity = evaluate.load("perplexity", module_type="metric")

In [47]:
# Two ordinary English sentences followed by a string of random characters, so
# the scores of natural and meaningless text can be compared.
input_texts = ["Perplexity measures the uncertainty of a language model's predictions.",
               "Higher the perplexity, the more surprised or confused the model is by the text.", 
               "X8nP7qLz3 RtYvA5cE2 mD9fGh JkUbW s6i"]
# Score every text with the model named by `model_id` (gpt2).
results = perplexity.compute(model_id='gpt2',
                             add_start_token=False,
                             predictions=input_texts) 
results

## 1.5 Combining multiple metrics

In [48]:
# Bundle BLEU and ROUGE so one compute() call produces both sets of scores.
# Relies on prediction1..3 / reference1..3 defined in the cells above.
metrics = evaluate.combine(['bleu', 'rouge'])
scores = metrics.compute(predictions=[prediction1, prediction2, prediction3], 
                          references=[reference1, reference2, reference3])
scores

# 2 Evaluating Models over REST API

## 2.1 Some utility functions

In [51]:
from dotenv import load_dotenv
import os
# Load variables from a .env file before reading them from the environment.
load_dotenv()
# `os.environ.get` returns None for a missing variable, so an unset
# INFINI_API_KEY / INFINI_BASE_URL is not reported in this cell.
openai_api_key = os.environ.get("INFINI_API_KEY")
openai_base_url = os.environ.get("INFINI_BASE_URL")


In [67]:
import time
import random
from openai import RateLimitError
from openai import OpenAI

# Module-level client built from the key and base URL read from the
# environment; get_llm_output below sends all of its requests through it.
client = OpenAI(
    api_key=openai_api_key,
    base_url=openai_base_url,
)

# a single thread version of get_llm_output
def get_llm_output(model_name, question_content, question_id, output, semaphore=None):
    """Ask one model one question and store the answer in `output`.

    Rate-limit errors are retried with randomized exponential backoff; any
    other error is printed and not retried.

    Args:
        model_name: Model identifier passed to the chat-completions request.
        question_content: Either a single prompt string, or an iterable of
            strings that are each sent as a separate user message.
        question_id: Key under which the answer is stored in `output`.
        output: Mutable mapping shared with the caller. It always receives an
            entry for `question_id`; the entry is "" when no answer could be
            obtained, so answers stay aligned with their references.
        semaphore: Optional semaphore the caller acquired before starting this
            call (parallel execution only). It is released when the call
            finishes, whatever the outcome.
    """
    retries = 5
    # Placeholder that is stored if every attempt fails.
    llm_answer = ""
    try:
        for attempt in range(retries):
            try:
                if isinstance(question_content, str):
                    messages = [{"role": "user", "content": question_content}]
                else:
                    messages = [{"role": "user", "content": q} for q in question_content]

                chat_response = client.chat.completions.create(
                    model=model_name,
                    messages=messages,
                    max_tokens=2048,
                    temperature=0,
                    seed=42
                )

                # The message content may be None; treat that as an empty answer
                # instead of failing on `.strip()`.
                llm_answer = (chat_response.choices[0].message.content or "").strip()
                break  # success: leave the retry loop

            except RateLimitError:
                if attempt == retries - 1:
                    # No further attempt follows, so sleeping would only waste time.
                    print(f"RateLimitError on Q{question_id}, giving up after {retries} attempts")
                    break
                wait_time = random.uniform(1, 3) * (2 ** attempt)
                print(f"RateLimitError on Q{question_id}, retry {attempt+1}/{retries}, wait {wait_time:.1f}s")
                time.sleep(wait_time)

            except Exception as e:
                print(f"Error on Q{question_id}: {e}")
                break  # other errors are not retried

    finally:
        # Always record a result (possibly the "" placeholder) so that no
        # question_id is missing from `output`.
        output[question_id] = llm_answer
        if semaphore is not None:
            semaphore.release() # release the semaphore


Evaluation requires running lots of Q/A's.  How to run them fast enough is the key.  Revisit my favourite topic: thread synchronization.

In [55]:
# extending the single thread version to parallel execution
def get_llm_output_parallel(model_name, question_contents, max_threads=5):
    """Ask `model_name` every question, running at most `max_threads` requests at once.

    Args:
        model_name: Model identifier forwarded to `get_llm_output`.
        question_contents: Iterable of questions, each in a form accepted by
            `get_llm_output`.
        max_threads: Maximum number of worker threads running at the same
            time; must be at least 1.

    Returns:
        A list with exactly one answer per question, in question order. A
        question for which no answer was stored is represented by "", so the
        list always lines up with the references it is compared against.

    Raises:
        ValueError: If `max_threads` is smaller than 1.
    """
    if max_threads < 1:
        # A semaphore initialised to 0 would block forever on the first acquire().
        raise ValueError("max_threads must be at least 1")

    output = {}
    threads = []
    semaphore = threading.Semaphore(max_threads)
    num_questions = 0
    # Wrap the iterable itself (not the enumerate object) so tqdm can pick up
    # its length, when it has one, and display a total.
    for question_id, question_content in enumerate(tqdm(question_contents)):
        # Blocks while `max_threads` workers are still running; each worker
        # releases the semaphore when it ends, in the single thread version.
        semaphore.acquire()
        thread = threading.Thread(target=get_llm_output, args=(model_name, question_content, question_id, output, semaphore))
        threads.append(thread)
        thread.start()
        num_questions += 1

    # Wait for all threads to complete
    for thread in threads:
        thread.join()

    # Index by position instead of by the keys that happen to exist, so a
    # missing answer becomes "" rather than shifting all later answers.
    return [output.get(question_id, "") for question_id in range(num_questions)]

In [56]:
def print_llm_outputs(model_name, question_contents, llm_answers, references):
    """Print each question with the model's answer and the reference answer.

    The three sequences are walked in parallel and stop at the shortest one.
    Every record is followed by a blank line.
    """
    records = zip(question_contents, llm_answers, references)
    for idx, record in enumerate(records):
        question, answer, reference = record
        print('Question %d: %s'%(idx, question))
        print('Answer from Model %s: %s'%(model_name, answer))
        print('Reference Answer: %s\n'%(reference))

## 2.2 Example: Evaluate News Summarization Results

### Load the data and preprocess

The CNN / DailyMail Dataset is an English-language dataset containing just over 300k unique news articles as written by journalists at CNN and the Daily Mail. 

For each instance, there is a string for the article, a string for the highlights, and a string for the id.

Data source: https://huggingface.co/datasets/cnn_dailymail

In [57]:
from datasets import load_dataset, load_from_disk

# d=load_dataset(r"ccdv/cnn_dailymail", '3.0.0')  # reading from huggingface
# d.save_to_disk('/share/data/cnn_dailymail/')  # saving to disk for later use
# NOTE(review): hard-coded absolute path, and it differs from the save path in
# the commented line above ('/share/...' vs '/ssdshare/share/...') - adjust
# both when running outside the lab environment.
d = load_from_disk('/ssdshare/share/data/cnn_dailymail/')  # loading from disk

In [58]:
def create_prompt(x):
    """Attach a summarization prompt to one dataset record.

    Reads `x['article']`, stores the resulting prompt under
    `x['question_content']` (modifying `x` in place) and returns `x`.
    """
    instruction = "Please summarize the following news article in no more than 30 words.\n %s"
    x['question_content'] = instruction % (x['article'])
    return x

# NOTE(review): `d` is rebound here from the loaded object to its mapped
# 'test' split, so re-run the loading cell before re-running this one.
d = d['test'].map(create_prompt) # Use test set to evaluate
d

### Evaluate the models
It is a summarization task, so we can use both BLEU and ROUGE as evaluation metrics.

In [59]:
EVALUATE_N= 10  # To save time, we evaluate the first 10 articles only.
metrics = evaluate.combine(['bleu', 'rouge'])  # metrics to evaluate
# Maps model name -> [bleu, rouge1, rouge2, rougeL]; consumed by the table cell below.
overall_scores = {}

for model_name in ['qwen2.5-72b-instruct', 'llama-3.3-70b-instruct',]:
    print(f'============== {model_name}  ==============')
    # Prompts and reference summaries for the first EVALUATE_N test articles.
    question_contents = d['question_content'][:EVALUATE_N]
    references = d['highlights'][:EVALUATE_N]
    llm_answers = get_llm_output_parallel(model_name, question_contents, max_threads=5)
    # print("Predictions:", llm_answers)
    # print("References:", references)
    # print("Lengths:", len(llm_answers), len(references))
    # Answers and references are paired by position.
    scores = metrics.compute(predictions=llm_answers, references=references)
    overall_scores[model_name] = [scores['bleu'], scores['rouge1'], scores['rouge2'], scores['rougeL']]    
    print_llm_outputs(model_name, question_contents, llm_answers, references)

In [60]:
# nice print the results using pandas
import pandas as pd
# One column per model; each column holds that model's four scores.
performance_df = pd.DataFrame(overall_scores)
# Row labels follow the order in which the scores were stored in overall_scores.
performance_df.index = ['BLEU', 'ROUGE-1', 'ROUGE-2', 'ROUGE-L']
performance_df

In [61]:
# Bar chart of the table above: one group of bars per metric, one bar per model.
performance_df.plot.bar()
plt.ylabel('Evaluation metric score')
plt.title('LLM performance in news summarization')
plt.show()

## 2.3 Multiple choice question answering

### Load the data from huggingface
C-Eval is a comprehensive Chinese evaluation suite for foundation models. It consists of 13948 multi-choice questions spanning 52 diverse disciplines and four difficulty levels. Here we use two disciplines: art studies and operating system.

Data source: https://cevalbenchmark.com/index_zh.html

In [63]:
def get_question_content(x):
    """Build the multiple-choice prompt for one record.

    The (Chinese) instruction asks for the correct answer to a single-choice
    question, given as one of A/B/C/D with no explanation. It is followed by
    the question text and the four options. The prompt is stored under
    `x['question_content']` (modifying `x` in place) and `x` is returned.
    """
    template = "以下是单项选择题,请直接给出其中的正确答案。请只输出ABCD当中的一个,不需要作解释。\n%s\nA. %s\nB. %s\nC. %s\nD. %s"
    fields = tuple(x[key] for key in ('question', 'A', 'B', 'C', 'D'))
    x['question_content'] = template % fields
    return x

In [64]:
# Subjects to evaluate; `ds` collects one prompt-augmented dataset per subject,
# in the same order as `data_names`.
data_names = ['art_studies', 'operating_system']
ds = []

# Please try this code if you want to load the data from huggingface (sometimes it will be slow or even failed)
# for data_name in data_names:
#     d=load_dataset(r"ceval/ceval-exam", name=data_name)
#     d_updated = d['val'].map(get_question_content) # Use validation set to evaluate
#     print(data_name)
#     print(d_updated)
#     ds.append(d_updated)


# Load the validation CSV of each subject from the shared disk.
# NOTE(review): this loop rebinds `d`, which earlier held the CNN/DailyMail
# test split; re-running the summarization cells afterwards needs `d` rebuilt.
for data_name in data_names:
    file_path = f"/ssdshare/share/data/ceval-exam/val/{data_name}_val.csv"
    d = load_dataset("csv", data_files={ "val": file_path })["val"]
    d_updated = d.map(get_question_content)
    print(data_name)
    print(d_updated)
    ds.append(d_updated)


### Evaluate the models
Accuracy is used to evaluate the model.

In [65]:
def get_options(llm_answers):
    """Extract the chosen option letter (A/B/C/D) from each model output.

    For every output, the option that occurs most often is taken as the final
    answer. Only *standalone* option letters are counted, i.e. an A/B/C/D that
    is not directly next to another ASCII letter, so capitals inside words
    ("Answer", "Because", "CPU") no longer distort the vote. If an output
    contains no standalone option letter, every A/B/C/D character is counted
    instead. Ties, and outputs without any option letter, resolve to the
    alphabetically first option.

    Args:
        llm_answers: Iterable of model output strings.

    Returns:
        A list with one letter from 'ABCD' per output.
    """
    standalone_option = re.compile(r'(?<![A-Za-z])[ABCD](?![A-Za-z])')
    options = []
    for llm_answer in llm_answers:
        standalone_letters = standalone_option.findall(llm_answer)
        # `.count` works on both the list of matched letters and the raw string.
        counted = standalone_letters if standalone_letters else llm_answer
        option_frequencies = [counted.count(option) for option in 'ABCD']
        # argmax returns the first maximum, so ties go to the earlier letter.
        most_frequent = int(np.argmax(option_frequencies))
        options.append('ABCD'[most_frequent])
    return options

def option2num(options):
    """Convert option letters to integer labels (A->0, B->1, C->2, D->3).

    The integer labels are what the accuracy evaluation is given. A letter
    outside A-D raises KeyError.
    """
    letter_to_index = {'A': 0, 'B': 1, 'C': 2, 'D': 3}
    return [letter_to_index[letter] for letter in options]

In [66]:
overall_scores = {} # Evaluation results for all models

for model_name in ['qwen2.5-72b-instruct', 'llama-3.3-70b-instruct',]:
    # Accuracy per dataset for this model, in the same order as `ds` / `data_names`.
    scores = []
    print(f'============== {model_name}  ==============')
    # NOTE(review): the loop variable `d` rebinds the notebook-level name `d`.
    for i, d in enumerate(ds):
        print('Data %s has %d questions'%(data_names[i], d.num_rows))
        question_contents = d['question_content']
        llm_answers = get_llm_output_parallel(model_name, question_contents, max_threads=5)
        # Print the raw model outputs before they are reduced to option letters.
        print_llm_outputs(model_name, question_contents, llm_answers, d['answer'])        
        llm_answers = get_options(llm_answers)
        # Both sides are converted from letters to integer labels for the metric.
        acc = accuracy.compute(references=option2num(d['answer']), predictions=option2num(llm_answers)) 
        scores.append(acc['accuracy'])
    overall_scores[model_name] = scores

In [35]:
# One column per model, one row per dataset (same order as data_names).
accuracy_df =  pd.DataFrame(overall_scores)
accuracy_df.index = data_names
accuracy_df

In [36]:
# Horizontal bar chart of the accuracy table: one group per dataset, one bar per model.
accuracy_df.plot.barh()
plt.xlabel('Accuracy')
plt.title('LLM performance in C-Eval benchmark')
plt.show()

In [None]:
#### Your Task ####
# Find another dataset to evaluate two other models of your choice.
# You should choose two models that have different sizes and different performance.
# For this notebook, use only questions whose answers are easy to judge, e.g. multiple-choice questions,
# yes/no questions, or questions with a single-word answer.
# Report at least two metrics to evaluate the models.


In [1]:
# do not forget to clean the gpu memory
# NOTE(review): presumably needed because the perplexity example above loads a
# model (gpt2) - confirm which cells actually allocate GPU memory.
import torch
torch.cuda.empty_cache()

In [2]:
# check the GPU memory utilization
!nvidia-smi