In [1]:
# !pip install evaluate
# 参考1：https://github.com/EleutherAI/lm-evaluation-harness?tab=readme-ov-file#overview
# 参考2：https://github.com/meta-llama/llama-recipes/tree/main/tools/benchmarks/llm_eval_harness
# 参考3：https://github.com/meta-llama/llama-recipes/tree/main/tools/benchmarks/llm_eval_harness/meta_eval
# 参考4：https://huggingface.co/datasets/meta-llama/Llama-3.2-1B-evals

In [2]:
import torch
import torchvision

print("Torch version:", torch.__version__)
print("Torchvision version:", torchvision.__version__)
print(torch.version.cuda)

Torch version: 2.4.0+cu121
Torchvision version: 0.19.0+cu121
12.1


In [3]:
import os
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
plt.style.use('ggplot')

from trl import SFTTrainer
from transformers import TrainingArguments, TextStreamer
from unsloth.chat_templates import get_chat_template
from unsloth import FastLanguageModel
from datasets import Dataset
from unsloth import is_bfloat16_supported

# Saving model
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Warnings
import warnings
warnings.filterwarnings("ignore")

%matplotlib inline

2024-11-23 18:36:20.613887: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-11-23 18:36:20.613925: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-11-23 18:36:20.614414: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-11-23 18:36:20.617607: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.


# 可以忽略的内容

In [4]:
# 参考https://huggingface.co/datasets/meta-llama/Llama-3.2-1B-evals
from datasets import load_dataset
data = load_dataset("meta-llama/Llama-3.2-1B-evals",
        name="Llama-3.2-1B-evals__metrics",
        split="latest"
)

In [5]:
benchmark_list=data['benchmark_label']
# 找到所有benchmark_label="MMLU"的下标
indices = [index for index, value in enumerate(benchmark_list) if value == "MMLU"]

In [6]:
# 选取MMLU评测集中第一个评测任务
data[indices[0]]

{'benchmark_label': 'MMLU',
 'metric_tag': 'world_religions/nll_token_target_norm',
 'metric_value_computed': 1.3,
 'metric_type': 'subtask_metric',
 'eval_config': {'top_p': '0',
  'seed': '42',
  'max_gen_len': '0',
  'top_k': '0',
  'temperature': '0.0',
  'prompt_fn': 'functools.partial(<function jinja_format at 0x7f909c2f1480>, \'The following are multiple choice questions (with answers) about {{ subject }}.\\n\\n{% for x in few_shot -%}\\n{{ x["question"] }}\\nA. {{ x["choices"]["A"] }}\\nB. {{ x["choices"]["B"] }}\\nC. {{ x["choices"]["C"] }}\\nD. {{ x["choices"]["D"] }}\\nAnswer: {{ x["answer"] }}\\n\\n{% endfor -%}\\n{{ question }}\\nA. {{ choices["A"] }}\\nB. {{ choices["B"] }}\\nC. {{ choices["C"] }}\\nD. {{ choices["D"] }}\\nAnswer: {{ choice_text }}\', subject=\'world religions\')',
  'num_few_shot': '5',
  'max_prompt_len': '3840',
  'num_generations': None,
  'return_logprobs': None}}

In [7]:
prompt_template=','.join(data[indices[0]]['eval_config']['prompt_fn'].split(',')[1:-1])
prompt_template

' \'The following are multiple choice questions (with answers) about {{ subject }}.\\n\\n{% for x in few_shot -%}\\n{{ x["question"] }}\\nA. {{ x["choices"]["A"] }}\\nB. {{ x["choices"]["B"] }}\\nC. {{ x["choices"]["C"] }}\\nD. {{ x["choices"]["D"] }}\\nAnswer: {{ x["answer"] }}\\n\\n{% endfor -%}\\n{{ question }}\\nA. {{ choices["A"] }}\\nB. {{ choices["B"] }}\\nC. {{ choices["C"] }}\\nD. {{ choices["D"] }}\\nAnswer: {{ choice_text }}\''

In [8]:
# llama3.2在mmlu数据集上用的提示词模板
print(prompt_template.replace("\\n", "\n"))

 'The following are multiple choice questions (with answers) about {{ subject }}.

{% for x in few_shot -%}
{{ x["question"] }}
A. {{ x["choices"]["A"] }}
B. {{ x["choices"]["B"] }}
C. {{ x["choices"]["C"] }}
D. {{ x["choices"]["D"] }}
Answer: {{ x["answer"] }}

{% endfor -%}
{{ question }}
A. {{ choices["A"] }}
B. {{ choices["B"] }}
C. {{ choices["C"] }}
D. {{ choices["D"] }}
Answer: {{ choice_text }}'


# 通过lm-evaluation-harness评估LLM

参考https://github.com/meta-llama/llama-recipes/tree/main/tools/benchmarks/llm_eval_harness/meta_eval \
在tools\benchmarks\llm_eval_harness\meta_eval\eval_config.yaml中发现最新的公开config只支持llama3.1，无法复现llama3.2。而llama3.1中最小的是8B模型，该LLM太大，所以没法直接利用上面网址中的方法复现llama3.1。\
我们只能通过lm-evaluation-harness评估LLM，因为缺少config(比如提示词模板)和评估时的各种trick，所以难以复现结果 \
下面在mmlu_abstract_algebra上评估llama-3.2-1B和我们之前上传的微调模型，发现微调后效果变差，符合预期

In [9]:
# 参考：https://github.com/EleutherAI/lm-evaluation-harness/blob/main/docs/interface.md
!lm_eval --tasks list

2024-11-23 18:36:25.600247: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-11-23 18:36:25.600286: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-11-23 18:36:25.600778: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-11-23 18:36:25.604542: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-11-23:18:36:26,934 INFO     [__main__.py:272] Ve

## HellaSwag

In [10]:
# !lm_eval --model hf \
#     --model_args "pretrained=unsloth/Llama-3.2-1B-bnb-4bit" \
#     --tasks hellaswag \
#     --device cuda:0 \
#     --batch_size 16 \
#     --write_out

In [11]:
# !lm_eval --model hf \
#     --model_args "pretrained=meta-llama/Llama-3.2-1B" \
#     --tasks hellaswag \
#     --device cuda:0 \
#     --batch_size 16

## MMLU

In [12]:
# 因为LoRA微调的base model是量化后的模型，所以我们也要评估量化模型在hellaswag和mmlu上的性能
!lm_eval --model hf \
    --model_args "pretrained=unsloth/Llama-3.2-1B-bnb-4bit" \
    --tasks mmlu \
    --device cuda:0 \
    --batch_size 16 \
    --write_out

2024-11-23 18:36:32.125749: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-11-23 18:36:32.125786: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-11-23 18:36:32.126265: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-11-23 18:36:32.129488: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-11-23:18:36:33,485 INFO     [__main__.py:272] Ve

In [13]:
!lm_eval --model hf \
    --model_args "pretrained=meta-llama/Llama-3.2-1B" \
    --tasks mmlu \
    --device cuda:0 \
    --batch_size 16

2024-11-23 18:38:27.086283: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-11-23 18:38:27.086321: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-11-23 18:38:27.086809: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-11-23 18:38:27.089924: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-11-23:18:38:28,391 INFO     [__main__.py:272] Ve

## arc_challenge

In [14]:
!lm_eval --model hf \
    --model_args "pretrained=unsloth/Llama-3.2-1B-bnb-4bit" \
    --tasks arc_challenge \
    --device cuda:0 \
    --batch_size 16 \
    --write_out

2024-11-23 18:40:20.764580: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-11-23 18:40:20.764618: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-11-23 18:40:20.765138: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-11-23 18:40:20.768508: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-11-23:18:40:22,084 INFO     [__main__.py:272] Ve

In [15]:
!lm_eval --model hf \
    --model_args "pretrained=meta-llama/Llama-3.2-1B" \
    --tasks arc_challenge \
    --device cuda:0 \
    --batch_size 16

2024-11-23 18:40:41.955037: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-11-23 18:40:41.955072: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-11-23 18:40:41.955548: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-11-23 18:40:41.958572: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-11-23:18:40:43,255 INFO     [__main__.py:272] Ve