In [1]:
# !pip install evaluate
# 参考1：https://github.com/EleutherAI/lm-evaluation-harness?tab=readme-ov-file#overview
# 参考2：https://github.com/meta-llama/llama-recipes/tree/main/tools/benchmarks/llm_eval_harness
# 参考3：https://github.com/meta-llama/llama-recipes/tree/main/tools/benchmarks/llm_eval_harness/meta_eval
# 参考4：https://huggingface.co/datasets/meta-llama/Llama-3.2-1B-evals

In [1]:
import torch
import torchvision

print("Torch version:", torch.__version__)
print("Torchvision version:", torchvision.__version__)
print(torch.version.cuda)

Torch version: 2.4.0+cu121
Torchvision version: 0.19.0+cu121
12.1


In [2]:
import os
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
plt.style.use('ggplot')

from trl import SFTTrainer
from transformers import TrainingArguments, TextStreamer
from unsloth.chat_templates import get_chat_template
from unsloth import FastLanguageModel
from datasets import Dataset
from unsloth import is_bfloat16_supported

# Saving model
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Warnings
import warnings
warnings.filterwarnings("ignore")

%matplotlib inline

2024-11-22 16:52:01.441874: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-11-22 16:52:01.441904: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-11-22 16:52:01.442382: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-11-22 16:52:01.445540: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.


# 通过lm-evaluation-harness评估LoRA微调

参考https://github.com/meta-llama/llama-recipes/tree/main/tools/benchmarks/llm_eval_harness/meta_eval \
在tools\benchmarks\llm_eval_harness\meta_eval\eval_config.yaml中发现最新的公开config只支持llama3.1，无法复现llama3.2。而llama3.1中最小的是8B模型，该LLM太大，所以没法直接利用上面网址中的方法复现llama3.1。\
我们只能通过lm-evaluation-harness评估LLM，因为缺少config(比如提示词模板)和评估时的各种trick，所以难以复现结果 \
下面在mmlu_abstract_algebra上评估llama-3.2-1B和我们之前上传的微调模型，发现微调后效果变差，符合预期

In [3]:
model, tokenizer = FastLanguageModel.from_pretrained(model_name = "model/1B_finetuned_llama3.2_LoRA",
                                                        max_seq_length = 5020,
                                                        dtype = None,
                                                        load_in_4bit = True)

==((====))==  Unsloth 2024.11.6: Fast Llama patching. Transformers = 4.46.3.
   \\   /|    GPU: NVIDIA GeForce RTX 4080 SUPER. Max memory: 15.992 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.4.0+cu121. CUDA = 8.9. CUDA Toolkit = 12.1.
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.27.post2. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Unsloth 2024.11.6 patched 16 layers with 16 QKV layers, 16 O layers and 16 MLP layers.


In [4]:
os.environ["HF_TOKEN"] = "换成你自己的hugging face token key"
model.push_to_hub("1B_finetuned_llama3.2_LoRA", use_auth_token=os.getenv("HF_TOKEN"))
tokenizer.push_to_hub("1B_finetuned_llama3.2_LoRA", use_auth_token=os.getenv("HF_TOKEN"))

README.md:   0%|          | 0.00/579 [00:00<?, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/45.1M [00:00<?, ?B/s]

Saved model to https://huggingface.co/1B_finetuned_llama3.2_LoRA


tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

In [6]:
!lm_eval --model hf \
    --model_args "pretrained=yuntaozh/1B_finetuned_llama3.2_LoRA" \
    --tasks mmlu \
    --device cuda:0 \
    --batch_size 16

2024-11-22 16:58:13.431719: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-11-22 16:58:13.431767: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-11-22 16:58:13.432247: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-11-22 16:58:13.435350: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-11-22:16:58:14,751 INFO     [__main__.py:272] Ve

# 通过lm-evaluation-harness评估IA3微调

In [11]:
from transformers import AutoModel, AutoTokenizer

model = AutoModel.from_pretrained("model/1B_finetuned_llama3.2_IA3")

tokenizer = AutoTokenizer.from_pretrained("model/1B_finetuned_llama3.2_IA3")

Loading adapter weights from model/1B_finetuned_llama3.2_IA3 led to unexpected keys not found in the model:  ['model.layers.0.mlp.down_proj.ia3_l.default', 'model.layers.0.self_attn.k_proj.ia3_l.default', 'model.layers.0.self_attn.v_proj.ia3_l.default', 'model.layers.1.mlp.down_proj.ia3_l.default', 'model.layers.1.self_attn.k_proj.ia3_l.default', 'model.layers.1.self_attn.v_proj.ia3_l.default', 'model.layers.10.mlp.down_proj.ia3_l.default', 'model.layers.10.self_attn.k_proj.ia3_l.default', 'model.layers.10.self_attn.v_proj.ia3_l.default', 'model.layers.11.mlp.down_proj.ia3_l.default', 'model.layers.11.self_attn.k_proj.ia3_l.default', 'model.layers.11.self_attn.v_proj.ia3_l.default', 'model.layers.12.mlp.down_proj.ia3_l.default', 'model.layers.12.self_attn.k_proj.ia3_l.default', 'model.layers.12.self_attn.v_proj.ia3_l.default', 'model.layers.13.mlp.down_proj.ia3_l.default', 'model.layers.13.self_attn.k_proj.ia3_l.default', 'model.layers.13.self_attn.v_proj.ia3_l.default', 'model.layers.

In [12]:
os.environ["HF_TOKEN"] = "换成你自己的hugging face token key"
model.push_to_hub("1B_finetuned_llama3.2_IA3", use_auth_token=os.getenv("HF_TOKEN"))
tokenizer.push_to_hub("1B_finetuned_llama3.2_IA3", use_auth_token=os.getenv("HF_TOKEN"))

adapter_model.safetensors:   0%|          | 0.00/595k [00:00<?, ?B/s]

README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/yuntaozh/1B_finetuned_llama3.2_IA3/commit/7b012c11090d6043f9d1914900b21c9ce3f33a53', commit_message='Upload tokenizer', commit_description='', oid='7b012c11090d6043f9d1914900b21c9ce3f33a53', pr_url=None, repo_url=RepoUrl('https://huggingface.co/yuntaozh/1B_finetuned_llama3.2_IA3', endpoint='https://huggingface.co', repo_type='model', repo_id='yuntaozh/1B_finetuned_llama3.2_IA3'), pr_revision=None, pr_num=None)

In [13]:
!lm_eval --model hf \
    --model_args "pretrained=yuntaozh/1B_finetuned_llama3.2_IA3" \
    --tasks mmlu \
    --device cuda:0 \
    --batch_size 16

2024-11-22 17:10:11.830788: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-11-22 17:10:11.830838: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-11-22 17:10:11.831362: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-11-22 17:10:11.835466: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-11-22:17:10:13,214 INFO     [__main__.py:272] Ve