In [1]:
import pandas as pd

df = pd.read_parquet('/home/bcds/On-the-Fly_MoE_Inference/bagel-v0.5/bagel-clean-v0.5.parquet')

In [10]:
df_train=df.sample(frac=0.95,random_state=200)
df_eval=df.drop(df_train.index)
# df_train.to_json("bagel_train.jsonl", orient="records", lines=False)
df_eval.to_json("bagel_eval.json", orient="records", lines=False)

In [31]:
"""conversation prompt templates"""

import dataclasses
from enum import auto, Enum
from typing import List, Any


class SeparatorStyle(Enum):
    """Different separator style."""

    ADD_COLON_SINGLE = auto()
    ADD_COLON_TWO = auto()
    NO_COLON_SINGLE = auto()
    BAIZE = auto()
    DOLLY = auto()
    RWKV = auto()
IGNORE_TOKEN_ID = -100

@dataclasses.dataclass
class Conversation:
    """A class that keeps all conversation history."""

    # System prompts
    system: str
    # Two roles
    roles: List[str]
    # All messages
    messages: List[List[str]]
    # Offset of few shot examples
    offset: int
    # Separator
    sep_style: SeparatorStyle
    sep: str
    sep2: str = None
    # Stop criteria (the default one is EOS token)
    stop_str: str = None
    # Stops generation if meeting any token in this list
    stop_token_ids: List[int] = None

    # Used for the state in the gradio servers.
    conv_id: Any = None
    skip_next: bool = False
    model_name: str = None

    def get_prompt(self):
        """Get the prompt for generation."""
        if self.sep_style == SeparatorStyle.ADD_COLON_SINGLE:
            ret = self.system + self.sep
            for role, message in self.messages:
                if message:
                    ret += role + ": " + message + self.sep
                else:
                    ret += role + ":"
            return ret
        if self.sep_style == SeparatorStyle.ADD_COLON_TWO:
            seps = [self.sep, self.sep2]
            ret = self.system + seps[0]
            for i, (role, message) in enumerate(self.messages):
                if message:
                    ret += role + ": " + message + seps[i % 2]
                else:
                    ret += role + ":"
            return ret
        raise ValueError(f"Invalid style: {self.sep_style}")

    def append_message(self, role, message):
        """Append a new message."""
        self.messages.append([role, message])

    def to_openai_api_messages(self):
        """Convert the conversation to OpenAI chat completion format."""
        ret = [{"role": "system", "content": self.system}]

        for i, (_, msg) in enumerate(self.messages[self.offset:]):
            if i % 2 == 0:
                ret.append({"role": "user", "content": msg})
            else:
                if msg is not None:
                    ret.append({"role": "assistant", "content": msg})
        return ret

    def copy(self):
        return Conversation(
            system=self.system,
            roles=self.roles,
            messages=[[x, y] for x, y in self.messages],
            offset=self.offset,
            sep_style=self.sep_style,
            sep=self.sep,
            sep2=self.sep2,
            stop_str=self.stop_str,
            stop_token_ids=self.stop_token_ids,
            conv_id=self.conv_id,
            model_name=self.model_name,
        )

    def dict(self):
        return {
            "system": self.system,
            "roles": self.roles,
            "messages": self.messages,
            "offset": self.offset,
            "conv_id": self.conv_id,
            "model_name": self.model_name,
        }


def get_default_conv_template(conv_name):
    return Conversation(
        system="A chat between a curious user and an artificial intelligence assistant. "
        "The assistant gives helpful, detailed, and polite answers to the user's questions.",
        roles=("USER", "ASSISTANT"),
        messages=(),
        offset=0,
        sep_style=SeparatorStyle.ADD_COLON_TWO,
        sep=" ",
        sep2="</s>",
    )
import numpy as np
def preprocess(sources, tokenizer, seq_length):
    """conversation preprocess."""
    conv = get_default_conv_template("vicuna").copy()
    roles = {"human": conv.roles[0], "gpt": conv.roles[1]}
    print(conv.roles[0], conv.roles[1])

    # Apply prompt templates
    conversations = []
    for i, source in enumerate(sources):
        print(source[0].get("from"))
        if roles.get(source[0].get("from")) != conv.roles[0]:
            # Skip the first one if it is not from human
            source = source[1:]

        conv.messages = []
        for j, sentence in enumerate(source):
            role = roles.get(sentence.get("from"))
            if role != conv.roles[j % 2]:
                raise ValueError(f"sources[{i}] is wrong.")
            conv.append_message(role, sentence["value"])
        conversations.append(conv.get_prompt())

    sep = conv.sep + conv.roles[1] + ": "
    # Tokenize conversations
    input_ids = []
    targets = []
    # attention_mask = []
    for conversation in conversations:
        rounds = conversation.split(conv.sep2)
        ids = [tokenizer.bos_token_id]
        mask = [1]
        for i, rou in enumerate(rounds):
            if rou == "":
                break
            conv_out = tokenizer(rou)
            ids.extend(conv_out['input_ids'][1:])
            mask.extend(conv_out['attention_mask'][1:])
        d = {'input_ids': ids, 'attention_mask': mask}
        # pylint: disable=W0212
        d = tokenizer._pad(d, max_length=seq_length, padding_strategy='max_length')
        input_ids.append(d['input_ids'][:seq_length])
        # attention_mask.append(d['attention_mask'])

        target = np.array(d['input_ids'])
        total_len = int(np.not_equal(target, tokenizer.pad_token_id).sum())
        cur_len = 1
        target[:cur_len] = IGNORE_TOKEN_ID
        for i, rou in enumerate(rounds):
            if rou == "":
                break
            parts = rou.split(sep)
            if len(parts) != 2:
                break
            parts[0] += sep
            round_len = len(tokenizer(rou)['input_ids']) - 1
            instruction_len = len(tokenizer(parts[0])['input_ids']) - 3

            target[cur_len : cur_len + instruction_len] = IGNORE_TOKEN_ID

            cur_len += round_len
        target[cur_len:] = IGNORE_TOKEN_ID

        if cur_len < seq_length:
            if cur_len != total_len:
                target[:] = IGNORE_TOKEN_ID
        else:
            target = target[:seq_length]
        targets.append(target.tolist())

    input_ids = np.array(input_ids, dtype=np.int32)
    targets = np.array(targets, dtype=np.int32)

    return dict(
        input_ids=input_ids,
        labels=targets,
    )

class SupervisedDataset:
    """Dataset for supervised fine-tuning."""

    def __init__(self, raw_data, tokenizer, seq_length):
        super(SupervisedDataset, self).__init__()

        sources = [example["conversations"] for example in raw_data][:100]
        data_dict = preprocess(sources, tokenizer, seq_length)

        self.input_ids = data_dict.get("input_ids", None)
        self.labels = data_dict.get("labels", None)

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, i):
        return dict(
            input_ids=self.input_ids[i],
            labels=self.labels[i]
        )

In [None]:
from datasets import load_dataset
from transformers import AutoTokenizer
import json

with open('../path.json', 'r') as f:
    path = json.load(f)
    model_name = path['mixtral']

tokenizer = AutoTokenizer.from_pretrained(model_name)
with open('./bagel_eval.json', 'r') as f:
    raw_data = json.load(f)

bagel = SupervisedDataset(raw_data=raw_data, tokenizer=tokenizer, seq_length=512)

In [33]:
sources = [example["conversations"] for example in raw_data][:100]

In [4]:
import json

# 假设你的JSON文件名为data.json
with open('/home/bcds/On-the-Fly_MoE_Inference/OpenHermes-2.5/openhermes2_5.json', 'r', encoding='utf-8') as file:
    raw_data = json.load(file)



In [7]:
# 处理数据并拼接成指定格式
processed_data = []
data = [example["conversations"] for example in raw_data]
for item in data:
    try:
        # print(item)
        user_message = item[0]['value']
        assistant_message = item[1]['value']
        formatted_message = f"</s>USER: {user_message} </s>ASSISTANT: {assistant_message}"
        processed_data.append({"text": formatted_message})  # 按照要求格式保存
    except:
        continue

# 将处理后的数据保存为新的JSON文件
with open('/home/bcds/On-the-Fly_MoE_Inference/OpenHermes-2.5/processed_data.json', 'w', encoding='utf-8') as output_file:
    json.dump(processed_data, output_file, ensure_ascii=True)  # 保存为JSON格式

print("数据处理完成，已保存为 processed_data.json")

数据处理完成，已保存为 processed_data.json


In [1]:
import json

# 假设你的JSON文件名为data.json
with open('/home/bcds/On-the-Fly_MoE_Inference/bagel-v0.5/bagel_eval.json', 'r', encoding='utf-8') as file:
    raw_data = json.load(file)

# 处理数据并拼接成指定格式
processed_data = []
data = [example["conversations"] for example in raw_data]
for item in data:
    try:
        # print(item)
        user_message = item[0]['value']
        assistant_message = item[1]['value']
        formatted_message = f"</s>USER: {user_message} </s>ASSISTANT: {assistant_message}"
        processed_data.append({"text": formatted_message})  # 按照要求格式保存
    except:
        continue

# 将处理后的数据保存为新的JSON文件
with open('/home/bcds/On-the-Fly_MoE_Inference/bagel-v0.5/processed_data.json', 'w', encoding='utf-8') as output_file:
    json.dump(processed_data, output_file, ensure_ascii=True)  # 保存为JSON格式

print("数据处理完成，已保存为 processed_data.json")

数据处理完成，已保存为 processed_data.json


In [3]:
from datasets import load_dataset

# 加载处理后的 JSON 文件
dataset = load_dataset('json', data_files='/home/bcds/On-the-Fly_MoE_Inference/bagel-v0.5/processed_data.json')

# 查看数据集内容
print(dataset)

Generating train split: 38807 examples [00:01, 26122.88 examples/s]

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 38807
    })
})





In [22]:
# "USER", "ASSISTANT"
roles = {"human": "USER", "gpt": "ASSISTANT"}
# Apply prompt templates
conversations = []
for i, source in enumerate(sources):
    print(source[0].get("from"))
    if roles.get(source[0].get("from")) != "human":
        # Skip the first one if it is not from human
        source = source[1:]

human
human
human
human
human
human
human
human
human
human
human
human
human
human
human
human
human
human
human
human
human
human
human
human
human
human
human
human
human
human
human
human
human
human
human
human
human
human
human
human
human
human
human
human
human
human
human
human
human
human
human
human
human
human
human
human
human
human
human
human
human
human
human
human
human
human
human
human
human
human
human
human
human
human
human
human
human
human
human
human
human
human
human
human
human
human
human
human
human
human
human
human
human
human
human
human
human
human
human
human
human
human
human
human
human
human
human
human
human
human
human
human
human
human
human
human
human
human
human
human
human
human
human
human
human
human
human
human
human
human
human
human
human
human
human
human
human
human
human
human
human
human
human
human
human
human
human
human
human
human
human
human
human
human
human
human
human
human
human
human
human
human
human
human
human
human
huma

TypeError: 'NoneType' object is not subscriptable

In [1]:
! export HF_ENDPOINT="https://hf-mirror.com"

In [2]:
import json
from lm_eval.models.vllm_causallms import VLLM
import os 
os.environ['CUDA_VISIBLE_DEVICES'] = '0,1'
with open('../path.json', 'r') as f:
    path = json.load(f)
    model_name = path['mixtral']

lora_save_path = '/home/lz/On-the-Fly_MoE_Inference/quantize/saved/training/nohqq_noaver/checkpoint-200'
VLLM(pretrained=model_name, lora_local_path=lora_save_path)

  from .autonotebook import tqdm as notebook_tqdm


KeyboardInterrupt: 

In [1]:
avg_list = [1,2,3,4]

print('Average Sparsity: ', f'{sum(avg_list)/len(avg_list):.4f}')
print('Max Sparsity: {:.4f}'.format(max(avg_list)))
print('Min Sparsity: {:.4f}'.format(min(avg_list)))

Average Sparsity:  2.5000
Max Sparsity: 4.0000
Min Sparsity: 1.0000


In [1]:
import torch
from modeling_mixtral import set_profile_mode, load_thresholds
from utils import myevaluate, get_model
import json 
import argparse
from peft import PeftModelForCausalLM

# def doeval(dtype, lora_save_path, args):
dtype = torch.float16
threshold_path_name='chess_up_threshold'
use_average = True
lora_save_path = '/home/lz/On-the-Fly_MoE_Inference/quantize/saved/training/nohqq/checkpoint-200'
with open('../path.json', 'r') as f:
    path = json.load(f)
    model_name = path['mixtral']
    threshold_path = path[threshold_path_name]

with open('./device_map_1.json', 'r') as f:
    device_map = json.load(f)

## 开启稀疏模式
set_profile_mode(False)
load_thresholds(f'{threshold_path}/thresholds_0_8.pt', use_average=use_average)	
llm, tokenizer = get_model(model_name, device_map, dtype=dtype)



  from .autonotebook import tqdm as notebook_tqdm


Set profile_threshold to False
Thresholds loaded from /home/lz/On-the-Fly_MoE_Inference/saving/threshold/c4_mixtral_up/thresholds_0_8.pt


Loading checkpoint shards: 100%|██████████| 19/19 [00:24<00:00,  1.27s/it]


AttributeError: 'MixtralForCausalLM' object has no attribute '_split_kwargs'

In [2]:

PeftModelForCausalLM.from_pretrained(llm, lora_save_path, 'default')

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): MixtralForCausalLM(
      (model): MixtralModel(
        (embed_tokens): Embedding(32000, 4096)
        (layers): ModuleList(
          (0-31): 32 x MixtralDecoderLayer(
            (self_attn): MixtralSdpaAttention(
              (q_proj): lora.Linear(
                (base_layer): Linear(in_features=4096, out_features=4096, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.01, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=4096, out_features=64, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=64, out_features=4096, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
              )
              (k_proj): 

In [11]:
for name,para in llm.base_model.layers[0].self_attn.q_proj.lora_A.named_parameters():
    print(name, para)
    break

default.weight Parameter containing:
tensor([[-0.0019, -0.0103,  0.0009,  ...,  0.0043,  0.0092, -0.0057],
        [ 0.0047,  0.0046, -0.0017,  ...,  0.0062,  0.0033, -0.0004],
        [-0.0011,  0.0035, -0.0030,  ..., -0.0140, -0.0139,  0.0035],
        ...,
        [-0.0109,  0.0147,  0.0099,  ..., -0.0129,  0.0135,  0.0080],
        [ 0.0067, -0.0071, -0.0059,  ..., -0.0132, -0.0062, -0.0133],
        [ 0.0104,  0.0120,  0.0111,  ...,  0.0022,  0.0015,  0.0138]],
       device='cuda:0')


In [None]:
import torch
lora = torch.load('/home/lz/On-the-Fly_MoE_Inference/quantize/saved/training/test/checkpoint-300_lora_combine.pt')
print(lora['parameters'])

### 训练

In [1]:
import torch
import os
import sys
os.environ["CUDA_VISIBLE_DEVICES"] = "4,5,6"
import transformers
from modeling_mixtral import set_profile_mode, load_thresholds
import json
from utils import get_model, CompensatedModel
from hqq.core.quantize import *
from hqq.models.hf.mixtral import MixtralHQQ
from hqq.core.peft import PeftUtils
from datasets import load_dataset, Dataset
import functools

# # 加载 C4 数据集的验证集
with open('../path.json', 'r') as file:
    paths = json.load(file)
    fineweb_path = paths.get('fineweb', '')
    model_name = paths.get('mixtral','')
    threshold_path = paths.get('chess_up_sparsity_threshold','')

with open('./device_map.json', 'r') as f:
    device_map = json.load(f)

set_profile_mode(False)
load_thresholds(f'{threshold_path}/thresholds_0_8.pt')
dtype = torch.bfloat16
print('using ',dtype)
llm, tokenizer = get_model(model_name, device_map, dtype=dtype)

q4_config    = BaseQuantizeConfig(nbits=8, group_size=64) 
q3_config    = BaseQuantizeConfig(nbits=2, group_size=64)

quant_config      = {'block_sparse_moe.experts.w3'   : q3_config}
MixtralHQQ.quantize_model(llm, quant_config=quant_config, compute_dtype=dtype, device=device_map)

base_lora_params = {'lora_type':'default', 'r':128, 'lora_alpha':128, 'dropout':0.05, 'train_dtype':dtype}

lora_params      = {'self_attn.q_proj': base_lora_params,
                    'self_attn.k_proj': base_lora_params,
                    'self_attn.v_proj': base_lora_params,
                    'self_attn.o_proj': base_lora_params,
                    'block_sparse_moe.experts.w1'   : base_lora_params,
                    'block_sparse_moe.experts.w3'   : base_lora_params,
                    'block_sparse_moe.experts.w2'   : base_lora_params}


PeftUtils.add_lora(llm, lora_params)
lora = '/home/lz/On-the-Fly_MoE_Inference/quantize/saved/training/test/checkpoint-300_lora_combine.pt'
PeftUtils.load_lora_weights(llm, lora)
class CompensatedModel(torch.nn.Module):
    def __init__(self, model, path, layerid, expertid):
        super(CompensatedModel, self).__init__()
        self.model = model
        self.A = torch.load(path + f'A_{layerid}_{expertid}.pt').to(dtype)
        self.B_prime = torch.load(path + f'B_prime_{layerid}_{expertid}.pt').to(dtype)

    def forward(self, input_ids):
        outputs = self.model(input_ids)
        residual = (input_ids @ self.A.T) @ self.B_prime.T
        outputs += residual
    
        return outputs
for i in range(32):
    if i == 31:
        print(f"Layer {i} done...")
    for j in range(8):
        llmdevice = llm.model.layers[i].block_sparse_moe.experts[j].w3.linear_layer.device
        llm.model.layers[i].block_sparse_moe.experts[j].w3.linear_layer = \
        CompensatedModel(llm.model.layers[i].block_sparse_moe.experts[j].w3.linear_layer, '/home/lz/On-the-Fly_MoE_Inference/quantize/saved/eora/', layerid=i, expertid=j).to(llmdevice)

  from .autonotebook import tqdm as notebook_tqdm


Set profile_threshold to False
Thresholds loaded from /home/lz/On-the-Fly_MoE_Inference/quantize/threshold/c4_mixtral_up/thresholds_0_8.pt
using  torch.bfloat16


Loading checkpoint shards: 100%|██████████| 19/19 [00:34<00:00,  1.83s/it]
100%|██████████| 32/32 [00:00<00:00, 557.99it/s]
100%|██████████| 32/32 [00:15<00:00,  2.03it/s]
100%|██████████| 32/32 [00:13<00:00,  2.46it/s]
100%|██████████| 32/32 [00:00<00:00, 33.41it/s]


Layer 31 done...


In [4]:
llm.model.layers[0].block_sparse_moe.experts[0].w1

HQQLinearLoRA(
  (linear_layer): Linear(in_features=4096, out_features=14336, bias=False)
  (peft_drop): Dropout(p=0.05, inplace=False)
)

In [3]:
import json
from datasets import load_dataset, Dataset, concatenate_datasets
import functools

def preprocess_data(batch, tokenizer):
    # 使用 tokenizer 将文本数据转换为模型输入
    inputs = tokenizer(batch['text'], padding="max_length", truncation=True, max_length=512, return_tensors="pt")
    inputs["labels"] = inputs.input_ids.clone()
    return inputs

with open('../path.json', 'r') as file:
    paths = json.load(file)
    fineweb_path = paths.get('fineweb', '')
openmath = load_dataset("/home/lz/web-math/",data_files="/home/lz/web-math/openmath1.json")
fineweb = load_dataset(fineweb_path)
openmath_text = openmath['train']['text'][:4000] 
fineweb_text = fineweb['train']['text'][:12000]


Repo card metadata block was not found. Setting CardData to empty.


21411

In [6]:
test_num = 0.1
seed = 42

combined_text = openmath_text + fineweb_text
combined_dataset = Dataset.from_dict({"text": combined_text})
combined_train = combined_dataset.train_test_split(test_size=test_num, seed=seed)
train_data = combined_train['train']
test_data = combined_train['test']

new_train_data = train_data.map(
    functools.partial(
    preprocess_data,
    tokenizer=tokenizer
), batched=True)
new_test_data = test_data.map(
    functools.partial(
    preprocess_data,
    tokenizer=tokenizer
), batched=True)
new_train_data.shuffle(seed)
new_test_data.shuffle(seed)

NameError: name 'tokenizer' is not defined

In [None]:
from hqq.core.peft import PeftUtils
from transformers import AutoTokenizer, BitsAndBytesConfig, AdamW
from transformers import (
    DataCollatorForSeq2Seq,
    Trainer,
    TrainingArguments
)

class CustomTrainer(transformers.Trainer):
    def save_model(self, output_dir=None, _internal_call=False):
        # 如果没有指定output_dir，则使用训练参数中的输出目录
        if output_dir is None:
            output_dir = self.args.output_dir #这里的args不是该脚本的输入，而是TrainerArgs

        # 确保输出目录存在
        os.makedirs(output_dir, exist_ok=True)

        # 保存完整的模型参数
        # torch.save(self.model.state_dict(), os.path.join(output_dir, 'pytorch_model.bin'))
        
        self.model.eval()
        PeftUtils.cast_lora_weights(self.model, dtype=torch.float16)

        #Save LoRA weights
        PeftUtils.save_lora_weights(self.model, output_dir+'_lora_combine.pt')

        PeftUtils.cast_lora_weights(self.model, dtype=torch.bfloat16)
        self.model.train()

        # 保存配置文件和tokenizer
        self.model.config.save_pretrained(output_dir)
        if self.tokenizer is not None:
            self.tokenizer.save_pretrained(output_dir)

model_save_path='./saved/training/less2'
learning_rate = 1e-4
micro_batch_size=8
epochs=2
save_steps = 5
save_total_limit = 6
sample_num = len(new_train_data)
optimizer=AdamW(filter(lambda p : p.requires_grad, llm.parameters()),lr=learning_rate)
linear_scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=(sample_num*epochs) // micro_batch_size)
args = TrainingArguments(
    output_dir=model_save_path,
    num_train_epochs=epochs,
    # max_steps=opt.max_steps,
    # fp16=True,
    bf16=True,
    optim="adamw_torch",# paged_adamw_8bit
    learning_rate=learning_rate,
    lr_scheduler_type="cosine",
    per_device_train_batch_size=micro_batch_size,
    gradient_accumulation_steps=1,
    gradient_checkpointing=False,   ### 先设置成False
    group_by_length=False,
    logging_steps=50,
    eval_steps=50,
    save_strategy="steps",
    save_only_model=True,
    save_steps=save_steps,
    save_total_limit=save_total_limit,
    disable_tqdm=False,
    report_to='tensorboard',
    logging_dir='/home/lz/On-the-Fly_MoE_Inference/quantize/saved/logs/'
)

trainer = CustomTrainer(
    model=llm,
    train_dataset=new_train_data.select(range(30)),
    eval_dataset=new_test_data.select(range(4)),
    args=args,
    optimizers=(optimizer, linear_scheduler),
    data_collator=DataCollatorForSeq2Seq(
    tokenizer, pad_to_multiple_of=8, return_tensors="pt", padding=True),
)

# silence the warnings. re-enable for inference!
llm.config.use_cache = False
trainer.train()



Step,Training Loss


100%|██████████| 32/32 [00:00<00:00, 1097.59it/s]
100%|██████████| 32/32 [00:00<00:00, 3088.38it/s]
100%|██████████| 32/32 [00:00<00:00, 736.16it/s]
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
100%|██████████| 32/32 [00:00<00:00, 1061.88it/s]
100%|██████████| 32/32 [00:00<00:00, 2894.18it/s]
100%|██████████| 32/32 [00:00<00:00, 382.07it/s]
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.


TrainOutput(global_step=8, training_loss=1.8526314496994019, metrics={'train_runtime': 46.1998, 'train_samples_per_second': 1.299, 'train_steps_per_second': 0.173, 'total_flos': 6860099687546880.0, 'train_loss': 1.8526314496994019, 'epoch': 2.0})

### 加载模型

In [1]:
import torch
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "4,5,6"
from modeling_mixtral import MixtralForCausalLM, set_profile_mode, load_thresholds
import json
from utils import get_model

# # 加载 C4 数据集的验证集
with open('../path.json', 'r') as file:
    paths = json.load(file)
    fineweb_path = paths.get('c4', '')
    model_name = paths.get('mixtral','')
    threshold_path = paths.get('mixtral_threshold','')

with open('./device_map.json', 'r') as f:
    device_map = json.load(f)

set_profile_mode(True)
sparsity_level=0.8
filepath = str(sparsity_level).replace('.', '_')
load_thresholds(f'{threshold_path}/thresholds_{filepath}.pt', use_average=False)
llm, tokenizer = get_model(model_name, device_map)
# %%
#Quantize
from hqq.core.quantize import *
q4_config    = BaseQuantizeConfig(nbits=8, group_size=64) 
q3_config    = BaseQuantizeConfig(nbits=2, group_size=64)

quant_config = {
  'block_sparse_moe.experts.w3'  :q3_config,
}
from hqq.models.hf.mixtral import MixtralHQQ
MixtralHQQ.quantize_model(llm, quant_config=quant_config, compute_dtype=torch.float16, device=device_map)

  from .autonotebook import tqdm as notebook_tqdm
2025-01-27 10:05:09,569	INFO util.py:154 -- Missing packages: ['ipywidgets']. Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.


Set profile_threshold to True


  up_th = torch.load(threshold_path, map_location='cuda')["up_proj_states_thresholds"]


Thresholds loaded from /home/bcds/On-the-Fly_MoE_Inference/saving/threshold/c4_mixtral/thresholds_0_8.pt


Loading checkpoint shards: 100%|██████████| 19/19 [00:12<00:00,  1.51it/s]
100%|██████████| 32/32 [00:00<00:00, 2355.44it/s]
100%|██████████| 32/32 [00:09<00:00,  3.49it/s]


MixtralForCausalLM(
  (model): MixtralModel(
    (embed_tokens): Embedding(32000, 4096)
    (layers): ModuleList(
      (0-31): 32 x MixtralDecoderLayer(
        (self_attn): MixtralSdpaAttention(
          (q_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear(in_features=4096, out_features=1024, bias=False)
          (v_proj): Linear(in_features=4096, out_features=1024, bias=False)
          (o_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): MixtralRotaryEmbedding()
        )
        (block_sparse_moe): MixtralSparseMoeBlock(
          (gate): Linear(in_features=4096, out_features=8, bias=False)
          (experts): ModuleList(
            (0-7): 8 x MixtralBlockSparseTop2MLP(
              (w1): Linear(in_features=4096, out_features=14336, bias=False)
              (w2): Linear(in_features=14336, out_features=4096, bias=False)
              (w3): HQQLinear(in_features=4096, out_features=14336, bias=False)
 

加载lora

In [None]:
from hqq.core.peft import PeftUtils
PeftUtils.load_lora_weights(llm, '/home/lz/On-the-Fly_MoE_Inference/quantize/saved/training/less2/checkpoint-750_lora_combine.pt')

  0%|          | 0/929 [00:00<?, ?it/s]

100%|██████████| 929/929 [00:09<00:00, 97.35it/s] 
100%|██████████| 929/929 [00:00<00:00, 159327.30it/s]


加载数据集

In [2]:
# %%
from datasets import load_dataset
def preprocess_data(batch):
    # 使用 tokenizer 将文本数据转换为模型输入
    inputs = tokenizer(batch['text'], padding="max_length", truncation=True, max_length=128, return_tensors="pt")
    inputs["labels"] = inputs.input_ids.clone()
    return inputs

# 定义一个函数来选择特征并丢弃不需要的
def select_features(example):
    return {
        'input_ids': example['input_ids'],
        'attention_mask': example['attention_mask'],
        'labels': example['labels']
    }

tokenizer.pad_token = tokenizer.eos_token

with open('../path.json', 'r') as file:
    paths = json.load(file)
    c4_path = paths.get('c4', '')
c4 = load_dataset(c4_path)
# 对数据集进行预处理
c4_dataset = c4.map(preprocess_data, batched=True)
# c4_dataset = c4_dataset.map(select_features, batched=True)
c4_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
# c4_dataset
top_four_thousand_data = c4_dataset['validation'].select(range(10000))

import numpy as np

def set_seed(seed):
    np.random.seed(seed)
    torch.manual_seed(seed)

from torch.utils.data import DataLoader
from tqdm import tqdm
set_seed(42)

# 定义数据加载器
batch_size = 8
# dataloader = DataLoader(c4_dataset['validation'], batch_size=batch_size)
dataloader = DataLoader(top_four_thousand_data, batch_size=batch_size)
# %%

Map:   0%|          | 0/364608 [00:00<?, ? examples/s]

Map: 100%|██████████| 364608/364608 [01:03<00:00, 5776.10 examples/s]


In [3]:
import torch
import os

llm_base = MixtralForCausalLM.from_pretrained(
    model_name,
    device_map='cpu',
    use_cache=True,
    torch_dtype=torch.float16,
    # attn_implementation="flash_attention_2"
) 

Loading checkpoint shards: 100%|██████████| 19/19 [00:09<00:00,  2.10it/s]


### eora恢复

直接从文件中读取

In [None]:
import torch

class CompensatedModel(torch.nn.Module):
    def __init__(self, model, path, layerid, expertid):
        super(CompensatedModel, self).__init__()
        self.model = model
        ### self.A and self.B_prime are initialized as the values loaded from the file
        self.A = torch.load(path + f'A_{layerid}_{expertid}.pt').to(torch.float16)
        self.B_prime = torch.load(path + f'B_prime_{layerid}_{expertid}.pt').to(torch.float16)
        

    def forward(self, input_ids):
        outputs = self.model(input_ids)
        residual = (input_ids @ self.A.T) @ self.B_prime.T
        outputs += residual
    
        return outputs

for i in range(32):
    print(f"Layer {i} done...")
    for j in range(8):
        llmdevice = llm.model.layers[i].block_sparse_moe.experts[j].w3.device
        llm.model.layers[i].block_sparse_moe.experts[j].w3 = \
        CompensatedModel(llm.model.layers[i].block_sparse_moe.experts[j].w3, '/home/lz/On-the-Fly_MoE_Inference/quantize/saved/', layerid=i, expertid=j).to(llmdevice)
        

第一次计算

In [4]:
# %%
def profle_svdllm(name, model, calib_loader, dev):
    # model.to(dev)
    if "llama" in name or "mixtral" in name or "vicuna" in name:
        layers = model.model.layers
    print("Start obtaining the whitening matrix...")
    def hook(module, input, output):
        inp = input[0].detach().float()
        if inp.dim() == 2:   # for opt
            inp = inp.unsqueeze(0)
        adds = torch.matmul(inp.transpose(1,2), inp)
        adds_sum = torch.sum(adds, dim=0)
        module.raw_scaling_diag_matrix += adds_sum
        del inp, adds, adds_sum
        torch.cuda.empty_cache()
    for name, module in model.named_modules():
        if "w3" in name:
            # print(name)
            module.raw_scaling_diag_matrix = 0
            module.register_forward_hook(hook)
            
    for batch in tqdm(calib_loader):
        inputs = batch['input_ids'].to(llm.device)
        model(inputs)
    for name, module in model.named_modules():
        if "w3" in name:
            module._forward_hooks.clear()
            # print(module.raw_scaling_diag_matrix)
    torch.cuda.empty_cache()

    profiling_mat = {}
    print("Start Cholesky Decomposition...")
    
    layer_profile = {}
    for name, module in model.named_modules():
        if "w3" in name:
            covariance = module.raw_scaling_diag_matrix.double().to(dev)
            if not torch.allclose(covariance, covariance.t(), atol=1e-6):
                raise ValueError("Covariance matrix is not symmetric.")
                    # Perform eigen decomposition
            Lambda, Q = torch.linalg.eigh(covariance, UPLO='U')
            if torch.isnan(Lambda).any() or torch.isinf(Lambda).any():
                raise ValueError("Lambda contains NaN or Inf values.")

            # 检查 Lambda 是否包含负值
            if (Lambda < 0).any():
                print("Lambda contains negative values. Clamping to zero.")
                eigenvalues = torch.linalg.eigvalsh(covariance)
                covariance += (- eigenvalues[0] + 2e-6) * torch.eye(covariance.shape[0]).cuda()
                Lambda, Q = torch.linalg.eigh(covariance, UPLO='U')
                print(f"Lambda min: {Lambda.min().item()}, Lambda max: {Lambda.max().item()}")
            # 现在进行平方根操作
            Lambda_diag = torch.diag(torch.sqrt(Lambda))
            # Sort eigenvalues and eigenvectors in descending order
            indices = torch.argsort(Lambda, descending=True)
            Lambda = Lambda[indices]
            Q = Q[:, indices]

            # Compute Q_prime = Q * sqrt(Lambda)
            Lambda_diag = torch.diag(torch.sqrt(Lambda))
            Q_prime = torch.matmul(Q, Lambda_diag)
            layer_profile[name] = Q_prime.cpu()
            profiling_mat[name] = layer_profile
    return profiling_mat

profiling_mat=profle_svdllm("mixtral", llm, dataloader, "cuda")


Start obtaining the whitening matrix...


100%|██████████| 1250/1250 [57:54<00:00,  2.78s/it]


Start Cholesky Decomposition...


In [5]:
class CompensatedModel(torch.nn.Module):
    def __init__(self, model, B_prime, A):
        super(CompensatedModel, self).__init__()
        self.model = model
        self.B_prime = torch.nn.Parameter(torch.tensor(B_prime)).to(torch.float16)
        self.A = torch.nn.Parameter(torch.tensor(A)).to(torch.float16)
        # print(self.A.shape,self.B_prime.shape)
    def forward(self, input_ids):
        outputs = self.model(input_ids)
        residual = (input_ids @ self.A.T) @ self.B_prime.T
        torch.add(outputs, residual, out = outputs)
    
        return outputs
    
for i in range(32):
    print(f"Layer {i} done...")
    for j in range(8):
        llmdevice = llm.model.layers[i].block_sparse_moe.experts[j].w3.device
        Delta_W = llm_base.model.layers[i].block_sparse_moe.experts[j].w3.weight.to(llmdevice) - llm.model.layers[i].block_sparse_moe.experts[j].w3.dequantize()
        Q_prime = profiling_mat[f"model.layers.{i}.block_sparse_moe.experts.{j}.w3"][f"model.layers.{i}.block_sparse_moe.experts.{j}.w3"].cuda().float()
        Delta_W_prime =  Delta_W.to(torch.float32).to(llmdevice) @ Q_prime.to(torch.float32).to(llmdevice)
        llm_base.model.layers[i].block_sparse_moe.experts[j].w3.cpu()
        # 步骤5: 进行SVD分解并取前r个奇异值
        rank = 64  # 设置 desired rank
        U_prime, Sigma_prime, V_prime = torch.linalg.svd(Delta_W_prime, full_matrices=False)
        U_prime = U_prime[:, :rank]
        Sigma_prime = Sigma_prime[:rank]
        V_prime = V_prime[:rank, :]

        B_prime = U_prime @ torch.diag(Sigma_prime)
        A_prime = V_prime

        # 步骤6: 投影回原空间
        A = A_prime.to(llmdevice) @ torch.linalg.inv(Q_prime).to(llmdevice)
        llm.model.layers[i].block_sparse_moe.experts[j].w3 = CompensatedModel(llm.model.layers[i].block_sparse_moe.experts[j].w3, B_prime, A).to(llmdevice)
        torch.save(B_prime, f"./saved/B_prime_{i}_{j}.pt")
        torch.save(A, f"./saved/A_{i}_{j}.pt")

del llm_base

Layer 0 done...


  self.B_prime = torch.nn.Parameter(torch.tensor(B_prime)).to(torch.float16)
  self.A = torch.nn.Parameter(torch.tensor(A)).to(torch.float16)


Layer 1 done...
Layer 2 done...
Layer 3 done...
Layer 4 done...
Layer 5 done...
Layer 6 done...
Layer 7 done...
Layer 8 done...
Layer 9 done...
Layer 10 done...
Layer 11 done...
Layer 12 done...
Layer 13 done...
Layer 14 done...
Layer 15 done...
Layer 16 done...
Layer 17 done...
Layer 18 done...
Layer 19 done...
Layer 20 done...
Layer 21 done...
Layer 22 done...
Layer 23 done...
Layer 24 done...
Layer 25 done...
Layer 26 done...
Layer 27 done...
Layer 28 done...
Layer 29 done...
Layer 30 done...
Layer 31 done...


### threshold

In [None]:
import torch
import numpy as np
datasets = torch.load('../saving/threshold/chess/datasets.pt')
set_profile_mode(True)
def get_batch(data, batch_size, block_size):
    start_idxs = torch.randint(len(data) - block_size, (batch_size,))
    x = torch.stack([torch.from_numpy((data[i:i+block_size]).astype(np.int64)) for i in start_idxs])
    y = torch.stack([torch.from_numpy((data[i+1:i+1+block_size]).astype(np.int64)) for i in start_idxs])
    return x, y

Set profile_threshold to True


In [None]:
sparsity_level = 0.8
# device = 'cuda:1'
device_2 = 'cpu'
avg_loss = 0.0
n_batch = 64 * 2
# accum_steps = 4 
accum_steps = 2
batch_size = 1
block_size = 2048
torch.manual_seed(42)

model = llm

n_layers = len(model.model.layers)
n_experts = len(model.model.layers[0].block_sparse_moe.experts)

up_proj_states_thresholds = [torch.zeros([n_experts,]) for _ in range(n_layers)]
gate_proj_states_mean_squares = [[torch.zeros(model.config.intermediate_size) for _ in range(n_experts)] for _ in range(n_layers)]

up_states = [[torch.zeros([accum_steps * batch_size * block_size //2, model.config.intermediate_size]) for _ in range(n_experts)] for _ in range(n_layers)]
gate_states = [[torch.zeros([accum_steps * batch_size * block_size //2, model.config.intermediate_size]) for _ in range(n_experts)] for _ in range(n_layers)]

with torch.no_grad():
    for step in range(n_batch // accum_steps):
        print(step * accum_steps)
        all_counts = [0 for _ in range(n_layers * n_experts)]
        for batch_idx in range(accum_steps):
            # print('batch_idx:', batch_idx)
            inputs, labels = get_batch(datasets['validation'], batch_size, block_size)
            inputs = inputs.cuda()
            outputs = model(inputs, labels=inputs)
            avg_loss = avg_loss + outputs.loss / n_batch

            for layer_idx in range(n_layers):
                for expert_idx in range(n_experts):
                    counts = all_counts[layer_idx * n_experts + expert_idx]

                    states = model.model.layers[layer_idx].block_sparse_moe.experts[expert_idx].up_proj_states.reshape(-1, model.config.intermediate_size)
                    cur_counts = states.size(0)
                    # print('counts and cur_counts:',counts, cur_counts)
                    # print(states.size())
                    # print(up_states[layer_idx][expert_idx][counts : counts+cur_counts, :].size())
                    up_states[layer_idx][expert_idx][counts : counts+cur_counts, :] = states

                    states = model.model.layers[layer_idx].block_sparse_moe.experts[expert_idx].gate_proj_states.reshape(-1, model.config.intermediate_size)
                    gate_states[layer_idx][expert_idx][counts : counts+cur_counts, :] = states
                    # counts += cur_counts
                    all_counts[layer_idx * n_experts + expert_idx] += cur_counts

        for layer_idx in range(n_layers):   
            for expert_idx in range(n_experts):
                # print('layer_idx:', layer_idx, 'expert_idx:', expert_idx)
                useful_num = all_counts[layer_idx * n_experts + expert_idx]
                topk_num = int(useful_num * model.config.intermediate_size * sparsity_level)
                up_proj_states_thresholds[layer_idx][expert_idx] += up_states[layer_idx][expert_idx][0:useful_num,:].to(device_2).abs().flatten().kthvalue(topk_num).values.to('cpu')
                gate_proj_states_mean_squares[layer_idx][expert_idx] += (torch.sum(gate_states[layer_idx][expert_idx][0:useful_num,:].to(dev
                
                
                ice_2) ** 2, dim=0).to('cpu') / useful_num).to('cpu')

for layer_idx in range(n_layers):
    for expert_idx in range(n_experts):
        gate_proj_states_mean_squares[layer_idx][expert_idx] /= n_batch // accum_steps
        up_proj_states_thresholds[layer_idx][expert_idx] /= n_batch // accum_steps

0
2
4
6
8
10
12
14
16
18
20
22
24
26
28
30
32
34
36
38
40
42
44
46
48
50
52
54
56
58
60
62
64
66
68
70
72
74
76
78
80
82
84
86
88
90
92
94
96
98
100
102
104
106
108
110
112
114
116
118
120
122
124
126


In [None]:
importance_thresholds = [torch.zeros([n_experts,]) for _ in range(n_layers)]
up_proj_states_thresholds_2 = [[torch.zeros(model.config.intermediate_size) for _ in range(n_experts)] for _ in range(n_layers)]

with torch.no_grad():
    for step in range(n_batch // accum_steps):
        print(step * accum_steps)
        all_counts = [0 for _ in range(n_layers * n_experts)]
        for batch_idx in range(accum_steps):
            inputs, labels = get_batch(datasets['validation'], batch_size, block_size)
            inputs = inputs.cuda()
            outputs = model(inputs, labels=inputs)
            avg_loss = avg_loss + outputs.loss / n_batch

            for layer_idx in range(n_layers):
                for expert_idx in range(n_experts):
                    counts = all_counts[layer_idx * n_experts + expert_idx]
                    states = model.model.layers[layer_idx].block_sparse_moe.experts[expert_idx].up_proj_states.reshape(-1, states.size(-1))
                    cur_counts = states.size(0)
                    up_states[layer_idx][expert_idx][counts:cur_counts+counts, :] = states
                    # counts += cur_counts
                    all_counts[layer_idx * n_experts + expert_idx] += cur_counts
                
        for layer_idx in range(n_layers):   
            for expert_idx in range(n_experts):
                useful_num = all_counts[layer_idx * n_experts + expert_idx]
                importance_scores = up_states[layer_idx][expert_idx][:useful_num,:] ** 2 * gate_proj_states_mean_squares[layer_idx][expert_idx]
                importance_thresholds[layer_idx][expert_idx] += importance_scores.to(device_2).flatten().kthvalue(int(importance_scores.numel() * sparsity_level)).values.to('cpu')

for layer_idx in range(n_layers):
    for expert_idx in range(n_experts):
        importance_thresholds[layer_idx][expert_idx] /= n_batch // accum_steps
        up_proj_states_thresholds_2[layer_idx][expert_idx] = (importance_thresholds[layer_idx][expert_idx].expand_as(up_proj_states_thresholds_2[layer_idx][expert_idx]) / gate_proj_states_mean_squares[layer_idx][expert_idx]) ** 0.5

thresholds = {'up_proj_states_thresholds': up_proj_states_thresholds, 'up_proj_states_thresholds_2': up_proj_states_thresholds_2}

0
2
4
6
8
10
12
14
16
18
20
22
24
26
28
30
32
34
36
38
40
42
44
46
48
50
52
54
56
58
60
62
64
66
68
70
72
74
76
78
80
82
84
86
88
90
92
94
96
98
100
102
104
106
108
110
112
114
116
118
120
122
124
126


In [None]:
save_path = './threshold/training_up'

sp = str(sparsity_level).replace('.', '_')
print('save in:', save_path)
torch.save(thresholds, f'{save_path}/thresholds_{sp}.pt')

save in: ./threshold/training_up


### C4

In [9]:
# %%
from datasets import load_dataset
def preprocess_data(batch):
    # 使用 tokenizer 将文本数据转换为模型输入
    inputs = tokenizer(batch['text'], padding="max_length", truncation=True, max_length=256, return_tensors="pt")
    inputs["labels"] = inputs.input_ids.clone()
    return inputs

# 定义一个函数来选择特征并丢弃不需要的
def select_features(example):
    return {
        'input_ids': example['input_ids'],
        'attention_mask': example['attention_mask'],
        'labels': example['labels']
    }

tokenizer.pad_token = tokenizer.eos_token

# with open('../path.json', 'r') as file:
#     paths = json.load(file)
#     c4_path = paths.get('c4', '')
# c4 = load_dataset(c4_path)
# # 对数据集进行预处理
c4_dataset = c4.map(preprocess_data, batched=True)
# c4_dataset = c4_dataset.map(select_features, batched=True)
c4_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
# # c4_dataset
top_four_thousand_data = c4_dataset['validation'].select(range(4000))

import numpy as np

def set_seed(seed):
    np.random.seed(seed)
    torch.manual_seed(seed)

from torch.utils.data import DataLoader
from tqdm import tqdm
set_seed(42)

# 定义数据加载器
batch_size = 8
# dataloader = DataLoader(c4_dataset['validation'], batch_size=batch_size)
dataloader = DataLoader(top_four_thousand_data, batch_size=batch_size)
# %%

Map: 100%|██████████| 364608/364608 [01:30<00:00, 4040.35 examples/s]


In [10]:
from tqdm import tqdm
# 计算评估损失
total_loss = 0.0
num_batches = 0

for batch in tqdm(dataloader):
    input_ids = batch['input_ids'].to(llm.device)
    attention_mask = batch['attention_mask'].to(llm.device)
    labels = batch['labels'].to(llm.device)
    
    # 禁用梯度计算
    with torch.no_grad():
        outputs = llm(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()
        num_batches += 1
        if num_batches % 100 == 0:
            print(f"[{num_batches}], Eval Loss: {total_loss / (num_batches)}")

# 计算平均损失
eval_loss = total_loss / num_batches
print(f"Eval Loss: {eval_loss}")

 20%|██        | 100/500 [02:08<07:53,  1.18s/it]

[100], Eval Loss: 2.7280060803890227


 40%|████      | 200/500 [04:05<05:53,  1.18s/it]

[200], Eval Loss: 2.7090926414728163


 60%|██████    | 300/500 [05:57<03:55,  1.18s/it]

[300], Eval Loss: 2.7416156327724455


 80%|████████  | 400/500 [07:54<01:52,  1.13s/it]

[400], Eval Loss: 2.7384608909487724


100%|██████████| 500/500 [09:52<00:00,  1.18s/it]

[500], Eval Loss: 2.7316660568714144
Eval Loss: 2.7316660568714144





In [None]:
llm.model.layers[0].block_sparse_moe.experts[0].

In [None]:
for layerid in range(32):
    for expertid in range(8):
        llm.model.layers[layerid].block_sparse_moe.experts[expertid].print_ratio()


layer 0 expert 0 ratio: 0.2167
layer 0 expert 1 ratio: 0.1075
layer 0 expert 2 ratio: 0.0871
layer 0 expert 3 ratio: 0.2104
layer 0 expert 4 ratio: 0.2017
layer 0 expert 5 ratio: 0.2117
layer 0 expert 6 ratio: 0.2164
layer 0 expert 7 ratio: 0.2175
layer 1 expert 0 ratio: 0.2209
layer 1 expert 1 ratio: 0.2114
layer 1 expert 2 ratio: 0.3353
layer 1 expert 3 ratio: 0.2220
layer 1 expert 4 ratio: 0.2407
layer 1 expert 5 ratio: 0.3093
layer 1 expert 6 ratio: 0.2343
layer 1 expert 7 ratio: 0.2426
layer 2 expert 0 ratio: 0.2640
layer 2 expert 1 ratio: 0.2186
layer 2 expert 2 ratio: 0.3159
layer 2 expert 3 ratio: 0.3497
layer 2 expert 4 ratio: 0.2149
layer 2 expert 5 ratio: 0.2662
layer 2 expert 6 ratio: 0.2368
layer 2 expert 7 ratio: 0.2575
layer 3 expert 0 ratio: 0.3555
layer 3 expert 1 ratio: 0.2156
layer 3 expert 2 ratio: 0.2297
layer 3 expert 3 ratio: 0.2700
layer 3 expert 4 ratio: 0.2890
layer 3 expert 5 ratio: 0.2718
layer 3 expert 6 ratio: 0.2421
layer 3 expert 7 ratio: 0.3535
layer 4 

In [None]:
os.environ["HF_ENDPOINT"]="https://hf-mirror.com"


import lm_eval
from lm_eval.models.huggingface import HFLM
from lm_eval import evaluator
del dataloader

In [None]:
def evaluate(task_name_list, model, tokenizer, num_fewshot, device):
    hflm = HFLM(pretrained=llm, tokenizer=tokenizer)
    results = evaluator.simple_evaluate(
    model=hflm,
    tasks=task_name_list,
    num_fewshot=num_fewshot)
    print(results['results'])



# triviaqa
task_list=['winogrande','sciq','openbookqa','arc_challenge','arc_easy']
# 'boolq',
# task_list=['truthfulqa_gen','triviaqa_gen']
evaluate(task_list, llm, tokenizer, 0, "cuda")


2025-01-03:13:11:00,259 INFO     [evaluator.py:152] Setting random seed to 0 | Setting numpy seed to 1234 | Setting torch manual seed to 1234
2025-01-03:13:11:00,261 INFO     [evaluator.py:203] Using pre-initialized model
Using the latest cached version of the module from /home/lz/.cache/huggingface/modules/datasets_modules/datasets/winogrande/a826c3d3506aefe0e9e9390dcb53271070536586bab95849876b2c1743df56e2 (last modified on Thu Jan  2 22:35:53 2025) since it couldn't be found locally at winogrande, or remotely on the Hugging Face Hub.
