### 训练

In [2]:
import torch
import os
import sys
os.environ["CUDA_VISIBLE_DEVICES"] = "4,5,6"
import transformers
from modeling_mixtral import set_profile_mode, load_thresholds
import json
from utils import get_model, CompensatedModel
from hqq.core.quantize import *
from hqq.models.hf.base import AutoHQQHFModel
from hqq.core.peft import PeftUtils
from datasets import load_dataset, Dataset
import functools


# # 加载 C4 数据集的验证集
with open('../path.json', 'r') as file:
    paths = json.load(file)
    fineweb_path = paths.get('fineweb', '')
    model_name = paths.get('mixtral','')
    threshold_path = paths.get('chess_up_sparsity_threshold','')

with open('./device_map.json', 'r') as f:
    device_map = json.load(f)

set_profile_mode(False)
load_thresholds(f'{threshold_path}/thresholds_0_8.pt')
dtype = torch.bfloat16
print('using ',dtype)
llm, tokenizer = get_model(model_name, device_map, dtype=dtype)

q4_config    = BaseQuantizeConfig(nbits=8, group_size=64) 
q3_config    = BaseQuantizeConfig(nbits=2, group_size=64)

quant_config      = {'block_sparse_moe.experts.w3'   : q3_config}
AutoHQQHFModel.quantize_model(llm, quant_config=quant_config, compute_dtype=dtype, device=device_map)

base_lora_params = {'lora_type':'default', 'r':128, 'lora_alpha':128, 'dropout':0.05, 'train_dtype':dtype}

lora_params      = {'self_attn.q_proj': base_lora_params,
                    'self_attn.k_proj': base_lora_params,
                    'self_attn.v_proj': base_lora_params,
                    'self_attn.o_proj': base_lora_params,
                    'block_sparse_moe.experts.w1'   : base_lora_params,
                    'block_sparse_moe.experts.w3'   : base_lora_params,
                    'block_sparse_moe.experts.w2'   : base_lora_params}


PeftUtils.add_lora(llm, lora_params)
class CompensatedModel(torch.nn.Module):
    def __init__(self, model, path, layerid, expertid):
        super(CompensatedModel, self).__init__()
        self.model = model
        self.A = torch.load(path + f'A_{layerid}_{expertid}.pt').to(dtype)
        self.B_prime = torch.load(path + f'B_prime_{layerid}_{expertid}.pt').to(dtype)

    def forward(self, input_ids):
        outputs = self.model(input_ids)
        residual = (input_ids @ self.A.T) @ self.B_prime.T
        outputs += residual
    
        return outputs
for i in range(32):
    if i == 31:
        print(f"Layer {i} done...")
    for j in range(8):
        llmdevice = llm.model.layers[i].block_sparse_moe.experts[j].w3.linear_layer.device
        llm.model.layers[i].block_sparse_moe.experts[j].w3.linear_layer = \
        CompensatedModel(llm.model.layers[i].block_sparse_moe.experts[j].w3.linear_layer, '/home/lz/On-the-Fly_MoE_Inference/quantize/saved/eora/', layerid=i, expertid=j).to(llmdevice)

Set profile_threshold to False
Thresholds loaded from /home/lz/On-the-Fly_MoE_Inference/quantize/threshold/c4_mixtral_up/thresholds_0_8.pt
using  torch.bfloat16


Loading checkpoint shards: 100%|██████████| 19/19 [00:33<00:00,  1.77s/it]
100%|██████████| 354/354 [00:00<00:00, 11383.67it/s]
100%|██████████| 929/929 [00:15<00:00, 61.27it/s]


In [9]:
llm.model.layers[0].block_sparse_moe.experts[0].w3.linear_layer.model

HQQLinear(in_features=4096, out_features=14336, bias=False)

In [3]:
from datasets import load_dataset, Dataset, concatenate_datasets
import functools

def preprocess_data(batch, tokenizer):
    # 使用 tokenizer 将文本数据转换为模型输入
    inputs = tokenizer(batch['text'], padding="max_length", truncation=True, max_length=512, return_tensors="pt")
    inputs["labels"] = inputs.input_ids.clone()
    return inputs


openmath = load_dataset("/home/lz/web-math/",data_files="/home/lz/web-math/openmath1.json")
fineweb = load_dataset(fineweb_path)
openmath_text = openmath['train']['text'][:2000] 
fineweb_text = fineweb['train']['text'][:6000]


Repo card metadata block was not found. Setting CardData to empty.


In [4]:
test_num = 0.1
seed = 42

combined_text = openmath_text + fineweb_text
combined_dataset = Dataset.from_dict({"text": combined_text})
combined_train = combined_dataset.train_test_split(test_size=test_num, seed=seed)
train_data = combined_train['train']
test_data = combined_train['test']

new_train_data = train_data.map(
    functools.partial(
    preprocess_data,
    tokenizer=tokenizer
), batched=True)
new_test_data = test_data.map(
    functools.partial(
    preprocess_data,
    tokenizer=tokenizer
), batched=True)
new_train_data.shuffle(seed)
new_test_data.shuffle(seed)

Map: 100%|██████████| 7200/7200 [00:05<00:00, 1381.09 examples/s]
Map: 100%|██████████| 800/800 [00:00<00:00, 1357.93 examples/s]


Dataset({
    features: ['text', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 800
})

In [5]:
from hqq.core.peft import PeftUtils
from transformers import AutoTokenizer, BitsAndBytesConfig, AdamW
from transformers import (
    DataCollatorForSeq2Seq,
    Trainer,
    TrainingArguments
)

class CustomTrainer(transformers.Trainer):
    def save_model(self, output_dir=None, _internal_call=False):
        # 如果没有指定output_dir，则使用训练参数中的输出目录
        if output_dir is None:
            output_dir = self.args.output_dir #这里的args不是该脚本的输入，而是TrainerArgs

        # 确保输出目录存在
        os.makedirs(output_dir, exist_ok=True)

        # 保存完整的模型参数
        # torch.save(self.model.state_dict(), os.path.join(output_dir, 'pytorch_model.bin'))
        
        # PeftUtils.cast_lora_weights(self.model, dtype=torch.bfloat16)

        #Save LoRA weights
        PeftUtils.save_lora_weights(self.model, output_dir+'_lora_combine.pt')

        # 保存配置文件和tokenizer
        self.model.config.save_pretrained(output_dir)
        if self.tokenizer is not None:
            self.tokenizer.save_pretrained(output_dir)

model_save_path='./saved/training/less2'
learning_rate = 1e-4
micro_batch_size=8
epochs=2
save_steps = 100
save_total_limit = 6
sample_num = len(new_train_data)
optimizer=AdamW(filter(lambda p : p.requires_grad, llm.parameters()),lr=learning_rate)
linear_scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=(sample_num*epochs) // micro_batch_size)
args = TrainingArguments(
    output_dir=model_save_path,
    num_train_epochs=epochs,
    # max_steps=opt.max_steps,
    # fp16=True,
    bf16=True,
    optim="adamw_torch",# paged_adamw_8bit
    learning_rate=learning_rate,
    lr_scheduler_type="cosine",
    per_device_train_batch_size=micro_batch_size,
    gradient_accumulation_steps=1,
    gradient_checkpointing=False,   ### 先设置成False
    group_by_length=False,
    logging_steps=50,
    eval_steps=50,
    save_strategy="steps",
    save_only_model=True,
    save_steps=save_steps,
    save_total_limit=save_total_limit,
    disable_tqdm=False,
    report_to='tensorboard',
    logging_dir='/home/lz/On-the-Fly_MoE_Inference/quantize/saved/logs/'
)

trainer = CustomTrainer(
    model=llm,
    train_dataset=new_train_data.select(range(3000)),
    eval_dataset=new_test_data.select(range(300)),
    args=args,
    optimizers=(optimizer, linear_scheduler),
    data_collator=DataCollatorForSeq2Seq(
    tokenizer, pad_to_multiple_of=8, return_tensors="pt", padding=True),
)

# silence the warnings. re-enable for inference!
llm.config.use_cache = False
trainer.train()



Step,Training Loss
50,1.7708
100,1.6669
150,1.719
200,1.6447
250,1.6743
300,1.6822
350,1.6622
400,1.3972
450,1.0624
500,1.1457


100%|██████████| 929/929 [00:00<00:00, 214813.85it/s]
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
100%|██████████| 929/929 [00:00<00:00, 144582.87it/s]
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
100%|██████████| 929/929 [00:00<00:00, 195000.92it/s]
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
100%|██████████| 929/929 [00:00<00:00, 205241.42it/s]
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
100%|██████████| 929/929 [00:00<00:00, 145207.89it/s]
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
100%|██████████| 929/929 [00:00<00:00, 208804.91it/s]
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
100%|██████████| 929/929 [00:00<00:00, 211766.76it/s]
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
100%|██████████| 929

TrainOutput(global_step=750, training_loss=1.401180679321289, metrics={'train_runtime': 2267.8098, 'train_samples_per_second': 2.646, 'train_steps_per_second': 0.331, 'total_flos': 6.74877413523456e+17, 'train_loss': 1.401180679321289, 'epoch': 2.0})

### 加载模型

In [1]:
import torch
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "4,5,6"
from modeling_mixtral import MixtralForCausalLM, set_profile_mode
import json
from utils import get_model

# # 加载 C4 数据集的验证集
with open('../path.json', 'r') as file:
    paths = json.load(file)
    fineweb_path = paths.get('c4', '')
    model_name = paths.get('mixtral','')

with open('./device_map.json', 'r') as f:
    device_map = json.load(f)

set_profile_mode(True)
llm, tokenizer = get_model(model_name, device_map)
# %%
#Quantize
from hqq.core.quantize import *
q4_config    = BaseQuantizeConfig(nbits=8, group_size=64) 
q3_config    = BaseQuantizeConfig(nbits=2, group_size=64)

quant_config = {
  'block_sparse_moe.experts.w3'  :q3_config,
}
from hqq.models.hf.base import AutoHQQHFModel
AutoHQQHFModel.quantize_model(llm, quant_config=quant_config, compute_dtype=torch.float16, device=device_map)

  from .autonotebook import tqdm as notebook_tqdm


Set profile_threshold to True


Loading checkpoint shards: 100%|██████████| 19/19 [00:35<00:00,  1.85s/it]
100%|██████████| 354/354 [00:00<00:00, 10211.37it/s]
100%|██████████| 929/929 [00:15<00:00, 60.64it/s]


MixtralForCausalLM(
  (model): MixtralModel(
    (embed_tokens): Embedding(32000, 4096)
    (layers): ModuleList(
      (0-31): 32 x MixtralDecoderLayer(
        (self_attn): MixtralSdpaAttention(
          (q_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear(in_features=4096, out_features=1024, bias=False)
          (v_proj): Linear(in_features=4096, out_features=1024, bias=False)
          (o_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): MixtralRotaryEmbedding()
        )
        (block_sparse_moe): MixtralSparseMoeBlock(
          (gate): Linear(in_features=4096, out_features=8, bias=False)
          (experts): ModuleList(
            (0-7): 8 x MixtralBlockSparseTop2MLP(
              (w1): Linear(in_features=4096, out_features=14336, bias=False)
              (w2): Linear(in_features=14336, out_features=4096, bias=False)
              (w3): HQQLinear(in_features=4096, out_features=14336, bias=False)
 

加载lora

In [3]:
from hqq.core.peft import PeftUtils
PeftUtils.load_lora_weights(llm, '/home/lz/On-the-Fly_MoE_Inference/quantize/saved/training/less2/checkpoint-750_lora_combine.pt')

  0%|          | 0/929 [00:00<?, ?it/s]

100%|██████████| 929/929 [00:09<00:00, 97.35it/s] 
100%|██████████| 929/929 [00:00<00:00, 159327.30it/s]


加载数据集

In [13]:
# %%
from datasets import load_dataset
def preprocess_data(batch):
    # 使用 tokenizer 将文本数据转换为模型输入
    inputs = tokenizer(batch['text'], padding="max_length", truncation=True, max_length=512, return_tensors="pt")
    inputs["labels"] = inputs.input_ids.clone()
    return inputs

# 定义一个函数来选择特征并丢弃不需要的
def select_features(example):
    return {
        'input_ids': example['input_ids'],
        'attention_mask': example['attention_mask'],
        'labels': example['labels']
    }

tokenizer.pad_token = tokenizer.eos_token

with open('../path.json', 'r') as file:
    paths = json.load(file)
    c4_path = paths.get('c4', '')
c4 = load_dataset(c4_path)
# 对数据集进行预处理
c4_dataset = c4.map(preprocess_data, batched=True)
# c4_dataset = c4_dataset.map(select_features, batched=True)
c4_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
# c4_dataset
top_four_thousand_data = c4_dataset['validation'].select(range(400))

import numpy as np

def set_seed(seed):
    np.random.seed(seed)
    torch.manual_seed(seed)

from torch.utils.data import DataLoader
from tqdm import tqdm
set_seed(42)

# 定义数据加载器
batch_size = 8
# dataloader = DataLoader(c4_dataset['validation'], batch_size=batch_size)
dataloader = DataLoader(top_four_thousand_data, batch_size=batch_size)
# %%

Map:   0%|          | 0/45576 [00:00<?, ? examples/s]

Map: 100%|██████████| 45576/45576 [00:13<00:00, 3333.84 examples/s]


In [None]:

import torch
import os

llm_base = MixtralForCausalLM.from_pretrained(
    model_name,
    device_map='cpu',
    use_cache=True,
    torch_dtype=torch.float16,
    # attn_implementation="flash_attention_2"
) 

### eora恢复

直接从文件中读取

In [3]:
import torch

class CompensatedModel(torch.nn.Module):
    def __init__(self, model, path, layerid, expertid):
        super(CompensatedModel, self).__init__()
        self.model = model
        ### self.A and self.B_prime are initialized as the values loaded from the file
        self.A = torch.load(path + f'A_{layerid}_{expertid}.pt').to(torch.float16)
        self.B_prime = torch.load(path + f'B_prime_{layerid}_{expertid}.pt').to(torch.float16)
        

    def forward(self, input_ids):
        outputs = self.model(input_ids)
        residual = (input_ids @ self.A.T) @ self.B_prime.T
        outputs += residual
    
        return outputs

for i in range(32):
    print(f"Layer {i} done...")
    for j in range(8):
        llmdevice = llm.model.layers[i].block_sparse_moe.experts[j].w3.device
        llm.model.layers[i].block_sparse_moe.experts[j].w3 = \
        CompensatedModel(llm.model.layers[i].block_sparse_moe.experts[j].w3, '/home/lz/On-the-Fly_MoE_Inference/quantize/saved/', layerid=i, expertid=j).to(llmdevice)
        

Layer 0 done...
Layer 1 done...
Layer 2 done...
Layer 3 done...
Layer 4 done...
Layer 5 done...
Layer 6 done...
Layer 7 done...
Layer 8 done...
Layer 9 done...
Layer 10 done...
Layer 11 done...
Layer 12 done...
Layer 13 done...
Layer 14 done...
Layer 15 done...
Layer 16 done...
Layer 17 done...
Layer 18 done...
Layer 19 done...
Layer 20 done...
Layer 21 done...
Layer 22 done...
Layer 23 done...
Layer 24 done...
Layer 25 done...
Layer 26 done...
Layer 27 done...
Layer 28 done...
Layer 29 done...
Layer 30 done...
Layer 31 done...


第一次计算

In [None]:
# %%
def profle_svdllm(name, model, calib_loader, dev):
    # model.to(dev)
    if "llama" in name or "mixtral" in name or "vicuna" in name:
        layers = model.model.layers
    print("Start obtaining the whitening matrix...")
    def hook(module, input, output):
        inp = input[0].detach().float()
        if inp.dim() == 2:   # for opt
            inp = inp.unsqueeze(0)
        adds = torch.matmul(inp.transpose(1,2), inp)
        adds_sum = torch.sum(adds, dim=0)
        module.raw_scaling_diag_matrix += adds_sum
        del inp, adds, adds_sum
        torch.cuda.empty_cache()
    for name, module in model.named_modules():
        if "w3" in name:
            # print(name)
            module.raw_scaling_diag_matrix = 0
            module.register_forward_hook(hook)
            
    for batch in tqdm(calib_loader):
        inputs = batch['input_ids'].to(llm.device)
        model(inputs)
    for name, module in model.named_modules():
        if "w3" in name:
            module._forward_hooks.clear()
            # print(module.raw_scaling_diag_matrix)
    torch.cuda.empty_cache()

    profiling_mat = {}
    print("Start Cholesky Decomposition...")
    
    layer_profile = {}
    for name, module in model.named_modules():
        if "w3" in name:
            covariance = module.raw_scaling_diag_matrix.double().to(dev)
            if not torch.allclose(covariance, covariance.t(), atol=1e-6):
                raise ValueError("Covariance matrix is not symmetric.")
                    # Perform eigen decomposition
            Lambda, Q = torch.linalg.eigh(covariance, UPLO='U')
            if torch.isnan(Lambda).any() or torch.isinf(Lambda).any():
                raise ValueError("Lambda contains NaN or Inf values.")

            # 检查 Lambda 是否包含负值
            if (Lambda < 0).any():
                print("Lambda contains negative values. Clamping to zero.")
                eigenvalues = torch.linalg.eigvalsh(covariance)
                covariance += (- eigenvalues[0] + 2e-6) * torch.eye(covariance.shape[0]).cuda()
                Lambda, Q = torch.linalg.eigh(covariance, UPLO='U')
                print(f"Lambda min: {Lambda.min().item()}, Lambda max: {Lambda.max().item()}")
            # 现在进行平方根操作
            Lambda_diag = torch.diag(torch.sqrt(Lambda))
            # Sort eigenvalues and eigenvectors in descending order
            indices = torch.argsort(Lambda, descending=True)
            Lambda = Lambda[indices]
            Q = Q[:, indices]

            # Compute Q_prime = Q * sqrt(Lambda)
            Lambda_diag = torch.diag(torch.sqrt(Lambda))
            Q_prime = torch.matmul(Q, Lambda_diag)
            layer_profile[name] = Q_prime.cpu()
            profiling_mat[name] = layer_profile
    return profiling_mat

profiling_mat=profle_svdllm("mixtral", llm, dataloader, "cuda")


In [None]:
class CompensatedModel(torch.nn.Module):
    def __init__(self, model, B_prime, A):
        super(CompensatedModel, self).__init__()
        self.model = model
        self.B_prime = torch.nn.Parameter(torch.tensor(B_prime)).to(torch.float16)
        self.A = torch.nn.Parameter(torch.tensor(A)).to(torch.float16)
        # print(self.A.shape,self.B_prime.shape)
    def forward(self, input_ids):
        outputs = self.model(input_ids)
        residual = (input_ids @ self.A.T) @ self.B_prime.T
        torch.add(outputs, residual, out = outputs)
    
        return outputs
    
for i in range(32):
    print(f"Layer {i} done...")
    for j in range(8):
        llmdevice = llm.model.layers[i].block_sparse_moe.experts[j].w3.device
        Delta_W = llm_base.model.layers[i].block_sparse_moe.experts[j].w3.weight.to(llmdevice) - llm.model.layers[i].block_sparse_moe.experts[j].w3.dequantize()
        Q_prime = profiling_mat[f"model.layers.{i}.block_sparse_moe.experts.{j}.w3"][f"model.layers.{i}.block_sparse_moe.experts.{j}.w3"].cuda().float()
        Delta_W_prime =  Delta_W.to(torch.float32).to(llmdevice) @ Q_prime.to(torch.float32).to(llmdevice)
        llm_base.model.layers[i].block_sparse_moe.experts[j].w3.cpu()
        # 步骤5: 进行SVD分解并取前r个奇异值
        rank = 256  # 设置 desired rank
        U_prime, Sigma_prime, V_prime = torch.linalg.svd(Delta_W_prime, full_matrices=False)
        U_prime = U_prime[:, :rank]
        Sigma_prime = Sigma_prime[:rank]
        V_prime = V_prime[:rank, :]

        B_prime = U_prime @ torch.diag(Sigma_prime)
        A_prime = V_prime

        # 步骤6: 投影回原空间
        A = A_prime.to(llmdevice) @ torch.linalg.inv(Q_prime).to(llmdevice)
        llm.model.layers[i].block_sparse_moe.experts[j].w3 = CompensatedModel(llm.model.layers[i].block_sparse_moe.experts[j].w3, B_prime, A).to(llmdevice)
        torch.save(B_prime, f"./saved/B_prime_{i}_{j}.pt")
        torch.save(A, f"./saved/A_{i}_{j}.pt")

del llm_base

### threshold

In [4]:
import torch
import numpy as np
datasets = torch.load('../saving/threshold/chess/datasets.pt')
set_profile_mode(True)
def get_batch(data, batch_size, block_size):
    start_idxs = torch.randint(len(data) - block_size, (batch_size,))
    x = torch.stack([torch.from_numpy((data[i:i+block_size]).astype(np.int64)) for i in start_idxs])
    y = torch.stack([torch.from_numpy((data[i+1:i+1+block_size]).astype(np.int64)) for i in start_idxs])
    return x, y

Set profile_threshold to True


In [5]:
sparsity_level = 0.8
# device = 'cuda:1'
device_2 = 'cpu'
avg_loss = 0.0
n_batch = 64 * 2
# accum_steps = 4 
accum_steps = 2
batch_size = 1
block_size = 2048
torch.manual_seed(42)

model = llm

n_layers = len(model.model.layers)
n_experts = len(model.model.layers[0].block_sparse_moe.experts)

up_proj_states_thresholds = [torch.zeros([n_experts,]) for _ in range(n_layers)]
gate_proj_states_mean_squares = [[torch.zeros(model.config.intermediate_size) for _ in range(n_experts)] for _ in range(n_layers)]

up_states = [[torch.zeros([accum_steps * batch_size * block_size //2, model.config.intermediate_size]) for _ in range(n_experts)] for _ in range(n_layers)]
gate_states = [[torch.zeros([accum_steps * batch_size * block_size //2, model.config.intermediate_size]) for _ in range(n_experts)] for _ in range(n_layers)]

with torch.no_grad():
    for step in range(n_batch // accum_steps):
        print(step * accum_steps)
        all_counts = [0 for _ in range(n_layers * n_experts)]
        for batch_idx in range(accum_steps):
            # print('batch_idx:', batch_idx)
            inputs, labels = get_batch(datasets['validation'], batch_size, block_size)
            inputs = inputs.cuda()
            outputs = model(inputs, labels=inputs)
            avg_loss = avg_loss + outputs.loss / n_batch

            for layer_idx in range(n_layers):
                for expert_idx in range(n_experts):
                    counts = all_counts[layer_idx * n_experts + expert_idx]

                    states = model.model.layers[layer_idx].block_sparse_moe.experts[expert_idx].up_proj_states.reshape(-1, model.config.intermediate_size)
                    cur_counts = states.size(0)
                    # print('counts and cur_counts:',counts, cur_counts)
                    # print(states.size())
                    # print(up_states[layer_idx][expert_idx][counts : counts+cur_counts, :].size())
                    up_states[layer_idx][expert_idx][counts : counts+cur_counts, :] = states

                    states = model.model.layers[layer_idx].block_sparse_moe.experts[expert_idx].gate_proj_states.reshape(-1, model.config.intermediate_size)
                    gate_states[layer_idx][expert_idx][counts : counts+cur_counts, :] = states
                    # counts += cur_counts
                    all_counts[layer_idx * n_experts + expert_idx] += cur_counts

        for layer_idx in range(n_layers):   
            for expert_idx in range(n_experts):
                # print('layer_idx:', layer_idx, 'expert_idx:', expert_idx)
                useful_num = all_counts[layer_idx * n_experts + expert_idx]
                topk_num = int(useful_num * model.config.intermediate_size * sparsity_level)
                up_proj_states_thresholds[layer_idx][expert_idx] += up_states[layer_idx][expert_idx][0:useful_num,:].to(device_2).abs().flatten().kthvalue(topk_num).values.to('cpu')
                gate_proj_states_mean_squares[layer_idx][expert_idx] += (torch.sum(gate_states[layer_idx][expert_idx][0:useful_num,:].to(dev
                
                
                ice_2) ** 2, dim=0).to('cpu') / useful_num).to('cpu')

for layer_idx in range(n_layers):
    for expert_idx in range(n_experts):
        gate_proj_states_mean_squares[layer_idx][expert_idx] /= n_batch // accum_steps
        up_proj_states_thresholds[layer_idx][expert_idx] /= n_batch // accum_steps

0
2
4
6
8
10
12
14
16
18
20
22
24
26
28
30
32
34
36
38
40
42
44
46
48
50
52
54
56
58
60
62
64
66
68
70
72
74
76
78
80
82
84
86
88
90
92
94
96
98
100
102
104
106
108
110
112
114
116
118
120
122
124
126


In [6]:
importance_thresholds = [torch.zeros([n_experts,]) for _ in range(n_layers)]
up_proj_states_thresholds_2 = [[torch.zeros(model.config.intermediate_size) for _ in range(n_experts)] for _ in range(n_layers)]

with torch.no_grad():
    for step in range(n_batch // accum_steps):
        print(step * accum_steps)
        all_counts = [0 for _ in range(n_layers * n_experts)]
        for batch_idx in range(accum_steps):
            inputs, labels = get_batch(datasets['validation'], batch_size, block_size)
            inputs = inputs.cuda()
            outputs = model(inputs, labels=inputs)
            avg_loss = avg_loss + outputs.loss / n_batch

            for layer_idx in range(n_layers):
                for expert_idx in range(n_experts):
                    counts = all_counts[layer_idx * n_experts + expert_idx]
                    states = model.model.layers[layer_idx].block_sparse_moe.experts[expert_idx].up_proj_states.reshape(-1, states.size(-1))
                    cur_counts = states.size(0)
                    up_states[layer_idx][expert_idx][counts:cur_counts+counts, :] = states
                    # counts += cur_counts
                    all_counts[layer_idx * n_experts + expert_idx] += cur_counts
                
        for layer_idx in range(n_layers):   
            for expert_idx in range(n_experts):
                useful_num = all_counts[layer_idx * n_experts + expert_idx]
                importance_scores = up_states[layer_idx][expert_idx][:useful_num,:] ** 2 * gate_proj_states_mean_squares[layer_idx][expert_idx]
                importance_thresholds[layer_idx][expert_idx] += importance_scores.to(device_2).flatten().kthvalue(int(importance_scores.numel() * sparsity_level)).values.to('cpu')

for layer_idx in range(n_layers):
    for expert_idx in range(n_experts):
        importance_thresholds[layer_idx][expert_idx] /= n_batch // accum_steps
        up_proj_states_thresholds_2[layer_idx][expert_idx] = (importance_thresholds[layer_idx][expert_idx].expand_as(up_proj_states_thresholds_2[layer_idx][expert_idx]) / gate_proj_states_mean_squares[layer_idx][expert_idx]) ** 0.5

thresholds = {'up_proj_states_thresholds': up_proj_states_thresholds, 'up_proj_states_thresholds_2': up_proj_states_thresholds_2}

0
2
4
6
8
10
12
14
16
18
20
22
24
26
28
30
32
34
36
38
40
42
44
46
48
50
52
54
56
58
60
62
64
66
68
70
72
74
76
78
80
82
84
86
88
90
92
94
96
98
100
102
104
106
108
110
112
114
116
118
120
122
124
126


In [7]:
save_path = './threshold/training_up'

sp = str(sparsity_level).replace('.', '_')
print('save in:', save_path)
torch.save(thresholds, f'{save_path}/thresholds_{sp}.pt')

save in: ./threshold/training_up


### C4

In [14]:
from tqdm import tqdm
# 计算评估损失
total_loss = 0.0
num_batches = 0

for batch in tqdm(dataloader):
    input_ids = batch['input_ids'].to(llm.device)
    attention_mask = batch['attention_mask'].to(llm.device)
    labels = batch['labels'].to(llm.device)
    
    # 禁用梯度计算
    with torch.no_grad():
        outputs = llm(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()
        num_batches += 1
        if num_batches % 100 == 0:
            print(f"[{num_batches}], Eval Loss: {total_loss / (num_batches)}")

# 计算平均损失
eval_loss = total_loss / num_batches
print(f"Eval Loss: {eval_loss}")

100%|██████████| 50/50 [01:05<00:00,  1.31s/it]

Eval Loss: 3.0378498029708862





In [7]:
for layerid in range(32):
    for expertid in range(8):
        llm.model.layers[layerid].block_sparse_moe.experts[expertid].print_ratio()


layer 0 expert 0 ratio: 0.2167
layer 0 expert 1 ratio: 0.1075
layer 0 expert 2 ratio: 0.0871
layer 0 expert 3 ratio: 0.2104
layer 0 expert 4 ratio: 0.2017
layer 0 expert 5 ratio: 0.2117
layer 0 expert 6 ratio: 0.2164
layer 0 expert 7 ratio: 0.2175
layer 1 expert 0 ratio: 0.2209
layer 1 expert 1 ratio: 0.2114
layer 1 expert 2 ratio: 0.3353
layer 1 expert 3 ratio: 0.2220
layer 1 expert 4 ratio: 0.2407
layer 1 expert 5 ratio: 0.3093
layer 1 expert 6 ratio: 0.2343
layer 1 expert 7 ratio: 0.2426
layer 2 expert 0 ratio: 0.2640
layer 2 expert 1 ratio: 0.2186
layer 2 expert 2 ratio: 0.3159
layer 2 expert 3 ratio: 0.3497
layer 2 expert 4 ratio: 0.2149
layer 2 expert 5 ratio: 0.2662
layer 2 expert 6 ratio: 0.2368
layer 2 expert 7 ratio: 0.2575
layer 3 expert 0 ratio: 0.3555
layer 3 expert 1 ratio: 0.2156
layer 3 expert 2 ratio: 0.2297
layer 3 expert 3 ratio: 0.2700
layer 3 expert 4 ratio: 0.2890
layer 3 expert 5 ratio: 0.2718
layer 3 expert 6 ratio: 0.2421
layer 3 expert 7 ratio: 0.3535
layer 4 

In [7]:
os.environ["HF_ENDPOINT"]="https://hf-mirror.com"


import lm_eval
from lm_eval.models.huggingface import HFLM
from lm_eval import evaluator
del dataloader

In [8]:
def evaluate(task_name_list, model, tokenizer, num_fewshot, device):
    hflm = HFLM(pretrained=llm, tokenizer=tokenizer)
    results = evaluator.simple_evaluate(
    model=hflm,
    tasks=task_name_list,
    num_fewshot=num_fewshot)
    print(results['results'])



# triviaqa
task_list=['winogrande','sciq','openbookqa','arc_challenge','arc_easy']
# 'boolq',
# task_list=['truthfulqa_gen','triviaqa_gen']
evaluate(task_list, llm, tokenizer, 0, "cuda")


2025-01-03:13:11:00,259 INFO     [evaluator.py:152] Setting random seed to 0 | Setting numpy seed to 1234 | Setting torch manual seed to 1234
2025-01-03:13:11:00,261 INFO     [evaluator.py:203] Using pre-initialized model
Using the latest cached version of the module from /home/lz/.cache/huggingface/modules/datasets_modules/datasets/winogrande/a826c3d3506aefe0e9e9390dcb53271070536586bab95849876b2c1743df56e2 (last modified on Thu Jan  2 22:35:53 2025) since it couldn't be found locally at winogrande, or remotely on the Hugging Face Hub.
