### 加载模型

In [None]:
import json
import torch
from transformers import AutoTokenizer
from modeling_mixtral import MixtralForCausalLM, set_profile_mode, load_thresholds
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "3,4"

def _load_model(model_name, threshold_path, device_map):
    print(f"Loading model {model_name}")
    ## 开启稀疏模式
    set_profile_mode(False)
    load_thresholds(f'{threshold_path}/thresholds_0_8.pt', use_average=True)

    model = MixtralForCausalLM.from_pretrained(
        model_name,
        device_map=device_map,
        use_cache=True,
        torch_dtype=torch.float16,
    )
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.padding_side = "left"
    return model, tokenizer

model_name = 'mixtral'
threshold_path_name = 'chess_up_threshold'
with open('../path.json', 'r') as f:
    path = json.load(f)
    model_name = path['mixtral']
    threshold_path = path[threshold_path_name]

with open('../quantize/device_map_1.json', 'r') as f:
    device_map = json.load(f)
model, tokenizer = _load_model(model_name, threshold_path, device_map)

  from .autonotebook import tqdm as notebook_tqdm


Loading model /home/bcds/venv/dilab/Mixtral-8x7B-v0.1
Set profile_threshold to False


  up_th = torch.load(threshold_path, map_location='cuda')["up_proj_states_thresholds_2"]


Thresholds loaded from /home/bcds/On-the-Fly_MoE_Inference/saving/threshold/c4_mixtral_up/thresholds_0_8.pt


Loading checkpoint shards: 100%|██████████| 19/19 [00:28<00:00,  1.48s/it]


In [None]:
for i in range(1, 32):
    print("layer ", i)
    gate_router = model.model.layers[i].block_sparse_moe.gate
    torch.save(gate_router.weight, f"./router/{i}.pt")

### 加载数据集

In [None]:
datasets = torch.load('../saving/threshold/chess/datasets.pt')
import torch
import numpy as np
def get_batch(data, batch_size, block_size):
    start_idxs = torch.randint(len(data) - block_size, (batch_size,))
    x = torch.stack([torch.from_numpy((data[i:i+block_size]).astype(np.int64)) for i in start_idxs])
    y = torch.stack([torch.from_numpy((data[i+1:i+1+block_size]).astype(np.int64)) for i in start_idxs])
    return x, y

  datasets = torch.load('../saving/threshold/chess/datasets.pt')


保存激活值和专家路由的数据集

In [None]:
from tqdm import trange
sparsity_level = 0.8
# device = 'cuda:1'
device_2 = 'cpu'
avg_loss = 0.0
n_batch = 64 * 20
# n_batch = 2
# accum_steps = 4 
accum_steps = 64
batch_size = 1
block_size = 2048
torch.manual_seed(42)
n_layers = len(model.model.layers)
n_experts = len(model.model.layers[0].block_sparse_moe.experts)
split = 'train'

with torch.no_grad():
    # for step in range(n_batch // accum_steps):
    for step in trange(n_batch // accum_steps):
        # print(step * accum_steps)
        for batch_idx in range(accum_steps):
            # print('batch_idx:', batch_idx)
            inputs, labels = get_batch(datasets[split], batch_size, block_size)
            inputs = inputs.cuda()
            outputs = model(inputs, labels=inputs)
            avg_loss = avg_loss + outputs.loss / n_batch

        for layer_idx in range(1, 32):
            if layer_idx < 4:
                d = list(zip(model.model.layers[layer_idx-1].block_sparse_moe.activations, model.model.layers[layer_idx].block_sparse_moe.gate_logits))
                
                torch.save(d,f'merge/a2ef_{layer_idx}_{step}.pth')
                print(f'saving merge/a2ef_{layer_idx}_{step}.pth done')
            model.model.layers[layer_idx-1].block_sparse_moe.activations.clear()
            model.model.layers[layer_idx].block_sparse_moe.gate_logits.clear()
            ### 清除缓存
            torch.cuda.empty_cache()


### 专家预测器

In [1]:
import os
import torch
import torch.nn as nn
import torch.optim as optim
from torch.cuda.amp import GradScaler, autocast # 用于混合精度训练
from torch.utils.data import DataLoader, Dataset, random_split
from torch.utils.tensorboard import SummaryWriter

os.environ["CUDA_VISIBLE_DEVICES"] = "3,4"

def top_k_position_accuracy_unordered(output, target, k=1):
    """Compute the accuracy based on the intersection of top-k values between output and target,
       regardless of their order."""
    with torch.no_grad():
        # 获取 output 和 target 中 top-k 最大值的索引
        _, topk_pred_indices = output.topk(k, 1, True)
        _, topk_target_indices = target.topk(k, 1, True)
        # 初始化批次的正确计数
        batch_size = output.size(0)
        correct_counts = 0
        
        # 检查每个样本的预测top-k是否包含在真实的top-k中
        for i in range(batch_size):
            # 将预测和目标的top-k索引转换为集合
            set_pred = set(topk_pred_indices[i].tolist())
            set_target = set(topk_target_indices[i].tolist())
            
            # 计算交集
            intersection = set_pred.intersection(set_target)
            
            # 计算正确的预测个数
            correct_counts = correct_counts+len(intersection)
        
        # 计算平均正确率
        return correct_counts,batch_size*k

def eval_model(model, val_loader,):
    # Example validation loop
    model.eval()
    total_topk_accuracy_1 = 0
    total_topk_accuracy_2 = 0
    cont=0
    len1=0
    len2=0
    with torch.no_grad():
        for inputs, targets in val_loader:
            inputs, targets = inputs.to("cuda"), targets.to("cuda")
            with autocast():
                outputs = model(inputs)
            # 计算 top-K 准确率（不考虑顺序）
            topk_accuracy_1 = top_k_position_accuracy_unordered(outputs, targets, k=1)
            topk_accuracy_2 = top_k_position_accuracy_unordered(outputs, targets, k=2)
            total_topk_accuracy_1 += topk_accuracy_1[0]
            total_topk_accuracy_2 += topk_accuracy_2[0]
            len1+= topk_accuracy_1[1]
            len2+= topk_accuracy_2[1]   
        avg_topk_accuracy_1 = total_topk_accuracy_1 / len1
        avg_topk_accuracy_2 = total_topk_accuracy_2 / len2
        # print(len2)
        print(f'Top-{1} Accuracy: {avg_topk_accuracy_1:.4f}', f'Top-{2} Accuracy (unordered): {avg_topk_accuracy_2:.4f}')

class CustomDataset(Dataset):
    def __init__(self, file_paths):
        # 加载数据
        self.data = []
        
        # 遍历文件路径列表，加载每个文件
        for file_path in file_paths:
            # 加载当前文件的数据
            file_data = torch.load(file_path)
            # 将当前文件的数据追加到总数据列表中
            self.data.extend(file_data)
        
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        x, y = self.data[idx]
        return x.detach().clone(), y.detach().clone()

class RouterModel(nn.Module):
    def __init__(self, input_dim, output_dim, layer_id):
        super(RouterModel, self).__init__()
        self.linear1 = nn.Linear(input_dim, output_dim)
        loaded_weights = torch.load(f"./router/{i}.pt")
        with torch.no_grad():  # 禁用梯度计算
            self.linear1.weight.copy_(loaded_weights)

    def forward(self, x):
        return self.linear1(x)

def sparse_row(row, topk=2, use_abs=False):
    """
    对每一行保留 topk 个最大值的索引为 1，其余为 0。
    
    参数:
        row (torch.Tensor): 输入的一行数据。
        topk (int): 保留的最大值的数量。
        use_abs (bool): 是否使用绝对值进行排序。
    
    返回:
        sparse_row (torch.Tensor): 稀疏化后的行。
    """
    if use_abs:
        row = torch.abs(row)  # 如果需要使用绝对值，先取绝对值
    
    # 找到 topk 个最大值的索引
    topk_indices = torch.topk(row, topk).indices
    
    # 创建一个与 row 相同大小的零张量
    sparse_row = torch.zeros_like(row)
    
    # 将 topk_indices 对应的值置为 1
    sparse_row[topk_indices] = 1
    
    return sparse_row

def generate_label(y, topk=2, use_abs=False):
    """
    对输入的张量 y 的每一行进行稀疏化，保留 topk 个最大值的索引为 1，其余为 0。
    
    参数:
        y (torch.Tensor): 输入的张量。
        topk (int): 保留的最大值的数量。
        use_abs (bool): 是否使用绝对值进行排序。
    
    返回:
        sparse_tensor (torch.Tensor): 稀疏化后的张量。
    """
    # 对每一行进行稀疏化
    sparse_tensor = torch.stack([sparse_row(row, topk, use_abs) for row in y])
    return sparse_tensor

def train_model(model, train_loader, val_loader, criterion, optimizer, epochs=25, writer=None):
    scaler = GradScaler()  # 创建 GradScaler 对象
    for epoch in range(epochs):
        model.train()
        for batch_idx, (inputs, targets) in enumerate(train_loader):
            inputs, targets = inputs.cuda(), targets.cuda()

            optimizer.zero_grad()

            # 使用 autocast 来进行自动混合精度处理
            with torch.cuda.amp.autocast():
                outputs = model(inputs)
                ### targets 按照大小编码成 0,1 
                loss = criterion(outputs, generate_label(targets))

            # 使用 GradScaler 来缩放损失，然后进行反向传播
            # 注意：反向传播不包含在 autocast() 块中
            scaler.scale(loss).backward()
            writer.add_scalar('Loss/Train', loss.item(), epoch * len(train_loader) + batch_idx)
            # 调用 scaler.step() 来更新模型权重，并调用 scaler.update() 准备下一步
            scaler.step(optimizer)
            scaler.update()
        if epoch % 2 == 0:
            model.eval()
            eval_model(model, val_loader,)


In [3]:
for i in range(1, 4):
    print("layer ", i)
    file_names = [f'merge/a2ef_{i}_{j}.pth' for j in range(10)]
    dataset = CustomDataset(file_paths=file_names)
    # 划分训练集和验证集
    train_size = int(0.8 * len(dataset))
    val_size = len(dataset) - train_size
    train_dataset, val_dataset = random_split(dataset, [train_size, val_size])
    train_loader = DataLoader(train_dataset, batch_size=2048, shuffle=True)
    print(len(train_dataset))
    val_loader = DataLoader(val_dataset, batch_size=2048, shuffle=False)

    model = RouterModel(4096, 8, i).cuda()
    eval_model(model, val_loader)
    # criterion = nn.MSELoss().to("cuda")
    # criterion = nn.CrossEntropyLoss().to("cuda")
    criterion = nn.SmoothL1Loss()
    # criterion = nn.KLDivLoss(reduction='batchmean').to("cuda")
    optimizer = optim.Adam(model.parameters(), lr=5e-4) #lr=5e-5
    writer = SummaryWriter('runs/predictor_multilayer')
    train_model(model, train_loader, val_loader, criterion, optimizer, epochs=10, writer=writer)
    torch.save(model.state_dict(), f"./training/{i}.pth")

layer  1


  file_data = torch.load(file_path)


1048576


  loaded_weights = torch.load(f"./router/{i}.pt")
  with autocast():


Top-1 Accuracy: 0.5306 Top-2 Accuracy (unordered): 0.6018


  scaler = GradScaler()  # 创建 GradScaler 对象
  with torch.cuda.amp.autocast():
  with autocast():


Top-1 Accuracy: 0.6168 Top-2 Accuracy (unordered): 0.7004
Top-1 Accuracy: 0.6196 Top-2 Accuracy (unordered): 0.7022
Top-1 Accuracy: 0.6157 Top-2 Accuracy (unordered): 0.7022
Top-1 Accuracy: 0.6173 Top-2 Accuracy (unordered): 0.7024
Top-1 Accuracy: 0.6173 Top-2 Accuracy (unordered): 0.7029
layer  2


  file_data = torch.load(file_path)


1048576


  loaded_weights = torch.load(f"./router/{i}.pt")


Top-1 Accuracy: 0.7208 Top-2 Accuracy (unordered): 0.7591
Top-1 Accuracy: 0.7558 Top-2 Accuracy (unordered): 0.8121
Top-1 Accuracy: 0.7560 Top-2 Accuracy (unordered): 0.8119
Top-1 Accuracy: 0.7550 Top-2 Accuracy (unordered): 0.8124
Top-1 Accuracy: 0.7544 Top-2 Accuracy (unordered): 0.8119
Top-1 Accuracy: 0.7550 Top-2 Accuracy (unordered): 0.8125
layer  3
1048576
Top-1 Accuracy: 0.7575 Top-2 Accuracy (unordered): 0.7682
Top-1 Accuracy: 0.7749 Top-2 Accuracy (unordered): 0.8032
Top-1 Accuracy: 0.7779 Top-2 Accuracy (unordered): 0.8035
Top-1 Accuracy: 0.7760 Top-2 Accuracy (unordered): 0.8026
Top-1 Accuracy: 0.7765 Top-2 Accuracy (unordered): 0.8033
Top-1 Accuracy: 0.7770 Top-2 Accuracy (unordered): 0.8030


重新训练router

In [None]:
import torch.nn as nn
import torch.optim as optim

from torch.cuda.amp import GradScaler, autocast  # 用于混合精度训练
import torch.nn.functional as F
import torch.nn.init as init

class SimpleLinearModel(nn.Module):
    def __init__(self,input_dim,output_dim,hidden_dim=32):
        super(SimpleLinearModel, self).__init__()
        self.linear1 = nn.Linear(input_dim, hidden_dim)
        self.activation = nn.SiLU() # 添加 ReLU 激活函数
        self.linear2 = nn.Linear(hidden_dim,output_dim)  # 添加一个 8x8 线性层
        init.kaiming_normal_(self.linear1.weight, mode='fan_out', nonlinearity='relu')
        init.kaiming_normal_(self.linear2.weight, mode='fan_out', nonlinearity='relu')
        self.linear1.bias.data.fill_(0)
        self.linear2.bias.data.fill_(0)

    def forward(self, x):
        x= self.linear1(x)
        x= self.activation(x)
        return self.linear2(x)
        
model=SimpleLinearModel(4096,8, hidden_dim=128)
model.to("cuda")  # 假设使用 GPU

SimpleLinearModel(
  (linear1): Linear(in_features=4096, out_features=128, bias=True)
  (activation): SiLU()
  (linear2): Linear(in_features=128, out_features=8, bias=True)
)