### 加载模型

In [1]:
import os

# Restrict the visible GPUs *before* torch (or any module that might
# initialise CUDA on import) is loaded, so the device mask is already in
# place whenever CUDA is first initialised.
os.environ["CUDA_VISIBLE_DEVICES"] = "2,7"

import json

import torch
from transformers import AutoTokenizer

from modeling_mixtral import MixtralForCausalLM, set_profile_mode, load_thresholds

def _load_model(model_name, threshold_path, device_map):
    """Load the Mixtral model in sparse mode together with its tokenizer.

    Args:
        model_name: Checkpoint name or path handed to ``from_pretrained``.
        threshold_path: Directory that contains ``thresholds_0_8.pt``.
        device_map: Device placement forwarded to ``from_pretrained``.

    Returns:
        A ``(model, tokenizer)`` tuple. The tokenizer pads on the left and
        reuses the EOS token as the padding token.
    """
    print(f"Loading model {model_name}")

    # Enable sparse mode: switch profiling off and load the thresholds
    # before the model itself is instantiated.
    set_profile_mode(False)
    thresholds_file = f'{threshold_path}/thresholds_0_8.pt'
    load_thresholds(thresholds_file, use_average=True)

    load_kwargs = dict(
        device_map=device_map,
        use_cache=True,
        torch_dtype=torch.float16,
    )
    model = MixtralForCausalLM.from_pretrained(model_name, **load_kwargs)

    tokenizer = AutoTokenizer.from_pretrained(model_name)
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.padding_side = "left"
    return model, tokenizer

# Look up the checkpoint location and the threshold directory in ../path.json.
model_name = 'mixtral'
threshold_path_name = 'chess_up_threshold'
with open('../path.json', 'r') as f:
    path = json.load(f)
model_name = path['mixtral']
threshold_path = path[threshold_path_name]

# Device map that is handed to from_pretrained inside _load_model.
with open('../quantize/device_map_1.json', 'r') as f:
    device_map = json.load(f)

model, tokenizer = _load_model(
    model_name=model_name,
    threshold_path=threshold_path,
    device_map=device_map,
)

  from .autonotebook import tqdm as notebook_tqdm


Loading model /home/bcds/venv/dilab/Mixtral-8x7B-v0.1
Set profile_threshold to False


  up_th = torch.load(threshold_path, map_location='cuda')["up_proj_states_thresholds_2"]


Thresholds loaded from /home/bcds/On-the-Fly_MoE_Inference/saving/threshold/c4_mixtral_up/thresholds_0_8.pt


Loading checkpoint shards: 100%|██████████| 19/19 [00:20<00:00,  1.09s/it]


### 加载数据集

In [2]:
import numpy as np
import torch

# Imports come first so this cell does not depend on `torch` having been
# imported by an earlier cell.
# The loaded object is indexed by split name further down (datasets[split]).
datasets = torch.load('../saving/threshold/chess/datasets.pt')
def get_batch(data, batch_size, block_size):
    """Sample random contiguous windows from ``data`` plus their one-step shifts.

    Args:
        data: Sequence that supports ``len`` and slicing into NumPy arrays
            (each slice is cast with ``.astype(np.int64)``).
        batch_size: Number of windows to draw.
        block_size: Length of every window.

    Returns:
        ``(x, y)``: two int64 tensors of shape ``(batch_size, block_size)``.
        Row ``b`` of ``y`` is the window of ``x`` moved one position forward.
    """
    # Start offsets are drawn from [0, len(data) - block_size), so the shifted
    # window (which ends one element later) still fits inside ``data``.
    offsets = torch.randint(len(data) - block_size, (batch_size,))

    def _window(begin):
        chunk = data[begin:begin + block_size]
        return torch.from_numpy(chunk.astype(np.int64))

    x = torch.stack([_window(offset) for offset in offsets])
    y = torch.stack([_window(offset + 1) for offset in offsets])
    return x, y

  datasets = torch.load('../saving/threshold/chess/datasets.pt')


### 保存激活值和专家路由的数据集

In [None]:
from tqdm import trange
sparsity_level = 0.8
device_2 = 'cpu'
avg_loss = 0.0
n_batch = 64      # total number of batches pushed through the model
accum_steps = 2   # batches processed per progress-bar step
batch_size = 1
block_size = 2048
torch.manual_seed(42)
n_layers = len(model.model.layers)
n_experts = len(model.model.layers[0].block_sparse_moe.experts)
split = 'train'

# Forward passes only. The data saved below is read from attributes on each
# block_sparse_moe module after these passes have run.
with torch.no_grad():
    for step in trange(n_batch // accum_steps):
        for batch_idx in range(accum_steps):
            # `labels` returned by get_batch is not used: the inputs themselves
            # are passed as labels.
            # NOTE(review): presumably the model shifts labels internally the
            # way Hugging Face causal LMs do; confirm in modeling_mixtral.
            inputs, labels = get_batch(datasets[split], batch_size, block_size)
            inputs = inputs.cuda()
            outputs = model(inputs, labels=inputs)
            avg_loss = avg_loss + outputs.loss / n_batch

# torch.save does not create missing parent directories, so make sure the
# output directory exists before writing.
os.makedirs('merge', exist_ok=True)

for layer_idx in range(1, n_layers):
    # Pair the previous layer's `activations` with this layer's `gate_logits`,
    # element by element, as (input, target) samples.
    d = list(zip(
        model.model.layers[layer_idx - 1].block_sparse_moe.activations,
        model.model.layers[layer_idx].block_sparse_moe.gate_logits,
    ))

    torch.save(d, f'merge/a2ef_{layer_idx}.pth')
    print(f'saving merge/a2ef_{layer_idx}.pth done')

### 专家预测器

In [3]:
import torch.nn as nn
import torch.optim as optim

from torch.cuda.amp import GradScaler, autocast  # 用于混合精度训练
import torch.nn.functional as F
import torch.nn.init as init
def top_k_position_accuracy_unordered(output, target, k=1):
    """Count top-k index matches between ``output`` and ``target``, ignoring order.

    For every row, the indices of the ``k`` largest values of ``output`` are
    compared with the indices of the ``k`` largest values of ``target``; the
    size of the intersection of the two index sets is accumulated over the
    batch.

    Args:
        output: 2-D tensor of predicted scores, one row per sample.
        target: 2-D tensor of reference scores with the same number of rows.
        k: Number of top positions compared per row.

    Returns:
        ``(correct_counts, total)`` where ``correct_counts`` is the number of
        matching indices summed over all rows (a Python int) and ``total`` is
        ``batch_size * k``. The accuracy is ``correct_counts / total``; the
        division is left to the caller so counts can be pooled across batches.
    """
    with torch.no_grad():
        # Indices of the k largest entries per row for prediction and reference.
        _, topk_pred_indices = output.topk(k, 1, True)
        _, topk_target_indices = target.topk(k, 1, True)
        batch_size = output.size(0)

        # Compare every predicted index with every target index: (batch, k, k).
        # Indices inside one row are distinct, so counting the predicted
        # indices that appear anywhere in the target row equals the size of
        # the set intersection, without a per-row Python loop.
        matches = topk_pred_indices.unsqueeze(2) == topk_target_indices.unsqueeze(1)
        correct_counts = int(matches.any(dim=2).sum().item())

        return correct_counts, batch_size * k

def eval_model(model, val_loader,):
    """Print the unordered top-1 and top-2 position accuracy of ``model``.

    Every batch from ``val_loader`` is moved to ``"cuda"``, passed through
    ``model`` under autocast, and scored with
    ``top_k_position_accuracy_unordered`` for ``k=1`` and ``k=2``. Counts are
    pooled over all batches before the ratio is taken.

    Args:
        model: Module called as ``model(inputs)``; it is put into eval mode.
        val_loader: Iterable yielding ``(inputs, targets)`` tensor pairs.

    Returns:
        None. The two accuracies are printed. If the loader yields no samples,
        a notice is printed instead of dividing by zero.
    """
    model.eval()
    total_topk_accuracy_1 = 0
    total_topk_accuracy_2 = 0
    len1 = 0
    len2 = 0
    with torch.no_grad():
        for inputs, targets in val_loader:
            inputs, targets = inputs.to("cuda"), targets.to("cuda")
            with autocast():
                outputs = model(inputs)
            # Each call returns (matching indices, number of compared positions).
            correct_1, count_1 = top_k_position_accuracy_unordered(outputs, targets, k=1)
            correct_2, count_2 = top_k_position_accuracy_unordered(outputs, targets, k=2)
            total_topk_accuracy_1 += correct_1
            total_topk_accuracy_2 += correct_2
            len1 += count_1
            len2 += count_2

    # An empty loader leaves the denominators at zero; report it rather than
    # raising ZeroDivisionError.
    if len1 == 0 or len2 == 0:
        print('No samples were evaluated; top-k accuracy is undefined.')
        return

    avg_topk_accuracy_1 = total_topk_accuracy_1 / len1
    avg_topk_accuracy_2 = total_topk_accuracy_2 / len2
    print(f'Validation Top-{1} Position Accuracy (unordered): {avg_topk_accuracy_1:.4f}')
    print(f'Validation Top-{2} Position Accuracy (unordered): {avg_topk_accuracy_2:.4f}')


In [7]:
import torch
from torch.utils.data import DataLoader, Dataset, random_split

class CustomDataset(Dataset):
    """In-memory dataset of ``(x, y)`` pairs concatenated from several files.

    Every file in ``file_paths`` is read with ``torch.load`` and must contain
    an iterable of two-element items; the items of all files are kept in one
    flat list, in file order.
    """

    def __init__(self, file_paths):
        # Flatten the contents of all files into a single list of samples.
        self.data = []
        for file_path in file_paths:
            self.data.extend(torch.load(file_path))

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        # Hand out detached copies so callers cannot modify the stored tensors.
        features, target = self.data[idx]
        return features.detach().clone(), target.detach().clone()


for i in range(31, 32):
    print("layer ", i)
    dataset = CustomDataset(file_paths=[f'merge/a2ef_{i}.pth'])

    # 80/20 train/validation split. A dedicated seeded generator keeps the
    # split identical across kernel restarts and independent of whatever
    # consumed the global RNG before this cell ran.
    train_size = int(0.8 * len(dataset))
    val_size = len(dataset) - train_size
    split_generator = torch.Generator().manual_seed(42)
    train_dataset, val_dataset = random_split(
        dataset, [train_size, val_size], generator=split_generator
    )
    train_loader = DataLoader(train_dataset, batch_size=2048, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=2048, shuffle=False)

    # Score the layer's own gate on the stored samples.
    # NOTE(review): this passes train_loader, while eval_model labels its
    # printout "Validation"; confirm which split is intended.
    gate_router = model.model.layers[i].block_sparse_moe.gate
    eval_model(gate_router, train_loader,)

layer  31


  file_data = torch.load(file_path)
  with autocast():


Validation Top-1 Position Accuracy (unordered): 0.8144
Validation Top-2 Position Accuracy (unordered): 0.8049


In [2]:
import torch.nn as nn
import torch.optim as optim

from torch.cuda.amp import GradScaler, autocast  # 用于混合精度训练
import torch.nn.functional as F
import torch.nn.init as init

class SimpleLinearModel(nn.Module):
    """Two-layer perceptron: ``Linear -> SiLU -> Linear``.

    Args:
        input_dim: Size of the input features.
        output_dim: Size of the output.
        hidden_dim: Width of the hidden layer (default 32).

    Both linear layers get Kaiming-normal weights (``mode='fan_out'``,
    ``nonlinearity='relu'``) and zero biases.
    """

    def __init__(self,input_dim,output_dim,hidden_dim=32):
        super().__init__()
        self.linear1 = nn.Linear(input_dim, hidden_dim)   # input_dim -> hidden_dim
        self.activation = nn.SiLU()
        self.linear2 = nn.Linear(hidden_dim, output_dim)  # hidden_dim -> output_dim

        # Re-initialise both layers the same way, in definition order.
        for layer in (self.linear1, self.linear2):
            init.kaiming_normal_(layer.weight, mode='fan_out', nonlinearity='relu')
            init.zeros_(layer.bias)

    def forward(self, x):
        hidden = self.activation(self.linear1(x))
        return self.linear2(hidden)
        
# Build the predictor: 4096 input features -> 128 hidden units -> 8 outputs.
# NOTE(review): this rebinds the notebook-global `model`, which the first cell
# binds to the Mixtral model; cells that access `model.model.layers` must be
# run before this one.
model = SimpleLinearModel(input_dim=4096, output_dim=8, hidden_dim=128)
model.to("cuda")  # assumes a GPU is available; as the last expression it displays the module

SimpleLinearModel(
  (linear1): Linear(in_features=4096, out_features=128, bias=True)
  (activation): SiLU()
  (linear2): Linear(in_features=128, out_features=8, bias=True)
)

In [None]:
from torch.utils.tensorboard import SummaryWriter
def train_model(model, train_loader, val_loader, criterion, optimizer, epochs=25):
    """Train ``model`` with mixed precision and print the validation loss per epoch.

    The loss of a batch is ``criterion(log_softmax(model(inputs), dim=1), targets)``
    computed under autocast. The training loss of every step is written to the
    module-level ``writer`` under the tag ``'Loss/Train'``; ``writer`` must
    exist before this function is called.

    NOTE(review): when ``criterion`` is ``nn.KLDivLoss`` with its default
    ``log_target=False``, PyTorch expects ``targets`` to be probabilities.
    Confirm that the stored targets are not raw logits.

    Args:
        model: Module to train; called as ``model(inputs)``.
        train_loader: Iterable of ``(inputs, targets)`` training batches.
        val_loader: Iterable of ``(inputs, targets)`` validation batches.
        criterion: Loss applied to ``(log_probabilities, targets)``.
        optimizer: Optimizer that updates ``model``'s parameters.
        epochs: Number of passes over ``train_loader``.
    """

    def _batch_loss(batch_inputs, batch_targets):
        # Forward pass and loss in mixed precision; the model output is turned
        # into log-probabilities over dim 1 before it reaches the criterion.
        with autocast():
            log_probs = F.log_softmax(model(batch_inputs), dim=1)
            return criterion(log_probs, batch_targets)

    scaler = GradScaler()
    for epoch in range(epochs):
        # Training pass
        model.train()
        for batch_idx, (inputs, targets) in enumerate(train_loader):
            inputs, targets = inputs.cuda(), targets.cuda()
            optimizer.zero_grad()

            loss = _batch_loss(inputs, targets)

            # Backward runs outside autocast, on the scaled loss.
            scaler.scale(loss).backward()
            writer.add_scalar('Loss/Train', loss.item(), epoch * len(train_loader) + batch_idx)
            # scaler.step applies the optimizer update; scaler.update prepares
            # the scale factor for the next iteration.
            scaler.step(optimizer)
            scaler.update()

        # Validation pass
        model.eval()
        with torch.no_grad():
            val_loss = 0
            for inputs, targets in val_loader:
                inputs, targets = inputs.cuda(), targets.cuda()
                val_loss += _batch_loss(inputs, targets).item()
            print(f'Epoch {epoch+1}, Validation Loss: {val_loss / len(val_loader)}')
            
# Alternative objective: criterion = nn.MSELoss().to("cuda")
criterion = nn.KLDivLoss(reduction='batchmean').to("cuda")
optimizer = optim.Adam(model.parameters(), lr=0.004)  # alternative: lr=5e-5
writer = SummaryWriter('runs/predictor_multilayer')
try:
    train_model(model, train_loader, val_loader, criterion, optimizer, epochs=15)
finally:
    # Flush pending events and release the event file, even if training
    # stops with an exception.
    writer.close()