### 先都加载到cpu上

In [1]:
from transformers import MixtralForCausalLM, AutoTokenizer
import torch
import torch.nn as nn
from typing import Optional
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "2"
import json

def get_model(model_name, device_map, dtype=torch.bfloat16):
    llm = MixtralForCausalLM.from_pretrained(
        model_name,
        device_map=device_map,
        use_cache=True,
        torch_dtype=dtype,
    ) 
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.pad_token_id = tokenizer.eos_token_id

    return llm, tokenizer

with open('../path.json', 'r') as f:
    path = json.load(f)
    model_name = path['mixtral']
    # threshold_path = path[threshold_path_name]

with open('../quantize/device_map_1.json', 'r') as f:
    device_map = json.load(f)

dtype = torch.float16
llm, tokenizer = get_model(model_name, 'cpu', dtype=dtype)

  from .autonotebook import tqdm as notebook_tqdm
Loading checkpoint shards: 100%|██████████| 19/19 [00:06<00:00,  2.97it/s]


In [6]:
import torch
import torch.nn as nn
import json

with open("../path.json", "r") as f:
    paths = json.load(f)
    up_threshold_path = paths["chess_up_threshold"]

class CachedMLP(nn.Module):
    def __init__(self, input_dim: int, hidden_dim: int, dtype, sparsity: float = 0.2):
        super(CachedMLP, self).__init__()
        self.sparsity = sparsity
        filepath = str(sparsity).replace('.', '_')
        th_path = f'{up_threshold_path}/thresholds_{filepath}.pt'
        self.up_th = torch.load(th_path, map_location='cuda')["up_proj_states_thresholds_2"]
        self.activenum = int(sparsity * hidden_dim)  # 根据稀疏阈值计算激活的维度
        self.input_dim = input_dim
        self.hidden_dim = hidden_dim
        self.dtype = dtype

        # 直接存储三个二维张量
        self.w1 = None  # 形状: [activenum, input_dim]
        self.w2 = None  # 形状: [input_dim, activenum]
        self.w3 = None  # 形状: [activenum, input_dim]

        self.activation = nn.GELU()

    def load_from_cpu(self, cpu_mlp, x: torch.Tensor = None, layer_id=0, expert_id=0):
        """
        从 CPU 上的 MLP 加载参数到 GPU 上的缓存 MLP。
        如果是 prefill 阶段（x.size(1) > 1），加载全部参数。
        如果是 decode 阶段，根据 w3 * x 和 up_th 的大小关系，稀疏化加载参数。
        """
        # 获取 CPU 上的参数
        w1_weight = cpu_mlp['w1'].weight.data  # 形状: [hidden_dim, input_dim]
        w2_weight = cpu_mlp['w2'].weight.data  # 形状: [input_dim, hidden_dim]
        w3_weight = cpu_mlp['w3'].weight.data  # 形状: [hidden_dim, input_dim]

        # if layer_id == 0:
        #     print(f'in expert{expert_id} ',x.size())
        if x.size(0) == 1:  # decode 阶段
            # if layer_id == 0:
            # print(f"in {expert_id} decode load from cpu")
            # 计算 w3 * x
            w3_output = torch.matmul(x, w3_weight.T)  # 形状: [batch_size, activenum]

            # 根据 w3_output 和 up_th 确定需要稀疏化的神经元位置
            threshold = self.up_th[layer_id][expert_id]
            active_mask = w3_output.abs() > threshold  # 形状: [batch_size, activenum]
            active_indices = torch.where(active_mask.any(dim=0))[0]  # 按列筛选，形状: [num_active]

            # 限制为 activenum 个
            # active_indices = active_indices[:self.activenum]
            active_indices = active_indices.to('cpu')

            # 对于 w1，选取第二个维度（hidden_dim）中大于阈值的部分
            sparse_w1 = w1_weight[active_indices, :]  # 形状: [activenum, input_dim]

            # 对于 w2，选取第一个维度（input_dim）的前 activenum 个
            sparse_w2 = w2_weight[:, active_indices]  # 形状: [input_dim, activenum]

            # 将稀疏化后的参数上传到 GPU
            self.w1 = sparse_w1.to('cuda')
            self.w2 = sparse_w2.to('cuda')

            return w3_output[:, active_indices]
        else:  # prefill 阶段
            # if layer_id == 0 and expert_id == 0:
            #     print("in prefill load from cpu")
            # 加载全部参数
            self.w1 = w1_weight.to('cuda')
            self.w2 = w2_weight.to('cuda')
            self.w3 = w3_weight.to('cuda')
            return None

    def forward(self, x: torch.Tensor, cpu_mlp=None, layer_id=0, expert_id=0) -> torch.Tensor:
        """
        前向传播逻辑。
        如果是 prefill 阶段，直接计算。
        如果是 decode 阶段，先加载稀疏化参数，再计算。
        """
        # 确保输入在 GPU 上
        x = x.to('cuda')
        if x.size(0) > 1:  # prefill 阶段
            # 加载全部参数
            self.load_from_cpu(cpu_mlp, x, layer_id=layer_id, expert_id=expert_id)
            # 计算 w3
            w3_output = torch.matmul(x, self.w3.T)  # 形状: [batch_size, activenum]
        else:  # decode 阶段
            # 加载稀疏化参数并获取 w3_output
            w3_output = self.load_from_cpu(cpu_mlp, x, layer_id, expert_id)

        # 计算 w1
        w1_output = self.activation(torch.matmul(x, self.w1.T))  # 形状: [batch_size, activenum]
        # 计算 w2
        x = torch.matmul(w1_output * w3_output, self.w2.T)  # 形状: [batch_size, input_dim]

        return x

def convert_mixtral_to_cached_mlp(llm, dtype, sparsity=0.9):
    """
    将 Mixtral 模型的 MLP 层替换为缓存 MLP 的版本。
    """
    ### 其他部分存放在GPU上
    llm.model.embed_tokens.cuda()
    for i in range(32):
        llm.model.layers[i].self_attn.cuda()
        llm.model.layers[i].input_layernorm.cuda()
        llm.model.layers[i].post_attention_layernorm.cuda()
        llm.model.layers[i].block_sparse_moe.gate.cuda()
    llm.model.norm.cuda()
    llm.lm_head.cuda()
    
    # 在 GPU 上缓存一个 MLP 实例
    cached_mlp = CachedMLP(
        input_dim=llm.config.hidden_size,
        hidden_dim=llm.config.intermediate_size,
        dtype=dtype,
        sparsity=sparsity
    )

    # 遍历每一层的 block_sparse_moe.experts，将 w3 加载到 GPU
    for i in range(len(llm.model.layers)):
        for j in range(len(llm.model.layers[i].block_sparse_moe.experts)):
            # 将 w3 加载到 GPU
            llm.model.layers[i].block_sparse_moe.experts[j].w3.cuda()

            # 保存原始的 w1、w2、w3 层（常驻 CPU）
            llm.model.layers[i].block_sparse_moe.experts[j].cpu_mlp = {
                "w1": llm.model.layers[i].block_sparse_moe.experts[j].w1,
                "w2": llm.model.layers[i].block_sparse_moe.experts[j].w2,
                "w3": llm.model.layers[i].block_sparse_moe.experts[j].w3,
            }

            # 替换为缓存 MLP 的版本
            llm.model.layers[i].block_sparse_moe.experts[j].forward = lambda x, cached_mlp=cached_mlp, cpu_mlp=llm.model.layers[i].block_sparse_moe.experts[j].cpu_mlp,layer_id=i,expert_id=j: cached_mlp_forward(x, cached_mlp, cpu_mlp, layer_id, expert_id)

    return llm

def cached_mlp_forward(x, cached_mlp, cpu_mlp, layer_id = 0, expert_id = 0):
    """
    动态加载 CPU 上的 MLP 参数到缓存的 MLP，并执行前向传播。
    """
    # 使用缓存的 MLP 进行计算
    if x.size(0) == 0:
        return torch.zeros(x.shape, device='cuda')
    output = cached_mlp(x, cpu_mlp, layer_id, expert_id)

    return output

# 将模型转换为缓存 MLP 的版本
llm = convert_mixtral_to_cached_mlp(llm, dtype, sparsity=0.8)

  self.up_th = torch.load(th_path, map_location='cuda')["up_proj_states_thresholds_2"]


### 测试时间开销

In [9]:
import json
from datasets import load_dataset, Dataset
from transformers import GenerationConfig

input_length = 32
MAX_LENGTH =  32
output_length = 32
test_samples = 1

with open("../path.json", "r") as f:
    paths = json.load(f)
    fineweb_path = paths["fineweb"]

def preprocess_data(data, tokenizer):
	# 使用 tokenizer 将文本数据转换为模型输入
	inputs = tokenizer(data, padding="max_length", truncation=True, max_length=MAX_LENGTH, return_tensors="pt")
	inputs["labels"] = inputs.input_ids.clone()
	return inputs

fineweb = load_dataset("parquet",data_files=fineweb_path) #726000
fineweb_text = fineweb['train']['text'][:test_samples] 

for output_length in [1, output_length]:
    print("output length is {}".format(output_length))
    for text in fineweb_text:
        # input_ids = torch.randint(0, 32000, (1, input_length)).cuda()  # 随机生成输入 token IDs
        # attention_mask = torch.ones((1, input_length)).cuda()  # 假设 attention mask
        inputs = preprocess_data(text, tokenizer)

        # 预热（避免第一次运行时的额外开销）
        with torch.no_grad():
            output = llm(input_ids=inputs["input_ids"].cuda(), attention_mask=inputs["attention_mask"].cuda())

        # 测试时间
        start_event = torch.cuda.Event(enable_timing=True)
        end_event = torch.cuda.Event(enable_timing=True)

        # 开始计时
        torch.cuda.synchronize()
        start_event.record()

        # 前向传播
        with torch.no_grad():
            output = llm.generate(
                input_ids=inputs["input_ids"].cuda(),
                attention_mask=inputs["attention_mask"].cuda(),
                max_length=input_length + output_length,  # 总长度为输入长度 + 输出长度
                generation_config=GenerationConfig(do_sample=False)
            )

        # 结束计时
        end_event.record()
        torch.cuda.synchronize()

        # 计算时间
        elapsed_time = start_event.elapsed_time(end_event) / 1000  # 转换为秒
        print(f"Generated output length: {len(output[0]) - input_length}")
        # print(output)
        print(tokenizer.batch_decode(output, skip_special_tokens=True))
        print(f"Time taken: {elapsed_time:.4f} seconds")

output length is 1


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Generated output length: 1
['The Independent Jane\nFor all the love, romance and scandal in Jane Austen’s books, what they are really about is freedom and independence. Independence']
Time taken: 6.8550 seconds
output length is 32


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Generated output length: 32
['The Independent Jane\nFor all the love, romance and scandal in Jane Austen’s books, what they are really about is freedom and independence. Independence to choose your own husband or wife, to have a career, to live where you want, to be who you are.\n\nIn this lively']
Time taken: 25.2694 seconds


In [10]:
(25.2694 - 6.8550) / 31 

0.5940129032258065

#### v0版本

In [None]:
class CachedMLP(nn.Module):
    def __init__(self, input_dim: int, hidden_dim: int, dtype):
        super(CachedMLP, self).__init__()
        # 定义 w1、w2、w3 三个线性层
        self.w1 = nn.Linear(input_dim, hidden_dim, bias=False, dtype=dtype)
        self.w2 = nn.Linear(hidden_dim, input_dim, bias=False, dtype=dtype)
        self.w3 = nn.Linear(input_dim, hidden_dim, bias=False, dtype=dtype)
        self.activation = nn.GELU()

        # 将 MLP 缓存在 GPU 上
        self.cuda()

    def load_from_cpu(self, cpu_mlp):
        """
        从 CPU 上的 MLP 加载参数到 GPU 上的缓存 MLP。
        """
        # 将 CPU 上的参数复制到 GPU 上的缓存 MLP
        # print(cpu_mlp)
        # print(cpu_mlp.w1.state_dict())
        self.w1.load_state_dict(cpu_mlp['w1'].state_dict())
        self.w2.load_state_dict(cpu_mlp['w2'].state_dict())
        self.w3.load_state_dict(cpu_mlp['w3'].state_dict())

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # 确保输入在 GPU 上
        x = x.to('cuda')
        # 计算 w1 和 w3
        # print(self.w1.type, x.type)
        w1_output = self.activation(self.w1(x))
        w3_output = self.w3(x)
        # 计算 w2
        x = self.w2(w1_output * w3_output)
        return x

def convert_mixtral_to_cached_mlp(llm, dtype):
    """
    将 Mixtral 模型的 MLP 层替换为缓存 MLP 的版本。
    """
    ### 其他部分存放在GPU上
    llm.model.embed_tokens.cuda()
    for i in range(32):
        llm.model.layers[i].self_attn.cuda()
        llm.model.layers[i].input_layernorm.cuda()
        llm.model.layers[i].post_attention_layernorm.cuda()
        llm.model.layers[i].block_sparse_moe.gate.cuda()
    llm.model.norm.cuda()
    llm.lm_head.cuda()
    
    # 在 GPU 上缓存一个 MLP 实例
    cached_mlp = CachedMLP(
        input_dim=llm.config.hidden_size,
        hidden_dim=llm.config.intermediate_size,
        dtype=dtype,
    )

    # 遍历每一层的 block_sparse_moe.experts
    for i in range(len(llm.model.layers)):
        for j in range(len(llm.model.layers[i].block_sparse_moe.experts)):
            # 保存原始的 w1、w2、w3 层（常驻 CPU）
            llm.model.layers[i].block_sparse_moe.experts[j].cpu_mlp = {
                "w1": llm.model.layers[i].block_sparse_moe.experts[j].w1,
                "w2": llm.model.layers[i].block_sparse_moe.experts[j].w2,
                "w3": llm.model.layers[i].block_sparse_moe.experts[j].w3,
            }

            # 替换为缓存 MLP 的版本
            llm.model.layers[i].block_sparse_moe.experts[j].forward = lambda x, cached_mlp=cached_mlp, cpu_mlp=llm.model.layers[i].block_sparse_moe.experts[j].cpu_mlp: cached_mlp_forward(x, cached_mlp, cpu_mlp)

    return llm

def cached_mlp_forward(x, cached_mlp, cpu_mlp):
    """
    动态加载 CPU 上的 MLP 参数到缓存的 MLP，并执行前向传播。
    """
    # 从 CPU 上传参数到缓存的 MLP
    cached_mlp.load_from_cpu(cpu_mlp)

    # 使用缓存的 MLP 进行计算
    output = cached_mlp(x)

    # 将缓存的 MLP 参数清空（可选）
    # cached_mlp.load_from_cpu({
    #     "w1": nn.Linear(cached_mlp.w1.in_features, cached_mlp.w1.out_features).cpu(),
    #     "w2": nn.Linear(cached_mlp.w2.in_features, cached_mlp.w2.out_features).cpu(),
    #     "w3": nn.Linear(cached_mlp.w3.in_features, cached_mlp.w3.out_features).cpu(),
    # })

    return output

# 将模型转换为缓存 MLP 的版本
llm = convert_mixtral_to_cached_mlp(llm, dtype)