### 先都加载到cpu上

In [1]:
from transformers import MixtralForCausalLM, AutoTokenizer
import torch
import torch.nn as nn
from typing import Optional
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "2"
import json

def get_model(model_name, device_map, dtype=torch.bfloat16):
    """Load a Mixtral causal LM together with its tokenizer.

    Args:
        model_name: Checkpoint identifier or path handed to ``from_pretrained``.
        device_map: Placement spec forwarded to ``from_pretrained``.
        dtype: Torch dtype the weights are loaded in.

    Returns:
        A ``(llm, tokenizer)`` tuple. The tokenizer's padding token is set to
        its end-of-sequence token.
    """
    load_kwargs = {
        "device_map": device_map,
        "use_cache": True,
        "torch_dtype": dtype,
    }
    llm = MixtralForCausalLM.from_pretrained(model_name, **load_kwargs)

    tokenizer = AutoTokenizer.from_pretrained(model_name)
    # Reuse EOS as the padding token (both the string and its id).
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.pad_token_id = tokenizer.eos_token_id

    return llm, tokenizer


# Look up the Mixtral checkpoint location in the shared path registry.
with open('../path.json', 'r') as f:
    path = json.load(f)
model_name = path['mixtral']
# threshold_path = path[threshold_path_name]

# Device map read from the quantize directory. NOTE(review): it is loaded here
# but not passed to get_model below -- confirm whether a later cell needs it.
with open('../quantize/device_map_1.json', 'r') as f:
    device_map = json.load(f)

# Load the whole model onto the CPU first, in half precision.
dtype = torch.float16
llm, tokenizer = get_model(model_name, device_map='cpu', dtype=dtype)

class CachedMLP(nn.Module):
    """Single accelerator-resident gated MLP whose weights are swapped per call.

    The module computes ``w2(act(w1(x)) * w3(x))``. It is meant to be shared:
    before each use, the weights of the expert that is about to run are copied
    in with :meth:`load_from_cpu`.

    Args:
        input_dim: Size of the model's hidden state (input and output width).
        hidden_dim: Width of the intermediate projection.
        dtype: Torch dtype of the three weight matrices.
        device: Device the buffer lives on. Defaults to ``"cuda"``.
    """

    def __init__(self, input_dim: int, hidden_dim: int, dtype, device="cuda"):
        super().__init__()
        # Three bias-free projections: w1 (gate), w3 (up), w2 (down).
        self.w1 = nn.Linear(input_dim, hidden_dim, bias=False, dtype=dtype)
        self.w2 = nn.Linear(hidden_dim, input_dim, bias=False, dtype=dtype)
        self.w3 = nn.Linear(input_dim, hidden_dim, bias=False, dtype=dtype)
        # Mixtral's expert MLP uses SiLU (config.hidden_act == "silu"). Using
        # GELU here would silently change the model's outputs.
        self.activation = nn.SiLU()

        # Keep the buffer on the target device for its whole lifetime.
        self.to(device)

    def load_from_cpu(self, cpu_mlp):
        """Copy an expert's weights into this buffer.

        Args:
            cpu_mlp: Mapping with keys ``"w1"``, ``"w2"`` and ``"w3"`` whose
                values are modules exposing a compatible ``state_dict()``.
        """
        for name in ("w1", "w2", "w3"):
            getattr(self, name).load_state_dict(cpu_mlp[name].state_dict())

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Apply the gated MLP to ``x`` and return the result."""
        # Move the input to wherever the buffer lives instead of assuming CUDA.
        x = x.to(self.w1.weight.device)
        gate = self.activation(self.w1(x))
        up = self.w3(x)
        return self.w2(gate * up)

def convert_mixtral_to_cached_mlp(llm, dtype):
    """Rewire a Mixtral model so all experts run through one shared GPU MLP.

    Embeddings, attention, layer norms, router gates, the final norm and the
    LM head are moved to the GPU. The expert ``w1``/``w2``/``w3`` modules are
    not moved; each expert's ``forward`` is replaced by a call that copies its
    weights into a single ``CachedMLP`` and evaluates that instead.

    Args:
        llm: Mixtral causal LM whose expert weights are not on the GPU.
        dtype: Torch dtype used to allocate the shared ``CachedMLP``.

    Returns:
        The same ``llm`` object, modified in place.
    """
    # Everything except the expert weights is placed on the GPU. Iterate over
    # the actual layer list rather than a hard-coded count so models with a
    # different depth are handled correctly.
    llm.model.embed_tokens.cuda()
    for layer in llm.model.layers:
        layer.self_attn.cuda()
        layer.input_layernorm.cuda()
        layer.post_attention_layernorm.cuda()
        layer.block_sparse_moe.gate.cuda()
    llm.model.norm.cuda()
    llm.lm_head.cuda()

    # One GPU-resident MLP buffer shared by every expert of every layer.
    cached_mlp = CachedMLP(
        input_dim=llm.config.hidden_size,
        hidden_dim=llm.config.intermediate_size,
        dtype=dtype,
    )

    for layer in llm.model.layers:
        for expert in layer.block_sparse_moe.experts:
            # Keep handles to the expert's original projection modules.
            expert.cpu_mlp = {
                "w1": expert.w1,
                "w2": expert.w2,
                "w3": expert.w3,
            }

            # Bind the current expert's weights through default arguments so
            # each replacement forward keeps its own cpu_mlp.
            expert.forward = (
                lambda x, cached_mlp=cached_mlp, cpu_mlp=expert.cpu_mlp:
                cached_mlp_forward(x, cached_mlp, cpu_mlp)
            )

    return llm

def cached_mlp_forward(x, cached_mlp, cpu_mlp):
    """Run one expert through the shared cached MLP.

    Args:
        x: Input passed to ``cached_mlp``.
        cached_mlp: Object exposing ``load_from_cpu`` and being callable.
        cpu_mlp: The expert's weights, handed to ``cached_mlp.load_from_cpu``.

    Returns:
        Whatever ``cached_mlp(x)`` returns.
    """
    # Upload this expert's parameters into the shared buffer first.
    cached_mlp.load_from_cpu(cpu_mlp)

    # The buffer is not cleared after use; it simply keeps the last weights
    # that were loaded into it.
    return cached_mlp(x)

# Convert the model to the cached-MLP version
llm = convert_mixtral_to_cached_mlp(llm, dtype)

  from .autonotebook import tqdm as notebook_tqdm
Loading checkpoint shards: 100%|██████████| 19/19 [00:05<00:00,  3.17it/s]


In [6]:
# Example input
input_length = 32
output_length = 8
input_ids = torch.randint(0, 32000, (1, input_length)).cuda()  # randomly generated input token IDs
attention_mask = torch.ones((1, input_length)).cuda()  # all-ones attention mask

# Warm-up (avoids extra overhead on the first run). Currently disabled, so the
# measurement below includes any first-call cost.
# with torch.no_grad():
#     _ = llm(input_ids=input_ids, attention_mask=attention_mask)

# Measure the time with CUDA events
start_event = torch.cuda.Event(enable_timing=True)
end_event = torch.cuda.Event(enable_timing=True)

# Start timing
torch.cuda.synchronize()
start_event.record()

# Forward pass (generation)
with torch.no_grad():
    output = llm.generate(
        input_ids=input_ids,
        attention_mask=attention_mask,
        max_length=input_length + output_length,  # total length = input length + output length
        num_return_sequences=1,
    )

# Stop timing
end_event.record()
torch.cuda.synchronize()

# Compute the elapsed time
elapsed_time = start_event.elapsed_time(end_event) / 1000  # convert to seconds
print(f"Generated output length: {len(output[0]) - input_length}")
print(f"Time taken: {elapsed_time:.4f} seconds")

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Generated output length: 8
Time taken: 89.0892 seconds
