### 先都加载到cpu上

In [2]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "3"

from modeling_mixtral import MixtralForCausalLM
from transformers import AutoTokenizer
import torch
import torch.nn as nn
from typing import Optional
import json

def get_model(model_name, device_map, dtype=torch.bfloat16):
    llm = MixtralForCausalLM.from_pretrained(
        model_name,
        device_map=device_map,
        use_cache=True,
        torch_dtype=dtype,
    ) 
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.pad_token_id = tokenizer.eos_token_id

    return llm, tokenizer

with open('../path.json', 'r') as f:
    path = json.load(f)
    model_name = path['mixtral']
    # threshold_path = path[threshold_path_name]

with open('../quantize/device_map_1.json', 'r') as f:
    device_map = json.load(f)

dtype = torch.float16
llm, tokenizer = get_model(model_name, 'cpu', dtype=dtype)

  from .autonotebook import tqdm as notebook_tqdm
Loading checkpoint shards: 100%|██████████| 19/19 [00:05<00:00,  3.22it/s]


In [3]:
from typing import Tuple, Optional
import torch
import torch.nn as nn
import threading
import json
from queue import Queue

class CachedMLP(nn.Module):
    def __init__(self, input_dim: int, hidden_dim: int, dtype, sparsity: float = 0.2):
        super(CachedMLP, self).__init__()
        self.sparsity = sparsity
        self.activenum = int((1 - sparsity) * hidden_dim)
        self.input_dim = input_dim
        self.hidden_dim = hidden_dim
        self.dtype = dtype

        # GPU 缓存张量
        self.w1_gpu = torch.empty((self.activenum, self.input_dim), dtype=self.dtype, device='cuda')
        self.w2_gpu = torch.empty((self.input_dim, self.activenum), dtype=self.dtype, device='cuda')
        self.w3_gpu = torch.empty((self.activenum, self.input_dim), dtype=self.dtype, device='cuda')

        self.activation = nn.GELU()

        # Pinned Memory 缓冲区
        self.register_buffer('sparse_w1_cpu', torch.empty((self.activenum, self.input_dim), dtype=self.dtype, device='cpu'))
        self.register_buffer('sparse_w2_cpu', torch.empty((self.input_dim, self.activenum), dtype=self.dtype, device='cpu'))
        self.register_buffer('sparse_w3_cpu', torch.empty((self.activenum, self.input_dim), dtype=self.dtype, device='cpu'))
        self.sparse_w1_cpu = self.sparse_w1_cpu.pin_memory()
        self.sparse_w2_cpu = self.sparse_w2_cpu.pin_memory()
        self.sparse_w3_cpu = self.sparse_w3_cpu.pin_memory()

        # 统计信息
        self.load_from_cpu_time = 0.0
        self.load_from_cpu_calls = 0

    def load_from_cpu(self, cpu_mlp, stream: torch.cuda.Stream):
        """
        从CPU加载参数，并使用指定的CUDA流进行异步复制到GPU。
        
        参数:
            cpu_mlp: 包含CPU上参数的字典。
            stream: 用于数据传输的CUDA流。
        """
        # 从CPU加载参数
        self.sparse_w1_cpu.copy_(cpu_mlp['w1'].data[:self.activenum, :])
        self.sparse_w2_cpu.copy_(cpu_mlp['w2'].data[:, :self.activenum])
        self.sparse_w3_cpu.copy_(cpu_mlp['w3'].data[:self.activenum, :])

        # 异步复制到GPU
        with torch.cuda.stream(stream):
            self.w1_gpu.copy_(self.sparse_w1_cpu, non_blocking=True)
            self.w2_gpu.copy_(self.sparse_w2_cpu, non_blocking=True)
            self.w3_gpu.copy_(self.sparse_w3_cpu, non_blocking=True)

    def forward(self, x: torch.Tensor, cpu_mlp=None) -> torch.Tensor:
        if not x.is_cuda:
            raise ValueError("输入张量必须在CUDA设备上。")

        # if cpu_mlp:
        #     self.load_from_cpu(cpu_mlp)

        w3_output = torch.matmul(x, self.w3_gpu.T)
        w1_output = self.activation(torch.matmul(x, self.w1_gpu.T))
        w2 = self.w2_gpu.T
        x = torch.matmul(w1_output * w3_output, w2)

        return x

    def get_load_from_cpu_stats(self):
        if self.load_from_cpu_calls == 0:
            return 0.0, 0.0
        avg_time = self.load_from_cpu_time / self.load_from_cpu_calls
        return self.load_from_cpu_time, avg_time

    def clear_load_from_cpu_stats(self):
        self.load_from_cpu_time = 0.0
        self.load_from_cpu_calls = 0

def convert_mixtral_to_cached_mlp(llm, dtype, sparsity=0.9):
    ### 其他部分存放在GPU上
    llm.model.embed_tokens.cuda()
    for i in range(32):
        llm.model.layers[i].self_attn.cuda()
        llm.model.layers[i].input_layernorm.cuda()
        llm.model.layers[i].post_attention_layernorm.cuda()
        llm.model.layers[i].block_sparse_moe.gate.cuda()
    llm.model.norm.cuda()
    llm.lm_head.cuda()
    cached_mlps = []
    
    for i, layer in enumerate(llm.model.layers):
        # 创建每个层的CachedMLP
        cached_mlp = CachedMLP(
            input_dim=llm.config.hidden_size,
            hidden_dim=llm.config.intermediate_size,
            dtype=dtype,
            sparsity=sparsity
        )
        cached_mlps.append(cached_mlp)
        
        for j, expert in enumerate(layer.block_sparse_moe.experts):
            expert.cpu_mlp = {
                "w1": expert.w1.cpu().weight,
                "w2": expert.w2.cpu().weight,
                "w3": expert.w3.cpu().weight,
            }
            # 替换forward方法
            expert.forward = lambda x, cached_mlp=cached_mlp, cpu_mlp=expert.cpu_mlp: cached_mlp(x, cpu_mlp)
    
    return llm, cached_mlps

class PipelineLLM:
    def __init__(self, llm, cached_mlps):
        """
        初始化 PipelineLLM，替换模型每一层的 forward 方法。
        
        参数:
            llm: 原始的大模型
            cached_mlps: 每一层对应的 CachedMLP 实例列表
        """
        self.llm = llm
        self.cached_mlps = cached_mlps
        self.num_layers = len(cached_mlps)
        self.lock = threading.Lock()
        
        # 创建两个共享的CUDA流
        self.stream_queue = Queue(maxsize=2)
        self.stream_queue.put(torch.cuda.Stream())
        self.stream_queue.put(torch.cuda.Stream())
        
        self._replace_forward_methods()
    
    def _load_device_map(self, path):
        """
        读取 device_map.json 文件并返回设备映射字典。
        """
        with open(path, 'r') as f:
            device_map = json.load(f)
        return device_map
            
    def _replace_forward_methods(self):
        """
        替换模型每一层的 forward 方法，添加参数预加载逻辑和注意力计算。
        """
        for i, layer in enumerate(self.llm.model.layers):
            cached_mlp = self.cached_mlps[i]

            def new_forward(hidden_states: torch.Tensor,
                            attention_mask: Optional[torch.Tensor] = None,
                            position_ids: Optional[torch.LongTensor] = None,
                            past_key_value: Optional[Tuple[torch.Tensor]] = None,
                            output_attentions: Optional[bool] = False,
                            output_router_logits: Optional[bool] = False,
                            use_cache: Optional[bool] = False,
                            cache_position: Optional[torch.LongTensor] = None,
                            cached_mlp=cached_mlp, layer=layer, i=i):
                residual = hidden_states
                hidden_states = layer.input_layernorm(hidden_states)

                # Self Attention
                hidden_states, self_attn_weights, present_key_value = layer.self_attn(
                    hidden_states=hidden_states,
                    attention_mask=attention_mask,
                    position_ids=position_ids,
                    past_key_value=past_key_value,
                    output_attentions=output_attentions,
                    use_cache=use_cache,
                    cache_position=cache_position,
                )
                hidden_states = residual + hidden_states

                # Fully Connected
                residual = hidden_states
                hidden_states = layer.post_attention_layernorm(hidden_states)

                # 如果不是最后一层，预加载下一层的参数
                if i < self.num_layers - 1:
                    next_cached_mlp = self.cached_mlps[i + 1]
                    next_cpu_mlp = self.llm.model.layers[i + 1].block_sparse_moe.experts[0].cpu_mlp
                    # 异步复制到指定设备，使用共享的流
                    threading.Thread(target=self._async_load, args=(next_cached_mlp, next_cpu_mlp, 'cuda')).start()
                
                # hidden_states = layer.block_sparse_moe(hidden_states)
                batch_size, sequence_length, hidden_dim = hidden_states.shape
                hidden_states = hidden_states.view(-1, hidden_dim)
                # 仅使用第一个专家
                expert_layer = layer.block_sparse_moe.experts[0]
                final_hidden_states = expert_layer(hidden_states, cached_mlp)
                final_hidden_states = final_hidden_states.reshape(batch_size, sequence_length, hidden_dim)
                
                hidden_states = residual + final_hidden_states

                outputs = (hidden_states,)

                if output_attentions:
                    outputs += (self_attn_weights,)

                if use_cache:
                    outputs += (present_key_value,)

                return outputs
            # 替换 forward 方法
            layer.forward = new_forward

    def _async_load(self, cached_mlp, cpu_mlp, device):
        """
        异步加载 MLP 参数到指定设备，使用共享的CUDA流。
        """
        # 获取一个可用的流
        stream = self.stream_queue.get()
        try:
            with torch.cuda.stream(stream):
                cached_mlp.load_from_cpu(cpu_mlp, stream)
            # 等待数据传输完成
            torch.cuda.current_stream().wait_stream(stream)
        finally:
            # 将流归还到队列
            self.stream_queue.put(stream)

# 将模型转换为使用CachedMLP的版本
llm, cached_mlps = convert_mixtral_to_cached_mlp(llm, dtype, sparsity=0.8)

# 创建流水线模型
pipeline_llm = PipelineLLM(llm, cached_mlps).llm


### 测试时间开销

In [4]:
import json
from datasets import load_dataset, Dataset
from transformers import GenerationConfig

input_length = 32
MAX_LENGTH = input_length
output_length = 32
test_samples = 8

with open("../path.json", "r") as f:
    paths = json.load(f)
    fineweb_path = paths["fineweb"]

def preprocess_data(data, tokenizer):
	# 使用 tokenizer 将文本数据转换为模型输入
	inputs = tokenizer(data, padding="max_length", truncation=True, max_length=MAX_LENGTH, return_tensors="pt")
	inputs["labels"] = inputs.input_ids.clone()
	return inputs

fineweb = load_dataset("parquet",data_files=fineweb_path) #726000
fineweb_text = fineweb['train']['text'][:test_samples] 

prefill_time, decode_time = 0, 0
for output_length in [1, output_length]:
    print("output length is {}".format(output_length))
    for text in fineweb_text:
        # input_ids = torch.randint(0, 32000, (1, input_length)).cuda()  # 随机生成输入 token IDs
        # attention_mask = torch.ones((1, input_length)).cuda()  # 假设 attention mask
        inputs = preprocess_data(text, tokenizer)

        # 预热（避免第一次运行时的额外开销）
        with torch.no_grad():
            output = llm(input_ids=inputs["input_ids"].cuda(), attention_mask=inputs["attention_mask"].cuda())

        # 测试时间
        start_event = torch.cuda.Event(enable_timing=True)
        end_event = torch.cuda.Event(enable_timing=True)

        # cached_mlp.clear_load_from_cpu_stats()
        # 开始计时
        torch.cuda.synchronize()
        start_event.record()

        # 前向传播
        with torch.no_grad():
            output = llm.generate(
                input_ids=inputs["input_ids"].cuda(),
                attention_mask=inputs["attention_mask"].cuda(),
                max_length=input_length + output_length,  # 总长度为输入长度 + 输出长度
                generation_config=GenerationConfig(do_sample=False),
                pad_token_id=tokenizer.eos_token_id
            )

        # 结束计时
        end_event.record()
        torch.cuda.synchronize()

        # 计算时间
        elapsed_time = start_event.elapsed_time(end_event) / 1000  # 转换为秒
        # print(f"Generated output length: {len(output[0]) - input_length}")
        # print(output)
        # print(tokenizer.batch_decode(output, skip_special_tokens=True))
        print(f"Time taken: {elapsed_time:.4f} seconds")
        if output_length == 1:
            prefill_time += elapsed_time
        else:
            decode_time += elapsed_time
            # total_time, avg_time = cached_mlp.get_load_from_cpu_stats()
            # print(f"Total time spent in load_from_cpu: {total_time/1000:.4f} s")

timepertoken = (decode_time - prefill_time) / (output_length-1) / test_samples
print("decode time:", '{:.4f}'.format((decode_time - prefill_time) /test_samples), ' s')
print("decode phase speed:", '{:.4f}'.format(1/timepertoken) , ' token/s')

output length is 1
Time taken: 0.1289 seconds
Time taken: 0.0751 seconds
Time taken: 0.1243 seconds
Time taken: 0.1018 seconds
Time taken: 0.0810 seconds
Time taken: 0.0764 seconds
Time taken: 0.1409 seconds
Time taken: 0.1059 seconds
output length is 32
Time taken: 3.7325 seconds
Time taken: 3.5565 seconds
Time taken: 4.2298 seconds
Time taken: 4.1699 seconds
Time taken: 4.4598 seconds
Time taken: 4.3867 seconds
Time taken: 4.9485 seconds
Time taken: 5.0861 seconds
decode time: 4.2169  s
decode phase speed: 7.3513  token/s


In [1]:
#### GPU版本
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "3,4"

from modeling_mixtral import MixtralForCausalLM
from transformers import AutoTokenizer
import torch
import torch.nn as nn
from typing import Optional
import json

def get_model(model_name, device_map, dtype=torch.bfloat16):
    llm = MixtralForCausalLM.from_pretrained(
        model_name,
        device_map=device_map,
        use_cache=True,
        torch_dtype=dtype,
    ) 
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.pad_token_id = tokenizer.eos_token_id

    return llm, tokenizer

with open('../path.json', 'r') as f:
    path = json.load(f)
    model_name = path['mixtral']

with open('../quantize/device_map_1.json', 'r') as f:
    device_map = json.load(f)

dtype = torch.float16
llm, tokenizer = get_model(model_name, device_map, dtype=dtype)

  from .autonotebook import tqdm as notebook_tqdm
Loading checkpoint shards: 100%|██████████| 19/19 [00:23<00:00,  1.25s/it]


torch.profile

In [5]:
import json
from datasets import load_dataset, Dataset
from transformers import GenerationConfig

input_length = 32
MAX_LENGTH = input_length
output_length = 2
test_samples = 4

with open("../path.json", "r") as f:
    paths = json.load(f)
    fineweb_path = paths["fineweb"]

def preprocess_data(data, tokenizer):
	# 使用 tokenizer 将文本数据转换为模型输入
	inputs = tokenizer(data, padding="max_length", truncation=True, max_length=MAX_LENGTH, return_tensors="pt")
	inputs["labels"] = inputs.input_ids.clone()
	return inputs

fineweb = load_dataset("parquet",data_files=fineweb_path) #726000
fineweb_text = fineweb['train']['text'][:test_samples] 

print("output length is {}".format(output_length))
text = fineweb_text[0]
inputs = preprocess_data(text, tokenizer)

# cached_mlp.clear_load_from_cpu_stats()
with torch.profiler.profile(
    activities=[
        torch.profiler.ProfilerActivity.CPU,
        torch.profiler.ProfilerActivity.CUDA,
    ]
) as p:
    # 前向传播
    with torch.no_grad():
        output = llm.generate(
            input_ids=inputs["input_ids"].cuda(),
            attention_mask=inputs["attention_mask"].cuda(),
            max_length=input_length + output_length,  # 总长度为输入长度 + 输出长度
            generation_config=GenerationConfig(do_sample=False),
            pad_token_id=tokenizer.eos_token_id
        )
print(p.key_averages().table(
    sort_by="self_cpu_time_total", row_limit=-1))
p.export_chrome_trace("./trace-t.json")

output length is 2
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                       cudaLaunchKernel        25.99%      28.059ms        25.99%      28.059ms       9.369us       0.000us         0.00%       0.000us       0.000us          2995  
                                               aten::mm        12.40%      13.387ms        20.11%      21.714ms      48.254us       8.617ms         9.71%       8.617ms      19.150us       

加载到GPU上

In [1]:
from transformers import MixtralForCausalLM, AutoTokenizer
import torch
import torch.nn as nn
from typing import Optional
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "1,2"
import json

def get_model(model_name, device_map, dtype=torch.bfloat16):
    llm = MixtralForCausalLM.from_pretrained(
        model_name,
        device_map=device_map,
        use_cache=True,
        torch_dtype=dtype,
    ) 
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.pad_token_id = tokenizer.eos_token_id

    return llm, tokenizer

with open('../path.json', 'r') as f:
    path = json.load(f)
    model_name = path['mixtral']
    # threshold_path = path[threshold_path_name]

with open('../quantize/device_map_1.json', 'r') as f:
    device_map = json.load(f)

dtype = torch.float16
llm, tokenizer = get_model(model_name, device_map, dtype=dtype)

  from .autonotebook import tqdm as notebook_tqdm
Loading checkpoint shards: 100%|██████████| 19/19 [00:25<00:00,  1.36s/it]


#### v0版本

In [None]:
class CachedMLP(nn.Module):
    def __init__(self, input_dim: int, hidden_dim: int, dtype):
        super(CachedMLP, self).__init__()
        # 定义 w1、w2、w3 三个线性层
        self.w1 = nn.Linear(input_dim, hidden_dim, bias=False, dtype=dtype)
        self.w2 = nn.Linear(hidden_dim, input_dim, bias=False, dtype=dtype)
        self.w3 = nn.Linear(input_dim, hidden_dim, bias=False, dtype=dtype)
        self.activation = nn.GELU()

        # 将 MLP 缓存在 GPU 上
        self.cuda()

    def load_from_cpu(self, cpu_mlp):
        """
        从 CPU 上的 MLP 加载参数到 GPU 上的缓存 MLP。
        """
        # 将 CPU 上的参数复制到 GPU 上的缓存 MLP
        # print(cpu_mlp)
        # print(cpu_mlp.w1.state_dict())
        self.w1.load_state_dict(cpu_mlp['w1'].state_dict())
        self.w2.load_state_dict(cpu_mlp['w2'].state_dict())
        self.w3.load_state_dict(cpu_mlp['w3'].state_dict())

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # 确保输入在 GPU 上
        x = x.to('cuda')
        # 计算 w1 和 w3
        # print(self.w1.type, x.type)
        w1_output = self.activation(self.w1(x))
        w3_output = self.w3(x)
        # 计算 w2
        x = self.w2(w1_output * w3_output)
        return x

def convert_mixtral_to_cached_mlp(llm, dtype):
    """
    将 Mixtral 模型的 MLP 层替换为缓存 MLP 的版本。
    """
    ### 其他部分存放在GPU上
    llm.model.embed_tokens.cuda()
    for i in range(32):
        llm.model.layers[i].self_attn.cuda()
        llm.model.layers[i].input_layernorm.cuda()
        llm.model.layers[i].post_attention_layernorm.cuda()
        llm.model.layers[i].block_sparse_moe.gate.cuda()
    llm.model.norm.cuda()
    llm.lm_head.cuda()
    
    # 在 GPU 上缓存一个 MLP 实例
    cached_mlp = CachedMLP(
        input_dim=llm.config.hidden_size,
        hidden_dim=llm.config.intermediate_size,
        dtype=dtype,
    )

    # 遍历每一层的 block_sparse_moe.experts
    for i in range(len(llm.model.layers)):
        for j in range(len(llm.model.layers[i].block_sparse_moe.experts)):
            # 保存原始的 w1、w2、w3 层（常驻 CPU）
            llm.model.layers[i].block_sparse_moe.experts[j].cpu_mlp = {
                "w1": llm.model.layers[i].block_sparse_moe.experts[j].w1,
                "w2": llm.model.layers[i].block_sparse_moe.experts[j].w2,
                "w3": llm.model.layers[i].block_sparse_moe.experts[j].w3,
            }

            # 替换为缓存 MLP 的版本
            llm.model.layers[i].block_sparse_moe.experts[j].forward = lambda x, cached_mlp=cached_mlp, cpu_mlp=llm.model.layers[i].block_sparse_moe.experts[j].cpu_mlp: cached_mlp_forward(x, cached_mlp, cpu_mlp)

    return llm

def cached_mlp_forward(x, cached_mlp, cpu_mlp):
    """
    动态加载 CPU 上的 MLP 参数到缓存的 MLP，并执行前向传播。
    """
    # 从 CPU 上传参数到缓存的 MLP
    cached_mlp.load_from_cpu(cpu_mlp)

    # 使用缓存的 MLP 进行计算
    output = cached_mlp(x)

    # 将缓存的 MLP 参数清空（可选）
    # cached_mlp.load_from_cpu({
    #     "w1": nn.Linear(cached_mlp.w1.in_features, cached_mlp.w1.out_features).cpu(),
    #     "w2": nn.Linear(cached_mlp.w2.in_features, cached_mlp.w2.out_features).cpu(),
    #     "w3": nn.Linear(cached_mlp.w3.in_features, cached_mlp.w3.out_features).cpu(),
    # })

    return output

# 将模型转换为缓存 MLP 的版本
llm = convert_mixtral_to_cached_mlp(llm, dtype)