### 先都加载到cpu上

In [1]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "6,1,2"
os.environ["TOKENIZERS_PARALLELISM"] = "False"
from modeling_mixtral import MixtralForCausalLM
from transformers import AutoTokenizer
import torch
import torch.nn as nn
from typing import Optional
import json
### HQQ量化
from hqq.core.quantize import *
from hqq.models.hf.mixtral import MixtralPatch
import transformers
from hqq.models.base import BaseHQQModel
from accelerate import init_empty_weights
import gemlite
from gemlite import GemLiteLinearTriton 
GemLiteLinearTriton.get_default_gemv = lambda *args, **kwargs: 'GEMV'

class BaseHQQHFModel(BaseHQQModel):
    """HQQ model base that persists the HF config and rebuilds a MixtralForCausalLM from it."""

    @classmethod
    def cache_model(cls, model, save_dir):
        """Save the model architecture by writing ``model.config`` into ``save_dir``."""
        model.config.save_pretrained(save_dir)

    @classmethod
    def create_model(cls, save_dir, kwargs):
        """Create a MixtralForCausalLM from the config stored for ``save_dir``.

        Only the ``attn_implementation`` entry of ``kwargs`` (when present) is
        forwarded to the model constructor. The model is instantiated inside
        ``init_empty_weights()``.
        """
        forwarded = {
            name: kwargs[name]
            for name in ("attn_implementation",)
            if name in kwargs
        }

        config_path = cls.get_config_file(save_dir)
        hf_config = transformers.AutoConfig.from_pretrained(config_path)

        with init_empty_weights():
            return MixtralForCausalLM._from_config(hf_config, **forwarded)

# Concrete HQQ wrapper for Mixtral: inherits from both hqq's MixtralPatch and
# the BaseHQQHFModel defined in this cell, and adds no members of its own.
class MixtralHQQ(MixtralPatch, BaseHQQHFModel):
    pass

# Read the model location and the up-projection threshold file from the shared path.json.
with open('../path.json', 'r') as f:
    path = json.load(f)
    model_name = path['mixtral']
    threshold_path = path['chess_up_threshold']

save_dir = './hqqsaved'
dtype = torch.float16
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Reuse the EOS token as the padding token.
tokenizer.pad_token = tokenizer.eos_token
tokenizer.pad_token_id = tokenizer.eos_token_id
### Load the model from the previously saved quantized weights (onto the CPU)
llm = MixtralHQQ.from_quantized(save_dir, compute_dtype=dtype, device='cpu')
HQQLinear.set_backend(HQQBackend.PYTORCH)

backend       = "gemlite" #'torchao_int4' #"torchao_int4" (4-bit only) or "gemlite" (4-bit + 2-bit)
# Patch the quantized layers for the selected inference backend.
# NOTE(review): this import sits mid-cell; consider moving it to the import block at the top.
from hqq.utils.patching import prepare_for_inference
prepare_for_inference(llm, backend=backend, verbose=True)

# gemlite only: set gemlite's GEMLITE_TRITON_RESTRICT_M flag and load the config
# file that a later cell writes back to the same /tmp path via cache_config().
# NOTE(review): behaviour when the file does not exist yet is not visible here — confirm.
if(backend == 'gemlite'):
	gemlite.core.GEMLITE_TRITON_RESTRICT_M = True
	gemlite.core.GemLiteLinear.load_config('/tmp/gemlite_config.json')

  from .autonotebook import tqdm as notebook_tqdm


using atten... sdpa


  return torch.load(cls.get_weight_file(save_dir), map_location=map_location)
100%|██████████| 32/32 [00:00<00:00, 358.15it/s]
100%|██████████| 32/32 [00:00<00:00, 582.61it/s]
 19%|█▉        | 6/32 [01:13<05:10, 11.93s/it]

In [None]:
# Per-layer device assignment: indices 1..16 -> cuda:1, indices 17..31 -> cuda:2.
# NOTE(review): range(1, 32) leaves layer 0 without an entry; presumably harmless
# because the first `prefill_layers` layers are handled via `device` — confirm.
device_map = {layer_idx: 'cuda:1' if layer_idx <= 16 else 'cuda:2' for layer_idx in range(1, 32)}
from convert import convert_mixtral_to_cached_mlp

prefill_layers = 6  ### number of MLP layers kept resident on the device
# Replaces `llm` with the converted model and also returns the cached MLP objects
# that the pipeline cell consumes.
llm, cached_mlps = convert_mixtral_to_cached_mlp(llm, dtype, sparsity=0.8, backends=backend, 
                                                 device='cuda:0', device_map=device_map, threshold_path = threshold_path, prefill_layers=prefill_layers)

# CUDA device index used by the later cells for inputs and graph buffers.
device_id = 0

  up_th = torch.load(threshold_path, map_location='cuda')["up_proj_states_thresholds"]


Thresholds loaded from /data2/lz/On-the-Fly_MoE_Inference/saving/threshold/c4_mixtral_up/thresholds_0_8.pt
active neural num  2867
active neural num  2867
... loading layer 6 for pipelineLLM
... loading layer 7 for pipelineLLM
... loading layer 8 for pipelineLLM
... loading layer 9 for pipelineLLM
... loading layer 10 for pipelineLLM
... loading layer 11 for pipelineLLM
... loading layer 12 for pipelineLLM
... loading layer 13 for pipelineLLM
... loading layer 14 for pipelineLLM
... loading layer 15 for pipelineLLM
... loading layer 16 for pipelineLLM
... loading layer 17 for pipelineLLM
... loading layer 18 for pipelineLLM
... loading layer 19 for pipelineLLM
... loading layer 20 for pipelineLLM
... loading layer 21 for pipelineLLM
... loading layer 22 for pipelineLLM
... loading layer 23 for pipelineLLM
... loading layer 24 for pipelineLLM
... loading layer 25 for pipelineLLM
... loading layer 26 for pipelineLLM
... loading layer 27 for pipelineLLM
... loading layer 28 for pipelineLL

In [None]:
from pipelinellm import PipelineLLM
# Build the pipelined model on top of the converted llm and its cached MLPs.
# NOTE(review): the meaning of the positional arguments `1, 3` is not visible in
# this notebook — check PipelineLLM's signature before changing them.
PLLM = PipelineLLM(llm, cached_mlps, 1, 3, training_epoch=20,
                   device='cuda:0', device_map=device_map, prefill_layers=prefill_layers, print_layer_info=True) ### use ep

  self.load_state_dict(torch.load(f'../expert_predictor/training/{layer_idx}-{training_epoch}.pth'))


In [4]:
class CUDAGraphRunner():
    """Capture one forward pass of ``model`` in a CUDA graph and replay it.

    The runner owns a fixed input buffer (``graph_input``). ``capture`` records
    ``model(graph_input)`` into a ``torch.cuda.CUDAGraph``; every later call
    copies the new input into that buffer, replays the graph and returns
    ``graph_output``. The same output tensor object is returned on every
    call, so clone it if the values must survive the next replay.

    Args:
        model: Callable taking a single tensor (e.g. one expert module).
        hidden_size: Width of the ``(1, hidden_size)`` input buffer.
        dtype: Dtype of the input buffer.
        device: Device of the input buffer. ``None`` keeps the historical
            behaviour of using the notebook-level ``device_id`` global.
    """

    def __init__(self, model, hidden_size=4096, dtype=torch.float16, device=None):
        self.model = model
        self.cuda_graph = None
        if device is None:
            # Backward-compatible default: rely on the notebook's global device_id.
            device = f'cuda:{device_id}'
        self.graph_input = torch.zeros((1, hidden_size), dtype=dtype, device=device)
        self.graph_output = None

    def capture(self, x,):
        """Record ``model(graph_input)`` into a new CUDA graph, seeded with ``x``.

        May only be called once per runner (asserted). The model should already
        have been run eagerly at least once so that lazy initialisation does not
        happen during capture.
        """
        assert self.cuda_graph is None
        self.graph_input = self.graph_input.copy_(x).to(x.device)
        graph = torch.cuda.CUDAGraph()
        # graph.enable_debug_mode()
        with torch.cuda.graph(graph):
            self.graph_output = self.model(self.graph_input,)
        torch.cuda.synchronize()
        # Publish the graph only after a successful capture so that a failed
        # capture does not leave the runner in a "captured" state.
        self.cuda_graph = graph

    def forward(self, x,):
        """Copy ``x`` into the input buffer, replay the graph, return the output buffer.

        Raises:
            RuntimeError: if ``capture`` has not completed yet.
        """
        if self.cuda_graph is None:
            raise RuntimeError("CUDAGraphRunner.forward() called before capture()")
        self.graph_input.copy_(x)
        self.cuda_graph.replay()
        return self.graph_output

    def __call__(self, *args, **kwargs):
        """Alias for :meth:`forward`."""
        return self.forward(*args, **kwargs)
    
# Random fp16 probe input of shape (1, 4096) on the target GPU.
inp = torch.randn(1, 4096).half().cuda(device_id)
# For every expert of the first `prefill_layers` layers: run it eagerly once,
# capture it in a CUDA graph, and attach the runner to the expert as `.graph`.
for i in range(prefill_layers):
    # The expert count is taken from layer 0 and reused for every layer.
    for j in range(len(llm.model.layers[0].block_sparse_moe.experts)):
        expert=llm.model.layers[i].block_sparse_moe.experts[j]
        # Eager call before capture; its printed result can be compared with the replayed one below.
        print(expert(inp))
        graph_runner = CUDAGraphRunner(expert)
        graph_runner.capture(inp)
        print(graph_runner(inp))

        llm.model.layers[i].block_sparse_moe.experts[j].graph = graph_runner

tensor([[ 0.8022,  0.2603,  0.7651,  ..., -0.0645,  0.4971, -0.7744]],
       device='cuda:0', dtype=torch.float16)
tensor([[ 0.8032,  0.2590,  0.7666,  ..., -0.0640,  0.4919, -0.7734]],
       device='cuda:0', dtype=torch.float16)
tensor([[ 0.2974, -0.9438,  0.7803,  ...,  0.1995,  0.3020,  0.0087]],
       device='cuda:0', dtype=torch.float16)
tensor([[ 0.2961, -0.9458,  0.7803,  ...,  0.1973,  0.3037,  0.0090]],
       device='cuda:0', dtype=torch.float16)
tensor([[ 0.3416, -0.1520,  0.0690,  ...,  0.0269,  0.6045,  0.3049]],
       device='cuda:0', dtype=torch.float16)
tensor([[ 0.3418, -0.1517,  0.0685,  ...,  0.0281,  0.6060,  0.3052]],
       device='cuda:0', dtype=torch.float16)
tensor([[-0.1482,  0.2288,  0.7109,  ...,  0.2812,  0.6899, -0.5537]],
       device='cuda:0', dtype=torch.float16)
tensor([[-0.1476,  0.2305,  0.7114,  ...,  0.2812,  0.6914, -0.5566]],
       device='cuda:0', dtype=torch.float16)
tensor([[ 0.0220,  0.6519, -0.1591,  ...,  0.7368, -0.5747, -0.2932]],
 

### 测试时间开销

In [5]:
import json
# import torch._dynamo.config
from datasets import load_dataset, Dataset
from transformers import GenerationConfig

# Settings for this cell; only MAX_LENGTH is read below (by preprocess_data).
input_length = 2
MAX_LENGTH = 2
output_length = 10
test_samples = 1

# Location of the fineweb data file, taken from the shared path.json.
with open("../path.json", "r") as f:
    paths = json.load(f)
    fineweb_path = paths["fineweb"]

def preprocess_data(data, tokenizer):
	"""Tokenize `data` to exactly MAX_LENGTH tokens and add a `labels` copy of input_ids.

	MAX_LENGTH is read from the notebook's global scope at call time.
	"""
	# Use the tokenizer to turn the text into model inputs
	inputs = tokenizer(data, padding="max_length", truncation=True, max_length=MAX_LENGTH, return_tensors="pt")
	inputs["labels"] = inputs.input_ids.clone()
	return inputs

# The file extension doubles as the `datasets` loader name passed to load_dataset.
filt_type = fineweb_path.split('.')[-1]
fineweb = load_dataset(filt_type, data_files=fineweb_path) #726000
fineweb_text = fineweb['train']['text']

print("warm up ...")
# Warm-up (avoids paying first-run overhead during the timed runs)
for text in fineweb_text[:5]:
    inputs = preprocess_data(text, tokenizer)
    with torch.no_grad():
        output = llm(input_ids=inputs["input_ids"].cuda(device_id), attention_mask=inputs["attention_mask"].cuda(device_id))

warm up ...
in prefill layer  0
in prefill layer  1
in prefill layer  2
in prefill layer  3
in prefill layer  4
in prefill layer  5
in prefill layer  6
in prefill layer  7
in prefill layer  8
in prefill layer  9
in prefill layer  10
in prefill layer  11
in prefill layer  12
in prefill layer  13
in prefill layer  14
in prefill layer  15
in prefill layer  16
in prefill layer  17
in prefill layer  18
in prefill layer  19
in prefill layer  20
in prefill layer  21
in prefill layer  22
in prefill layer  23
in prefill layer  24
in prefill layer  25
in prefill layer  26
in prefill layer  27
in prefill layer  28
in prefill layer  29
in prefill layer  30
in prefill layer  31
in prefill layer  0
in prefill layer  1
in prefill layer  2
in prefill layer  3
in prefill layer  4
in prefill layer  5
in prefill layer  6
in prefill layer  7
in prefill layer  8
in prefill layer  9
in prefill layer  10
in prefill layer  11
in prefill layer  12
in prefill layer  13
in prefill layer  14
in prefill layer  15


In [10]:
from transformers import GenerationConfig
import torch

# 定义 input_length 和 output_length 的范围
# input_length_range = [16]
# output_length_range = [128,256]
# input_length_range = [16,]  # 16到32
input_length_range = [32,64,128]
output_length_range = [128,256,512,1024]  # 128 to 1024

test_samples = 5
device_id = 0

def preprocess_data(data, tokenizer, max_length):
    """Tokenize ``data`` to exactly ``max_length`` tokens and add a ``labels`` copy of input_ids."""
    inputs = tokenizer(data, padding="max_length", truncation=True, max_length=max_length, return_tensors="pt")
    inputs["labels"] = inputs.input_ids.clone()
    return inputs

# Sweep every (input_length, output_length) pair and write one report file per pair.
for input_length in input_length_range:
    for output_length in output_length_range:
        MAX_LENGTH = input_length
        generated_all = 0
        prefill_time, decode_time = 0, 0
        reloaded_experts = 0

        # Results for this configuration go to "<input_length>-<output_length>.out".
        with open(f"{input_length}-{output_length}.out", "w") as f:
            print(f"output length is {output_length}", file=f)
            for text in fineweb_text[2:2+test_samples]:
                inputs = preprocess_data(text, tokenizer, MAX_LENGTH)
                # Clear the statistics: the return values are discarded on purpose.
                # NOTE(review): presumably these getters reset PLLM's counters — confirm.
                PLLM.get_prefill_time()
                PLLM.get_reload_experts()

                # Timing events
                start_event = torch.cuda.Event(enable_timing=True)
                end_event = torch.cuda.Event(enable_timing=True)

                # Start timing
                torch.cuda.synchronize()
                start_event.record()

                # Generation
                with torch.no_grad():
                    output = llm.generate(
                        input_ids=inputs["input_ids"].cuda(device_id),
                        attention_mask=inputs["attention_mask"].cuda(device_id),
                        max_length=input_length + output_length,  # total length = input length + output length
                        generation_config=GenerationConfig(do_sample=False),
                        pad_token_id=tokenizer.pad_token_id,
                        # cache_implementation="static" ## moe not support
                    )

                # Stop timing
                end_event.record()
                torch.cuda.synchronize()

                # Read this sample's measurements unconditionally. Previously they
                # were only assigned inside the `if` below, so the per-sample print
                # raised NameError on a first short generation and showed stale
                # values from an earlier sample afterwards.
                generated_len = len(output[0]) - input_length
                elapsed_time = start_event.elapsed_time(end_event) / 1000  # ms -> s
                cur_prefill_time = PLLM.get_prefill_time()

                # Only full-length generations contribute to the aggregate statistics.
                if generated_len == output_length:
                    decode_time += elapsed_time
                    prefill_time += cur_prefill_time
                    generated_all += generated_len
                    reloaded_experts += PLLM.get_reload_experts()
                print(f"Generated length: {generated_len}", f"Time taken: {elapsed_time:.2f} s,", f"prefill time: {cur_prefill_time:.2f} s", file=f)
                # print(output, file=f)
                print(tokenizer.batch_decode(output, skip_special_tokens=True), file=f)

            if generated_all == 0:
                # Every sample stopped early: report that instead of dividing by zero.
                print("No sample reached the requested output length; no statistics to report.", file=f)
            else:
                print("Generate speed:", '{:.4f}'.format((generated_all) / decode_time) , 'token/s', file=f)
                timepertoken = (decode_time - prefill_time) / (generated_all)
                print("decode phase speed(not cover prefill phase):", '{:.4f}'.format(1/timepertoken) , 'token/s', file=f)
                expertpertoken = reloaded_experts / generated_all
                # The percentage is taken against (32 - prefill_layers) * 2.
                print("the number of reloaded experts per token:", '{:.3f}, ({:.2f}%)'.format(expertpertoken, 100 * expertpertoken / ((32-prefill_layers) * 2)), file=f)

in prefill layer  0
in prefill layer  1
in prefill layer  2
in prefill layer  3
in prefill layer  4
in prefill layer  5
in prefill layer  6
in prefill layer  7
in prefill layer  8
in prefill layer  9
in prefill layer  10
in prefill layer  11
in prefill layer  12
in prefill layer  13
in prefill layer  14
in prefill layer  15
in prefill layer  16
in prefill layer  17
in prefill layer  18
in prefill layer  19
in prefill layer  20
in prefill layer  21
in prefill layer  22
in prefill layer  23
in prefill layer  24
in prefill layer  25
in prefill layer  26
in prefill layer  27
in prefill layer  28
in prefill layer  29
in prefill layer  30
in prefill layer  31
in prefill layer  0
in prefill layer  1
in prefill layer  2
in prefill layer  3
in prefill layer  4
in prefill layer  5
in prefill layer  6
in prefill layer  7
in prefill layer  8
in prefill layer  9
in prefill layer  10
in prefill layer  11
in prefill layer  12
in prefill layer  13
in prefill layer  14
in prefill layer  15
in prefill l

In [9]:
import json
from datasets import load_dataset, Dataset
from transformers import GenerationConfig

# Single-configuration benchmark: 5 samples, 10 prompt tokens, up to 256 new tokens.
input_length = 10
MAX_LENGTH = input_length
output_length = 256
test_samples = 5
device_id = 0

def preprocess_data(data, tokenizer):
	"""Tokenize `data` to exactly MAX_LENGTH tokens and add a `labels` copy of input_ids.

	MAX_LENGTH is read from the notebook's global scope at call time.
	"""
	# Use the tokenizer to turn the text into model inputs
	inputs = tokenizer(data, padding="max_length", truncation=True, max_length=MAX_LENGTH, return_tensors="pt")
	inputs["labels"] = inputs.input_ids.clone()
	return inputs

# Accumulators over all samples.
generated_all = 0
prefill_time, decode_time = 0, 0
reloaded_experts = 0
print("output length is {}".format(output_length))
for text in fineweb_text[2:2+test_samples] :
    inputs = preprocess_data(text, tokenizer)
    ### Clear the statistics (return values are discarded on purpose)
    # NOTE(review): presumably these getters reset PLLM's counters — confirm.
    PLLM.get_prefill_time()
    PLLM.get_reload_experts()

    # Timing events
    start_event = torch.cuda.Event(enable_timing=True)
    end_event = torch.cuda.Event(enable_timing=True)

    # Start timing
    torch.cuda.synchronize()
    start_event.record()

    # Generation
    with torch.no_grad():
        output = llm.generate(
            input_ids=inputs["input_ids"].cuda(device_id),
            attention_mask=inputs["attention_mask"].cuda(device_id),
            max_length=input_length + output_length,  # total length = input length + output length
            generation_config=GenerationConfig(do_sample=False),
            pad_token_id=tokenizer.pad_token_id, 
            # cache_implementation="static" ## moe not support
        )

    # Stop timing
    end_event.record()
    torch.cuda.synchronize()

    # Compute the elapsed time
    elapsed_time = start_event.elapsed_time(end_event) / 1000  # convert to seconds
    decode_time += elapsed_time
    cur_prefill_time = PLLM.get_prefill_time()
    prefill_time += cur_prefill_time
    print(f"Generated length: {len(output[0]) - input_length}", f"Time taken: {elapsed_time:.2f} s,", f"prefill time: {cur_prefill_time:.2f} s")
    # print(output)
    print(tokenizer.batch_decode(output, skip_special_tokens=True))

    # One token per sample is left out of the decode-phase count here; the
    # overall "Generate speed" below adds `test_samples` back in.
    generated_all += (len(output[0]) - input_length - 1)
    reloaded_experts += PLLM.get_reload_experts()

# NOTE(review): these divisions raise ZeroDivisionError if generated_all is 0.
print("Generate speed:", '{:.4f}'.format((generated_all+test_samples) / decode_time) , 'token/s')
timepertoken = (decode_time - prefill_time) / (generated_all)
print("decode phase speed(not cover prefill phase):", '{:.4f}'.format(1/timepertoken) , 'token/s')
expertpertoken = reloaded_experts / generated_all
# The percentage is taken against (32 - prefill_layers) * 2.
print("the number of reloaded experts per token:", '{:.3f}, ({:.2f}%)'.format(expertpertoken, 100 * expertpertoken / ((32-prefill_layers) * 2)))

output length is 256
in prefill layer  0
in prefill layer  1
in prefill layer  2
in prefill layer  3
in prefill layer  4
in prefill layer  5
in prefill layer  6
in prefill layer  7
in prefill layer  8
in prefill layer  9
in prefill layer  10
in prefill layer  11
in prefill layer  12
in prefill layer  13
in prefill layer  14
in prefill layer  15
in prefill layer  16
in prefill layer  17
in prefill layer  18
in prefill layer  19
in prefill layer  20
in prefill layer  21
in prefill layer  22
in prefill layer  23
in prefill layer  24
in prefill layer  25
in prefill layer  26
in prefill layer  27
in prefill layer  28
in prefill layer  29
in prefill layer  30
in prefill layer  31


KeyboardInterrupt: 

In [6]:

########################################################################
#Save gemlite cache
# Write gemlite's config to the same /tmp file that the setup cell loads with load_config().
if(backend == 'gemlite'):
	gemlite.core.GemLiteLinear.cache_config('/tmp/gemlite_config.json') 

#### torch.profiler

attention使用sdpa实现

In [11]:
import json
from datasets import load_dataset, Dataset
from transformers import GenerationConfig

# Tiny configuration: 2 prompt tokens and 2 generated tokens, to keep the trace small.
input_length = 2
MAX_LENGTH = input_length
output_length = 2
test_samples = 1

# Location of the fineweb data file, taken from the shared path.json.
with open("../path.json", "r") as f:
    paths = json.load(f)
    fineweb_path = paths["fineweb"]

def preprocess_data(data, tokenizer):
	"""Tokenize `data` to exactly MAX_LENGTH tokens and add a `labels` copy of input_ids.

	MAX_LENGTH is read from the notebook's global scope at call time.
	"""
	# Use the tokenizer to turn the text into model inputs
	inputs = tokenizer(data, padding="max_length", truncation=True, max_length=MAX_LENGTH, return_tensors="pt")
	inputs["labels"] = inputs.input_ids.clone()
	return inputs

# The file extension doubles as the `datasets` loader name passed to load_dataset.
filt_type = fineweb_path.split('.')[-1]
fineweb = load_dataset(filt_type, data_files=fineweb_path) #726000
fineweb_text = fineweb['train']['text']

print("output length is {}".format(output_length))
text = fineweb_text[0]
inputs = preprocess_data(text, tokenizer)

# cached_mlp.clear_load_from_cpu_stats()
# Profile one generate() call, recording both CPU and CUDA activity.
with torch.profiler.profile(
    activities=[
        torch.profiler.ProfilerActivity.CPU,
        torch.profiler.ProfilerActivity.CUDA,
    ]
) as p:
    # Generation
    # NOTE(review): inputs use .cuda() (current device) here, whereas the
    # benchmark cells use .cuda(device_id).
    with torch.no_grad():
        output = llm.generate(
            input_ids=inputs["input_ids"].cuda(),
            attention_mask=inputs["attention_mask"].cuda(),
            max_length=input_length + output_length,  # total length = input length + output length
            generation_config=GenerationConfig(do_sample=False),
            pad_token_id=tokenizer.eos_token_id
        )
# Print the per-op summary sorted by self CPU time, then export a Chrome trace file.
print(p.key_averages().table(
    sort_by="self_cpu_time_total", row_limit=-1))
p.export_chrome_trace("./offloading-3090-simulateprefill.json")

output length is 2
in prefill layer  0
in prefill layer  1
in prefill layer  2
in prefill layer  3
in prefill layer  4
in prefill layer  5
in prefill layer  6
in prefill layer  7
in prefill layer  8
in prefill layer  9
in prefill layer  10
in prefill layer  11
in prefill layer  12
in prefill layer  13
in prefill layer  14
in prefill layer  15
in prefill layer  16
in prefill layer  17
in prefill layer  18
in prefill layer  19
in prefill layer  20
in prefill layer  21
in prefill layer  22
in prefill layer  23
in prefill layer  24
in prefill layer  25
in prefill layer  26
in prefill layer  27
in prefill layer  28
in prefill layer  29
in prefill layer  30
in prefill layer  31
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time

#### 测试一个正常输出

In [7]:
import json
from datasets import load_dataset, Dataset
from transformers import GenerationConfig

# Sanity check on one hand-written prompt: 10 prompt tokens, up to 16 new tokens.
input_length = 10
MAX_LENGTH = input_length
output_length = 16
test_samples = 1

def preprocess_data(data, tokenizer):
	"""Tokenize `data` to exactly MAX_LENGTH tokens and add a `labels` copy of input_ids.

	MAX_LENGTH is read from the notebook's global scope at call time.
	"""
	# Use the tokenizer to turn the text into model inputs
	inputs = tokenizer(data, padding="max_length", truncation=True, max_length=MAX_LENGTH, return_tensors="pt")
	inputs["labels"] = inputs.input_ids.clone()
	return inputs

generated_all, decode_time, prefill_time = 0, 0, 0
# print("max output length is {}".format(output_length))
text = "The future of AI is here, "

# Read the PLLM statistics before the run; the values are kept only for the
# commented-out debug print below.
# NOTE(review): presumably these getters reset PLLM's counters — confirm.
clear_prefill_time = PLLM.get_prefill_time()
clear_experts = PLLM.get_reload_experts()
# print("need to zero: ", clear_experts, clear_prefill_time)

llm.eval()
inputs = preprocess_data(text, tokenizer)
# Timing events
start_event = torch.cuda.Event(enable_timing=True)
end_event = torch.cuda.Event(enable_timing=True)

# Start timing
torch.cuda.synchronize()
start_event.record()

# Generation
with torch.no_grad():
    output = llm.generate(
        input_ids=inputs["input_ids"].cuda(device_id),
        attention_mask=inputs["attention_mask"].cuda(device_id),
        max_length=input_length + output_length,  # total length = input length + output length
        generation_config=GenerationConfig(do_sample=False),
        pad_token_id=tokenizer.pad_token_id, 
        # cache_implementation="static" ## moe not support
    )

# Stop timing
end_event.record()
torch.cuda.synchronize()

# Compute the elapsed time
elapsed_time = start_event.elapsed_time(end_event) / 1000  # convert to seconds
decode_time += elapsed_time
cur_prefill_time = PLLM.get_prefill_time()
prefill_time += cur_prefill_time
print(f"Generated length: {len(output[0]) - input_length}", f"Time taken: {elapsed_time:.2f} s", f"prefill time: {cur_prefill_time:.2f} s")
# print(output)
print(tokenizer.batch_decode(output, skip_special_tokens=True))

# One token is left out of the decode-phase count.
generated_all += (len(output[0]) - input_length -1)

# NOTE(review): these divisions raise ZeroDivisionError if generated_all is 0.
timepertoken = (decode_time - prefill_time) / (generated_all)
print("decode phase speed:", '{:.4f}'.format(1/timepertoken) , ' token/s')
print("the number of experts reload per token:", PLLM.get_reload_experts() / generated_all)

in prefill layer  0


TypeError: 'GemLiteLinearTriton' object is not subscriptable

### 加载到GPU上

In [1]:
from transformers import MixtralForCausalLM, AutoTokenizer
import torch
import torch.nn as nn
from typing import Optional
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "1,2"
import json

def get_model(model_name, device_map, dtype=torch.bfloat16):
    """Load a Mixtral causal LM and its tokenizer from ``model_name``.

    Args:
        model_name: Identifier or path passed to ``from_pretrained``.
        device_map: Forwarded as ``device_map`` to the model loader.
        dtype: Forwarded as ``torch_dtype`` to the model loader.

    Returns:
        ``(llm, tokenizer)``; the tokenizer uses its EOS token as padding token.
    """
    load_options = dict(
        device_map=device_map,
        use_cache=True,
        torch_dtype=dtype,
    )
    model = MixtralForCausalLM.from_pretrained(model_name, **load_options)

    tok = AutoTokenizer.from_pretrained(model_name)
    # Reuse the EOS token (and its id) for padding.
    tok.pad_token = tok.eos_token
    tok.pad_token_id = tok.eos_token_id

    return model, tok

# Read the Mixtral model location from the shared path.json.
with open('../path.json', 'r') as f:
    path = json.load(f)
    model_name = path['mixtral']
    # threshold_path = path[threshold_path_name]

# Device placement is loaded from a JSON file and passed to get_model as `device_map`.
with open('../quantize/device_map_1.json', 'r') as f:
    device_map = json.load(f)

# Load in fp16, overriding get_model's bfloat16 default.
dtype = torch.float16
llm, tokenizer = get_model(model_name, device_map, dtype=dtype)

  from .autonotebook import tqdm as notebook_tqdm
Loading checkpoint shards: 100%|██████████| 19/19 [00:25<00:00,  1.36s/it]
