### 先都加载到cpu上

In [1]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "1,2"
os.environ["TOKENIZERS_PARALLELISM"] = "False"
# from modeling_mixtral import MixtralForCausalLM
from transformers import AutoTokenizer, MixtralForCausalLM
import torch
import torch.nn as nn
from typing import Optional
import json

# Read the shared path registry and pick out the Mixtral checkpoint entry.
with open('../path.json', 'r') as f:
    path = json.load(f)
model_name = path['mixtral']
# threshold_path = path[threshold_path_name]

# Device map stored alongside the quantization scripts; it is the value
# that get_model() forwards as `device_map`.
with open("../quantize/device_map.json", "r") as f:
    device_map = json.load(f)

def get_model(model_name, device_map, dtype=torch.bfloat16, use_cache=True):
    """Load a Mixtral causal LM and its tokenizer.

    Args:
        model_name: Passed to ``from_pretrained`` for both model and tokenizer.
        device_map: Forwarded as ``device_map`` to the model loader.
        dtype: Forwarded as ``torch_dtype`` to the model loader.
        use_cache: Forwarded as ``use_cache`` to the model loader.

    Returns:
        A ``(llm, tokenizer)`` tuple. The tokenizer's pad token (string and
        id) is set to its EOS token.
    """
    load_kwargs = dict(
        device_map=device_map,
        use_cache=use_cache,
        torch_dtype=dtype,
    )
    llm = MixtralForCausalLM.from_pretrained(model_name, **load_kwargs)

    tokenizer = AutoTokenizer.from_pretrained(model_name)
    # Reuse EOS for padding, both as a token string and as an id.
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.pad_token_id = tokenizer.eos_token_id

    return llm, tokenizer

# dtype shared by the following cells (HQQ compute dtype, cached-MLP conversion).
dtype = torch.float16

# Only the tokenizer is created here; the unquantized model load stays disabled.
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Pad with EOS: set both the token string and the token id.
tokenizer.pad_token, tokenizer.pad_token_id = tokenizer.eos_token, tokenizer.eos_token_id
# llm, tokenizer = get_model(model_name, 'cpu', dtype=dtype)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
### HQQ quantization
# Loads an already-quantized Mixtral checkpoint from `save_dir` onto the CPU and
# then patches it for inference with the selected backend.
# NOTE(review): the star import below is presumably what provides HQQLinear and
# HQQBackend (and BaseQuantizeConfig for the disabled code) — confirm and import
# them explicitly if nothing else relies on the wildcard.
from hqq.core.quantize import *
from hqq.models.hf.mixtral import MixtralHQQ

# Directory the quantized weights are saved to / loaded from.
save_dir = './hqqsaved'
### First-time load: quantize the original checkpoint and save it (disabled;
### only needed once to produce `save_dir`).
# q3_config    = BaseQuantizeConfig(nbits=2, group_size=64)
# quant_config      = {'block_sparse_moe.experts.w3'   : q3_config}
# llm = MixtralForCausalLM.from_pretrained(
#         model_name,
#         device_map='cpu',
#         use_cache=True,
#         torch_dtype=dtype,
#     ) 
# MixtralHQQ.quantize_model(llm, quant_config=quant_config, compute_dtype=dtype, device='cuda:0')
#### Quantize on CUDA first, then move the result back to the CPU
# MixtralHQQ.save_quantized(llm, save_dir)

### Load from the saved quantized weights
llm = MixtralHQQ.from_quantized(save_dir, compute_dtype=dtype, device='cpu')
HQQLinear.set_backend(HQQBackend.PYTORCH)

# Backend handed to prepare_for_inference; alternatives per the original note:
backend       = "gemlite" #'torchao_int4' #"torchao_int4" (4-bit only) or "gemlite" (4-bit + 2-bit)
# Optimize: patch the model for inference with the chosen backend
from hqq.utils.patching import prepare_for_inference
prepare_for_inference(llm, backend=backend, verbose=True)
# Load the GemLite config cache (the same file is written by the later
# "Save gemlite cache" cell).
if(backend == 'gemlite'):
	import gemlite
	gemlite.core.GEMLITE_TRITON_RESTRICT_M = True
	gemlite.core.GemLiteLinear.load_config('/tmp/gemlite_config.json')

  return torch.load(cls.get_weight_file(save_dir), map_location=map_location)
100%|██████████| 32/32 [00:00<00:00, 410.31it/s]
100%|██████████| 32/32 [00:00<00:00, 1150.05it/s]
100%|██████████| 32/32 [02:56<00:00,  5.52s/it]


In [3]:
from pipelinellm import convert_mixtral_to_cached_mlp, PipelineLLM

# Convert the quantized model so its MLPs become cached MLP modules; the call
# returns the (possibly modified) model together with those modules.
llm, cached_mlps = convert_mixtral_to_cached_mlp(llm, dtype, sparsity=0.8)

# Create the pipelined model.
# NOTE(review): the meaning of the positional arguments 1 and 3 is not visible
# in this notebook — confirm against the PipelineLLM signature.
PLLM = PipelineLLM(
    llm,
    cached_mlps,
    1,
    3,
    training_epoch=20,
    print_layer_info=True,
)  ### use ep

active neural num  2867
active neural num  2867


  self.load_state_dict(torch.load(f'/home/bcds/On-the-Fly_MoE_Inference/expert_predictor/training/{layer_idx}-{training_epoch}.pth'))


### 测试时间开销

In [4]:
import json
from datasets import load_dataset, Dataset
from transformers import GenerationConfig


# Settings for the timing run.
input_length = 10           # prompts are padded/truncated to this many tokens
MAX_LENGTH = input_length   # global read by preprocess_data
output_length = 32          # generate() gets max_length = input_length + output_length
test_samples = 1            # number of texts taken from the dataset
device_id = 0               # CUDA device index the inputs are moved to

# Look up the dataset file in the shared path registry.
with open("../path.json", "r") as f:
    paths = json.load(f)
fineweb_path = paths["fineweb"]

def preprocess_data(data, tokenizer, max_length=None):
    """Tokenize ``data`` to a fixed length and attach a ``labels`` entry.

    Args:
        data: Input handed to ``tokenizer`` unchanged.
        tokenizer: Callable tokenizer; it is invoked with
            ``padding="max_length"``, ``truncation=True`` and
            ``return_tensors="pt"``.
        max_length: Length to pad/truncate to. When ``None`` (the default)
            the notebook-level ``MAX_LENGTH`` is used, which is the original
            behaviour.

    Returns:
        The tokenizer output, with ``"labels"`` set to a clone of
        ``input_ids``.
    """
    if max_length is None:
        # Looked up at call time so a MAX_LENGTH re-assigned in a later cell
        # is still honoured.
        max_length = MAX_LENGTH
    # Convert the text into model inputs with the tokenizer.
    inputs = tokenizer(data, padding="max_length", truncation=True, max_length=max_length, return_tensors="pt")
    inputs["labels"] = inputs.input_ids.clone()
    return inputs

# The text after the last '.' in the path is used as the load_dataset builder name.
filt_type = fineweb_path.rsplit('.', 1)[-1]
fineweb = load_dataset(filt_type, data_files=fineweb_path) #726000
fineweb_text = fineweb['train']['text']

# Warm-up (avoids extra overhead on the first run) — currently disabled
# for text in fineweb_text[:5] :
#     inputs = preprocess_data(text, tokenizer)
#     with torch.no_grad():
#         output = llm(input_ids=inputs["input_ids"].cuda(device_id), attention_mask=inputs["attention_mask"].cuda(device_id))

# Timing benchmark: run greedy generation on `test_samples` texts and report
# the decode-phase throughput and the expert reloads per generated token.
generated_all = 0
# NOTE: `decode_time` accumulates the full generate() time (prefill + decode);
# the prefill share is subtracted when the statistics are computed below.
prefill_time, decode_time = 0, 0
reloaded_experts = 0
print("output length is {}".format(output_length))
for text in fineweb_text[2:2+test_samples]:
    inputs = preprocess_data(text, tokenizer)
    ### Clear the statistics: the getters are called only for that effect and
    ### their return values are discarded (presumably they reset the counters).
    PLLM.get_prefill_time()
    PLLM.get_reload_experts()

    # CUDA events used to time the call
    start_event = torch.cuda.Event(enable_timing=True)
    end_event = torch.cuda.Event(enable_timing=True)

    # Start timing
    torch.cuda.synchronize()
    start_event.record()

    # Forward pass
    with torch.no_grad():
        output = llm.generate(
            input_ids=inputs["input_ids"].cuda(device_id),
            attention_mask=inputs["attention_mask"].cuda(device_id),
            max_length=input_length + output_length,  # total length = input length + output length
            generation_config=GenerationConfig(do_sample=False),
            pad_token_id=tokenizer.pad_token_id,
            # cache_implementation="static" ## moe not support
        )

    # Stop timing
    end_event.record()
    torch.cuda.synchronize()

    # Elapsed time
    elapsed_time = start_event.elapsed_time(end_event) / 1000  # milliseconds -> seconds
    decode_time += elapsed_time
    cur_prefill_time = PLLM.get_prefill_time()
    prefill_time += cur_prefill_time
    print(f"Generated length: {len(output[0]) - input_length}", f"Time taken: {elapsed_time:.2f} s,", f"prefill time: {cur_prefill_time:.2f} s")
    # print(output)
    print(tokenizer.batch_decode(output, skip_special_tokens=True))

    # One generated token is excluded from the decode-phase count.
    generated_all += (len(output[0]) - input_length - 1)
    reloaded_experts += PLLM.get_reload_experts()

decode_only_time = decode_time - prefill_time
if generated_all > 0 and decode_only_time > 0:
    timepertoken = decode_only_time / generated_all
    print("decode phase speed:", '{:.4f}'.format(1/timepertoken) , 'token/s')
    print("the number of reloaded experts per token:", '{:.3f}'.format(reloaded_experts / generated_all))
else:
    # Happens e.g. with test_samples == 0 or output_length == 1; the original
    # code raised ZeroDivisionError here.
    timepertoken = float("nan")
    print("decode phase speed: n/a (no decode-phase tokens were measured)")

output length is 32
in prefill layer  0
in prefill layer  1
in prefill layer  2
in prefill layer  3
in prefill layer  4
in prefill layer  5
in prefill layer  6
in prefill layer  7
in prefill layer  8
in prefill layer  9
in prefill layer  10
in prefill layer  11
in prefill layer  12
in prefill layer  13
in prefill layer  14
in prefill layer  15
in prefill layer  16
in prefill layer  17
in prefill layer  18
in prefill layer  19
in prefill layer  20
in prefill layer  21
in prefill layer  22
in prefill layer  23
in prefill layer  24
in prefill layer  25
in prefill layer  26
in prefill layer  27
in prefill layer  28
in prefill layer  29
in prefill layer  30
in prefill layer  31
Generated length: 32 Time taken: 832.55 s, prefill time: 821.77 s
['How do you get HIV?\nHIV is a the/cc ( ( -( !.) )9t((2/5009996(AB0']
decode phase speed: 2.8764 token/s
the number of reloaded experts per token: 15.290


In [6]:

# ----------------------------------------------------------------------
# Save the GemLite config cache to the same file the loading cell reads.
if backend == 'gemlite':
    gemlite.core.GemLiteLinear.cache_config('/tmp/gemlite_config.json')

#### torch.profiler

trace-offloading-r.json是最优，就是做完一个index就传一个

In [5]:
import json
from datasets import load_dataset, Dataset
from transformers import GenerationConfig

# Settings for the profiling run (kept tiny so the trace stays small).
input_length = 1            # prompt is padded/truncated to a single token
MAX_LENGTH = input_length   # global read by preprocess_data
output_length = 2           # generate() gets max_length = input_length + output_length
test_samples = 4            # number of texts kept from the dataset

# Look up the dataset file in the shared path registry.
with open("../path.json", "r") as f:
    paths = json.load(f)
fineweb_path = paths["fineweb"]

def preprocess_data(data, tokenizer, max_length=None):
    """Tokenize ``data`` to a fixed length and attach a ``labels`` entry.

    Args:
        data: Input handed to ``tokenizer`` unchanged.
        tokenizer: Callable tokenizer; it is invoked with
            ``padding="max_length"``, ``truncation=True`` and
            ``return_tensors="pt"``.
        max_length: Length to pad/truncate to. When ``None`` (the default)
            the notebook-level ``MAX_LENGTH`` is used, which is the original
            behaviour.

    Returns:
        The tokenizer output, with ``"labels"`` set to a clone of
        ``input_ids``.
    """
    if max_length is None:
        # Looked up at call time so a MAX_LENGTH re-assigned in a later cell
        # is still honoured.
        max_length = MAX_LENGTH
    # Convert the text into model inputs with the tokenizer.
    inputs = tokenizer(data, padding="max_length", truncation=True, max_length=max_length, return_tensors="pt")
    inputs["labels"] = inputs.input_ids.clone()
    return inputs

# Derive the load_dataset builder from the file extension, as the timing cell
# does for the same path, instead of hard-coding "parquet"; the two cells then
# keep working together if the registry entry points at another format.
fineweb = load_dataset(fineweb_path.rsplit('.', 1)[-1], data_files=fineweb_path) #726000
# Keep only the first `test_samples` texts.
fineweb_text = fineweb['train']['text'][:test_samples]

print("output length is {}".format(output_length))
# Only the first text is profiled.
text = fineweb_text[0]
inputs = preprocess_data(text, tokenizer)
# cached_mlp.clear_load_from_cpu_stats()
# Profile one greedy generate() call on both CPU and CUDA activity.
profiled_activities = [
    torch.profiler.ProfilerActivity.CPU,
    torch.profiler.ProfilerActivity.CUDA,
]
with torch.profiler.profile(activities=profiled_activities) as p:
    # Forward pass
    with torch.no_grad():
        output = llm.generate(
            input_ids=inputs["input_ids"].cuda(),
            attention_mask=inputs["attention_mask"].cuda(),
            max_length=input_length + output_length,  # total length = input length + output length
            generation_config=GenerationConfig(do_sample=False),
            pad_token_id=tokenizer.eos_token_id
        )

# Per-operator summary, sorted by self CPU time, with no row limit.
summary_table = p.key_averages().table(sort_by="self_cpu_time_total", row_limit=-1)
print(summary_table)
# Chrome-trace export for timeline inspection.
p.export_chrome_trace("./offloading-hqq2-reload2.json")

output length is 2
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                            aten::index        24.56%     143.583ms        33.20%     194.114ms     348.499us       1.446ms         0.30%       2.117ms       3.801us           557  
                                  cudaStreamSynchronize        16.41%      95.945ms        16.41%      95.945ms     134.944us     120.451us         0.03%     120.451us       0.169us       

#### 测试一个正常输出

In [27]:
import json
from datasets import load_dataset, Dataset
from transformers import GenerationConfig

# Settings for the single-prompt sanity run.
input_length = 6  # prompt length; also subtracted from the output length when counting new tokens
MAX_LENGTH = input_length  # global read by preprocess_data
output_length = 12  # generate() gets max_length = input_length + output_length
device_id = 0  # CUDA device index the inputs are moved to
test_samples = 1  # NOTE(review): not referenced anywhere else in this cell

def preprocess_data(data, tokenizer, max_length=None):
    """Tokenize ``data`` to a fixed length and attach a ``labels`` entry.

    Args:
        data: Input handed to ``tokenizer`` unchanged.
        tokenizer: Callable tokenizer; it is invoked with
            ``padding="max_length"``, ``truncation=True`` and
            ``return_tensors="pt"``.
        max_length: Length to pad/truncate to. When ``None`` (the default)
            the notebook-level ``MAX_LENGTH`` is used, which is the original
            behaviour.

    Returns:
        The tokenizer output, with ``"labels"`` set to a clone of
        ``input_ids``.
    """
    if max_length is None:
        # Looked up at call time so a MAX_LENGTH re-assigned in a later cell
        # is still honoured.
        max_length = MAX_LENGTH
    # Convert the text into model inputs with the tokenizer.
    inputs = tokenizer(data, padding="max_length", truncation=True, max_length=max_length, return_tensors="pt")
    inputs["labels"] = inputs.input_ids.clone()
    return inputs

# Single-prompt sanity run: time one greedy generation and print the text.
# NOTE: `decode_time` holds the full generate() time (prefill + decode); the
# prefill share is subtracted when the statistics are computed below.
generated_all, decode_time, prefill_time = 0, 0, 0
# print("max output length is {}".format(output_length))
text = "The future of AI is "

# Read the counters once before the run so earlier cells do not leak into the
# numbers (presumably the getters reset them — see the disabled print below).
clear_prefill_time = PLLM.get_prefill_time()
clear_experts = PLLM.get_reload_experts()
# print("need to zero: ", clear_experts, clear_prefill_time)

inputs = preprocess_data(text, tokenizer)
# CUDA events used to time the call
start_event = torch.cuda.Event(enable_timing=True)
end_event = torch.cuda.Event(enable_timing=True)

# Start timing
torch.cuda.synchronize()
start_event.record()

# Forward pass
with torch.no_grad():
    output = llm.generate(
        input_ids=inputs["input_ids"].cuda(device_id),
        attention_mask=inputs["attention_mask"].cuda(device_id),
        max_length=input_length + output_length,  # total length = input length + output length
        generation_config=GenerationConfig(do_sample=False),
        pad_token_id=tokenizer.pad_token_id,
        # cache_implementation="static" ## moe not support
    )

# Stop timing
end_event.record()
torch.cuda.synchronize()

# Elapsed time
elapsed_time = start_event.elapsed_time(end_event) / 1000  # milliseconds -> seconds
decode_time += elapsed_time
cur_prefill_time = PLLM.get_prefill_time()
prefill_time += cur_prefill_time
print(f"Generated length: {len(output[0]) - input_length}", f"Time taken: {elapsed_time:.2f} s", f"prefill time: {cur_prefill_time:.2f} s")
# print(output)
print(tokenizer.batch_decode(output, skip_special_tokens=True))

# One generated token is excluded from the decode-phase count.
generated_all += (len(output[0]) - input_length -1)

# Read the reload counter unconditionally so the getter is called exactly once
# per run, whichever branch below is taken.
reloaded_experts = PLLM.get_reload_experts()
decode_only_time = decode_time - prefill_time
if generated_all > 0 and decode_only_time > 0:
    timepertoken = decode_only_time / generated_all
    print("decode phase speed:", '{:.4f}'.format(1/timepertoken) , ' token/s')
    print("the number of experts reload per token:", reloaded_experts / generated_all)
else:
    # Happens e.g. with output_length == 1; the original code raised
    # ZeroDivisionError here.
    timepertoken = float("nan")
    print("decode phase speed: n/a (no decode-phase tokens were measured)")

Generated length: 12 Time taken: 41.38 s prefill time: 37.84 s
['The future of AI is here, and it’s brighter than you think.']
decode phase speed: 3.1135  token/s
the number of experts reload per token: 8.727272727272727


### 加载到GPU上

In [1]:
import os
# Select the GPUs BEFORE torch / transformers are imported, as the first cell
# of this notebook does. Once the CUDA runtime has been initialised, changing
# CUDA_VISIBLE_DEVICES has no effect, and an import may already query CUDA.
os.environ["CUDA_VISIBLE_DEVICES"] = "1,2"

import json
from typing import Optional

import torch
import torch.nn as nn
from transformers import MixtralForCausalLM, AutoTokenizer

def get_model(model_name, device_map, dtype=torch.bfloat16, use_cache=True):
    """Load a Mixtral causal LM and its tokenizer.

    The signature matches the other ``get_model`` definition in this notebook
    (the one that also accepts ``use_cache``), so both definitions behave the
    same regardless of which cell ran last. Existing calls are unaffected
    because ``use_cache`` defaults to the previously hard-coded ``True``.

    Args:
        model_name: Passed to ``from_pretrained`` for both model and tokenizer.
        device_map: Forwarded as ``device_map`` to the model loader.
        dtype: Forwarded as ``torch_dtype`` to the model loader.
        use_cache: Forwarded as ``use_cache`` to the model loader.

    Returns:
        A ``(llm, tokenizer)`` tuple. The tokenizer's pad token (string and
        id) is set to its EOS token.
    """
    llm = MixtralForCausalLM.from_pretrained(
        model_name,
        device_map=device_map,
        use_cache=use_cache,
        torch_dtype=dtype,
    )
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    # Reuse EOS for padding, both as a token string and as an id.
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.pad_token_id = tokenizer.eos_token_id

    return llm, tokenizer

# Read the shared path registry and pick out the Mixtral checkpoint entry.
with open('../path.json', 'r') as f:
    path = json.load(f)
model_name = path['mixtral']
# threshold_path = path[threshold_path_name]

# Device map for this section; forwarded to from_pretrained via get_model().
with open('../quantize/device_map_1.json', 'r') as f:
    device_map = json.load(f)

# Load the model in float16 according to that device map.
dtype = torch.float16
llm, tokenizer = get_model(model_name, device_map, dtype=dtype)

  from .autonotebook import tqdm as notebook_tqdm
Loading checkpoint shards: 100%|██████████| 19/19 [00:25<00:00,  1.36s/it]
