### 先都加载到cpu上

In [1]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "1,2"
os.environ["TOKENIZERS_PARALLELISM"] = "False"
from modeling_mixtral import MixtralForCausalLM
from transformers import AutoTokenizer
import torch
import torch.nn as nn
from typing import Optional
import json

# Read the local path registry and pick out the Mixtral checkpoint location.
with open('../path.json', 'r') as path_file:
    path = json.load(path_file)
model_name = path['mixtral']
# threshold_path = path[threshold_path_name]

# Read the device placement map from the quantize directory.
with open("../quantize/device_map.json", "r") as device_map_file:
    device_map = json.load(device_map_file)

def get_model(model_name, device_map, dtype=torch.bfloat16, use_cache=True):
    """Load the Mixtral causal LM and its tokenizer from ``model_name``.

    Args:
        model_name: Checkpoint identifier or path handed to ``from_pretrained``.
        device_map: Device placement forwarded to ``from_pretrained``.
        dtype: Value forwarded as ``torch_dtype``.
        use_cache: Value forwarded as ``use_cache``.

    Returns:
        A ``(llm, tokenizer)`` tuple; the tokenizer's padding token is set to
        its end-of-sequence token.
    """
    load_kwargs = dict(device_map=device_map, use_cache=use_cache, torch_dtype=dtype)
    llm = MixtralForCausalLM.from_pretrained(model_name, **load_kwargs)

    tokenizer = AutoTokenizer.from_pretrained(model_name)
    # Reuse EOS as the padding token (both the string and the id).
    tokenizer.pad_token, tokenizer.pad_token_id = tokenizer.eos_token, tokenizer.eos_token_id
    return llm, tokenizer

# Compute dtype used by the cells below.
dtype = torch.float16

# Only the tokenizer is created here (the get_model call stays commented out);
# EOS doubles as the padding token.
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token, tokenizer.pad_token_id = tokenizer.eos_token, tokenizer.eos_token_id
# llm, tokenizer = get_model(model_name, 'cpu', dtype=dtype)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
### HQQ量化
from hqq.core.quantize import *
from hqq.models.hf.mixtral import MixtralHQQ

# Directory the quantized model is loaded from (and, in the disabled code below, saved to).
save_dir = './hqqsaved'
### First-time load: quantize and save (disabled; kept for reference)
# q3_config    = BaseQuantizeConfig(nbits=2, group_size=64)
# quant_config      = {'block_sparse_moe.experts.w3'   : q3_config}
# llm = MixtralForCausalLM.from_pretrained(
#         model_name,
#         device_map='cpu',
#         use_cache=True,
#         torch_dtype=dtype,
#     ) 
# MixtralHQQ.quantize_model(llm, quant_config=quant_config, compute_dtype=dtype, device='cuda:0')
#### Quantize on CUDA first, then move back to the CPU
# MixtralHQQ.save_quantized(llm, save_dir)

### Load from the saved weights
llm = MixtralHQQ.from_quantized(save_dir, compute_dtype=dtype, device='cpu')
# Select the PyTorch backend for HQQLinear before the inference patching below.
HQQLinear.set_backend(HQQBackend.PYTORCH)

# Inference backend: "torchao_int4" (4-bit only) or "gemlite" (4-bit + 2-bit).
backend = "gemlite"

# Optimize: patch the model for inference with the selected backend.
from hqq.utils.patching import prepare_for_inference
prepare_for_inference(llm, backend=backend, verbose=True)

# Load GemLite cache
if backend == 'gemlite':
    import gemlite
    gemlite.core.GEMLITE_TRITON_RESTRICT_M = True
    gemlite.core.GemLiteLinear.load_config('/tmp/gemlite_config.json')

  return torch.load(cls.get_weight_file(save_dir), map_location=map_location)
100%|██████████| 32/32 [00:00<00:00, 390.17it/s]
100%|██████████| 32/32 [00:00<00:00, 1183.41it/s]
100%|██████████| 32/32 [02:53<00:00,  5.42s/it]


In [None]:
from pipelinellm import convert_mixtral_to_cached_mlp, PipelineLLM

# Convert the model to its cached-MLP form at sparsity 0.8.
llm, cached_mlps = convert_mixtral_to_cached_mlp(llm, dtype, sparsity=0.8)

# Build the pipelined model (original note: "use ep").
# NOTE(review): the meaning of the positional arguments 1 and 3 is not visible
# here -- confirm against PipelineLLM's signature.
PLLM = PipelineLLM(
    llm,
    cached_mlps,
    1,
    3,
    training_epoch=20,
    print_layer_info=True,
)

active neural num  2867
active neural num  2867


  self.load_state_dict(torch.load(f'/home/bcds/On-the-Fly_MoE_Inference/expert_predictor/training/{layer_idx}-{training_epoch}.pth'))


#### 尝试将 forward 替换为 torch.compile() 编译后的版本

In [None]:
# Take the w3 projection of expert 1 in layer 0, compile its forward, move the
# module to CUDA device 0, then install the compiled callable as its forward.
w3 = llm.model.layers[0].block_sparse_moe.experts[1].w3
w3_forward = torch.compile(w3.forward, fullgraph=True, mode="reduce-overhead")
w3.cuda(0)
w3.forward = w3_forward

### 测试时间开销

In [4]:
import json
from datasets import load_dataset, Dataset
from transformers import GenerationConfig


# --- Benchmark settings ---
device_id = 0              # CUDA device index the prompt tensors are moved to
test_samples = 10          # number of FineWeb documents that are timed
output_length = 32         # max_length passed to generate() is input_length + output_length
input_length = 10          # prompt length in tokens
MAX_LENGTH = input_length  # pad/truncate length read by preprocess_data

# Look up the FineWeb data file in the local path registry.
with open("../path.json", "r") as path_file:
    paths = json.load(path_file)
fineweb_path = paths["fineweb"]

def preprocess_data(data, tokenizer, max_length=None):
    """Tokenize ``data`` into fixed-length PyTorch tensors and attach labels.

    Args:
        data: Input passed straight to ``tokenizer``.
        tokenizer: Callable tokenizer; it is invoked with
            ``padding="max_length"``, ``truncation=True`` and
            ``return_tensors="pt"``.
        max_length: Pad/truncate length. When ``None`` (the default) the
            notebook-global ``MAX_LENGTH`` is read at call time, which is the
            original behaviour. Passing it explicitly makes the call
            independent of which cell last assigned ``MAX_LENGTH``.

    Returns:
        The tokenizer output with an extra ``"labels"`` entry holding a clone
        of ``input_ids``.
    """
    if max_length is None:
        # Fall back to the notebook-global; several cells redefine it.
        max_length = MAX_LENGTH
    # Convert the text into model inputs with the tokenizer.
    inputs = tokenizer(data, padding="max_length", truncation=True, max_length=max_length, return_tensors="pt")
    inputs["labels"] = inputs.input_ids.clone()
    return inputs

# The dataset builder name is the text after the last '.' in the FineWeb path.
filt_type = fineweb_path.rsplit('.', 1)[-1]
fineweb = load_dataset(filt_type, data_files=fineweb_path)  # original note: 726000
fineweb_text = fineweb['train']['text']

# Warm-up (avoids one-time overhead on the first run)
# for text in fineweb_text[:5] :
#     inputs = preprocess_data(text, tokenizer)
#     with torch.no_grad():
#         output = llm(input_ids=inputs["input_ids"].cuda(device_id), attention_mask=inputs["attention_mask"].cuda(device_id))

# Accumulators over all benchmark samples.
generated_all = 0                 # generated tokens, excluding one token per sample
prefill_time, decode_time = 0, 0  # NOTE: decode_time sums the whole generate() wall time (prefill included)
reloaded_experts = 0
print("output length is {}".format(output_length))
for text in fineweb_text[2:2+test_samples] :
    inputs = preprocess_data(text, tokenizer)
    # Clear the statistics (per the original note, reading them resets them);
    # the returned values are intentionally discarded.
    PLLM.get_prefill_time()
    PLLM.get_reload_experts()

    # CUDA events used to time the generate() call.
    start_event = torch.cuda.Event(enable_timing=True)
    end_event = torch.cuda.Event(enable_timing=True)

    # Start timing.
    torch.cuda.synchronize()
    start_event.record()

    # Forward pass (non-sampling generation).
    with torch.no_grad():
        output = llm.generate(
            input_ids=inputs["input_ids"].cuda(device_id),
            attention_mask=inputs["attention_mask"].cuda(device_id),
            max_length=input_length + output_length,  # total length = prompt length + output length
            generation_config=GenerationConfig(do_sample=False),
            pad_token_id=tokenizer.pad_token_id, 
            # cache_implementation="static" ## moe not support
        )

    # Stop timing.
    end_event.record()
    torch.cuda.synchronize()

    # elapsed_time() reports milliseconds; convert to seconds.
    elapsed_time = start_event.elapsed_time(end_event) / 1000
    decode_time += elapsed_time
    cur_prefill_time = PLLM.get_prefill_time()
    prefill_time += cur_prefill_time
    print(f"Generated length: {len(output[0]) - input_length}", f"Time taken: {elapsed_time:.2f} s,", f"prefill time: {cur_prefill_time:.2f} s")
    # print(output)
    print(tokenizer.batch_decode(output, skip_special_tokens=True))

    generated_all += (len(output[0]) - input_length - 1)
    reloaded_experts += PLLM.get_reload_experts()

# Per-token statistics. Guard the divisions: generated_all is 0 when every
# sample produced at most one new token (e.g. output_length == 1), which would
# otherwise raise ZeroDivisionError after the whole benchmark has run.
if generated_all > 0:
    timepertoken = (decode_time - prefill_time) / (generated_all)
    decode_speed = 1/timepertoken if timepertoken else float("inf")
    print("decode phase speed:", '{:.4f}'.format(decode_speed) , 'token/s')
    print("the number of reloaded experts per token:", '{:.3f}'.format(reloaded_experts / generated_all))
else:
    timepertoken = None
    print("no decode-phase tokens were generated; skipping per-token statistics")

output length is 32
Generated length: 32 Time taken: 49.89 s, prefill time: 40.31 s
['How do you get HIV?\nHIV is a virus.\n\nHIV is a virus.\n\nHIV is a virus.\n\nHIV is a virus.\n\nHIV']
Generated length: 32 Time taken: 56.94 s, prefill time: 48.50 s
['CTComms sends on average 2 million emails a year.\n\nTheir 2018 goal is 10,000,000.\n\nTheir ']
Generated length: 32 Time taken: 56.66 s, prefill time: 49.29 s
['Hold the salt: UCLA engineers develop a new, inexpensive, 3-D-printed, low-power, all-in-one, re-usable, and fast']
Generated length: 32 Time taken: 46.00 s, prefill time: 39.39 s
['Not Just for Kids\nThe Hunt for Fallen Fins\n\nBy LINDSAY A. ALLUP\n\nThe 1999-2000 St. John’s']
Generated length: 32 Time taken: 43.67 s, prefill time: 37.17 s
['The Solar and Heliospheric Observatory (SOHO) is a joint ESA-NASA-Pillar 3 (Solar) mission. Launched in 199']
Generated length: 32 Time taken: 41.15 s, prefill time: 34.54 s
['Bolivia: Coca-chewing protestors\n\nBolivia, 1993. Coca-chewin

In [6]:

########################################################################
# Save gemlite cache (only relevant when the gemlite backend is selected).
if backend == 'gemlite':
    gemlite.core.GemLiteLinear.cache_config('/tmp/gemlite_config.json')

#### torch.profile

`trace-offloading-r.json` 对应的实现最优：每做完一个 index 就立即传输一个。

In [6]:
import json
from datasets import load_dataset, Dataset
from transformers import GenerationConfig

# --- Profiling settings (deliberately tiny so the trace stays small) ---
test_samples = 4           # number of FineWeb texts kept after loading
output_length = 2          # max_length passed to generate() is input_length + output_length
input_length = 1           # prompt length in tokens
MAX_LENGTH = input_length  # pad/truncate length read by preprocess_data

# Look up the FineWeb data file in the local path registry.
with open("../path.json", "r") as path_file:
    paths = json.load(path_file)
fineweb_path = paths["fineweb"]

def preprocess_data(data, tokenizer, max_length=None):
    """Tokenize ``data`` into fixed-length PyTorch tensors and attach labels.

    Args:
        data: Input passed straight to ``tokenizer``.
        tokenizer: Callable tokenizer; it is invoked with
            ``padding="max_length"``, ``truncation=True`` and
            ``return_tensors="pt"``.
        max_length: Pad/truncate length. When ``None`` (the default) the
            notebook-global ``MAX_LENGTH`` is read at call time, which is the
            original behaviour. Passing it explicitly makes the call
            independent of which cell last assigned ``MAX_LENGTH``.

    Returns:
        The tokenizer output with an extra ``"labels"`` entry holding a clone
        of ``input_ids``.
    """
    if max_length is None:
        # Fall back to the notebook-global; several cells redefine it.
        max_length = MAX_LENGTH
    # Convert the text into model inputs with the tokenizer.
    inputs = tokenizer(data, padding="max_length", truncation=True, max_length=max_length, return_tensors="pt")
    inputs["labels"] = inputs.input_ids.clone()
    return inputs

fineweb = load_dataset("parquet",data_files=fineweb_path) #726000
# Slice the rows first so that only `test_samples` texts are read, instead of
# reading the whole 'text' column and then discarding almost all of it.
fineweb_text = fineweb['train'][:test_samples]['text']

print("output length is {}".format(output_length))
# Only the first text is profiled.
text = fineweb_text[0]
inputs = preprocess_data(text, tokenizer)

# cached_mlp.clear_load_from_cpu_stats()
# Profile a single non-sampling generate() call on CPU and CUDA; gradient
# tracking is disabled by the second context manager of the same `with`.
with torch.profiler.profile(
    activities=[torch.profiler.ProfilerActivity.CPU, torch.profiler.ProfilerActivity.CUDA]
) as p, torch.no_grad():
    output = llm.generate(
        input_ids=inputs["input_ids"].cuda(),
        attention_mask=inputs["attention_mask"].cuda(),
        max_length=input_length + output_length,  # total length = prompt length + output length
        generation_config=GenerationConfig(do_sample=False),
        pad_token_id=tokenizer.eos_token_id
    )

# Per-operator summary ordered by self CPU time, then a Chrome-trace export.
print(p.key_averages().table(row_limit=-1, sort_by="self_cpu_time_total"))
p.export_chrome_trace("./offloading-hqq2-reload_new.json")

output length is 2
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                            aten::index        26.30%     124.066ms        27.48%     129.665ms     218.292us       1.762ms         0.40%       2.103ms       3.541us           594  
                                        cudaMemcpyAsync        16.46%      77.643ms        16.46%      77.643ms      55.778us       0.000us         0.00%       0.000us       0.000us       

#### 测试一个正常输出

In [None]:
import json
from datasets import load_dataset, Dataset
from transformers import GenerationConfig

# --- Settings for the single-prompt sanity run ---
device_id = 0              # CUDA device index the prompt tensors are moved to
test_samples = 1
output_length = 32         # max_length passed to generate() is input_length + output_length
input_length = 6           # prompt length in tokens
MAX_LENGTH = input_length  # pad/truncate length read by preprocess_data

def preprocess_data(data, tokenizer, max_length=None):
    """Tokenize ``data`` into fixed-length PyTorch tensors and attach labels.

    Args:
        data: Input passed straight to ``tokenizer``.
        tokenizer: Callable tokenizer; it is invoked with
            ``padding="max_length"``, ``truncation=True`` and
            ``return_tensors="pt"``.
        max_length: Pad/truncate length. When ``None`` (the default) the
            notebook-global ``MAX_LENGTH`` is read at call time, which is the
            original behaviour. Passing it explicitly makes the call
            independent of which cell last assigned ``MAX_LENGTH``.

    Returns:
        The tokenizer output with an extra ``"labels"`` entry holding a clone
        of ``input_ids``.
    """
    if max_length is None:
        # Fall back to the notebook-global; several cells redefine it.
        max_length = MAX_LENGTH
    # Convert the text into model inputs with the tokenizer.
    inputs = tokenizer(data, padding="max_length", truncation=True, max_length=max_length, return_tensors="pt")
    inputs["labels"] = inputs.input_ids.clone()
    return inputs

# Accumulators; decode_time ends up holding the whole generate() wall time
# (prefill included) and prefill_time is subtracted from it at the end.
generated_all, decode_time, prefill_time = 0, 0, 0
# print("max output length is {}".format(output_length))
text = "The future of AI is "

# Read the counters once before the run (the names suggest this clears them).
clear_prefill_time = PLLM.get_prefill_time()
clear_experts = PLLM.get_reload_experts()
# print("need to zero: ", clear_experts, clear_prefill_time)

inputs = preprocess_data(text, tokenizer)
# CUDA events used to time the generate() call.
start_event = torch.cuda.Event(enable_timing=True)
end_event = torch.cuda.Event(enable_timing=True)

# Start timing.
torch.cuda.synchronize()
start_event.record()

# Forward pass (non-sampling generation).
with torch.no_grad():
    output = llm.generate(
        input_ids=inputs["input_ids"].cuda(device_id),
        attention_mask=inputs["attention_mask"].cuda(device_id),
        max_length=input_length + output_length,  # total length = prompt length + output length
        generation_config=GenerationConfig(do_sample=False),
        pad_token_id=tokenizer.pad_token_id, 
        # cache_implementation="static" ## moe not support
    )

# Stop timing.
end_event.record()
torch.cuda.synchronize()

# elapsed_time() reports milliseconds; convert to seconds.
elapsed_time = start_event.elapsed_time(end_event) / 1000
decode_time += elapsed_time
cur_prefill_time = PLLM.get_prefill_time()
prefill_time += cur_prefill_time
print(f"Generated length: {len(output[0]) - input_length}", f"Time taken: {elapsed_time:.2f} s", f"prefill time: {cur_prefill_time:.2f} s")
# print(output)
print(tokenizer.batch_decode(output, skip_special_tokens=True))

generated_all += (len(output[0]) - input_length -1)

# Per-token statistics. Guard the divisions: generated_all is 0 when at most
# one new token was produced, which would otherwise raise ZeroDivisionError
# after the run has completed.
if generated_all > 0:
    timepertoken = (decode_time - prefill_time) / (generated_all)
    decode_speed = 1/timepertoken if timepertoken else float("inf")
    print("decode phase speed:", '{:.4f}'.format(decode_speed) , ' token/s')
    print("the number of experts reload per token:", PLLM.get_reload_experts() / generated_all)
else:
    timepertoken = None
    print("no decode-phase tokens were generated; skipping per-token statistics")

Generated length: 32 Time taken: 43.89 s prefill time: 37.30 s
['The future of AI is here, and it’s not as scary as you might think. In this article, we’ll take a look at the 10 most important AI']
decode phase speed: 4.7067  token/s
the number of experts reload per token: 7.903225806451613


### 加载到GPU上

In [1]:
import os
# Restrict the visible GPUs BEFORE torch/transformers are imported: once CUDA
# has been initialised by any import, changing this variable has no effect.
os.environ["CUDA_VISIBLE_DEVICES"] = "1,2"

import json
from typing import Optional

import torch
import torch.nn as nn
from transformers import MixtralForCausalLM, AutoTokenizer

def get_model(model_name, device_map, dtype=torch.bfloat16, use_cache=True):
    """Load the Mixtral causal LM and its tokenizer from ``model_name``.

    Args:
        model_name: Checkpoint identifier or path handed to ``from_pretrained``.
        device_map: Device placement forwarded to ``from_pretrained``.
        dtype: Value forwarded as ``torch_dtype``.
        use_cache: Value forwarded as ``use_cache``. Defaults to ``True``,
            the value that was previously hard-coded, so existing calls are
            unaffected.

    Returns:
        A ``(llm, tokenizer)`` tuple; the tokenizer's padding token is set to
        its end-of-sequence token.
    """
    llm = MixtralForCausalLM.from_pretrained(
        model_name,
        device_map=device_map,
        use_cache=use_cache,
        torch_dtype=dtype,
    )
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    # Reuse EOS as the padding token (both the string and the id).
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.pad_token_id = tokenizer.eos_token_id

    return llm, tokenizer

# Read the local path registry and pick out the Mixtral checkpoint location.
with open('../path.json', 'r') as path_file:
    path = json.load(path_file)
model_name = path['mixtral']
# threshold_path = path[threshold_path_name]

# Device placement map for this section.
with open('../quantize/device_map_1.json', 'r') as device_map_file:
    device_map = json.load(device_map_file)

# Load the model in float16 using that placement map.
dtype = torch.float16
llm, tokenizer = get_model(model_name, device_map, dtype=dtype)

  from .autonotebook import tqdm as notebook_tqdm
Loading checkpoint shards: 100%|██████████| 19/19 [00:25<00:00,  1.36s/it]
