### 加载模型

In [1]:
import json
import os

# Restrict the visible GPUs *before* torch (or any module that imports it) is
# loaded, so the setting is already in place if one of those imports
# initialises CUDA.
os.environ["CUDA_VISIBLE_DEVICES"] = "2,3"

import torch
from transformers import AutoTokenizer
from modeling_mixtral import MixtralForCausalLM, set_profile_mode, load_thresholds

def _load_model(model_name, threshold_path, device_map):
    """Set up the sparse-inference globals, then load the model and tokenizer.

    Args:
        model_name: Identifier or path passed to both ``from_pretrained`` calls.
        threshold_path: Directory that contains ``thresholds_0_8.pt``.
        device_map: Forwarded unchanged as ``device_map`` to
            ``MixtralForCausalLM.from_pretrained``.

    Returns:
        A ``(model, tokenizer)`` tuple. The tokenizer reuses its EOS token as
        the padding token and pads on the left.
    """
    print(f"Loading model {model_name}")

    # Turn on sparse mode: profiling is switched off and the thresholds are
    # loaded before the model itself is instantiated.
    set_profile_mode(False)
    thresholds_file = f'{threshold_path}/thresholds_0_8.pt'
    load_thresholds(thresholds_file, use_average=True)

    load_kwargs = dict(
        device_map=device_map,
        use_cache=True,
        torch_dtype=torch.float16,
    )
    model = MixtralForCausalLM.from_pretrained(model_name, **load_kwargs)

    tokenizer = AutoTokenizer.from_pretrained(model_name)
    tokenizer.padding_side = "left"
    tokenizer.pad_token = tokenizer.eos_token
    return model, tokenizer

# Initial value only; replaced below by the entry stored under 'mixtral'.
model_name = 'mixtral'
# Key of the threshold directory inside ../path.json.
threshold_path_name = 'chess_up_sparsity_threshold'

# Read the path registry, then pick out the two entries this notebook needs.
with open('../path.json', 'r') as f:
    path = json.load(f)
model_name = path['mixtral']
threshold_path = path[threshold_path_name]

# The device map is stored as JSON and handed to the loader as-is.
with open('../quantize/device_map_1.json', 'r') as f:
    device_map = json.load(f)

model, tokenizer = _load_model(model_name, threshold_path, device_map)

  from .autonotebook import tqdm as notebook_tqdm


Loading model /home/lz/Mixtral-8x7B-v0.1
Set profile_threshold to False


  up_th = torch.load(threshold_path, map_location='cuda')["up_proj_states_thresholds_2"]


Thresholds loaded from /home/lz/On-the-Fly_MoE_Inference/quantize/threshold/c4_mixtral_up/thresholds_0_8.pt


Loading checkpoint shards: 100%|██████████| 19/19 [00:22<00:00,  1.20s/it]


### 加载数据集

In [2]:
# Imports come first so this cell does not rely on names leaked from an
# earlier cell.
import torch
import numpy as np

# NOTE: torch.load unpickles the file, so only point it at trusted data.
# The 'validation' entry of this object is what the sampling loop reads.
datasets = torch.load('../saving/threshold/chess/datasets.pt')
def get_batch(data, batch_size, block_size):
    """Draw ``batch_size`` random windows of ``block_size`` items from ``data``.

    Args:
        data: NumPy array of token ids (assumed 1-D); it is sliced and cast
            with ``astype`` before conversion to a tensor.
        batch_size: Number of windows to draw.
        block_size: Length of each window.

    Returns:
        ``(x, y)`` int64 tensors stacked along a new first dimension, where
        ``y`` is the window that starts one position after ``x``.
    """
    # randint's upper bound is exclusive, so the shifted window [i + 1,
    # i + 1 + block_size) never runs past the end of ``data``.
    offsets = torch.randint(len(data) - block_size, (batch_size,))

    def window(begin):
        chunk = data[begin:begin + block_size]
        return torch.from_numpy(chunk.astype(np.int64))

    x = torch.stack([window(i) for i in offsets])
    y = torch.stack([window(i + 1) for i in offsets])
    return x, y

  datasets = torch.load('../saving/threshold/chess/datasets.pt')


### 保存激活值和专家路由的数据集

In [4]:
# --- Run configuration -------------------------------------------------------
sparsity_level = 0.8
device_2 = 'cpu'
avg_loss = 0.0
n_batch = 64       # divisor used when averaging the loss
accum_steps = 2    # batches processed per printed step
batch_size = 1
block_size = 2048
torch.manual_seed(42)  # makes the random window sampling repeatable
n_layers = len(model.model.layers)
n_experts = len(model.model.layers[0].block_sparse_moe.experts)

# Create the output directory before the forward passes: torch.save does not
# create missing directories, and failing at the very end would throw away
# the whole evaluation run.
os.makedirs('merge', exist_ok=True)

# --- Forward passes ----------------------------------------------------------
# Presumably each MoE block records its `activations` and `gate_logits` as a
# side effect of the forward pass (custom modeling code) -- TODO confirm.
with torch.no_grad():
    for step in range(n_batch // accum_steps):
        print(step * accum_steps)
        for batch_idx in range(accum_steps):
            # `labels` (the shifted window) is not used; the inputs themselves
            # are passed as labels to the model.
            inputs, labels = get_batch(datasets['validation'], batch_size, block_size)
            inputs = inputs.cuda()
            outputs = model(inputs, labels=inputs)
            avg_loss = avg_loss + outputs.loss / n_batch

# --- Save per-layer pairs ----------------------------------------------------
# For every layer after the first, pair the previous layer's recorded
# activations with this layer's recorded gate logits and write them to disk.
for layer_idx in range(1, n_layers):
    d = list(zip(
        model.model.layers[layer_idx - 1].block_sparse_moe.activations,
        model.model.layers[layer_idx].block_sparse_moe.gate_logits,
    ))

    torch.save(d, f'merge/a2ef_{layer_idx}.pth')

0
2
4
6
8
10
12
14
16
18
20
22
24
26
28
30
32
34
36
38
40
42
44
46
48
50
52
54
56
58
60
62
