In [1]:
import torch
import json
# from modeling_llama_up import set_profile_mode
import os
import csv
from utils import get_model, set_seed

os.environ["CUDA_VISIBLE_DEVICES"] = "1,2"
### from path.json read paths of model and dataset
model_name = "mixtral"
dataset_name = "c4"
with open('../path.json', 'r') as file:
    paths = json.load(file)
    model_path = paths.get(model_name, '')
    dataset_path = paths.get(dataset_name, '')
    save_path = paths.get('chess_up_threshold','')
    print('model path:', model_path, '\ndataset path:', dataset_path, '\nsave path:', save_path)

set_seed(42)
# c4data = get_c4_data(model_path, dataset_path, sample_num = 400)
model = get_model(model_path)

  from .autonotebook import tqdm as notebook_tqdm
MixtralForCausalLM has generative capabilities, as `prepare_inputs_for_generation` is explicitly overwritten. However, it doesn't directly inherit from `GenerationMixin`. From 👉v4.50👈 onwards, `PreTrainedModel` will NOT inherit from `GenerationMixin`, and this model will lose the ability to call `generate` and other related functions.
  - If you are the owner of the model architecture code, please modify your model class such that it inherits from `GenerationMixin` (after `PreTrainedModel`, otherwise you'll get an exception).
  - If you are not the owner of the model architecture class, please contact the model code owner to update it.


model path: /home/lz/Mixtral-8x7B-v0.1 
dataset path: /home/lz/c4 
save path: /home/lz/On-the-Fly_MoE_Inference/saving/threshold/c4_mixtral_up


Loading checkpoint shards: 100%|██████████| 19/19 [01:06<00:00,  3.51s/it]


with sparsity of 0


In [4]:
from datasets import load_dataset
from transformers import AutoTokenizer

raw_datasets = load_dataset("/home/lz/c4")

tokenizer = AutoTokenizer.from_pretrained(model_path)
def process(example):
    ids = tokenizer.encode(example['text'])
    out = {'ids': ids, 'len': len(ids)}
    return out

tokenized = raw_datasets.map(process, desc='tokenizing raw datasets', num_proc=64)
import numpy as np
datasets = dict()

for split, dset in tokenized.items():
    datasets[split] = []
    length = np.sum(dset['len'])
    datasets[split] = np.ndarray((length, ), np.uint32)
    idx = 0
    for row in dset:
        datasets[split][idx:idx + row['len']] = row['ids']
        idx += row['len']
torch.save(datasets, 'datasets.pt')

Generating validation split: 0 examples [00:00, ? examples/s]

Generating validation split: 45576 examples [00:00, 121496.76 examples/s]
tokenizing raw datasets (num_proc=64): 100%|██████████| 45576/45576 [00:02<00:00, 17946.73 examples/s]


In [2]:
datasets = torch.load('./threshold/chess/datasets.pt')
import torch
import numpy as np
def get_batch(data, batch_size, block_size):
    start_idxs = torch.randint(len(data) - block_size, (batch_size,))
    x = torch.stack([torch.from_numpy((data[i:i+block_size]).astype(np.int64)) for i in start_idxs])
    y = torch.stack([torch.from_numpy((data[i+1:i+1+block_size]).astype(np.int64)) for i in start_idxs])
    return x, y

  datasets = torch.load('./threshold/chess/datasets.pt')


In [4]:
sparsity_level = 0.7
# device = 'cuda:1'
device_2 = 'cpu'
avg_loss = 0.0
n_batch = 64
# accum_steps = 4 
accum_steps = 2
batch_size = 1
block_size = 2048
torch.manual_seed(42)
n_layers = len(model.model.layers)
n_experts = len(model.model.layers[0].block_sparse_moe.experts)

up_proj_states_thresholds = [torch.zeros([n_experts,]) for _ in range(n_layers)]
gate_proj_states_mean_squares = [[torch.zeros(model.config.intermediate_size) for _ in range(n_experts)] for _ in range(n_layers)]

up_states = [[torch.zeros([accum_steps * batch_size * block_size //2, model.config.intermediate_size]) for _ in range(n_experts)] for _ in range(n_layers)]
gate_states = [[torch.zeros([accum_steps * batch_size * block_size //2, model.config.intermediate_size]) for _ in range(n_experts)] for _ in range(n_layers)]

with torch.no_grad():
    for step in range(n_batch // accum_steps):
        print(step * accum_steps)
        all_counts = [0 for _ in range(n_layers * n_experts)]
        for batch_idx in range(accum_steps):
            # print('batch_idx:', batch_idx)
            inputs, labels = get_batch(datasets['validation'], batch_size, block_size)
            inputs = inputs.cuda()
            outputs = model(inputs, labels=inputs)
            avg_loss = avg_loss + outputs.loss / n_batch

            for layer_idx in range(n_layers):
                for expert_idx in range(n_experts):
                    counts = all_counts[layer_idx * n_experts + expert_idx]

                    states = model.model.layers[layer_idx].block_sparse_moe.experts[expert_idx].up_proj_states.reshape(-1, model.config.intermediate_size)
                    cur_counts = states.size(0)
                    # print('counts and cur_counts:',counts, cur_counts)
                    # print(states.size())
                    # print(up_states[layer_idx][expert_idx][counts : counts+cur_counts, :].size())
                    up_states[layer_idx][expert_idx][counts : counts+cur_counts, :] = states

                    states = model.model.layers[layer_idx].block_sparse_moe.experts[expert_idx].gate_proj_states.reshape(-1, model.config.intermediate_size)
                    gate_states[layer_idx][expert_idx][counts : counts+cur_counts, :] = states
                    # counts += cur_counts
                    all_counts[layer_idx * n_experts + expert_idx] += cur_counts

        for layer_idx in range(n_layers):   
            for expert_idx in range(n_experts):
                # print('layer_idx:', layer_idx, 'expert_idx:', expert_idx)
                useful_num = all_counts[layer_idx * n_experts + expert_idx]
                topk_num = int(useful_num * model.config.intermediate_size * sparsity_level)
                up_proj_states_thresholds[layer_idx][expert_idx] += up_states[layer_idx][expert_idx][0:useful_num,:].to(device_2).abs().flatten().kthvalue(topk_num).values.to('cpu')
                gate_proj_states_mean_squares[layer_idx][expert_idx] += (torch.sum(gate_states[layer_idx][expert_idx][0:useful_num,:].to(device_2) ** 2, dim=0).to('cpu') / useful_num).to('cpu')

for layer_idx in range(n_layers):
    for expert_idx in range(n_experts):
        gate_proj_states_mean_squares[layer_idx][expert_idx] /= n_batch // accum_steps
        up_proj_states_thresholds[layer_idx][expert_idx] /= n_batch // accum_steps

0
2
4
6
8
10
12
14
16
18
20
22
24
26
28
30
32
34
36
38
40
42
44
46
48
50
52
54
56
58
60
62


In [5]:
gate_proj_states_mean_squares[0][1],up_proj_states_thresholds[0][1]

(tensor([0., 0., 0.,  ..., 0., 0., 0.]), tensor(0.0481))

In [7]:
importance_thresholds = [torch.zeros([n_experts,]) for _ in range(n_layers)]
up_proj_states_thresholds_2 = [[torch.zeros(model.config.intermediate_size) for _ in range(n_experts)] for _ in range(n_layers)]

with torch.no_grad():
    for step in range(n_batch // accum_steps):
        print(step * accum_steps)
        all_counts = [0 for _ in range(n_layers * n_experts)]
        for batch_idx in range(accum_steps):
            inputs, labels = get_batch(datasets['validation'], batch_size, block_size)
            inputs = inputs.cuda()
            outputs = model(inputs, labels=inputs)
            avg_loss = avg_loss + outputs.loss / n_batch

            for layer_idx in range(n_layers):
                for expert_idx in range(n_experts):
                    counts = all_counts[layer_idx * n_experts + expert_idx]
                    states = model.model.layers[layer_idx].block_sparse_moe.experts[expert_idx].up_proj_states.reshape(-1, states.size(-1))
                    cur_counts = states.size(0)
                    up_states[layer_idx][expert_idx][counts:cur_counts+counts, :] = states
                    # counts += cur_counts
                    all_counts[layer_idx * n_experts + expert_idx] += cur_counts
                
        for layer_idx in range(n_layers):   
            for expert_idx in range(n_experts):
                useful_num = all_counts[layer_idx * n_experts + expert_idx]
                importance_scores = up_states[layer_idx][expert_idx][:useful_num,:] ** 2 * gate_proj_states_mean_squares[layer_idx][expert_idx]
                importance_thresholds[layer_idx][expert_idx] += importance_scores.to(device_2).flatten().kthvalue(int(importance_scores.numel() * sparsity_level)).values.to('cpu')

for layer_idx in range(n_layers):
    for expert_idx in range(n_experts):
        importance_thresholds[layer_idx][expert_idx] /= n_batch // accum_steps
        up_proj_states_thresholds_2[layer_idx][expert_idx] = (importance_thresholds[layer_idx][expert_idx].expand_as(up_proj_states_thresholds_2[layer_idx][expert_idx]) / gate_proj_states_mean_squares[layer_idx][expert_idx]) ** 0.5

thresholds = {'up_proj_states_thresholds': up_proj_states_thresholds, 'up_proj_states_thresholds_2': up_proj_states_thresholds_2}

torch.save(thresholds, f'{save_path}/thresholds.pt')

0
2
4


In [18]:
print(thresholds["gate_proj_states_thresholds_2"])

[tensor([0.0430, 0.0532, 0.0464,  ..., 0.0520, 0.0472, 0.0433]), tensor([0.0546, 0.0589, 0.0626,  ..., 0.0616, 0.0647, 0.0532]), tensor([0.0695, 0.0710, 0.0788,  ..., 0.0764, 0.0781, 0.0753]), tensor([0.1077, 0.1176, 0.1157,  ..., 0.1057, 0.1080, 0.1078]), tensor([0.1757, 0.1625, 0.1389,  ..., 0.1653, 0.1647, 0.1757]), tensor([0.1111, 0.1100, 0.1518,  ..., 0.1844, 0.1667, 0.1718]), tensor([0.1231, 0.1730, 0.1974,  ..., 0.1832, 0.1677, 0.1704]), tensor([0.1086, 0.1731, 0.0501,  ..., 0.1701, 0.1229, 0.1641]), tensor([0.1794, 0.2142, 0.1114,  ..., 0.2115, 0.1843, 0.1764]), tensor([0.1431, 0.1840, 0.1670,  ..., 0.1544, 0.1965, 0.1978]), tensor([0.1780, 0.1699, 0.1990,  ..., 0.1513, 0.1691, 0.1669]), tensor([0.1339, 0.1460, 0.2050,  ..., 0.1584, 0.1386, 0.1630]), tensor([0.1705, 0.1448, 0.1696,  ..., 0.2083, 0.1902, 0.2003]), tensor([0.2199, 0.1668, 0.1999,  ..., 0.1858, 0.1896, 0.1733]), tensor([0.1939, 0.2284, 0.1840,  ..., 0.1795, 0.2215, 0.2305]), tensor([0.2053, 0.2025, 0.2201,  ..., 0