In [1]:
import torch
import json
from modeling_llama_up import set_profile_mode
import os
import csv
from utils import get_c4_data, get_model, set_seed
from tqdm import tqdm

# Expose only GPU index 1 to CUDA for this process.
os.environ["CUDA_VISIBLE_DEVICES"] = "1"

# Resolve the model / dataset / output locations from path.json.
# A key that is missing from the file falls back to the empty string.
model_name = "Llama3-8b"
dataset_name = "c4"
with open('../path.json', 'r') as file:
    paths = json.load(file)

model_path = paths.get(model_name, '')
dataset_path = paths.get(dataset_name, '')
save_path = paths.get('threshold_up_new', '')
print('model path:', model_path, '\ndataset path:', dataset_path, '\nsave path:', save_path)

# Seed first, then load the model and switch profile mode off.
set_seed(42)
# c4data = get_c4_data(model_path, dataset_path, sample_num = 400)
model = get_model(model_path)
set_profile_mode(mode=False)


model path: /mnt/storage/zyx/Meta-Llama-3-8B 
dataset path: /home/lz/workspace/llama2-7b/HQQ/notebooks/draft 
save path: /home/lz/workspace/llama2-7b/on_the_fly_moe/saving/threshold/chess


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

with sparsity of 0
profile mode is False


In [2]:
import os

import numpy as np
from datasets import load_dataset
from transformers import AutoTokenizer

# Cache file for the packed token arrays. This is the path the later
# `torch.load` cell reads, so a fresh Restart & Run All finds the file.
DATASETS_CACHE_PATH = './saving/threshold/chess/datasets.pt'

raw_datasets = load_dataset(dataset_path)

tokenizer = AutoTokenizer.from_pretrained(model_path)


def process(example):
    """Tokenize one raw example.

    Returns a dict holding the token ids of ``example['text']`` under
    ``'ids'`` and the number of ids under ``'len'``.
    """
    ids = tokenizer.encode(example['text'])
    return {'ids': ids, 'len': len(ids)}


tokenized = raw_datasets.map(process, desc='tokenizing raw datasets', num_proc=64)

# Pack each split into one flat uint32 array: the token ids of every row,
# concatenated in dataset order.
datasets = dict()
for split, dset in tokenized.items():
    length = int(np.sum(dset['len']))
    # np.empty is enough here: every slot is overwritten by the loop below.
    packed = np.empty((length,), dtype=np.uint32)
    idx = 0
    for row in dset:
        packed[idx:idx + row['len']] = row['ids']
        idx += row['len']
    datasets[split] = packed

os.makedirs(os.path.dirname(DATASETS_CACHE_PATH), exist_ok=True)
torch.save(datasets, DATASETS_CACHE_PATH)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [5]:
# Reload the cached `datasets` object from disk.
# Presumably this is the dict of packed per-split token arrays built by the
# tokenization cell -- TODO confirm the file at this path is that cache.
# NOTE(review): torch.load unpickles the file; only load caches you trust.
datasets = torch.load('./saving/threshold/chess/datasets.pt')

In [4]:
def get_batch(data, batch_size, block_size):
    """Sample ``batch_size`` random windows of ``block_size`` tokens.

    Args:
        data: 1-D numpy array of token ids.
        batch_size: number of windows to draw.
        block_size: number of tokens per window.

    Returns:
        ``(x, y)``: two int64 tensors of shape ``(batch_size, block_size)``.
        Row ``k`` of ``y`` is the same window as row ``k`` of ``x`` shifted
        one position to the right in ``data``.
    """
    def window(offset):
        # Slice out one window and widen it to int64 for torch.
        return torch.from_numpy(data[offset:offset + block_size].astype(np.int64))

    # Start offsets are drawn uniformly from [0, len(data) - block_size),
    # which leaves room for the one-step-shifted target window.
    offsets = torch.randint(len(data) - block_size, (batch_size,)).tolist()
    x = torch.stack([window(start) for start in offsets])
    y = torch.stack([window(start + 1) for start in offsets])
    return x, y

In [5]:
# ---- Calibration settings ----
sparsity_level = 0.7
# device = 'cuda:1'
device_2 = 'cpu'
avg_loss = 0.0
n_batch = 64
accum_steps = 4
batch_size = 1
block_size = 2048
torch.manual_seed(42)

# Sizes that are reused throughout this cell.
num_layers = len(model.model.layers)
rows_per_batch = batch_size * block_size
rows_per_step = accum_steps * rows_per_batch
num_steps = n_batch // accum_steps
ffn_dim = model.config.intermediate_size
hidden_dim = model.config.hidden_size

# Per-layer running sums; they are divided by the step count at the end.
gate_proj_states_thresholds = [torch.zeros([1,]) for _ in range(num_layers)]
up_proj_states_mean_squares = [torch.zeros(ffn_dim) for _ in range(num_layers)]
attention_inputs_thresholds = [torch.zeros([1,]) for _ in range(num_layers)]
attention_outputs_thresholds = [torch.zeros([1,]) for _ in range(num_layers)]

# Per-layer buffers holding the activations of one accumulation step
# (accum_steps batches), one row per token.
gate_proj_states = [torch.zeros([rows_per_step, ffn_dim]) for _ in range(num_layers)]
up_proj_states = [torch.zeros([rows_per_step, ffn_dim]) for _ in range(num_layers)]
attention_input_states = [torch.zeros([rows_per_step, hidden_dim]) for _ in range(num_layers)]
attention_output_states = [torch.zeros([rows_per_step, hidden_dim]) for _ in range(num_layers)]

with torch.no_grad():
    for step in range(num_steps):
        print(step * accum_steps)
        for batch_idx in range(accum_steps):
            inputs, labels = get_batch(datasets['validation'], batch_size, block_size)
            inputs = inputs.cuda()
            outputs = model(inputs, labels=inputs)
            avg_loss = avg_loss + outputs.loss / n_batch

            # Rows of the step buffers that belong to this batch.
            row_slice = slice(batch_idx * rows_per_batch, (batch_idx + 1) * rows_per_batch)
            for layer_idx, layer in enumerate(model.model.layers):
                # gate/up are swapped on purpose: the "gate" buffer receives
                # the up_proj states and the "up" buffer the gate_proj states.
                # states = layer.mlp.gate_proj_states
                states = layer.mlp.up_proj_states
                gate_proj_states[layer_idx][row_slice, :] = states.reshape(-1, states.size(-1))

                # states = layer.mlp.up_proj_states
                states = layer.mlp.gate_proj_states
                up_proj_states[layer_idx][row_slice, :] = states.reshape(-1, states.size(-1))

                # states = layer.self_attn.attention_input_states
                # attention_input_states[layer_idx][row_slice, :] = states.reshape(-1, states.size(-1))

                # states = layer.self_attn.attention_output_states
                # attention_output_states[layer_idx][row_slice, :] = states.reshape(-1, states.size(-1))

        for layer_idx in range(num_layers):
            # Magnitude threshold: the k-th smallest |activation| of this step,
            # with k = numel * sparsity_level.
            kth = int(gate_proj_states[layer_idx].numel() * sparsity_level)
            gate_proj_states_thresholds[layer_idx] += (
                gate_proj_states[layer_idx]
                .to(device_2)
                .abs()
                .flatten()
                .kthvalue(kth)
                .values
                .to('cpu')
            )

            # attention_inputs_thresholds[layer_idx] += attention_input_states[layer_idx].to(device_2).abs().flatten().kthvalue(int(attention_input_states[layer_idx].numel() * sparsity_level)).values.to('cpu')

            # attention_outputs_thresholds[layer_idx] += attention_output_states[layer_idx].to(device_2).abs().flatten().kthvalue(int(attention_output_states[layer_idx].numel() * sparsity_level)).values.to('cpu')

            # Per-column mean of the squared activations over the step's rows.
            column_sq_sum = torch.sum(up_proj_states[layer_idx].to(device_2) ** 2, dim=0).to('cpu')
            up_proj_states_mean_squares[layer_idx] += (column_sq_sum / up_proj_states[layer_idx].size(0)).to('cpu')

# Turn the running sums into averages over the accumulation steps.
for layer_idx in range(num_layers):
    gate_proj_states_thresholds[layer_idx] /= num_steps
    # attention_inputs_thresholds[layer_idx] /= num_steps
    # attention_outputs_thresholds[layer_idx] /= num_steps
    up_proj_states_mean_squares[layer_idx] /= num_steps



0
4
8
12
16
20
24
28
32
36
40
44
48
52
56
60


In [7]:
# Display the running loss accumulated so far (each forward pass adds
# outputs.loss / n_batch to it).
avg_loss

0.0

In [6]:
# Second calibration pass: derive per-neuron thresholds from an importance
# score instead of from the raw activation magnitude.
importance_thresholds = [torch.zeros([1,]) for _ in range(len(model.model.layers))]
gate_proj_states_thresholds_2 = [torch.zeros(model.config.intermediate_size) for _ in range(len(model.model.layers))]

# Restart the running loss for this pass. Without the reset, this cell keeps
# adding to whatever value `avg_loss` already holds (from the previous pass or
# from an earlier run of this very cell), so the result would be a sum of
# per-pass means rather than a mean.
avg_loss = 0.0

with torch.no_grad():
    for step in range(n_batch // accum_steps):
        print(step * accum_steps)
        for batch_idx in range(accum_steps):
            inputs, labels = get_batch(datasets['validation'], batch_size, block_size)
            inputs = inputs.cuda()
            outputs = model(inputs, labels=inputs)
            avg_loss = avg_loss + outputs.loss / n_batch

            for layer_idx in range(len(model.model.layers)):
                # The buffer named "gate" is filled with the up_proj states here.
                # states = model.model.layers[layer_idx].mlp.gate_proj_states
                states = model.model.layers[layer_idx].mlp.up_proj_states
                gate_proj_states[layer_idx][batch_idx * batch_size * block_size : (batch_idx + 1) * batch_size * block_size, :] = states.reshape(-1, states.size(-1))

        for layer_idx in range(len(model.model.layers)):
            # importance = activation^2 * per-neuron mean square collected by
            # the previous calibration cell (broadcast over the token rows).
            importance_scores = gate_proj_states[layer_idx] ** 2 * up_proj_states_mean_squares[layer_idx]
            # Accumulate the k-th smallest importance, k = numel * sparsity_level.
            importance_thresholds[layer_idx] += importance_scores.to(device_2).flatten().kthvalue(int(importance_scores.numel() * sparsity_level)).values.to('cpu')

for layer_idx in range(len(model.model.layers)):
    # Average over the accumulation steps.
    importance_thresholds[layer_idx] /= n_batch // accum_steps
    # Invert the importance formula per neuron:
    # |activation| threshold = sqrt(importance threshold / mean square).
    gate_proj_states_thresholds_2[layer_idx] = (importance_thresholds[layer_idx].expand_as(gate_proj_states_thresholds_2[layer_idx]) / up_proj_states_mean_squares[layer_idx]) ** 0.5

# Both threshold sets are stored under 'up_proj_*' key names.
thresholds = {'up_proj_states_thresholds': gate_proj_states_thresholds, 'up_proj_states_thresholds_2': gate_proj_states_thresholds_2}

# NOTE(review): this reads 'path.json' relative to the working directory,
# while the setup cell reads '../path.json' -- confirm which one is intended.
with open('path.json', 'r') as file:
    paths = json.load(file)
    save_path = paths.get('threshold_up_new','')
    print('save path:', save_path)
torch.save(thresholds, save_path)

0
4
8
12
16
20
24
28
32
36
40
44
48
52
56
60


In [18]:
# The `thresholds` dict is built with 'up_proj_*' key names, so the per-neuron
# thresholds live under 'up_proj_states_thresholds_2'; the former
# 'gate_proj_states_thresholds_2' key does not exist in it and raised KeyError.
print(thresholds["up_proj_states_thresholds_2"])

[tensor([0.0430, 0.0532, 0.0464,  ..., 0.0520, 0.0472, 0.0433]), tensor([0.0546, 0.0589, 0.0626,  ..., 0.0616, 0.0647, 0.0532]), tensor([0.0695, 0.0710, 0.0788,  ..., 0.0764, 0.0781, 0.0753]), tensor([0.1077, 0.1176, 0.1157,  ..., 0.1057, 0.1080, 0.1078]), tensor([0.1757, 0.1625, 0.1389,  ..., 0.1653, 0.1647, 0.1757]), tensor([0.1111, 0.1100, 0.1518,  ..., 0.1844, 0.1667, 0.1718]), tensor([0.1231, 0.1730, 0.1974,  ..., 0.1832, 0.1677, 0.1704]), tensor([0.1086, 0.1731, 0.0501,  ..., 0.1701, 0.1229, 0.1641]), tensor([0.1794, 0.2142, 0.1114,  ..., 0.2115, 0.1843, 0.1764]), tensor([0.1431, 0.1840, 0.1670,  ..., 0.1544, 0.1965, 0.1978]), tensor([0.1780, 0.1699, 0.1990,  ..., 0.1513, 0.1691, 0.1669]), tensor([0.1339, 0.1460, 0.2050,  ..., 0.1584, 0.1386, 0.1630]), tensor([0.1705, 0.1448, 0.1696,  ..., 0.2083, 0.1902, 0.2003]), tensor([0.2199, 0.1668, 0.1999,  ..., 0.1858, 0.1896, 0.1733]), tensor([0.1939, 0.2284, 0.1840,  ..., 0.1795, 0.2215, 0.2305]), tensor([0.2053, 0.2025, 0.2201,  ..., 0

In [19]:
# Load a thresholds file from disk and pull out the entry stored under
# "gate_proj_states_thresholds_2".
# NOTE(review): presumably a list with one tensor per layer -- confirm against
# the code that wrote this file. torch.load unpickles; only load trusted files.
pt2 = torch.load('./threshold/chess/thresholds_0_7.pt')["gate_proj_states_thresholds_2"]
# Show the shape of the first entry.
pt2[0].shape