## PD结合的思路

### 分别替换gate/up层

先输入一句话，按照70%稀疏去记录prefill阶段激活的神经元，最后统计这个输入prompt对应的最高激活次数的70%的神经元。

In [1]:
import os
import json
import torch
from transformers import LlamaForCausalLM, AutoTokenizer
import convert_llama
from transformers import GenerationConfig
from datasets import load_dataset

os.environ["CUDA_VISIBLE_DEVICES"] = "1"

### from path.json read paths of model and dataset
model_name = "Llama3-8b"
dataset_name = "c4"
with open('path.json', 'r') as file:
    paths = json.load(file)
    model_path = paths.get(model_name, '')
    dataset_path = paths.get(dataset_name, '')

c4 = load_dataset(dataset_path)
model = LlamaForCausalLM.from_pretrained(
    model_path,
    device_map='auto',
    use_cache=True,
    torch_dtype=torch.float16,
)
convert_llama.convert_llama_model(model, sparsity=0.1, start_num=14, end_num=16, )

tokenizer = AutoTokenizer.from_pretrained(model_path)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

  from .autonotebook import tqdm as notebook_tqdm
Loading checkpoint shards: 100%|██████████| 4/4 [00:08<00:00,  2.03s/it]
Convert Llama Models: 452it [00:00, 167742.47it/s]


Converted Model Done


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [2]:
for c4_demo in c4['validation']['text'][:1]:
    input_demo = tokenizer(c4_demo, padding="max_length", truncation=True, max_length=200, return_tensors="pt")
    generate_ids = model.generate(input_demo.input_ids.to('cuda:0'), max_length=230, generation_config=GenerationConfig(do_sample=False), pad_token_id=tokenizer.eos_token_id)
    tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
    model.model.layers[15].mlp.gate_proj.coreinfer_recall()
    model.model.layers[15].mlp.up_proj.coreinfer_recall()


[prefill] in gate layer: 15
[prefill] in up layer: 15
in decode, gate layer 15
Overlap count: 1243.8621, Overlap ratio: 0.8680
in decode, up layer 15
Overlap count: 1080.8966, Overlap ratio: 0.7543


In [2]:
### 使用greedy decode
generate_ids = model.generate(input_demo.input_ids.to('cuda:0'), max_length=230, generation_config=GenerationConfig(do_sample=False), pad_token_id=tokenizer.eos_token_id)
tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]

[prefill] in gate layer: 15
[prefill] in up layer: 15


'The woman who died after falling from a bridge over the A21 has been identified as a Sevenoaks mum.\nMarta Kendle, 37, fell from the Gracious Lane bridge on the morning of February 19.\nPolice were called to the carriageway around 6.10am and the road was promptly closed in both directions.\nDespite paramedics best efforts, Marta, who was originally from Poland, was pronounced dead at the scene.\nKent and Medway Coroners office have confirmed an inquest into her death will open on Wednesday (February 27).\nTributes to the mum were left at the scene and on social media.\nFriend, Jodi Cahill posted on Facebook: "I will certainly remember you. I am sorry we did not see how lost and alone you felt.\n"Be at peace dear Marta."\nA floral tribute left at the scene said goodbye to the "beautiful and kind soul".\nIt read: "To a beautiful and kind soul. You will be missed. Rest in peace."\nA spokesman for Kent Police said: "Officers were called to the A21 at Gracious Lane,'

In [3]:
model.model.layers[15].mlp.gate_proj.coreinfer_recall()
model.model.layers[15].mlp.up_proj.coreinfer_recall()

in decode, gate layer 15
Overlap count: 1243.8621, Overlap ratio: 0.8680
in decode, up layer 15
Overlap count: 1080.8966, Overlap ratio: 0.7543


## 加载保存的激活值

In [3]:
from torch import nn
import torch.nn.init as init
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "1"
    
from torch.utils.tensorboard import SummaryWriter
from torch.cuda.amp import GradScaler, autocast  # 用于混合精度训练
from torch.utils.data import DataLoader, Dataset, random_split
import torch.optim as optim
import torch

def load_datasets(layerid = 1, expertid = 0):
    datasets_x = []
    datasets_y = []
    datasets_x1 = []
    for fileid in range(1, 5):
        # print(fileid)
        # 加一个map_location
        d = torch.load(f'/mnt/newdata/lz/sparsity/c4_llama/new_gate_and_up/{fileid}-{layerid}.pth', map_location=lambda storage, loc: storage.cuda(0))
        datasets_x.append(d[0])
        datasets_x1.append(d[1])
        datasets_y.append(d[2])
    # 
    x,x1,y = torch.cat(datasets_x,dim=1), torch.cat(datasets_x1,dim=1), torch.cat(datasets_y,dim=1)
    datasets_x.clear()
    datasets_y.clear()
    datasets_x1.clear()
    x = x.reshape(-1, 4096)
    x1 = x1.reshape(-1, 14336)
    y = y.reshape(-1, 14336)
    # print(x[0].shape)
    return x,x1,y

class CustomDataset(Dataset):
    def __init__(self, layerid = 1, expertid = 0):
        # 加载数据self.data_x1,
        self.data_x, self.data_x1, self.data_y = load_datasets(layerid)
        print(len(self.data_x1),len(self.data_x),len(self.data_y))

    def __len__(self):
        return len(self.data_x)
    
    def __getitem__(self, idx):
        return self.data_x[idx],self.data_x1[idx],self.data_y[idx]

In [4]:
# expertid = 2
layerid = 15
dataset = CustomDataset(layerid)
print(len(dataset), dataset[0][0].shape, dataset[0][1].shape) # torch.Size([512, 4096])
# 划分训练集和验证集
train_size = int(0.9 * len(dataset))
val_size = len(dataset) - train_size
train_dataset, val_dataset = random_split(dataset, [train_size, val_size])
train_loader = DataLoader(train_dataset, batch_size=1024, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=1024, shuffle=False)


110137 110137 110137
110137 torch.Size([4096]) torch.Size([14336])
