### 加载模型

In [1]:
import os

# Restrict the visible GPUs *before* torch (or any module that may touch CUDA)
# is imported: the variable only takes effect if it is set before CUDA is
# initialised in this process.
# NOTE(review): the device ids "1,2" are machine-specific — adjust per host.
os.environ["CUDA_VISIBLE_DEVICES"] = "1,2"

import json

import torch
from transformers import AutoTokenizer

from modeling_mixtral_2 import MixtralForCausalLM

def _load_model(model_name = "Llama3-8b"):
    """Load a causal LM and its tokenizer by the name registered in ``../path.json``.

    Args:
        model_name: Key looked up in ``../path.json``; the associated value is
            passed to ``from_pretrained`` as the checkpoint location.
            NOTE(review): the default is "Llama3-8b" although the weights are
            always loaded through ``MixtralForCausalLM`` — confirm the default
            is still wanted.

    Returns:
        A ``(model, tokenizer)`` tuple. The tokenizer is configured to pad on
        the left using its EOS token.

    Raises:
        KeyError: if ``model_name`` has no non-empty entry in ``../path.json``.
    """
    print(f"Loading model {model_name}")

    # path.json maps names to the on-disk paths of models and datasets.
    with open('../path.json', 'r') as file:
        paths = json.load(file)

    # Fail fast with a clear message instead of handing an empty path to
    # from_pretrained, which would surface as an unrelated-looking error.
    model_path = paths.get(model_name, '')
    if not model_path:
        raise KeyError(
            f"No path configured for model {model_name!r} in ../path.json; "
            f"available keys: {sorted(paths)}"
        )

    model = MixtralForCausalLM.from_pretrained(
        model_path,
        device_map='auto',
        use_cache=True,
        torch_dtype=torch.float16,
    )

    tokenizer = AutoTokenizer.from_pretrained(model_path)
    # Reuse EOS as the padding token and pad on the left.
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.padding_side = "left"
    return model, tokenizer

from convert_model import convert_mixtral_model

# Key into ../path.json selecting which checkpoint _load_model reads.
model_name = 'mixtral'
model, tokenizer = _load_model(model_name)
# Replace `model` with the converted version returned by the project helper.
# NOTE(review): start_num / end_num presumably bound the range of layers that
# get converted and gamma is a conversion hyper-parameter — confirm against
# convert_model.convert_mixtral_model; their meaning is not visible here.
model = convert_mixtral_model(model, start_num=-1, end_num=32, gamma=0.2,)

  from .autonotebook import tqdm as notebook_tqdm
MixtralForCausalLM has generative capabilities, as `prepare_inputs_for_generation` is explicitly overwritten. However, it doesn't directly inherit from `GenerationMixin`. From 👉v4.50👈 onwards, `PreTrainedModel` will NOT inherit from `GenerationMixin`, and this model will lose the ability to call `generate` and other related functions.
  - If you are the owner of the model architecture code, please modify your model class such that it inherits from `GenerationMixin` (after `PreTrainedModel`, otherwise you'll get an exception).
  - If you are not the owner of the model architecture class, please contact the model code owner to update it.


Loading model mixtral


Loading checkpoint shards: 100%|██████████| 19/19 [00:22<00:00,  1.16s/it]
  thresholds = torch.load(threshold_path)["up_proj_states_thresholds_2"]


Converted Model Done


### 加载数据集

In [2]:
import numpy as np
import torch

# NOTE(security): torch.load unpickles the file, which can execute arbitrary
# code — only load files from a trusted source.
# weights_only=False is passed explicitly because the stored splits are used as
# NumPy arrays by get_batch (`.astype`, `torch.from_numpy`); this keeps the
# load independent of the library's default for `weights_only`.
datasets = torch.load('../saving/threshold/chess/datasets.pt', weights_only=False)
def get_batch(data, batch_size, block_size):
    """Sample a random batch of contiguous windows from a 1-D token array.

    Args:
        data: NumPy-style array of token ids (must support slicing and ``astype``).
        batch_size: Number of windows to draw.
        block_size: Length of each window.

    Returns:
        ``(x, y)``: two int64 tensors of shape ``(batch_size, block_size)``,
        where ``y`` is the same window as ``x`` shifted one position forward.
    """
    # One random start offset per sample, drawn from torch's global RNG.
    offsets = torch.randint(len(data) - block_size, (batch_size,)).tolist()

    def _window(begin):
        # Copy the slice to int64 so it can be wrapped as a torch tensor.
        chunk = data[begin:begin + block_size]
        return torch.from_numpy(chunk.astype(np.int64))

    x = torch.stack([_window(begin) for begin in offsets])
    y = torch.stack([_window(begin + 1) for begin in offsets])
    return x, y

  datasets = torch.load('../saving/threshold/chess/datasets.pt')


In [None]:
# Estimate the model's mean loss over randomly sampled validation windows.
# NOTE(review): sparsity_level and device_2 are not used in the visible part
# of this cell — confirm whether they are still needed.
sparsity_level = 0.8
# device = 'cuda:1'
device_2 = 'cpu'
avg_loss = 0.0
# Total number of sampled batches; each contributes loss / n_batch below.
n_batch = 64 * 4
# accum_steps = 4 
accum_steps = 2
batch_size = 1
block_size = 2048
# Seed torch's global RNG so the windows drawn by get_batch are reproducible.
torch.manual_seed(42)

# Evaluation only: no gradients are needed.
with torch.no_grad():
    # n_batch // accum_steps outer steps x accum_steps inner steps = n_batch
    # forward passes in total (n_batch is divisible by accum_steps here).
    for step in range(n_batch // accum_steps):
        # Progress indicator: number of batches processed so far.
        print(step * accum_steps)
        for batch_idx in range(accum_steps):
            # print('batch_idx:', batch_idx)
            inputs, labels = get_batch(datasets['validation'], batch_size, block_size)
            inputs = inputs.cuda()
            # NOTE(review): `labels` from get_batch is unused; the model gets
            # labels=inputs, which presumably relies on the model shifting the
            # labels internally — confirm against modeling_mixtral_2.
            outputs = model(inputs, labels=inputs)
            # Running mean: every batch adds its loss scaled by 1 / n_batch.
            avg_loss = avg_loss + outputs.loss / n_batch