In [1]:
# Benchmark names available to this notebook; index 0 is the one used by this run.
dataset = [
    'boolq',
    'piqa',
    'social_i_qa',
    'winogrande',
    'ARC-Easy',
    'ARC-Challenge',
    'openbookqa',
    'hellaswag',
]
# Relative path of the training split for the selected benchmark.
DATAPATH = 'dataset/{}/train.json'.format(dataset[0])

In [None]:
# Copyright (c) 2024, NVIDIA CORPORATION.  All rights reserved.
#
# NVIDIA CORPORATION and its licensors retain all intellectual property
# and proprietary rights in and to this software, related documentation
# and any modifications thereto.  Any use, reproduction, disclosure or
# distribution of this software and related documentation without an express
# license agreement from NVIDIA CORPORATION is strictly prohibited.
import os
os.environ["CUDA_VISIBLE_DEVICES"]="0"
import sys
import json
from typing import List
from badam import BlockOptimizer
import fire
import torch
import transformers
# from datasets import load_dataset
from datasets import load_dataset, concatenate_datasets
from typing import List, Optional, Union
from transformers import TrainerState,TrainingArguments,TrainerControl
from transformers.trainer_utils import PREFIX_CHECKPOINT_DIR
# Replace this with the absolute path to your local peft/src checkout.
sys.path.append("/home/xjz/proj/Subspace-Tuning/CR_MR/peft/src/")
from peft import (  # noqa: E402
    LoraConfig,
    DoraConfig,
    BottleneckConfig,
    PrefixTuningConfig,
    get_peft_model,
    get_peft_model_state_dict,
    prepare_model_for_int8_training,
    set_peft_model_state_dict,
)
from transformers import AutoModelForCausalLM, AutoTokenizer, LlamaTokenizer, AutoModel  # noqa: F402

# Run configuration. Every value below is a plain module-level constant read by
# the later cells; the annotations are documentation only and now match the
# values actually assigned (fractional epochs, Optional for None defaults).

# model/data params
base_model: str = "/home/xjz/model/Qwen2_5_3B"  # the only required argument
data_path: str = DATAPATH
output_dir: str = "/data/xjz/test/"+dataset[0]
adapter_name: str = "lora"
load_8bit : bool = False
# training hyperparams
batch_size:int = 1
micro_batch_size:int =1
num_epochs: float = 1.2  # fractional epoch count, so this is a float rather than an int
learning_rate: float = 1e-5
weight_decay: float = 0.0
cutoff_len: int = 256
val_set_size: int = 0
use_gradient_checkpointing: bool = False
eval_step: int = 0
save_step: int = 3000
# lora hyperparams
lora_r: int = 8
lora_alpha: int = 16
lora_dropout: float = 0.05
lora_target_modules: List[str] = ["q_proj", "k_proj", "v_proj"]
# bottleneck adapter hyperparams
bottleneck_size: int = 256
non_linearity: str = "tanh"
adapter_dropout: float = 0.0
use_parallel_adapter: bool = False
use_adapterp: bool = False
target_modules: Optional[List[str]] = None
# Dora hyperparams
dora_simple: bool = True
Wdecompose_target_modules: Optional[List[str]] = None
scaling: Union[float, str] = 1.0
# prefix tuning hyperparams
num_virtual_tokens: int = 30
# llm hyperparams
train_on_inputs: bool = True  # if False, masks out inputs in loss
group_by_length: bool = False  # faster, but produces an odd training loss curve
# wandb params
wandb_project: str = ""
wandb_run_name: str = ""
wandb_watch: str = "" # options: false | gradients | all
wandb_log_model: str = ""  # options: false | true
resume_from_checkpoint: Optional[str] = None  # either training checkpoint or final adapter

# Number of micro-batches accumulated before each optimizer step.
gradient_accumulation_steps = int(batch_size) // int(micro_batch_size)

# Device placement: "auto" for a single process, one device per rank when the
# WORLD_SIZE environment variable indicates a multi-process launch.
device_map = "auto"
world_size = int(os.environ.get("WORLD_SIZE", 1))
ddp = world_size != 1
if ddp:
    device_map = {"": int(os.environ.get("LOCAL_RANK") or 0)}
    # Floor division keeps the value an int (true division produced a float),
    # and the clamp prevents it from dropping to 0 when there are more ranks
    # than accumulation steps.
    gradient_accumulation_steps = max(1, gradient_accumulation_steps // world_size)

# Check if parameter passed or if set within environ
use_wandb = len(wandb_project) > 0 or (
    "WANDB_PROJECT" in os.environ and len(os.environ["WANDB_PROJECT"]) > 0
)
# Only overwrite environ if wandb param passed
if len(wandb_project) > 0:
    os.environ["WANDB_PROJECT"] = wandb_project
if len(wandb_watch) > 0:
    os.environ["WANDB_WATCH"] = wandb_watch
if len(wandb_log_model) > 0:
    os.environ["WANDB_LOG_MODEL"] = wandb_log_model

# Keyword arguments common to both loading paths: half-precision weights and
# permission to run model code shipped with the checkpoint.
shared_load_kwargs = dict(torch_dtype=torch.float16, trust_remote_code=True)

if not load_8bit:
    # Non-quantized path: the whole model is placed on the device indexed by
    # LOCAL_RANK (0 when unset), regardless of `device_map`.
    model = AutoModelForCausalLM.from_pretrained(
        base_model,
        load_in_8bit=False,
        device_map={"": int(os.environ.get("LOCAL_RANK") or 0)},
        **shared_load_kwargs,
    )
else:
    # 8-bit path: placement follows the `device_map` computed earlier.
    model = AutoModelForCausalLM.from_pretrained(
        base_model,
        load_in_8bit=load_8bit,
        device_map=device_map,
        **shared_load_kwargs,
    )

# Tokenizer selection. LLaMA checkpoints use LlamaTokenizer, except paths
# containing "Llama-3", which are handled separately through AutoTokenizer.
# Every other architecture goes through AutoTokenizer with remote code allowed.
is_llama = model.config.model_type == "llama"
if not is_llama:
    tokenizer = AutoTokenizer.from_pretrained(base_model, trust_remote_code=True)
elif "Llama-3" in base_model:
    print("load llama-3 tokenizer")
    tokenizer = AutoTokenizer.from_pretrained(base_model)
else:
    tokenizer = LlamaTokenizer.from_pretrained(base_model)
print(model)


In [4]:
tempgrad_dict = {}
import pickle
class SkipStepCallback(transformers.TrainerCallback):
    """Trainer callback that accumulates per-parameter squared-gradient sums.

    For every tracked parameter, the sum of squared gradient entries seen in
    each `on_pre_optimizer_step` call is added to `grad_dict[name]`. When
    training ends the totals are printed and written as JSON lines (one
    ``{name: value}`` object per line).

    A parameter is tracked when its name contains none of the substrings in
    `grad_skip_kwd_list`. The same filter is used both when the totals are
    initialised and when they are updated, so the two can no longer disagree
    (previously a name such as ``model.norm.weight`` passed the update filter
    without having been initialised, which raised ``KeyError``).
    """

    # Kept at class level so the totals stay reachable as
    # `SkipStepCallback.grad_dict`; the dict is emptied at the start of every run.
    grad_dict = {}
    has_grad = 0
    # Original (narrower) keyword list, kept for backward compatibility.
    skip_kwd_list = ['input_layernorm', 'post_attention_layernorm', 'bias','lm_head','embed_tokens','rotary_emb','act_fn']
    # Single source of truth for which parameter names are excluded.
    grad_skip_kwd_list = ['head', 'norm', 'bias', 'input_layernorm', 'post_attention_layernorm', 'lm_head', 'embed_tokens', 'rotary_emb', 'act_fn']

    def _is_tracked(self, name):
        """Return True when `name` contains none of the excluded keywords."""
        return not any(kwd in name for kwd in self.grad_skip_kwd_list)

    def on_save(self, args: TrainingArguments, state: TrainerState, control: TrainerControl, **kwargs):
        """No-op hook; nothing extra is done when a checkpoint is saved."""
        return None

    def on_step_begin(self, args, state, control, **kwargs):
        """No-op hook."""
        pass

    def on_pre_optimizer_step(self, args, state, control, **kwargs):
        """Add the squared-gradient sum of every tracked parameter that has a gradient."""
        model = kwargs['model']
        for name, param in model.named_parameters():
            if param.grad is None or not self._is_tracked(name):
                continue
            # Reduce in float32: squaring and summing in a half-precision
            # gradient dtype can overflow to inf or underflow to 0.
            sq_sum = param.grad.detach().float().pow(2).sum().item()
            # .get() tolerates names that were absent when training began.
            self.grad_dict[name] = self.grad_dict.get(name, 0.0) + sq_sum
        self.has_grad = 0

    def on_step_end(self, args, state, control, **kwargs):
        """No-op hook."""
        pass

    def on_train_begin(self, args, state, control, **kwargs):
        """Reset the totals and create a zero entry for every tracked parameter."""
        model = kwargs['model']
        # Drop entries left over from an earlier run in the same kernel.
        self.grad_dict.clear()
        for name, _param in model.named_parameters():
            if self._is_tracked(name):
                self.grad_dict[name] = 0.0

    def on_train_end(self, args, state, control, **kwargs):
        """Print the accumulated totals and write them to disk as JSON lines."""
        json_lines = [json.dumps({key: value}) for key, value in self.grad_dict.items()]
        print(json_lines)
        # Write one JSON object per line.
        # NOTE(review): DATAPATH already ends in 'train.json', so the resulting
        # file name is '<...>/train.jsonexpert3b1152.json' - confirm this is intended.
        Toutput_dir = DATAPATH+r'expert3b1152.json'
        with open(Toutput_dir, 'w') as f:
            for line in json_lines:
                f.write(line + '\n')
        print("JSON file saved successfully.")

In [None]:
# Prefix-tuning runs move the model to the GPU explicitly.
if adapter_name == "prefix-tuning":
    model.to('cuda')

# A path ending in ".json" is read with the generic "json" loader; anything
# else is passed to load_dataset unchanged.  (todo: support jsonl)
data = (
    load_dataset("json", data_files=data_path)
    if data_path.endswith(".json")
    else load_dataset(data_path)
)

if resume_from_checkpoint:
    # Look for a full checkpoint first, then fall back to adapter-only weights.
    checkpoint_name = os.path.join(resume_from_checkpoint, "pytorch_model.bin")
    if not os.path.exists(checkpoint_name):
        # Only adapter weights may exist - the adapter config has to match them.
        checkpoint_name = os.path.join(resume_from_checkpoint, "adapter_model.bin")
        # Cleared so the trainer won't try loading its own state from here.
        resume_from_checkpoint = False

    # The two files are named differently depending on how they were saved,
    # but are loaded the same way.
    if not os.path.exists(checkpoint_name):
        print(f"Checkpoint {checkpoint_name} not found")
    else:
        print(f"Restarting from {checkpoint_name}")
        adapters_weights = torch.load(checkpoint_name)
        model = set_peft_model_state_dict(model, adapters_weights)
# Qwen2-specific changes: start
def process_func(example):
    """Tokenize one record into causal-LM training features.

    The prompt is built from the system/user/assistant template in the
    f-string below and tokenized separately from the response. A single
    end-of-sequence id is appended after the response. Prompt positions are
    labelled -100; response positions and the end token keep their ids as
    labels. All three lists are truncated to `cutoff_len`.

    Reads the module-level `tokenizer` and `cutoff_len`.
    NOTE(review): `train_on_inputs` is not consulted here - the prompt is
    always masked.

    Args:
        example: mapping with "instruction", "input" and "output" entries;
            "instruction" and "input" are concatenated to form the user turn.

    Returns:
        dict with "input_ids", "attention_mask" and "labels", three lists of
        equal length no longer than `cutoff_len`.

    Raises:
        ValueError: if the tokenizer defines neither a pad nor an EOS token id.
    """
    instruction = tokenizer(f"<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n{example['instruction'] + example['input']}<|im_end|>\n<|im_start|>assistant\n", add_special_tokens=False)
    response = tokenizer(f"{example['output']}", add_special_tokens=False)

    # The pad id doubles as the end-of-sequence marker. Fall back to the EOS id
    # when the tokenizer has no pad token, instead of appending None.
    end_id = tokenizer.pad_token_id
    if end_id is None:
        end_id = getattr(tokenizer, "eos_token_id", None)
    if end_id is None:
        raise ValueError("tokenizer defines neither pad_token_id nor eos_token_id")

    input_ids = instruction["input_ids"] + response["input_ids"] + [end_id]
    # The end token must be attended to as well, hence the trailing 1.
    attention_mask = instruction["attention_mask"] + response["attention_mask"] + [1]
    labels = [-100] * len(instruction["input_ids"]) + response["input_ids"] + [end_id]

    # Slicing is a no-op for sequences already within the limit.
    return {
        "input_ids": input_ids[:cutoff_len],
        "attention_mask": attention_mask[:cutoff_len],
        "labels": labels[:cutoff_len],
    }

# Build the tokenized training set and, when val_set_size > 0, a held-out
# validation set carved out of the "train" split.
if val_set_size > 0:
    train_val = data["train"].train_test_split(
        test_size=val_set_size, shuffle=True, seed=42
    )
    # Drop the raw columns of the split being mapped. `data` itself is only
    # indexed by "train" above, so looking up data['test'] here raised KeyError
    # when a single JSON file was loaded.
    train_data = train_val["train"].shuffle().map(
        process_func, remove_columns=train_val["train"].column_names
    )
    val_data = train_val["test"].shuffle().map(
        process_func, remove_columns=train_val["test"].column_names
    )
else:
    train_data = data['train'].shuffle().map(process_func,remove_columns=data['train'].column_names)
    val_data = None
# Qwen2-specific changes: end

# Base optimizer and LR schedule. The configured `learning_rate` and
# `weight_decay` are passed explicitly: the optimizer is later handed to the
# Trainer directly, so a hard-coded lr and AdamW's default weight decay would
# silently override the values set in the configuration cell.
original_optimizer = torch.optim.AdamW(
    model.parameters(), lr=learning_rate, weight_decay=weight_decay
)
scheduler = torch.optim.lr_scheduler.ExponentialLR(original_optimizer, gamma=0.95)

# BAdam block-wise wrapper around the base optimizer.
optimizer = BlockOptimizer(
    base_optimizer=original_optimizer,
    named_parameters_list=list(model.named_parameters()),
    switch_block_every=1,
    switch_mode="descending",
    verbose=2,  # information level, will print trainable parameters when setting to 2
)
# Evaluation-related options are enabled only when a validation split exists.
use_eval = val_set_size > 0

training_args = transformers.TrainingArguments(
    per_device_train_batch_size=micro_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    warmup_steps=20,
    num_train_epochs=num_epochs,
    max_steps=108,
    learning_rate=learning_rate,
    weight_decay=weight_decay,
    # fp16=True,
    logging_steps=20,
    # checking whether this is useful
    logging_dir=output_dir,
    optim="adamw_torch",
    evaluation_strategy="steps" if use_eval else "no",
    eval_steps=eval_step if use_eval else None,
    save_strategy="steps",
    save_steps=save_step,
    output_dir=output_dir,
    save_total_limit=3,
    load_best_model_at_end=use_eval,
    ddp_find_unused_parameters=False if ddp else None,
    group_by_length=group_by_length,
    report_to="wandb" if use_wandb else None,
    run_name=wandb_run_name if use_wandb else None,
)

# Collator that pads each batch and returns PyTorch tensors.
batch_collator = transformers.DataCollatorForSeq2Seq(
    tokenizer, pad_to_multiple_of=8, return_tensors="pt", padding=True
)

# The block optimizer and scheduler built earlier are supplied explicitly.
trainer = transformers.Trainer(
    model=model,
    train_dataset=train_data,
    eval_dataset=val_data,
    args=training_args,
    data_collator=batch_collator,
    optimizers=(optimizer, scheduler),
)
# The callback class itself (not an instance) is registered.
trainer.add_callback(SkipStepCallback)

# Disable the KV cache for training.
model.config.use_cache = False

# Compare the major version numerically; the previous string comparison
# (`torch.__version__ >= "2"`) is lexicographic and would misjudge e.g. "10.0".
# NOTE(review): `trainer` was created before `model` is rebound here, so it may
# still hold the original module - confirm whether compilation is meant to
# affect training.
if int(torch.__version__.split(".")[0]) >= 2 and sys.platform != "win32":
    model = torch.compile(model)

train_result = trainer.train(resume_from_checkpoint=resume_from_checkpoint)

# Attach the training-set size to the reported metrics.
metrics = train_result.metrics
metrics["train_samples"] = len(train_data)
trainer.log_metrics("train", metrics)




In [7]:
# import pickle
# with open('grad_dict.pkl', 'rb') as f:
#     loaded_grad_dict = pickle.load(f)

In [8]:
# import numpy as np
# import torch
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# grad_skip_kwd_list = ['head', 'embed_tokens', 'norm', 'bias']  # Fully tune head and class token, freeze patch_embed,
#                                                          # we find pos_embed can be either fully ft or unstructured ft, doesn't make much difference
# grad_matrix_kwd_list = ['.q.', '.k.', '.v.', 'proj', 'fc']  # Might structurally tune the matrices: q, k, v, proj, fc1, and fc2
# grad_vector_kwd_list = ['norm', 'bias']  # Might structurly tune the vectors
# grad_dict = loaded_grad_dict
# grad_shapes = {}
# grad_shapes_int = {}
# for key in grad_dict.keys():
#         if not any(kwd in key for kwd in grad_skip_kwd_list):
#             # print('yes')
#             print(key)
#             grad_tensor = grad_dict[key].to(device)
#             sumvalue = grad_tensor.sum().item()
#             grad_shapes[key] = sumvalue
# #             grad_shapes_int[key] = np.cumprod(list(grad_dict[key].shape))[-1]

# # large_tensor = torch.cat([grad_dict[key].flatten() for key in grad_shapes.keys()])

In [9]:
# # grad_shapes
# # # 假设 self.grad_dict 是一个字典，其中每个值都是一个浮点数
# # top_20 = dict(sorted(grad_shapes.items(), key=lambda item: item[1], reverse=True)[:20])

# # # top_20 现在包含前 20 个最大值的项
# # print(top_20)
# threshold = 0.5
# filtered_items = {k: v for k, v in grad_shapes.items() if v > threshold}
# print(f"Items with values greater than {threshold}:")
# print(len(filtered_items))


In [10]:
# for key in filtered_items:
#     filtered_items[key] = 1
# filtered_items

In [11]:
# import re
# def infer_param_groups(param_names, include_embedding = False, include_lm_head = False):
#         """automatic inference of the parameter groups based on the parameter names.
#         divide groups into:
#             * embedding
#             * transformer layers
#             * lm_head and others
#         """
        
#         block_prefix_list = []
#         lm_head_and_other_params = []
#         embed_pattern = r'.*embed[^.]*\.'
#         layer_pattern = r'.*layers.[^.]*\.'

#         for name in param_names:
#             print(name)
#             if any(prefix[0] in name for prefix in block_prefix_list):
#                 continue
            
#             if re.findall(layer_pattern, name):
#                 block_prefix_list.append(re.findall(layer_pattern, name))
#             elif re.findall(embed_pattern, name) and include_embedding:
#                 block_prefix_list.append(re.findall(embed_pattern, name))
#             else:
#                 lm_head_and_other_params.append(name)
        
#         if include_lm_head:
#             block_prefix_list.append(lm_head_and_other_params)
        
#         return block_prefix_list

# test_list =  infer_param_groups([n for n,_ in list(model.named_parameters())])
        


In [None]:
# for n,_ in list(model.named_parameters()):
#     if 'lora' in n:
#         print(n)

# a = [1,2,3,4]
# a.end