In [1]:
import transformers
from transformers import AutoTokenizer, AutoModel, AutoConfig, DataCollatorForSeq2Seq
import pandas as pd
import numpy as np
import datasets 
import torch
from torch import nn 
from torch.utils.data import Dataset,DataLoader
from argparse import Namespace
from peft import get_peft_model, AdaLoraConfig, TaskType
from torchkeras import KerasModel 
from accelerate import Accelerator 

In [2]:
# 配置参数
cfg = Namespace()

#dataset
cfg.prompt_column = 'prompt'
cfg.response_column = 'response'
cfg.history_column = None
cfg.source_prefix = '' #添加到每个prompt开头的前缀引导语

cfg.max_source_length = 128 
cfg.max_target_length = 128

#model
cfg.model_name_or_path = 'THUDM/chatglm2-6b-int4'
cfg.quantization_bit = None #仅仅预测时可以选 4 or 8 


#train
cfg.epochs = 100 
cfg.lr = 5e-3
cfg.batch_size = 1
cfg.gradient_accumulation_steps = 16 #梯度累积

In [3]:
config = AutoConfig.from_pretrained(cfg.model_name_or_path,
                                    cache_dir='./chatglm2_6bint4',
                                    trust_remote_code=True)

tokenizer = AutoTokenizer.from_pretrained(cfg.model_name_or_path,
                                          cache_dir='./chatglm2_6bint4',
                                          trust_remote_code=True)

model = AutoModel.from_pretrained(cfg.model_name_or_path,config=config,
                                  cache_dir='./chatglm2_6bint4',
                                  trust_remote_code=True).half() 

#先量化瘦身
if cfg.quantization_bit is not None:
    print(f"Quantized to {cfg.quantization_bit} bit")
    model = model.quantize(cfg.quantization_bit)
    
#再移动到GPU上
model = model.cuda()

Compile parallel cpu kernel gcc -O3 -fPIC -pthread -fopenmp -std=c99 C:\Users\17132\.cache\huggingface\modules\transformers_modules\THUDM\chatglm2-6b-int4\66ecaf1db3a5085714e133357ea4824b69698743\quantization_kernels_parallel.c -shared -o C:\Users\17132\.cache\huggingface\modules\transformers_modules\THUDM\chatglm2-6b-int4\66ecaf1db3a5085714e133357ea4824b69698743\quantization_kernels_parallel.so failed.
Compile cpu kernel gcc -O3 -fPIC -std=c99 C:\Users\17132\.cache\huggingface\modules\transformers_modules\THUDM\chatglm2-6b-int4\66ecaf1db3a5085714e133357ea4824b69698743\quantization_kernels.c -shared -o C:\Users\17132\.cache\huggingface\modules\transformers_modules\THUDM\chatglm2-6b-int4\66ecaf1db3a5085714e133357ea4824b69698743\quantization_kernels.so failed.


In [4]:
keyword = '创造者'

description = '''我是由“医”路畅通科普团队制作的健康科普小助手ExploreMedAI，您可以叫我小e。团队成员郑书栋主要负责我的训练任务。
                    这一项目由“久脉”健康科普基地赞助，使用清华大学开源的ChatGLM2-6B作为底模，在上海交通大学“思源”一号超算平台上训练完成。
                    作为一个轻量化的健康科普大模型，我可以实现本地部署，离线使用。后续团队成员会开发出更多的版本来满足各类用户需求，请多支持。'''

#对prompt使用一些简单的数据增强的方法，以便更好地收敛。
def get_prompt_list(keyword):
    return [f'{keyword}', 
            f'小e，你的{keyword}是谁?',
            f'你的{keyword}是谁？',
            f'介绍一下你的{keyword}。',
            f'介绍一下你的{keyword}',
            f'小e，介绍一下你的{keyword}。',
            f'谁是你的{keyword}？',
            f'小e，谁是你的{keyword}？',
            f'是谁创造了你？',
            f'小e，是谁创造了你？',
            f'是谁赋予了你生命？',
            f'小e，是谁赋予了你生命？',
            f'小e，谁将你创造出来的？',
            f'谁将你创造出来的？',
            f'小e，你的制作团队是谁？',
            f'你的制作团队是谁？',
           ]

data =[{'prompt':x,'response':description} for x in get_prompt_list(keyword) ]
dfdata = pd.DataFrame(data)
display(dfdata) 

Unnamed: 0,prompt,response
0,创造者,我是由“医”路畅通科普团队制作的健康科普小助手ExploreMedAI，您可以叫我小e。团队...
1,小e，你的创造者是谁?,我是由“医”路畅通科普团队制作的健康科普小助手ExploreMedAI，您可以叫我小e。团队...
2,你的创造者是谁？,我是由“医”路畅通科普团队制作的健康科普小助手ExploreMedAI，您可以叫我小e。团队...
3,介绍一下你的创造者。,我是由“医”路畅通科普团队制作的健康科普小助手ExploreMedAI，您可以叫我小e。团队...
4,介绍一下你的创造者,我是由“医”路畅通科普团队制作的健康科普小助手ExploreMedAI，您可以叫我小e。团队...
5,小e，介绍一下你的创造者。,我是由“医”路畅通科普团队制作的健康科普小助手ExploreMedAI，您可以叫我小e。团队...
6,谁是你的创造者？,我是由“医”路畅通科普团队制作的健康科普小助手ExploreMedAI，您可以叫我小e。团队...
7,小e，谁是你的创造者？,我是由“医”路畅通科普团队制作的健康科普小助手ExploreMedAI，您可以叫我小e。团队...
8,是谁创造了你？,我是由“医”路畅通科普团队制作的健康科普小助手ExploreMedAI，您可以叫我小e。团队...
9,小e，是谁创造了你？,我是由“医”路畅通科普团队制作的健康科普小助手ExploreMedAI，您可以叫我小e。团队...


In [5]:
#训练集和验证集一样
ds_train_raw = ds_val_raw = datasets.Dataset.from_pandas(dfdata)

In [6]:
#这是支持 history列处理，并且按照batch预处理数据的方法。
def preprocess(examples):
    max_seq_length = cfg.max_source_length + cfg.max_target_length
    model_inputs = {
        "input_ids": [],
        "labels": [],
    }
    for i in range(len(examples[cfg.prompt_column])):
        if examples[cfg.prompt_column][i] and examples[cfg.response_column][i]:
            query, answer = examples[cfg.prompt_column][i], examples[cfg.response_column][i]

            history = examples[cfg.history_column][i] if cfg.history_column is not None else None
            prompt = tokenizer.build_prompt(query, history)

            prompt = cfg.source_prefix + prompt
            a_ids = tokenizer.encode(text=prompt, add_special_tokens=True, truncation=True,
                                     max_length=cfg.max_source_length)
            b_ids = tokenizer.encode(text=answer, add_special_tokens=False, truncation=True,
                                     max_length=cfg.max_target_length)

            context_length = len(a_ids)
            input_ids = a_ids + b_ids + [tokenizer.eos_token_id]
            labels = [tokenizer.pad_token_id] * context_length + b_ids + [tokenizer.eos_token_id]

            pad_len = max_seq_length - len(input_ids)
            input_ids = input_ids + [tokenizer.pad_token_id] * pad_len
            labels = labels + [tokenizer.pad_token_id] * pad_len
            labels = [(l if l != tokenizer.pad_token_id else -100) for l in labels]
            model_inputs["input_ids"].append(input_ids)
            model_inputs["labels"].append(labels)
    return model_inputs

In [7]:
ds_train = ds_train_raw.map(
    preprocess,
    batched=True,
    num_proc=4,
    remove_columns=ds_train_raw.column_names
)

ds_val = ds_val_raw.map(
    preprocess,
    batched=True,
    num_proc=4,
    remove_columns=ds_val_raw.column_names
)

Map (num_proc=4):   0%|          | 0/16 [00:00<?, ? examples/s]

NameError: name 'cfg' is not defined

In [None]:
data_collator = DataCollatorForSeq2Seq(
    tokenizer,
    model=None,
    label_pad_token_id=-100,
    pad_to_multiple_of=None,
    padding=False
)

dl_train = DataLoader(ds_train,batch_size = cfg.batch_size,
                      num_workers = 2, shuffle = True, collate_fn = data_collator 
                     )
dl_val = DataLoader(ds_val,batch_size = cfg.batch_size,
                      num_workers = 2, shuffle = False, collate_fn = data_collator 
                     )

In [None]:
#训练时节约GPU占用
model.config.use_cache=False
model.supports_gradient_checkpointing = True  #
model.gradient_checkpointing_enable()
model.enable_input_require_grads()

peft_config = AdaLoraConfig(
    task_type=TaskType.CAUSAL_LM, inference_mode=False,
    r=8,
    lora_alpha=32, lora_dropout=0.1,
    target_modules=["query", "value"]
)

peft_model = get_peft_model(model, peft_config)

peft_model.is_parallelizable = True
peft_model.model_parallel = True
peft_model.print_trainable_parameters()

In [None]:
for name,para in peft_model.named_parameters():
    if '.2.' in name:
        break 
    if 'lora' in name.lower():
        print(name+':')
        print('shape = ',list(para.shape),'\t','sum = ',para.sum().item())
        print('\n')

In [None]:
class StepRunner:
    def __init__(self, net, loss_fn, accelerator=None, stage = "train", metrics_dict = None, 
                 optimizer = None, lr_scheduler = None
                 ):
        self.net,self.loss_fn,self.metrics_dict,self.stage = net,loss_fn,metrics_dict,stage
        self.optimizer,self.lr_scheduler = optimizer,lr_scheduler
        self.accelerator = accelerator if accelerator is not None else Accelerator() 
        if self.stage=='train':
            self.net.train() 
        else:
            self.net.eval()
    
    def __call__(self, batch):
        
        #loss
        with self.accelerator.autocast():
            loss = self.net(input_ids=batch["input_ids"],labels=batch["labels"]).loss

        #backward()
        if self.optimizer is not None and self.stage=="train":
            self.accelerator.backward(loss)
            if self.accelerator.sync_gradients:
                self.accelerator.clip_grad_norm_(self.net.parameters(), 1.0)
            self.optimizer.step()
            if self.lr_scheduler is not None:
                self.lr_scheduler.step()
            self.optimizer.zero_grad()
            
        all_loss = self.accelerator.gather(loss).sum()
        
        #losses (or plain metrics that can be averaged)
        step_losses = {self.stage+"_loss":all_loss.item()}
        
        #metrics (stateful metrics)
        step_metrics = {}
        
        if self.stage=="train":
            if self.optimizer is not None:
                step_metrics['lr'] = self.optimizer.state_dict()['param_groups'][0]['lr']
            else:
                step_metrics['lr'] = 0.0
        return step_losses,step_metrics
    
KerasModel.StepRunner = StepRunner 


#仅仅保存lora可训练参数
def save_ckpt(self, ckpt_path='checkpoint', accelerator = None):
    unwrap_net = accelerator.unwrap_model(self.net)
    unwrap_net.save_pretrained(ckpt_path)
    
def load_ckpt(self, ckpt_path='checkpoint'):
    import os
    self.net.load_state_dict(
        torch.load(os.path.join(ckpt_path,'adapter_model.bin')),strict =False)
    self.from_scratch = False
    
KerasModel.save_ckpt = save_ckpt 
KerasModel.load_ckpt = load_ckpt

In [None]:
optimizer = torch.optim.AdamW(peft_model.parameters(),lr=cfg.lr) 
keras_model = KerasModel(peft_model,loss_fn = None,
        optimizer=optimizer) 
ckpt_path = './single_chatglm2'

In [None]:
keras_model.fit(train_data = dl_train,
                val_data = dl_val,
                epochs=100,
                patience=20,
                monitor='val_loss',
                mode='min',
                ckpt_path = ckpt_path,
                mixed_precision='fp16',
                gradient_accumulation_steps = cfg.gradient_accumulation_steps
               )