## 前置动作
1. 克隆`https://github.com/unit-mesh/unit-minions`项目
1. 将`userstory_detail_double_clean_cn.jsonl`数据集放到`input0`，或者修改下面的数据集路径
1. 将`ChatGLM-6b`的模型文件准备到`input1`，或者修改下面模型引用的路径；也可以直接将模型路径指定为`THUDM/chatglm-6b`
1. 如果不是用的`openbayes`平台，记得修改下方的模型输出路径，并提前创建好路径

In [None]:
# Fetch the ChatGLM-Tuning repository; the scripts run in the following cells
# are invoked from inside this checkout.
!git clone https://github.com/mymusise/ChatGLM-Tuning.git

In [5]:
# Switch into the cloned repository and install the packages listed in its
# requirements file.
# NOTE(review): the bare `pip` line relies on IPython automagic — confirm it is
# enabled in the target environment.
%cd ChatGLM-Tuning
pip install -r requirements.txt 

In [3]:
!python cover_alpaca2jsonl.py \
    --data_path /openbayes/input/input0/userstory_detail_double_clean_cn.jsonl \
    --save_path /output/dataset/userstory_detail_double_clean_cn_alpaca.jsonl

In [4]:
# Run the tokenization script on the converted JSONL. The arguments give the
# input file, the output directory, the maximum sequence length (380) and the
# model path (presumably used by the script to load the tokenizer — confirm).
!python tokenize_dataset_rows.py \
    --jsonl_path /openbayes/home/dataset/userstory_detail_double_clean_cn_alpaca.jsonl \
    --save_path /openbayes/home/dataset \
    --max_seq_length 380 \
    --model_name /openbayes/input/input1

In [5]:
import sys

# Add the parent of the current working directory to the module search path.
sys.path.append("../")

In [6]:
from transformers import AutoTokenizer, AutoModel, TrainingArguments, AutoConfig
import torch
import torch.nn as nn
from peft import get_peft_model, LoraConfig, TaskType


class CastOutputToFloat(nn.Sequential):
    """Sequential container whose output is converted to ``torch.float32``."""

    def forward(self, x):
        """Run the wrapped modules on ``x`` and return the result as float32."""
        result = super().forward(x)
        return result.to(torch.float32)

# Load the base model from the local path with 8-bit loading enabled, allowing
# the checkpoint's own model code (trust_remote_code) and automatic device
# placement (device_map='auto').
model = AutoModel.from_pretrained("/openbayes/input/input1", load_in_8bit=True, trust_remote_code=True, device_map='auto')
# Mark the model as supporting gradient checkpointing, then switch it on.
model.supports_gradient_checkpointing = True
model.gradient_checkpointing_enable()
model.enable_input_require_grads()
# Wrap the LM head so that its output is cast to float32 (see CastOutputToFloat).
model.lm_head = CastOutputToFloat(model.lm_head)
model.config.use_cache = False  # silence the warnings. Please re-enable for inference!

# Flag the model as parallelizable / model-parallel.
# NOTE(review): presumably this stops Trainer from wrapping it in DataParallel — confirm.
model.is_parallelizable = True
model.model_parallel = True


In [7]:
# Load the tokenizer from the same local path as the model.
tokenizer = AutoTokenizer.from_pretrained("/openbayes/input/input1", trust_remote_code=True)

## Test before fine-tuning

In [8]:
from cover_alpaca2jsonl import format_example
import json


# Read the raw dataset lines and keep the first two as smoke-test samples.
with open("/openbayes/input/input0/userstory_detail_double_clean_cn.jsonl", encoding="utf-8") as f:
    examples = list(f)
    examplesSample = examples[:2]

# instructions = json.load(open("data/alpaca_data.json"))


# Generate an answer for each sample with gradient tracking disabled, then
# print it followed by the reference 'output' stored in the record.
with torch.no_grad():
    for idx, item in enumerate(examplesSample):
        # Each sample is one line of the JSONL file, i.e. one JSON record.
        item = json.loads(item)
        feature = format_example(item)
        # The prompt text is taken from the "context" field built by format_example.
        input_text = feature["context"]
        input_ids = tokenizer.encode(input_text, return_tensors='pt')
        out = model.generate(
            input_ids=input_ids,
            max_length=150,
            temperature=0
        )
        answer = tokenizer.decode(out[0])
        print(answer)
        # Keep the generated text on the record alongside the reference output.
        item['infer_answer'] = answer
        print(f"### {idx+1}.Answer:\n", item.get('output'), '\n\n')

In [9]:
# LoRA configuration for a causal-LM task in training mode
# (inference_mode=False): rank 8, alpha 32, dropout 0.1.
peft_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM, inference_mode=False,
    r=8,
    lora_alpha=32, lora_dropout=0.1,
)

# Wrap the base model with the adapter described by peft_config.
model = get_peft_model(model, peft_config)
# Set the parallelism flags again on the object returned by get_peft_model.
model.is_parallelizable = True
model.model_parallel = True

In [10]:
import datasets

# Directory from which the on-disk dataset is loaded.
dataset_path = "/openbayes/home/dataset"

dataset = datasets.load_from_disk(dataset_path)

# train_num = 2000

# mini_train_dataset = datasets.Dataset.from_dict(dataset[:train_num])
# The full dataset is used for training; the commented-out lines above would
# restrict it to the first train_num rows instead.
mini_train_dataset = dataset

In [11]:
from transformers import Trainer, HfArgumentParser
import os


def data_collator(features: list) -> dict:
    """Collate tokenized samples into padded ``input_ids``/``labels`` tensors.

    Each feature must provide ``"input_ids"`` (a list of token ids) and
    ``"seq_len"``. Samples are ordered from longest to shortest and padded to
    the longest sample in the batch.

    In the labels, the first ``seq_len - 1`` positions and all padding
    positions are set to -100 (conventionally the index ignored by the loss —
    confirm against the model); the remaining positions copy the token ids.
    The ids themselves are padded with ``tokenizer.pad_token_id``.

    Returns:
        A dict with two stacked ``LongTensor`` entries, ``"input_ids"`` and
        ``"labels"``.
    """
    lengths = [len(sample["input_ids"]) for sample in features]
    target_len = max(lengths)
    # Longest first; negating the key keeps ties in their original order.
    ordered = sorted(zip(lengths, features), key=lambda pair: -pair[0])

    id_rows = []
    label_rows = []
    for length, sample in ordered:
        tokens = sample["input_ids"]
        prompt_end = sample["seq_len"] - 1
        pad = target_len - length
        label_rows.append(
            torch.LongTensor([-100] * prompt_end + tokens[prompt_end:] + [-100] * pad)
        )
        id_rows.append(torch.LongTensor(tokens + [tokenizer.pad_token_id] * pad))

    return {
        "input_ids": torch.stack(id_rows),
        "labels": torch.stack(label_rows),
    }

class ModifiedTrainer(Trainer):
    """Trainer that feeds only ids/labels to the model and saves only trainable weights."""

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        """Compute the loss for one batch.

        Args:
            model: The model to call.
            inputs: Batch dict containing ``"input_ids"`` and ``"labels"``.
            return_outputs: When true, return ``(loss, outputs)`` instead of
                only the loss, as the base ``Trainer`` expects from callers
                that unpack both values.
            **kwargs: Extra keyword arguments passed by newer ``Trainer``
                versions (e.g. ``num_items_in_batch``); accepted and ignored.

        Returns:
            The loss, or a ``(loss, outputs)`` tuple when ``return_outputs``
            is true.
        """
        outputs = model(
            input_ids=inputs["input_ids"],
            labels=inputs["labels"],
        )
        loss = outputs.loss
        return (loss, outputs) if return_outputs else loss

    def save_model(self, output_dir=None, _internal_call=False):
        """Save the training arguments and the trainable parameters only.

        Args:
            output_dir: Target directory; falls back to
                ``self.args.output_dir`` when omitted.
            _internal_call: Accepted for signature compatibility; unused.
        """
        from transformers.trainer import TRAINING_ARGS_NAME

        # Without this fallback, os.makedirs(None) raises a TypeError.
        if output_dir is None:
            output_dir = self.args.output_dir
        os.makedirs(output_dir, exist_ok=True)
        torch.save(self.args, os.path.join(output_dir, TRAINING_ARGS_NAME))
        # Only parameters that require gradients are stored, moved to the CPU.
        saved_params = {
            k: v.to("cpu") for k, v in self.model.named_parameters() if v.requires_grad
        }
        torch.save(saved_params, os.path.join(output_dir, "adapter_model.bin"))

In [12]:
# Training configuration. The first positional argument ("output") is the
# directory passed to TrainingArguments for its outputs.
training_args = TrainingArguments(
    "output",
    fp16 =True,
    gradient_accumulation_steps=1,
    per_device_train_batch_size = 8,
    learning_rate = 1e-4,
    max_steps=3000,
    save_steps=1000,
    logging_steps=100,
    # data_collator reads the "seq_len" column; presumably this keeps Trainer
    # from dropping it — confirm.
    remove_unused_columns=False,
    seed=0,
    data_seed=0,
    group_by_length=False,
)


# Build the custom trainer with the LoRA-wrapped model, the dataset and the
# padding collator, then start training.
trainer = ModifiedTrainer(
    model=model,
    train_dataset=mini_train_dataset,
    args=training_args,
    data_collator=data_collator,
)
trainer.train()

## Test after fine-tuning

In [13]:
from cover_alpaca2jsonl import format_example
import json


# After trainer.train() the model is still in training mode, so switch to eval
# mode before generating; otherwise the LoRA dropout (0.1) stays active and
# perturbs the generated answers.
# NOTE: model.config.use_cache was disabled for training; re-enabling it would
# speed up generation.
model.eval()

# Re-read the first two raw records so the answers can be compared with the
# pre-finetune run on the same prompts.
with open("/openbayes/input/input0/userstory_detail_double_clean_cn.jsonl", encoding="utf-8") as f:
    examples = list(f)
    examplesSample = examples[:2]

# Generate an answer for each sample with gradient tracking disabled, then
# print it followed by the reference 'output' stored in the record.
with torch.no_grad():
    for idx, item in enumerate(examplesSample):
        item = json.loads(item)
        feature = format_example(item)
        input_text = feature["context"]
        input_ids = tokenizer.encode(input_text, return_tensors='pt')
        out = model.generate(
            input_ids=input_ids,
            max_length=150,
            temperature=0
        )
        answer = tokenizer.decode(out[0])
        print(answer)
        item['infer_answer'] = answer
        print(f"### {idx+1}.Answer:\n", item.get('output'), '\n\n')

In [14]:
import os

# Save the model via save_pretrained into the LoRA output directory.
model.save_pretrained("/output/models/lora/chatglm/userstory_detail")

def save_tunable_parameters(model, path):
    """Save every parameter of ``model`` that requires gradients to ``path``.

    The parameters are collected into a dict keyed by parameter name, moved
    to the CPU, and written with ``torch.save``.
    """
    tunable = {}
    for name, param in model.named_parameters():
        if param.requires_grad:
            tunable[name] = param.to("cpu")
    torch.save(tunable, path)
# Also write the trainable parameters as a standalone .pt file in the same
# output directory.
save_tunable_parameters(model, os.path.join("/output/models/lora/chatglm/userstory_detail", "chatglm-lora-userstory_detail.pt"))

In [2]:
# Move up one directory and install the requirements of the chatglm-web app
# from the unit-minions checkout.
%cd ..
pip install -r unit-minions/apps/userstory/chatglm/chatglm-web/requirements.txt

In [None]:
# Launch the chatglm-web app script from the unit-minions checkout.
!python unit-minions/apps/userstory/chatglm/chatglm-web/app.py