# 基于ChatYuan模型微调示例


## 安装需要的python包

In [None]:
# 安装需要的包
!pip install sentencepiece
!pip install transformers
!pip install torch
!pip install rich[jupyter]
!pip install datasets
!pip install pylcs
!pip install Rouge

## 查看GPU的状态信息

In [None]:
# 查看GPU的信息
!nvidia-smi

Tue Aug 22 13:49:32 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.105.17   Driver Version: 525.105.17   CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla V100-SXM2...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   37C    P0    24W / 300W |      0MiB / 16384MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

## 挂载gdrive

In [None]:
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


## 数据集预处理

### 处理belle数据

In [None]:
!cp /content/gdrive/MyDrive/big_models_learn/chatyuan_finetune/data/belle_1k.csv .

In [None]:
!head belle_1k.csv

input,target
"用户：针对健身房的新手，设计一套适合他们的健身器械使用指南，包括安全应用、正确姿势等方面。
小元：","健身器械使用指南
1. 开始前，请先进行热身运动。这会帮助你的身体适应运动，并减少受伤的风险。
2. 在使用健身器械前，确保你已经了解了其使用方法。请阅读说明书或咨询教练以获得正确的使用技巧。
3. 谨防过度使用或过度挑战你的身体。 如果你觉得有些动作太难或太重，请添加锻炼计划，以逐步提高动作难度。
4. 使用合适的装备。 确保你拥有合适的运动鞋和舒适的运动服。 不要在裸露的脚或短裤上进行重量训练。
5. 在健身器械上使用安全装置。 这些通常用于保护你的身体免受不当操作造成的损伤。 例如，重量训练中，你需要使用杠铃和负重时，一定要使用卡子来防止重量滑落。
6. 注意正确的姿势。 如果你的姿势是错误的，那么你的身体很容易被伤害到，你也可能无法获得最佳的锻炼效果。 至关重要的是，保持直立的身体，保持头部和颈部的稳定，并使用合适的重量。
7. 保持合理的呼吸方式。 无论何时进行训练，都必须保持正常呼吸。 当你需要用力时，呼气； 当你放松时，吸气。


## 导入相关的python包

In [None]:
import os,json
import numpy as np
import pandas as pd
import torch
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
import time
from transformers import T5Tokenizer, T5ForConditionalGeneration

from rich.table import Column, Table
from rich import box
from rich.console import Console

## 配置显示logger


In [None]:
# Notebook-wide configuration (console display; GPU setup).
# Shared rich console; record=True keeps what is printed through it so the
# trainer can export it later with console.save_text().
console = Console(record=True)

# Preview helper: dump a two-column dataframe as an ASCII table.
def display_df(df):
    """Print the first two columns of ``df`` as a centered ASCII table.

    Each dataframe row becomes one table row; column 0 is shown under
    ``source_text`` and column 1 under ``target_text``. The table is written
    to a fresh, local rich console rather than the shared recording console.

    Args:
        df: dataframe whose first two columns hold the source and target text.
    """
    headers = ("source_text", "target_text")
    preview = Table(
        *(Column(header, justify="center") for header in headers),
        title="Sample Data",
        pad_edge=False,
        box=box.ASCII,
    )

    for record in df.values.tolist():
        source_cell, target_cell = record[0], record[1]
        preview.add_row(source_cell, target_cell)

    Console().print(preview)

# Table that collects one row per logged training step (epoch, step, loss);
# the training loop appends to it and re-prints it as training progresses.
training_logger = Table(
    *(Column(heading, justify="center") for heading in ("Epoch", "Steps", "Loss")),
    title="Training Status",
    box=box.ASCII,
    pad_edge=False,
)

# Select the compute device once: use the GPU when CUDA is available,
# otherwise fall back to the CPU.
from torch import cuda
if cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

## 数据读取

In [None]:
class YourDataSetClass(Dataset):
    """
    Map-style dataset that turns (source, target) text pairs into fixed-length
    token-id tensors for training.

    The dataframe must contain two text columns: the model input
    (e.g. ``source_text``) and the expected output (e.g. ``target_text``).
    """

    def __init__(
        self, dataframe, tokenizer, source_len, target_len, source_text, target_text
    ):
        """
        Initializes a Dataset class

        Args:
            dataframe (pandas.DataFrame): Input dataframe
            tokenizer (transformers.tokenizer): Transformers tokenizer
            source_len (int): Max length of source text
            target_len (int): Max length of target text
            source_text (str): column name of source text
            target_text (str): column name of target text
        """
        self.tokenizer = tokenizer
        self.data = dataframe
        self.source_len = source_len
        self.summ_len = target_len
        self.target_text = self.data[target_text]
        self.source_text = self.data[source_text]

    def __len__(self):
        """Return the number of (source, target) pairs."""
        return len(self.target_text)

    def __preprocess__(self, text):
        """Escape newlines and tabs as literal backslash sequences and encode
        each run of two spaces as ``%20`` so the text stays on one line."""
        text = text.replace("\n", "\\n").replace("\t", "\\t").replace("  ", "%20")
        return text

    def _encode(self, text, max_length):
        """Tokenize one string, truncated/padded to exactly ``max_length``.

        Returns:
            tuple: ``(input_ids, attention_mask)``, each with the batch
            dimension removed.
        """
        encoded = self.tokenizer.batch_encode_plus(
            [text],
            max_length=max_length,
            truncation=True,
            # padding="max_length" alone is sufficient; the legacy
            # pad_to_max_length flag was redundant next to it.
            padding="max_length",
            return_tensors="pt",
        )
        # squeeze(0) drops only the batch dimension, so a sequence of length 1
        # keeps its sequence axis (a bare squeeze() would collapse it to 0-d).
        return encoded["input_ids"].squeeze(0), encoded["attention_mask"].squeeze(0)

    def __getitem__(self, index):
        """
        Return the tensors for the sample at position ``index``.

        Returns:
            dict: ``source_ids`` and ``source_mask`` for the encoder input,
            plus ``target_ids`` and ``target_ids_y`` (the same target token
            ids under two keys), all as ``torch.long`` tensors.
        """
        # Positional lookup: DataLoader samplers yield positions 0..len-1, which
        # only match the dataframe's labels when its index has been reset.
        source_text = self.__preprocess__(str(self.source_text.iloc[index]))
        target_text = self.__preprocess__(str(self.target_text.iloc[index]))

        source_ids, source_mask = self._encode(source_text, self.source_len)
        target_ids, _ = self._encode(target_text, self.summ_len)

        return {
            "source_ids": source_ids.to(dtype=torch.long),
            "source_mask": source_mask.to(dtype=torch.long),
            "target_ids": target_ids.to(dtype=torch.long),
            "target_ids_y": target_ids.to(dtype=torch.long),
        }

## 训练代码

In [None]:
def train(epoch, tokenizer, model, device, loader, optimizer):
    """
    Run one training epoch over ``loader``.

    Args:
        epoch (int): epoch number, used only for the log rows.
        tokenizer: tokenizer providing ``pad_token_id``.
        model: seq2seq model called with ``input_ids``, ``attention_mask``,
            ``decoder_input_ids`` and ``labels``; element 0 of its output is
            taken as the loss.
        device: device the batch tensors are moved to.
        loader: iterable of batches with ``source_ids``, ``source_mask`` and
            ``target_ids`` entries.
        optimizer: optimizer stepped once per batch.

    Every 100th step (except step 0) a row is appended to the module-level
    ``training_logger`` table, which is then printed on ``console``.
    """
    model.train()
    for step, data in enumerate(loader):
        y = data["target_ids"].to(device, dtype=torch.long)

        # Teacher forcing: the decoder sees the target shifted right by one
        # position, with token id 0 in front.
        # NOTE(review): 0 is presumably the model's decoder start / pad id — confirm against the model config.
        start_tokens = torch.zeros((y.size(0), 1), device=device, dtype=torch.long)
        y_ids = torch.cat([start_tokens, y[:, :-1]], dim=-1).contiguous()

        # Padding positions are set to -100, the label value the loss is
        # expected to ignore, so they do not contribute to training.
        lm_labels = y.clone().detach()
        lm_labels[y == tokenizer.pad_token_id] = -100

        ids = data["source_ids"].to(device, dtype=torch.long)
        mask = data["source_mask"].to(device, dtype=torch.long)

        optimizer.zero_grad()

        outputs = model(
            input_ids=ids,
            attention_mask=mask,
            decoder_input_ids=y_ids,
            labels=lm_labels,
        )
        loss = outputs[0]

        # Log every 100 steps. loss.item() yields a plain float, so the table
        # shows a number instead of the tensor repr (device, grad_fn, ...).
        if step % 100 == 0 and step != 0:
            training_logger.add_row(str(epoch), str(step), f"{loss.item():.4f}")
            console.print(training_logger)

        loss.backward()
        optimizer.step()

## 验证代码

In [None]:
def validate(epoch, tokenizer, model, device, loader,max_length):
    """
    Generate predictions for every batch in ``loader`` and decode them.

    Args:
        epoch: accepted for signature symmetry with ``train``; not used here.
        tokenizer: tokenizer used to decode generated and reference token ids.
        model: model exposing ``eval()`` and ``generate(...)``.
        device: device the batch tensors are moved to.
        loader: iterable of batches with ``target_ids``, ``source_ids`` and
            ``source_mask`` entries.
        max_length (int): generation length limit passed to ``generate``.

    Returns:
        tuple[list[str], list[str]]: decoded predictions and the decoded
        reference targets, in loader order.
    """
    model.eval()
    predictions, actuals = [], []

    def _decode_all(sequences):
        # Decode each id sequence, dropping special tokens.
        return [
            tokenizer.decode(seq, skip_special_tokens=True, clean_up_tokenization_spaces=True)
            for seq in sequences
        ]

    with torch.no_grad():
        for step, data in enumerate(loader):
            y = data['target_ids'].to(device, dtype=torch.long)
            ids = data['source_ids'].to(device, dtype=torch.long)
            mask = data['source_mask'].to(device, dtype=torch.long)

            generated_ids = model.generate(
                input_ids=ids,
                attention_mask=mask,
                max_length=max_length,
                num_beams=2,
                repetition_penalty=2.5,
                length_penalty=1.0,
                early_stopping=True,
            )
            preds = _decode_all(generated_ids)
            target = _decode_all(y)

            # Progress marker every 1000 batches (including the very first).
            if step % 1000 == 0:
                console.print(f'Completed {step}')

            predictions.extend(preds)
            actuals.extend(target)

    return predictions, actuals

In [None]:
# Import pylcs (longest-common-subsequence helper used by f1_sim)
import pylcs
def f1_sim(text_a, text_b):
    """F1-style similarity between two texts.

    Takes the length of the longest common subsequence of the two texts (as
    computed by ``pylcs.lcs``), doubles it and divides by the sum of both text
    lengths. Returns ``0.`` when both texts are empty.
    """
    if text_a or text_b:
        common = pylcs.lcs(text_a, text_b)
        return 2. * common / (len(text_a) + len(text_b))
    return 0.

## 训练类

In [None]:
# Trainer: ties together the dataset class, the training loop and the
# validation loop; loads the data, fine-tunes and evaluates after each epoch.
def T5Trainer(
    dataframe, source_text, target_text, model_params, output_dir="./outputs/"
):
    """
    Fine-tune the model named in ``model_params`` and validate it every epoch.

    Args:
        dataframe (pandas.DataFrame): data containing the two text columns.
        source_text (str): name of the input-text column.
        target_text (str): name of the target-text column.
        model_params (dict): must provide ``SEED``, ``MODEL``,
            ``TRAIN_BATCH_SIZE``, ``VALID_BATCH_SIZE``, ``TRAIN_EPOCHS``,
            ``LEARNING_RATE``, ``MAX_SOURCE_TEXT_LENGTH`` and
            ``MAX_TARGET_TEXT_LENGTH``. An optional ``TRAIN_SIZE`` (fraction of
            rows used for training) overrides the default of 0.94.
        output_dir (str): directory receiving ``model_files/``,
            ``predictions.csv`` and ``logs.txt``.

    Side effects:
        After every epoch the model and tokenizer are saved to
        ``<output_dir>/model_files`` and the validation generations are
        written to ``<output_dir>/predictions.csv`` (both overwritten each
        epoch). The recorded console output is saved to
        ``<output_dir>/logs.txt`` once training finishes. Uses the
        module-level ``device`` and ``console``.
    """
    # Set random seeds and deterministic pytorch for reproducibility
    torch.manual_seed(model_params["SEED"])  # pytorch random seed
    np.random.seed(model_params["SEED"])  # numpy random seed
    torch.backends.cudnn.deterministic = True

    # logging
    console.log(f"""[Model]: Loading {model_params["MODEL"]}...\n""")

    # tokenizer for encoding the text
    tokenizer = T5Tokenizer.from_pretrained(model_params["MODEL"])

    # Load the pretrained seq2seq model (weights in bfloat16) and move it to
    # the selected device.
    model = T5ForConditionalGeneration.from_pretrained(model_params["MODEL"],torch_dtype=torch.bfloat16)
    model = model.to(device)

    # logging
    console.log("[Data]: Reading data...\n")

    # Keep only the two columns the dataset class needs.
    dataframe = dataframe[[source_text, target_text]]
    # display_df(dataframe.head(2))

    # Train/validation split: by default 94% of the rows are sampled for
    # training and the remainder is used for validation. Both parts get a
    # fresh 0..n-1 index.
    train_size = model_params.get("TRAIN_SIZE", 0.94)
    train_dataset = dataframe.sample(frac=train_size, random_state=model_params["SEED"])
    val_dataset = dataframe.drop(train_dataset.index).reset_index(drop=True)
    train_dataset = train_dataset.reset_index(drop=True)

    # Log dataset sizes and the number of optimisation steps.
    console.print(f"FULL Dataset: {dataframe.shape}")
    console.print(f"TRAIN Dataset: {train_dataset.shape}")
    console.print(f"TEST Dataset: {val_dataset.shape}\n")
    total_train_steps=int((train_dataset.shape[0] * model_params["TRAIN_EPOCHS"])/model_params["TRAIN_BATCH_SIZE"])
    console.print(f"Total Train Steps: {total_train_steps}\n")

    # Creating the Training and Validation dataset for further creation of Dataloader
    training_set = YourDataSetClass(
        train_dataset,
        tokenizer,
        model_params["MAX_SOURCE_TEXT_LENGTH"],
        model_params["MAX_TARGET_TEXT_LENGTH"],
        source_text,
        target_text,
    )
    val_set = YourDataSetClass(
        val_dataset,
        tokenizer,
        model_params["MAX_SOURCE_TEXT_LENGTH"],
        model_params["MAX_TARGET_TEXT_LENGTH"],
        source_text,
        target_text,
    )

    # Defining the parameters for creation of dataloaders
    train_params = {
        "batch_size": model_params["TRAIN_BATCH_SIZE"],
        "shuffle": True,
        "num_workers": 0,
    }

    val_params = {
        "batch_size": model_params["VALID_BATCH_SIZE"],
        "shuffle": False,
        "num_workers": 0,
    }

    # Dataloaders for the training and validation stages.
    training_loader = DataLoader(training_set, **train_params)
    val_loader = DataLoader(val_set, **val_params)

    # Defining the optimizer that will be used to tune the weights of the network in the training session.
    optimizer = torch.optim.Adam(
        params=model.parameters(), lr=model_params["LEARNING_RATE"]
    )

    # Training loop
    console.log("[Initiating Fine Tuning]...\n")

    for epoch in range(model_params["TRAIN_EPOCHS"]):
        # 1) train for one epoch
        train(epoch, tokenizer, model, device, training_loader, optimizer)

        # 2) save model for each epoch
        console.log("[Saving Model]...\n")
        path = os.path.join(output_dir, "model_files")
        model.save_pretrained(path)
        tokenizer.save_pretrained(path)

        # 3) evaluate on the validation split; validate() already runs under
        #    torch.no_grad(), so no extra context is needed here.
        console.log("[Initiating Validation]...\n")
        predictions, actuals = validate(epoch, tokenizer, model, device, val_loader,model_params["MAX_TARGET_TEXT_LENGTH"])
        if predictions:
            # Mean LCS-based F1 similarity between predictions and references.
            same_item_num = sum(f1_sim(a, b) for a, b in zip(predictions, actuals))
            console.log(f"验证集准确率：{same_item_num/len(predictions):.2f}")
        else:
            # A very small dataframe can leave the validation split empty;
            # skip the metric instead of dividing by zero.
            console.log("[Validation] Validation set is empty; skipping metric.\n")
        final_df = pd.DataFrame({"Generated Text": predictions, "Actual Text": actuals})
        final_df.to_csv(os.path.join(output_dir, "predictions.csv"))

    console.save_text(os.path.join(output_dir, "logs.txt"))

    console.log("[Validation Completed.]\n")
    console.print(
        f"""[Model] Model saved @ {os.path.join(output_dir, "model_files")}\n"""
    )
    console.print(
        f"""[Validation] Generation on Validation data saved @ {os.path.join(output_dir,'predictions.csv')}\n"""
    )
    console.print(f"""[Logs] Logs saved @ {os.path.join(output_dir,'logs.txt')}\n""")

## 参数配置

In [None]:
# Hyper-parameters for the fine-tuning run (read by T5Trainer).
# Values after "alt:" are the alternatives noted by the original author.
model_params = {
    # model_type: model id handed to from_pretrained
    "MODEL": "ClueAI/ChatYuan-large-v2",
    # training batch size (alt: 8)
    "TRAIN_BATCH_SIZE": 2,
    # validation batch size (alt: 8)
    "VALID_BATCH_SIZE": 1,
    # number of training epochs
    "TRAIN_EPOCHS": 1,
    # number of validation epochs
    "VAL_EPOCHS": 1,
    # learning rate
    "LEARNING_RATE": 1e-4,
    # max length of source text (alt: 512)
    "MAX_SOURCE_TEXT_LENGTH": 64,
    # max length of target text (alt: 64)
    "MAX_TARGET_TEXT_LENGTH": 64,
    # set seed for reproducibility
    "SEED": 42,
}
print("end...")

end...


## 开始训练

In [None]:

# The dataframe must have two columns:
#   - input: the input text
#   - target: the expected output
# Data size: the 1k-sample file.
df = pd.read_csv('/content/belle_1k.csv')
print(f"df.head: {df.head(n=2)}")
print(f"df.shape: {df.shape}")

# Launch fine-tuning; artifacts are written under ./outputs.
T5Trainer(
    dataframe=df, source_text="input", target_text="target",
    model_params=model_params, output_dir="outputs",
)

df.head:                                                input  \
0  用户：针对健身房的新手，设计一套适合他们的健身器械使用指南，包括安全应用、正确姿势等方面。\...   
1  用户：给定一段文本和关键词列表，删除文本中包含所有给定关键词的子字符串。\n文本："这是一个...   

                                              target  
0  健身器械使用指南\n1. 开始前，请先进行热身运动。这会帮助你的身体适应运动，并减少受伤的风...  
1  删除包含所有给定关键词的子字符串后，文本变为："这是一个句子，目的是看看是否可以正确地从这个...  
df.shape: (1001, 2)


Downloading spiece.model:   0%|          | 0.00/742k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/197 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/848 [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. If you see this, DO NOT PANIC! This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=True`. This should only be set if you understand what it means, and thouroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


Downloading pytorch_model.bin:   0%|          | 0.00/3.13G [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [None]:
# 查看训练后显存占用情况。如果显存被占用，可以kill掉相关的进程
#!nvidia-smi
#!fuser -v /dev/nvidia*

Fri Aug 11 05:39:15 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.105.17   Driver Version: 525.105.17   CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla V100-SXM2...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   42C    P0    41W / 300W |  16148MiB / 16384MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
#!kill -9 1527

### 推理测试

In [None]:
# Load the fine-tuned model and its tokenizer from the training output directory.
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
trained_model_dir = "/content/outputs/model_files/"
tokenizer = AutoTokenizer.from_pretrained(trained_model_dir)
model_trained = AutoModelForSeq2SeqLM.from_pretrained(trained_model_dir)

In [None]:
import torch
from transformers import AutoTokenizer
# Prefer the GPU but fall back to the CPU so this cell also runs on hosts
# without CUDA (same availability check as the training setup).
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model_trained.to(device)
# Half precision is applied only on the GPU.
# NOTE(review): fp16 support on CPU is limited in PyTorch, so CPU inference stays in full precision — confirm this is acceptable.
if device.type == 'cuda':
    model_trained.half()
def preprocess(text):
    """Escape real newline and tab characters as the two-character sequences
    backslash-n and backslash-t, so the prompt becomes a single line."""
    escapes = {"\n": "\\n", "\t": "\\t"}
    return text.translate(str.maketrans(escapes))

def postprocess(text):
    """Undo the prompt escaping on model output: literal backslash-n and
    backslash-t become a real newline and tab, and ``%20`` becomes two spaces."""
    for encoded, decoded in (("\\n", "\n"), ("\\t", "\t"), ("%20", "  ")):
        text = text.replace(encoded, decoded)
    return text

def answer(text, sample=True, top_p=1, temperature=0.7, context=""):
    """Generate the model's reply to ``text``.

    Args:
        text: the user message.
        sample: whether to sample; can be set to True for generation tasks.
            When falsy, generation runs without sampling.
        top_p: value between 0 and 1 controlling how diverse the generated
            content is; only used when sampling.
        temperature: sampling temperature; only used when sampling.
        context: optional preceding conversation, placed before the user turn.

    Returns:
        str: the first decoded sequence, with the prompt escaping undone.
    """
    # Build the dialogue prompt, trim surrounding whitespace, then escape it.
    prompt = preprocess(f"{context}\n用户：{text}\n小元：".strip())
    encoding = tokenizer(text=[prompt], truncation=True, padding=True, max_length=512, return_tensors="pt").to(device)

    # Options shared by both decoding modes, plus the mode-specific ones.
    generation_args = dict(return_dict_in_generate=True, output_scores=False, max_new_tokens=1024)
    if sample:
        generation_args.update(do_sample=True, top_p=top_p, temperature=temperature, no_repeat_ngram_size=3)
    else:
        generation_args.update(num_beams=1, length_penalty=0.6)

    out = model_trained.generate(**encoding, **generation_args)
    out_text = tokenizer.batch_decode(out["sequences"], skip_special_tokens=True)
    return postprocess(out_text[0])



In [None]:
# Sample prompts used to smoke-test the fine-tuned model.
input_text0 = "翻译这句话成英文：屈臣氏里的化妆品到底怎么样？"
input_text1 = "帮我写一个英文营销方案，针对iphone"
input_text2 = "写一个冒泡排序"
input_text3 = "写一个文章，题目是未来城市"
input_text4 = "写一个诗歌，关于冬天"
input_text5 = "从南京到上海的路线"
input_text6 = "学前教育专业岗位实习中，在学生方面会存在问题，请提出改进措施。800字"
input_text7 = "根据标题生成文章：标题：屈臣氏里的化妆品到底怎么样？正文：化妆品，要讲究科学运用，合理搭配。屈臣氏起码是正品连锁店。请继续后面的文字。"
input_text8 = "帮我对比几款GPU，列出详细参数对比，并且给出最终结论"
input_list = [
    input_text0,
    input_text1,
    input_text2,
    input_text3,
    input_text4,
    input_text5,
    input_text6,
    input_text7,
    input_text8,
]

# For each prompt: print a "="-padded header, generate a reply, then print the
# prompt and the reply on separate lines.
for i, input_text in enumerate(input_list):
    print(f"示例{i}".center(50, "="))
    output_text = answer(input_text)
    print(input_text, output_text, sep="\n")


翻译这句话成英文：屈臣氏里的化妆品到底怎么样？
What is the kind of cosmetic items that are available at Watsons?
帮我写一个英文营销方案，针对iphone
我们希望在苹果公司(Apple)和苹果公司全球范围内的市场份额中，能够展示我们的产品在市场上的地位。我们非常重视苹果的产品，尤其是iPhone。苹果是世界上最优秀的智能手机制造商之一，他们一直在不断发展和改变，以确保我们的产品始终在市场中处于领先地位。
在iPhone系列中，我们致力于提供更先进、
写一个冒泡排序
冒泡排序是一种简单的排序算法，它通过一次比较相邻的两个元素，将较大的元素放到中间，然后从右边向左边比较相邻的元素，如果它们的顺序错误，就交换它们的位置。
冒泡
写一个文章，题目是未来城市
未来城市，代表着人类未来的发展方向。随着科技和科技的不断进步，未来的城市将会变得更加美好和智能化。未来城市将会成为一个智能、绿色、高效、可持续的城市，人们将能够更好地体验到生活的乐趣和价值。
未来城市将是一个智能化、环保、绿色的城市。城市的交通、能源
写一个诗歌，关于冬天
冬天，白雪纷飞，
雪花纷飞，似银白色的世界。
白雪飞舞，世界宁静，
冬日，万物沉睡。
寂静的冬夜，寂静无声，
寂静无声的冬月，静静的沉睡。
从南京到上海的路线
从南京到上海的路线：
1. 从南京市乘坐高铁前往上海市区，全程约2小时左右，票价约为200元。
2. 从南通市乘坐高铁，抵达上海市区后，再乘坐地铁或公交前往上海市区内的各大景点和商场。
3. 从苏州市乘坐高铁到达上海市区
学前教育专业岗位实习中，在学生方面会存在问题，请提出改进措施。800字
学前教育专业岗位实习是学前教育专业教学的重要环节之一，在实习过程中学生需要掌握幼儿教育、学前教育、幼儿园管理等方面的知识，在实践中体会学前教育的内涵，提高学生综合素质。然而，在学前教育专业实习中，学生往往会存在一些问题，这些问题不利于学生的学习和成长。因此，我们需要改进学前教育专业
根据标题生成文章：标题：屈臣氏里的化妆品到底怎么样？正文：化妆品，要讲究科学运用，合理搭配。屈臣氏起码是正品连锁店。请继续后面的文字。
屈臣氏里有很多美容护肤品、化妆品、护肤品、彩妆等，有很多品种可供选购。这些产品种类繁多，有祛痘霜、美白霜、保湿霜等等。这些护肤