# 读取数据

In [6]:
import pandas as pd

# Locations of the re-formatted JSONL splits, relative to this notebook.
train_jsonl_new_path = "../datasets/new_train.jsonl"
test_jsonl_new_path = "../datasets/new_test.jsonl"

# Read each split as line-delimited JSON and keep only the leading rows:
# the first 1000 for training (optional cap) and the first 10 for subjective evaluation.
train_df = pd.read_json(train_jsonl_new_path, lines=True).head(1000)
test_df = pd.read_json(test_jsonl_new_path, lines=True).head(10)


In [None]:
from modelscope import snapshot_download, AutoTokenizer
from transformers import AutoModelForCausalLM, TrainingArguments, Trainer, DataCollatorForSeq2Seq
import torch
from awq import AutoAWQForCausalLM

# Download the Qwen model from ModelScope into a local directory (disabled;
# the weights are expected to already exist under ../bmodel/).
# model_dir = snapshot_download("./qwen/Qwen2-1.5B-Instruct", cache_dir="../bmodel/", revision="master")

# Load the tokenizer and the model weights from the local model directory.
tokenizer = AutoTokenizer.from_pretrained("../bmodel/qwen/Qwen2-1___5B-Instruct/", use_fast=False, trust_remote_code=True)
# NOTE(review): the loader was switched to the AutoAWQ wrapper, but the path points at the
# original Instruct checkpoint and the training log reports ~1.55B total parameters, which
# suggests the weights are not actually quantized here — confirm whether a quantized
# checkpoint / `from_quantized` was intended.
model = AutoAWQForCausalLM.from_pretrained("../bmodel/qwen/Qwen2-1___5B-Instruct/", device_map="auto", torch_dtype=torch.bfloat16)  # changed to "quantized mode" (AWQ wrapper)
# model.enable_input_require_grads()  # must be called when gradient checkpointing is enabled



In [8]:
#数据预处理

def process_func(example):
    """
    Tokenize one dataset row into causal-LM training features.

    The prompt (a fixed system message, ``example['input']`` as the user turn,
    and the assistant header) and the answer (``example['output']``) are
    tokenized separately with the module-level ``tokenizer``. The answer is
    followed by a single ``tokenizer.pad_token_id`` used as the terminator.

    In ``labels`` every prompt position is filled with -100, so only the answer
    ids and the terminator carry real label ids. All three sequences are cut to
    at most ``MAX_LENGTH`` (384) entries.

    Returns:
        dict with the keys ``input_ids``, ``attention_mask`` and ``labels``.
    """
    MAX_LENGTH = 384

    prompt = tokenizer(
        f"<|im_start|>system\n你是一个文本分类领域的专家，你会接收到一段文本和几个潜在的分类选项，请输出文本内容的正确类型<|im_end|>\n<|im_start|>user\n{example['input']}<|im_end|>\n<|im_start|>assistant\n",
        add_special_tokens=False,
    )
    answer = tokenizer(f"{example['output']}", add_special_tokens=False)
    terminator = [tokenizer.pad_token_id]

    features = {
        "input_ids": prompt["input_ids"] + answer["input_ids"] + terminator,
        # The terminator position is attended to as well, hence the trailing 1.
        "attention_mask": prompt["attention_mask"] + answer["attention_mask"] + [1],
        "labels": [-100] * len(prompt["input_ids"]) + answer["input_ids"] + terminator,
    }

    # Truncate; slicing leaves sequences that are already short enough unchanged.
    return {key: values[:MAX_LENGTH] for key, values in features.items()}

from datasets import Dataset

# Wrap the training DataFrame as a `datasets.Dataset`, then tokenize every row
# with process_func; the original columns are dropped so that only the
# features returned by process_func remain.
train_ds = Dataset.from_pandas(train_df)
train_dataset = train_ds.map(
    function=process_func,
    remove_columns=train_ds.column_names,
)

Map: 100%|██████████| 1000/1000 [04:38<00:00,  3.60 examples/s]


In [20]:

# 设置lora
from peft import LoraConfig, TaskType, get_peft_model

# LoRA adapter settings for causal-LM fine-tuning. Adapters are attached to the
# projection modules listed in `target_modules`.
config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    inference_mode=False,  # training mode
    r=8,  # LoRA rank
    lora_alpha=32,  # LoRA alpha; see the LoRA method for its role
    lora_dropout=0.1,  # dropout ratio
    target_modules=[
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj",
    ],
)

# `model.model` is passed to PEFT, and the name `model` is then rebound to the
# PEFT-wrapped result.
# NOTE(review): because `model` is rebound, re-running this cell would wrap an
# already wrapped model — presumably the kernel must be restarted first; confirm.
model = get_peft_model(model.model, config)
model.print_trainable_parameters()

# Training hyper-parameters.
args = TrainingArguments(
    # Where checkpoints are written, and how often.
    output_dir="./output/Qwen2",
    save_steps=100,
    save_on_each_node=True,
    # Optimisation schedule: 4 samples per device, gradients accumulated over 4 steps.
    num_train_epochs=2,
    learning_rate=1e-4,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    gradient_checkpointing=False,  # gradient checkpointing is turned off
    # Logging.
    logging_steps=10,
    report_to="none",
)

from swanlab.integration.huggingface import SwanLabCallback
import swanlab

# SwanLab experiment-tracking callback; the project/experiment names and the
# config dict below are the metadata recorded for this run.
swanlab_callback = SwanLabCallback(
    project="Qwen2-fintune",
    experiment_name="Qwen2-1.5B-Instruct",
    config={
        "model": "../bmodel/qwen/Qwen2-1___5B-Instruct",
        "dataset": "huangjintao/zh_cls_fudan-news",
    },
    description="使用通义千问Qwen2-1.5B-Instruct模型在zh_cls_fudan-news数据集上微调。",
)


# Assemble the trainer: batches are built by DataCollatorForSeq2Seq
# (padding=True) and progress is reported through the SwanLab callback.
trainer = Trainer(
    args=args,
    model=model,
    callbacks=[swanlab_callback],
    train_dataset=train_dataset,
    data_collator=DataCollatorForSeq2Seq(tokenizer=tokenizer, padding=True),
)

# Run fine-tuning, then write the resulting model to its own output directory.
trainer.train()
trainer.save_model("output/quQwen2.1")  # added: explicit save of the final model

# # ====== 训练结束后的预测 ===== #

# def predict(messages, model, tokenizer):
#     device = "cuda"
#     text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
#     model_inputs = tokenizer([text], return_tensors="pt").to(device)
#     generated_ids = model.generate(model_inputs.input_ids, max_new_tokens=512)
#     generated_ids = [
#         output_ids[len(input_ids) :]
#         for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
#     ]

#     response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
#     print(response)

#     return response
    

# test_text_list = []
# for index, row in test_df.iterrows():
#     instruction = row["instruction"]
#     input_value = row["input"]

#     messages = [
#         {"role": "system", "content": f"{instruction}"},
#         {"role": "user", "content": f"{input_value}"},
#     ]

#     response = predict(messages, model, tokenizer)
#     messages.append({"role": "assistant", "content": f"{response}"})
#     result_text = f"{messages[0]}\n\n{messages[1]}\n\n{messages[2]}"
#     test_text_list.append(swanlab.Text(result_text, caption=response))

# swanlab.log({"Prediction": test_text_list})
# swanlab.finish()

trainable params: 9,232,384 || all params: 1,552,946,688 || trainable%: 0.5945


Step,Training Loss
10,59.3654
20,0.3836
30,0.5452
40,0.6738
50,5.0954
60,0.0244
70,0.6001
80,0.0642
90,0.0081
100,0.1508


[1m[33mswanlab[0m[0m: Step 10 on key train/loss already exists, ignored.
[1m[33mswanlab[0m[0m: Step 10 on key train/grad_norm already exists, ignored.
[1m[33mswanlab[0m[0m: Step 10 on key train/learning_rate already exists, ignored.
[1m[33mswanlab[0m[0m: Step 10 on key train/epoch already exists, ignored.
[1m[33mswanlab[0m[0m: Step 20 on key train/loss already exists, ignored.
[1m[33mswanlab[0m[0m: Step 20 on key train/grad_norm already exists, ignored.
[1m[33mswanlab[0m[0m: Step 20 on key train/learning_rate already exists, ignored.
[1m[33mswanlab[0m[0m: Step 20 on key train/epoch already exists, ignored.
[1m[33mswanlab[0m[0m: Step 30 on key train/loss already exists, ignored.
[1m[33mswanlab[0m[0m: Step 30 on key train/grad_norm already exists, ignored.
[1m[33mswanlab[0m[0m: Step 30 on key train/learning_rate already exists, ignored.
[1m[33mswanlab[0m[0m: Step 30 on key train/epoch already exists, ignored.
[1m[33mswanlab[0m[0m: Step 

The attention mask is not set and cannot be inferred from input because pad token is same as eos token.As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


Electronics
Military
Politics
外国文学研究
History
Space
Transport
Literature
Economy
Art
[1m[34mswanlab[0m[0m: 🌟 Run `[1mswanlab watch -l /home/ubuntu/why/swanlog[0m` to view SwanLab Experiment Dashboard locally
[1m[34mswanlab[0m[0m: 🏠 View project at [34m[4mhttps://swanlab.cn/@whytehighmore/Qwen2-fintune[0m[0m
[1m[34mswanlab[0m[0m: 🚀 View run at [34m[4mhttps://swanlab.cn/@whytehighmore/Qwen2-fintune/runs/zjx7v5llc25ehlw64e7mj[0m[0m
                                                                                                    