# 基于bert中文情感分类实战(根据下游任务微调模型)
1. 我们只关注两个东西：输入和输出

### 1. 下载数据集并加载

In [1]:
from datasets import load_dataset
import os

# 设置数据集目录
dataset_dir = "/raid/gfc/llm/datasets/ChnSentiCorp"
os.makedirs(dataset_dir, exist_ok=True)

# 在线加载数据集
data = load_dataset("lansinuote/ChnSentiCorp")

# 保存到dataset_dir目录
data.save_to_disk(dataset_dir)
print(f"数据集已保存到: {dataset_dir}")

  from .autonotebook import tqdm as notebook_tqdm
Saving the dataset (1/1 shards): 100%|██████████| 9600/9600 [00:00<00:00, 454985.63 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 1200/1200 [00:00<00:00, 157281.48 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 1200/1200 [00:00<00:00, 153473.54 examples/s]

数据集已保存到: /raid/gfc/llm/datasets/ChnSentiCorp





### 2. 制作Dataset

In [2]:
from torch.utils.data import Dataset
from datasets import load_from_disk

class Mydataset(Dataset):
    def __init__(self, split, dataset_dir="/raid/gfc/llm/datasets/ChnSentiCorp"):
        self.dataset = load_from_disk(dataset_dir)
        if split == "train":
            self.dataset = self.dataset["train"]
        elif split == "validation":
            self.dataset = self.dataset["validation"]
        elif split == "test":
            self.dataset = self.dataset["test"]
        else:
            raise ValueError(f"Invalid split: {split}. Must be one of ['train', 'validation', 'test']")

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        text = self.dataset[idx]["text"]
        label = self.dataset[idx]["label"]
        return {
            "text": text,
            "label": label
        }

In [3]:
# 测试
# 创建训练集
train_dataset = Mydataset(split="train")

# for data in train_dataset:
#     print(data)

### 3. 根据下游任务修改模型

In [4]:
from transformers import BertModel
import torch
import subprocess
import torch.nn as nn

In [5]:
# 1. 选择gpu
def pick_free_gpu(start=7, end=0, memory_threshold=100):
    """
    自动选择空闲的GPU
    :param start: 起始GPU编号
    :param end: 结束GPU编号
    :param memory_threshold: 显存占用阈值（MB），低于此值认为GPU空闲
    :return: torch.device对象
    """
    try:
        # 获取nvidia-smi输出，包含显存使用和GPU利用率
        result = subprocess.check_output(
            ['nvidia-smi', '--query-gpu=memory.used,utilization.gpu', '--format=csv,nounits,noheader'],
            encoding='utf-8'
        )
        
        # 解析输出
        gpu_info = []
        for line in result.strip().split('\n'):
            memory, util = map(int, line.split(', '))
            gpu_info.append((memory, util))
        
        # 从start到end检查GPU
        for i in range(start, end-1, -1):
            if i < len(gpu_info):
                memory_used, gpu_util = gpu_info[i]
                # 判断条件：显存占用低于阈值且GPU利用率接近0
                if memory_used < memory_threshold and gpu_util < 5:
                    print(f"选择空闲GPU: cuda:{i}")
                    print(f"显存占用: {memory_used}MB, GPU利用率: {gpu_util}%")
                    return torch.device(f"cuda:{i}")
        
        print("没有检测到空闲GPU，使用CPU。")
        return torch.device("cpu")
        
    except Exception as e:
        print(f"检测GPU时出错：{e}，使用CPU。")
        return torch.device("cpu")

# 使用示例
device = pick_free_gpu(7, 0, memory_threshold=100)  # 设置显存阈值100MB
print(f"当前使用的设备: {device}")

选择空闲GPU: cuda:6
显存占用: 3MB, GPU利用率: 0%
当前使用的设备: cuda:6


In [6]:
# 2. 加载预训练模型
model_dir = "/raid/gfc/llm/models/models--uer--gpt2-chinese-cluecorpussmall/snapshots/c2c0249d8a2731f269414cc3b22dff021f8e07a3"
pretrained = BertModel.from_pretrained(model_dir).to(device)

# print(pretrained)
# 打印输入
print(pretrained.embeddings.word_embeddings)
# 打印输出


You are using a model of type gpt2 to instantiate a model of type bert. This is not supported for all configurations of models and can yield errors.
Some weights of BertModel were not initialized from the model checkpoint at /raid/gfc/llm/models/models--uer--gpt2-chinese-cluecorpussmall/snapshots/c2c0249d8a2731f269414cc3b22dff021f8e07a3 and are newly initialized: ['embeddings.LayerNorm.bias', 'embeddings.LayerNorm.weight', 'embeddings.position_embeddings.weight', 'embeddings.token_type_embeddings.weight', 'embeddings.word_embeddings.weight', 'encoder.layer.0.attention.output.LayerNorm.bias', 'encoder.layer.0.attention.output.LayerNorm.weight', 'encoder.layer.0.attention.output.dense.bias', 'encoder.layer.0.attention.output.dense.weight', 'encoder.layer.0.attention.self.key.bias', 'encoder.layer.0.attention.self.key.weight', 'encoder.layer.0.attention.self.query.bias', 'encoder.layer.0.attention.self.query.weight', 'encoder.layer.0.attention.self.value.bias', 'encoder.layer.0.attention.

Embedding(21128, 768, padding_idx=0)


In [7]:
# 3. 定义下游任务模型
# 4. 模型定义
class Model(nn.Module):
    def __init__(self, pretrained_model):
        super().__init__()
        self.pretrained = pretrained_model
        # 使用多层分类头
        self.classifier = nn.Sequential(
            nn.Linear(768, 384),
            nn.ReLU(),
            nn.Dropout(0.1),
            nn.Linear(384, 2)
        )

    def forward(self, input_ids, attention_mask, token_type_ids):
        # 上游任务不参与训练
        with torch.no_grad():
            bert_output = self.pretrained(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)

        # 获取[CLS]标记的输出 (形状: [batch_size, 768])
        cls_output = bert_output.last_hidden_state[:, 0]
        
        # 通过多层分类头
        logits = self.classifier(cls_output)
        
        return logits
    

In [8]:
# 4. 训练模型
from torch.utils.data import DataLoader
from transformers import BertTokenizer, AdamW

Epoch = 50

# 创建数据集
train_dataset = Mydataset(split="train")

# 加载编码器
tokenizer = BertTokenizer.from_pretrained("bert-base-chinese")

# 自定义函数：对数据进行编码处理
def collate_fn(batch):
    texts = [item["text"] for item in batch]
    labels = [item["label"] for item in batch]

    # 批量编码文本
    encoded = tokenizer(
        texts,
        padding=True,
        truncation=True,
        max_length=512,
        return_tensors='pt',
    )

    # 将labels转换为张量
    labels = torch.tensor(labels)

    return {
        "input_ids": encoded["input_ids"],
        "attention_mask": encoded["attention_mask"],
        "token_type_ids": encoded["token_type_ids"],
        "labels": labels,
    }

# 创建Dataloader
train_loader = DataLoader(
    dataset=train_dataset,
    batch_size=32,
    shuffle=True,
    drop_last=True,
    collate_fn=collate_fn, # 一般加载一批数据，编码一批数据
)


In [9]:
# 测试模型精度
# 加载测试集
test_dataset = Mydataset(split="test")
test_loader = DataLoader(
    dataset=test_dataset,
    batch_size=32,
    shuffle=True,
    drop_last=True,
    collate_fn=collate_fn, # 一般加载一批数据，编码一批数据
)
# 加载初始的bert模型在测试集上测试
model = Model(pretrained_model=pretrained).to(device=device)
model.eval()
with torch.no_grad():
    total = 0
    correct = 0
    for i, batch in enumerate(test_loader):
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        token_type_ids = batch["token_type_ids"].to(device)
        labels = batch["labels"].to(device)

        out = model(input_ids, attention_mask, token_type_ids)
        out = out.argmax(dim=1)
        correct += (out == labels).sum().item()
        total += len(labels)
    print(f"初始的bert模型在测试集上的精度为：{correct / total}")

# 加载训练好的模型在测试集上测试
model_path = "/raid/gfc/llm/params/bert_sentiment_classification/best_model.pt"
model = Model(pretrained_model=pretrained).to(device)
model.load_state_dict(torch.load(model_path, map_location=device))
model.eval()

with torch.no_grad():
    total = 0
    correct = 0
    for i, batch in enumerate(test_loader):
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        token_type_ids = batch["token_type_ids"].to(device)
        labels = batch["labels"].to(device)

        out = model(input_ids, attention_mask, token_type_ids)
        out = out.argmax(dim=1)
        correct += (out == labels).sum().item()
        total += len(labels)
    print(f"训练{Epoch}后的模型在测试集上的精度为：{correct / total}")


初始的bert模型在测试集上的精度为：0.49408783783783783
训练100后的模型在测试集上的精度为：0.7204391891891891


In [None]:
# 开始训练（在py文件中训练）
print(device)
model = Model().to(device=device)
optimizer = AdamW(model.parameters(), lr=5e-4)
loss_func = nn.CrossEntropyLoss()

model.train()
for epoch in range(Epoch):
    for i, batch in enumerate(train_loader):
        # 将数据放到device上
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        token_type_ids = batch["token_type_ids"].to(device)
        labels = batch["labels"].to(device)
        
        # 前向传播得到输出
        out = model(input_ids, attention_mask, token_type_ids)

        loss = loss_func(out, labels)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if i % 5 == 0:
            out = out.argmax(dim=1)
            acc = (out == labels).sum().item() / len(labels)
            print(epoch, i, loss.item(), acc)

    # 保存模型参数
    save_dir = "/raid/gfc/llm/params/bert_sentiment_classification"
    os.makedirs(save_dir, exist_ok=True)
    torch.save(model.state_dict(), os.path.join(save_dir, f"{epoch}_bert.pt"))
    print(f"{epoch} epoch 参数保存成功") # 防止模型还没保存完，程序提前执行完毕
