In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/jd_comment_with_label/jd_comment_data.xlsx
/kaggle/input/jd-comment-data-csv/jd_comment_data.csv


### 利用huggingface中预训练模型，实现文本分类模型定制和微调

1. 加载预训练模型定制输出端任务
2. 原始数据进行清洗转换
   - 清理停用词或非法字符
3. 构建Dataset和DataLoader
   - DataLoader的collate_fn参数，在回调函数中使用tokenizer转换模型输入数据
5. 创建模型，损失函数、优化器
6. 训练模型
7. 观察损失调参迭代
8. 模型保存

In [10]:
import torch.optim as optim
import torch.nn as nn
from tqdm import tqdm
import torch
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [11]:
#数据预处理，只保留写了评价的数据
import csv

# 用户评论数据集
ds_comments = []

# 1. Read the CSV file
with open('/kaggle/input/jd-comment-data-csv/jd_comment_data.csv', 'r') as file:
    reader = csv.DictReader(file)
    for row in reader:
        vote = int(row['评分（总分5分）(score)'])
        content = row['评价内容(content)']
        if content != '此用户未填写评价内容':
            ds_comments.append([content, vote])  # 1 for positive, 0 for negative

len(ds_comments)

train_comments = ds_comments[:round(0.8*len(ds_comments))]  # Display the first 80% of the dataset
test_comments = ds_comments[round(0.8*len(ds_comments)):]  # Display the last 20% of the dataset

In [12]:
# 加载词典创建分词器
from transformers import AutoTokenizer
tokenizer=  AutoTokenizer.from_pretrained('google-bert/bert-base-chinese')

In [13]:
from torch.utils.data import DataLoader
def build_collate(tokenizer):
    def collate_fn(batch):
        # 文本分类语料：输入语句，类别标签
        sentents,labels = zip(*batch)
    
        # tokenizer转换
        model_inputs = tokenizer(sentents, return_tensors='pt', padding = True,  truncation = True)
        labels = torch.tensor(labels)

        return model_inputs, labels
    return collate_fn

# DataLoader
dl = DataLoader(train_comments, batch_size=20, shuffle=True, collate_fn=build_collate(tokenizer))



# 使用预训练bert模型时，学习率不能太大!!! 推荐1e-4或1e-5 

In [6]:
# 定制模型输出
from transformers import AutoModelForSequenceClassification, AutoModelForMaskedLM

# 完成文本分类任务(5个类别)
model = AutoModelForSequenceClassification.from_pretrained('google-bert/bert-base-chinese', num_labels=5)
model.to(device)

2025-05-27 06:45:00.748466: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1748328300.929140      35 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1748328300.983906      35 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


model.safetensors:   0%|          | 0.00/412M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-chinese and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(21128, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [None]:
# 优化器、损失
optimizer = optim.Adam(model.parameters(), lr=1e-5)
criterion = nn.CrossEntropyLoss(ignore_index=0)

# 训练

for epoch in range(1):
    model.train()
    tpbar = tqdm(dl)
    for model_inputs, labels in tpbar:
        model_inputs, labels = model_inputs.to(device), labels.to(device)
        # 前向传播
        logits = model(model_inputs.input_ids).logits
        #print(logits,labels)
        # 计算损失
        loss = criterion(logits.view(-1, 5), labels.view(-1)-1)

        # 反向传播
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        tpbar.set_description(f'Epoch {epoch+1}, Loss: {loss.item():.4f}')

torch.save(model.state_dict(), '/kaggle/working/bert_classification_JD_Comments.bin')

  0%|          | 0/1777 [00:00<?, ?it/s]We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.
Epoch 1, Loss: 0.2697:  12%|█▏        | 218/1777 [01:00<06:49,  3.81it/s]

In [16]:
model

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(21128, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [27]:
dl_test = DataLoader(test_comments, batch_size=20, shuffle=True, collate_fn=build_collate(tokenizer))
model.eval()
correct,total = 0,0
tpbar_test = tqdm(dl_test)
with torch.no_grad():
    for inputs,labels in tpbar_test:
        inputs, labels = inputs.to(device), labels.to(device)
        out = model(inputs.input_ids).logits
        #print(model_inputs.input_ids.shape,labels.shape,out.shape)
        _,predicted = torch.max(out,1) 
        #print(predicted,labels)
        correct += (predicted == (labels-1)).sum().item()
        total += labels.shape[0]
print(f'{correct},{total},准确率:{correct/total*100}%')


        

100%|██████████| 445/445 [00:29<00:00, 15.34it/s]

8403,8884,准确率:94.58577217469607%



