In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/jd_comment_with_label/jd_comment_data.xlsx


In [2]:
from torch.utils.data import DataLoader
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
import torch.nn as nn
from torch.utils.tensorboard import SummaryWriter
from tqdm import tqdm

2025-05-22 05:06:27.745402: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1747890387.925915      35 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1747890387.980998      35 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [3]:
# 处理数据
def handleComments(path):
    df = pd.read_excel(path)
    # print(df.columns)
    content_name = '评价内容(content)'
    score_name = '评分（总分5分）(score)'
    df = df[df[content_name].notna() & (df[content_name] != '此用户未填写评价内容') & (df[content_name] != '您没有填写内容，默认好评')]
    comments_list = df[[content_name, score_name]].values.tolist()
    # print(len(data_list))
    return comments_list
def build_collate_fn(tokenizer):
    def collate_fn(batch):
        comments, scores = zip(*batch)
        token_encodings = tokenizer(comments, padding=True, truncation=True, return_tensors="pt")
        scores = torch.tensor(scores) - 1
        return token_encodings['input_ids'], token_encodings['attention_mask'], scores
    return collate_fn

In [4]:
comments_list = handleComments('/kaggle/input/jd_comment_with_label/jd_comment_data.xlsx')
train_list, test_list = train_test_split(comments_list, test_size=0.2, random_state=42)

In [5]:
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
MODEL_NAME = 'google-bert/bert-base-chinese'
EPOCHS = 3
LR = 1e-5
BATCH_SIZE = 32 # 批次太大，可能会导致：OutOfMemoryError: CUDA out of memory
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
train_dataLoader = DataLoader(train_list, batch_size=BATCH_SIZE, shuffle=True, collate_fn=build_collate_fn(tokenizer))
test_dataLoader = DataLoader(test_list, batch_size=BATCH_SIZE, collate_fn=build_collate_fn(tokenizer))

tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/624 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/110k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/269k [00:00<?, ?B/s]

In [6]:
# 加载模型进行训练
def train(trainable):
    log_dir = '/kaggle/working/runs/trainable_true' if trainable else '/kaggle/working/runs/trainable_false'
    writer = SummaryWriter(log_dir=log_dir)
    # 加载模型
    model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=5)
    # False:冻结bert, True:不冻结bert
    model.bert.trainable = trainable
    model.to(DEVICE)
    loss_fn = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=LR)
    # 进行训练
    for epoch in range(EPOCHS):
        train_bar = tqdm(train_dataLoader)
        model.train()
        for i, (input_ids, attention_mask, scores) in enumerate(train_bar):
            input_ids, attention_mask, scores = input_ids.to(DEVICE), attention_mask.to(DEVICE), scores.to(DEVICE)
            logits = model(input_ids=input_ids, attention_mask=attention_mask)['logits']
            loss = loss_fn(logits, scores)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            writer.add_scalar("loss", loss.item(), epoch * len(train_dataLoader) + i)
            train_bar.set_description(f"Epoch={epoch + 1}, Loss={loss.item()}")
        model.eval()
        with torch.no_grad():
            total = 0
            correct = 0
            for input_ids, attention_mask, scores in test_dataLoader:
                input_ids, attention_mask, scores = input_ids.to(DEVICE), attention_mask.to(DEVICE), scores.to(DEVICE)
                logits = model(input_ids=input_ids, attention_mask=attention_mask)['logits']
                pred = torch.argmax(logits, dim=1)
                total += len(scores)
                correct += (pred == scores).sum().item()
            accuracy = 100 * correct / total
            print(f"Epoch {epoch + 1}, Accuracy: {accuracy}, Total: {total}, Correct: {correct}")
    writer.close()
    # 保存模型
    save_name = 'model_true.pth' if trainable else 'model_false.pth'
    torch.save(model.state_dict(), save_name)

In [8]:
def predict(comments, trainable):
    model_path = '/kaggle/working/model_true.pth' if trainable else '/kaggle/working/model_false.pth'
    model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=5)
    model.load_state_dict(torch.load(model_path))
    model.to(DEVICE)
    model.eval()
    token_encodings = tokenizer(comments, padding=True, truncation=True, return_tensors="pt")
    input_ids, attention_mask = token_encodings['input_ids'].to(DEVICE), token_encodings['attention_mask'].to(DEVICE)
    logits = model(input_ids=input_ids, attention_mask=attention_mask)['logits']
    pred = torch.argmax(logits, dim=1) + 1
    return pred

In [9]:
# 冻结bert
train(False)

model.safetensors:   0%|          | 0.00/412M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-chinese and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch=1, Loss=0.08166500926017761: 100%|██████████| 1067/1067 [08:44<00:00,  2.03it/s] 


Epoch 1, Accuracy: 94.13005272407733, Total: 8535, Correct: 8034


Epoch=2, Loss=0.14433038234710693: 100%|██████████| 1067/1067 [08:41<00:00,  2.04it/s] 


Epoch 2, Accuracy: 94.13005272407733, Total: 8535, Correct: 8034


Epoch=3, Loss=0.08118396997451782: 100%|██████████| 1067/1067 [08:41<00:00,  2.05it/s] 


Epoch 3, Accuracy: 93.90743995313416, Total: 8535, Correct: 8015


In [10]:
print(predict(['这款手机真的太让我惊喜了！外观设计简约又高级，拿在手里质感十足。',
              '对这个手机真的太失望了！刚用没多久就频繁出现卡顿现象，打开几个应用就卡得不行'], False))

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-chinese and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tensor([5, 1], device='cuda:0')


In [11]:
# 不冻结bert
train(True)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-chinese and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch=1, Loss=0.2219211608171463: 100%|██████████| 1067/1067 [08:41<00:00,  2.05it/s]  


Epoch 1, Accuracy: 94.27065026362038, Total: 8535, Correct: 8046


Epoch=2, Loss=0.3838520348072052: 100%|██████████| 1067/1067 [08:45<00:00,  2.03it/s]  


Epoch 2, Accuracy: 94.11833626244874, Total: 8535, Correct: 8033


Epoch=3, Loss=0.12285474687814713: 100%|██████████| 1067/1067 [08:43<00:00,  2.04it/s] 


Epoch 3, Accuracy: 94.17691857059168, Total: 8535, Correct: 8038


In [12]:
print(predict(['这款手机真的太让我惊喜了！外观设计简约又高级，拿在手里质感十足。',
              '对这个手机真的太失望了！刚用没多久就频繁出现卡顿现象，打开几个应用就卡得不行'], True))

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-chinese and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tensor([5, 1], device='cuda:0')
