In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only input tree and print the full path of every file in it.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/jd_comment_with_label/jd_comment_data.xlsx


In [1]:
import pandas as pd

# Read the Excel workbook and save it as CSV so later cells can parse it with the csv module
df = pd.read_excel("/kaggle/input/jd_comment_with_label/jd_comment_data.xlsx")
# Relative output path: the file is written to the kernel's current working directory.
# index=False leaves the DataFrame index out of the CSV.
df.to_csv("jd_comment_data.csv", index=False, encoding='utf-8')

In [None]:
import csv
# Count the comments that remain once placeholder and blank reviews are dropped.
# The path is relative so this cell reads exactly the file that the conversion
# cell wrote with df.to_csv("jd_comment_data.csv"), whatever the working directory is.
# newline='' is what the csv module asks for, so line breaks inside quoted
# comment fields reach the reader untouched.
with open('jd_comment_data.csv', 'r', encoding='utf-8', newline='') as f:
    reader = csv.DictReader(f)
    rows = [
        row for row in reader
        if row['评价内容(content)'] not in ['此用户未填写评价内容', '您没有填写内容，默认好评']
        and row['评价内容(content)'].strip() != ''
    ]
len(rows)


In [6]:
import os
import csv
import torch
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, AutoModelForSequenceClassification
from torch.utils.tensorboard import SummaryWriter
from tqdm import tqdm
from torch.cuda.amp import autocast, GradScaler

# Directory where per-epoch checkpoints are written (joined with a file name in train())
MODEL_PATH = '/kaggle/working/'
# Name passed to from_pretrained() for both the tokenizer and the classifier
PRETRAINED_MODEL = 'bert-base-chinese'
# Base TensorBoard log directory; train() appends '/freeze' or '/finetune'
LOG_DIR = './runs/bert_cls'
# Batch size used by both the training and the evaluation DataLoader
BATCH_SIZE = 256
# Number of passes over the training set per train() call
EPOCHS = 5
# max_length handed to the tokenizer; longer inputs are truncated
MAX_LEN = 128
# Use the GPU when one is available, otherwise fall back to the CPU
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Dataset definition
class CommentDataset(Dataset):
    """Torch dataset pairing tokenized comment texts with integer labels.

    All texts are tokenized in one call inside the constructor; individual
    samples are turned into tensors only when they are indexed.
    """

    def __init__(self, texts, labels, tokenizer):
        # Tokenize the whole corpus at once, truncating at MAX_LEN tokens.
        self.encodings = tokenizer(
            texts,
            max_length=MAX_LEN,
            padding=True,
            truncation=True,
        )
        self.labels = labels

    def __len__(self):
        """Return the number of labelled samples."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return sample ``idx`` as a dict of tensors (encoding fields plus 'labels')."""
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        sample['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return sample

# Load and preprocess the data
def load_data(file_path, train_ratio=0.8, shuffle=False, seed=None):
    """Read the comment CSV and split it into train and test portions.

    Rows whose comment is one of the two auto-filled placeholder strings, or
    is blank after stripping whitespace, are dropped. Star ratings are
    converted to zero-based labels (score - 1).

    Args:
        file_path: Path to a UTF-8 CSV with the columns '评价内容(content)'
            and '评分（总分5分）(score)'.
        train_ratio: Fraction of the kept rows placed in the training split.
            Defaults to 0.8.
        shuffle: When True, the kept rows are shuffled before splitting.
            Defaults to False, which keeps file order.
        seed: Seed for the shuffle; only used when ``shuffle`` is True.

    Returns:
        Tuple ``(train_texts, train_labels, test_texts, test_labels)``.

    Raises:
        ValueError: If ``train_ratio`` is outside [0, 1] or a score is not a
            whole number.
        KeyError: If one of the expected columns is missing.
    """
    import random

    if not 0.0 <= train_ratio <= 1.0:
        raise ValueError(f"train_ratio must be between 0 and 1, got {train_ratio!r}")

    content_col = '评价内容(content)'
    score_col = '评分（总分5分）(score)'
    placeholders = ('此用户未填写评价内容', '您没有填写内容，默认好评')

    # newline='' lets the csv module handle line endings itself, so line breaks
    # embedded in quoted comments are preserved.
    with open(file_path, 'r', encoding='utf-8', newline='') as f:
        rows = [
            row for row in csv.DictReader(f)
            if row[content_col] not in placeholders and row[content_col].strip() != ''
        ]

    if shuffle:
        random.Random(seed).shuffle(rows)

    texts = [row[content_col] for row in rows]

    labels = []
    for row in rows:
        raw = row[score_col]
        try:
            score = int(raw)
        except ValueError:
            # Accept whole-number floats such as "5.0" (what a float-typed
            # score column looks like once exported to CSV).
            as_float = float(raw)
            if not as_float.is_integer():
                raise ValueError(f"score must be a whole number, got {raw!r}")
            score = int(as_float)
        labels.append(score - 1)  # labels start at 0

    split_idx = int(len(texts) * train_ratio)
    train_texts, train_labels = texts[:split_idx], labels[:split_idx]
    test_texts, test_labels = texts[split_idx:], labels[split_idx:]
    return train_texts, train_labels, test_texts, test_labels

# Freeze the BERT encoder layers
def freeze_bert(model):
    """Turn off gradient tracking for every parameter under ``model.bert``.

    Parameters that live outside ``model.bert`` are left untouched.
    """
    encoder_params = list(model.bert.parameters())
    for weight in encoder_params:
        weight.requires_grad = False

# Evaluation helper
def evaluate(model, data_loader):
    """Compute the classification accuracy of ``model`` over ``data_loader``.

    Each batch is a dict of tensors that includes a 'labels' entry. It is moved
    to DEVICE and passed to the model as keyword arguments; the predicted class
    is the argmax of ``outputs.logits`` along dim 1.

    The model is switched to eval mode for the duration of the call and then
    put back into whichever mode (train or eval) it was in on entry, even if a
    batch raises.

    Args:
        model: Module whose forward returns an object with a ``logits`` attribute.
        data_loader: Iterable of batches (dicts of tensors with a 'labels' key).

    Returns:
        Fraction of correctly classified samples, or 0.0 when the loader
        yielded no samples.
    """
    was_training = model.training
    model.eval()
    correct = total = 0
    try:
        with torch.no_grad():
            for batch in data_loader:
                batch = {k: v.to(DEVICE) for k, v in batch.items()}
                outputs = model(**batch)
                predictions = torch.argmax(outputs.logits, dim=1)
                correct += (predictions == batch['labels']).sum().item()
                total += batch['labels'].size(0)
    finally:
        # Restore the caller's mode rather than unconditionally forcing train mode.
        model.train(was_training)
    # Guard against an empty loader instead of raising ZeroDivisionError.
    return correct / total if total else 0.0

# Main training routine
def train(freeze=False):
    """Train a 5-class BERT comment classifier, checkpointing after every epoch.

    Loads 'jd_comment_data.csv' through load_data(), builds train and test
    DataLoaders, wraps the model in DataParallel and trains for EPOCHS epochs
    with Adam (lr=2e-5). After each epoch the wrapped module's state_dict is
    saved under MODEL_PATH and accuracy on the test split is logged and printed.

    Mixed precision is used only when DEVICE is a CUDA device; on CPU the
    autocast context and the gradient scaler are disabled.

    Args:
        freeze: When True, the BERT encoder is frozen via freeze_bert() so only
            the remaining parameters are optimised. The value also selects the
            TensorBoard sub-directory and the checkpoint file prefix
            ('freeze' or 'finetune').
    """
    run_name = 'freeze' if freeze else 'finetune'

    tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL)
    train_texts, train_labels, test_texts, test_labels = load_data('jd_comment_data.csv')

    train_dataset = CommentDataset(train_texts, train_labels, tokenizer)
    train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)

    test_dataset = CommentDataset(test_texts, test_labels, tokenizer)
    test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE)

    model = AutoModelForSequenceClassification.from_pretrained(PRETRAINED_MODEL, num_labels=5)

    if freeze:
        freeze_bert(model)

    model = torch.nn.DataParallel(model)
    model.to(DEVICE)

    # Only parameters that still require gradients are handed to the optimizer.
    optimizer = optim.Adam(filter(lambda p: p.requires_grad, model.parameters()), lr=2e-5)

    # Device-aware AMP API; replaces the deprecated torch.cuda.amp entry points
    # and turns itself off cleanly when running on CPU.
    use_amp = DEVICE.type == 'cuda'
    scaler = torch.amp.GradScaler(DEVICE.type, enabled=use_amp)
    writer = SummaryWriter(log_dir=LOG_DIR + '/' + run_name)

    global_step = 0
    model.train()

    try:
        for epoch in range(EPOCHS):
            total_loss = 0
            loop = tqdm(train_loader, desc=f"Epoch {epoch+1}/{EPOCHS}", leave=False)
            for batch in loop:
                batch = {k: v.to(DEVICE) for k, v in batch.items()}

                with torch.autocast(device_type=DEVICE.type, enabled=use_amp):
                    outputs = model(**batch)
                    loss = outputs.loss
                    # Under DataParallel the loss may come back with one entry
                    # per replica; reduce it to a scalar before backward().
                    if loss.dim() > 0:
                        loss = loss.mean()

                scaler.scale(loss).backward()
                scaler.step(optimizer)
                scaler.update()
                optimizer.zero_grad()

                # Read the scalar once: every .item() call forces a device sync.
                loss_value = loss.item()
                total_loss += loss_value
                loop.set_postfix(loss=loss_value)
                writer.add_scalar("train/loss", loss_value, global_step)
                global_step += 1

            torch.save(
                model.module.state_dict(),
                os.path.join(MODEL_PATH, f"{run_name}_bert_epoch{epoch+1}.pt"),
            )
            # Evaluate on the held-out split after every epoch
            val_acc = evaluate(model, test_loader)
            writer.add_scalar("eval/accuracy", val_acc, epoch)
            print(f"Epoch {run_name} {epoch+1} completed. Avg loss: {total_loss / len(train_loader):.4f}, Val Accuracy: {val_acc:.4f}")
    finally:
        # Flush and close the TensorBoard writer even if training is interrupted.
        writer.close()


if __name__ == '__main__':
    # Run both training regimes back to back; each call builds its own model,
    # writes its own TensorBoard run and saves its own checkpoints.
    train(freeze=True)   # BERT body frozen: only the classification head is trained
    train(freeze=False)  # nothing frozen: full fine-tuning


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-chinese and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  scaler = GradScaler()
  with autocast():
                                                                        

Epoch freeze 1 completed. Avg loss: 0.8155, Val Accuracy: 0.9435


                                                                        

Epoch freeze 2 completed. Avg loss: 0.3721, Val Accuracy: 0.9441


                                                                        

Epoch freeze 3 completed. Avg loss: 0.3259, Val Accuracy: 0.9441


                                                                        

Epoch freeze 4 completed. Avg loss: 0.3168, Val Accuracy: 0.9441


                                                                        

Epoch freeze 5 completed. Avg loss: 0.3136, Val Accuracy: 0.9441


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-chinese and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
                                                                        

Epoch finetune 1 completed. Avg loss: 0.2828, Val Accuracy: 0.9441


                                                                         

Epoch finetune 2 completed. Avg loss: 0.1986, Val Accuracy: 0.9475


                                                                        

Epoch finetune 3 completed. Avg loss: 0.1762, Val Accuracy: 0.9480


                                                                         

Epoch finetune 4 completed. Avg loss: 0.1570, Val Accuracy: 0.9463


                                                                         

Epoch finetune 5 completed. Avg loss: 0.1386, Val Accuracy: 0.9462


In [9]:
import torch
from transformers import BertTokenizer, AutoModelForSequenceClassification
import os

# NOTE: this rebinds MODEL_PATH, which the training cell defines as the output
# directory; here it is the checkpoint file saved after the 5th fine-tuning epoch.
MODEL_PATH = '/kaggle/working/finetune_bert_epoch5.pt'
# Same pretrained name the training cell uses for the tokenizer and model skeleton
PRETRAINED_MODEL = 'bert-base-chinese'
# Use the GPU when one is available, otherwise fall back to the CPU
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Load the tokenizer and the model
tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL)
model = AutoModelForSequenceClassification.from_pretrained(PRETRAINED_MODEL, num_labels=5)
# The checkpoint was saved as a plain state_dict, so restrict unpickling to
# tensors and primitive containers instead of allowing arbitrary pickled objects.
model.load_state_dict(torch.load(MODEL_PATH, map_location=DEVICE, weights_only=True))
model.to(DEVICE)
# Inference only: switch the model to eval mode.
model.eval()

def predict(texts):
    """Return the predicted zero-based class index for each input text.

    The texts are tokenized together (padded, truncated to 128 tokens), moved
    to DEVICE and run through the module-level ``model`` without gradient
    tracking. The class is the argmax of the logits along dim 1.

    Args:
        texts: Text or list of texts accepted by the module-level ``tokenizer``.

    Returns:
        A plain Python list of integer class indices.
    """
    tokenized = tokenizer(texts, return_tensors='pt', padding=True, truncation=True, max_length=128)
    inputs = {}
    for name, tensor in tokenized.items():
        inputs[name] = tensor.to(DEVICE)
    with torch.no_grad():
        logits = model(**inputs).logits
        best = torch.argmax(logits, dim=1)
    return best.cpu().numpy().tolist()

if __name__ == '__main__':
    # Hand-written sample reviews used as a quick smoke test of the loaded checkpoint
    test_texts = [
        "这款手机性价比很高，值得购买",
        "包装很差，收到的时候已经坏了",
        "物流挺快，客服也不错",
        "太棒了，非常满意的一次购物",
        "客服回复的太慢了，体验不太好"
    ]
    results = predict(test_texts)
    for text, label in zip(test_texts, results):
        # predict() returns zero-based class indices; add 1 to report a star rating
        print(f"评论: {text} -> 预测评分: {label+1} 星")


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-chinese and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


评论: 这款手机性价比很高，值得购买 -> 预测评分: 5 星
评论: 包装很差，收到的时候已经坏了 -> 预测评分: 1 星
评论: 物流挺快，客服也不错 -> 预测评分: 5 星
评论: 太棒了，非常满意的一次购物 -> 预测评分: 5 星
评论: 客服回复的太慢了，体验不太好 -> 预测评分: 5 星
