In [1]:
# Install dependencies as needed:
# pip install kagglehub[pandas-datasets]
import kagglehub
from kagglehub import KaggleDatasetAdapter

# Set the path to the file you'd like to load
file_path = "jd_comment_data.xlsx"

# Load the latest version
df = kagglehub.load_dataset(
  KaggleDatasetAdapter.PANDAS,
  "dosonleung/jd_comment_with_label",
  file_path
)

print("First 5 records:", df.head())

  df = kagglehub.load_dataset(


First 5 records:           爬取时间(__time)                                        爬取链接(__url)  \
0  2019-03-08 00:50:34  https://sclub.jd.com/comment/productPageCommen...   
1  2019-03-08 00:50:34  https://sclub.jd.com/comment/productPageCommen...   
2  2019-03-08 00:50:34  https://sclub.jd.com/comment/productPageCommen...   
3  2019-03-08 00:50:34  https://sclub.jd.com/comment/productPageCommen...   
4  2019-03-08 00:50:33  https://sclub.jd.com/comment/productPageCommen...   

   商品ID(product_id)  评价时间(publish_time)  评分（总分5分）(score) 评价内容(content)  \
0           4722324          1550631798                5    此用户未填写评价内容   
1           4722324          1550633151                5    此用户未填写评价内容   
2           4722324          1550633330                3    此用户未填写评价内容   
3           4722324          1550633401                5    此用户未填写评价内容   
4           4722324          1550633461                5    此用户未填写评价内容   

  评价者(author_name) 评价者会员等级(author_level) 商品sku(product_sku) 评价标签(tags)  
0 

In [2]:
df.columns.tolist()

['爬取时间(__time)',
 '爬取链接(__url)',
 '商品ID(product_id)',
 '评价时间(publish_time)',
 '评分（总分5分）(score)',
 '评价内容(content)',
 '评价者(author_name)',
 '评价者会员等级(author_level)',
 '商品sku(product_sku)',
 '评价标签(tags)']

In [4]:
df_filter = df.loc[df['评价内容(content)']!="此用户未填写评价内容",['评分（总分5分）(score)','评价内容(content)']].rename(columns = {'评分（总分5分）(score)':'rating','评价内容(content)':'comment'}).copy()
df_filter['label'] = df_filter['rating'] - 1
df_filter.head()

Unnamed: 0,rating,comment,label
15,1,一般般，一分钱一分货吧,0
18,4,商品质量很好，很满意，配送速度快啊，而且配送员态度也非常好。,3
19,5,。。。,4
22,5,刘慧敏提莫摸摸摸休息泽TCL退咯的一组婆婆破鼓规土局,4
25,5,还好还好还好还好红红火火好很好好,4


In [6]:
# 必须的核心库
import torch
import pandas as pd
from transformers import pipeline, BertForSequenceClassification, BertTokenizer
from torch.utils.tensorboard import SummaryWriter

# 设备配置 (必须)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

#创建自定义Pipeline (必须)
def create_classifier(freeze_bert=False):
    """创建支持冻结BERT的分类器"""
    model = BertForSequenceClassification.from_pretrained(
        'google-bert/bert-base-chinese', 
        num_labels=5
    ).to(device)
    
    if freeze_bert:
        for param in model.bert.parameters():
            param.requires_grad = False
        print("冻结了BERT基础层")
    
    tokenizer = BertTokenizer.from_pretrained('google-bert/bert-base-chinese')
    
    return pipeline(
        "text-classification",
        model=model,
        tokenizer=tokenizer,
        device=device.index if device.type == 'cuda' else -1
    )

# 3. 训练函数 (必须)
def train_classifier(classifier, df, strategy_name, epochs=1, lr=5e-5):
    """训练分类器并记录到TensorBoard"""
    writer = SummaryWriter(f'runs/{strategy_name}')
    model = classifier.model
    model.train()
    optimizer = torch.optim.AdamW(model.parameters(), lr=lr)
    
    # 简单数据拆分
    train_df = df.sample(frac=0.8, random_state=42)
    val_df = df.drop(train_df.index)
    
    for epoch in range(epochs):
        total_loss = 0
        # 简化训练循环
        for i in range(0, min(1000, len(train_df)), 32):  # 小批量训练
            batch_texts = train_df['comment'].iloc[i:i+32].tolist()
            batch_labels = train_df['label'].iloc[i:i+32].tolist()
            
            # 使用pipeline的分词器
            inputs = classifier.tokenizer(
                batch_texts, 
                padding=True, 
                truncation=True, 
                return_tensors="pt",
                max_length=64
            ).to(device)
            
            # 前向传播
            outputs = model(**inputs, labels=torch.tensor(batch_labels).to(device))
            loss = outputs.loss
            
            # 反向传播
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()
            
            total_loss += loss.item()
            writer.add_scalar('Loss/train', loss.item(), epoch*len(train_df)//32 + i//32)
        
        # 验证评估
        model.eval()
        correct = 0
        for i in range(0, min(200, len(val_df)), 32):
            val_texts = val_df['comment'].iloc[i:i+32].tolist()
            val_labels = val_df['label'].iloc[i:i+32].tolist()
            
            predictions = classifier(val_texts)
            pred_labels = [int(p['label'].split('_')[-1]) for p in predictions]
            correct += sum([1 for p, t in zip(pred_labels, val_labels) if p == t])
        
        accuracy = correct / min(200, len(val_df))
        writer.add_scalar('Accuracy/val', accuracy, epoch)
        print(f"Epoch {epoch+1} | 策略 {strategy_name} | 准确率: {accuracy:.4f}")
    
    writer.close()
    return classifier

In [7]:
# 4. 主流程 (必须)
def main():
    # 策略对比
    strategies = [
        ("冻结BERT", True, 2e-4),
        ("不冻结BERT", False, 5e-5)
    ]
    
    classifiers = {}
    
    for name, freeze, lr in strategies:
        print(f"\n=== 开始 {name} 策略训练 ===")
        classifier = create_classifier(freeze_bert=freeze)
        trained_clf = train_classifier(classifier, df_filter, name, epochs=1, lr=lr)
        classifiers[name] = trained_clf
        
        # 保存模型 (必须)
        trained_clf.save_pretrained(f"{name}_model")
        print(f"{name} 模型已保存")
    
    # 预测示例 (必须)
    sample_comments = [
        "质量很好，非常满意",
        "包装破损，体验很差",
        "中规中矩，没什么特别"
    ]
    
    print("\n=== 预测示例 ===")
    for comment in sample_comments:
        print(f"\n评论: '{comment}'")
        for name, clf in classifiers.items():
            result = clf(comment)[0]
            rating = int(result['label'].split('_')[-1]) + 1
            print(f"{name}预测: {rating}星 (置信度: {result['score']:.4f})")
    
    print("\n训练完成! 使用以下命令查看TensorBoard:")
    print("tensorboard --logdir=runs")

# 运行主程序 (必须)
if __name__ == "__main__":
    main()


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-chinese and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



=== 开始 冻结BERT 策略训练 ===


Device set to use cuda:0


冻结了BERT基础层
Epoch 1 | 策略 冻结BERT | 准确率: 1.0450


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-chinese and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


冻结BERT 模型已保存

=== 开始 不冻结BERT 策略训练 ===


Device set to use cuda:0


Epoch 1 | 策略 不冻结BERT | 准确率: 1.0450
不冻结BERT 模型已保存

=== 预测示例 ===

评论: '质量很好，非常满意'
冻结BERT预测: 5星 (置信度: 0.9719)
不冻结BERT预测: 5星 (置信度: 0.9738)

评论: '包装破损，体验很差'
冻结BERT预测: 5星 (置信度: 0.9760)
不冻结BERT预测: 5星 (置信度: 0.8944)

评论: '中规中矩，没什么特别'
冻结BERT预测: 5星 (置信度: 0.9690)
不冻结BERT预测: 5星 (置信度: 0.8909)

训练完成! 使用以下命令查看TensorBoard:
tensorboard --logdir=runs


In [9]:
!tensorboard --logdir=runs

2025-06-04 13:57:49.610305: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1749045469.630703     108 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1749045469.636799     108 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered

NOTE: Using experimental fast data loading logic. To disable, pass
    "--load_fast=false" and report issues on GitHub. More details:
    https://github.com/tensorflow/tensorboard/issues/4784

Serving TensorBoard on localhost; to expose to the network, use a proxy or pass --bind_all
TensorBoard 2.18.0 at http://localhost:6006/ (Press CTRL+C to quit)
^C
