In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively list every file under the Kaggle input directory, one full path per line.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for full_path in (os.path.join(folder, name) for name in names):
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
from transformers import BertTokenizer, TFBertForSequenceClassification
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

# 挂载 Kaggle 数据集（首次运行需授权）
try:
    from kaggle_datasets import KaggleDatasets
except ImportError:
    !pip install kaggle_datasets
    from kaggle_datasets import KaggleDatasets

In [None]:
# Load the labelled JD review spreadsheet into a DataFrame.
dataset_path = '/kaggle/input/jd_comment_with_label/jd_comment_data.xlsx'
df = pd.read_excel(dataset_path)

In [None]:
# Show the column names so the text and label columns can be identified.
column_names = df.columns.tolist()
print("数据集列名：", column_names)

# Preview the first few rows to confirm the column names and their contents.
print("\n数据预览：", df.head(), sep="\n")

In [None]:
# Select the review text (model input) and the star rating (classification target).
text_column = '评价内容(content)'
score_column = '评分（总分5分）(score)'

# Drop rows with a missing review or rating: the tokenizer only accepts strings,
# and the labels must be integers for np.bincount and the loss function.
df_labeled = df.dropna(subset=[text_column, score_column])

texts = df_labeled[text_column].astype(str).tolist()

# Ratings run from 1 to 5, but a 5-class head with sparse categorical cross-entropy
# expects class ids in [0, 5). Shift by one so class k means a rating of k + 1 stars;
# this also keeps the labels comparable with argmax predictions (which are 0-4).
labels = df_labeled[score_column].astype(int).values - 1

In [None]:
# Inspect the class balance: count how many samples carry each label value.
label_counts = np.bincount(labels)
print("标签分布：", label_counts)

In [None]:
# Hold out 20% of the samples for validation; the fixed seed keeps the split reproducible.
split_parts = train_test_split(texts, labels, test_size=0.2, random_state=42)
train_texts, val_texts, train_labels, val_labels = split_parts

In [None]:
# Load the pretrained Chinese BERT checkpoint: its tokenizer plus a classification model.
model_name = "bert-base-chinese"
tokenizer = BertTokenizer.from_pretrained(model_name)

# Five output classes (one per star rating); attention maps and hidden states are
# not requested in the model outputs.
classifier_options = dict(
    num_labels=5,
    output_attentions=False,
    output_hidden_states=False,
)
model = TFBertForSequenceClassification.from_pretrained(model_name, **classifier_options)

In [None]:
# Tokenize the training and validation texts with identical settings:
# truncate to at most 128 tokens, pad, and return TensorFlow tensors.
tokenize_options = dict(
    truncation=True,
    padding=True,
    max_length=128,
    return_tensors='tf',
)
train_encodings = tokenizer(train_texts, **tokenize_options)
val_encodings = tokenizer(val_texts, **tokenize_options)

# Build the tf.data input pipelines: (feature dict, label) pairs served in batches.
batch_size = 16

# Size the shuffle buffer to the whole training set. A fixed buffer of 1000 only mixes
# neighbouring samples once the dataset holds more than 1000 rows.
train_dataset = (
    tf.data.Dataset.from_tensor_slices((dict(train_encodings), train_labels))
    .shuffle(len(train_labels))
    .batch(batch_size)
    .prefetch(tf.data.AUTOTUNE)
)

# Validation data is not shuffled; it is only batched and prefetched.
val_dataset = (
    tf.data.Dataset.from_tensor_slices((dict(val_encodings), val_labels))
    .batch(batch_size)
    .prefetch(tf.data.AUTOTUNE)
)

In [None]:
# Training setup: Adam optimizer with a 2e-5 learning rate, sparse categorical
# cross-entropy computed on raw logits, and accuracy as the reported metric.
optimizer = tf.keras.optimizers.Adam(learning_rate=2e-5)
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')
compile_options = {"optimizer": optimizer, "loss": loss, "metrics": [metric]}
model.compile(**compile_options)

# Fine-tune for 3 epochs (adjust to the amount of data), passing the validation set to fit.
fit_options = {"validation_data": val_dataset, "epochs": 3}
history = model.fit(train_dataset, **fit_options)

In [None]:
# Evaluate the fine-tuned model.
# Tokenize the evaluation texts (the validation split stands in for a test set here).
test_encodings = tokenizer(
    val_texts,
    truncation=True,
    padding=True,
    max_length=128,
    return_tensors='tf'
)

# The tokenizer returns a BatchEncoding; convert it to a plain dict before handing it
# to Keras, which is the input form Keras is documented to accept.
predictions = model.predict(dict(test_encodings))['logits']

# The predicted class is the index of the largest logit in each row.
pred_labels = np.argmax(predictions, axis=1)

# Report overall accuracy and the per-class precision/recall/F1 breakdown.
print("\nTest Accuracy:", accuracy_score(val_labels, pred_labels))
print("\nClassification Report:\n", classification_report(val_labels, pred_labels))