In [None]:
import pandas as pd
# Data loading: read the review spreadsheet into a DataFrame.
excel_data = pd.read_excel('./data/jd_comment_data.xlsx')




In [None]:
import re

# Keep only the rating and review-text columns and rename them.
# .copy() gives an independent frame so later edits never touch excel_data.
df = excel_data[['评分（总分5分）(score)', '评价内容(content)']].copy()
df.columns = ['label', 'text']

# Drop rows missing either field.
df = df.dropna(subset=['text', 'label'])

# Row filters, expressed as boolean masks over the same frame:
#   * cast to str first -- a review cell that Excel stores as a number may be read
#     back as int/float, and running a regex over it would raise TypeError;
#   * drop the "user did not write a review" placeholder text;
#   * require at least one CJK character (U+4E00..U+9FA5);
#   * keep only ratings inside the 1-5 range (both ends inclusive).
text_as_str = df['text'].astype(str)
is_placeholder = text_as_str == '此用户未填写评价内容'
has_chinese = text_as_str.str.contains(r'[\u4e00-\u9fa5]', regex=True)
valid_score = df['label'].between(1, 5)

# Explicit .copy(): the column assignment below must write to an independent frame,
# not to a slice of the previous one (avoids SettingWithCopyWarning).
df = df[~is_placeholder & has_chinese & valid_score].copy()

# Convert to a binary label: 1 when the score is >= 4, otherwise 0.
df['label'] = (df['label'] >= 4).astype('int64')

# Class distribution
print("类别分布：\n", df['label'].value_counts())

In [None]:
# 2. Build the Hugging Face Dataset
from datasets import Dataset, load_dataset
# preserve_index=False: df has been row-filtered, so its index is no longer a plain
# RangeIndex and would otherwise be stored as an extra `__index_level_0__` column.
dataset = Dataset.from_pandas(df[['text','label']], preserve_index=False)
# 80/20 split; the fixed seed makes the shuffle reproducible across re-runs.
train_test = dataset.train_test_split(test_size=0.2,shuffle=True,seed=42)
# Persist both splits to disk
train_test['train'].save_to_disk('my_dataset_train')
train_test['test'].save_to_disk('my_dataset_test')


In [None]:
# 加载分词器
from transformers import AutoTokenizer,AutoModelForSequenceClassification
from datasets import load_from_disk

# Pretrained checkpoint shared by the tokenizer and the classifier.
MODEL_NAME = 'bert-base-chinese'
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Reload the train and test splits (in that order) from the directories saved earlier.
loaded_train_dataset, loaded_test_dataset = map(
    load_from_disk,
    ('my_dataset_train', 'my_dataset_test'),
)

# Data preprocessing
def preprocess_data(tokenizer, max_length=128):
    """Build a tokenizing callback bound to ``tokenizer`` and ``max_length``.

    Args:
        tokenizer: Callable invoked as ``tokenizer(texts, **options)``.
        max_length: Value forwarded as ``max_length``; inputs are padded to it
            (``padding="max_length"``) and truncated (``truncation=True``).

    Returns:
        A function taking a batch mapping with a ``"text"`` entry and returning
        whatever ``tokenizer`` returns for that entry.
    """
    # Options are fixed once here and reused on every call of the callback.
    encode_options = dict(
        padding="max_length",
        truncation=True,
        max_length=max_length,
        return_special_tokens_mask=True,
    )

    def tokenize_function(examples):
        return tokenizer(examples["text"], **encode_options)

    return tokenize_function

# Tokenize both splits with identical settings: train first, then test.
# Each split gets its own callback from preprocess_data(tokenizer).
tokenizer_train, tokenizer_test = (
    split.map(preprocess_data(tokenizer), batched=True, batch_size=128)
    for split in (loaded_train_dataset, loaded_test_dataset)
)

In [None]:
# Show the first tokenized training example to inspect the fields added by the map step.
print(tokenizer_train[0])

In [None]:
# Model setup: load the MODEL_NAME checkpoint for sequence classification with 2 labels.
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME,num_labels = 2)

In [None]:
from transformers import TrainingArguments, Trainer
import evaluate
import numpy as np
from torch.utils.tensorboard import SummaryWriter  # 可选：显式导入SummaryWriter（非必需）

# 1. Training arguments (TensorBoard logging is configured here as well)
training_args = TrainingArguments(
    # --- output / checkpointing ---
    output_dir='./result',
    overwrite_output_dir=True,
    save_strategy='epoch',            # save a checkpoint after every epoch
    load_best_model_at_end=True,      # reload the best checkpoint when training ends
    # --- optimisation ---
    num_train_epochs=20,
    per_device_train_batch_size=128,
    learning_rate=2e-5,
    weight_decay=0.01,
    # --- evaluation ---
    eval_strategy='epoch',            # evaluate once per epoch
    # --- logging ---
    report_to='tensorboard',          # send logs to TensorBoard
    logging_dir='./logs',             # directory read by TensorBoard
    logging_steps=10,                 # log every 10 steps
    run_name='bert_training',         # optional run name
)

# 2. Evaluation metric: accuracy, loaded through the `evaluate` library.
metric = evaluate.load('accuracy')

def compute_metrics(eval_pred):
    """Score one evaluation pass with the module-level ``metric``.

    Args:
        eval_pred: A pair that unpacks into ``(logits, labels)``.

    Returns:
        The result of ``metric.compute`` for the argmax predictions (taken
        over the last axis of the logits) against the reference labels.
    """
    raw_scores, gold_labels = eval_pred
    return metric.compute(
        predictions=np.argmax(raw_scores, axis=-1),
        references=gold_labels,
    )

# 3. Build the Trainer from the model, the training arguments and the tokenized splits.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenizer_train,
    eval_dataset=tokenizer_test,
    compute_metrics=compute_metrics  # results are logged to TensorBoard automatically
)

# 4. Run training.
trainer.train()

# 5. Save the model and the tokenizer into the same directory so it can be reloaded
#    as one unit. training_args sets load_best_model_at_end=True, so the saved
#    weights are expected to be those of the best checkpoint.
trainer.save_model("./best_model")
tokenizer.save_pretrained("./best_model")

In [None]:
# torch is needed for the CUDA availability check below; it was not imported anywhere
# else in the notebook, so a fresh kernel raised NameError on this cell.
import torch
from transformers import pipeline, AutoTokenizer

# Directory the training cell saved the model and tokenizer into.
model_path = "./best_model"

# Load the tokenizer explicitly from the same directory as the model.
tokenizer = AutoTokenizer.from_pretrained(model_path)

# Text-classification pipeline over the saved model.
classifier = pipeline(
    "text-classification",
    model=model_path,
    tokenizer=tokenizer,
    device=0 if torch.cuda.is_available() else -1  # first GPU when available, otherwise CPU
)

# Example predictions
test_text = ["真的垃圾", "好"]
results = classifier(test_text)
print(results)