In [1]:
import pandas as pd
import numpy as np
import torch
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from datasets import Dataset, DatasetDict
from transformers import (
    T5TokenizerFast,
    T5ForConditionalGeneration,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
    DataCollatorForSeq2Seq
)
import evaluate # Hugging Face's evaluate library
import os

In [2]:
# --- 配置 ---
data_path = r"D:\GitHubRepos\is6941-ml-social-media\taptap\data\integrated\lm_cleaned_taptap_reviews.csv"
model_name = "google-t5/t5-base"
text_column = "review_content"
label_column = "sentiment"
test_size = 0.2  # 20% 的数据作为测试集
random_state = 42 # 为了结果可复现
max_input_length = 256 # T5 输入序列最大长度
max_target_length = 8   # 目标序列最大长度 ("positive", "negative")
train_batch_size = 8    # 根据你的 GPU 显存调整
eval_batch_size = 16   # 根据你的 GPU 显存调整
num_train_epochs = 3    # 训练轮数
learning_rate = 5e-5
output_dir = "./t5_sentiment_results" # 训练结果输出目录
logging_dir = "./t5_sentiment_logs"   # 日志目录

In [3]:
# --- 1. 加载数据 ---
print("Loading data...")
try:
    df = pd.read_csv(data_path, usecols=[text_column, label_column])
    # 处理可能的缺失值
    df = df.dropna(subset=[text_column, label_column])
    # 确保标签是整数类型
    df[label_column] = df[label_column].astype(int)
    print(f"Data loaded successfully. Shape: {df.shape}")
    print("Sample data:\n", df.head())
    print("\nLabel distribution:\n", df[label_column].value_counts())
except FileNotFoundError:
    print(f"Error: Data file not found at {data_path}")
    exit()
except Exception as e:
    print(f"Error loading data: {e}")
    exit()

Loading data...
Data loaded successfully. Shape: (39985, 2)
Sample data:
                   review_content  sentiment
0            可以体验一下，剧情不错，但可能会有点迷          1
1                           剧情很好          1
2  刺激，感受到了友情，亲情，自我，爱慕，传承，等待 ，与纯真          1
3            毋庸置疑的神作 第一次回溯时间的是葵啊          1
4                   没有别的可以说，剧情神作          1

Label distribution:
 sentiment
1    26353
0    13632
Name: count, dtype: int64


In [4]:
# --- 2. 预处理数据 ---
print("Preprocessing data...")
# T5 需要文本标签
label_map = {0: "negative", 1: "positive"}
# 反向映射，用于评估时解码
id_to_label = {v: k for k, v in label_map.items()}

# 添加任务前缀并转换标签
df['input_text'] = "classify sentiment: " + df[text_column]
df['target_text'] = df[label_column].map(label_map)

# 检查是否有标签未能成功映射
if df['target_text'].isnull().any():
    print("Warning: Some labels could not be mapped. Check your label_column values and label_map.")
    print("Rows with null target_text:", df[df['target_text'].isnull()])
    df = df.dropna(subset=['target_text']) # 移除无法映射的行

print("Preprocessing complete. Sample processed data:\n", df[['input_text', 'target_text']].head())

Preprocessing data...
Preprocessing complete. Sample processed data:
                                           input_text target_text
0            classify sentiment: 可以体验一下，剧情不错，但可能会有点迷    positive
1                           classify sentiment: 剧情很好    positive
2  classify sentiment: 刺激，感受到了友情，亲情，自我，爱慕，传承，等待 ，与纯真    positive
3            classify sentiment: 毋庸置疑的神作 第一次回溯时间的是葵啊    positive
4                   classify sentiment: 没有别的可以说，剧情神作    positive


In [5]:
# --- 3. 划分数据集 ---
print("Splitting data...")
train_df, test_df = train_test_split(
    df,
    test_size=test_size,
    random_state=random_state,
    stratify=df[label_column] # 保持训练集和测试集标签分布一致
)
print(f"Train set size: {len(train_df)}")
print(f"Test set size: {len(test_df)}")

Splitting data...
Train set size: 31988
Test set size: 7997


In [6]:
# --- 4. 转换为 Hugging Face Datasets ---
print("Converting to Hugging Face Datasets...")
train_dataset = Dataset.from_pandas(train_df[['input_text', 'target_text']])
test_dataset = Dataset.from_pandas(test_df[['input_text', 'target_text']])
dataset_dict = DatasetDict({
    'train': train_dataset,
    'test': test_dataset
})
print(dataset_dict)

Converting to Hugging Face Datasets...
DatasetDict({
    train: Dataset({
        features: ['input_text', 'target_text', '__index_level_0__'],
        num_rows: 31988
    })
    test: Dataset({
        features: ['input_text', 'target_text', '__index_level_0__'],
        num_rows: 7997
    })
})


In [7]:
# --- 5. 加载 Tokenizer 和模型 ---
print(f"Loading tokenizer and model: {model_name}...")
tokenizer = T5TokenizerFast.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

# 检查 GPU 是否可用
if torch.cuda.is_available():
    print("GPU is available. Using CUDA.")
    device = torch.device("cuda")
else:
    print("GPU not available. Using CPU.")
    device = torch.device("cpu")

# 将模型移到正确的设备 (Trainer 会自动处理，但显式检查有益)
# model.to(device) # Trainer 会自动处理设备放置

Loading tokenizer and model: google-t5/t5-base...
GPU is available. Using CUDA.


In [8]:
# --- 6. Tokenize 数据 ---
print("Tokenizing data...")
def preprocess_function(examples):
    inputs = examples['input_text']
    targets = examples['target_text']
    model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True, padding="max_length")

    # Setup the tokenizer for targets
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(targets, max_length=max_target_length, truncation=True, padding="max_length")

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

tokenized_datasets = dataset_dict.map(preprocess_function, batched=True)
print("Tokenization complete.")
print("Sample tokenized input:", tokenized_datasets['train'][0]['input_ids'])
print("Sample tokenized label:", tokenized_datasets['train'][0]['labels'])

Tokenizing data...


Map:   0%|          | 0/31988 [00:00<?, ? examples/s]



Map:   0%|          | 0/7997 [00:00<?, ? examples/s]

Tokenization complete.
Sample tokenized input: [853, 4921, 6493, 10, 3, 2, 6, 2, 6, 2, 6, 2, 1808, 2, 6, 2, 6, 2, 6, 2, 3, 14817, 14817, 18, 14817, 14817, 3, 2, 599, 2, 2773, 2, 61, 3, 2, 6, 2, 3, 2, 6, 2, 6, 2, 6, 52, 157, 7, 2, 6, 2, 3, 2, 52, 157, 7, 2, 6, 2, 536, 15938, 2, 6, 2, 1808, 5, 927, 2, 6, 2, 6, 2, 52, 157, 7, 2, 3, 2, 6, 2, 6, 2, 6, 2, 2517, 5, 927, 2, 6, 2, 2606, 2, 3, 2, 6, 2, 6, 2, 6, 2, 6, 2, 3, 2, 6, 2, 6, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
Sample tokenized label: [1465, 1, 0, 0, 0, 0, 0, 0]


In [9]:
# --- 7. 设置训练参数 ---
print("Setting up training arguments...")
training_args = Seq2SeqTrainingArguments(
    output_dir=output_dir,
    eval_strategy="epoch",      # 每个 epoch 结束后进行评估
    save_strategy="epoch",            # 每个 epoch 结束后保存模型
    learning_rate=learning_rate,
    per_device_train_batch_size=train_batch_size,
    per_device_eval_batch_size=eval_batch_size,
    num_train_epochs=num_train_epochs,
    weight_decay=0.01,
    load_best_model_at_end=True,      # 训练结束后加载最佳模型
    metric_for_best_model="eval_loss",# 使用评估损失来判断最佳模型
    predict_with_generate=True,       # 在评估时使用 generate 方法生成文本
    logging_dir=logging_dir,
    logging_steps=100,                 # 每 100 步记录一次日志
    fp16=torch.cuda.is_available(),   # 如果有 GPU，使用混合精度训练加速
    # report_to="tensorboard" # 可以取消注释以使用 TensorBoard
)

Setting up training arguments...


In [10]:
# --- 8. 定义评估指标 ---
print("Defining compute metrics function...")
# 获取真实标签的数值形式，用于 sklearn metrics
# 注意：确保 test_df 在这个作用域内是可访问的，或者在调用时传递它
# 或者，更好的方法是从原始的 test_dataset 中提取标签
# 为了简单起见，我们假设 test_df 在这里仍然可用
true_labels_numeric = test_df[label_column].tolist()
# 获取文本标签列表，用于 classification_report
text_label_list = list(label_map.values()) # ["negative", "positive"]

# 创建反向映射
text_to_id_map = {v: k for k, v in label_map.items()} # {"negative": 0, "positive": 1}

def compute_metrics(eval_pred):
    predictions, labels = eval_pred # labels 在这里是 token IDs，我们不需要它们，因为我们有 true_labels_numeric

    # predictions 是 token IDs, 需要解码
    # T5 的 generate 输出通常包含 pad token ID，即使设置了 skip_special_tokens
    # 替换 pad token ID 为 -100 以便解码器忽略它们 (虽然 skip_special_tokens 应该处理)
    # predictions[predictions == tokenizer.pad_token_id] = -100 # 通常不需要，skip_special_tokens=True 足够

    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)

    # *** 这是修改的关键部分 ***
    # 将解码后的文本 ("negative", "positive") 转换回 0, 1
    # 使用 .strip() 处理可能的首尾空格
    pred_labels_numeric = [text_to_id_map.get(pred.strip(), -1) for pred in decoded_preds]
    # 如果解码结果不在 text_to_id_map 中 (例如模型生成了奇怪的东西)，则映射为 -1

    # 过滤掉无法识别的预测标签 (-1)
    # 同时确保我们只比较那些有有效预测的真实标签
    valid_indices = [i for i, label in enumerate(pred_labels_numeric) if label != -1]

    # 检查过滤后是否还有样本
    if not valid_indices:
        print("Warning: No valid predictions found after decoding/filtering. Check model outputs.")
        print("Sample Decoded Predictions (first 10):", decoded_preds[:10]) # 打印解码输出来调试
        return {"accuracy": 0.0, "precision": 0.0, "recall": 0.0, "f1": 0.0}

    filtered_preds = [pred_labels_numeric[i] for i in valid_indices]
    # 确保使用相同索引过滤真实标签
    filtered_true = [true_labels_numeric[i] for i in valid_indices]

    print("\n--- Evaluation ---")
    print("Sample Decoded Predictions:", decoded_preds[:5])
    print("Sample Predicted Numeric Labels (filtered):", filtered_preds[:5])
    print("Sample True Numeric Labels (filtered):", filtered_true[:5])

    # 计算 classification report
    # 确保 target_names 的顺序与标签 0, 1 对应
    target_names_for_report = [label_map[0], label_map[1]] # ["negative", "positive"]
    try:
        report = classification_report(filtered_true, filtered_preds, target_names=target_names_for_report, output_dict=True, zero_division=0)
        print("\nClassification Report:")
        print(classification_report(filtered_true, filtered_preds, target_names=target_names_for_report, zero_division=0))

        # 计算 confusion matrix
        # labels 参数指定了矩阵的行/列顺序，确保与 target_names 一致
        cm = confusion_matrix(filtered_true, filtered_preds, labels=[0, 1])
        print("\nConfusion Matrix (Rows: True, Cols: Pred):")
        print(f"       {label_map[0]}  {label_map[1]}")
        print(f"{label_map[0]}: {cm[0]}")
        print(f"{label_map[1]}: {cm[1]}")
        print("------------------")

        # 返回主要的指标给 Trainer
        return {
            "accuracy": report["accuracy"],
            "precision": report["weighted avg"]["precision"],
            "recall": report["weighted avg"]["recall"],
            "f1": report["weighted avg"]["f1-score"],
        }
    except Exception as e:
        print(f"Error calculating metrics: {e}")
        print("Filtered True Labels:", filtered_true)
        print("Filtered Pred Labels:", filtered_preds)
        return {"accuracy": 0.0, "precision": 0.0, "recall": 0.0, "f1": 0.0}

Defining compute metrics function...


In [11]:
# --- 9. 初始化 Trainer ---
print("Initializing Trainer...")
# 数据整理器，负责动态填充批次中的序列
data_collator = DataCollatorForSeq2Seq(
    tokenizer,
    model=model,
    padding="max_length", # 确保标签也被填充
    max_length=max_input_length,
    label_pad_token_id=tokenizer.pad_token_id # 明确指定标签填充 ID
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

Initializing Trainer...


  trainer = Seq2SeqTrainer(


In [12]:
# --- 10. 训练 ---
print("Starting training...")
train_result = trainer.train()
print("Training finished.")

# 保存最终模型和 tokenizer
trainer.save_model()
tokenizer.save_pretrained(output_dir)

# 记录训练指标
metrics = train_result.metrics
trainer.log_metrics("train", metrics)
trainer.save_metrics("train", metrics)
trainer.save_state()

Starting training...


Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.0027,0.002687,0.483306,0.634048,0.483306,0.468555
2,0.0026,0.002551,0.670126,0.643272,0.670126,0.589953
3,0.0026,0.002551,0.670001,0.643019,0.670001,0.589581



--- Evaluation ---
Sample Decoded Predictions: ['negative', 'positive', 'negative', 'negative', 'negative']
Sample Predicted Numeric Labels (filtered): [0, 1, 0, 0, 0]
Sample True Numeric Labels (filtered): [1, 1, 0, 1, 1]

Classification Report:
              precision    recall  f1-score   support

    negative       0.38      0.82      0.52      2726
    positive       0.77      0.31      0.44      5271

    accuracy                           0.48      7997
   macro avg       0.57      0.56      0.48      7997
weighted avg       0.63      0.48      0.47      7997


Confusion Matrix (Rows: True, Cols: Pred):
       negative  positive
negative: [2223  503]
positive: [3629 1642]
------------------

--- Evaluation ---
Sample Decoded Predictions: ['positive', 'positive', 'positive', 'positive', 'positive']
Sample Predicted Numeric Labels (filtered): [1, 1, 1, 1, 1]
Sample True Numeric Labels (filtered): [1, 1, 0, 1, 1]

Classification Report:
              precision    recall  f1-score 

There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].


Training finished.
***** train metrics *****
  epoch                    =        3.0
  total_flos               = 27212330GF
  train_loss               =     0.0195
  train_runtime            = 0:44:16.21
  train_samples_per_second =     36.128
  train_steps_per_second   =      4.517


In [13]:
# --- 11. 评估 ---
print("Starting evaluation on the test set...")
eval_metrics = trainer.evaluate()

print("\n--- Final Test Set Evaluation Results ---")
print(f"Evaluation Loss: {eval_metrics.get('eval_loss', 'N/A')}")
print(f"Accuracy: {eval_metrics.get('eval_accuracy', 'N/A')}")
print(f"Precision: {eval_metrics.get('eval_precision', 'N/A')}")
print(f"Recall: {eval_metrics.get('eval_recall', 'N/A')}")
print(f"F1-Score: {eval_metrics.get('eval_f1', 'N/A')}")

Starting evaluation on the test set...



--- Evaluation ---
Sample Decoded Predictions: ['positive', 'positive', 'positive', 'positive', 'positive']
Sample Predicted Numeric Labels (filtered): [1, 1, 1, 1, 1]
Sample True Numeric Labels (filtered): [1, 1, 0, 1, 1]

Classification Report:
              precision    recall  f1-score   support

    negative       0.58      0.12      0.20      2726
    positive       0.68      0.95      0.79      5271

    accuracy                           0.67      7997
   macro avg       0.63      0.54      0.50      7997
weighted avg       0.64      0.67      0.59      7997


Confusion Matrix (Rows: True, Cols: Pred):
       negative  positive
negative: [ 327 2399]
positive: [ 239 5032]
------------------

--- Final Test Set Evaluation Results ---
Evaluation Loss: 0.002551098819822073
Accuracy: 0.6701262973615105
Precision: 0.6432715526495508
Recall: 0.6701262973615105
F1-Score: 0.5899531119508825
