In [924]:
import pandas as pd
from catboost import CatBoostClassifier, Pool
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, f1_score, confusion_matrix
import jieba
import re

In [925]:
# 在特征工程前添加增强预处理
def clean_review(text):
    # 处理NaN并转换为字符串
    text = str(text) if pd.notna(text) else ''
    # 去除首尾空格
    text = text.strip()
    # 处理纯空格情况
    return 'no_content' if len(text) == 0 else text

In [926]:
# 加载预处理数据
df = pd.read_csv(r"/Users/wangjingwen/Documents/GitHub/is6941-ml-social-media/taptap/data/integrated/cleaned_taptap_reviews.csv")
# 应用清洗
df['review_content'] = df['review_content'].apply(clean_review)

In [927]:
# 仅保留文本内容和目标变量
df = df[['review_content', 'sentiment']].copy()

In [928]:
# 划分特征和目标
X = df.drop('sentiment', axis=1)
y = df['sentiment']

In [929]:
# 数据集划分
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [930]:
# 创建CatBoost数据池（仅包含文本特征）
train_pool = Pool(
    X_train, y_train,
    text_features=['review_content']
)

test_pool = Pool(
    X_test, y_test,
    text_features=['review_content']
)

In [931]:
# 中文优化模型配置
model = CatBoostClassifier(
    task_type='CPU',
    iterations=10,          # 增加迭代次数配合更大的学习率
    learning_rate=0.05,      # 提升学习率加速收敛
    depth=8,                # 降低树深平衡复杂度
    eval_metric='F1',
    loss_function='Logloss',  # 自定义分类边界
    # simple_ctr='Counter:CtrBorderCount=15:Prior=0.5',  # 改进CTR计算
    class_weights=[1.2, 1.1],  # 调整权重比例
    l2_leaf_reg=12,         # 增强正则化
    random_strength=2.0,    # 提升随机性防止过拟合
    thread_count=8,         # 减少线程数防止内存竞争
    bootstrap_type='MVS',   # 更高效的采样方式
    subsample=0.5,         # 子采样比例
    max_ctr_complexity=2,   # 允许二阶特征组合
    leaf_estimation_method='Newton',  # 改进叶子值计算方法
    leaf_estimation_iterations=8,     # 增加迭代次数
    # early_stopping_rounds=30,        # 添加早停机制
    verbose=1
)

In [932]:
# 训练模型
model.fit(train_pool, eval_set=test_pool, verbose=50)

0:	learn: 0.8386147	test: 0.8389960	best: 0.8389960 (0)	total: 212ms	remaining: 1.91s
9:	learn: 0.8390412	test: 0.8390513	best: 0.8391612 (2)	total: 1.77s	remaining: 0us

bestTest = 0.8391611991
bestIteration = 2

Shrink model to first 3 iterations.


<catboost.core.CatBoostClassifier at 0x154221b90>

In [933]:
# 预测和评估
y_pred = model.predict(test_pool)
print("\n测试集分类报告:")
print(classification_report(y_test, y_pred))
print("混淆矩阵:\n", confusion_matrix(y_test, y_pred))


测试集分类报告:
              precision    recall  f1-score   support

           0       0.70      0.71      0.70      2726
           1       0.85      0.84      0.84      5271

    accuracy                           0.80      7997
   macro avg       0.77      0.78      0.77      7997
weighted avg       0.80      0.80      0.80      7997

混淆矩阵:
 [[1939  787]
 [ 840 4431]]
