In [52]:
import os

import joblib
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier, plot_tree

In [53]:
# ================== Configuration ==================
# Hyper-parameter grid explored by the grid search. Keys follow the
# `<pipeline step>__<parameter>` convention of scikit-learn pipelines.
# NOTE(review): the pipeline fixes min_samples_leaf=10, so a node needs at
# least 20 samples before it can be split; min_samples_split values of 10 and
# 20 therefore appear to yield identical trees - consider larger values.
param_grid = dict(
    tfidf__max_features=[8000, 12000],          # cap on vocabulary size
    tfidf__ngram_range=[(1, 1), (1, 2)],        # unigrams only vs. unigrams + bigrams
    tree__max_depth=[15, 20, None],             # maximum tree depth (None = unlimited)
    tree__min_samples_split=[10, 20],           # minimum samples required to split a node
    tree__class_weight=[None, 'balanced'],      # class re-weighting strategy
)

In [54]:
# Extra text clean-up applied before feature engineering.
def clean_review(text):
    """Return a non-empty, whitespace-trimmed string for one review value.

    Missing values (as judged by ``pd.notna``) and values that are empty
    after stripping leading/trailing whitespace are replaced by the
    placeholder ``'no_content'``. Any other value is converted with ``str``
    and returned stripped.
    """
    # Missing value: the original maps it to '' and then to the placeholder.
    if not pd.notna(text):
        return 'no_content'
    stripped = str(text).strip()
    # Empty or whitespace-only reviews get the same placeholder.
    if not stripped:
        return 'no_content'
    return stripped

In [55]:
# Load the preprocessed review data.
# The CSV location can be overridden with the TAPTAP_REVIEWS_CSV environment
# variable so the notebook is not tied to one machine's directory layout;
# when the variable is unset the original hard-coded path is used.
REVIEWS_CSV = os.environ.get(
    "TAPTAP_REVIEWS_CSV",
    r"D:\GitHubRepos\is6941-ml-social-media\taptap\data\integrated\cleaned_taptap_reviews.csv",
)
df = pd.read_csv(REVIEWS_CSV)
# Apply the cleaning step: missing / blank reviews become 'no_content'.
df['review_content'] = df['review_content'].apply(clean_review)

In [56]:
# Train/test split (80/20), stratified on the label so both parts keep the
# same class proportions; the fixed random_state makes the split repeatable.
features = df['review_content']
labels = df['sentiment']
X_train, X_test, y_train, y_test = train_test_split(
    features, labels, test_size=0.2, stratify=labels, random_state=42
)

In [57]:
# Build the processing pipeline: TF-IDF features feeding a decision tree.
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(
        # Tokens are obtained by whitespace splitting (presumably the reviews
        # were word-segmented upstream - confirm against the cleaning step).
        # `str.split` yields the same tokens as `lambda x: x.split()` but,
        # unlike a lambda, it can be pickled, so the fitted pipeline can be
        # persisted with joblib.dump.
        tokenizer=str.split,
        # token_pattern is ignored when a tokenizer is supplied; setting it to
        # None silences the corresponding UserWarning emitted on every fit.
        token_pattern=None,
        sublinear_tf=True  # use 1 + log(tf) instead of raw term frequency
    )),
    ('tree', DecisionTreeClassifier(
        random_state=42,
        min_samples_leaf=10  # pre-pruning: every leaf keeps at least 10 samples
    ))
])

In [58]:
# Hyper-parameter tuning via exhaustive grid search.
search_options = {
    'scoring': 'f1_weighted',  # model selection metric: support-weighted F1
    'cv': 3,                   # 3-fold cross-validation
    'n_jobs': -1,              # use all available CPU cores
    'verbose': 2,              # log progress of each fit
}
grid_search = GridSearchCV(pipeline, param_grid, **search_options)
grid_search.fit(X_train, y_train)

Fitting 3 folds for each of 48 candidates, totalling 144 fits




In [59]:
# Report the winning parameter combination and its mean cross-validated score.
best_params = grid_search.best_params_
best_cv_f1 = round(grid_search.best_score_, 4)
print("\n最佳参数:", best_params)
print("最佳验证F1:", best_cv_f1)


最佳参数: {'tfidf__max_features': 12000, 'tfidf__ngram_range': (1, 2), 'tree__class_weight': None, 'tree__max_depth': None, 'tree__min_samples_split': 10}
最佳验证F1: 0.7213


In [60]:
# Evaluate the best estimator found by the grid search on the held-out test set.
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)

# Compute both summaries first, then print them.
test_report = classification_report(y_test, y_pred)
test_confusion = confusion_matrix(y_test, y_pred)

print("\n测试集分类报告:")
print(test_report)
print("混淆矩阵:\n", test_confusion)


测试集分类报告:
              precision    recall  f1-score   support

           0       0.59      0.56      0.58      2726
           1       0.78      0.80      0.79      5271

    accuracy                           0.72      7997
   macro avg       0.68      0.68      0.68      7997
weighted avg       0.71      0.72      0.72      7997

混淆矩阵:
 [[1536 1190]
 [1068 4203]]
