In [30]:
import pandas as pd
import numpy as np
import joblib
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix
from tqdm import tqdm

In [31]:
# Extra text normalisation applied before feature engineering
def clean_review(text):
    """Normalise one raw review value into a non-empty string.

    Missing values (anything pandas reports as NA) are treated as empty
    text. Every other value is converted with ``str()`` and stripped of
    leading/trailing whitespace. When nothing is left after stripping,
    the placeholder ``'no_content'`` is returned instead.

    Args:
        text: Raw cell value; may be a string, NaN/None, or any other scalar.

    Returns:
        The stripped text, or ``'no_content'`` if it would be empty.
    """
    if pd.isna(text):
        # A missing value is equivalent to empty text -> placeholder
        return 'no_content'
    stripped = str(text).strip()
    # Whitespace-only input collapses to '' and falls back to the placeholder
    return stripped or 'no_content'

In [32]:
# ================== Configuration ==================
# Hyper-parameter search space handed to GridSearchCV. Each key has the form
# "<pipeline step name>__<parameter name>", so the prefixes (tfidf / svd / knn)
# must match the step names of the pipeline being tuned.
param_grid = dict(
    tfidf__max_features=[5000, 10000],       # cap on the TF-IDF vocabulary size
    tfidf__ngram_range=[(1, 1), (1, 2)],     # unigrams only vs. unigrams + bigrams
    svd__n_components=[200, 300],            # dimensionality after SVD reduction
    knn__n_neighbors=[3, 5, 7],              # number of neighbours (k)
    knn__weights=['uniform', 'distance'],    # how neighbour votes are weighted
    knn__metric=['cosine', 'euclidean'],     # distance metric
)

In [33]:
# Load the pre-processed review data and normalise the review text in one step.
# NOTE(review): the path is an absolute, machine-specific location, so this
# cell only runs where that exact file exists.
df = pd.read_csv(
    r"D:\GitHubRepos\is6941-ml-social-media\taptap\data\integrated\cleaned_taptap_reviews.csv"
).assign(
    # Replace the column with its cleaned version (NaN / blank -> 'no_content')
    review_content=lambda frame: frame['review_content'].apply(clean_review)
)

In [34]:
# Hold out 20% of the rows as a test set. The split is stratified on the
# label so both parts keep the class distribution of the full data, and the
# seed is fixed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    df['review_content'],
    df['sentiment'],
    stratify=df['sentiment'],
    test_size=0.2,
    random_state=42,
)

In [35]:
# ================== Model construction ==================
# Text-classification pipeline: TF-IDF features -> truncated SVD -> k-NN.
# The step names (tfidf / svd / knn) are referenced by the "<step>__<param>"
# keys of the hyper-parameter grid, so they must not be renamed.
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(
        # Whitespace tokenisation. `str.split(doc)` is equivalent to the former
        # `lambda x: x.split()`, but unlike a lambda it can be pickled, so the
        # fitted pipeline can be persisted with joblib / pickle.
        tokenizer=str.split,
        # With a custom tokenizer the regex token_pattern is never used; passing
        # None states that explicitly and silences scikit-learn's
        # "token_pattern will not be used" warning.
        token_pattern=None,
        sublinear_tf=True   # scale term frequency as 1 + log(tf)
    )),
    # TruncatedSVD uses a randomized solver by default; fix the seed (same
    # value as the train/test split) so repeated runs give identical results.
    ('svd', TruncatedSVD(random_state=42)),
    ('knn', KNeighborsClassifier())
])

In [36]:
# Set up an exhaustive grid search of `param_grid` over the pipeline
grid_search = GridSearchCV(
    pipeline,
    param_grid,
    cv=3,                     # 3-fold cross-validation
    scoring='f1_weighted',    # select the model by weighted F1
    verbose=3,                # emit a detailed progress log
    n_jobs=-1,                # use every available CPU core
)

In [37]:
# Run the grid search. First announce how many parameter combinations will be
# tried: the product of the number of options listed for each grid entry.
print("开始网格搜索... 共需训练 {} 个参数组合".format(
    np.prod(list(map(len, param_grid.values())))
))
grid_search.fit(X_train, y_train)

开始网格搜索... 共需训练 96 个参数组合
Fitting 3 folds for each of 96 candidates, totalling 288 fits




In [38]:
# Report the winning hyper-parameter combination and its cross-validated
# score (weighted F1, as configured in the grid search), to 4 decimals.
print("\n最佳参数组合:", grid_search.best_params_)
print(f"最佳验证F1分数: {grid_search.best_score_:.4f}")


最佳参数组合: {'knn__metric': 'cosine', 'knn__n_neighbors': 7, 'knn__weights': 'uniform', 'svd__n_components': 300, 'tfidf__max_features': 5000, 'tfidf__ngram_range': (1, 1)}
最佳验证F1分数: 0.7400


In [39]:
# Evaluate the best estimator found by the grid search on the held-out test set
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)
# Header and per-class precision/recall/F1 report, one per line
print("\n测试集分类报告:", classification_report(y_test, y_pred), sep="\n")
# Confusion matrix: rows are true labels, columns are predicted labels
print("混淆矩阵:\n", confusion_matrix(y_test, y_pred))


测试集分类报告:
              precision    recall  f1-score   support

           0       0.66      0.52      0.58      2726
           1       0.78      0.86      0.82      5271

    accuracy                           0.75      7997
   macro avg       0.72      0.69      0.70      7997
weighted avg       0.74      0.75      0.74      7997

混淆矩阵:
 [[1425 1301]
 [ 737 4534]]
