In [1]:
import pandas as pd
import numpy as np
import joblib
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix, roc_curve, auc
import matplotlib.pyplot as plt
import xgboost as xgb
from tqdm.auto import tqdm
from sklearn.model_selection import ParameterGrid

ModuleNotFoundError: No module named 'joblib'

In [4]:
# ================== 配置参数 ==================
# TFIDF_PARAMS = {
#     "tokenizer": lambda x: x.split(),
#     "ngram_range": (1, 2),          # 包含单字和双词组合
#     "max_features": 15000,          # 平衡特征维度与计算成本
#     "sublinear_tf": True            # 亚线性缩放
# }

# XGB_PARAMS = {
#     "objective": "binary:logistic",
#     "eval_metric": "logloss",
#     "tree_method": "gpu_hist",      # GPU加速
#     "random_state": 42,
#     "verbosity": 0                  # 减少输出噪音
# }
TFIDF_PARAMS = {
    "tokenizer": lambda x: x.split(),
    "ngram_range": (1, 2),
    "max_features": 10000,          # 降低特征维度以节省内存
    "sublinear_tf": True
}

XGB_PARAMS = {
    "objective": "binary:logistic",
    "eval_metric": "logloss",
    "tree_method": "hist",          # 改为CPU优化的直方图算法
    "random_state": 42,
    "n_jobs": 4,                    # 根据Mac CPU核心数调整（M1/M2通常4性能核）
    "verbosity": 0
}

In [5]:
# 在特征工程前添加增强预处理
def clean_review(text):
    # 处理NaN并转换为字符串
    text = str(text) if pd.notna(text) else ''
    # 去除首尾空格
    text = text.strip()
    # 处理纯空格情况
    return 'no_content' if len(text) == 0 else text

In [6]:
# 加载预处理数据
df = pd.read_csv(r"/Users/wangjingwen/Documents/GitHub/is6941-ml-social-media/taptap/data/integrated/cleaned_taptap_reviews.csv")
# 应用清洗
df['review_content'] = df['review_content'].apply(clean_review)

In [7]:
# 划分数据集
X_train, X_test, y_train, y_test = train_test_split(
    df['review_content'],
    df['sentiment'],
    test_size=0.2,
    stratify=df['sentiment'],
    random_state=42
)

In [8]:
# ================== 构建处理管道 ==================
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(**TFIDF_PARAMS)),
    ('xgb', xgb.XGBClassifier(**XGB_PARAMS))
])

In [9]:
# ================== 网格搜索调参 ==================
param_grid = {
    'xgb__max_depth': [4],                   # 控制树复杂度
    'xgb__learning_rate': [0.05, 0.1],       # 学习率范围
    'xgb__subsample': [0.8, 1.0],            # 行采样比例
    'xgb__colsample_bytree': [0.8, 1.0],     # 列采样比例
    'xgb__gamma': [0, 0.1],                  # 节点分裂最小损失下降
    'xgb__scale_pos_weight': [               # 自动计算类别权重
        len(y_train[y_train==0])/len(y_train[y_train==1]),  # 负/正样本比
        1.5                                   # 经验系数
    ]
}

In [10]:
# 初始化搜索器
# grid_search = GridSearchCV(
#     pipeline,
#     param_grid,
#     scoring='f1_weighted',
#     cv=3,
#     n_jobs=-1,
#     verbose=1
# )
grid_search = GridSearchCV(
    pipeline,
    param_grid,
    scoring='f1_weighted',
    cv=2,                                  # 减少交叉验证折数
    n_jobs=2,                              # 降低并行度以避免内存问题
    verbose=1
)
print("开始训练...")
grid_search.fit(X_train, y_train)

开始训练...
Fitting 2 folds for each of 32 candidates, totalling 64 fits


Traceback (most recent call last):
  File "/opt/anaconda3/envs/IS6941/lib/python3.11/site-packages/sklearn/model_selection/_validation.py", line 949, in _score
    scores = scorer(estimator, X_test, y_test, **score_params)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/anaconda3/envs/IS6941/lib/python3.11/site-packages/sklearn/metrics/_scorer.py", line 288, in __call__
    return self._score(partial(_cached_call, None), estimator, X, y_true, **_kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/anaconda3/envs/IS6941/lib/python3.11/site-packages/sklearn/metrics/_scorer.py", line 380, in _score
    y_pred = method_caller(
             ^^^^^^^^^^^^^^
  File "/opt/anaconda3/envs/IS6941/lib/python3.11/site-packages/sklearn/metrics/_scorer.py", line 90, in _cached_call
    result, _ = _get_response_values(
                ^^^^^^^^^^^^^^^^^^^^^
  File "/opt/anaconda3/envs/IS6941/lib/python3.11/site-packages/sk

In [12]:
# ================== 模型评估 ==================
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)
y_proba = best_model.predict_proba(X_test)[:, 1]

print("\n最佳参数:", grid_search.best_params_)
print("验证集最佳F1:", round(grid_search.best_score_, 4))

print("\n测试集分类报告:")
print(classification_report(y_test, y_pred))
print("混淆矩阵:\n", confusion_matrix(y_test, y_pred))

AttributeError: 'super' object has no attribute '__sklearn_tags__'