In [None]:
import pandas as pd
import numpy as np
import joblib
from sklearn.svm import SVC
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.utils.class_weight import compute_class_weight
from tqdm import tqdm

In [None]:
# ================== Configuration ==================
# Hyper-parameter search space for GridSearchCV. Keys follow the
# "<pipeline step>__<parameter>" convention, so they target the 'tfidf' and
# 'svm' steps of the pipeline. 2 * 2 * 5 * 2 * 2 = 80 candidate combinations.
tfidf_search_space = {
    'tfidf__max_features': [15000, 20000],    # upper bound on vocabulary size
    'tfidf__ngram_range': [(1, 2), (1, 3)],   # up to bigrams / up to trigrams
}
svm_search_space = {
    'svm__C': np.logspace(-2, 2, num=5),      # [0.01, 0.1, 1, 10, 100]
    'svm__class_weight': ['balanced', None],
    'svm__loss': ['squared_hinge', 'hinge'],
    'svm__max_iter': [2000],                  # single fixed, generous iteration cap
    'svm__dual': [True],                      # NOTE(review): presumably pinned because loss='hinge' needs the dual solver
}
param_grid = {**tfidf_search_space, **svm_search_space}

In [72]:
# Extra text clean-up applied before feature extraction
def clean_review(text):
    """Normalise one raw review value into a non-empty string.

    Args:
        text: The raw cell value; may be a string, a missing value
            (``None``/``NaN``), or any other object.

    Returns:
        ``'no_content'`` when the value is missing or is empty after trimming;
        otherwise ``str(text)`` with leading and trailing whitespace removed.
    """
    # Missing values never reach str(), so they cannot turn into 'nan'/'None'.
    if not pd.notna(text):
        return 'no_content'
    stripped = str(text).strip()
    # An empty or whitespace-only review collapses to the placeholder token.
    return stripped if stripped else 'no_content'

In [73]:
# Load the pre-processed reviews and normalise the review text in one chain.
# NOTE(review): this is an absolute, machine-specific path; the notebook will
# not run elsewhere until it is made configurable or relative.
df = pd.read_csv(
    r"D:\GitHubRepos\is6941-ml-social-media\taptap\data\integrated\cleaned_taptap_reviews.csv"
).assign(
    # Replace the column with its cleaned version (missing/blank -> 'no_content').
    review_content=lambda frame: frame['review_content'].apply(clean_review)
)

In [74]:
# 2. Balanced class weights, computed from the labels of the full dataset.
# NOTE(review): `class_weights` is not referenced by the other cells visible
# here (the search uses class_weight='balanced' or None) - confirm it is needed.
classes = np.unique(df['sentiment'])  # the label values actually present
weights = compute_class_weight(
    class_weight='balanced',
    classes=classes,
    y=df['sentiment'],
)
class_weights = dict(zip(classes, weights))  # label -> weight

In [75]:
# Hold out 20% of the rows for testing; stratifying on the label keeps the
# class proportions the same in both splits, and the fixed seed makes the
# split repeatable.
sentiment_labels = df['sentiment']
X_train, X_test, y_train, y_test = train_test_split(
    df['review_content'],
    sentiment_labels,
    test_size=0.2,
    random_state=42,
    stratify=sentiment_labels,
)

In [76]:
# Build the processing pipeline: TF-IDF features followed by a classifier step.
def whitespace_tokenizer(text):
    """Split a document into tokens on runs of whitespace.

    This is a named module-level function rather than a lambda because a lambda
    cannot be pickled by the standard pickle machinery, which would make the
    fitted pipeline impossible to persist with ``joblib.dump``.

    Args:
        text: The document string handed over by ``TfidfVectorizer``.

    Returns:
        The list of whitespace-separated tokens (empty for a blank document).
    """
    return text.split()


pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(
        tokenizer=whitespace_tokenizer,
        # token_pattern is ignored once a tokenizer is supplied; setting it to
        # None suppresses scikit-learn's "token_pattern will not be used" warning.
        token_pattern=None,
        sublinear_tf=True
    )),
    # Initial estimator for the 'svm' step. The original notebook replaces this
    # step with a LinearSVC via pipeline.set_params in the following cell, and
    # param_grid's svm__loss / svm__dual entries only apply to LinearSVC.
    ('svm', SVC(
        probability=True,  # enable probability estimates
        cache_size=1000,   # larger kernel cache
        random_state=42
    ))
])

In [77]:
# Swap the pipeline's 'svm' step for a LinearSVC. The class_weight, dual and
# max_iter values below are only starting defaults: param_grid overrides all
# three during the grid search.
pipeline.set_params(
    svm=LinearSVC(
        class_weight='balanced',
        dual=False,  # primal formulation; preferred when n_samples > n_features
        max_iter=1000,
        # The grid search forces dual=True, and the dual coordinate-descent
        # solver shuffles the data; a fixed seed keeps results reproducible
        # between runs (same seed as the train/test split).
        random_state=42
    )
)

In [78]:
# Exhaustive search over param_grid with 3-fold cross-validation.
# `scoring` is kept as a dict so more metrics can be reported alongside 'f1'
# (for mean per-class recall, scikit-learn ships the built-in
# 'balanced_accuracy' scorer, so no hand-written scorer is required).
# `refit='f1'` selects the best candidate by weighted F1 and refits it on the
# whole training split.
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring={'f1': 'f1_weighted'},
    refit='f1',
    cv=3,
    n_jobs=-1,   # run the fits on all available CPU cores
    verbose=1,
)
grid_search.fit(X_train, y_train)

Fitting 3 folds for each of 80 candidates, totalling 240 fits




In [79]:
# 6. Report the winning hyper-parameters and their mean cross-validated score
# for the refit metric (weighted F1), rounded to four decimals.
best_params = grid_search.best_params_
best_cv_f1 = round(grid_search.best_score_, 4)
print("\n最佳参数:", best_params)
print("最佳验证F1:", best_cv_f1)


最佳参数: {'svm__C': np.float64(0.1), 'svm__class_weight': 'balanced', 'svm__dual': True, 'svm__loss': 'squared_hinge', 'svm__max_iter': 2000, 'tfidf__max_features': 20000, 'tfidf__ngram_range': (1, 3)}
最佳验证F1: 0.8139


In [80]:
# 7. Evaluate the refit best estimator on the held-out test split.
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)

# Compute both summaries first, then print them.
test_report = classification_report(y_test, y_pred)
test_confusion = confusion_matrix(y_test, y_pred)  # rows: true label, columns: predicted label

print("\n测试集分类报告:")
print(test_report)
print("混淆矩阵:\n", test_confusion)


测试集分类报告:
              precision    recall  f1-score   support

           0       0.68      0.81      0.74      2726
           1       0.89      0.80      0.84      5271

    accuracy                           0.81      7997
   macro avg       0.79      0.81      0.79      7997
weighted avg       0.82      0.81      0.81      7997

混淆矩阵:
 [[2203  523]
 [1036 4235]]
