In [12]:
import pandas as pd
import joblib
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from tqdm import tqdm

In [13]:
# 在特征工程前添加增强预处理
def clean_review(text):
    # 处理NaN并转换为字符串
    text = str(text) if pd.notna(text) else ''
    # 去除首尾空格
    text = text.strip()
    # 处理纯空格情况
    return 'no_content' if len(text) == 0 else text

In [14]:
# ================== 配置参数 ==================
# 修改TF-IDF配置
TFIDF_PARAMS = {
    "tokenizer": lambda x: x.split() if x != 'no_content' else ['no_content'],
    "ngram_range": (1, 2),
    "max_features": 5000,
    "sublinear_tf": True
}

MODEL_PARAMS = {
    "solver": "liblinear",    # 适合小数据集
    "max_iter": 1000,         # 确保收敛
    "random_state": 42
}

In [15]:
# 1. 加载预处理数据
df = pd.read_csv(r"D:\GitHubRepos\is6941-ml-social-media\taptap\data\integrated\cleaned_taptap_reviews.csv")
# 应用清洗
df['review_content'] = df['review_content'].apply(clean_review)

In [16]:
# 2. 特征工程
tfidf = TfidfVectorizer(**TFIDF_PARAMS)
X = tfidf.fit_transform(df['review_content'])
y = df['sentiment']



In [17]:
# 3. 划分数据集
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)
print(f"\n训练集: {X_train.shape[0]} | 测试集: {X_test.shape[0]}")


训练集: 31988 | 测试集: 7997


In [19]:
# 4. 模型训练
model = LogisticRegression(**MODEL_PARAMS)
model.fit(X_train, y_train)

In [None]:
# 5. 模型评估
y_pred = model.predict(X_test)
print("\nClassification Report:")
print(classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))


分类报告:
              precision    recall  f1-score   support

           0       0.75      0.64      0.69      2726
           1       0.83      0.89      0.86      5271

    accuracy                           0.81      7997
   macro avg       0.79      0.77      0.77      7997
weighted avg       0.80      0.81      0.80      7997

混淆矩阵:
 [[1744  982]
 [ 574 4697]]
