In [23]:
import os

import pandas as pd
from sklearn.metrics import classification_report, confusion_matrix
from snownlp import SnowNLP
from tqdm import tqdm

In [24]:
def preprocess_text(text):
    """Normalise a raw cell value into a whitespace-trimmed string.

    Args:
        text: Any scalar value read from the data (string, number, None, NaN, ...).

    Returns:
        ``""`` when ``pd.isna(text)`` is true, otherwise ``str(text)`` with
        leading and trailing whitespace removed.
    """
    # Missing markers (None / NaN / NaT) collapse to the empty string; every
    # other value is stringified first so non-string cells cannot break .strip().
    return "" if pd.isna(text) else str(text).strip()

In [25]:
# ================== Custom threshold optimisation ==================
def find_optimal_threshold(scores, labels, thresholds=None):
    """Find the score cut-off that maximises the F1 score against ``labels``.

    A sample is predicted as class 1 when its score is strictly greater than
    the candidate threshold, otherwise as class 0.

    Args:
        scores: Sequence of numeric scores (list, tuple, ndarray or Series).
        labels: Ground-truth labels, passed positionally to
            ``sklearn.metrics.f1_score`` alongside the predictions.
        thresholds: Optional iterable of candidate thresholds. Defaults to the
            grid 0.20, 0.21, ..., 0.79.

    Returns:
        The first candidate threshold with the highest F1. Falls back to 0.5
        when no candidate reaches an F1 above 0 (or when the grid is empty).

    Note:
        ``f1_score`` is called with its default settings. If the result equals
        the first or last grid value, the true optimum may lie outside the
        searched range; consider passing a wider ``thresholds`` grid.
    """
    # Kept local so the function stays self-contained within its cell.
    from sklearn.metrics import f1_score

    if thresholds is None:
        thresholds = [x / 100 for x in range(20, 80)]

    # Materialise the scores once as an ndarray so plain Python sequences work
    # too (a bare list does not support `> thresh` or `.astype`).
    score_values = pd.Series(scores).to_numpy()

    best_thresh = 0.5
    best_f1 = 0
    for thresh in thresholds:
        preds = (score_values > thresh).astype(int)
        f1 = f1_score(labels, preds)
        # Strict comparison: on ties the earliest candidate wins.
        if f1 > best_f1:
            best_f1 = f1
            best_thresh = thresh
    return best_thresh

In [26]:
def safe_snownlp(text):
    """Score ``text`` with SnowNLP without ever raising.

    Args:
        text: Review text. Expected to be a string; any non-string value
            (``None``, ``NaN``, numbers, ...) is treated as having no text.

    Returns:
        ``SnowNLP(text).sentiments`` for usable text, or the neutral score
        ``0.5`` when the input is not a string, is empty or whitespace-only,
        or SnowNLP raises while processing it.
    """
    # The type check must come before .strip(): this guard sits outside the
    # try block, so a None/NaN cell would otherwise raise AttributeError and
    # abort the whole apply() run.
    if not isinstance(text, str) or not text.strip():
        return 0.5  # neutral score for "no text"

    try:
        return SnowNLP(text).sentiments
    except Exception as e:
        # Deliberate best-effort: report the offending text and keep going
        # with a neutral score instead of failing the whole batch.
        print(f"处理异常文本: {text}，错误: {str(e)}")
        return 0.5

In [27]:
# 1. Load the pre-cleaned review data.
# The CSV location can be overridden with the TAPTAP_REVIEWS_CSV environment
# variable so the notebook also runs on machines without the original
# author's directory layout; the default is the original absolute path.
REVIEWS_CSV_PATH = os.environ.get(
    "TAPTAP_REVIEWS_CSV",
    r"D:\GitHubRepos\is6941-ml-social-media\taptap\data\integrated\cleaned_taptap_reviews.csv",
)
df = pd.read_csv(REVIEWS_CSV_PATH)

# Fail fast: 'sentiment' is only read after the slow scoring step, so a missing
# column would otherwise surface only after that work has been done.
missing_columns = {'review_content', 'sentiment'} - set(df.columns)
if missing_columns:
    raise KeyError(f"Input CSV is missing required columns: {sorted(missing_columns)}")

# Coerce every review to a stripped string (missing values become "").
df['review_content'] = df['review_content'].apply(preprocess_text)

In [28]:
# 2. Compute a sentiment score for every review, with a tqdm progress bar.
# NOTE: despite what the earlier comment suggested, no parallel execution is
# configured in this cell; progress_apply is called like a regular apply.
tqdm.pandas(desc="情感分析进度")
df['snownlp_score'] = df['review_content'].progress_apply(safe_snownlp)

情感分析进度: 100%|██████████| 39985/39985 [01:10<00:00, 567.62it/s] 


In [29]:
# 3. Search for the classification threshold with the best F1.
# NOTE(review): the threshold is tuned on the same rows that are evaluated
# further down, so the reported metrics are optimistic; consider tuning on a
# held-out split instead.
optimal_thresh = find_optimal_threshold(
    scores=df['snownlp_score'],
    labels=df['sentiment'],
)
print(f"最优分类阈值: {optimal_thresh:.2f}")

最优分类阈值: 0.20


In [30]:
# 4. Turn scores into binary predictions: 1 when the score is strictly above
#    the chosen threshold, otherwise 0 (a NaN score compares False and maps to 0).
df['pred'] = df['snownlp_score'].gt(optimal_thresh).astype('int64')

In [31]:
# 5. Evaluate the predictions against the ground-truth labels.
y_true = df['sentiment']
y_pred = df['pred']

print("\n分类报告:")
print(classification_report(y_true, y_pred))

# In the confusion matrix, rows are true classes and columns are predicted classes.
print("\n混淆矩阵:")
print(confusion_matrix(y_true, y_pred))


分类报告:
              precision    recall  f1-score   support

           0       0.56      0.47      0.51     13632
           1       0.75      0.81      0.78     26353

    accuracy                           0.69     39985
   macro avg       0.65      0.64      0.64     39985
weighted avg       0.68      0.69      0.69     39985


混淆矩阵:
[[ 6356  7276]
 [ 5005 21348]]
