In [1]:
import re
import jieba
import pandas as pd
import matplotlib.pyplot as plt
from collections import defaultdict

In [2]:
# Directory holding the cleaned comment export and the stopword list.
_DATA_DIR = "/Users/wangjingwen/Documents/GitHub/is6941-ml-social-media/analysis/data"

# Comment data (CSV).
DATA_PATH = _DATA_DIR + "/cleaned_BV1dZwLeKEzG_comments.csv"
# HIT (Harbin Institute of Technology) stopword list.
STOPWORD_PATH = _DATA_DIR + "/hit_stopwords.txt"

# Sentiment lexicon files, keyed by role.
# NOTE(review): these are bare file names, so open() resolves them against the
# current working directory rather than the data directory above -- confirm
# where the lexicon files actually live.
SENTIMENT_DICT = dict(
    pos='hownet_positive.txt',    # positive-word lexicon
    neg='hownet_negative.txt',    # negative-word lexicon
    level='degree_level.txt',     # degree adverbs
    deny='deny_words.txt',        # negation words
)

In [3]:
# ================= Preprocessing =================
# Stopword sets already read from disk, keyed by file path, so the list is
# parsed once per path instead of once per comment.
_STOPWORD_CACHE = {}


def _load_stopwords(path):
    """Return the stopword set stored at ``path``, reading the file only once."""
    if path not in _STOPWORD_CACHE:
        # 'utf-8-sig' also decodes plain UTF-8 and drops a leading BOM, which
        # would otherwise stay glued to the first stopword.
        with open(path, 'r', encoding='utf-8-sig') as f:
            _STOPWORD_CACHE[path] = {line.strip() for line in f}
    return _STOPWORD_CACHE[path]


def preprocess(text):
    """Clean and tokenize one comment.

    Steps:
      1. Delete every character that is neither a word character nor
         whitespace (i.e. punctuation and symbols). Digits and underscores
         are kept, because ``\\w`` matches them.
      2. Segment the remaining text with ``jieba.lcut``.
      3. Drop tokens found in the stopword list at ``STOPWORD_PATH``.

    Args:
        text: The raw comment. Anything that is not a ``str`` (for example
            the float NaN pandas uses for missing cells) is treated as an
            empty comment.

    Returns:
        list[str]: The remaining tokens in their original order; ``[]`` for
        non-string input.
    """
    # Missing values would make re.sub raise TypeError; treat them as empty.
    if not isinstance(text, str):
        return []

    # Strip punctuation / special symbols.
    text = re.sub(r'[^\w\s\u4e00-\u9fa5]', '', text)
    # Word segmentation.
    words = jieba.lcut(text)
    # Stopword lookup (cached, so the file is not re-read for every comment).
    stopwords = _load_stopwords(STOPWORD_PATH)
    return [w for w in words if w not in stopwords]

In [16]:
# ================= Sentiment lexicon loading =================
def _read_lexicon_lines(path):
    """Yield ``(line_number, stripped_line)`` for each non-blank line of ``path``."""
    # 'utf-8-sig' also decodes plain UTF-8 and drops a leading BOM, which
    # would otherwise become part of the first entry.
    with open(path, 'r', encoding='utf-8-sig') as f:
        for line_number, line in enumerate(f, start=1):
            stripped = line.strip()
            if stripped:
                yield line_number, stripped


def load_sentiment_resources(paths=None):
    """Load every sentiment lexicon into memory.

    Args:
        paths: Optional mapping with the keys ``'pos'``, ``'neg'``,
            ``'level'`` and ``'deny'`` pointing at the lexicon files.
            Defaults to the module-level ``SENTIMENT_DICT``.

    Returns:
        dict with four entries:
            ``'pos'``    -- set of positive words (first tab-separated field
                            of each line),
            ``'neg'``    -- set of negative words (same format),
            ``'degree'`` -- dict mapping a degree adverb to its float weight,
            ``'deny'``   -- set of negation words (one per line).

    Raises:
        ValueError: If a line of the degree file is not ``word<sep>weight``
            or the weight is not a number. The message names the file and
            line number.
        OSError: If a lexicon file cannot be opened.
    """
    if paths is None:
        paths = SENTIMENT_DICT

    resources = {}

    # Polarity lexicons: only the first tab-separated column is the word.
    for key in ('pos', 'neg'):
        resources[key] = {
            entry.split('\t')[0] for _, entry in _read_lexicon_lines(paths[key])
        }

    # Degree adverbs: "<word><whitespace><weight>". Splitting from the right
    # on any whitespace accepts both tab- and space-separated files.
    level_path = paths['level']
    degree = {}
    for line_number, entry in _read_lexicon_lines(level_path):
        fields = entry.rsplit(None, 1)
        if len(fields) != 2:
            raise ValueError(
                "{}:{}: expected '<word>\\t<weight>', got {!r}".format(
                    level_path, line_number, entry))
        word, score = fields
        try:
            degree[word] = float(score)
        except ValueError:
            raise ValueError(
                "{}:{}: weight {!r} is not a number".format(
                    level_path, line_number, score)) from None
    resources['degree'] = degree

    # Negation words: one per line.
    resources['deny'] = {entry for _, entry in _read_lexicon_lines(paths['deny'])}

    return resources

In [17]:
# ================= Sentiment analysis core =================
class SentimentAnalyzer:
    """Lexicon-based sentiment scorer for a tokenized comment.

    Each positive word contributes +1 and each negative word -1. A negation
    word flips the sign, and a degree adverb multiplies the value, of the
    sentiment words that follow it within its reach.

    Args:
        resources: Optional mapping with the keys ``'pos'``, ``'neg'``,
            ``'deny'`` (containers of words) and ``'degree'`` (word ->
            weight). When omitted, the lexicons are read from disk via
            ``load_sentiment_resources()``.
        window_size: Reach of a negation / degree word (default 3).
    """

    def __init__(self, resources=None, window_size=3):
        self.resources = load_sentiment_resources() if resources is None else resources
        self.window_size = window_size  # reach of negation / degree words

    def _collect_modifiers(self, words):
        """Map token position -> ordered list of multiplicative factors.

        A negation word contributes the factor -1 and a degree adverb its
        weight to each position inside its reach.
        """
        deny_words = self.resources['deny']
        degree_words = self.resources['degree']
        modifiers = defaultdict(list)
        total = len(words)

        for index, word in enumerate(words):
            is_deny = word in deny_words
            is_degree = word in degree_words
            if not (is_deny or is_degree):
                continue

            # NOTE(review): the reach ends at index + window_size (exclusive),
            # so a modifier affects the next window_size - 1 tokens -- i.e.
            # the window counts the modifier itself. If the window was meant
            # to cover window_size *following* tokens, the bound should be
            # index + 1 + window_size. Behaviour kept as-is; confirm intent.
            reach_end = min(index + self.window_size, total)
            for target in range(index + 1, reach_end):
                if is_deny:
                    modifiers[target].append(-1)
                if is_degree:
                    modifiers[target].append(degree_words[word])
        return modifiers

    def analyze_sentence(self, words):
        """Return the sentiment score of one tokenized comment.

        Args:
            words: Sequence of tokens in sentence order.

        Returns:
            The sum over all sentiment words of their polarity (+1 / -1)
            multiplied by every modifier factor that reaches them. ``0`` for
            an empty sequence or when no sentiment word is present.
        """
        modifiers = self._collect_modifiers(words)
        positive = self.resources['pos']
        negative = self.resources['neg']

        sentiment_score = 0
        for index, word in enumerate(words):
            if word in positive:
                current_score = 1
            elif word in negative:
                current_score = -1
            else:
                # Non-sentiment tokens contribute nothing, modified or not.
                continue

            for factor in modifiers.get(index, ()):
                current_score *= factor
            sentiment_score += current_score

        return sentiment_score

In [18]:
# ================= Main pipeline =================
# Score cut-offs for the three sentiment classes.
POSITIVE_THRESHOLD = 0.3
NEGATIVE_THRESHOLD = -0.3

# Fixed colour per sentiment label, so a class keeps its colour no matter how
# the slices happen to be ordered.
SENTIMENT_COLORS = {
    '积极': '#66b3ff',
    '中性': '#99ff99',
    '消极': '#ff9999',
}


def classify_score(score):
    """Map a sentiment score to its label.

    Returns '积极' for scores >= POSITIVE_THRESHOLD, '消极' for scores
    <= NEGATIVE_THRESHOLD, and '中性' otherwise.
    """
    if score >= POSITIVE_THRESHOLD:
        return '积极'
    if score <= NEGATIVE_THRESHOLD:
        return '消极'
    return '中性'


def build_pie_style(labels):
    """Return ``(colors, explode)`` sized to ``labels``.

    ``explode`` pulls out only the first slice. Both sequences have exactly
    ``len(labels)`` entries, which ``pie`` requires for ``explode``.
    """
    colors = [SENTIMENT_COLORS.get(label, '#cccccc') for label in labels]
    explode = tuple(0.05 if position == 0 else 0 for position in range(len(labels)))
    return colors, explode


if __name__ == "__main__":
    # Load the data; missing comment cells are dropped because they carry no text.
    df = pd.read_csv(DATA_PATH)
    comments = df['评论内容'].dropna().astype(str).tolist()

    # Build the analyzer (reads the lexicons from disk).
    analyzer = SentimentAnalyzer()

    # Score and label every comment.
    results = [
        classify_score(analyzer.analyze_sentence(preprocess(comment)))
        for comment in comments
    ]

    # Tally labels; value_counts orders them from most to least frequent.
    count = pd.Series(results, dtype=object).value_counts()

    if count.empty:
        print('No comments to analyze; skipping the chart.')
    else:
        # Visualization: the most frequent class is the exploded slice.
        colors, explode = build_pie_style(list(count.index))
        fig, ax = plt.subplots(figsize=(10, 6))
        ax.pie(count.values,
               labels=count.index,
               autopct='%1.1f%%',
               startangle=90,
               colors=colors,
               explode=explode,
               shadow=True)
        ax.set_title('用户评论情感分布 (n={})'.format(len(comments)), fontsize=14)
        ax.axis('equal')
        fig.savefig('sentiment_analysis.png', dpi=300, bbox_inches='tight')
        plt.show()

FileNotFoundError: [Errno 2] No such file or directory: 'hownet_positive.txt'