In [6]:
import os
import time
import threading
import pandas as pd
from functools import partial
from tqdm import tqdm
from multiprocessing import Pool
from nltk.tokenize import word_tokenize
from concurrent.futures import ThreadPoolExecutor 

In [2]:
# 构建情感词列表
def build_emtion_list(lexion_content):
    Positive, Negative, Anger, Anticipation, Disgust, Fear, Joy, Sadness, Surprise, Trust = [[] for _ in range(10)]

    # 遍历每一行，将情感值为1的词语添加到相应的情感列表中
    lines = lexion_content.split('\n')
    for line in lines:
        parts = line.split('\t')
        if len(parts) == 3:  # 确保行包含三个部分
            word, emotion, value = parts
            if value == '1':
                if emotion == 'positive':
                    Positive.append(word)
                elif emotion == 'negative':
                    Negative.append(word)
                elif emotion == 'anger':
                    Anger.append(word)
                elif emotion == 'anticipation':
                    Anticipation.append(word)
                elif emotion == 'disgust':
                    Disgust.append(word)
                elif emotion == 'fear':
                    Fear.append(word)
                elif emotion == 'joy':
                    Joy.append(word)
                elif emotion == 'sadness':
                    Sadness.append(word)
                elif emotion == 'surprise':
                    Surprise.append(word)
                elif emotion == 'trust':
                    Trust.append(word)
        else:
            print(line)
            
    return Positive, Negative, Anger, Anticipation, Disgust, Fear, Joy, Sadness, Surprise, Trust

In [3]:
# 计算一条文本的情感词频，返回series
def emotion_caculate(text,Positive, Negative, Anger, Anticipation, Disgust, Fear, Joy, Sadness, Surprise, Trust):
    positive, negative, anger, anticipation, disgust, fear, joy, sadness, surprise, trust = [0 for i in range(10)]
    
    text = text.lower()
    wordlist = word_tokenize(text)
    wordset = set(wordlist)
    wordfreq = []
    for word in wordset:
        freq = wordlist.count(word)
        if word in Positive:
            positive+=freq
        if word in Negative:
            negative+=freq
        if word in Anger:
            anger+=freq  
        if word in Anticipation:
            anticipation+=freq
        if word in Disgust:
            disgust+=freq
        if word in Fear:
            fear+=freq
        if word in Joy:
            joy+=freq
        if word in Sadness:
            sadness+=freq
        if word in Surprise:
            surprise+=freq
        if word in Trust:
            trust+=freq
            
    emotion_info = {
        'positive': positive,
        'negative': negative,
        'anger': anger,
        'anticipation': anticipation,
        'disgust': disgust,
        'fear':fear,
        'joy':joy,
        'sadness':sadness,
        'surprise':surprise,
        'trust':trust,
        'length':len(wordlist)
    }
    indexs = ['length', 'positive', 'negative', 'anger', 'anticipation','disgust','fear','joy','sadness','surprise','trust']
    return pd.Series(emotion_info, index=indexs)

In [4]:
# 大规模计算情感值，返回数据中全部月中每一条新闻的情感值
def main():
    with open(r'D:\zhenfeng zhou\The Civil War\analyse_code\emtion_analysis\NRC-Emotion-Lexicon\NRC-Emotion-Lexicon-Wordlevel-v0.92.txt', 'r',encoding = 'utf-8') as file:
        lexion_content = file.read()
        Positive, Negative, Anger, Anticipation, Disgust, Fear, Joy, Sadness, Surprise, Trust = build_emtion_list(lexion_content)
        
    # 给数据框添加进度条
    tqdm.pandas()
    file_path = r"D:\zhenfeng zhou\The Civil War\data\only_black_news.csv"
    df = pd.read_csv(file_path)
    df['filtered_text'] = df['filtered_text'].fillna('')
    
    emotion_data = df['filtered_text'].progress_apply(lambda text: emotion_caculate(text, Positive, Negative, Anger, Anticipation, Disgust, Fear, Joy, Sadness, Surprise, Trust))
    
    # 将计算结果合并到原 DataFrame 中
    df = pd.concat([df, emotion_data], axis=1)

    # 写入本地
    df.to_csv(r'D:\zhenfeng zhou\The Civil War\data\only_black_news_emotion.csv',index = False)
    
    return 'done'

In [7]:
# 运行
if __name__ == "__main__":
    main()




100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 270550/270550 [1:04:20<00:00, 70.07it/s]
