In [1]:
# Import libraries
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings("ignore", category=UserWarning, module='transformers')

from   datetime import datetime
from tqdm.notebook import tqdm
import os
import glob
import pysentiment2 as ps
import nltk
from   nltk.sentiment.vader import SentimentIntensityAnalyzer
from   IPython.core.display import HTML

from transformers import pipeline
from transformers import AutoTokenizer, AutoModelForSequenceClassification

import torch

In [2]:
## Set up the sentiment analysis tools

# Fetch the lexicon used by the VADER analyzer
nltk.download('vader_lexicon')

# Dictionary-based analyzers: Loughran-McDonald, HIV-4 and VADER
lm, hiv4 = ps.LM(), ps.HIV4()
vader = SentimentIntensityAnalyzer()

# Run on the first GPU when CUDA is available, otherwise fall back to the CPU
# (alternative kept from the original, disabled):
# device = torch.device("mps"if torch.backends.mps.is_available()else "cpu")
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

# FinBERT: tokenizer and pretrained classification model from the same checkpoint
finbert_tokenizer = AutoTokenizer.from_pretrained("ProsusAI/finbert")
finbert_model = AutoModelForSequenceClassification.from_pretrained("ProsusAI/finbert")

# Wrap model + tokenizer in a sentiment-analysis pipeline bound to the chosen device
finbert_classifier = pipeline(
    "sentiment-analysis",
    model=finbert_model,
    tokenizer=finbert_tokenizer,
    device=device,
)

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\WangH\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
A matching Triton is not available, some optimizations will not be enabled.
Error caught was: No module named 'triton'


In [3]:
def sentimentAnalysisLMD(text):
    """Score ``text`` with the module-level Loughran-McDonald analyzer ``lm``.

    The text is first tokenized with ``lm.tokenize`` (per the original note,
    this keeps the words relevant to the LM dictionary) and the tokens are
    then passed to ``lm.get_score``. The score object is returned unchanged;
    the processing loop in this notebook reads its ``Polarity`` entry.
    """
    return lm.get_score(lm.tokenize(text))

def sentimentAnalysisHIV4(text):
    """Score ``text`` with the module-level HIV-4 analyzer ``hiv4``.

    The text is tokenized with ``hiv4.tokenize`` and the tokens are passed to
    ``hiv4.get_score``; the resulting score object is returned unchanged.
    """
    return hiv4.get_score(hiv4.tokenize(text))

def sentimentAnalysisVader(df):
    """Return the VADER compound score for every row of ``df['Text']``.

    Parameters
    ----------
    df : DataFrame-like
        Data containing a ``Text`` column; each value is passed to the
        module-level ``vader.polarity_scores``.

    Returns
    -------
    pandas.Series
        Float series named ``vader_compound`` that carries the same index as
        the input, so assigning it back to ``df`` lines up row by row.

    Notes
    -----
    The scores are attached to the input's own index instead of being joined
    from a freshly numbered frame. With a default 0..n-1 index the result is
    the same as before; with any other index (e.g. after filtering or
    de-duplicating without ``reset_index``) the scores no longer turn into
    NaN / land on the wrong rows. An empty input yields an empty series
    rather than a ``KeyError``.
    """
    frame = pd.DataFrame(df, columns=['Text'])
    compound = [
        vader.polarity_scores(text)['compound']
        for text in tqdm(frame['Text'], desc="Processing with VADER")
    ]
    return pd.Series(compound, index=frame.index, name='vader_compound', dtype='float64')

def sentimentAnalysisFinBert(df):
    """Return one signed FinBERT score per row of ``df['Text']``.

    Each text is classified on its own with the module-level
    ``finbert_classifier`` (``truncation=True``) and only the first result is
    used. The classifier's ``score`` is kept as-is for a ``"positive"`` label,
    negated for a ``"negative"`` label, and replaced by ``0`` for any other
    label.

    Returns a plain list in the same order as the rows of ``df``.
    """
    def _signed(prediction):
        # Map the label/score pair onto a single signed number
        label = prediction['label']
        if label == "positive":
            return prediction['score']
        if label == "negative":
            return prediction['score']*(-1)
        return 0

    texts = pd.DataFrame(df, columns=['Date', 'Text'])['Text']
    return [
        _signed(finbert_classifier(text, truncation=True)[0])
        for text in tqdm(texts, desc="Processing with FinBERT")
    ]


In [4]:
# Input: one CSV of articles per country.
Path = "../Data/NER/count_3/MW_NER/*.csv"
# Output folder. The same folder is used for the "already processed" check
# and for writing, so a finished country is skipped on the next run.
OUTPUT_DIR = '../Data/Sentiment_Scores/count_3/MW_Sentiment_Scores'
count = 0

lst_files = sorted(glob.glob(Path))

# Create the destination up front so the expensive scoring below is not lost
# to a missing-directory error when the results are written.
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Calendar that every country's daily series is aligned to (days without
# articles become NaN rows).
idx = pd.date_range('2012-01-01', '2023-06-30')

for file in lst_files:
    file_name = os.path.basename(file)  # Get the file name from the full file path
    country_name = os.path.splitext(file_name)[0]  # Remove the file extension
    country_name = country_name.replace("_articles", "")  # Remove the "_articles" part of the file name

    daily_csv = '{0}/{1}.csv'.format(OUTPUT_DIR, country_name)
    original_csv = '{0}/{1}_original.csv'.format(OUTPUT_DIR, country_name)

    # Skip countries whose daily scores were already written
    if os.path.isfile(daily_csv):
        print("File already exists: {0}.csv".format(country_name))
        continue

    print(file)
    df = pd.read_csv(file)

    # Nothing to score for an empty file
    if df.shape[0] == 0:
        continue

    df = df.drop_duplicates(['Date', 'Headline'], keep='last')
    # 'Unnamed: 0' is the index column left by an earlier to_csv; tolerate its absence
    df = df.drop(columns=['Unnamed: 0', 'Headline'], errors='ignore')
    df['Text'] = df['Text'].astype(str)
    df['Date'] = pd.to_datetime(df['Date'], utc=True).dt.date
    df = df.reset_index(drop=True)

    # LMD + HIV4: only the 'Polarity' entry of each score is kept
    df['LMD_Polarity'] = [sentimentAnalysisLMD(text)['Polarity'] for text in tqdm(df['Text'], desc="Processing with LMD")]
    df['HIV4_Polarity'] = [sentimentAnalysisHIV4(text)['Polarity'] for text in tqdm(df['Text'], desc="Processing with HIV4")]

    # VADER
    df['Vader_Polarity'] = sentimentAnalysisVader(df)

    # FINBERT
    df['FinBert_Polarity'] = sentimentAnalysisFinBert(df)

    # Keep the per-article scores before aggregating
    df_original = df.copy()

    # Average polarity of each dictionary/model per calendar day
    df = df.groupby('Date').mean(numeric_only=True).reset_index()
    # Make the key a real datetime so matching against `idx` does not depend
    # on pandas implicitly converting date objects during reindex
    df['Date'] = pd.to_datetime(df['Date'])
    df = df.sort_values(by=['Date'], ascending=True)
    df = df.set_index(['Date']).reindex(idx, fill_value=np.nan).rename_axis('Date').reset_index()

    df['Country'] = country_name
    df = df[['Date', 'Country', 'LMD_Polarity', 'HIV4_Polarity', 'Vader_Polarity', 'FinBert_Polarity']]

    count = count + 1
    print(count)
    df.to_csv(daily_csv)
    # Save the per-article (non-aggregated) scores as well
    df_original.to_csv(original_csv)


../Data/NER/count_3/MW_NER\Australia.csv


Processing with LMD:   0%|          | 0/2099 [00:00<?, ?it/s]

Processing with HIV4:   0%|          | 0/2099 [00:00<?, ?it/s]

Processing with VADER:   0%|          | 0/2099 [00:00<?, ?it/s]

Processing with FinBERT:   0%|          | 0/2099 [00:00<?, ?it/s]

1
../Data/NER/count_3/MW_NER\Canada.csv


Processing with LMD:   0%|          | 0/1107 [00:00<?, ?it/s]

Processing with HIV4:   0%|          | 0/1107 [00:00<?, ?it/s]

Processing with VADER:   0%|          | 0/1107 [00:00<?, ?it/s]

Processing with FinBERT:   0%|          | 0/1107 [00:00<?, ?it/s]

2
../Data/NER/count_3/MW_NER\China.csv


Processing with LMD:   0%|          | 0/5295 [00:00<?, ?it/s]

Processing with HIV4:   0%|          | 0/5295 [00:00<?, ?it/s]

Processing with VADER:   0%|          | 0/5295 [00:00<?, ?it/s]

Processing with FinBERT:   0%|          | 0/5295 [00:00<?, ?it/s]

3
../Data/NER/count_3/MW_NER\Denmark.csv


Processing with LMD:   0%|          | 0/12 [00:00<?, ?it/s]

Processing with HIV4:   0%|          | 0/12 [00:00<?, ?it/s]

Processing with VADER:   0%|          | 0/12 [00:00<?, ?it/s]

Processing with FinBERT:   0%|          | 0/12 [00:00<?, ?it/s]

4
../Data/NER/count_3/MW_NER\Finland.csv


Processing with LMD:   0%|          | 0/15 [00:00<?, ?it/s]

Processing with HIV4:   0%|          | 0/15 [00:00<?, ?it/s]

Processing with VADER:   0%|          | 0/15 [00:00<?, ?it/s]

Processing with FinBERT:   0%|          | 0/15 [00:00<?, ?it/s]

5
../Data/NER/count_3/MW_NER\France.csv


Processing with LMD:   0%|          | 0/784 [00:00<?, ?it/s]

Processing with HIV4:   0%|          | 0/784 [00:00<?, ?it/s]

Processing with VADER:   0%|          | 0/784 [00:00<?, ?it/s]

Processing with FinBERT:   0%|          | 0/784 [00:00<?, ?it/s]

6
../Data/NER/count_3/MW_NER\Germany.csv


Processing with LMD:   0%|          | 0/118 [00:00<?, ?it/s]

Processing with HIV4:   0%|          | 0/118 [00:00<?, ?it/s]

Processing with VADER:   0%|          | 0/118 [00:00<?, ?it/s]

Processing with FinBERT:   0%|          | 0/118 [00:00<?, ?it/s]

7
../Data/NER/count_3/MW_NER\Italy.csv


Processing with LMD:   0%|          | 0/101 [00:00<?, ?it/s]

Processing with HIV4:   0%|          | 0/101 [00:00<?, ?it/s]

Processing with VADER:   0%|          | 0/101 [00:00<?, ?it/s]

Processing with FinBERT:   0%|          | 0/101 [00:00<?, ?it/s]

8
../Data/NER/count_3/MW_NER\Japan.csv


Processing with LMD:   0%|          | 0/2730 [00:00<?, ?it/s]

Processing with HIV4:   0%|          | 0/2730 [00:00<?, ?it/s]

Processing with VADER:   0%|          | 0/2730 [00:00<?, ?it/s]

Processing with FinBERT:   0%|          | 0/2730 [00:00<?, ?it/s]

9
../Data/NER/count_3/MW_NER\Netherlands.csv


Processing with LMD:   0%|          | 0/52 [00:00<?, ?it/s]

Processing with HIV4:   0%|          | 0/52 [00:00<?, ?it/s]

Processing with VADER:   0%|          | 0/52 [00:00<?, ?it/s]

Processing with FinBERT:   0%|          | 0/52 [00:00<?, ?it/s]

10
../Data/NER/count_3/MW_NER\New Zealand.csv


Processing with LMD:   0%|          | 0/465 [00:00<?, ?it/s]

Processing with HIV4:   0%|          | 0/465 [00:00<?, ?it/s]

Processing with VADER:   0%|          | 0/465 [00:00<?, ?it/s]

Processing with FinBERT:   0%|          | 0/465 [00:00<?, ?it/s]

11
../Data/NER/count_3/MW_NER\Norway.csv


Processing with LMD:   0%|          | 0/50 [00:00<?, ?it/s]

Processing with HIV4:   0%|          | 0/50 [00:00<?, ?it/s]

Processing with VADER:   0%|          | 0/50 [00:00<?, ?it/s]

Processing with FinBERT:   0%|          | 0/50 [00:00<?, ?it/s]

12
../Data/NER/count_3/MW_NER\Portugal.csv


Processing with LMD:   0%|          | 0/60 [00:00<?, ?it/s]

Processing with HIV4:   0%|          | 0/60 [00:00<?, ?it/s]

Processing with VADER:   0%|          | 0/60 [00:00<?, ?it/s]

Processing with FinBERT:   0%|          | 0/60 [00:00<?, ?it/s]

13
../Data/NER/count_3/MW_NER\Singapore.csv


Processing with LMD:   0%|          | 0/398 [00:00<?, ?it/s]

Processing with HIV4:   0%|          | 0/398 [00:00<?, ?it/s]

Processing with VADER:   0%|          | 0/398 [00:00<?, ?it/s]

Processing with FinBERT:   0%|          | 0/398 [00:00<?, ?it/s]

14
../Data/NER/count_3/MW_NER\South Korea.csv


Processing with LMD:   0%|          | 0/14 [00:00<?, ?it/s]

Processing with HIV4:   0%|          | 0/14 [00:00<?, ?it/s]

Processing with VADER:   0%|          | 0/14 [00:00<?, ?it/s]

Processing with FinBERT:   0%|          | 0/14 [00:00<?, ?it/s]

15
../Data/NER/count_3/MW_NER\Spain.csv


Processing with LMD:   0%|          | 0/107 [00:00<?, ?it/s]

Processing with HIV4:   0%|          | 0/107 [00:00<?, ?it/s]

Processing with VADER:   0%|          | 0/107 [00:00<?, ?it/s]

Processing with FinBERT:   0%|          | 0/107 [00:00<?, ?it/s]

16
../Data/NER/count_3/MW_NER\Sweden.csv


Processing with LMD:   0%|          | 0/68 [00:00<?, ?it/s]

Processing with HIV4:   0%|          | 0/68 [00:00<?, ?it/s]

Processing with VADER:   0%|          | 0/68 [00:00<?, ?it/s]

Processing with FinBERT:   0%|          | 0/68 [00:00<?, ?it/s]

17
../Data/NER/count_3/MW_NER\Switzerland.csv


Processing with LMD:   0%|          | 0/199 [00:00<?, ?it/s]

Processing with HIV4:   0%|          | 0/199 [00:00<?, ?it/s]

Processing with VADER:   0%|          | 0/199 [00:00<?, ?it/s]

Processing with FinBERT:   0%|          | 0/199 [00:00<?, ?it/s]

18
../Data/NER/count_3/MW_NER\United Kingdom.csv


Processing with LMD:   0%|          | 0/122 [00:00<?, ?it/s]

Processing with HIV4:   0%|          | 0/122 [00:00<?, ?it/s]

Processing with VADER:   0%|          | 0/122 [00:00<?, ?it/s]

Processing with FinBERT:   0%|          | 0/122 [00:00<?, ?it/s]

19
../Data/NER/count_3/MW_NER\United States.csv


Processing with LMD:   0%|          | 0/1505 [00:00<?, ?it/s]

Processing with HIV4:   0%|          | 0/1505 [00:00<?, ?it/s]

Processing with VADER:   0%|          | 0/1505 [00:00<?, ?it/s]

Processing with FinBERT:   0%|          | 0/1505 [00:00<?, ?it/s]

20
