In [10]:
# 导入库
import numpy as np
import pandas as pd
from   datetime import datetime
from tqdm.notebook import tqdm
import os
import glob
import pysentiment2 as ps
import nltk
from   nltk.sentiment.vader import SentimentIntensityAnalyzer
from   IPython.core.display import HTML

from transformers import pipeline
from transformers import AutoTokenizer, AutoModelForSequenceClassification

import torch

In [2]:
## Set up the sentiment analysis tools

# Download the lexicon that the VADER analyzer needs
nltk.download('vader_lexicon')
# Loughran-McDonald dictionary-based sentiment model
lm    = ps.LM()
# Harvard IV-4 dictionary-based sentiment model
hiv4  = ps.HIV4()
# VADER sentiment model
vader = SentimentIntensityAnalyzer()

# Pick the best available accelerator: first CUDA GPU, then Apple MPS, then CPU.
# getattr() guards against torch builds that do not ship the `torch.backends.mps` module.
_mps_backend = getattr(torch.backends, "mps", None)
if torch.cuda.is_available():
    device = torch.device("cuda:0")
elif _mps_backend is not None and _mps_backend.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")

# Tokenizer of the pretrained FinBERT checkpoint
finbert_tokenizer  = AutoTokenizer.from_pretrained("ProsusAI/finbert")
# Sequence-classification weights of the pretrained FinBERT checkpoint
finbert_model      = AutoModelForSequenceClassification.from_pretrained("ProsusAI/finbert")
# Sentiment pipeline bound to the device selected above (GPU or CPU)
finbert_classifier = pipeline("sentiment-analysis", model = finbert_model, tokenizer = finbert_tokenizer, device = device)

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/wanghs/nltk_data...


Downloading (…)okenizer_config.json:   0%|          | 0.00/252 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/758 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

Xformers is not installed correctly. If you want to use memorry_efficient_attention to accelerate training use the following command to install Xformers
pip install xformers.


In [16]:
def sentimentAnalysisLMD(text):
    """Score one text with the Loughran-McDonald model held in the global `lm`.

    The text is first tokenized by `lm.tokenize`, and the resulting tokens are
    passed to `lm.get_score`, whose result is returned unchanged.
    """
    return lm.get_score(lm.tokenize(text))

def sentimentAnalysisHIV4(text):
    """Score one text with the HIV-4 model held in the global `hiv4`.

    The text is first tokenized by `hiv4.tokenize`, and the resulting tokens are
    passed to `hiv4.get_score`, whose result is returned unchanged.
    """
    return hiv4.get_score(hiv4.tokenize(text))

def sentimentAnalysisVader(df):
    """Return the VADER compound score of every row's 'Text'.

    Parameters
    ----------
    df : DataFrame (or anything `pd.DataFrame` accepts) with a 'Text' column.

    Returns
    -------
    pd.Series named 'vader_compound' that carries the same index as the input,
    so assigning it back to the input frame lines up row by row even when that
    frame does not use a default 0..n-1 index. An empty input yields an empty
    Series.
    """
    df_vader = pd.DataFrame(df, columns=['Date', 'Text'])
    compound = [
        vader.polarity_scores(text)['compound']
        for text in tqdm(df_vader['Text'], desc="Processing with VADER")
    ]
    # Build the result directly on the input's index instead of joining a
    # freshly range-indexed score frame, which only aligned for a 0..n-1 index.
    return pd.Series(compound, index=df_vader.index, name='vader_compound', dtype='float64')

def sentimentAnalysisFinBert(df):
    """Return one signed FinBERT score per row of the 'Text' column, as a list.

    Each text is classified on its own (with truncation enabled) and only the
    first prediction is used: a "positive" label keeps its score, a "negative"
    label has its score negated, and any other label contributes 0.
    """
    texts = pd.DataFrame(df, columns=['Date', 'Text'])['Text']

    def _signed(prediction):
        # Map the predicted label onto the sign of the reported score.
        label = prediction['label']
        if label == "positive":
            return prediction['score']
        if label == "negative":
            return prediction['score']*(-1)
        return 0

    return [
        _signed(finbert_classifier(text, truncation=True)[0])
        for text in tqdm(texts, desc="Processing with FinBERT")
    ]

In [None]:
# Score every country's Reuters articles with the four sentiment models and
# write two files per country: the daily averages and the per-article scores.
Path = "../Data/Reuters_NER/*.csv"
output_dir = '../Data/SentimentScores'
count = 0

# Create the output directory up front so that a long scoring run cannot be
# lost to a failing to_csv() at the very end.
os.makedirs(output_dir, exist_ok=True)

lst_files = sorted(glob.glob(Path))

# Daily calendar the aggregated scores are reindexed onto; it does not depend
# on the country, so it is built once outside the loop.
idx = pd.date_range('2012-01-01', '2022-12-31')

for file in lst_files:
    file_name = os.path.basename(file)  # Get the file name from the full file path
    country_name = os.path.splitext(file_name)[0]  # Remove the file extension
    daily_path = '{0}/{1}.csv'.format(output_dir, country_name)
    original_path = '{0}/{1}_original.csv'.format(output_dir, country_name)

    # Skip countries whose daily file was already produced by an earlier run
    if os.path.isfile(daily_path):
        print("File already exists: {0}.csv".format(country_name))
        continue

    print(file)
    df = pd.read_csv(file)

    # Nothing to score for an empty input file
    if df.shape[0] == 0:
        continue

    df = df.drop_duplicates(['Date','Headline'], keep='last')
    # errors='ignore' tolerates input files that were saved without an index column
    df = df.drop(columns=['Unnamed: 0', 'Headline'], errors='ignore')
    df['Text'] = df['Text'].astype(str)
    df['Date'] = pd.to_datetime(df['Date'], utc=True).dt.date
    df = df.reset_index(drop=True)

    # LMD + HIV4: only the 'Polarity' entry of each score dict is kept, so it is
    # read directly instead of expanding every dict into its own columns.
    df['LMD_Polarity'] = [
        sentimentAnalysisLMD(text)['Polarity']
        for text in tqdm(df['Text'], desc="Processing with LMD")
    ]
    df['HIV4_Polarity'] = [
        sentimentAnalysisHIV4(text)['Polarity']
        for text in tqdm(df['Text'], desc="Processing with HIV4")
    ]

    # VADER
    df['Vader_Polarity'] = sentimentAnalysisVader(df)

    # FINBERT
    df['FinBert_Polarity'] = sentimentAnalysisFinBert(df)

    # Keep the per-article scores before they are collapsed to daily means
    df_original = df.copy()

    # Average the polarity of each model per date
    df = df.groupby('Date').mean(numeric_only=True)
    # The group keys are datetime.date objects; convert them explicitly so the
    # reindex below matches calendar days by value rather than by implicit casting.
    df.index = pd.to_datetime(df.index)
    # Days without any article become NaN rows
    df = df.reindex(idx).rename_axis('Date').reset_index()

    df['Country'] = country_name
    df = df[['Date', 'Country', 'LMD_Polarity', 'HIV4_Polarity', 'Vader_Polarity', 'FinBert_Polarity']]

    count = count + 1
    print(count)
    df.to_csv(daily_path)
    # Save the per-article scores as well
    df_original.to_csv(original_path)


../Data/Reuters_NER/Australia.csv


Processing with LMD:   0%|          | 0/9234 [00:00<?, ?it/s]

Processing with HIV4:   0%|          | 0/9234 [00:00<?, ?it/s]

Processing with VADER:   0%|          | 0/9234 [00:00<?, ?it/s]

Processing with FinBERT:   0%|          | 0/9234 [00:00<?, ?it/s]

1
../Data/Reuters_NER/Canada.csv


Processing with LMD:   0%|          | 0/6950 [00:00<?, ?it/s]

Processing with HIV4:   0%|          | 0/6950 [00:00<?, ?it/s]

Processing with VADER:   0%|          | 0/6950 [00:00<?, ?it/s]

Processing with FinBERT:   0%|          | 0/6950 [00:00<?, ?it/s]

2
../Data/Reuters_NER/China.csv


Processing with LMD:   0%|          | 0/20741 [00:00<?, ?it/s]

Processing with HIV4:   0%|          | 0/20741 [00:00<?, ?it/s]

Processing with VADER:   0%|          | 0/20741 [00:00<?, ?it/s]

Processing with FinBERT:   0%|          | 0/20741 [00:00<?, ?it/s]

3
../Data/Reuters_NER/Denmark.csv


Processing with LMD:   0%|          | 0/1675 [00:00<?, ?it/s]

Processing with HIV4:   0%|          | 0/1675 [00:00<?, ?it/s]

Processing with VADER:   0%|          | 0/1675 [00:00<?, ?it/s]

Processing with FinBERT:   0%|          | 0/1675 [00:00<?, ?it/s]

4
../Data/Reuters_NER/Finland.csv


Processing with LMD:   0%|          | 0/1329 [00:00<?, ?it/s]

Processing with HIV4:   0%|          | 0/1329 [00:00<?, ?it/s]

Processing with VADER:   0%|          | 0/1329 [00:00<?, ?it/s]

Processing with FinBERT:   0%|          | 0/1329 [00:00<?, ?it/s]

5
../Data/Reuters_NER/France.csv


Processing with LMD:   0%|          | 0/9304 [00:00<?, ?it/s]

Processing with HIV4:   0%|          | 0/9304 [00:00<?, ?it/s]

Processing with VADER:   0%|          | 0/9304 [00:00<?, ?it/s]

Processing with FinBERT:   0%|          | 0/9304 [00:00<?, ?it/s]

6
../Data/Reuters_NER/Germany.csv


Processing with LMD:   0%|          | 0/11026 [00:00<?, ?it/s]

Processing with HIV4:   0%|          | 0/11026 [00:00<?, ?it/s]

Processing with VADER:   0%|          | 0/11026 [00:00<?, ?it/s]

Processing with FinBERT:   0%|          | 0/11026 [00:00<?, ?it/s]

7
../Data/Reuters_NER/Italy.csv


Processing with LMD:   0%|          | 0/6109 [00:00<?, ?it/s]

Processing with HIV4:   0%|          | 0/6109 [00:00<?, ?it/s]

Processing with VADER:   0%|          | 0/6109 [00:00<?, ?it/s]

Processing with FinBERT:   0%|          | 0/6109 [00:00<?, ?it/s]

8
../Data/Reuters_NER/Japan.csv


Processing with LMD:   0%|          | 0/9954 [00:00<?, ?it/s]

Processing with HIV4:   0%|          | 0/9954 [00:00<?, ?it/s]

Processing with VADER:   0%|          | 0/9954 [00:00<?, ?it/s]

Processing with FinBERT:   0%|          | 0/9954 [00:00<?, ?it/s]

9
../Data/Reuters_NER/Netherlands.csv


Processing with LMD:   0%|          | 0/2661 [00:00<?, ?it/s]

Processing with HIV4:   0%|          | 0/2661 [00:00<?, ?it/s]

Processing with VADER:   0%|          | 0/2661 [00:00<?, ?it/s]

Processing with FinBERT:   0%|          | 0/2661 [00:00<?, ?it/s]

10
../Data/Reuters_NER/New Zealand.csv


Processing with LMD:   0%|          | 0/3171 [00:00<?, ?it/s]

Processing with HIV4:   0%|          | 0/3171 [00:00<?, ?it/s]

Processing with VADER:   0%|          | 0/3171 [00:00<?, ?it/s]

Processing with FinBERT:   0%|          | 0/3171 [00:00<?, ?it/s]

11
../Data/Reuters_NER/Norway.csv


Processing with LMD:   0%|          | 0/1998 [00:00<?, ?it/s]

Processing with HIV4:   0%|          | 0/1998 [00:00<?, ?it/s]

Processing with VADER:   0%|          | 0/1998 [00:00<?, ?it/s]

Processing with FinBERT:   0%|          | 0/1998 [00:00<?, ?it/s]

12
../Data/Reuters_NER/Portugal.csv


Processing with LMD:   0%|          | 0/1771 [00:00<?, ?it/s]

Processing with HIV4:   0%|          | 0/1771 [00:00<?, ?it/s]

Processing with VADER:   0%|          | 0/1771 [00:00<?, ?it/s]

Processing with FinBERT:   0%|          | 0/1771 [00:00<?, ?it/s]

13
../Data/Reuters_NER/Singapore.csv


Processing with LMD:   0%|          | 0/2999 [00:00<?, ?it/s]

Processing with HIV4:   0%|          | 0/2999 [00:00<?, ?it/s]

Processing with VADER:   0%|          | 0/2999 [00:00<?, ?it/s]

Processing with FinBERT:   0%|          | 0/2999 [00:00<?, ?it/s]

14
../Data/Reuters_NER/South Korea.csv


Processing with LMD:   0%|          | 0/3760 [00:00<?, ?it/s]

Processing with HIV4:   0%|          | 0/3760 [00:00<?, ?it/s]

Processing with VADER:   0%|          | 0/3760 [00:00<?, ?it/s]

Processing with FinBERT:   0%|          | 0/3760 [00:00<?, ?it/s]

15
../Data/Reuters_NER/Spain.csv


Processing with LMD:   0%|          | 0/4669 [00:00<?, ?it/s]

Processing with HIV4:   0%|          | 0/4669 [00:00<?, ?it/s]

Processing with VADER:   0%|          | 0/4669 [00:00<?, ?it/s]

Processing with FinBERT:   0%|          | 0/4669 [00:00<?, ?it/s]

16
../Data/Reuters_NER/Sweden.csv


Processing with LMD:   0%|          | 0/2202 [00:00<?, ?it/s]

Processing with HIV4:   0%|          | 0/2202 [00:00<?, ?it/s]

Processing with VADER:   0%|          | 0/2202 [00:00<?, ?it/s]

Processing with FinBERT:   0%|          | 0/2202 [00:00<?, ?it/s]

In [5]:
# Show the daily sentiment scores of every country
path = "../Data/SentimentScores/*.csv"

# Keep every file except those whose name ends with "_original.csv"
lst_files = sorted(fname for fname in glob.glob(path) if not fname.endswith("_original.csv"))

# Read each country's file and print its first 5 rows
for file in lst_files:
    file_name = os.path.basename(file)  # file name without its directory
    country_name = os.path.splitext(file_name)[0]  # file name without its extension
    df = pd.read_csv(file)
    # One call emits the name, the preview and the blank-line separator
    print(country_name, df.head(), '\n', sep='\n')


Australia
   Unnamed: 0        Date    Country  LMD_Polarity  HIV4_Polarity  \
0           0  2012-01-01  Australia           NaN            NaN   
1           1  2012-01-02  Australia           NaN            NaN   
2           2  2012-01-03  Australia           NaN            NaN   
3           3  2012-01-04  Australia           NaN            NaN   
4           4  2012-01-05  Australia           NaN            NaN   

   Vader_Polarity  FinBert_Polarity  
0             NaN               NaN  
1             NaN               NaN  
2             NaN               NaN  
3             NaN               NaN  
4             NaN               NaN  


Canada
   Unnamed: 0        Date Country  LMD_Polarity  HIV4_Polarity  \
0           0  2012-01-01  Canada           NaN            NaN   
1           1  2012-01-02  Canada           NaN            NaN   
2           2  2012-01-03  Canada           NaN            NaN   
3           3  2012-01-04  Canada           NaN            NaN   
4       

In [6]:
# Show the per-article ("original") sentiment scores of every country
path = "../Data/SentimentScores/*.csv"

# Keep only the files whose name ends with "_original.csv"
lst_files = sorted(fname for fname in glob.glob(path) if fname.endswith("_original.csv"))

# Read each country's file and print its first 5 rows
for file in lst_files:
    file_name = os.path.basename(file)  # file name without its directory
    country_name = os.path.splitext(file_name)[0]  # file name without its extension
    df = pd.read_csv(file)
    # One call emits the name, the preview and the blank-line separator
    print(country_name, df.head(), '\n', sep='\n')

Australia_original
   Unnamed: 0        Date                                               Text  \
0           0  2021-01-20  China’s imports of Australian copper ore crash...   
1           1  2021-01-28  First flashes of Brexit trade trouble appear i...   
2           2  2021-02-09  Stocks, oil rise in Asia after U.S. records, d...   
3           3  2021-02-22  Future proofing: Australia’s gas networks look...   
4           4  2021-02-26  Take Five: Policymakers under pressure. 1/TANT...   

   LMD_Polarity  HIV4_Polarity  Vader_Polarity  FinBert_Polarity    Country  
0     -1.000000       0.214286         -0.9485         -0.972263  Australia  
1     -0.411765       0.050847         -0.6597         -0.969801  Australia  
2     -0.066667       0.333333          0.9799          0.595929  Australia  
3      0.400000       0.449275          0.9964          0.000000  Australia  
4     -0.666667      -0.012658          0.9554          0.561960  Australia  


Canada_original
   Unnamed: 0 

In [14]:
# Merge the per-country sentiment files of each news source into one file per source
path_Reuters = "../Data/SentimentScores/*.csv"
path_MarketWatch = "../Data/MW Sentiment/*.csv"


def _merge_sentiment_files(files, source_label):
    """Concatenate per-country sentiment CSVs into one frame sorted by date and country.

    Parameters
    ----------
    files : list of CSV paths; each file must provide 'Date' and 'Country' columns.
    source_label : name of the news source, used only in the error message.

    Returns
    -------
    DataFrame with 'Date' parsed to datetime, sorted by ['Date', 'Country'],
    without rows dated before 2012-01-01 and without the saved index column.

    Raises
    ------
    ValueError when `files` is empty.
    """
    if not files:
        raise ValueError("No {0} sentiment files found to merge".format(source_label))
    merged = pd.concat(map(pd.read_csv, files), ignore_index=True)
    # Drop the index column written by to_csv; tolerate files saved without one
    merged = merged.drop(columns=['Unnamed: 0'], errors='ignore')
    # Parse the dates before sorting so the order is chronological whatever
    # textual date format the input files use
    merged['Date'] = pd.to_datetime(merged['Date'])
    merged = merged.sort_values(by=['Date', 'Country'], ascending=True)
    # Discard rows dated before the start of the sample
    return merged[~(merged['Date'] < '2012-01-01')]


# The Reuters folder also holds the per-article "*_original.csv" files; skip them
lst_Reuters_files = sorted(
    fname for fname in glob.glob(path_Reuters) if not fname.endswith("_original.csv")
)
lst_MarketWatch_files = sorted(glob.glob(path_MarketWatch))

# NOTE: no shift of the sentiment dates onto the following business day is
# applied here; the merged files carry the original calendar dates.
df_Reuters_sentiment = _merge_sentiment_files(lst_Reuters_files, 'Reuters')
df_MW_sentiment = _merge_sentiment_files(lst_MarketWatch_files, 'MarketWatch')

df_Reuters_sentiment.to_csv(r'../Data/{0}.csv'.format('Merged Sentiment Reuters'))
df_MW_sentiment.to_csv(r'../Data/{0}.csv'.format('Merged Sentiment MW'))
