In [37]:
# 导入库
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings("ignore", category=UserWarning, module='transformers')

from   datetime import datetime
from tqdm.notebook import tqdm
import os
import glob
import pysentiment2 as ps
import nltk
from   nltk.sentiment.vader import SentimentIntensityAnalyzer
from   IPython.core.display import HTML

from transformers import pipeline
from transformers import AutoTokenizer, AutoModelForSequenceClassification

import torch

In [38]:
## Set up the sentiment-analysis tools

# --- Dictionary-based scorers ---
# VADER: fetch the lexicon first, then build the analyzer.
nltk.download('vader_lexicon')
vader = SentimentIntensityAnalyzer()
# Loughran-McDonald scorer from pysentiment2.
lm    = ps.LM()
# HIV-4 scorer from pysentiment2.
hiv4  = ps.HIV4()

# --- FinBERT ---
# Use the first GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
# Alternative kept from the original (disabled):
# device = torch.device("mps"if torch.backends.mps.is_available()else "cpu")

# Load the pretrained FinBERT weights and the matching tokenizer.
finbert_model      = AutoModelForSequenceClassification.from_pretrained("ProsusAI/finbert")
finbert_tokenizer  = AutoTokenizer.from_pretrained("ProsusAI/finbert")
# Wrap both in a sentiment-analysis pipeline bound to the chosen device (GPU or CPU).
finbert_classifier = pipeline(
    "sentiment-analysis",
    model=finbert_model,
    tokenizer=finbert_tokenizer,
    device=device,
)

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\WangH\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [39]:
def sentimentAnalysisLMD(text):
    """Score one text with the module-level Loughran-McDonald scorer ``lm``.

    The text is tokenized with ``lm.tokenize`` and the tokens are passed to
    ``lm.get_score``; whatever ``get_score`` returns is handed back unchanged.
    Per the original note, that score dict separates into negative, positive,
    polarity and subjectivity entries.
    """
    return lm.get_score(lm.tokenize(text))

def sentimentAnalysisHIV4(text):
    """Score one text with the module-level HIV-4 scorer ``hiv4``.

    The text is tokenized with ``hiv4.tokenize`` and the tokens are passed to
    ``hiv4.get_score``; the resulting score object is returned unchanged.
    """
    return hiv4.get_score(hiv4.tokenize(text))

def sentimentAnalysisVader(df):
    """Return the VADER compound score of every row's ``Text``.

    Parameters
    ----------
    df : DataFrame-like
        Data with a ``Text`` column (anything ``pd.DataFrame`` accepts).

    Returns
    -------
    pandas.Series
        Float series named ``vader_compound`` that carries the same index as
        the input rows, so it can be assigned straight back as a column.

    Notes
    -----
    The scores are attached to the input's own index. Building them on a fresh
    0..n-1 index and joining (as done previously) yields NaN or misaligned
    values whenever the input does not have a default RangeIndex, and raises
    a KeyError for an empty input.
    """
    texts = pd.DataFrame(df, columns=['Date', 'Text'])['Text']
    compound = [
        vader.polarity_scores(text)['compound']
        for text in tqdm(texts, desc="Processing with VADER")
    ]
    return pd.Series(compound, index=texts.index, name='vader_compound', dtype=float)

def sentimentAnalysisFinBert(df):
    """Return one signed FinBERT score per row of ``df['Text']``.

    Each text is classified with the module-level ``finbert_classifier``
    (called with ``truncation=True``) and only the first prediction is used:

    * label ``"positive"`` -> ``+score``
    * label ``"negative"`` -> ``-score``
    * any other label      -> ``0``

    The result is a plain list in row order.
    """
    texts = pd.DataFrame(df, columns=['Date', 'Text'])['Text']

    def signed_score(prediction):
        # Fold the predicted label into the sign of the classifier's score.
        label = prediction['label']
        if label == "positive":
            return prediction['score']
        if label == "negative":
            return -prediction['score']
        return 0

    return [
        signed_score(finbert_classifier(text, truncation=True)[0])
        for text in tqdm(texts, desc="Processing with FinBERT")
    ]


In [40]:
# Score every article of the selected countries with the four sentiment
# measures, then write two CSVs per country: the article-level scores
# ("<country>_original.csv") and the daily averages ("<country>.csv").
Path = "../Data/CNN_NER/*.csv"
OUTPUT_DIR = '../Data/CNN_Sentiment_Scores'
# Countries scored on this run; add names to the set to process more files.
COUNTRIES_TO_PROCESS = {'Finland'}
count = 0

# DataFrame.to_csv does not create missing folders, so make sure the target exists.
os.makedirs(OUTPUT_DIR, exist_ok=True)

lst_files = sorted(glob.glob(Path))

for file in lst_files:
    file_name = os.path.basename(file)  # Get the file name from the full file path
    country_name = os.path.splitext(file_name)[0]  # Remove the file extension

    # Run the sentiment analysis only for the selected countries.
    # (To skip countries that were already scored instead, test
    #  os.path.isfile('{0}/{1}.csv'.format(OUTPUT_DIR, country_name)) here.)
    if country_name not in COUNTRIES_TO_PROCESS:
        continue

    print(file)
    df = pd.read_csv(file)
    if df.shape[0] == 0:
        continue  # no articles for this country, nothing to score

    df = df.drop_duplicates(['Date', 'Headline'], keep='last')
    df = df.drop(['Unnamed: 0', 'Headline'], axis=1)
    df['Text'] = df['Text'].astype(str)
    # Parse the timestamps as UTC and keep only the calendar date so that
    # articles can later be averaged per day.
    df['Date'] = pd.to_datetime(df['Date'], utc=True).dt.date
    df = df.reset_index(drop=True)

    # LMD + HIV4: each scorer returns a dict per text; only its 'Polarity'
    # entry is kept. tqdm shows a progress bar while the texts are scored.
    lmd_scores = [sentimentAnalysisLMD(text) for text in tqdm(df["Text"], desc="Processing with LMD")]
    hiv4_scores = [sentimentAnalysisHIV4(text) for text in tqdm(df["Text"], desc="Processing with HIV4")]
    df['LMD_Polarity'] = [score['Polarity'] for score in lmd_scores]
    df['HIV4_Polarity'] = [score['Polarity'] for score in hiv4_scores]

    # VADER
    df['Vader_Polarity'] = sentimentAnalysisVader(df)

    # FINBERT
    df['FinBert_Polarity'] = sentimentAnalysisFinBert(df)

    # Keep the article-level scores before collapsing them to daily averages.
    df_original = df.copy()

    # Find the average Polarity for each Dictionary, per given date
    df = df.groupby('Date').mean(numeric_only=True)
    # The group keys are datetime.date objects; convert them to timestamps
    # explicitly so the reindex below matches calendar days without relying
    # on implicit type coercion between dates and a DatetimeIndex.
    df.index = pd.to_datetime(df.index)
    # Expand to one row per calendar day of the study window; days without
    # articles get NaN polarity values.
    idx = pd.date_range('2012-01-01', '2023-06-30')
    df = df.sort_index().reindex(idx).rename_axis('Date').reset_index()

    df['Country'] = country_name
    df = df[['Date', 'Country', 'LMD_Polarity', 'HIV4_Polarity', 'Vader_Polarity', 'FinBert_Polarity']]

    count = count + 1
    print(count)
    # The index is written on purpose: the cells that read these files back
    # drop the resulting 'Unnamed: 0' column.
    df.to_csv('{0}/{1}.csv'.format(OUTPUT_DIR, country_name))
    # Save the article-level (non-aggregated) scores as well.
    df_original.to_csv('{0}/{1}_original.csv'.format(OUTPUT_DIR, country_name))


../Data/CNN_NER\Finland.csv


Processing with LMD:   0%|          | 0/127 [00:00<?, ?it/s]

Processing with HIV4:   0%|          | 0/127 [00:00<?, ?it/s]

Processing with VADER:   0%|          | 0/127 [00:00<?, ?it/s]

Processing with FinBERT:   0%|          | 0/127 [00:00<?, ?it/s]

1


In [6]:
# Show the (daily) sentiment data of every country
path = "../Data/CNN_Sentiment_Scores/*.csv"

# Keep every file whose name does NOT end with "_original.csv".
lst_files = sorted(fname for fname in glob.glob(path) if not fname.endswith("_original.csv"))

# Read each country's data; printing the first 5 rows is currently disabled.
for file in lst_files:
    file_name = os.path.basename(file)  # file name without the directory part
    country_name = os.path.splitext(file_name)[0]  # file name without the extension
    df = pd.read_csv(file)

    # print(country_name)
    # print(df.head())
    # print('\n')


Australia
   Unnamed: 0        Date    Country  LMD_Polarity  HIV4_Polarity  \
0           0  2012-01-01  Australia           NaN            NaN   
1           1  2012-01-02  Australia           NaN            NaN   
2           2  2012-01-03  Australia           NaN            NaN   
3           3  2012-01-04  Australia           NaN            NaN   
4           4  2012-01-05  Australia           NaN            NaN   

   Vader_Polarity  FinBert_Polarity  
0             NaN               NaN  
1             NaN               NaN  
2             NaN               NaN  
3             NaN               NaN  
4             NaN               NaN  


Canada
   Unnamed: 0        Date Country  LMD_Polarity  HIV4_Polarity  \
0           0  2012-01-01  Canada           NaN            NaN   
1           1  2012-01-02  Canada           NaN            NaN   
2           2  2012-01-03  Canada           NaN            NaN   
3           3  2012-01-04  Canada           NaN            NaN   
4       

In [8]:
# Show the original (article-level) sentiment data of every country
path = "../Data/CNN_Sentiment_Scores/*.csv"

# Keep only the files whose name ends with "_original.csv".
lst_files = sorted(fname for fname in glob.glob(path) if fname.endswith("_original.csv"))

# Read each country's data and print its first 5 rows.
for file in lst_files:
    file_name = os.path.basename(file)  # file name without the directory part
    # Strip the extension and the "_original" marker to get the country name.
    country_name = os.path.splitext(file_name)[0].replace("_original", "")

    df = pd.read_csv(file).drop(['Unnamed: 0'], axis=1)
    print(country_name)
    print(df.head())
    print('\n')

    # Find the average Polarity for each Dictionary, per given date
    df['Date'] = pd.to_datetime(df['Date'], utc=True).dt.date
    df = df.reset_index(drop=True)
    df = (
        df.groupby('Date')
        .mean(numeric_only=True)
        .reset_index()
        .sort_values(by=['Date'], ascending=True)
    )
    # Disabled follow-up steps, kept for reference:
    # idx = pd.date_range('2012-01-01', '2023-06-30')
    # df = df.set_index(['Date']).reindex(idx, fill_value=np.nan).rename_axis('Date').reset_index()
    #
    # df['Country'] = country_name
    # df= df[['Date', 'Country', 'LMD_Polarity', 'HIV4_Polarity', 'Vader_Polarity', 'FinBert_Polarity']]
    #
    # # count = count + 1
    # # print(count)
    # df.to_csv('../Data/CNN_Sentiment_Scores/{0}.csv'.format(country_name))

Australia
         Date                                               Text  \
0  2016-05-05  Australia's most wanted ISIS recruiter 'killed...   
1  2016-05-08  Australian PM Turnbull announces general elect...   
2  2016-05-11  'World's oldest ax' found in Australia. Archae...   
3  2016-05-11  Five terror suspects caught trying to flee Aus...   
4  2016-05-13  The great Richard Quest social media challenge...   

   LMD_Polarity  HIV4_Polarity  Vader_Polarity  FinBert_Polarity  
0      0.000000      -0.222222         -0.9947          0.000000  
1     -0.130435       0.172414          0.9705         -0.757349  
2     -0.250000       0.135135          0.8271          0.000000  
3     -0.837838      -0.159091         -0.9980         -0.578246  
4     -0.384615       0.377778          0.7569          0.000000  


Canada
         Date                                               Text  \
0  2016-11-18  America's NAFTA nemesis: Canada, not Mexico. A...   
1  2016-11-28  O Canada: Trudeau's

In [None]:
# # Merge the sentiment data of all countries (this entire cell is commented out and does not run)
# lst_Reuters_files = []
# lst_MarketWatch_files = []
# path_Reuters = "../Data/CNN_Sentiment_Scores/*.csv"
# path_MarketWatch = "../Data/MW Sentiment/*.csv"
# 
# for fname in glob.glob(path_Reuters):
#     # Check whether the file name ends with "_original.csv"
#     if not fname.endswith("_original.csv"):
#         lst_Reuters_files.append(fname)
# 
# for fname in glob.glob(path_MarketWatch):
#     lst_MarketWatch_files.append(fname)
# 
# lst_Reuters_files = sorted(lst_Reuters_files)
# lst_MarketWatch_files = sorted(lst_MarketWatch_files)
# 
# df_Reuters_sentiment = pd.concat(map(pd.read_csv, lst_Reuters_files), ignore_index=True)
# df_MW_sentiment = pd.concat(map(pd.read_csv, lst_MarketWatch_files), ignore_index=True)
# 
# # Sort by Date, Country; drop redundant columns
# df_Reuters_sentiment = df_Reuters_sentiment.sort_values(by = ['Date', 'Country'], ascending = True)
# df_Reuters_sentiment = df_Reuters_sentiment.drop(['Unnamed: 0'], axis = 1)
# df_MW_sentiment = df_MW_sentiment.sort_values(by = ['Date', 'Country'], ascending = True)
# df_MW_sentiment = df_MW_sentiment.drop(['Unnamed: 0'], axis = 1)
# 
# # Align Sentiment with the following day's Return
# df_Reuters_sentiment['Date'] = pd.to_datetime(df_Reuters_sentiment['Date'])
# df_Reuters_sentiment = df_Reuters_sentiment[~(df_Reuters_sentiment['Date'] < '2012-01-01')]
# df_MW_sentiment['Date'] = pd.to_datetime(df_MW_sentiment['Date'])
# df_MW_sentiment = df_MW_sentiment[~(df_MW_sentiment['Date'] < '2012-01-01')]
# # df['Date'] = df["Date"] + BusinessDay()
# 
# # Show the data
# # df
# #
# df_Reuters_sentiment.to_csv(r'../Data/{0}.csv'.format('Merged Sentiment Reuters'))
# df_MW_sentiment.to_csv(r'../Data/{0}.csv'.format('Merged Sentiment MW'))


In [9]:
# Load the daily return data
returns_csv = '../Data/daily_return.csv'
df_return = pd.read_csv(returns_csv)

# Preview the first rows
df_return.head()

Unnamed: 0,Date,New Zealand,United Kingdom,Finland,Norway,United States,Sweden,Korea,Japan,Canada,...,Australia,Germany,France,China,Switzerland,Spain,Denmark,Italy,Singapore,Portugal
0,2011-12-01,,,,,,,,,,...,,,,,,,,,,
1,2011-12-02,-0.010866,-0.207038,0.302897,0.235483,0.489692,0.154354,0.094609,-0.04227,0.178307,...,0.053087,0.393824,-1.345329,0.328191,0.241278,0.5364,0.065521,-0.189497,-0.251753,0.148492
2,2011-12-05,0.077141,-0.460678,-0.474224,-0.158909,-0.084015,-0.179613,0.056292,0.165816,0.227525,...,0.302855,-0.615728,1.099039,0.264403,-0.12572,3.778843,-1.723893,5.259712,-0.012405,3.308727
3,2011-12-06,0.201784,0.810078,0.100602,-0.190226,-0.406179,0.198203,-0.004941,0.066255,-0.330724,...,0.354603,0.15015,-0.863361,-0.157734,0.335027,-0.630513,0.27233,0.715348,0.047384,1.062182
4,2011-12-07,0.187048,0.129148,0.496636,0.290407,0.543003,0.567308,-0.04316,0.002849,0.596567,...,-0.273399,0.762903,0.199724,0.001528,0.198283,-1.476326,-0.028845,-0.948723,-0.251362,1.737582


In [24]:
# For every country, correlate each daily sentiment measure with the return
# of the same date and collect the results in one long table.
path = "../Data/CNN_Sentiment_Scores/*.csv"

# Daily files only: skip the files whose name ends with "_original.csv".
lst_files = sorted(fname for fname in glob.glob(path) if not fname.endswith("_original.csv"))

correlation_data = []  # one record per (country, sentiment method)

for file in lst_files:
    file_name = os.path.basename(file)  # file name without the directory part
    country_name = os.path.splitext(file_name)[0]  # file name without the extension

    # A sentiment file without a matching return column would otherwise
    # raise a KeyError and abort the loop for all remaining countries.
    if country_name not in df_return.columns:
        print("No return column for {0}; skipping".format(country_name))
        continue

    df_sentiment = pd.read_csv(file)
    df_sentiment = df_sentiment.drop(['Unnamed: 0'], axis=1)
    # Keep only the dates that actually have a sentiment score.
    df_sentiment = df_sentiment[df_sentiment['LMD_Polarity'].notna()]

    # This country's return series, under a country-independent column name.
    df_country_return = df_return[['Date', country_name]].rename(columns={country_name: 'Return'})

    # Inner-join sentiment and returns on the date, then drop the rows that
    # have no return value.
    merged_data = pd.merge(df_sentiment, df_country_return, on='Date')
    merged_data = merged_data.dropna(subset=['Return'])

    # Correlation of every numeric column with 'Return', excluding the
    # correlation of 'Return' with itself, highest first.
    return_correlations = (
        merged_data.corr(numeric_only=True)['Return']
        .drop('Return')
        .sort_values(ascending=False)
    )

    # Store the results
    correlation_data.extend(
        {
            'Country': country_name,
            'Method': method,
            'Correlation with Returns': value,
        }
        for method, value in return_correlations.items()
    )

df_correlation = pd.DataFrame(correlation_data)
df_correlation.to_csv('../Data/correlation_results.csv', index=False)

    

In [25]:
# Display the correlation table (Country, Method, Correlation with Returns).
df_correlation

Unnamed: 0,Country,Method,Correlation with Returns
0,Australia,Vader_Polarity,0.023871
1,Australia,LMD_Polarity,0.019424
2,Australia,FinBert_Polarity,0.005801
3,Australia,HIV4_Polarity,-0.005539
4,Canada,HIV4_Polarity,-0.004902
...,...,...,...
75,United Kingdom,Vader_Polarity,0.028748
76,United States,HIV4_Polarity,0.003358
77,United States,Vader_Polarity,0.000419
78,United States,FinBert_Polarity,-0.018033
