# Ke et al.(2020) ‘Predicting Returns with Text Data’ paper replication
## Zhongchen WANG (20745072)

In [1]:
# import packages
import numpy as np
import pandas as pd
import re
from tqdm import tqdm
import matplotlib.pyplot as plt
%matplotlib inline
pd.set_option('mode.chained_assignment', None)

# Data Pre-process

## Read dataset

In [None]:
# Load the daily stock file from the local parquet dump.
dsf = pd.read_parquet('./data/dsf.parquet.gzip')
# Keep only the security code, the date and the two return columns;
# `specret` is the column used below as the market-reaction label.
dsf = dsf[['SecuCode', 'date', 'ret', 'specret']]

In [None]:
# Load the report texts, discard the FYEAR column and de-duplicate the rows.
anatxt = (
    pd.read_parquet('./data/anatxt.parquet.gzip')
    .drop(columns=['FYEAR'])
    .drop_duplicates()
)
# Working table: one row per (ID, SecuCode, content, create_date), re-indexed from 0.
data = (
    anatxt[['ID','SecuCode','content','create_date']]
    .drop_duplicates()
    .reset_index(drop = True)
)

In [None]:
# Index returns: keep the date and the CSI 300 return column.
iret = pd.read_csv('./data/iret.csv')
iret = iret[['date', 'csi300t']]
# Parse strictly. The previous `errors='ignore'` (deprecated in pandas 2.2)
# returned the raw strings unchanged on any parse failure, which would make
# the later merge on `date` silently produce no matches.
iret['date'] = pd.to_datetime(iret['date'], format='%Y-%m-%d')
#iret.head()

## Pre-process 1 -- get two return labels: [t:t+1], [t+2:t+6]

### merge dataframe

In [None]:
# Attach the index return of the same date to every stock-day row
# (left join: stock rows without an index return keep NaN in `csi300t`).
DF_rt = pd.merge(dsf, iret, how = 'left', on = ['date'])

In [None]:
# Index-adjusted return: stock return minus the CSI 300 return of the same day.
DF_rt['adj_return'] = DF_rt.ret - DF_rt.csi300t
# `axis` must be passed by keyword (positional form was removed in pandas 2.0);
# `columns=` states the intent directly.
DF_rt = DF_rt.drop(columns=['ret', 'csi300t'])

In [None]:
# columns = SecuCode , date, specret
# `columns=` replaces the positional axis argument removed in pandas 2.0.
# Rows with any missing value are discarded and the index is rebuilt from 0.
DF_rt_spe = (
    DF_rt.drop(columns=['adj_return'])
    .dropna()
    .reset_index(drop = True)
)

### dealing with the trading date

In [None]:
# Sorted list of the distinct dates present in DF_rt_spe, as strings.
dt_list = (
    DF_rt_spe['date']
    .sort_values()
    .astype(str)
    .drop_duplicates()
    .tolist()
)

In [None]:
def timeindex(lt,delta = 1):
    """Map every date in ``lt`` to the date ``delta`` positions later.

    Parameters
    ----------
    lt : list of str
        Dates formatted as ``%Y-%m-%d``, in the order that defines "next".
        The list is not modified.
    delta : int
        Number of positions to look ahead.

    Returns
    -------
    pandas.DataFrame
        Column ``date`` (parsed datetimes) and column ``t+<delta>`` holding
        the date ``delta`` rows further down; the last ``delta`` rows are NaT.

    Raises
    ------
    ValueError
        If an entry of ``lt`` does not match ``%Y-%m-%d``.
    """
    # Parse strictly once, then shift. The earlier version padded with the
    # string 'NaN' and used errors='ignore', which is deprecated and would
    # hand back unparsed strings on any failure.
    dates = pd.to_datetime(pd.Series(list(lt), dtype=object), format='%Y-%m-%d')
    df = pd.DataFrame({'date': dates})
    df['t+'+str(delta)] = dates.shift(-delta)
    return df

In [None]:
# Look-ahead tables for each of the following 6 positions in dt_list.
(dt_df_t1, dt_df_t2, dt_df_t3,
 dt_df_t4, dt_df_t5, dt_df_t6) = [timeindex(dt_list, step) for step in range(1, 7)]

In [None]:
# Attach the t+1 ... t+6 date columns to every (SecuCode, date) row.
DF_final = DF_rt_spe
for shifted_dates in (dt_df_t1, dt_df_t2, dt_df_t3, dt_df_t4, dt_df_t5, dt_df_t6):
    DF_final = pd.merge(DF_final, shifted_dates, how = 'left', on = 'date')

In [None]:
# Rename the key so it matches the text table, then left-join the return rows
# onto the texts by security code and creation date.
DF_final = DF_final.rename(columns = {'date':'create_date'})
DF_final = pd.merge(data, DF_final, how = 'left', on = ['SecuCode','create_date'])

In [None]:
# Drop incomplete rows, order by creation date, rebuild the index and mark the
# same-day return as `specret_t`.
DF_final = (
    DF_final.dropna()
    .sort_values('create_date')
    .reset_index(drop = True)
    .rename(columns = {'specret':'specret_t'})
)

In [None]:
def dfmerge(DF_final,DF_rt_spe, delta):
    """Attach the return observed at ``t+<delta>`` to every row of ``DF_final``.

    Parameters
    ----------
    DF_final : pandas.DataFrame
        Must contain ``SecuCode`` and the date column ``t+<delta>``.
    DF_rt_spe : pandas.DataFrame
        Must contain ``SecuCode``, ``date`` and ``specret``.
    delta : int
        Look-ahead offset selecting which ``t+<delta>`` column to join on.

    Returns
    -------
    pandas.DataFrame
        ``DF_final`` plus a new column ``specret_t+<delta>`` (NaN when no
        matching row exists in ``DF_rt_spe``).
    """
    target = 't+'+str(delta)
    df_new = pd.merge(DF_final,DF_rt_spe,how = 'left',
                      left_on = ['SecuCode',target], right_on = ['SecuCode','date'])
    # `columns=` instead of the positional axis argument removed in pandas 2.0.
    df_new = df_new.drop(columns=['date'])
    df_new = df_new.rename(columns = {'specret':'specret_'+target})
    return df_new

In [None]:
# Join the t+1 ... t+6 returns one offset at a time.
DF_all_spe = DF_final
for delta in range(1, 7):
    DF_all_spe = dfmerge(DF_all_spe, DF_rt_spe, delta)

In [None]:
# Label 1: sum of the returns at t and t+1.
DF_all_spe['t:t+1'] = (DF_all_spe['specret_t'] + DF_all_spe['specret_t+1']).tolist()
# Label 2: sum of the returns at t+2 ... t+6, accumulated left to right.
later_sum = DF_all_spe['specret_t+2']
for offset in range(3, 7):
    later_sum = later_sum + DF_all_spe['specret_t+' + str(offset)]
DF_all_spe['t+2:t+6'] = later_sum.tolist()

### get dataframe with two labels

In [None]:
# Keep the text identifiers, the text itself and the two return labels.
DF_data = DF_all_spe[['ID', 'SecuCode', 'content', 'create_date', 't:t+1', 't+2:t+6']]

In [None]:
#DF_data.to_csv('./data_preprocess/DF_data_spe.csv',index=False,header=True)

## Pre-process 2 -- jieba cut for the texts

In [None]:
# Reload the labelled texts from the CSV checkpoint.
Data = pd.read_csv('./data_preprocess/DF_data_spe.csv')

### Divide content into sentences

In [None]:
# Characters removed before splitting: digits, whitespace, ASCII letters,
# percent signs (full- and half-width) and dots. Raw strings avoid the
# invalid-escape warnings the previous non-raw patterns triggered.
_NOISE_CHARS = re.compile(r'[\d\sa-zA-Z％%.]')
# Sentence terminators: 。 ！ ？ and the ASCII question mark.
_SENTENCE_END = re.compile(r'[。！？?]')

content_sent = []
for raw_text in tqdm(Data.content):
    text = _NOISE_CHARS.sub('', raw_text)
    # Brackets left empty once their content was stripped carry no information.
    text = text.replace('（）','').replace('()','')
    sentences = _SENTENCE_END.split(text)
    # Splitting a text that ends with a terminator leaves an empty tail; drop
    # only that. Previously the last element was always deleted, which threw
    # away a real sentence whenever the text did not end with a terminator.
    if sentences and not sentences[-1]:
        del sentences[-1]
    content_sent.append(sentences)

In [None]:
# Store the per-text sentence lists next to the original content.
Data['content_sent'] = content_sent

### Divide train_data and test_data
- train_data: 2010-01-01 -- 2014-12-31
- test_data: after 2015-01-01

In [None]:
# Split on the creation date (compared exactly as stored in `create_date`):
# 2010-01-01 .. 2014-12-31 inclusive for training, 2015-01-01 onwards for testing.
created = Data['create_date']
train_data = Data[created.between('2010-01-01', '2014-12-31')].reset_index(drop = True)
test_data = Data[created >= '2015-01-01'].reset_index(drop = True)

### Remove stopwords

In [None]:
# stop words and punctuations
# Read inside a `with` block so the file handle is closed deterministically
# (the previous bare open() was never closed).
with open('./data/cn_stopwords_only.txt', 'r', encoding = 'utf-8') as stopword_file:
    stopwords = [line.strip() for line in stopword_file]

In [None]:
def stop_removal(test_data):
    """Remove every entry of the global ``stopwords`` from each ``content_sent`` value.

    For each row, a stop word that is contained in the value (``in``) is
    removed with ``.replace``. Returns the cleaned values as a list, in row order.

    NOTE(review): ``.replace`` only exists on strings, so this presumably
    expects ``content_sent`` to hold strings at this point -- confirm.
    """
    column = test_data['content_sent']
    cleaned = []
    for row in tqdm(range(len(column))):
        text = column[row]
        for stop_word in stopwords:
            if stop_word in text:
                text = text.replace(stop_word,'')
        cleaned.append(text)
    return cleaned

In [None]:
# Stop-word removal for the test and the training sample.
no_stop = stop_removal(test_data)
no_stop_tr = stop_removal(train_data)

In [None]:
# Keep the stop-word-free text as a new column on both samples.
test_data['no_stop_cont'] = no_stop
train_data['no_stop_cont'] = no_stop_tr

### Content_split into word with POS

In [None]:
import jieba.posseg as pseg
def wd_split(test_data):
    """Segment every ``no_stop_cont`` value with jieba and tag parts of speech.

    Returns one list per row; each element is the string ``"<word>,<POS>"``
    built from the pairs yielded by ``pseg.cut(..., use_paddle=True)``.
    """
    column = test_data['no_stop_cont']
    tagged_rows = []
    for row in tqdm(range(len(column))):
        pairs = pseg.cut(column[row],use_paddle=True) #paddle mode
        tagged_rows.append([','.join([word, tag]) for (word, tag) in pairs])
    return tagged_rows

In [None]:
# Segment + POS-tag both samples and store the "<word>,<POS>" lists.
content = wd_split(test_data)
content_tr = wd_split(train_data)
test_data['content_split'] = content
train_data['content_split'] = content_tr

In [None]:
#train_data.to_parquet('./data_preprocess/train_data.parquet.gzip')
#test_data.to_parquet('./data_preprocess/test_data.parquet.gzip')

## Pre-process 3 -- content refinement

In [None]:
#train_data = pd.read_parquet('./data_preprocess/train_data.parquet.gzip')
#test_data = pd.read_parquet('./data_preprocess/test_data.parquet.gzip')

### sgn labels for train data

In [None]:
# Sign of the [t:t+1] return: 1 (positive), -1 (negative), 0 (exactly zero).
# Missing returns stay missing (np.nan). The previous version wrote the
# *string* 'NaN', which turned the column into mixed int/str objects and made
# later numeric comparisons such as `> 0` unsafe.
train_data['sgn_labels,t:t+1'] = train_data['t:t+1'].map(
    lambda x: np.nan if pd.isna(x) else (1 if x > 0 else (-1 if x < 0 else 0))
)

### data cleansing -- delete punctuation (eg: ,) (POS tag == 'x')

In [None]:
def select(a):
    """Filter a list of ``"<word>,<POS>"`` strings.

    An entry is kept when it does not contain the character ``'x'`` anywhere
    (word or tag) and is not exactly one character long. Order is preserved.
    """
    return [entry for entry in a if 'x' not in entry and len(entry) != 1]

In [None]:
def word_split(train_data):
    """Apply ``select`` to every row of ``content_split``.

    Each row value is converted with ``.tolist()`` first, so the column is
    expected to hold array-like objects. Returns one filtered list per row.
    """
    return [
        select(train_data.content_split[row].tolist())
        for row in tqdm(range(len(train_data)))
    ]

In [None]:
# Filtered "<word>,<POS>" lists for both samples, stored as column `words`.
word_tr = word_split(train_data)
word_test = word_split(test_data)
train_data['words'] = word_tr
test_data['words'] = word_test

### A new column: get words only, without POS

In [None]:
def concatwd(data):
    """Join ``"<word>,<POS>"`` entries into one space-separated string of words.

    Commas are removed and then every ASCII letter is stripped, which is what
    deletes the POS tags.
    """
    joined = ' '.join(data).replace(',','')
    return re.sub('[a-zA-Z]','',joined)  # remove POS

In [None]:
def words_noPOS(train_data):
    """Build, for every row, the space-separated word string without POS tags.

    Reads the ``words`` column (array-like per row, converted with
    ``.tolist()``) and passes each list through ``concatwd``.
    """
    return [
        concatwd(train_data.words[row].tolist())
        for row in tqdm(range(len(train_data)))
    ]

In [None]:
# Plain word strings (POS removed) for both samples.
noPOS_tr = words_noPOS(train_data)
noPOS_test = words_noPOS(test_data)
train_data['str_words'] = noPOS_tr
# Fixed: the test strings were previously written into train_data, which
# overwrote the training column and left test_data without `str_words`.
test_data['str_words'] = noPOS_test

In [None]:
#train_data.to_parquet('./data_preprocess/train_data.parquet.gzip')
#test_data.to_parquet('./data_preprocess/test_data.parquet.gzip')

## Pre-process 4 -- bag of words

### train_data_out2: for train data with return label outside of [-2%,2%]

In [None]:
# Training rows whose [t:t+1] label is at least 2% in absolute value
# (the comparisons are inclusive, so exactly +/-0.02 is kept).
train_data_out2 = train_data[(train_data['t:t+1']>=0.02)|(train_data['t:t+1']<=-0.02)].reset_index(drop = True)

### construct bag of words from all training sample
+ following procedures are the same for `train_data_out2` -- omit here

In [None]:
def bag_words_func(train_data):
    """Build the bag-of-words table from the ``words`` column.

    Every row of ``train_data.words`` is an array-like of ``"<word>,<POS>"``
    strings. The result has one row per distinct word with:

    - ``original`` / ``POS``: the most frequent ``"<word>,<POS>"`` variant,
    - ``counts``: occurrences of that variant,
    - ``#words``: occurrences of the word summed over all of its POS variants.
    """
    bag = []
    for i in tqdm(range(len(train_data))):
        bag.extend(train_data.words[i].tolist())

    # Series.value_counts replaces the deprecated top-level pd.value_counts.
    counts = pd.Series(bag, dtype=object).value_counts().to_frame('counts')

    bag_words = pd.DataFrame({'original': list(set(bag))})
    bag_words['word'] = bag_words['original'].map(lambda x:x.split(',')[0])
    bag_words['POS'] = bag_words['original'].map(lambda x:x.split(',')[1])
    bag_df = bag_words.join(counts, on = 'original').sort_values('counts',ascending = False)

    # Sum only the numeric `counts` column. Aggregating the whole frame also
    # "sums" (concatenates) the string columns on pandas >= 2.0, which broke
    # the single-column rename that followed.
    ct = (
        bag_df.groupby('word')[['counts']].sum()
        .sort_values('counts',ascending = False)
        .rename(columns = {'counts': '#words'})
    )
    # After the descending sort the first row per word is its most frequent variant.
    bag_df_final = bag_df.drop_duplicates('word').reset_index(drop = True)
    bag_df_final = bag_df_final.join(ct,on = 'word')
    return bag_df_final

In [None]:
# Bag of words over the whole training sample.
bag_df_final = bag_words_func(train_data)

In [None]:
# The save is commented out, so the read below replaces whatever is currently
# in `bag_df_final` with the version stored on disk.
#bag_df_final.to_csv('./data/word count/bag_df_final.csv',index=False,header=True, encoding = 'utf_8_sig')
bag_df_final = pd.read_csv('./data/word count/bag_df_final.csv')

### divide bag of words by different length

In [None]:
def word_cut(data, length = 2):
    """Select the rows of ``data`` whose ``word`` has the requested length.

    ``length == 4`` is special: it selects words of length 4 *or more*.
    Any other value selects words of exactly that length.
    """
    n_chars = data.word.str.len()
    keep = (n_chars >= length) if length == 4 else (n_chars == length)
    return data[keep]

In [None]:
def wordcut(bag_df_final,whole = ''):
    """Write one CSV per word length (1, 2, 3 and 4+), sorted by ``#words`` descending.

    ``whole`` is inserted before the ``.csv`` extension of every file name.
    Nothing is returned; the four files are the only output.
    """
    targets = (
        (1, './data/word count/DF_word_len1'),
        (2, './data/word count/DF_word_len2'),
        (3, './data/word count/DF_word_len3'),
        (4, './data/word count/DF_word_len4'),
    )
    # Cut and sort everything first so no file is written if a step fails.
    sorted_parts = [
        (prefix, word_cut(bag_df_final, n_chars).sort_values("#words", ascending = False))
        for n_chars, prefix in targets
    ]
    for prefix, part in sorted_parts:
        part.to_csv(prefix+whole+'.csv',index=False,header=True, encoding = 'utf_8_sig')

In [None]:
# Write the per-length word tables for the full training sample.
wordcut(bag_df_final)
# wordcut(bag_df_final_out2,whole = '_out2') for train_data_out2

### detailed POS exclusion rule of different word length
**two methods are adopted**
- method one: only select adj. and v. words
- method two: delete different POS of words for different word length

In [None]:
# Reload the full bag of words and the four per-length tables from disk.
wdcount_df = pd.read_csv('./data/word count/bag_df_final.csv')
cf_1 = pd.read_csv('./data/word count/DF_word_len1.csv')
cf_2 = pd.read_csv('./data/word count/DF_word_len2.csv')
cf_3 = pd.read_csv('./data/word count/DF_word_len3.csv')
cf_4 = pd.read_csv('./data/word count/DF_word_len4.csv')

In [None]:
# Method one: keep only rows whose POS tag is exactly 'a' or 'v'.
wdcount_df_av = wdcount_df[wdcount_df['POS'].isin(['a','v'])].reset_index(drop = True)

In [None]:
# Method two: per word length, drop the POS tags listed below, then stack the
# four filtered tables into one frame with a fresh 0..n-1 index.
cf_1_ex = cf_1.loc[~cf_1['POS'].isin(['w','r','ns','f','p','c','q','n','nr','m','u'])].reset_index(drop = True)
cf_2_ex = cf_2.loc[~cf_2['POS'].isin(['nw','u','m','t','TIME','nr','PER','nz','n','q','ORG','f','LOC','r','s'])].reset_index(drop = True)
cf_3_ex = cf_3.loc[~cf_3['POS'].isin(['s','r','LOC','f','ORG','nt','n','nz','PER','nr','TIME','t','m','nw'])].reset_index(drop = True)
cf_4_ex = cf_4.loc[~cf_4['POS'].isin(['s','LOC','f','ORG','n','nz','PER','nr','TIME','t','m','nw'])].reset_index(drop = True)
wdcount_df_ex = pd.concat([cf_1_ex,cf_2_ex,cf_3_ex,cf_4_ex],axis = 0, ignore_index = True)

In [None]:
# Persist the adjective/verb word table (UTF-8 with BOM).
wdcount_df_av.to_csv('./data/word count/wdcount_df_av.csv',index=False,header=True, encoding = 'utf_8_sig') 

In [None]:
# Persist the POS-exclusion word table (UTF-8 with BOM).
wdcount_df_ex.to_csv('./data/word count/wdcount_df_ex.csv',index=False,header=True, encoding = 'utf_8_sig') 

# Benchmark construction -- two github links
- benchmark 1: https://github.com/dictionaries2020/SentimentDictionaries
- benchmark 2: https://github.com/MengLingchao/Chinese_financial_sentiment_dictionary

## Read dataset -- benchmark 1

In [None]:
# annual report only
# Both sheets are read without a header row, so the first row is data.
git1 = pd.read_excel('./data/金融领域中文情绪词典.xlsx', sheet_name='年报负面',header = None)
git2 = pd.read_excel('./data/金融领域中文情绪词典.xlsx', sheet_name='年报正面',header = None)

In [None]:
# Reload the pre-processed test sample from its parquet checkpoint.
test_data = pd.read_parquet('./data_preprocess/test_data.parquet.gzip')

In [None]:
# Name the single word column, tag each list with its polarity
# (git1 -> -1, git2 -> +1) and stack both into one dictionary.
git1.columns = ['word']
git2.columns = ['word']
git1['sentiment'] = -1
git2['sentiment'] = 1
git_report = pd.concat([git1,git2],axis = 0, ignore_index = True)

## 12 scores definition:
- Score 1 = # positive words/# of words excluding stop words and symbols etc.
- Score 2 = (-1) * # negative words/# of words excluding stop words and symbols etc.
- Score 3 = Score 1+score 2
- Score 4-6 = change the denominator to (#positive+#negative words) in score 1-3.
- Score 7-9 = use # of sentences instead of # words in Score 1-3
- Score 10-12 = use # of sentences instead of # words in Score 4-6

## count of word

In [None]:
def sentiment_count(train_data, dic):
    """Count dictionary hits per document.

    For every value of ``train_data.content_sent`` and every dictionary word,
    ``value.count(word)`` is added to the positive total when the word's
    ``sentiment`` is > 0 and to the negative total otherwise.

    Returns ``(lt_p, lt_n)``: two lists with one total per document.
    """
    lexicon = list(zip(dic.word.tolist(), dic.sentiment.tolist()))
    documents = train_data.content_sent.tolist()

    lt_p = []
    lt_n = []
    for doc_idx in tqdm(range(len(documents)), position=0, leave=True):
        document = documents[doc_idx]
        pos_hits = 0
        neg_hits = 0
        for word, polarity in lexicon:
            hits = document.count(word)
            if polarity > 0:
                pos_hits += hits
            else:
                neg_hits += hits
        lt_p.append(pos_hits)
        lt_n.append(neg_hits)
    return lt_p, lt_n

In [None]:
# Positive / negative dictionary hits for every test document.
lt_test_p, lt_test_n = sentiment_count(test_data, git_report)

In [None]:
def words_count(data):
    """Return, per row, the number of pieces ``str_words`` splits into on a single space."""
    texts = data.str_words.tolist()
    lengths = []
    for row in tqdm(range(len(texts)), position=0, leave=True):
        pieces = texts[row].split(' ')
        lengths.append(len(pieces))
    return lengths

In [None]:
# Word count per test document (denominator of scores 1-3).
lt_all = words_count(test_data)

## score1 - score6 dataframe

In [None]:
def score_df(lt_test_p, lt_test_n, lt_all):
    """Build the word-level scores 1-6 from per-document counts.

    Parameters
    ----------
    lt_test_p, lt_test_n, lt_all : list
        Positive hits, negative hits and total word count per document.

    Returns
    -------
    pandas.DataFrame
        Columns ``#p``, ``#n``, ``#all`` and ``score1`` .. ``score6``:
        score1 = #p/#all, score2 = -#n/#all, score3 = score1 + score2,
        score4 = #p/(#p+#n), score5 = -#n/(#p+#n), score6 = score4 + score5.
        Undefined ratios (0/0) are reported as 0.
    """
    lt_test_df = pd.concat(
        [pd.DataFrame(lt_test_p), pd.DataFrame(lt_test_n), pd.DataFrame(lt_all)],
        axis = 1,
    )
    lt_test_df.columns = ['#p','#n','#all']

    lt_test_df['score1'] = lt_test_df['#p']/lt_test_df['#all']
    lt_test_df['score2'] = (-1) * lt_test_df['#n']/lt_test_df['#all']
    # fillna returns a new frame; the result used to be discarded, so 0/0
    # stayed NaN and propagated into score3.
    lt_test_df = lt_test_df.fillna(0)
    lt_test_df['score3'] = lt_test_df['score1'] + lt_test_df['score2']

    hits = lt_test_df['#p']+lt_test_df['#n']
    lt_test_df['score4'] = lt_test_df['#p']/hits
    lt_test_df['score5'] = (-1) * lt_test_df['#n']/hits
    lt_test_df = lt_test_df.fillna(0)
    lt_test_df['score6'] = lt_test_df['score4'] + lt_test_df['score5']
    return lt_test_df

In [None]:
# Word-level scores 1-6 for the test sample.
lt_test_df = score_df(lt_test_p, lt_test_n, lt_all)

## sentence level count

In [None]:
def sent_level(a, dic):
    """Classify every element of ``a`` by its balance of dictionary hits.

    For each element, occurrences (``.count``) of dictionary words with
    ``sentiment > 0`` are weighed against those of all other dictionary words.

    Returns ``(flags, n)`` where ``flags[k]`` is 1 (more positive hits),
    -1 (more negative hits) or 0 (tie), and ``n`` is ``len(a)``.
    """
    lexicon = list(zip(dic.word.tolist(), dic.sentiment.tolist()))
    flag_lt = []
    for sentence in a:
        balance = 0  # positive hits minus negative hits
        for word, polarity in lexicon:
            hits = sentence.count(word)
            balance += hits if polarity > 0 else -hits
        if balance > 0:
            flag_lt.append(1)
        elif balance < 0:
            flag_lt.append(-1)
        else:
            flag_lt.append(0)
    return flag_lt, len(a)

In [None]:
# Per test document: split `no_stop_cont` on ',' and classify every piece.
f_t = []      # list of per-piece flags (1 / -1 / 0) for each document
sen_all = []  # number of pieces for each document
for i in tqdm(range(len(test_data))):
    flags, n_pieces = sent_level(test_data.no_stop_cont[i].split(','), git_report)
    f_t.append(flags)
    sen_all.append(n_pieces)

In [None]:
# Number of positive (+1) and negative (-1) flags per document.
p_all = []
n_all = []
for i in tqdm(range(len(f_t))):
    doc_flags = f_t[i]
    p_all.append(doc_flags.count(1))
    n_all.append(doc_flags.count(-1))

## score7 - score12 dataframe

In [None]:
def score_sen_df(lt_test_p, lt_test_n, lt_all):
    """Build the sentence-level scores 7-12 from per-document counts.

    Parameters
    ----------
    lt_test_p, lt_test_n, lt_all : list
        Positive sentences, negative sentences and total sentences per document.

    Returns
    -------
    pandas.DataFrame
        Columns ``#sen_p``, ``#sen_n``, ``#sen_all`` and ``score7`` .. ``score12``,
        defined like scores 1-6 but on sentence counts. Undefined ratios (0/0)
        are reported as 0.
    """
    lt_test_df = pd.concat(
        [pd.DataFrame(lt_test_p), pd.DataFrame(lt_test_n), pd.DataFrame(lt_all)],
        axis = 1,
    )
    lt_test_df.columns = ['#sen_p','#sen_n','#sen_all']

    lt_test_df['score7'] = lt_test_df['#sen_p']/lt_test_df['#sen_all']
    lt_test_df['score8'] = (-1) * lt_test_df['#sen_n']/lt_test_df['#sen_all']
    # fillna is not in-place; its result used to be thrown away, leaving NaN
    # in the scores of documents with a zero denominator.
    lt_test_df = lt_test_df.fillna(0)
    lt_test_df['score9'] = lt_test_df['score7'] + lt_test_df['score8']

    flagged = lt_test_df['#sen_p']+lt_test_df['#sen_n']
    lt_test_df['score10'] = lt_test_df['#sen_p']/flagged
    lt_test_df['score11'] = (-1) * lt_test_df['#sen_n']/flagged
    lt_test_df = lt_test_df.fillna(0)
    lt_test_df['score12'] = lt_test_df['score10'] + lt_test_df['score11']
    return lt_test_df

In [None]:
# Sentence-level scores 7-12 for the test sample.
sen_df = score_sen_df(p_all, n_all, sen_all)

In [None]:
# Side-by-side table of all twelve scores, then the test sample plus scores.
# NOTE: this rebinds the name `score_df`, which until here referred to the
# function of the same name; re-running the score1-6 cell afterwards requires
# re-running the function definition first.
score_df = pd.concat([lt_test_df,sen_df],axis = 1)
# `axis` must be passed by keyword on current pandas (positional use is no longer accepted).
master_df = pd.concat([test_data,score_df],axis = 1)

In [None]:
# Persist the score table (CSV) and the combined test table (parquet).
score_df.to_csv('./data_preprocess/test_score1-12_df.csv',index=False,header=True)
master_df.to_parquet('./data_preprocess/master_df.parquet.gzip')

## correlation calculation

In [None]:
# Pearson correlation of the [t:t+1] label with each of the twelve scores
# (first row of the correlation matrix).
# NOTE(review): `update_df` is not defined in the visible part of this file --
# confirm where it comes from (possibly `master_df`).
update_df[['t:t+1','score1','score2','score3','score4','score5','score6','score7','score8','score9',
 'score10','score11','score12']].corr().iloc[0:1,:]

In [None]:
# Spearman rank correlation of the [t:t+1] label with each of the twelve scores.
# NOTE(review): `update_df` is not defined in the visible part of this file --
# confirm where it comes from (possibly `master_df`).
update_df[['t:t+1','score1','score2','score3','score4','score5','score6','score7','score8','score9',
 'score10','score11','score12']].corr(method = 'spearman').iloc[0:1,:]

In [None]:
# Pearson correlation of the [t+2:t+6] label with each of the twelve scores.
# NOTE(review): `update_df` is not defined in the visible part of this file --
# confirm where it comes from (possibly `master_df`).
update_df[['t+2:t+6','score1','score2','score3','score4','score5','score6','score7','score8','score9',
 'score10','score11','score12']].corr().iloc[0:1,:]

In [None]:
# Spearman rank correlation of the [t+2:t+6] label with each of the twelve scores.
# NOTE(review): `update_df` is not defined in the visible part of this file --
# confirm where it comes from (possibly `master_df`).
update_df[['t+2:t+6','score1','score2','score3','score4','score5','score6','score7','score8','score9',
 'score10','score11','score12']].corr(method = 'spearman').iloc[0:1,:]

## Read dataset -- benchmark 2

In [None]:
# Benchmark 2 dictionary: one sheet of negative and one of positive words.
git1_meng = pd.read_excel('./data/中文金融情感词典_姜富伟等(2020).xlsx', sheet_name='negative')
git2_meng = pd.read_excel('./data/中文金融情感词典_姜富伟等(2020).xlsx', sheet_name='positive')

### following procedure is the same code as the previous for benchmark 1
- omit here

## benchmark 1 correlation plot

In [None]:
# Reload the twelve benchmark scores and the simplified test sample.
# Both paths are absolute and specific to the author's machine.
score_df = pd.read_csv('/Users/evaking/desktop/NLP Project/data_preprocess/test_score1-12_df.csv')
# test data
test_data = pd.read_parquet('/Users/evaking/desktop/NLP Project/data_preprocess/test_data_simple.parquet.gzip')

In [None]:
# Copy score6 onto the test sample by position (rows are assumed to be in the same order).
test_data['score6'] = score_df.score6.tolist()

In [None]:
# `corre_df` is defined further down in this notebook (section "Correlation
# calculation in the time window"); that cell must have been run first.
corr_6 = corre_df(test_data,'score6')
corr_6.to_csv('/Users/evaking/desktop/NLP Project/data/correlation/corr_benchmarkscore6.csv',
            index=False,header=True, encoding = 'utf_8_sig')

# SESTM method
**Three different variations:**
- `av method`: only choose word marked with a and v in POS tag: adj and verb
- `ex method`: detailed POS tag word exclusion method for different word length
- `av_2p method`: chooses words of length >= 2 from the words selected by the av method

`The following codes are the same for different variations, so only show in one variation`

## Dataset simplification

In [None]:
# Drop the heavy text columns and store a slim copy of the training sample.
# Note the read uses an absolute path while the write uses a relative one.
train_data = pd.read_parquet('/Users/evaking/Desktop/NLP Project/data_preprocess/train_data.parquet.gzip')
train_data = train_data.drop(['content','content_sent','no_stop_cont','content_split','words'],axis = 1)
train_data.to_parquet('./data_preprocess/train_data_simple.parquet.gzip')

## Read data

In [None]:
# Slim training sample (absolute, machine-specific path).
train_data = pd.read_parquet('/Users/evaking/Desktop/NLP Project/data_preprocess/train_data_simple.parquet.gzip')

In [None]:
# Bag-of-words tables built from the whole training sample:
# `av` = adjectives/verbs only, `ex` = detailed POS exclusion.
wdcount_df_av = pd.read_csv('/Users/evaking/Desktop/NLP Project/data/word count/wdcount_df_av.csv')
wdcount_df_ex = pd.read_csv('/Users/evaking/Desktop/NLP Project/data/word count/wdcount_df_ex.csv')

## pi_hat

In [None]:
def pi_hat_func(train_data, p = True):
    """Return the sum of the positive sign labels divided by the number of rows.

    With labels in {1, 0, -1} this is the share of rows with a positive
    ``sgn_labels,t:t+1``. The value is printed when ``p`` is true.
    """
    labels = train_data['sgn_labels,t:t+1']
    pi_hat = labels[labels > 0].sum() / train_data.shape[0]
    if p:
        print('pi_hat is:', pi_hat)
    return pi_hat

In [None]:
# Share of positive-return articles in the training sample (printed as a side effect).
pi_hat = pi_hat_func(train_data)

## Step 1 -- Screening for Sentiment-Charged words

### 1.3 Choose one kappa -- threshold

In [None]:
# count of words distribution
def kappaFunc(data,quant = 0.94):
    kappa = data['#words'].quantile(quant)
    print('The value of kappa is: ', kappa)
    return kappa

In [None]:
# Threshold kappa for the `av` word table: the default (0.94) quantile of `#words`.
kappa_av = kappaFunc(wdcount_df_av)
# for wdcount

In [None]:
# Keep only words whose total count is strictly above kappa.
wdcount_df_av = wdcount_df_av[wdcount_df_av['#words'] > kappa_av].reset_index(drop = True)

### 1.1 calculates the frequency with which word j co-occurs with a positive return

In [None]:
def wd_freq_variant(df,train_data):
    """Count, per word, the documents containing it and how many of those are positive.

    Containment is a plain substring test (``word in str_words``), so a word
    also matches inside longer words.

    Returns ``(word_list, ct, ct_sgn1)``: the words of ``df.word``, the number
    of documents containing each word, and the number of those documents whose
    ``sgn_labels,t:t+1`` is > 0.
    """
    word_list = df.word.tolist()
    documents = list(zip(train_data.str_words.tolist(),
                         train_data['sgn_labels,t:t+1'].tolist()))

    ct = [0] * len(word_list)
    ct_sgn1 = [0] * len(word_list)

    for i in tqdm(range(len(word_list))):
        target = word_list[i]
        for text, label in documents:
            if target not in text:
                continue
            ct[i] += 1
            if label > 0:
                ct_sgn1[i] += 1
    return word_list, ct, ct_sgn1

In [None]:
# Document counts (total and positive-return) for every `av` word.
word_list_av,ct_av,ct_sgn1_av = wd_freq_variant(wdcount_df_av,train_data)

In [None]:
# variant method showed in the paper
def Freqfunc(word_list_ex,ct_sgn1_ex,ct_ex):
    Freq_ex = pd.DataFrame([word_list_ex, ct_sgn1_ex, ct_ex]).T
    Freq_ex.columns = ['word', 'k_j & y=1', 'k_j']
    Freq_ex = Freq_ex[~Freq_ex['k_j'].isin([0])]
    Freq_ex.reset_index(drop = True,inplace = True)
    Freq_ex['f_j'] = (Freq_ex['k_j & y=1'] / Freq_ex.k_j)#.replace(np.inf,0)
    return Freq_ex

In [None]:
# Frequency table for the `av` words, saved to an absolute, machine-specific path.
Freq_av = Freqfunc(word_list_av,ct_sgn1_av,ct_av)
Freq_av.to_csv('/Users/evaking/Desktop/NLP Project/data/Step1/Freq_av.csv',index=False,header=True, encoding = 'utf_8_sig')

### 1.3 Choose one kappa -- threshold

In [None]:
#Freq_av = pd.read_csv('./data/Step1/Freq_av.csv')
# Keep words whose document count k_j is strictly above kappa_av.
# NOTE(review): kappa_av was taken as a quantile of `#words` (total
# occurrences), while k_j counts documents -- confirm the scales are meant to match.
Freq_av = Freq_av[Freq_av.k_j>kappa_av].reset_index(drop = True)
#Freq_ex = Freq_ex[Freq_ex.k_j>kappa_ex].reset_index(drop = True)

### 1.2 Compare f_j with proper threshold: alpha+ and alpha-
`Here are also two variations: 500 and 100 words respectively for both positive and negative words`

`The following codes are the same for different variations, so only show in one variation`

In [None]:
def S_hat(Freq,pi_hat,wds=100):
    """Select the sentiment-charged words.

    Parameters
    ----------
    Freq : pandas.DataFrame
        Must contain ``f_j`` and ``k_j`` (see ``Freqfunc``).
    pi_hat : float
        Threshold on ``f_j``: ``f_j >= pi_hat`` counts as positive,
        ``f_j <= pi_hat`` as negative (a word with ``f_j == pi_hat`` can
        therefore appear on both sides).
    wds : int
        Maximum number of words kept per side, ranked by ``k_j`` descending.

    Returns
    -------
    pandas.DataFrame
        Positive words followed by negative words, with a ``sentiment``
        column holding ``'positive'`` / ``'negative'``.
    """
    S_hat_p = Freq[Freq.f_j >= (pi_hat)].sort_values('k_j',ascending = False)[:wds].reset_index(drop = True)
    S_hat_n = Freq[Freq.f_j <= (pi_hat)].sort_values('k_j',ascending = False)[:wds].reset_index(drop = True)
    selected = pd.concat([S_hat_p, S_hat_n], axis = 0).reset_index(drop = True)
    # Label by the number of words actually selected. Using `wds` for both
    # sides raised a length-mismatch error whenever fewer than `wds` words
    # qualified on either side.
    selected['sentiment'] = ['positive']*len(S_hat_p) + ['negative']*len(S_hat_n)
    return selected

In [None]:
# Up to 500 positive and 500 negative words for the `av` variant, saved to disk.
S_hat_av_500 = S_hat(Freq_av, pi_hat, wds = 500)
S_hat_av_500.to_csv('/Users/evaking/Desktop/NLP Project/data/Step1/S_hat_av_500.csv',index=False,header=True)

## Step 2 -- Learning Sentiment Topics

### 2.1 estimate H_hat

In [None]:
def sentwdbag(sentlist,train_data):
    """Count every sentiment word in every document (word-major layout).

    Returns a list with one entry per word of ``sentlist.word``; each entry is
    the list of ``str_words.count(word)`` values over all documents, i.e. the
    result is indexed ``[word][document]``.
    """
    sentiment_words = sentlist.word.tolist()
    documents = train_data.str_words.tolist()
    counts_by_word = []
    for w in tqdm(range(len(sentiment_words))):
        term = sentiment_words[w]
        counts_by_word.append([text.count(term) for text in documents])
    return counts_by_word

In [None]:
# Word-by-document count matrix (as nested lists) for the selected words.
SentCharged_bag_av_500 = sentwdbag(S_hat_av_500,train_data)

In [None]:
def hhat_func(sentwdbag, index, method = 'zero'):
    """Turn word-by-document counts into per-document shares.

    ``sentwdbag`` is laid out ``[word][document]``; ``index`` labels the rows.

    - ``method='zero'``: each column is divided by its sum; columns whose sum
      is 0 become all zeros.
    - ``method='drop'``: same division, but columns containing NaN (sum 0)
      are removed.
    - anything else: each column is divided by the value taken from the
      global ``ct_a`` for that document; NaN is replaced by 0.
    """
    d_S_hat = pd.DataFrame(sentwdbag)
    d_S_hat.index = index

    if method in ('zero', 'drop'):
        # Column-wise normalisation by the column total.
        shares = d_S_hat / d_S_hat.sum(axis = 0)
        if method == 'zero': # setting hi to 0 if si_hat==0
            return shares.fillna(0)
        return shares.dropna(axis = 1) # remove articles with s_i_hat = 0

    # replace s_i_hat with count of all words (instead of sentiment charged words)
    # for all articles (including those with s_i_hat > 0)
    d_S_hat.loc['all'] = ct_a.values.tolist()[0]
    H_hat = d_S_hat.apply(lambda x: x/x['all'], axis = 0)
    d_S_hat.drop(['all'],axis = 0,inplace = True)
    H_hat.drop(['all'],axis = 0,inplace = True)
    return H_hat.fillna(0)

In [None]:
# H_hat for the `av` / 500-word variant (default method 'zero'), saved as .npy.
# The save path spells the folder `Data`, other cells use `data`; this only
# matters on case-sensitive file systems.
index_av_500 = S_hat_av_500.word.tolist()
H_hat_av_500 = hhat_func(SentCharged_bag_av_500, index = index_av_500)
np.save('/Users/evaking/Desktop/NLP Project/Data/Step2/array_H_hat_av_500.npy',H_hat_av_500)

### 2.2 calculate W_hat

In [None]:
def W_hat_func(train_data, rank = 'big',label = 't:t+1'):
    """Build the 2 x n matrix ``[p_hat, 1 - p_hat]`` from return ranks.

    ``p_hat`` is the rank of ``train_data[label]`` divided by the number of
    rows. With ``rank='small'`` the highest return gets rank 1; with any other
    value (the default, marked as the correct one) the lowest return gets rank 1.
    """
    # this is the correct one: ascending ranks unless 'small' is requested
    ascending = rank != 'small'
    share = train_data[label].rank(ascending = ascending) / len(train_data)
    return np.array([list(share), list(1 - share)])

In [None]:
# W_hat with ascending ranks, saved as .npy.
W_hat_big = W_hat_func(train_data,rank = 'big')
np.save('/Users/evaking/desktop/NLP Project/Data/Step2/W_hat_big.npy',W_hat_big)

### 2.3 calculate O_hat

In [None]:
def ohatfunc(H_hat, W_hat):
    """Estimate O_hat = H W' (W W')^-1, clip negatives to 0 and normalise columns.

    Returns a DataFrame with one row per row of ``H_hat`` and one column per
    row of ``W_hat``; every column is divided by its own sum.
    """
    projection = np.dot(H_hat, W_hat.T)
    gram_inverse = np.linalg.inv(np.dot(W_hat, W_hat.T))
    raw = np.dot(projection, gram_inverse)

    topics = pd.DataFrame(raw.tolist())
    topics = topics.mask(topics < 0, 0)   # negative entries are set to zero
    return topics / topics.sum(axis = 0)  # each column sums to one

In [None]:
# Transposed so that the result has one row per column of ohatfunc's output.
O_hat_av_500 = ohatfunc(H_hat_av_500, W_hat_big).T # transpose matters

np.save('/Users/evaking/desktop/NLP Project/Data/Step2/list_O_hat_av_500.npy',O_hat_av_500)

## Step 3 -- Scoring New Articles

In [None]:
# Simplified test sample used for scoring (absolute, machine-specific path).
test_data = pd.read_parquet('/Users/evaking/desktop/NLP Project/data_preprocess/test_data_simple.parquet.gzip')
#S_hat_av_500 = pd.read_csv('./data/Step1/S_hat_av_500.csv')

## get d first

In [None]:
def d_func(sentlist,data):
    """Count every sentiment word in every document (document-major layout).

    Returns a list with one entry per document of ``data.str_words``; each
    entry lists ``text.count(word)`` for the words of ``sentlist.word`` in
    order, i.e. the result is indexed ``[document][word]``.
    """
    sentiment_words = sentlist.word.tolist()
    documents = data.str_words.tolist()
    counts_by_doc = []
    for k in tqdm(range(len(documents))):
        text = documents[k]
        counts_by_doc.append([text.count(term) for term in sentiment_words])
    return counts_by_doc

In [None]:
# Document-by-word counts for the test sample.
d_av_500 = d_func(S_hat_av_500,test_data) # list
np.save('/Users/evaking/desktop/NLP Project/data/Step3/array_d_av_500',d_av_500) # save in .npy format

## likelihood function

In [None]:
def llh(p):
    """Negative penalised log-likelihood of sentiment score ``p`` for one article.

    Reads four module-level names: ``O`` (rows 0 and 1 are mixed with weights
    ``p`` and ``1-p``), ``d`` (per-article word counts), ``flag`` (index of
    the article currently being scored) and ``lamb`` (penalty weight).

    Returns the value to be *minimised*: minus (count-weighted mean log
    probability + ``lamb * log(p*(1-p))``).
    """
    # Count-weighted sum of log word probabilities for article `flag`.
    summ = sum(np.log(p * O[0] + (1-p) * O[1])*d[flag])
    if sum(d[flag]):
        likelihood = (summ / sum(d[flag]) + lamb * np.log(p*(1-p)))*(-1) # times (-1) for later minimize func
    else:
        # Article without any counted word: only the penalty term remains.
        likelihood = (lamb * np.log(p*(1-p)))*(-1) # times (-1) for later minimize func
    return likelihood

## prediction

In [None]:
def pre_score(bnds=(0,1)):
    """Score every article in the global ``d`` by minimising ``llh`` over ``p``.

    ``llh`` selects the current article through the module-level counter
    ``flag``; this function drives that counter from 0 to ``len(d)``.

    Parameters
    ----------
    bnds : tuple
        Lower and upper bound for ``p`` passed to the bounded scalar minimiser.

    Returns
    -------
    list
        The minimising ``p`` for each article, in the order of ``d``.
    """
    import scipy.optimize as opt # use optimization function
    global flag
    # Always start from the first article. Previously the caller had to reset
    # `flag` by hand; running this twice in a row walked past the end of `d`.
    flag = 0
    pre = []
    for _ in tqdm(range(len(d))):
        # get the independent variable
        p_optimal = opt.minimize_scalar(fun=llh, bounds=bnds,method = 'bounded').x
        pre.append(p_optimal)
        flag += 1 # go through all the articles in the testing sample
    return pre

In [None]:
# Globals read by llh / pre_score: d (test counts), O (O_hat), flag (article
# index, starts at 0) and lamb (penalty weight).
d = np.load('/Users/evaking/Desktop/NLP Project/data/Step3/array_d_av_500.npy')
O = np.load('/Users/evaking/Desktop/NLP Project/data/Step2/list_O_hat_av_500.npy')
flag = 0
lamb = 5
pre = pre_score()
np.save('/Users/evaking/desktop/NLP Project/data/Step3/p_pre/array_p_pre_av500',pre) # save in .npy format

## Correlation calculation in the time window
`two kinds of correlation`
- pearson correlation
- spearman correlation

`two time window`
- the correlation between predicted score at time t and return label time [t:t+1]
- the correlation between predicted score at time t and return label time [t+2:t+6]

In [None]:
# Attach the predicted scores to the test sample by position.
p_pre_av_500 = np.load('/Users/evaking/desktop/NLP Project/data/Step3/p_pre/array_p_pre_av500.npy').tolist()
test_data['p_pre_av_500'] = p_pre_av_500

In [None]:
def corre_df(data,name):
    """Expanding-window correlations between column ``name`` and the two return labels.

    At every row position ``i`` that is a multiple of 100 (``i > 1``) and at
    the last position, Pearson and Spearman correlations are computed on the
    first ``i`` rows (``data[:i]``, so row ``i`` itself is excluded) and
    stamped with ``create_date`` of row ``i-1``.

    Returns a DataFrame with columns ``date``, ``p:[t:t+1]``, ``p:[t+2:t+6]``,
    ``s:[t:t+1]``, ``s:[t+2:t+6]``.
    """
    columns = [name,'t:t+1','t+2:t+6']
    last_position = len(data)-1
    pearson_rows = []
    spearman_rows = []
    stamps = []
    for i in tqdm(range(len(data))):
        is_checkpoint = (i>1 and not i%100) or i==last_position
        if not is_checkpoint:
            continue
        window = data[columns][:i]
        pearson_rows.append(window.corr().iloc[0][1:].tolist())
        spearman_rows.append(window.corr(method = 'spearman').iloc[0][1:].tolist())
        stamps.append(data.create_date[i-1])
    df = pd.concat(
        [pd.DataFrame(stamps), pd.DataFrame(pearson_rows), pd.DataFrame(spearman_rows)],
        axis = 1,
    )
    df.columns = ['date','p:[t:t+1]','p:[t+2:t+6]','s:[t:t+1]','s:[t+2:t+6]']
    return df

In [None]:
# Expanding-window correlations of the SESTM score, saved as CSV (UTF-8 with BOM).
corr = corre_df(test_data,'p_pre_av_500')
corr.to_csv('/Users/evaking/desktop/NLP Project/data/correlation/corr_av500.csv',
            index=False,header=True, encoding = 'utf_8_sig')

# Extension -- pre-trained word embedding corpus

In [None]:
# Silence pandas' chained-assignment warning for the cells below.
pd.set_option('mode.chained_assignment', None)

## Data preparation

### Data preprocess for test_data

In [None]:
# Simplified test sample (absolute, machine-specific path).
test_data = pd.read_parquet('/Users/evaking/desktop/NLP Project/data_preprocess/test_data_simple.parquet.gzip')

In [None]:
def wordsfortest(test_data):
    """Evaluate every non-empty space-separated token of ``str_words``.

    Returns one list per row containing ``eval(token)`` for each token.
    """
    rows = test_data.str_words.tolist()
    words = []
    for i in tqdm(range(len(rows))):
        # SECURITY NOTE(review): eval() executes whatever expression the token
        # contains. This is only acceptable for trusted, self-produced data;
        # if the tokens are plain Python literals, ast.literal_eval would be
        # the safe replacement -- confirm the token format before changing.
        words.append([eval(token) for token in rows[i].split(' ') if token])
    return words

In [None]:
# Evaluated token lists for the test sample.
words_test = wordsfortest(test_data)

In [None]:
# Re-join the word lists of each test document into one space-separated string.
# NOTE(review): this reads a column named `new` that is not created in the
# visible part of this file, while `words_test` computed just before is not
# used -- confirm which one is intended.
sen_all = []
for i in tqdm(range(len(test_data))):
    sen_all.append(' '.join(test_data.new[i]))

In [None]:
# Cleaned words of the test sample, stored in the same column name
# (`str_words`) that the training sample uses.
test_data['str_words'] = sen_all

In [None]:
# Overwrite the simplified test-sample checkpoint with the updated table.
test_data.to_parquet('/Users/evaking/desktop/NLP Project/data_preprocess/test_data_simple.parquet.gzip')

### Read Data

In [None]:
# Reload both simplified samples from their parquet checkpoints.
train_data = pd.read_parquet('/Users/evaking/Desktop/NLP Project/data_preprocess/train_data_simple.parquet.gzip')
test_data = pd.read_parquet('/Users/evaking/desktop/NLP Project/data_preprocess/test_data_simple.parquet.gzip')

### get the pretrained data
1. Ngram2vec pre-trained word vectors
    - https://github.com/Embedding/Chinese-Word-Vectors/blob/master/README_zh.md
    - Word2vec / Skip-Gram with Negative Sampling (SGNS) : Financial News 金融新闻
2. DSG pre-trained word vectors
    - https://ai.tencent.com/ailab/nlp/en/embedding.html
    
`The following procedure is the same for above two methods so only show one here`

In [None]:
# Parse the embedding text file: first field of every line is the key, the
# remaining fields are stored as a float32 vector.
embeddings_dict = {}
with open('/Users/evaking/Desktop/NLP Project/data/sgns.financial.word.txt', 'r',encoding="utf-8") as file_to_read:
    for line in file_to_read:
        values = line.strip('\n').split()
        embeddings_dict[values[0]] = np.asarray(values[1:], "float32")

In [None]:
# Remove the entry stored under the key '467370' and use its first value as
# the vector dimension.
# NOTE(review): presumably this is the file's header line ("<vocab size> <dim>")
# that the loader parsed like a word; the hard-coded key only fits this exact file.
dim = embeddings_dict.pop('467370')
print('The number of words in the file:', len(embeddings_dict), 'and the dimension size:', dim[0])

### get the whole words in the training sample

In [None]:
# Flat list of every space-separated token in the training texts.
tt = train_data.str_words.tolist()
word_bag = [token for text in tt for token in text.split(' ')]

In [None]:
# Distinct tokens of the training sample.
bag = set(word_bag)
#len(bag) = 248043

### find the overlapped words for training sample

In [None]:
# Tokens that appear both in the embedding vocabulary and in the training sample.
embed_words = set(embeddings_dict)
overlap = embed_words & bag
#len(overlap) = 71475

In [None]:
oolist = list(overlap)
np.save('/Users/evaking/Desktop/NLP Project/data/Step1/overlap_w2v.npy',oolist)

### new word vectors only for the overlapped words

In [None]:
new_dict = {key: value for key, value in embeddings_dict.items() if key in overlap}

## construct article level vector

In [None]:
def article_vec(overlap,words):
    """Sum the word vectors of one article.

    overlap -- collection of tokens that have an embedding (membership test only)
    words   -- the article as a single space-separated token string

    Reads the module-level `dim` (dim[0] = vector length) and `new_dict`
    (token -> vector). Tokens outside `overlap` contribute nothing.
    Returns the summed vector as a plain list.
    """
    total = np.zeros(int(dim[0]))
    for token in words.split(' '):
        if token in overlap:
            total += new_dict[token]
    return total.tolist()

In [None]:
words = train_data.str_words.tolist()
article_vec_list = all_article_vec(overlap,words)
#np.save('/Users/evaking/desktop/NLP Project/Data/model extension/array_train.npy',article_vec_list)

In [None]:
H_hat_w2v = np.array(article_vec_list).T
#np.save('/Users/evaking/Desktop/NLP Project/Data/Step2/array_H_hat_w2v.npy',H_hat_w2v)

## calculate O_hat

In [None]:
W_hat_big = np.load('/Users/evaking/desktop/NLP Project/Data/Step2/W_hat_big.npy')
H_hat_w2v = np.load('/Users/evaking/Desktop/NLP Project/Data/Step2/array_H_hat_w2v.npy')

In [None]:
oolist = np.load('/Users/evaking/Desktop/NLP Project/data/Step1/overlap_w2v.npy')

In [None]:
# ohatfunc is the same as Step 2 -- 3.5.3 2.3 calculate O_hat
O_hat_w2v = ohatfunc(H_hat_w2v, W_hat_big).T # transpose matters
#np.save('/Users/evaking/desktop/NLP Project/Data/Step2/list_O_hat_w2v.npy',O_hat_w2v)

In [None]:
O_hat_w2v = np.load('/Users/evaking/desktop/NLP Project/Data/Step2/list_O_hat_w2v.npy')

## get article level vector for testing sample

### get the whole words in the testing sample

In [None]:
# Collect every token that occurs in the testing sample.
# The iteration must run over the *test* articles: the previous bound, len(tt), was the
# length of the training list, which skipped test articles (or raised IndexError)
# whenever the two samples differ in size.
tt_test = test_data.str_words.tolist()
word_bag_test = [token for article in tt_test for token in article.split(' ')]

In [None]:
bag_test = set(word_bag_test)
#len(bag_test) = 266567
# Drop the empty token that splitting on ' ' can produce.
# The earlier `del(list(bag_test)[0])` removed whichever element happened to come first;
# set iteration order is arbitrary, so an unpredictable (possibly real) word was deleted.
# NOTE(review): assumes the element meant to be removed was the empty string -- confirm.
lt = [token for token in bag_test if token != '']
bag_test_new = lt
# len(bag_test_new) = 266566

### find the overlapped words with training sample

In [None]:
embed_words = list(embeddings_dict)
embed_words = set(embed_words)
overlap_test = set(bag_test_new)&overlap
#len(overlap_test) = 58799

In [None]:
new_dict_test = {key: value for key, value in embeddings_dict.items() if key in overlap_test}

In [None]:
words_t = test_data.str_words.tolist()
article_vec_list_test = all_article_vec(overlap_test,words_t)
#np.save('/Users/evaking/desktop/NLP Project/Data/model extension/array_test.npy',article_vec_list_test)

In [None]:
array_test = np.load('/Users/evaking/desktop/NLP Project/Data/model extension/array_test.npy')

## SESTM method

In [None]:
# here no sum(d) -- seems make sense but the result is strange
def llh_extension(p):
    """Negative penalised log-likelihood of sentiment score `p` for one article.

    Reads the module-level globals O (O[0], O[1]: the two learned vectors),
    d (article vectors), flag (index of the current article) and lamb
    (penalty weight). The sign is flipped so that a minimiser can be used.
    """
    mixture = p * O[0] + (1-p) * O[1]
    weighted_log = sum(np.log(mixture)*d[flag])
    penalty = lamb * np.log(p*(1-p))
    # times (-1) for later minimize func
    return (weighted_log  + penalty)*(-1)

In [None]:
def pre_score_extension(bnds=(0,1)):
    """Estimate the sentiment score of every article in the global `d`.

    For each article the bounded scalar minimiser is run on `llh_extension`,
    which reads the article index from the module-level counter `flag`.

    bnds -- (lower, upper) search interval passed to the optimiser.
    Returns the list of optimal p values, one per article, in order.
    Side effect: `flag` is reset to 0 on entry and equals len(d) on return.
    """
    import scipy.optimize as opt # use optimization function
    global flag
    # Always start at the first article; without the reset a second call would begin
    # at len(d) and index past the end of `d`.
    flag = 0
    pre = []
    for _ in tqdm(range(len(d))):
        # get the independent variable
        p_optimal = opt.minimize_scalar(fun=llh_extension, bounds=bnds,method = 'bounded').x
        pre.append(p_optimal)
        flag += 1 # go through all the articles in the testing sample
    return pre

In [None]:
# Globals consumed by llh_extension / pre_score_extension:
#   d    -- article vectors of the testing sample
#   O    -- the saved O_hat array (rows 0 and 1 are used by llh_extension)
#   flag -- index of the article currently being scored
#   lamb -- weight of the log(p*(1-p)) penalty
d = np.load('/Users/evaking/desktop/NLP Project/Data/model extension/array_test.npy')
O = np.load('/Users/evaking/desktop/NLP Project/Data/Step2/list_O_hat_w2v.npy')
flag = 0
lamb = 5
pre = pre_score_extension()
#np.save('/Users/evaking/desktop/NLP Project/data/Step3/p_pre/array_p_pre_av500',pre) # save in .npy format

In [None]:
test_data['pre_sestm_w2v'] = pre # the result is strange -- many are the same

## Direct method
- calculate the distance between new article vector and learned O+ and O-

In [None]:
dis_pos = np.linalg.norm(array_test-O_hat_w2v[0],axis = 1)
dis_neg = np.linalg.norm(array_test-O_hat_w2v[1],axis = 1)

In [None]:
test_data['pre_w2v_diff'] = (dis_pos-dis_neg).tolist()
test_data['pre_w2v_diff_np'] = (dis_neg-dis_pos).tolist()
test_data['pre_w2v_pos'] = (dis_pos).tolist()
test_data['pre_w2v_neg'] = (dis_neg).tolist()

In [None]:
test_data.corr().loc[['t:t+1','t+2:t+6']]
# correlation 
#pre_w2v_diff  t:t+1 -0.058777 t+2:t+6 -0.000974
#pre_w2v_diff_np  t:t+1 0.058777 t+2:t+6 0.000974  this one is selected for the final comparison
#pre_w2v_pos t:t+1 0.005595 t+2:t+6 0.001548
#pre_w2v_neg t:t+1 0.005701 t+2:t+6 0.001551

In [None]:
corr_np = corre_df(test_data,'pre_w2v_diff_np')
corr_np.to_csv('/Users/evaking/desktop/NLP Project/data/correlation/corr_w2v_dis.csv',
            index=False,header=True, encoding = 'utf_8_sig')

## Jump directly to prediction
### Model selection summary
`Regression`

    - Linear Regression
    - SVR
    - Random Forest -- too slow -- ignore
    
`Classification`

    - Logistic Regression
    - KNN
    - SVM -- too slow -- ignore

In [None]:
import time

### Simple Linear Regression

In [None]:
from sklearn import linear_model
# Create linear regression object
regr = linear_model.LinearRegression()

In [None]:
# Train the model using the training sets
regr.fit(article_vec_list, return_y)

In [None]:
# Make predictions using the testing set
y_pred = regr.predict(article_vec_list_test)
test_data['pre_w2v_LR'] = y_pred

In [None]:
# correlation: t:t+1 0.076263  t+2:t+6 0.018424
corr_LR = corre_df(test_data,'pre_w2v_LR')
corr_LR.to_csv('/Users/evaking/desktop/NLP Project/data/correlation/corr_n2v_LR.csv',
            index=False,header=True, encoding = 'utf_8_sig')

### SVM regression

In [None]:
from sklearn import svm
model_SVR = svm.SVR()

In [None]:
# Train the model using the training sets
model_SVR.fit(article_vec_list, return_y)

In [None]:
# Make predictions using the testing set
y_pred_SVR = model_SVR.predict(article_vec_list_test)
test_data['pre_w2v_SVR'] = y_pred_SVR

In [None]:
# correlation: t:t+1 0.082435  t+2:t+6 0.027413
corr_SVR = corre_df(test_data,'pre_w2v_SVR')
corr_SVR.to_csv('/Users/evaking/desktop/NLP Project/data/correlation/corr_n2v_SVR.csv',
            index=False,header=True, encoding = 'utf_8_sig')

### Logistic Regression
`threshold change`
- threshold 0
- three categories
- threshold top 30% and bottom 30%

In [None]:
logistic = linear_model.LogisticRegression(max_iter=1000)
logistic_30 = linear_model.LogisticRegression(max_iter=1000)
logistic_out = linear_model.LogisticRegression(max_iter=1000)

In [None]:
# threshold 0
return_y_classification_0 = [1  if x >=0 else 0 for x in return_y]

In [None]:
logistic.fit(article_vec_list, return_y_classification_0)

In [None]:
logit_pred = logistic.predict_proba(article_vec_list_test)
test_data['pre_w2v_logit_1'] = logit_pred.T[1].tolist()

In [None]:
corr_logit_1 = corre_df(test_data,'pre_w2v_logit_1')
corr_logit_1.to_csv('/Users/evaking/desktop/NLP Project/data/correlation/corr_n2v_logit.csv',
            index=False,header=True, encoding = 'utf_8_sig')

In [None]:
# top 30%
top30 = np.quantile(np.array(return_y), .70)
# bottom 30%
bottom30 = np.quantile(np.array(return_y), .30)

In [None]:
# three categories
return_y_classification_30 = [1 if x >=top30 else -1 if x<=bottom30 else 0 for x in return_y]

In [None]:
logistic_30.fit(article_vec_list, return_y_classification_30)

In [None]:
logit_pred_30 = logistic_30.predict_proba(article_vec_list_test)
test_data['pre_w2v_logit_1_30'] = logit_pred_30.T[2].tolist()

In [None]:
# top and bottom
only30 = [i for i in range(len(return_y_classification_30)) if return_y_classification_30[i] == 0]

In [None]:
array_train = np.load('/Users/evaking/desktop/NLP Project/Data/model extension/array_train.npy')
array_test = np.load('/Users/evaking/desktop/NLP Project/Data/model extension/array_test.npy')

In [None]:
# Keep only the training articles labelled +1 / -1, i.e. drop the indices listed in
# only30 (label 0). A set gives O(1) membership tests; testing against the list made
# the two filters quadratic in the sample size.
# NOTE: array_train is rebound to the filtered list, so reload it before re-running.
only30_lookup = set(only30)
array_train = [array_train[i] for i in tqdm(range(0, len(array_train), 1)) if i not in only30_lookup]
return_y_out = [return_y_classification_30[i] for i in tqdm(range(0, len(return_y_classification_30), 1)) if i not in only30_lookup]

In [None]:
logistic_out.fit(array_train, return_y_out)

In [None]:
logit_pred_out = logistic_out.predict_proba(array_test)
test_data['pre_w2v_logit_out'] = logit_pred_out.T[1].tolist()
np.save('/Users/evaking/desktop/NLP Project/data/Step3/p_pre/array_p_pre_logitc_out',logit_pred_out.T[1].tolist())

In [None]:
corr_logit_out = corre_df(test_data,'pre_w2v_logit_out')
corr_logit_out.to_csv('/Users/evaking/desktop/NLP Project/data/correlation/corr_n2v_logit_out.csv',
            index=False,header=True, encoding = 'utf_8_sig')

In [None]:
# correlation: 
# threshold 0
    # t:t+1 0.092801  t+2:t+6 0.019043
# three categories
    # t:t+1 0.094214  t+2:t+6 0.021277
# top and bottom 30%
    # t:t+1 0.095062  t+2:t+6 0.019215

### KNN

In [None]:
from sklearn.neighbors import KNeighborsClassifier

In [None]:
kneighbor = KNeighborsClassifier(500)
# threshold 0
time_start=time.time()
kneighbor.fit(article_vec_list, return_y_classification_0)
time_end=time.time()
print('time cost',time_end-time_start,'s')

In [None]:
time_start=time.time()
kneighbor_pred = kneighbor.predict_proba(article_vec_list_test)
test_data['pre_w2v_kneighbor_0'] = kneighbor_pred.T[1].tolist()
time_end=time.time()
print('time cost',time_end-time_start,'s')

In [None]:
corr_kneighbor_0 = corre_df(test_data,'pre_w2v_kneighbor_0')
corr_kneighbor_0.to_csv('/Users/evaking/desktop/NLP Project/data/correlation/corr_n2v_kneighbor.csv',
            index=False,header=True, encoding = 'utf_8_sig')

In [None]:
kneighbor = KNeighborsClassifier(500)
# threshold top30, bottom 30
time_start=time.time()
kneighbor.fit(array_train, return_y_out)
time_end=time.time()
print('time cost',time_end-time_start,'s')

In [None]:
time_start=time.time()
kneighbor_pred_out = kneighbor.predict_proba(array_test)
test_data['pre_w2v_kneighbor_out'] = kneighbor_pred_out.T[1].tolist()
time_end=time.time()
print('time cost',time_end-time_start,'s')

In [None]:
corr_kneighbor_out = corre_df(test_data,'pre_w2v_kneighbor_out')
corr_kneighbor_out.to_csv('/Users/evaking/desktop/NLP Project/data/correlation/corr_n2v_kneighbor_out.csv',
            index=False,header=True, encoding = 'utf_8_sig')

In [None]:
# correlation: 
# threshold 0
    # t:t+1 0.065391  t+2:t+6 0.006842
# top and bottom 30%
    # t:t+1 0.071280  t+2:t+6 0.006468

# Performance -- Correlation plot

## Read data

In [None]:
name = ['av100','av500','av100_2p','av500_2p','ex100','ex500',
        'ex100_2p','ex500_2p','git','benchmarkscore6',
        'n2v_dis','n2v_LR','n2v_SVR','n2v_logit','n2v_logit_out','n2v_kneighbor','n2v_kneighbor_out',
       'DSG_dis','DSG_LR','DSG_SVR','DSG_logit','DSG_logit_out','DSG_kneighbor','DSG_kneighbor_out']

In [None]:
sestm = ['av100','av500','av100_2p','av500_2p','ex100','ex500',
        'ex100_2p','ex500_2p','git','benchmarkscore6']
extension_w2v = ['av500_2p','git',
             'n2v_dis','n2v_LR','n2v_SVR','n2v_logit','n2v_logit_out','n2v_kneighbor','n2v_kneighbor_out']
extension_glove = ['av500_2p','git',
             'DSG_dis','DSG_LR','DSG_SVR','DSG_logit','DSG_logit_out','DSG_kneighbor','DSG_kneighbor_out']

In [None]:
# Load every correlation table into a module-level variable named corr_<method>
# (plus corr_<method>_out2 for the '2p' variants). Assigning through globals() creates
# the same variables as the former exec() calls without building code strings.
corr_path = "/Users/evaking/desktop/NLP Project/data/correlation/corr_{}.csv"
for n in name:
    # original and git
    globals()['corr_{}'.format(n)] = pd.read_csv(
        corr_path.format(n), parse_dates = ["date"],infer_datetime_format = True)
    # for training data: range outside of [-2%,2%]
    if '2p' in n:
        globals()['corr_{}_out2'.format(n)] = pd.read_csv(
            corr_path.format(n + '_out2'), parse_dates = ["date"],infer_datetime_format = True)

## Plot function

In [None]:
def plot_parms(plt,interval = 0.02):
    """Apply the common axis styling of the correlation plots to the current axes.

    plt      -- the matplotlib.pyplot module
    interval -- spacing between major y-axis ticks

    Sets the axis labels, tick font sizes, one major x tick per year and a dashed grid.
    """
    # Local imports keep the helper self-contained: MultipleLocator is used below but
    # is not imported in the visible part of the notebook.
    import matplotlib.dates as mdates
    from matplotlib.ticker import MultipleLocator

    label_font = {'family' : 'Times New Roman', 'size'   : 20}
    plt.xlabel('Year',fontdict=label_font)
    plt.ylabel('Correlation',fontdict=label_font)
    plt.xticks(size = 15)
    plt.yticks(size = 15)
    ax = plt.gca()
    ax.yaxis.set_major_locator(MultipleLocator(interval))
    ax.xaxis.set_major_locator(locator=mdates.YearLocator())
    plt.grid(linestyle='--')

In [None]:
def corr_plot(name, git = True, method = 'p'):
    """Draw the correlation time series of several methods on the global axes `ax`.

    name   -- list of method names; the table of method n is the module-level
              variable corr_<n> (and corr_<n>_out2 for the out-of-[-2%,2%] variant)
    git    -- if true, plot corr_<n> for every name; otherwise plot only the names
              containing '2p', each together with its corr_<n>_out2 table
    method -- column prefix selecting the correlation type ('p' or 's')

    Solid/dashed lines are the [t:t+1] / [t+2:t+6] horizons of corr_<n>;
    dotted/dash-dotted lines are the same horizons of corr_<n>_out2.
    """
    colors = ['b','g','r','c','m','y','darkorange','lightpink','k','darkgrey']
    tables = globals() # direct name lookup instead of eval() on a built string
    short = method+':[t:t+1]'
    long_ = method+':[t+2:t+6]'
    for i, n in enumerate(name):
        color = colors[i % len(colors)] # cycle so that more than 10 names cannot overflow
        nn = 'corr_{}'.format(n)
        if not git:
            if '2p' not in n:
                continue
            plot_data_out2 = tables['corr_{}_out2'.format(n)]
            ax.plot(plot_data_out2['date'],plot_data_out2[short],color,linestyle=':',label = nn+',[t:t+1],out2')
            ax.plot(plot_data_out2['date'],plot_data_out2[long_],color,linestyle='-.',label = nn+',[t+2:t+6],out2')
        plot_data = tables[nn]
        ax.plot(plot_data['date'],plot_data[short],color,label = nn+',[t:t+1]')
        ax.plot(plot_data['date'],plot_data[long_],color,linestyle = 'dashed',label = nn+',[t+2:t+6]')

## SESTM method variation

In [None]:
# Figure 1 (Pearson correlation): full-range panel on the left, zoomed panel on the right.
# corr_plot draws on the module-level name `ax`, so each subplot must be bound to `ax`
# before corr_plot is called.
fig = plt.figure(figsize=(16, 10), dpi=80)
ax = fig.add_subplot(1,2,1)
plt.title('Pearson Correlation change', fontdict={'family' : 'Times New Roman', 'size'   : 20})
plot_parms(plt)
plt.ylim(0,0.32)
# plot sestm method
corr_plot(sestm)
plt.legend()

# right panel: same series, finer y ticks and a narrower y range
ax = fig.add_subplot(1,2,2)
plt.title('close look at Pearson Correlation change', fontdict={'family' : 'Times New Roman', 'size'   : 20})
plot_parms(plt,interval = 0.01)
plt.ylim(0,0.10)
corr_plot(sestm)
#plt.legend(loc = 7, prop = {'size':7})
plt.legend(bbox_to_anchor=(0.6,0.4,0.3,0),ncol=1,loc=7,mode='expand',borderaxespad=2, prop = {'size':8})

plt.suptitle('SESTM method variation (Pearson Correlation)',fontsize=22)
plt.savefig('/Users/evaking/desktop/NLP Project/pics/SESTM method variation_pearson.png')

########################
# Figure 2 (Spearman correlation): same layout, using the 's' columns of each table
fig = plt.figure(figsize=(16, 10), dpi=80)

ax = fig.add_subplot(1,2,1)
plt.title('Spearman Correlation change', fontdict={'family' : 'Times New Roman', 'size'   : 20})
plot_parms(plt)
plt.ylim(-0.03,0.285)
corr_plot(sestm,method = 's')
plt.legend(prop = {'size':9})


ax = fig.add_subplot(1,2,2)
plt.title('close look at Spearman Correlation change', fontdict={'family' : 'Times New Roman', 'size'   : 20})
plot_parms(plt,interval = 0.01)
plt.ylim(-0.03,0.12)
corr_plot(sestm,method = 's')
plt.legend(bbox_to_anchor=(0.57,0.5,0.3,0),ncol=1,loc=7,mode='expand',borderaxespad=2, prop = {'size':8})

plt.suptitle('SESTM method variation (Spearman Correlation)',fontsize=22)
plt.savefig('/Users/evaking/desktop/NLP Project/pics/SESTM method variation_spearman.png')

## Model Extension -- n2v and DSG implementation
- only show n2v implementation here

In [None]:
# Figure 1 (Pearson correlation) for the methods listed in extension_w2v:
# full-range panel on the left, zoomed panel on the right.
# corr_plot draws on the module-level name `ax`, so each subplot is bound to `ax` first.
fig = plt.figure(figsize=(16, 10), dpi=80)

ax = fig.add_subplot(1,2,1)
plt.title('Pearson Correlation change', fontdict={'family' : 'Times New Roman', 'size'   : 20})
plot_parms(plt)
plt.ylim(-0.018,0.32)
# plot extension method
corr_plot(extension_w2v)
plt.legend(loc=1,prop = {'size':10})

# right panel: same series, finer y ticks and a narrower y range
ax = fig.add_subplot(1,2,2)
plt.title('close look at Pearson Correlation change', fontdict={'family' : 'Times New Roman', 'size'   : 20})
plot_parms(plt,interval = 0.01)
plt.ylim(-0.015,0.10)
corr_plot(extension_w2v)
#plt.legend(loc = 7, prop = {'size':7})
plt.legend(bbox_to_anchor=(0.6,0.48,0.3,0),ncol=1,loc=7,mode='expand',borderaxespad=2, prop = {'size':7})

plt.suptitle('Algorithm Extention -- ngram2vec implementation (Pearson Correlation)',fontsize=22)
plt.savefig('/Users/evaking/desktop/NLP Project/pics/Model Extention_pearson.png')

########################
# Figure 2 (Spearman correlation): same layout, using the 's' columns of each table
fig = plt.figure(figsize=(16, 10), dpi=80)

ax = fig.add_subplot(1,2,1)
plt.title('Spearman Correlation change', fontdict={'family' : 'Times New Roman', 'size'   : 20})
plot_parms(plt)
plt.ylim(-0.034,0.285)
corr_plot(extension_w2v,method = 's')
plt.legend(prop = {'size':9})


ax = fig.add_subplot(1,2,2)
plt.title('close look at Spearman Correlation change', fontdict={'family' : 'Times New Roman', 'size'   : 20})
plot_parms(plt,interval = 0.01)
plt.ylim(-0.034,0.14)
corr_plot(extension_w2v,method = 's')
plt.legend(bbox_to_anchor=(0.1,0.5,0.3,0),ncol=1,loc=7,mode='expand',borderaxespad=2, prop = {'size':8})

plt.suptitle('Algorithm Extention -- ngram2vec implementation (Spearman Correlation)',fontsize=22)
plt.savefig('/Users/evaking/desktop/NLP Project/pics/Model Extention_spearman.png')

## SESTM method variation -- range outside of [-2%,2%]

In [None]:
# Figure 1 (Pearson correlation). With git = False, corr_plot skips every name without
# '2p' and, for the remaining ones, draws both corr_<n> and corr_<n>_out2.
# corr_plot draws on the module-level name `ax`, so each subplot is bound to `ax` first.
fig = plt.figure(figsize=(16, 8), dpi=80)

ax = fig.add_subplot(1,2,1)
plt.title('Pearson Correlation change', fontdict={'family' : 'Times New Roman', 'size'   : 20})
plot_parms(plt)
plt.ylim(0,0.32)
corr_plot(name, git = False)
plt.legend()


# right panel: same series, finer y ticks and a narrower y range
ax = fig.add_subplot(1,2,2)
plt.title('close look at Pearson Correlation change', fontdict={'family' : 'Times New Roman', 'size'   : 20})
plot_parms(plt,interval = 0.01)
plt.ylim(0,0.10)
corr_plot(name, git = False)
plt.legend(loc = 7, prop = {'size':9})

plt.suptitle('SESTM method variation -- range outside of [-2%,2%] (Pearson Correlation)',fontsize=22)
plt.savefig('/Users/evaking/desktop/NLP Project/pics/out2_pearson.png')

########################
# Figure 2 (Spearman correlation): same layout, using the 's' columns of each table
fig = plt.figure(figsize=(16, 8), dpi=80)

ax = fig.add_subplot(1,2,1)
plt.title('Spearman Correlation change', fontdict={'family' : 'Times New Roman', 'size'   : 20})
plot_parms(plt)
plt.ylim(-0.03,0.3)
corr_plot(name, git = False,method = 's')
plt.legend()


ax = fig.add_subplot(1,2,2)
plt.title('close look at Spearman Correlation change', fontdict={'family' : 'Times New Roman', 'size'   : 20})
plot_parms(plt,interval = 0.01)
plt.ylim(-0.03,0.12)
corr_plot(name, git = False,method = 's')
plt.legend(bbox_to_anchor=(0.7,0.45,0.3,0),loc = 7, prop = {'size':9})

plt.suptitle('SESTM method variation -- range outside of [-2%,2%] (Spearman Correlation)',fontsize=22)
plt.savefig('/Users/evaking/desktop/NLP Project/pics/out2_spearman.png')

# Portfolio

In [None]:
import math

## Data pre-processing
- get the average predicted score for each stock at each day

In [None]:
# test_data
test_data = pd.read_parquet('/Users/evaking/Desktop/NLP Project/data_preprocess/test_data_simple.parquet.gzip')
# predicted score
pre_score = np.load('/Users/evaking/desktop/NLP Project/data/Step3/p_pre/array_p_pre_logitc_out.npy')

In [None]:
test_data['extension_w2v_logit_out'] = pre_score # also add other predicted score from different methods, omit here
test_data = test_data.drop(['ID','t:t+1','t+2:t+6','str_words'],axis = 1)

In [None]:
test_data = test_data.groupby(['SecuCode','create_date']).mean().reset_index()
test_data.create_date = pd.to_datetime(test_data.create_date, format='%Y-%m-%d', errors='ignore')

In [None]:
# if there is previous result
test = pd.read_parquet('/Users/evaking/desktop/NLP Project/data_preprocess/test_data_score.parquet.gzip')

In [None]:
test_data = pd.merge(test,test_data,how = 'left',on=['SecuCode','create_date'])
order = ['SecuCode', 'create_date', 'extension_w2v_logit_out','av_500_2p', 'av_100_2p', 'ex_500_2p',
       'ex_100_2p', 'year', 'month']
test_data = test_data[order]

In [None]:
#test_data.to_parquet('/Users/evaking/desktop/NLP Project/data_preprocess/test_data_score.parquet.gzip')

## Read data

In [None]:
msf = pd.read_csv('/Users/evaking/desktop/NLP Project/data/msf.csv')
msf_new = msf[['SecuCode','SecuAbbr','date','ret']]
msf_new.date = pd.to_datetime(msf_new.date, format='%Y-%m-%d', errors='ignore')
msf_new['year'] = msf_new.date.dt.year
msf_new['month'] = msf_new.date.dt.month

In [None]:
# choose score 6 as the sentiment score for the dictionary-based method
git1 = pd.read_parquet('/Users/evaking/desktop/NLP Project/data_preprocess/git1_score.parquet.gzip')
git2 = pd.read_parquet('/Users/evaking/desktop/NLP Project/data_preprocess/git2_score.parquet.gzip')
git1 = git1[['SecuCode','create_date','year','month','score6']]
git2 = git2[['SecuCode','create_date','year','month','score6']]

In [None]:
# choose av_500_2p and extension_w2v_logit_out score as the sentiment score for SESTM method and its extension
sestm = pd.read_parquet('/Users/evaking/desktop/NLP Project/data_preprocess/test_data_score.parquet.gzip')
av500_2p = sestm[['SecuCode','create_date','year','month','av_500_2p']]
extension_w2v_logit_out = sestm[['SecuCode','create_date','year','month','extension_w2v_logit_out']]
av500_2p.rename(columns={"av_500_2p": "score6"},inplace = True)
extension_w2v_logit_out.rename(columns={"extension_w2v_logit_out": "score6"},inplace = True)

## Data processing

In [None]:
def preprocess(data):
    """Aggregate article-level scores to one row per stock and calendar month.

    data -- frame with at least SecuCode, create_date and numeric score columns.

    Steps: average all numeric columns per (SecuCode, create_date), then average
    those daily values per (SecuCode, year, month). Returns one row per stock-month
    with the monthly means plus `create_date` = the last date of that month on which
    the stock has a score.
    """
    data = data.sort_values(['SecuCode','create_date']).reset_index(drop = True)
    # errors='ignore' is deprecated in pandas; unparseable dates would break the
    # `.dt` accessors below anyway, so let the parser raise at the source instead.
    data.create_date = pd.to_datetime(data.create_date, format='%Y-%m-%d')
    data.loc[:,'SecuCode'] = data.loc[:,'SecuCode'].astype('object')
    # average score for one day
    data = data.groupby(['SecuCode','create_date']).mean().reset_index()
    data['year'] = data['create_date'].dt.year
    data['month'] = data['create_date'].dt.month
    # to track the last date of one month
    df = data[['SecuCode','create_date','year','month']]
    df = df.drop_duplicates(['SecuCode','year','month'],keep = 'last')
    # average score for one month; create_date is excluded explicitly because newer
    # pandas also averages datetime columns, which would duplicate it in the merge
    new = data.drop(columns = ['create_date']).groupby(['SecuCode','year','month']).mean().reset_index()
    new = pd.merge(new,df,on = ['SecuCode','year','month'])
    return new

In [None]:
git1 = preprocess(git1)
git2 = preprocess(git2)
av500_2p = preprocess(av500_2p)
extension_w2v_logit_out = preprocess(extension_w2v_logit_out)

In [None]:
def previous_calendar(data,day = 30):
    """Add the calendar month/year lying `day // 30` months before each row.

    data -- frame with integer `year` and `month` columns; modified in place
    day  -- look-back in days, converted to whole months by integer division by 30

    Adds the columns 'pre_<day>d_month' and 'pre_<day>d_year' to `data` and returns
    those two columns. Year boundaries are handled for any look-back length (the
    earlier version only corrected month values 0, -1 and -2).
    """
    months_back = day//30
    month_col = 'pre_'+str(day)+'d_month'
    year_col = 'pre_'+str(day)+'d_year'
    # count months on one continuous axis so that floor division / modulo do the roll-over
    month_index = data.year*12 + (data.month-1) - months_back
    data[month_col] = month_index % 12 + 1
    data[year_col] = month_index // 12
    return data[[month_col,year_col]]

In [None]:
def df(data,score_list = ['score6']):
    """Return a copy of the key and score columns with an '_o' suffix on every name.

    As a side effect the 30- and 60-day look-back month/year columns are added to
    `data` itself (via previous_calendar).
    """
    for window in (30, 60):
        previous_calendar(data,day = window)
    keep = ['SecuCode','year','month'] + list(score_list)
    renamed = data[keep]
    renamed.columns = [col+'_o' for col in keep]
    return renamed

In [None]:
git1_original = df(git1)
git2_original = df(git2)
av500_2p_original = df(av500_2p)
extension_w2v_logit_out_original = df(extension_w2v_logit_out)

In [None]:
def match(git1,git1_original):
    """Attach each stock's score of one and two months earlier.

    git1          -- monthly frame carrying the pre_30d_* / pre_60d_* look-back columns
    git1_original -- the same scores with '_o'-suffixed column names

    Returns `git1` with two extra columns, score6_pre1m and score6_pre2m (NaN where
    the stock has no score in that earlier month).
    """
    keys = ['SecuCode','year','month']
    lagged = []
    for window, label in ((30, 'score6_pre1m'), (60, 'score6_pre2m')):
        prefix = 'pre_{}d_'.format(window)
        # rows whose look-back month exists in the original table
        hit = pd.merge(git1,git1_original,how = 'left',
                       left_on = ['SecuCode',prefix+'month',prefix+'year'],
                       right_on = ['SecuCode_o','month_o','year_o']).dropna()
        hit = hit.rename(columns={"score6_o": label})
        lagged.append(hit[keys + [label]])

    df = git1
    for piece in lagged:
        df = pd.merge(df,piece,how = 'left',on = keys)
    return df

In [None]:
def final_df(git1,git1_original):
    """Combine current and lagged scores with the return of the following month.

    Adds the 2- and 3-month average scores, then left-joins the global `msf_new`
    on (SecuCode, next year, next month) so that each row carries the return of
    the month after the score month.
    """
    matched = match(git1,git1_original)
    final = matched[['SecuCode','create_date','year','month','score6','score6_pre1m','score6_pre2m']]
    final['score6_pre1m_ave'] = final[['score6','score6_pre1m']].mean(axis = 1)
    final['score6_pre2m_ave'] = final[['score6','score6_pre1m','score6_pre2m']].mean(axis = 1)

    # calendar month following the score month (December rolls into next January)
    final['1m+month'] = final.month+1
    final['1m+year'] = final.year
    december = final['1m+month']==13
    final.loc[december,'1m+year'] = final.loc[december,'1m+year']+1
    final.loc[december,'1m+month'] = 1

    final = pd.merge(final,msf_new,how='left',
                     left_on = ['SecuCode','1m+year','1m+month'],right_on = ['SecuCode','year','month'])
    helper_cols = ['year_y','month_y','year_x','month_x','1m+month','1m+year']
    return final.drop(helper_cols,axis = 1)

In [None]:
git1_final = final_df(git1,git1_original)
git2_final = final_df(git2,git2_original)
av500_2p_final = final_df(av500_2p,av500_2p_original)
extension_w2v_logit_out_final = final_df(extension_w2v_logit_out,extension_w2v_logit_out_original)

In [None]:
def simple_df(data):
    """Reduce a final frame to the columns needed for the correlation step.

    Adds `year` / `month` (taken from create_date) to `data` in place, then returns
    a new frame without rows lacking `ret` and without the lagged-score, name and
    date columns.
    """
    stamps = data['create_date']
    data['year'] = stamps.dt.year
    data['month'] = stamps.dt.month
    unused = ['score6_pre1m','score6_pre2m','SecuAbbr','create_date','date']
    with_return = data.dropna(subset=['ret']).reset_index(drop=True)
    return with_return.drop(unused,axis = 1)

In [None]:
git1_final_ = simple_df(git1_final)
git2_final_ = simple_df(git2_final)
av500_2p_final_ = simple_df(av500_2p_final)
extension_w2v_logit_out_final_ = simple_df(extension_w2v_logit_out_final)

## Cross-sectional Correlation calculation

In [None]:
def cross_df(data,method = 'pearson'):
    """Time-series average of the monthly cross-sectional score/return correlation.

    data   -- frame with year, month, ret and the three score columns
    method -- correlation method passed to pandas ('pearson', 'spearman', ...)

    For each score column the correlation with `ret` is computed within every
    (year, month) group and then averaged over the months. Returns a one-row frame
    with the columns pre_30days, pre_60days and pre_90days.
    """
    # score column -> label of the look-back window it represents
    windows = [('score6','pre_30days'),
               ('score6_pre1m_ave','pre_60days'),
               ('score6_pre2m_ave','pre_90days')]
    averages = []
    for col, _ in windows:
        monthly = data.groupby(['year','month'])[[col,'ret']].corr(method = method)
        # The result is indexed by (year, month, variable). The rows labelled `col`,
        # column 'ret', hold corr(col, ret) of each month; selecting by label avoids
        # positional `[0]` indexing on a label-indexed Series.
        averages.append(monthly.xs(col,level = 2)['ret'].mean())
    pearson = pd.DataFrame([averages])
    pearson.columns = [label for _, label in windows]
    return pearson

In [None]:
def corr_df(data):
    """Stack the Pearson and Spearman results of cross_df into one two-row table."""
    methods = ['pearson', 'spearman']
    df = pd.concat([cross_df(data),cross_df(data,'spearman')])
    df.index = pd.Series(methods)
    return df

In [None]:
# SESTM method: extension_w2v_logit_out
corr_extension_w2v_logit_out = corr_df(extension_w2v_logit_out_final_)
corr_extension_w2v_logit_out

In [None]:
# SESTM method: av_500_2p
corr_av500_2p = corr_df(av500_2p_final_)
corr_av500_2p

In [None]:
# github link 1
corr_git1 = corr_df(git1_final)
corr_git1

In [None]:
# github link 2
corr_git2 = corr_df(git2_final)
corr_git2

## 11 portfolios

### 11 portfolios construction

In [None]:
def cum_return(av500_2p_30_hedge,name): # to get the compound return for the hedge portfolio
    """Turn monthly hedge returns into a cumulative compound-value series.

    av500_2p_30_hedge -- frame with `year`, `month` and exactly one return column
    name              -- column name given to the cumulative series

    Returns a frame with the columns ['date', name]; `date` is the string
    'year/month/last-day-of-month' and the value is the running product of
    (1 + monthly return). The input frame is not modified.
    """
    import calendar
    frame = av500_2p_30_hedge.copy() # do not add helper columns to the caller's frame
    # last calendar day of every month; zip pairs the values by position, so the
    # frame does not need a 0..n-1 index
    frame['day'] = [calendar.monthrange(y,m)[1] for y, m in zip(frame['year'],frame['month'])]
    frame['date'] = frame['year'].map(str)+"/"\
                    +frame['month'].map(str)+"/"\
                    +frame['day'].map(str)
    frame = frame.set_index('date')
    # axis must be passed by keyword: the positional form was removed in pandas 2.0
    frame = frame.drop(['year','month','day'],axis = 1)
    frame = frame+1 # for the use of compound calculation
    frame = frame.cumprod(axis = 0)
    frame = frame.reset_index()
    frame.columns = ['date',name]
    return frame

In [None]:
def portfolio(test_final,name,score = 'score'):
    """Sort stocks into score groups each month and summarise portfolio performance.

    test_final -- frame with SecuCode, create_date, ret and the score columns
    name       -- suffix for the result columns (e.g. '_av500_2p'); name[1:] prefixes
                  the hedge series name
    score      -- score column used for sorting ('score6', 'score6_pre1m_ave' or
                  'score6_pre2m_ave'); its first 6 characters select the lagged
                  columns that are dropped

    Returns (result, hedge):
      result -- annualised return, risk and Sharpe ratio per group, plus row 11 for
                the hedge portfolio (highest group minus lowest group of each month)
      hedge  -- cumulative compound value of the hedge portfolio (see cum_return)
    """
    base = score[:6]
    test_ls = test_final.drop([base+'_pre1m',base+'_pre2m'],axis = 1)
    test_ls['year'] = test_ls.create_date.dt.year
    test_ls['month'] = test_ls.create_date.dt.month

    test_ls = test_ls.dropna(subset = ['ret',score]).reset_index(drop = True)

    test_ls_1m = test_ls[['SecuCode',score,'ret','year','month']]

    #higher the score,higher the pct
    # Rank only the score column: ranking/averaging the whole frame also touches the
    # non-numeric SecuCode column, which newer pandas refuses to aggregate.
    pct = test_ls_1m.groupby(['year','month'])[score].rank(ascending=True,method='dense',pct = True)
    test_ls_1m_pct = pct.to_frame(name = score+'_pct')
    # allocate group: pct in (0, 1] -> ceil(pct*10) in 1..10
    test_ls_1m_pct['group'] = [math.ceil(a) for a in pct*10]
    test_ls_1m_pct['group'] = test_ls_1m_pct['group'].astype('int')

    # group for each stock at each month
    df = pd.merge(test_ls_1m,test_ls_1m_pct,left_index = True,right_index = True)
    #df = df.dropna(subset = ['ret']).reset_index(drop = True)

    # equal-weighted return of every group in every month
    return_df = df.groupby(['year','month','group'])[['ret']].mean().reset_index()

    # hedged portfolio: groups are sorted within each month, so last/first are the
    # highest/lowest group present in that month
    by_month = return_df.groupby(['year','month'])[['ret']]
    hedge = by_month.last()-by_month.first()
    ret = hedge.ret.mean()*12
    v = hedge.ret.std()*math.sqrt(12) #volatility
    sr = ret/v  #sharpe ratio

    # cumulative compound return
    hedge = hedge.reset_index()
    if score == 'score6':
        day = 30
    elif score == 'score6_pre1m_ave':
        day = 60
    else:
        day = 90
    hedge = cum_return(hedge,name[1:]+'_'+str(day)+'_hedge')

    # annualized return, risk, sharpe ratio
    by_group = return_df.groupby('group')[['ret']]
    annual = by_group.mean()*12
    annual.rename(columns = {'ret':'annual_return'+name},inplace = True)
    vol = by_group.std()*math.sqrt(12)
    vol.rename(columns = {'ret':'annual_risk'+name},inplace = True)
    result = pd.merge(annual,vol,left_index = True, right_index = True)
    result['Sharpe Ratio'+name] = result['annual_return'+name]/result['annual_risk'+name]

    # row 11 = the long-short hedge portfolio
    result.loc[11] = [ret,v,sr]

    return result,hedge

In [None]:
def get_result(av500_2p_final,name):
    """Run `portfolio` for the 30-, 60- and 90-day scores.

    Returns six values: the three result tables (30, 60, 90 days) followed by the
    three cumulative hedge series (30, 60, 90 days). Every caller unpacks six names
    in exactly this order; the earlier version returned three (result, hedge) pairs,
    which made that unpacking fail.
    """
    scores = ['score6','score6_pre1m_ave','score6_pre2m_ave']
    pairs = [portfolio(av500_2p_final,name = name,score = s) for s in scores]
    results = tuple(result for result, _ in pairs)
    hedges = tuple(hedge for _, hedge in pairs)
    return results + hedges

In [None]:
# SESTM method
av500_2p_30,av500_2p_60,av500_2p_90, av500_2p_30_hedge,av500_2p_60_hedge,av500_2p_90_hedge= get_result(av500_2p_final,name = '_av500_2p')
extension_w2v_logit_out_30,extension_w2v_logit_out_60,extension_w2v_logit_out_90,extension_w2v_logit_out_30_hedge,extension_w2v_logit_out_60_hedge,extension_w2v_logit_out_90_hedge = get_result(extension_w2v_logit_out_final,name = '_extension_n2v_logit_out')

In [None]:
# github link 1
git1_30,git1_60,git1_90,git1_30_hedge,git1_60_hedge,git1_90_hedge = get_result(git1_final,name = '_git1')

In [None]:
# github link 2
git2_30,git2_60,git2_90,git2_30_hedge,git2_60_hedge,git2_90_hedge = get_result(git2_final,name = '_git2')

### Result plot

#### cumulative return for hedged portfolio

In [None]:
def hedge_plot(av500_2p_30_hedge,extension_w2v_logit_out_30_hedge,git1_30_hedge,git2_30_hedge):
    """Join four cumulative hedge series on `date` into one frame for plotting.

    Each argument is a frame with a `date` column and one value column. The first
    frame defines the rows (left joins). `date` is converted to datetime when it
    can be parsed.
    """
    data = av500_2p_30_hedge
    for other in (extension_w2v_logit_out_30_hedge,git1_30_hedge,git2_30_hedge):
        data = pd.merge(data,other,how='left', on=['date'])
    # cum_return writes dates as 'year/month/day', which the former '%Y-%m-%d' parse
    # with errors='ignore' silently left as strings. Try the slash form first, keep
    # the dash form for compatibility, and leave the column untouched if neither fits.
    for fmt in ('%Y/%m/%d','%Y-%m-%d'):
        try:
            data['date'] = pd.to_datetime(data['date'], format=fmt)
            break
        except (ValueError, TypeError):
            continue
    return data#.set_index(['date'])

In [None]:
data_30 =hedge_plot(av500_2p_30_hedge,extension_w2v_logit_out_30_hedge,git1_30_hedge,git2_30_hedge)
data_60 =hedge_plot(av500_2p_60_hedge,extension_w2v_logit_out_60_hedge,git1_30_hedge,git2_60_hedge)
data_90 =hedge_plot(av500_2p_90_hedge,extension_w2v_logit_out_90_hedge,git1_90_hedge,git2_90_hedge)

In [None]:
def plot_parms():
    plt.xlabel('Year',fontdict={'family' : 'Times New Roman', 'size'   : 30})
    plt.ylabel('Portfolio Value',fontdict={'family' : 'Times New Roman', 'size'   : 30})
    plt.xticks(size = 15)
    plt.yticks(size = 15)
    ax = plt.gca()
    plt.grid(linestyle='--')

In [None]:
def plot_func(title,data,lt,size = 18):
    plt.title(title, fontdict={'family' : 'Times New Roman', 'size'   : 30})
    plot_parms()
    for i in range(len(lt)):
        ax.plot(data['date'],data[lt[i]],linestyle='-',label = lt[i])
    plt.legend(prop = {'size':size})

In [None]:
extension_compare = pd.merge(data_30[['date','extension_n2v_logit_out_30_hedge']],
        data_60[['date','extension_n2v_logit_out_60_hedge']],
        how='left', on=['date'])
extension_compare = pd.merge(extension_compare,
        data_90[['date','extension_n2v_logit_out_90_hedge']],
        how='left', on=['date'])

In [None]:
fig = plt.figure(figsize=(25, 18), dpi=80)
plt.suptitle('Cumulative compound return for hedged portfolios',fontsize=30)

ax = fig.add_subplot(2,2,1)
plot_func('previous 30 days',data_30,
          ['av500_2p_30_hedge','extension_n2v_logit_out_30_hedge','git1_30_hedge','git2_30_hedge'])

ax = fig.add_subplot(2,2,2)
plot_func('previous 60 days',data_60,
          ['av500_2p_60_hedge','extension_n2v_logit_out_60_hedge','git1_30_hedge','git2_60_hedge'])

ax = fig.add_subplot(2,2,3)
plot_func('previous 90 days',data_90,
          ['av500_2p_90_hedge','extension_n2v_logit_out_90_hedge','git1_90_hedge','git2_90_hedge'])

ax = fig.add_subplot(2,2,4)
plot_func('extension method previous days comparison',extension_compare,
          ['extension_n2v_logit_out_30_hedge','extension_n2v_logit_out_60_hedge','extension_n2v_logit_out_90_hedge'])
plt.savefig('/Users/evaking/desktop/NLP Project/pics/cumulative.png')

In [None]:
def plot_df(df1,df2,df3,df4,category = 'Sharpe Ratio'):
    """Collect one ``category`` column from each of four frames, aligned on index.

    The column picked from each frame is ``category`` plus a fixed suffix:
    ``_extension_n2v_logit_out`` (df1), ``_av500_2p`` (df2), ``_git1`` (df3)
    and ``_git2`` (df4). Frames are combined with index-on-index merges, so
    only index labels present in all four frames are kept.

    Returns
    -------
    pandas.DataFrame
        Four columns, in the order df1, df2, df3, df4.
    """
    def _on_index(left, right):
        # pd.merge's default (inner) join, keyed on both indexes.
        return pd.merge(left, right, left_index = True, right_index = True)

    extension_col = df1[[category+'_extension_n2v_logit_out']]
    sestm_col = df2[[category+'_av500_2p']]
    git1_col = df3[[category+'_git1']]
    git2_col = df4[[category+'_git2']]

    model_side = _on_index(extension_col, sestm_col)
    dictionary_side = _on_index(git1_col, git2_col)
    return _on_index(model_side, dictionary_side)

In [None]:
def plot(df,category,day = 30,out_dir = '/Users/evaking/desktop/NLP Project/pics/'):
    """Line-plot every column of ``df`` against its index and save the figure.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to plot; its index values are used as the x ticks.
    category : str
        Used in the title, as the y-axis label and in the output file name.
    day : int, optional
        Value shown in the title and appended to the output file name.
    out_dir : str, optional
        Directory prefix (including the trailing separator) for the PNG.
        Defaults to the location the notebook originally wrote to.

    The figure is written to ``out_dir + category + str(day) + '.png'``.
    """
    label_font = {'family' : 'Times New Roman', 'size'   : 20}
    df.plot(figsize=(12, 4),xticks=df.index.tolist())
    plt.title(category+' of different portfolios ('+str(day)+'days)')
    plt.xlabel('Group',fontdict=label_font)
    plt.ylabel(category,fontdict=label_font)
    plt.xticks(size = 15)
    plt.yticks(size = 15)
    plt.grid(linestyle='--')
    plt.legend()
    plt.savefig(out_dir+category+str(day)+'.png')

#### Sharpe Ratio

In [None]:
# Sharpe ratio per portfolio group, 30-day score.
df_sp = plot_df(extension_w2v_logit_out_30,av500_2p_30,git1_30,git2_30)
# Label spelled 'Sharpe Ratio' (was 'Sharpe Raio'); it appears in the title and the saved file name.
plot(df_sp,'Sharpe Ratio')

In [None]:
# Sharpe ratio per portfolio group, 60-day score.
df_sp_60 = plot_df(extension_w2v_logit_out_60,av500_2p_60,git1_60,git2_60)
# Label spelled 'Sharpe Ratio' (was 'Sharpe Raio'); it appears in the title and the saved file name.
plot(df_sp_60,'Sharpe Ratio',day = 60)

In [None]:
# Sharpe ratio per portfolio group, 90-day score.
# git1_90 is used here (was git1_60, which mixed a 60-day series into the 90-day chart).
df_sp_90 = plot_df(extension_w2v_logit_out_90,av500_2p_90,git1_90,git2_90)
# Label spelled 'Sharpe Ratio' (was 'Sharpe Raio'); it appears in the title and the saved file name.
plot(df_sp_90,'Sharpe Ratio',day = 90)

#### Annual Return

In [None]:
# Annual return per portfolio group, 30-day score.
df_r = plot_df(
    extension_w2v_logit_out_30, av500_2p_30, git1_30, git2_30,
    category='annual_return',
)
plot(df_r, 'annual_return')
# The 60- and 90-day charts are similar and are omitted here.

#### Annual Risk

In [None]:
# Annual risk per portfolio group, 30-day score.
df_risk = plot_df(
    extension_w2v_logit_out_30, av500_2p_30, git1_30, git2_30,
    category='annual_risk',
)
plot(df_risk, 'annual_risk')
# The 60- and 90-day charts are similar and are omitted here.

# Conclusion

## Result Description

**`method variation comparison`**
1. *500* words for positive and negative word each is better than *100* words each
2. adj + v is better than detailed exclusion
3. word length >=*2* is better than not selecting word length (for SESTM method only)
4. _500_ words each, using a. and v. whose word length is greater than or equal to two, performs best
5. github fixed dictionary + SESTM method performs worst

**`correlation result analysis`**

The correlation varies as time goes by.
- for the correlation between predicted score at time t and return label time *[t:t+1]*
    - At the beginning of the testing sample, the pearson correlation can be up to around *0.31*.
    - After around half a year, the pearson correlation drops sharply to around *0.08* on average for all methods and becomes stable afterwards.
    - The spearman correlation follows the same trend, but with a slightly lower peak at the beginning (around *0.28*) and a slightly higher correlation once the trend is stable (around *0.09*).

- for the correlation between predicted score at time t and return label time *[t+2:t+6]*
    - Follows the same trend as *[t:t+1]*, but the correlation drops earlier.
    - At the beginning of the testing sample, the pearson correlation can also be up to around *0.31*.
    - In the following months, the pearson correlation gradually drops to around *0.01* on average for all methods and becomes stable afterwards.
    - The spearman correlation follows the same trend, with a similar peak at the beginning (around *0.28*) but a relatively lower correlation once the trend is stable (around *-0.01*).
    
**`algorithm extension`**    
1. Comparison between using the whole training dataset and using only those with return outside range of *[-2%,2%]*
- for pearson correlation
    - Not much difference: in some periods the whole-training-dataset variant is slightly higher, while in other periods the outside-range variant is slightly higher; this holds for all methods and all time windows.

- for spearman correlation
    - In general, the correlation score of the outside range one is slightly lower than the whole one.

2. ngram2vec method implementation / DSG method implementation
- Both regression and classification methods are applied and logistic regression performs best
    
**`portfolio result analysis`**
- Adopt four methods in total to test the correlation between previous *1-3* month average predicted sentiment score and the following one month return for each stock.
- The four methods are as follows:
    - n2v logistic regression threshold _30%_
    - av_500_2p
    - github link _1_ with score _6_ (dictionary based method)
    - github link _2_ with score _6_ (dictionary based method)
- Construct _10_ portfolios and one hedged portfolio (denoted as portfolio _11_) based on the rank of the previous *1-3* month average predicted sentiment score
    - The portfolios are rebalanced every month.
    - The higher the portfolio number, the more positive the sentiment score.
    - Therefore, the hedged portfolio _11_ is constructed as (portfolio _10_ - portfolio _1_) at each rebalance date.

1. The correlation results show that the word2vec method performs best across the different methods and previous time windows.
2. The Sharpe ratio doesn't vary much among the first _10_ portfolios across the different methods and previous time windows, with an average of around _0.6_.
3. However, the Sharpe ratio drops at the hedged portfolio for every method and previous time window, by an amount that varies with the combination of method and time window.
4. The trend of the annual return is the same as that of the Sharpe ratio, and the average annual return among the first _10_ portfolios is around _0.16_.
5. Interestingly, the trend of the annual risk is very different from the previous two. Among the first _10_ portfolios, the higher the portfolio number, the higher the annual risk, which means that the more positive the sentiment of the stocks in the basket, the higher the risk of the portfolio. The hedged portfolio, since it adopts a hedging strategy, is expected to have the lowest risk, and the result confirms this expectation: its annual risk is the lowest among all _11_ portfolios.

    
## Result Conclusion
1. The algorithm extension applying ngram2vec method performs best.
2. The predicted score is most powerful over the very short time window *[t:t+1]*, while it is not significant over the relatively longer time window *[t+2:t+6]*.
3. The portfolios based on the sentiment score don't perform well, which may make sense, since we are back-testing the portfolios on a monthly scale, which is far beyond the predictive ability of the sentiment score.