In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
import re
import pandas as pd
from finbert_embedding.embedding import FinbertEmbedding
from transformers import BertTokenizer, BertForSequenceClassification
import numpy as np
from tqdm import tqdm
import glob
import os.path
from pandas import *
from collections import Counter
import time

In [2]:
current_data_directory = os.path.abspath("./data/sp500")
path_text = current_data_directory + "/text/"
path_to_id_csv = current_data_directory + "/df_100stocks_sp500.csv"


In [3]:
print(current_data_directory)

/Users/juliencyrusenyegue/Documents/GitHub/financial_econometrics_project/FinBert/data/sp500


In [4]:
data = read_csv(path_to_id_csv)
ID_sp500 = data['id'].tolist()

In [5]:
alphabets= "([A-Za-z])"
prefixes = "(Mr|St|Mrs|Ms|Dr)[.]"
suffixes = "(Inc|Ltd|Jr|Sr|Co)"
starters = "(Mr|Mrs|Ms|Dr|He\s|She\s|It\s|They\s|Their\s|Our\s|We\s|But\s|However\s|That\s|This\s|Wherever)"
acronyms = "([A-Z][.][A-Z][.](?:[A-Z][.])?)"
websites = "[.](com|net|org|io|gov)"

In [6]:
finbert = BertForSequenceClassification.from_pretrained('yiyanghkust/finbert-tone',num_labels=3)
tokenizer = BertTokenizer.from_pretrained('yiyanghkust/finbert-tone')

In [7]:
# Extract texts from the directory

def extract_texts(start, N):

    texts = []
    first_sentence = []
    articles = []
    list_ID = []
   
    list_tickers = os.listdir(path_text)
    list_tickers.remove('.DS_Store') # for mac
 
    for ticker in list_tickers:
        
        list_articles = glob.glob(path_text+ticker+"/*")
        
        ID = []
        for s in list_articles:
            
            test_id = int(s.replace(path_text+str(ticker)+"/", ""))
          
            if test_id in ID_sp500:
                
                with open(s) as f:
                    
                    x = re.sub(path_text+str(ticker)+"/","",s)
                    articles.append(x)
                    t = f.read()
                    texts.append(t)
        
                ID.append(test_id)
                  
        list_ID += ID
        
     
    return texts, list_ID


In [8]:
def split_into_sentences(text):
    text = " " + text + "  "
    text = text.replace("\n"," ")
    text = re.sub(prefixes,"\\1<prd>",text)
    text = re.sub(websites,"<prd>\\1",text)
    if "Ph.D" in text: text = text.replace("Ph.D.","Ph<prd>D<prd>")
    text = re.sub("\s" + alphabets + "[.] "," \\1<prd> ",text)
    text = re.sub(acronyms+" "+starters,"\\1<stop> \\2",text)
    text = re.sub(alphabets + "[.]" + alphabets + "[.]" + alphabets + "[.]","\\1<prd>\\2<prd>\\3<prd>",text)
    text = re.sub(alphabets + "[.]" + alphabets + "[.]","\\1<prd>\\2<prd>",text)
    text = re.sub(" "+suffixes+"[.] "+starters," \\1<stop> \\2",text)
    text = re.sub(" "+suffixes+"[.]"," \\1<prd>",text)
    text = re.sub(" " + alphabets + "[.]"," \\1<prd>",text)
    if "”" in text: text = text.replace(".”","”.")
    if "\"" in text: text = text.replace(".\"","\".")
    if "!" in text: text = text.replace("!\"","\"!")
    if "?" in text: text = text.replace("?\"","\"?")
    text = text.replace(".",".<stop>")
    text = text.replace("?","?<stop>")
    text = text.replace("!","!<stop>")
    text = text.replace("<prd>",".")
    sentences = text.split("<stop>")
    sentences = sentences[:-1]
    sentences = [s.strip() for s in sentences]

    #sentences = [x for x in sentences if len(x.split(' '))>5]
    return sentences


In [9]:
# Run finbert on each text and count the nb of neutral, positive, negative sentences

def finbert_texts(start,n_texts, texts):
    labels = {0:'neutral', 1:'positive',2:'negative'}
    data = np.zeros((n_texts,3))

    for j in tqdm(range(start,start + n_texts)):
        
        print(j - start + 1)
        sentences = split_into_sentences(texts[j])
        sentences = texts[j].split('\n \n')
        sentences = [x for x in sentences if len(x.split(' '))>4]
        inputs = tokenizer(sentences, return_tensors="pt", padding=True,truncation=True,max_length=512)


        outputs = finbert(**inputs)[0]
        #print(outputs)
        i = np.argmax(outputs.detach().numpy(), axis = 1)
        #print(i)
        sentiment_counter = list(Counter(i).values())
        if(np.shape(sentiment_counter) == (2, )):
            sentiment_counter.append(0)
        print(sentiment_counter)
        data[j-start,:] = sentiment_counter
        time.sleep(45)
        
        
    return data

In [10]:
# Nombre d'articles
N = 50
start = 200
texts, list_ID = extract_texts(start, N)


li = [1,2]
#print(np.shape(li))

if(np.shape(li) == (2, )):
    li.append(4)
    print(li)
    

In [11]:
 if __name__ == "__main__":
    
    data = finbert_texts(start, N, texts)
    
   

  0%|                                                    | 0/50 [00:00<?, ?it/s]

1
[90, 26, 9]


  2%|▊                                        | 1/50 [01:41<1:22:45, 101.35s/it]

2
[109, 24, 9]


  4%|█▋                                        | 2/50 [03:15<1:17:47, 97.24s/it]

3
[53, 28, 6]


  6%|██▌                                       | 3/50 [04:58<1:18:16, 99.92s/it]

4
[60, 25, 9]


  8%|███▎                                      | 4/50 [06:13<1:08:54, 89.88s/it]

5
[111, 28, 12]


 10%|████▏                                     | 5/50 [07:46<1:08:19, 91.10s/it]

6
[51, 16, 3]


 12%|████▉                                    | 6/50 [09:59<1:17:18, 105.41s/it]

7
[53, 33, 3]


 14%|█████▉                                    | 7/50 [11:13<1:08:04, 94.99s/it]

8
[90, 45, 21]


 16%|██████▋                                   | 8/50 [12:54<1:07:53, 96.98s/it]

9
[25, 28, 2]


 18%|███████▌                                  | 9/50 [14:21<1:04:06, 93.81s/it]

10
[27, 41, 1]


 20%|████████▌                                  | 10/50 [15:30<57:28, 86.22s/it]

11
[26, 17, 2]


 22%|█████████▍                                 | 11/50 [16:31<51:05, 78.60s/it]

12
[27, 25, 2]


 24%|██████████▎                                | 12/50 [17:43<48:21, 76.36s/it]

13
[15, 24, 0]


 26%|███████████▏                               | 13/50 [18:45<44:30, 72.17s/it]

14
[27, 42, 2]


 28%|████████████                               | 14/50 [19:47<41:22, 68.95s/it]

15
[40, 29, 5]


 30%|████████████▉                              | 15/50 [20:53<39:43, 68.10s/it]

16
[37, 27, 1]


 32%|█████████████▊                             | 16/50 [22:05<39:17, 69.34s/it]

17
[31, 24, 2]


 34%|██████████████▌                            | 17/50 [23:18<38:48, 70.55s/it]

18
[35, 18, 0]


 36%|███████████████▍                           | 18/50 [24:26<37:10, 69.72s/it]

19
[31, 11, 2]


 38%|████████████████▎                          | 19/50 [25:58<39:26, 76.33s/it]

20
[24, 31, 2]


 40%|█████████████████▏                         | 20/50 [27:13<38:00, 76.03s/it]

21
[14, 9, 0]


 42%|██████████████████                         | 21/50 [28:29<36:42, 75.94s/it]

22
[41, 45, 0]


 44%|██████████████████▉                        | 22/50 [30:04<38:09, 81.78s/it]

23
[50, 43, 4]


 46%|███████████████████▊                       | 23/50 [31:27<36:55, 82.06s/it]

24
[62, 36, 12]


 48%|████████████████████▋                      | 24/50 [32:52<35:55, 82.91s/it]

25
[55, 32, 1]


 50%|█████████████████████▌                     | 25/50 [34:14<34:24, 82.58s/it]

26
[61, 36, 2]


 52%|█████████████████████▊                    | 26/50 [36:44<41:09, 102.88s/it]

27
[66, 36, 4]


 54%|██████████████████████▋                   | 27/50 [38:59<43:04, 112.39s/it]

28
[30, 35, 6]


 56%|███████████████████████▌                  | 28/50 [40:15<37:17, 101.72s/it]

29
[24, 22, 2]


 58%|████████████████████████▉                  | 29/50 [41:47<34:30, 98.60s/it]

30
[46, 4, 18]


 60%|█████████████████████████▏                | 30/50 [43:46<34:55, 104.76s/it]

31
[54, 28, 9]


 62%|██████████████████████████▋                | 31/50 [45:12<31:25, 99.22s/it]

32
[61, 28, 6]


 64%|███████████████████████████▌               | 32/50 [46:21<27:01, 90.09s/it]

33
[36, 28, 2]


 66%|████████████████████████████▍              | 33/50 [48:14<27:29, 97.00s/it]

34
[52, 32, 4]


 68%|█████████████████████████████▏             | 34/50 [49:39<24:54, 93.41s/it]

35
[76, 13, 2]


 70%|██████████████████████████████             | 35/50 [50:55<22:00, 88.03s/it]

36
[61, 21, 2]


 72%|██████████████████████████████▉            | 36/50 [52:17<20:06, 86.20s/it]

37
[69, 17, 1]


 74%|███████████████████████████████▊           | 37/50 [54:21<21:08, 97.57s/it]

38
[35, 23, 4]


 76%|████████████████████████████████▋          | 38/50 [55:28<17:43, 88.59s/it]

39
[47, 21, 2]


 78%|█████████████████████████████████▌         | 39/50 [57:06<16:44, 91.34s/it]

40
[58, 3, 13]


 80%|██████████████████████████████████▍        | 40/50 [58:32<14:55, 89.58s/it]

41
[29, 5, 16]


 82%|███████████████████████████████████▎       | 41/50 [59:35<12:15, 81.71s/it]

42
[54, 16, 6]


 84%|██████████████████████████████████▍      | 42/50 [1:00:47<10:29, 78.74s/it]

43
[33, 10, 3]


 86%|███████████████████████████████████▎     | 43/50 [1:01:59<08:57, 76.76s/it]

44
[66, 17, 3]


 88%|████████████████████████████████████     | 44/50 [1:03:04<07:19, 73.26s/it]

45
[71, 18, 2]


 90%|████████████████████████████████████▉    | 45/50 [1:04:19<06:08, 73.73s/it]

46
[37, 15, 4]


 92%|█████████████████████████████████████▋   | 46/50 [1:05:24<04:45, 71.29s/it]

47
[32, 13, 1]


 94%|██████████████████████████████████████▌  | 47/50 [1:06:32<03:30, 70.09s/it]

48
[35, 18, 5]


 96%|███████████████████████████████████████▎ | 48/50 [1:07:43<02:20, 70.35s/it]

49
[81, 37, 1]


 98%|████████████████████████████████████████▏| 49/50 [1:09:14<01:16, 76.72s/it]

50
[55, 22, 3]


100%|█████████████████████████████████████████| 50/50 [1:10:42<00:00, 84.86s/it]


In [12]:
# Ratios computations

df_data = pd.DataFrame(data, columns= ["Neutral","Positive", "Negative"])


df_data["Total"] = df_data["Neutral"] + df_data["Positive"] + df_data["Negative"]
df_data["Difference"] = df_data["Positive"] - df_data["Negative"]
df_data["P_N ratio"] = df_data["Positive"] / df_data["Negative"]
df_data["Difference ratio"] = df_data["Difference"] / df_data["Total"]

df_data["Decile"] = 1 + df_data["Difference ratio"].transform(lambda y: pd.qcut(y, 10, labels=False))

df_data = np.round(df_data, 5)


df_data["ID"] = list_ID[start: start + N]
first_column = df_data.pop("ID")
df_data.insert(0, "ID", first_column)

#long = list(df_data.loc[df_data['Decile'].isin([10,9])]["ID"])
#short = list(df_data.loc[df_data['Decile'].isin([1,2])]["ID"])

display(df_data)


Unnamed: 0,ID,Neutral,Positive,Negative,Total,Difference,P_N ratio,Difference ratio,Decile
0,4419494,90.0,26.0,9.0,125.0,17.0,2.88889,0.136,2
1,4358685,109.0,24.0,9.0,142.0,15.0,2.66667,0.10563,1
2,4460641,53.0,28.0,6.0,87.0,22.0,4.66667,0.25287,5
3,4255002,60.0,25.0,9.0,94.0,16.0,2.77778,0.17021,3
4,4480173,111.0,28.0,12.0,151.0,16.0,2.33333,0.10596,1
5,4317441,51.0,16.0,3.0,70.0,13.0,5.33333,0.18571,3
6,4439385,53.0,33.0,3.0,89.0,30.0,11.0,0.33708,7
7,4379483,90.0,45.0,21.0,156.0,24.0,2.14286,0.15385,2
8,4378855,25.0,28.0,2.0,55.0,26.0,14.0,0.47273,9
9,4439214,27.0,41.0,1.0,69.0,40.0,41.0,0.57971,10


In [13]:
df_ratios = read_csv(os.path.abspath("./FinBert_ratios.csv"))
display(df_ratios)
df_ratios = pd.concat([df_ratios, df_data])
df_ratios["Decile"] = 1 + df_ratios["Difference ratio"].transform(lambda y: pd.qcut(y, 10, labels=False))
display(df_ratios)

Unnamed: 0,ID,Neutral,Positive,Negative,Total,Difference,P_N ratio,Difference ratio,Decile
0,4463311,32.0,34.0,4.0,70.0,30.0,8.50000,0.42857,10
1,4320409,54.0,35.0,5.0,94.0,30.0,7.00000,0.31915,8
2,4256968,44.0,5.0,37.0,86.0,-32.0,0.13514,-0.37209,1
3,4442839,25.0,6.0,29.0,60.0,-23.0,0.20690,-0.38333,1
4,4298941,43.0,30.0,6.0,79.0,24.0,5.00000,0.30380,8
...,...,...,...,...,...,...,...,...,...
195,4501965,62.0,13.0,1.0,76.0,12.0,13.00000,0.15789,5
196,4296861,64.0,3.0,21.0,88.0,-18.0,0.14286,-0.20455,1
197,4275572,62.0,11.0,2.0,75.0,9.0,5.50000,0.12000,4
198,4338042,57.0,9.0,1.0,67.0,8.0,9.00000,0.11940,4


Unnamed: 0,ID,Neutral,Positive,Negative,Total,Difference,P_N ratio,Difference ratio,Decile
0,4463311,32.0,34.0,4.0,70.0,30.0,8.50000,0.42857,10
1,4320409,54.0,35.0,5.0,94.0,30.0,7.00000,0.31915,8
2,4256968,44.0,5.0,37.0,86.0,-32.0,0.13514,-0.37209,1
3,4442839,25.0,6.0,29.0,60.0,-23.0,0.20690,-0.38333,1
4,4298941,43.0,30.0,6.0,79.0,24.0,5.00000,0.30380,8
...,...,...,...,...,...,...,...,...,...
45,4327174,37.0,15.0,4.0,56.0,11.0,3.75000,0.19643,5
46,4346324,32.0,13.0,1.0,46.0,12.0,13.00000,0.26087,7
47,4360233,35.0,18.0,5.0,58.0,13.0,3.60000,0.22414,6
48,4504519,81.0,37.0,1.0,119.0,36.0,37.00000,0.30252,8


In [14]:
df_ratios.to_csv('FinBert_ratios.csv', index = False)