In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
import re
import pandas as pd
from finbert_embedding.embedding import FinbertEmbedding
from transformers import BertTokenizer, BertForSequenceClassification
import numpy as np
from tqdm import tqdm
import glob
import os.path
from pandas import *
from collections import Counter
import time

In [2]:
# Root of the S&P 500 dataset, resolved to an absolute path from the working directory.
current_data_directory = os.path.abspath("./data/sp500")
# Folder holding the article texts; the trailing slash is required because
# later code builds file paths by plain string concatenation.
path_text = f"{current_data_directory}/text/"
# CSV file whose 'id' column is read further down.
path_to_id_csv = f"{current_data_directory}/df_100stocks_sp500.csv"


In [3]:
# Print the absolute dataset root computed above, as a quick sanity check.
print(current_data_directory)

/Users/juliencyrusenyegue/Documents/GitHub/financial_econometrics_project/FinBert/data/sp500


In [4]:
# Read the CSV at path_to_id_csv and keep the values of its 'id' column as a plain list.
# pd.read_csv is called through the explicit `pd` alias so this cell does not
# depend on the wildcard `from pandas import *`.
data = pd.read_csv(path_to_id_csv)
ID_sp500 = data['id'].tolist()

In [5]:
# Regex fragments combined inside split_into_sentences.
# They are raw strings so that backslash sequences such as \s reach the regex
# engine untouched and do not trigger Python's invalid-escape-sequence warning.
alphabets = r"([A-Za-z])"
prefixes = r"(Mr|St|Mrs|Ms|Dr)[.]"
suffixes = r"(Inc|Ltd|Jr|Sr|Co)"
starters = r"(Mr|Mrs|Ms|Dr|He\s|She\s|It\s|They\s|Their\s|Our\s|We\s|But\s|However\s|That\s|This\s|Wherever)"
acronyms = r"([A-Z][.][A-Z][.](?:[A-Z][.])?)"
websites = r"[.](com|net|org|io|gov)"

In [6]:
# Checkpoint identifier shared by the tokenizer and the classifier.
FINBERT_CHECKPOINT = 'yiyanghkust/finbert-tone'

# Tokenizer and 3-label sequence classifier, both loaded from the same checkpoint.
tokenizer = BertTokenizer.from_pretrained(FINBERT_CHECKPOINT)
finbert = BertForSequenceClassification.from_pretrained(FINBERT_CHECKPOINT, num_labels=3)

In [7]:
# Extract texts from the directory

def extract_texts(start, N):
    """Load every article under ``path_text`` whose numeric ID is listed in ``ID_sp500``.

    The directory is walked as ``<path_text>/<ticker>/<file>``; each file name is
    parsed as an integer article ID.

    Parameters
    ----------
    start, N
        Not used by this function. They are kept so that existing calls of the
        form ``extract_texts(start, N)`` keep working; all matching articles are
        loaded and the caller selects the slice it needs.

    Returns
    -------
    texts : list of str
        Content of each selected article file.
    list_ID : list of int
        IDs of the selected articles, aligned index-for-index with ``texts``.
    """
    # Set lookup is O(1); the original scanned the ID list once per file.
    wanted_ids = set(ID_sp500)

    texts = []
    list_ID = []

    # NOTE: entries are visited in the order os.listdir / glob return them
    # (unchanged from the original), so indices into the result depend on it.
    for ticker in os.listdir(path_text):
        if ticker == '.DS_Store':
            # macOS folder metadata; skipped only when present so that the
            # function also runs on machines where the file does not exist.
            continue

        for article_path in glob.glob(path_text + ticker + "/*"):
            # The file name itself is the article ID.
            try:
                article_id = int(os.path.basename(article_path))
            except ValueError:
                # Not an article file (non-numeric name): ignore it.
                continue

            if article_id not in wanted_ids:
                continue

            with open(article_path) as f:
                texts.append(f.read())
            list_ID.append(article_id)

    return texts, list_ID


In [8]:
def split_into_sentences(text):
    """Split ``text`` into a list of sentences using regex heuristics.

    Periods that do not end a sentence (titles, single-letter initials,
    acronyms, company suffixes, website domains, "Ph.D.") are temporarily
    replaced by the placeholder ``<prd>``. Every remaining ``.``, ``?`` or ``!``
    is then tagged with ``<stop>`` and the text is split on that tag. The
    substitutions depend on each other, so their order must not be changed.

    Relies on the module-level patterns ``alphabets``, ``prefixes``,
    ``suffixes``, ``starters``, ``acronyms`` and ``websites``.

    Parameters
    ----------
    text : str
        Raw text; newlines are treated as spaces.

    Returns
    -------
    list of str
        Whitespace-stripped sentences in order of appearance. Trailing text
        without closing punctuation is returned as a final sentence.
    """
    # Padding so that patterns anchored on a leading/trailing space also match
    # at the very start and end of the text.
    text = " " + text + "  "
    text = text.replace("\n", " ")

    # --- Protect periods that are not sentence boundaries ---
    text = re.sub(prefixes, "\\1<prd>", text)
    text = re.sub(websites, "<prd>\\1", text)
    if "Ph.D" in text:
        text = text.replace("Ph.D.", "Ph<prd>D<prd>")
    # Raw string: "\s" in a normal string literal is an invalid escape sequence.
    text = re.sub(r"\s" + alphabets + "[.] ", " \\1<prd> ", text)
    # An acronym followed by a typical sentence starter does end a sentence.
    text = re.sub(acronyms + " " + starters, "\\1<stop> \\2", text)
    text = re.sub(alphabets + "[.]" + alphabets + "[.]" + alphabets + "[.]", "\\1<prd>\\2<prd>\\3<prd>", text)
    text = re.sub(alphabets + "[.]" + alphabets + "[.]", "\\1<prd>\\2<prd>", text)
    text = re.sub(" " + suffixes + "[.] " + starters, " \\1<stop> \\2", text)
    text = re.sub(" " + suffixes + "[.]", " \\1<prd>", text)
    text = re.sub(" " + alphabets + "[.]", " \\1<prd>", text)

    # --- Move closing quotes in front of the terminator so they stay attached ---
    if "”" in text:
        text = text.replace(".”", "”.")
    if "\"" in text:
        text = text.replace(".\"", "\".")
    if "!" in text:
        text = text.replace("!\"", "\"!")
    if "?" in text:
        text = text.replace("?\"", "\"?")

    # --- Tag real boundaries, restore protected periods, split ---
    text = text.replace(".", ".<stop>")
    text = text.replace("?", "?<stop>")
    text = text.replace("!", "!<stop>")
    text = text.replace("<prd>", ".")
    sentences = [s.strip() for s in text.split("<stop>")]

    # The piece after the last terminator is normally just padding. Drop it only
    # when it is empty; the original dropped it unconditionally, which discarded
    # a final sentence lacking closing punctuation.
    if sentences and not sentences[-1]:
        sentences = sentences[:-1]

    return sentences


In [9]:
# Run finbert on each text and count the nb of neutral, positive, negative sentences

def finbert_texts(start, n_texts, texts, pause_seconds=45):
    """Classify the chunks of each article with FinBERT and count predictions per label.

    Each article ``texts[j]`` for ``j`` in ``[start, start + n_texts)`` is split
    on the separator ``'\\n \\n'``; chunks with four or fewer space-separated
    tokens are discarded. The remaining chunks are classified in one batch by
    the module-level ``finbert`` model using the module-level ``tokenizer``.

    Parameters
    ----------
    start : int
        Index in ``texts`` of the first article to process.
    n_texts : int
        Number of consecutive articles to process.
    texts : list of str
        Article contents.
    pause_seconds : float, optional
        Seconds to sleep after each article. Defaults to 45, the value that was
        previously hard-coded; pass 0 to disable the pause.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(n_texts, 3)``. Row ``k`` holds the number of chunks of
        article ``start + k`` predicted as label 0 (neutral), label 1 (positive)
        and label 2 (negative), in that column order. An article with no usable
        chunk yields a row of zeros.
    """
    import torch  # local import: needed only for no_grad; the model itself is a PyTorch module

    n_labels = 3  # label ids: 0 = neutral, 1 = positive, 2 = negative
    data = np.zeros((n_texts, n_labels))

    for j in tqdm(range(start, start + n_texts)):
        chunks = [c for c in texts[j].split('\n \n') if len(c.split(' ')) > 4]

        if chunks:
            inputs = tokenizer(chunks, return_tensors="pt", padding=True, truncation=True, max_length=512)
            # Inference only: skip gradient bookkeeping to save memory and time.
            with torch.no_grad():
                logits = finbert(**inputs)[0]
            predicted = np.argmax(logits.detach().numpy(), axis=1)
            # bincount indexes counts by label id and pads missing labels with 0.
            # Counter(...).values() is ordered by first appearance instead, which
            # put counts in the wrong columns and broke when a label was absent.
            sentiment_counter = np.bincount(predicted, minlength=n_labels)
        else:
            # Nothing to classify (the tokenizer cannot take an empty batch).
            sentiment_counter = np.zeros(n_labels, dtype=int)

        print(sentiment_counter.tolist())
        data[j - start, :] = sentiment_counter

        if pause_seconds:
            time.sleep(pause_seconds)

    return data

In [10]:
# Number of articles to process in this batch (N) and index of the first one (start).
N, start = 50, 250

# Scratch check: a flat two-element list has NumPy shape (2,), so the branch runs.
li = [1, 2]
if np.shape(li) == (2,):
    li.append(4)
    print(li)
    

In [None]:
 if __name__ == "__main__":
    
    # extract_texts returns the selected article contents and their IDs;
    # finbert_texts then processes texts[start : start + N] and returns one
    # row of per-label counts for each of those articles.
    texts, list_ID = extract_texts(start, N)
    data = finbert_texts(start, N, texts)
    
   

  0%|                                                    | 0/50 [00:00<?, ?it/s]

1
[79, 37, 9]


  2%|▊                                         | 1/50 [01:23<1:07:58, 83.24s/it]

2
[80, 28, 3]


  4%|█▋                                       | 2/50 [03:16<1:20:30, 100.64s/it]

3
[85, 17, 2]


  6%|██▌                                       | 3/50 [04:41<1:13:23, 93.69s/it]

4
[61, 36, 2]


  8%|███▎                                      | 4/50 [06:11<1:10:48, 92.35s/it]

5
[62, 8, 6]


 10%|████                                     | 5/50 [08:35<1:23:14, 110.99s/it]

6
[70, 31, 8]


 12%|████▉                                    | 6/50 [10:05<1:16:06, 103.78s/it]

7
[55, 5, 28]


 14%|█████▋                                   | 7/50 [11:42<1:12:46, 101.55s/it]

8
[67, 4, 23]


 16%|██████▌                                  | 8/50 [13:38<1:14:21, 106.23s/it]

9
[54, 7, 26]


 18%|███████▍                                 | 9/50 [15:15<1:10:32, 103.23s/it]

10
[75, 2, 25]


 20%|████████                                | 10/50 [16:54<1:07:52, 101.82s/it]

11
[24, 18, 4]


 22%|█████████                                | 11/50 [18:11<1:01:15, 94.24s/it]

12
[28, 5, 24]


 24%|██████████▎                                | 12/50 [19:30<56:52, 89.81s/it]

13
[38, 17, 1]


 26%|██████████▍                             | 13/50 [21:50<1:04:37, 104.81s/it]

14
[32, 18, 7]


 28%|████████████                               | 14/50 [23:06<57:42, 96.17s/it]

15
[48, 57, 10]


 30%|████████████▉                              | 15/50 [24:54<58:16, 99.89s/it]

16
[40, 15, 6]


 32%|█████████████▊                             | 16/50 [26:04<51:24, 90.73s/it]

17
[30, 29, 5]


 34%|██████████████▌                            | 17/50 [27:19<47:23, 86.17s/it]

18
[42, 28, 7]


 36%|███████████████▍                           | 18/50 [29:16<50:52, 95.39s/it]

19
[31, 30, 2]


 38%|████████████████▎                          | 19/50 [30:41<47:35, 92.12s/it]

20
[29, 17, 3]


 40%|█████████████████▏                         | 20/50 [32:35<49:19, 98.66s/it]

21
[31, 34, 3]


 42%|██████████████████                         | 21/50 [33:47<43:56, 90.90s/it]

22
[37, 34, 2]


 44%|██████████████████▉                        | 22/50 [35:01<39:55, 85.54s/it]

23
[36, 42, 19]


 46%|███████████████████▊                       | 23/50 [36:23<38:04, 84.61s/it]

24
[84, 10, 19]


 48%|████████████████████▋                      | 24/50 [37:55<37:36, 86.80s/it]

25
[67, 10, 22]


 50%|█████████████████████▌                     | 25/50 [39:10<34:45, 83.41s/it]

26
[77, 15, 5]


 52%|██████████████████████▎                    | 26/50 [41:17<38:35, 96.49s/it]

27
[98, 24, 12]


 54%|███████████████████████▏                   | 27/50 [42:50<36:32, 95.31s/it]

28
[50, 15, 2]


 56%|████████████████████████                   | 28/50 [44:15<33:52, 92.37s/it]

29
[97, 24, 8]


 58%|████████████████████████▉                  | 29/50 [45:48<32:21, 92.47s/it]

30
[64, 8, 20]


 60%|█████████████████████████▏                | 30/50 [48:49<39:38, 118.93s/it]

31
[82, 4, 16]


 62%|██████████████████████████                | 31/50 [50:26<35:33, 112.29s/it]

32
[50, 9, 11]


 64%|██████████████████████████▉               | 32/50 [52:11<33:01, 110.10s/it]

33
[87, 9, 20]


 66%|███████████████████████████▋              | 33/50 [53:51<30:23, 107.29s/it]

34
[63, 21, 8]


 68%|████████████████████████████▌             | 34/50 [55:27<27:41, 103.84s/it]

35
[103, 6, 30]


 70%|█████████████████████████████▍            | 35/50 [57:04<25:28, 101.89s/it]

36
[70, 19, 8]


 72%|██████████████████████████████▏           | 36/50 [59:09<25:20, 108.64s/it]

37
[39, 50, 3]


 74%|██████████████████████████████▎          | 37/50 [1:00:21<21:11, 97.80s/it]

38
[24, 31, 1]


 76%|███████████████████████████████▏         | 38/50 [1:01:51<19:05, 95.47s/it]

39
[41, 60, 2]


 78%|███████████████████████████████▉         | 39/50 [1:03:40<18:13, 99.39s/it]

40
[41, 42, 8]


 80%|████████████████████████████████▊        | 40/50 [1:05:01<15:39, 93.92s/it]

41
[36, 51, 0]


 82%|█████████████████████████████████▌       | 41/50 [1:06:19<13:23, 89.26s/it]

42
[46, 35, 7]


 84%|██████████████████████████████████▍      | 42/50 [1:07:57<12:13, 91.74s/it]

43
[39, 35, 8]


 86%|███████████████████████████████████▎     | 43/50 [1:09:26<10:36, 90.96s/it]

44
[40, 48, 1]


In [None]:
# Ratios computations

# One row per article of the batch; the three columns follow the column order of `data`.
df_data = pd.DataFrame(data, columns=["Neutral", "Positive", "Negative"])

df_data["Total"] = df_data["Neutral"] + df_data["Positive"] + df_data["Negative"]
df_data["Difference"] = df_data["Positive"] - df_data["Negative"]
# Float division: a zero "Negative" count gives inf (or NaN for 0/0), not an error.
df_data["P_N ratio"] = df_data["Positive"] / df_data["Negative"]
df_data["Difference ratio"] = df_data["Difference"] / df_data["Total"]

# Decile (1..10) of the difference ratio within this batch. qcut is applied to the
# column directly; wrapping it in Series.transform(lambda ...) gave the same
# result only through transform's fallback after an element-wise attempt failed.
df_data["Decile"] = 1 + pd.qcut(df_data["Difference ratio"], 10, labels=False)

df_data = np.round(df_data, 5)

# Article IDs of this batch go in the first column (added after rounding).
df_data.insert(0, "ID", list_ID[start: start + N])

# Kept for reference (not executed):
#long = list(df_data.loc[df_data['Decile'].isin([10,9])]["ID"])
#short = list(df_data.loc[df_data['Decile'].isin([1,2])]["ID"])

display(df_data)


In [None]:
# Merge the current batch (df_data) into the cumulative results file.
# On the first batch the file does not exist yet; the batch alone then becomes
# the cumulative table, so this cell no longer has to be skipped manually.
ratios_csv = os.path.abspath("./FinBert_ratios.csv")

if os.path.exists(ratios_csv):
    df_previous = pd.read_csv(ratios_csv)
    display(df_previous)
    # ignore_index gives the combined table a fresh 0..n-1 index instead of
    # repeating each batch's own row labels.
    df_ratios = pd.concat([df_previous, df_data], ignore_index=True)
else:
    df_ratios = df_data.copy()

# Deciles are recomputed over all accumulated rows, not just the new batch.
df_ratios["Decile"] = 1 + pd.qcut(df_ratios["Difference ratio"], 10, labels=False)
display(df_ratios)

In [None]:
# Write the cumulative table to FinBert_ratios.csv (path relative to the working
# directory), without the index column.
df_ratios.to_csv('FinBert_ratios.csv', index = False)