In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
import re
import pandas as pd
from finbert_embedding.embedding import FinbertEmbedding
from transformers import BertTokenizer, BertForSequenceClassification
import numpy as np
from tqdm import tqdm
import glob
import os.path
from pandas import *
from collections import Counter
import time

In [2]:
current_data_directory = os.path.abspath("./data/sp500")
path_text = current_data_directory + "/text/"
path_to_id_csv = current_data_directory + "/df_100stocks_sp500.csv"


In [3]:
print(current_data_directory)

/Users/juliencyrusenyegue/Documents/GitHub/financial_econometrics_project/FinBert/data/sp500


In [4]:
data = read_csv(path_to_id_csv)
ID_sp500 = data['id'].tolist()

In [5]:
alphabets= "([A-Za-z])"
prefixes = "(Mr|St|Mrs|Ms|Dr)[.]"
suffixes = "(Inc|Ltd|Jr|Sr|Co)"
starters = "(Mr|Mrs|Ms|Dr|He\s|She\s|It\s|They\s|Their\s|Our\s|We\s|But\s|However\s|That\s|This\s|Wherever)"
acronyms = "([A-Z][.][A-Z][.](?:[A-Z][.])?)"
websites = "[.](com|net|org|io|gov)"

In [6]:
finbert = BertForSequenceClassification.from_pretrained('yiyanghkust/finbert-tone',num_labels=3)
tokenizer = BertTokenizer.from_pretrained('yiyanghkust/finbert-tone')

In [7]:
# Extract texts from the directory

def extract_texts(start, N):

    texts = []
    first_sentence = []
    articles = []
    list_ID = []
   
    list_tickers = os.listdir(path_text)
    list_tickers.remove('.DS_Store') # for mac
 
    for ticker in list_tickers:
        
        list_articles = glob.glob(path_text+ticker+"/*")
        
        ID = []
        for s in list_articles:
            
            test_id = int(s.replace(path_text+str(ticker)+"/", ""))
          
            if test_id in ID_sp500:
                
                with open(s) as f:
                    
                    x = re.sub(path_text+str(ticker)+"/","",s)
                    articles.append(x)
                    t = f.read()
                    texts.append(t)
        
                ID.append(test_id)
                  
        list_ID += ID
        
     
    return texts, list_ID


In [8]:
def split_into_sentences(text):
    text = " " + text + "  "
    text = text.replace("\n"," ")
    text = re.sub(prefixes,"\\1<prd>",text)
    text = re.sub(websites,"<prd>\\1",text)
    if "Ph.D" in text: text = text.replace("Ph.D.","Ph<prd>D<prd>")
    text = re.sub("\s" + alphabets + "[.] "," \\1<prd> ",text)
    text = re.sub(acronyms+" "+starters,"\\1<stop> \\2",text)
    text = re.sub(alphabets + "[.]" + alphabets + "[.]" + alphabets + "[.]","\\1<prd>\\2<prd>\\3<prd>",text)
    text = re.sub(alphabets + "[.]" + alphabets + "[.]","\\1<prd>\\2<prd>",text)
    text = re.sub(" "+suffixes+"[.] "+starters," \\1<stop> \\2",text)
    text = re.sub(" "+suffixes+"[.]"," \\1<prd>",text)
    text = re.sub(" " + alphabets + "[.]"," \\1<prd>",text)
    if "”" in text: text = text.replace(".”","”.")
    if "\"" in text: text = text.replace(".\"","\".")
    if "!" in text: text = text.replace("!\"","\"!")
    if "?" in text: text = text.replace("?\"","\"?")
    text = text.replace(".",".<stop>")
    text = text.replace("?","?<stop>")
    text = text.replace("!","!<stop>")
    text = text.replace("<prd>",".")
    sentences = text.split("<stop>")
    sentences = sentences[:-1]
    sentences = [s.strip() for s in sentences]

    #sentences = [x for x in sentences if len(x.split(' '))>5]
    return sentences


In [9]:
# Run finbert on each text and count the nb of neutral, positive, negative sentences

def finbert_texts(start,n_texts, texts):
    labels = {0:'neutral', 1:'positive',2:'negative'}
    data = np.zeros((n_texts,3))

    for j in tqdm(range(start,start + n_texts)):
        
        print(j - start + 1)
        sentences = split_into_sentences(texts[j])
        sentences = texts[j].split('\n \n')
        sentences = [x for x in sentences if len(x.split(' '))>4]
        inputs = tokenizer(sentences, return_tensors="pt", padding=True,truncation=True,max_length=512)


        outputs = finbert(**inputs)[0]
        #print(outputs)
        i = np.argmax(outputs.detach().numpy(), axis = 1)
        #print(i)
        sentiment_counter = list(Counter(i).values())
        if(np.shape(sentiment_counter) == (2, )):
            sentiment_counter.append(0)
        print(sentiment_counter)
        data[j-start,:] = sentiment_counter
        time.sleep(45)
        
        
    return data

## soucis au 72

In [10]:
# Nombre d'articles
N = 25
start = 75
texts, list_ID = extract_texts(start, N)


li = [1,2]
#print(np.shape(li))

if(np.shape(li) == (2, )):
    li.append(4)
    print(li)
    

In [11]:
 if __name__ == "__main__":
    
    data = finbert_texts(start, N, texts)
    
   

  0%|                                                    | 0/25 [00:00<?, ?it/s]

1
[37, 10, 4]


  4%|█▊                                          | 1/25 [01:13<29:35, 73.96s/it]

2
[47, 8, 25]


  8%|███▌                                        | 2/25 [02:55<34:36, 90.29s/it]

3
[28, 4, 12]


 12%|█████▎                                      | 3/25 [04:01<29:02, 79.20s/it]

4
[40, 22, 8]


 16%|███████                                     | 4/25 [05:26<28:33, 81.59s/it]

5
[52, 25, 4]


 20%|████████▊                                   | 5/25 [07:05<29:15, 87.76s/it]

6
[62, 17, 5]


 24%|██████████▎                                | 6/25 [09:38<34:47, 109.88s/it]

7
[80, 17, 23]


 28%|████████████                               | 7/25 [11:26<32:47, 109.33s/it]

8
[43, 10, 21]


 32%|██████████████                              | 8/25 [12:41<27:50, 98.24s/it]

9
[59, 31, 8]


 36%|███████████████▊                            | 9/25 [14:05<25:03, 93.99s/it]

10
[79, 27, 14]


 40%|█████████████████▏                         | 10/25 [15:41<23:39, 94.60s/it]

11
[81, 14, 36]


 44%|██████████████████▉                        | 11/25 [17:19<22:18, 95.59s/it]

12
[65, 27, 22]


 48%|████████████████████▏                     | 12/25 [19:20<22:24, 103.44s/it]

13
[53, 19, 8]


 52%|██████████████████████▎                    | 13/25 [20:52<19:57, 99.82s/it]

14
[33, 25, 2]


 56%|████████████████████████                   | 14/25 [22:13<17:16, 94.25s/it]

15
[43, 28, 6]


 60%|█████████████████████████▏                | 15/25 [24:17<17:10, 103.08s/it]

16
[57, 25, 10]


 64%|██████████████████████████▉               | 16/25 [25:53<15:08, 100.92s/it]

17
[61, 39, 10]


 68%|████████████████████████████▌             | 17/25 [27:45<13:55, 104.41s/it]

18
[88, 22, 4]


 72%|██████████████████████████████▏           | 18/25 [29:27<12:04, 103.46s/it]

19
[112, 60, 6]


 76%|███████████████████████████████▉          | 19/25 [31:49<11:30, 115.12s/it]

20
[94, 23, 5]


 80%|█████████████████████████████████▌        | 20/25 [33:43<09:34, 114.89s/it]

21
[101, 9, 44]


 84%|███████████████████████████████████▎      | 21/25 [35:30<07:29, 112.49s/it]

22
[101, 31, 6]


 88%|████████████████████████████████████▉     | 22/25 [37:23<05:37, 112.50s/it]

23
[71, 28, 3]


 92%|██████████████████████████████████████▋   | 23/25 [38:56<03:33, 106.83s/it]

24
[78, 30, 16]


 96%|████████████████████████████████████████▎ | 24/25 [40:52<01:49, 109.51s/it]

25
[134, 48, 19]


100%|██████████████████████████████████████████| 25/25 [42:34<00:00, 102.18s/it]


In [12]:
# Ratios computations

df_data = pd.DataFrame(data, columns= ["Neutral","Positive", "Negative"])


df_data["Total"] = df_data["Neutral"] + df_data["Positive"] + df_data["Negative"]
df_data["Difference"] = df_data["Positive"] - df_data["Negative"]
df_data["P_N ratio"] = df_data["Positive"] / df_data["Negative"]
df_data["Difference ratio"] = df_data["Difference"] / df_data["Total"]

df_data["Decile"] = 1 + df_data["Difference ratio"].transform(lambda y: pd.qcut(y, 10, labels=False))

df_data = np.round(df_data, 5)


df_data["ID"] = list_ID[start: start + N]
first_column = df_data.pop("ID")
df_data.insert(0, "ID", first_column)

#long = list(df_data.loc[df_data['Decile'].isin([10,9])]["ID"])
#short = list(df_data.loc[df_data['Decile'].isin([1,2])]["ID"])

display(df_data)


Unnamed: 0,ID,Neutral,Positive,Negative,Total,Difference,P_N ratio,Difference ratio,Decile
0,4507323,37.0,10.0,4.0,51.0,6.0,2.5,0.11765,4
1,4346259,47.0,8.0,25.0,80.0,-17.0,0.32,-0.2125,1
2,4484024,28.0,4.0,12.0,44.0,-8.0,0.33333,-0.18182,1
3,4445561,40.0,22.0,8.0,70.0,14.0,2.75,0.2,8
4,4320314,52.0,25.0,4.0,81.0,21.0,6.25,0.25926,9
5,4382659,62.0,17.0,5.0,84.0,12.0,3.4,0.14286,5
6,4300447,80.0,17.0,23.0,120.0,-6.0,0.73913,-0.05,3
7,4465453,43.0,10.0,21.0,74.0,-11.0,0.47619,-0.14865,2
8,4484073,59.0,31.0,8.0,98.0,23.0,3.875,0.23469,8
9,4279647,79.0,27.0,14.0,120.0,13.0,1.92857,0.10833,3


In [13]:
df_ratios = read_csv(os.path.abspath("./FinBert_ratios.csv"))
display(df_ratios)
df_ratios = pd.concat([df_ratios, df_data])
display(df_ratios)

Unnamed: 0,ID,Neutral,Positive,Negative,Total,Difference,P_N ratio,Difference ratio,Decile
0,4463311,32.0,34.0,4.0,70.0,30.0,8.50000,0.42857,10
1,4320409,54.0,35.0,5.0,94.0,30.0,7.00000,0.31915,1
2,4256968,44.0,5.0,37.0,86.0,-32.0,0.13514,-0.37209,5
3,4442839,25.0,6.0,29.0,60.0,-23.0,0.20690,-0.38333,1
4,4298941,43.0,30.0,6.0,79.0,24.0,5.00000,0.30380,10
...,...,...,...,...,...,...,...,...,...
70,4303830,69.0,20.0,8.0,97.0,12.0,2.50000,0.12371,3
71,4467443,33.0,9.0,0.0,42.0,9.0,inf,0.21429,8
72,4322069,57.0,8.0,25.0,90.0,-17.0,0.32000,-0.18889,1
73,4283362,45.0,16.0,3.0,64.0,13.0,5.33333,0.20312,5


Unnamed: 0,ID,Neutral,Positive,Negative,Total,Difference,P_N ratio,Difference ratio,Decile
0,4463311,32.0,34.0,4.0,70.0,30.0,8.50000,0.42857,10
1,4320409,54.0,35.0,5.0,94.0,30.0,7.00000,0.31915,1
2,4256968,44.0,5.0,37.0,86.0,-32.0,0.13514,-0.37209,5
3,4442839,25.0,6.0,29.0,60.0,-23.0,0.20690,-0.38333,1
4,4298941,43.0,30.0,6.0,79.0,24.0,5.00000,0.30380,10
...,...,...,...,...,...,...,...,...,...
20,4257331,101.0,9.0,44.0,154.0,-35.0,0.20455,-0.22727,1
21,4461196,101.0,31.0,6.0,138.0,25.0,5.16667,0.18116,7
22,4380763,71.0,28.0,3.0,102.0,25.0,9.33333,0.24510,8
23,4318656,78.0,30.0,16.0,124.0,14.0,1.87500,0.11290,4


In [14]:
df_ratios.to_csv('FinBert_ratios.csv', index = False)