In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
import re
import pandas as pd
from finbert_embedding.embedding import FinbertEmbedding
from transformers import BertTokenizer, BertForSequenceClassification
import numpy as np
from tqdm import tqdm
import glob
from collections import Counter

In [2]:
alphabets= "([A-Za-z])"
prefixes = "(Mr|St|Mrs|Ms|Dr)[.]"
suffixes = "(Inc|Ltd|Jr|Sr|Co)"
starters = "(Mr|Mrs|Ms|Dr|He\s|She\s|It\s|They\s|Their\s|Our\s|We\s|But\s|However\s|That\s|This\s|Wherever)"
acronyms = "([A-Z][.][A-Z][.](?:[A-Z][.])?)"
websites = "[.](com|net|org|io|gov)"

In [3]:
finbert = BertForSequenceClassification.from_pretrained('yiyanghkust/finbert-tone',num_labels=3)
tokenizer = BertTokenizer.from_pretrained('yiyanghkust/finbert-tone')

In [4]:
def extract_texts(start, N):
    list_articles = glob.glob("data/earning_call/*")[start:start + N]
    texts = []
    first_sentence = []
    articles = []
    ID = []
    for s in list_articles:
        with open(s) as f:
            
            x = int(re.sub('data/earning_call/','',s))
            articles.append(x)
            t = f.read()
            texts.append(t)
    ID = [s.replace("data/earning_call/", "") for s in list_articles]
    
    return texts, ID


In [5]:
def split_into_sentences(text):
    text = " " + text + "  "
    text = text.replace("\n"," ")
    text = re.sub(prefixes,"\\1<prd>",text)
    text = re.sub(websites,"<prd>\\1",text)
    if "Ph.D" in text: text = text.replace("Ph.D.","Ph<prd>D<prd>")
    text = re.sub("\s" + alphabets + "[.] "," \\1<prd> ",text)
    text = re.sub(acronyms+" "+starters,"\\1<stop> \\2",text)
    text = re.sub(alphabets + "[.]" + alphabets + "[.]" + alphabets + "[.]","\\1<prd>\\2<prd>\\3<prd>",text)
    text = re.sub(alphabets + "[.]" + alphabets + "[.]","\\1<prd>\\2<prd>",text)
    text = re.sub(" "+suffixes+"[.] "+starters," \\1<stop> \\2",text)
    text = re.sub(" "+suffixes+"[.]"," \\1<prd>",text)
    text = re.sub(" " + alphabets + "[.]"," \\1<prd>",text)
    if "”" in text: text = text.replace(".”","”.")
    if "\"" in text: text = text.replace(".\"","\".")
    if "!" in text: text = text.replace("!\"","\"!")
    if "?" in text: text = text.replace("?\"","\"?")
    text = text.replace(".",".<stop>")
    text = text.replace("?","?<stop>")
    text = text.replace("!","!<stop>")
    text = text.replace("<prd>",".")
    sentences = text.split("<stop>")
    sentences = sentences[:-1]
    sentences = [s.strip() for s in sentences]

    #sentences = [x for x in sentences if len(x.split(' '))>5]
    return sentences


In [6]:
def finbert_texts(n_texts, texts):
    labels = {0:'neutral', 1:'positive',2:'negative'}
    data = np.zeros((n_texts,3))

    for j in tqdm(range(n_texts)):
        print(j)
        #x = split_into_sentences(texts[j])
        sentences = texts[j].split('\n \n')
        sentences = [x for x in sentences if len(x.split(' '))>4]
        #print(len(sentences))
        inputs = tokenizer(sentences, return_tensors="pt", padding=True)


        outputs = finbert(**inputs)[0]
        i = np.argmax(outputs.detach().numpy(), axis = 1)
        sentiment_counter = list(Counter(i).values())
        data[j,:] = sentiment_counter
        
        
    return data

In [7]:
# Nombre d'articles
N = 10
start = 0
texts, ID = extract_texts(start, N)



  #  df_sentiment.at[index,"ID"] = list_articles[index]
   # df_sentiment.at[index,"ID"] = df_sentiment["ID"].str.replace("data/earning_call/", "")
#ID = 

In [8]:
if __name__ == "__main__":
    #print(repr(texts[0]))
    data = finbert_texts(N, texts)
    print(data)
   

  0%|                                                  | 0/10 [00:00<?, ?it/s]

0


 10%|████▏                                     | 1/10 [00:48<07:16, 48.54s/it]

1


 20%|████████▍                                 | 2/10 [02:24<10:10, 76.31s/it]

2


 30%|████████████▌                             | 3/10 [03:23<07:59, 68.55s/it]

3


 40%|████████████████▊                         | 4/10 [05:37<09:26, 94.49s/it]

4


 50%|█████████████████████                     | 5/10 [06:18<06:15, 75.08s/it]

5


 60%|█████████████████████████▏                | 6/10 [06:48<03:59, 59.90s/it]

6


 70%|█████████████████████████████▍            | 7/10 [07:13<02:24, 48.20s/it]

7


 80%|█████████████████████████████████▌        | 8/10 [07:50<01:29, 44.75s/it]

8


 90%|█████████████████████████████████████▊    | 9/10 [08:20<00:40, 40.21s/it]

9


100%|█████████████████████████████████████████| 10/10 [09:53<00:00, 59.35s/it]


[[63. 35. 12.]
 [56. 27.  3.]
 [69. 30.  1.]
 [48. 54. 13.]
 [41. 23.  4.]
 [39. 24.  8.]
 [31. 28.  4.]
 [54. 38. 18.]
 [19. 20.  5.]
 [65. 56.  7.]]


In [18]:
df_data = pd.DataFrame(data, columns= ["Neutral","Positive", "Negative"])


df_data["Total"] = df_data["Neutral"] + df_data["Positive"] + df_data["Negative"]
df_data["Difference"] = df_data["Positive"] - df_data["Negative"]
df_data["P_N ratio"] = df_data["Positive"] / df_data["Negative"]
df_data["Difference ratio"] = df_data["Difference"] / df_data["Total"]

df_data["Decile"] = 1 + df_data["Difference ratio"].transform(lambda y: pd.qcut(y, 10, labels=False))

df_data = np.round(df_data, 5)

#total = nb_neutral + nb_positive + nb_negative
#difference = nb_positive - nb_negative
#P_N = nb_positive/nb_negative
#difference_ratio = difference / total
#df_data["ID"] = texts
#df_data["ID"] = 

df_data["ID"] = ID
first_column = df_data.pop("ID")
df_data.insert(0, "ID", first_column)

long = list(df_data.loc[df_data['Decile'].isin([10,9])]["ID"])
short = list(df_data.loc[df_data['Decile'].isin([1,2])]["ID"])

display(df_data)


#display(short)
#short

Unnamed: 0,ID,Neutral,Positive,Negative,Total,Difference,P_N ratio,Difference ratio,Decile
0,4509509,63.0,35.0,12.0,110.0,23.0,2.91667,0.20909,2
1,4508877,56.0,27.0,3.0,86.0,24.0,9.0,0.27907,4
2,4509199,69.0,30.0,1.0,100.0,29.0,30.0,0.29,6
3,4507655,48.0,54.0,13.0,115.0,41.0,4.15385,0.35652,8
4,4509360,41.0,23.0,4.0,68.0,19.0,5.75,0.27941,5
5,4508080,39.0,24.0,8.0,71.0,16.0,3.0,0.22535,3
6,4509536,31.0,28.0,4.0,63.0,24.0,7.0,0.38095,9
7,4508410,54.0,38.0,18.0,110.0,20.0,2.11111,0.18182,1
8,4508622,19.0,20.0,5.0,44.0,15.0,4.0,0.34091,7
9,4509394,65.0,56.0,7.0,128.0,49.0,8.0,0.38281,10


In [19]:
display(short)
display(long)

['4509509', '4508410']

['4509536', '4509394']