# Topic Modeling 
This notebook aims to take as input the texts who have been processed and use it to find the most relevants topics and the words that are relevant for the sentimental analysis.

**Implementation**
- TF-IDF
- FinBERT
- LSA 

In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
import glob
import re
import pandas as pd
import import_ipynb
from pre_processing import processing
from finbert_embedding.embedding import FinbertEmbedding

importing Jupyter notebook from pre_processing.ipynb
['4509509', '4508877', '4509199', '4507655', '4509360', '4508080', '4509536', '4508410', '4508622', '4509394', '4508848', '4508074', '4508428', '4508870', '4509358', '4508884', '4509393', '4508625', '4509367', '4508613', '4508421', '4508879', '4508045', '4509163', '4509507', '4509369', '4507697', '4508284', '4509356', '4509164', '4508042', '4509190', '4508486', '4509302', '4508016', '4508472', '4507637', '4508815', '4508475', '4508647', '4508223', '4509305', '4509137', '4509553', '4508481', '4508812', '4508678', '4507630', '4509108', '4508443', '4508685', '4509333', '4509101', '4508018', '4508824', '4508488', '4509334', '4508682', '4508676', '4508444', '4508020', '4509139', '4508823', '4508840', '4509368', '4508615', '4508285', '4509357', '4508847', '4508249', '4509350', '4508612', '4508878', '4508885', '4509359', '4509530', '4509366', '4509154', '4508086', '4509392', '4508072', '4508416', '4508624', '4509198', '4508876', '4507654', 

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/juliencyrusenyegue/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/juliencyrusenyegue/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/juliencyrusenyegue/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/juliencyrusenyegue/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


  0%|          | 0/841 [00:00<?, ?it/s]

### Import the text and process it 

In [2]:
# Nombre d'articles

N = 20

In [3]:
list_articles = glob.glob("data/earning_call/*")[:N]
texts = []
first_sentence = []
articles = []
for s in list_articles:
    with open(s) as f:
        x = int(re.sub('data/earning_call/','',s))
        articles.append(x)
        t = f.read()
        texts.append(t)
        
       
        
print('Number of articles', len(texts))


Number of articles 20


In [4]:
texts = [processing(x) for x in texts]

### TF-IDF
https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html

In [5]:
vectorizer = TfidfVectorizer()
X_tfidf = vectorizer.fit_transform(texts)
X_tfidf.shape

(20, 6533)

In [6]:
dict_w_index = vectorizer.vocabulary_
dict_index_w = {v: k for k, v in dict_w_index.items()}

Take the top n words depending on the score with the TF-IDF

In [7]:
n =10
top_n = []
for i in range(X_tfidf.shape[0]):
    index = X_tfidf[i,].nonzero()[1]
    words_of_index = [dict_index_w[x] for x in index]
    score_of_index = [X_tfidf[i,x] for x in index]
    x = list(zip(words_of_index,score_of_index))
    x.sort(key=lambda x: -x[1])
    a = [w[0] for w in x[:n]]
    top_n.append(a)

In [8]:
df = pd.DataFrame({'article':articles,'file_path':list_articles,'top_n_words':top_n})
df.to_pickle("data/top_n_words_tfidf.pkl")  

In [9]:
df

Unnamed: 0,article,file_path,top_n_words
0,4509509,data/earning_call/4509509,"[savings, hardware, app, 3pl, cogs, us, issues..."
1,4508877,data/earning_call/4508877,"[oneweb, chris, us, government, henry, caleb, ..."
2,4509199,data/earning_call/4509199,"[patients, us, oxybates, nda, avadel, ag, sodi..."
3,4507655,data/earning_call/4507655,"[customers, erik, aws, eps, capabilities, chad..."
4,4509360,data/earning_call/4509360,"[brooks, notes, clinicians, customers, us, all..."
5,4508080,data/earning_call/4508080,"[ounces, aris, amounted, trevor, haytham, rose..."
6,4509536,data/earning_call/4509536,"[merchants, originations, cupito, consumers, v..."
7,4508410,data/earning_call/4508410,"[us, patients, valves, centers, physicians, tr..."
8,4508622,data/earning_call/4508622,"[sales, increases, heppenstall, constant, impl..."
9,4509394,data/earning_call/4509394,"[events, customers, planners, hotels, operator..."


### FinBERT Julien Cyrus
https://pypi.org/project/finbert-embedding/

In [10]:
import torch
import torch.nn.functional as F
from transformers import BertTokenizer
from transformers import pipeline
from transformers import BertModel
from transformers import BertForSequenceClassification
import os
import re

In [11]:
labels = {0:'neutral', 1:'positive',2:'negative'}
num_labels= len(labels)
vocab = "finance-uncased"
vocab_path = 'analyst_tone/vocab'
pretrained_weights_path = "analyst_tone/pretrained_weights" # this is pre-trained FinBERT weights
fine_tuned_weight_path = "analyst_tone/fine_tuned.pth"      # this is fine-tuned FinBERT weights
max_seq_length=512
device='cuda:1'


In [12]:
headlines_list = list_articles
print(headlines_list)

['data/earning_call/4509509', 'data/earning_call/4508877', 'data/earning_call/4509199', 'data/earning_call/4507655', 'data/earning_call/4509360', 'data/earning_call/4508080', 'data/earning_call/4509536', 'data/earning_call/4508410', 'data/earning_call/4508622', 'data/earning_call/4509394', 'data/earning_call/4508848', 'data/earning_call/4508074', 'data/earning_call/4508428', 'data/earning_call/4508870', 'data/earning_call/4509358', 'data/earning_call/4508884', 'data/earning_call/4509393', 'data/earning_call/4508625', 'data/earning_call/4509367', 'data/earning_call/4508613']


In [13]:
import re
alphabets= "([A-Za-z])"
prefixes = "(Mr|St|Mrs|Ms|Dr)[.]"
suffixes = "(Inc|Ltd|Jr|Sr|Co)"
starters = "(Mr|Mrs|Ms|Dr|He\s|She\s|It\s|They\s|Their\s|Our\s|We\s|But\s|However\s|That\s|This\s|Wherever)"
acronyms = "([A-Z][.][A-Z][.](?:[A-Z][.])?)"
websites = "[.](com|net|org|io|gov)"

def split_into_sentences(text):
    text = " " + text + "  "
    text = text.replace("\n"," ")
    text = re.sub(prefixes,"\\1<prd>",text)
    text = re.sub(websites,"<prd>\\1",text)
    if "Ph.D" in text: text = text.replace("Ph.D.","Ph<prd>D<prd>")
    text = re.sub("\s" + alphabets + "[.] "," \\1<prd> ",text)
    text = re.sub(acronyms+" "+starters,"\\1<stop> \\2",text)
    text = re.sub(alphabets + "[.]" + alphabets + "[.]" + alphabets + "[.]","\\1<prd>\\2<prd>\\3<prd>",text)
    text = re.sub(alphabets + "[.]" + alphabets + "[.]","\\1<prd>\\2<prd>",text)
    text = re.sub(" "+suffixes+"[.] "+starters," \\1<stop> \\2",text)
    text = re.sub(" "+suffixes+"[.]"," \\1<prd>",text)
    text = re.sub(" " + alphabets + "[.]"," \\1<prd>",text)
    if "”" in text: text = text.replace(".”","”.")
    if "\"" in text: text = text.replace(".\"","\".")
    if "!" in text: text = text.replace("!\"","\"!")
    if "?" in text: text = text.replace("?\"","\"?")
    text = text.replace(".",".<stop>")
    text = text.replace("?","?<stop>")
    text = text.replace("!","!<stop>")
    text = text.replace("<prd>",".")
    sentences = text.split("<stop>")
    sentences = sentences[:-1]
    sentences = [s.strip() for s in sentences]
    return sentences

In [14]:
from transformers import BertTokenizer, BertForSequenceClassification
import numpy as np

finbert = BertForSequenceClassification.from_pretrained('yiyanghkust/finbert-tone',num_labels=3)
tokenizer = BertTokenizer.from_pretrained('yiyanghkust/finbert-tone')

sentences = ["there is a shortage of capital, and we need extra financing", 
             "growth is strong and we have plenty of liquidity", 
             "there are doubts about our finances", 
             "profits are flat","everything is fine","everything is worse"]

inputs = tokenizer(sentences, return_tensors="pt", padding=True)
outputs = finbert(**inputs)[0]

labels = {0:'neutral', 1:'positive',2:'negative'}
for idx, sent in enumerate(sentences):
    print(sent, '----', labels[np.argmax(outputs.detach().numpy()[idx])])

there is a shortage of capital, and we need extra financing ---- negative
growth is strong and we have plenty of liquidity ---- positive
there are doubts about our finances ---- negative
profits are flat ---- neutral
everything is fine ---- positive
everything is worse ---- negative


In [None]:
finbert = BertForSequenceClassification.from_pretrained('yiyanghkust/finbert-tone',num_labels=3)
tokenizer = BertTokenizer.from_pretrained('yiyanghkust/finbert-tone')

a = 0

labels = {0:'neutral', 1:'positive',2:'negative'}

# Create a dataframe with nb_positive sentence, neutral and negative sentences
df_sentiment = pd.DataFrame(columns = ["Neutral","Positive", "Negative","Difference","P/N ratio", "Difference ratio",
                                      "Total"])


for s in list_articles:
    with open(s) as f:
        t = f.read()
        x = split_into_sentences(t)
        a = a+1
        print(a)
        inputs = tokenizer(x, return_tensors="pt", padding=True)
        outputs = finbert(**inputs)[0] # Prends bcp de temps. pour run
        
           
        # [0] neutral
        #[1] positive
        # [2] negative
        
        nb_neutral = nb_positive = nb_negative = 0
        
    
        for idx,elem in enumerate(x):
            #print(outputs.detach().numpy()[idx])
            #print(elem, '----', labels[np.argmax(outputs.detach().numpy()[idx])])
            #print(labels[np.argmax(outputs.detach().numpy()[idx])])
            
            sentiment = labels[np.argmax(outputs.detach().numpy()[idx])]
            
            if(sentiment == "neutral"):
                nb_neutral += 1
            elif(sentiment == "positive"):
                nb_positive += 1
            elif(sentiment == "negative"):
                nb_negative += 1
            else :
                pass
        
        total = nb_neutral + nb_positive + nb_negative
        difference = nb_positive - nb_negative
        P_N = nb_positive/nb_negative
        difference_ratio = difference / total
        
        df_sentiment.loc[len(df_sentiment.index)] = [nb_neutral,nb_positive,nb_negative, difference, 
                                                    P_N,difference_ratio, total]
        #display(df_sentiment) 

df_sentiment = df_sentiment.astype({"Neutral": int, "Positive": int, "Negative" : int, "Difference" : int,
                                   "Total": int})

display(df_sentiment) 

1
2
3
4


In [None]:
df_sentiment["Final Sentiment"] = None


for index, row in df_sentiment.iterrows():
    #print(row['c1'], row['c2'])
    
    if row["Difference ratio"] > 0.2 :
        df_sentiment["Final Sentiment"] = "Positive"
    
    elif row["Difference ratio"] < 0.1 :
        df_sentiment["Final Sentiment"] = "Negative"
        
    else :
         df_sentiment["Final Sentiment"] = "Neutral"

            
display(df_sentiment)

In [None]:
df_sentiment["Decile"] = 1 + df_sentiment["Difference ratio"].transform(lambda y: pd.qcut(y, 10, labels=False))
df_sentiment= np.round(df_sentiment,5)
display(df_sentiment)

In [None]:
df_sentiment.to_csv('data_sentiment.csv')
