# Topic Modeling 
This notebook takes the pre-processed texts as input and uses them to find the most relevant topics and the words that are relevant for sentiment analysis.

**Implementation**
- TF-IDF
- FinBERT
- LSA 

In [None]:
import glob
import os
import re

# import_ipynb must be imported before any notebook-backed module (pre_processing).
import import_ipynb
import numpy as np
import pandas as pd
from finbert_embedding.embedding import FinbertEmbedding
from sklearn.feature_extraction.text import TfidfVectorizer

from pre_processing import processing

### Import the text and process it 

In [None]:
# Load every file found under data/earning_call/.
# Each file name is parsed with int() and used as the article id, so a
# non-numeric file name raises ValueError here.
# glob returns paths in arbitrary order; sort them so that reruns produce the
# same row order.
list_articles = sorted(glob.glob("data/earning_call/*"))
texts = []
first_sentence = []  # not filled in this cell; kept in case cells outside this view use it
articles = []
for path in list_articles:
    # basename() strips the directory regardless of the platform's separator,
    # unlike a regex substitution on a hard-coded prefix.
    article_id = int(os.path.basename(path))
    # NOTE(review): assumes the transcripts are UTF-8 encoded - confirm.
    with open(path, encoding="utf-8") as f:
        text = f.read()
    articles.append(article_id)
    texts.append(text)

print('Number of articles', len(texts))

In [None]:
# Apply the imported `processing` function to every loaded text.
# `texts` is overwritten, so re-running this cell processes the texts again.
texts = list(map(processing, texts))

### TF-IDF
https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html

In [None]:
# Fit a TF-IDF vectorizer (scikit-learn default settings) on the processed
# texts and transform them into a document-term matrix in a single step.
vectorizer = TfidfVectorizer()
X_tfidf = vectorizer.fit_transform(texts)
# Show the matrix dimensions.
X_tfidf.shape

In [None]:
# Vocabulary learned by the vectorizer, kept as-is (word -> index) and also
# inverted (index -> word) so that indices can be turned back into words.
dict_w_index = vectorizer.vocabulary_
dict_index_w = {index: word for word, index in dict_w_index.items()}

Take the top n words of each document, ranked by their TF-IDF score.

In [None]:
# For each document, keep the n words with the highest TF-IDF score.
n = 10
top_n = []

# Read the CSR arrays directly: for row i the stored column indices and values
# live in indices[indptr[i]:indptr[i + 1]] and data[indptr[i]:indptr[i + 1]].
# This avoids one sparse-matrix lookup per non-zero entry.
X_csr = X_tfidf.tocsr()
for i in range(X_csr.shape[0]):
    start, end = X_csr.indptr[i], X_csr.indptr[i + 1]
    scored_words = [
        (dict_index_w[col], score)
        for col, score in zip(X_csr.indices[start:end], X_csr.data[start:end])
        if score != 0  # skip explicitly stored zeros, as nonzero() does
    ]
    # Stable sort by descending score: ties keep their storage order.
    scored_words.sort(key=lambda pair: -pair[1])
    top_n.append([word for word, _score in scored_words[:n]])

In [None]:
# Gather the article ids, their file paths and their top words into one
# table (one row per article) and persist it as a pickle file.
df = pd.DataFrame(
    {
        'article': articles,
        'file_path': list_articles,
        'top_n_words': top_n,
    }
)
df.to_pickle("data/top_n_words_tfidf.pkl")

In [None]:
# Preview the first rows only instead of dumping the whole table in the output.
df.head()

### FinBERT 
https://pypi.org/project/finbert-embedding/

In [None]:
# Create the FinbertEmbedding object used below to embed each text.
finbert = FinbertEmbedding()

In [None]:
# Build one embedding row per text with FinBERT's sentence_vector().
# NOTE(review): 768 is the row width the original cell allocated; presumably
# the size of the vector returned by sentence_vector() - confirm against the
# finbert-embedding documentation.
FINBERT_DIM = 768
X_FinB = np.zeros((len(texts), FINBERT_DIM))
for row, text in enumerate(texts):
    X_FinB[row] = finbert.sentence_vector(text)

In [None]:
# Show the dimensions of the FinBERT embedding matrix.
X_FinB.shape