# Topic Modeling 
This notebook takes the pre-processed texts as input and uses them to find the most relevant topics and the words that are used for the sentiment analysis.

**Implementation**
- TF-IDF
- FinBERT
- LSA 

In [None]:
import os
import re
import pandas as pd
import numpy as np
import import_ipynb
import pre_processing_final as p
from sklearn.feature_extraction.text import TfidfVectorizer
from finbert_embedding.embedding import FinbertEmbedding

import hdbscan
import gensim
import finbert_embedding
import umap.umap_ as umap
import matplotlib.pyplot as plt

from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.datasets import make_multilabel_classification
from sklearn.feature_extraction.text import TfidfVectorizer

from gensim import corpora
from gensim.corpora.dictionary import Dictionary


### Import and process the texts

In [None]:
# Load the corpus through the pre-processing notebook (imported as `p`).
# `import_texts` returns two values: the texts and, presumably, the matching
# article identifiers (they are stored under the 'article' column further down).
texts, articles = p.import_texts()

In [None]:
# Run every raw text through `p.process_figas`. The meaning of the 'aa'
# argument is defined in pre_processing_final and is not visible from here.
texts_processed = [p.process_figas(t,'aa') for t in texts]

### TF-IDF
https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html

For each text, take the top n words ranked by their TF-IDF score.

In [None]:
def top_n_TFIDF_words(n, texts_processed, vectorizer=None):
    """Return, for every text, its ``n`` highest-scoring TF-IDF words.

    Parameters
    ----------
    n : int
        Maximum number of words to keep per text.
    texts_processed : iterable of str
        The texts to vectorize, one string per document.
    vectorizer : object, optional
        Unfitted vectorizer exposing ``fit_transform`` (returning a SciPy
        sparse matrix) and ``vocabulary_``. Defaults to a fresh
        ``TfidfVectorizer()``.

    Returns
    -------
    list of list of str
        One list per text, ordered by decreasing TF-IDF score. A list is
        shorter than ``n`` when the text has fewer scored words.

    Raises
    ------
    ValueError
        If ``n`` is negative.
    """
    if n < 0:
        raise ValueError("n must be non-negative")
    if vectorizer is None:
        vectorizer = TfidfVectorizer()

    X_tfidf = vectorizer.fit_transform(texts_processed).tocsr()
    print(X_tfidf.shape)

    # Reverse the vocabulary mapping (word -> column) into column -> word.
    index_to_word = {col: word for word, col in vectorizer.vocabulary_.items()}

    top_n = []
    for row in range(X_tfidf.shape[0]):
        # Read the stored entries of this row straight from the CSR buffers
        # instead of doing one sparse lookup per (row, column) pair.
        start, stop = X_tfidf.indptr[row], X_tfidf.indptr[row + 1]
        cols = X_tfidf.indices[start:stop]
        scores = X_tfidf.data[start:stop]
        stored = scores != 0
        cols, scores = cols[stored], scores[stored]

        # Stable sort keeps the storage order among equally scored words.
        best = np.argsort(-scores, kind="stable")[:n]
        top_n.append([index_to_word[int(col)] for col in cols[best]])

    return top_n

In [None]:
# Ten best TF-IDF words for each processed text (one list of words per text).
tfidf_top_n_words = top_n_TFIDF_words(10, texts_processed)

Save the top words found with TF-IDF in a pickle, together with the article ID.

In [None]:
# Persist the per-article TF-IDF keywords. The output folder is created first
# so that writing does not fail when no "data" directory exists yet.
os.makedirs("data", exist_ok=True)
df = pd.DataFrame({'article': articles, 'top_n_words': tfidf_top_n_words})
df.to_pickle("data/top_n_words_tfidf_proc_figass_with_juliette_geneve.pkl")

In [None]:
# Fit a TF-IDF model at notebook level: `X_tfidf` is reused further down as
# the input of the dimensionality reduction, and `vectorizer` for its
# vocabulary.
vectorizer = TfidfVectorizer()
X_tfidf = vectorizer.fit_transform(texts_processed)

### FinBERT 
https://pypi.org/project/finbert-embedding/

In [None]:
# Instantiate the FinBERT embedding model from the finbert_embedding package.
finbert = FinbertEmbedding()

In [None]:
# Embed each text with FinBERT, storing one 768-dimensional vector per row.
# NOTE(review): the raw `texts` are embedded while the matrix is sized from
# `texts_processed`; both have the same length, but confirm that embedding the
# raw text is intended.
X_FinB = np.zeros((len(texts_processed), 768))
for row_idx, text in enumerate(texts):
    X_FinB[row_idx] = finbert.sentence_vector(text)

In [None]:
# Top 10 TF-IDF words of every text, computed from the notebook-level
# `vectorizer` and `X_tfidf`. Written as plain top-level code: an indented
# body ending in `return` is a syntax error in a cell.

# Create dictionary with all the words contained in the TF-IDF matrix
dict_w_index = vectorizer.vocabulary_
dict_index_w = {col: word for word, col in dict_w_index.items()}

n = 10
top_n = []
for i in range(X_tfidf.shape[0]):
    # Columns holding a non-zero score for text i, with the matching words.
    nonzero_cols = X_tfidf[i,].nonzero()[1]
    scored_words = [(dict_index_w[col], X_tfidf[i, col]) for col in nonzero_cols]
    # Highest score first; keep only the n best words.
    scored_words.sort(key=lambda pair: -pair[1])
    top_n.append([word for word, _ in scored_words[:n]])

In [None]:
# Sanity check: display the shape of the FinBERT embedding matrix.
X_FinB.shape

### Dimensionality reduction and clustering before topic extraction

In [None]:
# Reduce the TF-IDF vectors to 15 dimensions with UMAP, then cluster the
# reduced vectors with HDBSCAN.
embeddings = X_tfidf

# A fixed seed makes the embedding, and therefore the clusters, reproducible
# across kernel restarts. `reducer` is the model that is actually fitted.
reducer = umap.UMAP(n_components=15, n_neighbors=15, metric='cosine', random_state=42)
umap_embeddings = reducer.fit_transform(embeddings)

cluster = hdbscan.HDBSCAN(min_cluster_size=5,
                          metric='euclidean',
                          cluster_selection_method='eom').fit(umap_embeddings)

# HDBSCAN labels noise points with -1; that label is not a topic, so it is
# excluded from the count.
n_topics = len(set(cluster.labels_) - {-1})
print('Number of clusters/topics ', n_topics)

In [None]:
# Prepare data: a separate 2-D UMAP projection used only for plotting; the
# colours come from the cluster labels computed earlier. The seed keeps the
# picture identical between runs.
umap_data = umap.UMAP(n_components=2, min_dist=0.0, metric='cosine',
                      random_state=42).fit_transform(embeddings)
result = pd.DataFrame(umap_data, columns=['x', 'y'])
result['labels'] = cluster.labels_

# Visualize clusters: noise points (label -1) in grey, clustered points
# coloured by label.
fig, ax = plt.subplots(figsize=(20, 10))
outliers = result.loc[result.labels == -1, :]
clustered = result.loc[result.labels != -1, :]
ax.scatter(outliers.x, outliers.y, color='#BDBDBD', s=0.5)
clustered_points = ax.scatter(clustered.x, clustered.y, c=clustered.labels, s=5, cmap='hsv_r')

# The colorbar must be added before saving, otherwise it is missing from the
# exported image.
fig.colorbar(clustered_points, ax=ax)
fig.savefig('clusters.png')

In [None]:
# One row per text, with its cluster label and a running document id.
docs_df = (
    pd.DataFrame(texts, columns=["Doc"])
    .assign(Topic=cluster.labels_, Doc_ID=lambda frame: range(len(frame)))
)

# Join all texts that share a label into one space-separated document.
docs_per_topic = docs_df.groupby(['Topic'], as_index=False).agg({'Doc': ' '.join})

In [None]:
def c_tf_idf(documents, m):
    """Compute a class-based TF-IDF matrix.

    Every entry of ``documents`` is treated as one class. Word counts are
    normalised by the class length and weighted by ``log(m / total count of
    the word)``.

    Parameters
    ----------
    documents : iterable of str
        One string per class.
    m : int
        Numerator of the inverse-frequency ratio.

    Returns
    -------
    tf_idf : numpy.ndarray
        Array of shape (n_words, n_documents).
    count : CountVectorizer
        The fitted vectorizer (English stop words removed).
    """
    count = CountVectorizer(stop_words="english").fit(documents)
    term_counts = count.transform(documents).toarray()

    # Term frequency: counts of each document divided by that document's total.
    words_per_doc = term_counts.sum(axis=1)
    tf = term_counts.T / words_per_doc

    # Inverse frequency per word, shaped as a column so that it broadcasts
    # over the documents.
    word_totals = term_counts.sum(axis=0)
    idf = np.log(m / word_totals)[:, np.newaxis]

    return tf * idf, count
  
# Class-based TF-IDF over the per-topic concatenated documents; `m` is the
# number of individual texts.
tf_idf, count = c_tf_idf(docs_per_topic.Doc.values, m=len(texts_processed))

In [None]:
def extract_top_n_words_per_topic(tf_idf, count, docs_per_topic, n=20):
    """Return the ``n`` best-scoring words of every topic.

    Parameters
    ----------
    tf_idf : numpy.ndarray
        Score matrix of shape (n_words, n_topics).
    count : object
        Fitted vectorizer exposing ``get_feature_names_out()`` or, on older
        scikit-learn versions, ``get_feature_names()``.
    docs_per_topic : pandas.DataFrame
        Frame whose ``Topic`` column lists the topic labels in the same order
        as the columns of ``tf_idf``.
    n : int, default 20
        Number of words to keep per topic.

    Returns
    -------
    dict
        Maps each topic label to a list of ``(word, score)`` tuples sorted by
        decreasing score.
    """
    # get_feature_names() was removed in scikit-learn 1.2; prefer the
    # replacement and fall back only when it is unavailable.
    if hasattr(count, "get_feature_names_out"):
        words = list(count.get_feature_names_out())
    else:
        words = count.get_feature_names()

    labels = list(docs_per_topic.Topic)
    scores_by_topic = tf_idf.T

    # Word indices by decreasing score, truncated to the n best. Slicing
    # after the reversal also yields an empty list for n == 0.
    ranked = scores_by_topic.argsort()[:, ::-1][:, :n]

    return {
        label: [(words[j], scores_by_topic[i][j]) for j in ranked[i]]
        for i, label in enumerate(labels)
    }

In [None]:
def extract_topic_sizes(df):
    """Count the documents of every topic.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``Topic`` column and a ``Doc`` column.

    Returns
    -------
    pandas.DataFrame
        Columns ``Topic`` and ``Size`` (number of non-null ``Doc`` values),
        sorted by decreasing ``Size``.
    """
    docs_per_label = df.groupby(['Topic'])['Doc'].count()
    size_table = docs_per_label.reset_index().rename(
        columns={"Topic": "Topic", "Doc": "Size"}
    )
    return size_table.sort_values(by="Size", ascending=False)

In [None]:
# Rank the words of every topic, then keep only the topics that contain at
# least 10 documents.
top_n_words = extract_top_n_words_per_topic(tf_idf, count, docs_per_topic, n=20)
topic_sizes = extract_topic_sizes(docs_df).loc[lambda sizes: sizes.Size >= 10]

In [None]:
# Print the 10 best words of every retained topic, skipping the noise
# label -1. Topics are numbered in the order of `topic_sizes`.
k = 0
for t in topic_sizes.Topic.values:
    if t == -1:
        continue
    k += 1
    top_n_words[t].sort(key=lambda pair: -pair[1])
    print('')
    print('Topic - ', k)
    # Slice rather than index 0..9 so that a topic with fewer than 10 words
    # does not raise an IndexError.
    for word, _score in top_n_words[t][:10]:
        print(word)