In [None]:
import glob
import pandas as pd
import numpy as np
import re


import collections, itertools
import matplotlib.pyplot as plt
import nltk
import hdbscan
import gensim

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.util import ngrams
from nltk.probability import FreqDist



from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.datasets import make_multilabel_classification
from sklearn.feature_extraction.text import TfidfVectorizer

from gensim import corpora
from gensim.corpora.dictionary import Dictionary

# Fetch the NLTK data packages that the preprocessing cells below rely on.
nltk.download('stopwords')  # word lists read through stopwords.words('english')
nltk.download('punkt')      # presumably the tokenizer models behind word_tokenize — confirm for the installed NLTK version
nltk.download('wordnet')    # presumably the lexical database behind WordNetLemmatizer — confirm
nltk.download('omw-1.4')    # presumably a companion resource needed by wordnet in recent NLTK releases — confirm

### Import the text and process it 

In [None]:
# Sort the paths: glob returns files in arbitrary, OS-dependent order, and the
# position of each document in `texts` is later aligned with cluster labels.
list_articles = sorted(glob.glob("data/earning_call/*"))
texts = []
first_sentence = []
articles = []
for path in list_articles:
    # The article id is the file name itself. Take the last path component
    # instead of stripping a hard-coded prefix, so both '/' and '\\' work.
    article_id = int(re.split(r'[\\/]', path)[-1])
    with open(path) as f:
        article_text = f.read()
    # Append both lists together so `articles[i]` always matches `texts[i]`.
    articles.append(article_id)
    texts.append(article_text)
print('Number of articles', len(texts))


## Processing

In [None]:
# Helper functions: join a list of words into one space-separated string, and split a string back into a list of tokens
def list_to_text(list_input, stops=()):
    """
        Join words into one space-separated string, skipping excluded words.

        Args:
            list_input (iterable of str): the words to join, in order.
            stops (iterable): words to leave out. Defaults to nothing excluded.

        Returns:
            str: the kept words joined by single spaces ('' when nothing is kept).
    """
    # A plain string keeps its original substring-membership semantics.
    # Every other iterable is materialised once as a frozenset, so membership
    # tests are O(1) and a one-shot iterator is not consumed after its first hit.
    excluded = stops
    if not isinstance(stops, (str, bytes)):
        try:
            excluded = frozenset(stops)
        except TypeError:
            # Unhashable entries: fall back to the container as given.
            excluded = stops
    return ' '.join(word for word in list_input if word not in excluded)

def text_to_list(text_input):
    """Split a string into a list of tokens using NLTK's ``word_tokenize``."""
    return word_tokenize(text_input)

In [None]:
# NOTE(review): `text` starts out empty here, so both counts printed below are 0.
# This looks like a placeholder for a real document — confirm what should be loaded.
text = ''

# Punctuation: replace every run of non-alphanumeric characters with one space,
# then lower-case the whole text
text = re.sub('[^A-Za-z0-9]+', ' ', text).lower()

# Remove stopwords
stop_words = set(stopwords.words('english'))
word_tokens = word_tokenize(text)
filtered_text = []
for w in word_tokens:
    if w.lower() not in stop_words:
        filtered_text.append(w)

print('Number of words before removing the stop words',len(word_tokens))
print('Number of words after removing the stop words',len(filtered_text))

In [None]:
# N-Gram
def ngrams_list(n):
    """
        Build the n-grams of the module-level ``text``.

        Args:
            n (int): the number of consecutive words assembled in each n-gram.

        Returns:
            A list of the n-grams, in order of appearance.
    """
    # `text` is read from the notebook's global namespace, not passed in.
    tokens = nltk.word_tokenize(text)
    return list(ngrams(sequence=tokens, n=n))

In [None]:
# Shared lemmatizer instance, used by `processing` below to lemmatize tokens.
wordnet_lemmatizer = WordNetLemmatizer()

In [None]:
# Remove most frequent and least frequent words
def remove(filtered_text, a, l, h):
    """
        Remove most and least frequent words.
        Args:
            filtered_text (list): tokens to filter; the survivors are joined into the output string.
            a (list) : tokens whose frequencies decide which words are removed.
            l (float): the proportion of distinct words to remove, taken from the least frequent ones.
            h (float): the proportion of distinct words to remove, taken from the most frequent ones.

        Returns:
            str: the tokens of ``filtered_text`` that are neither too frequent nor
            too rare, joined by single spaces.
    """
    f = FreqDist(a)

    df_fdist = pd.DataFrame({'Word': list(f.keys()),
                             'Number of apparitions': list(f.values())})
    # Number of distinct words to drop at each end of the frequency ranking.
    n_least = int(l * len(df_fdist))
    n_most = int(h * len(df_fdist))

    df_fdesc = df_fdist.sort_values(by='Number of apparitions', ascending=False)
    df_fasc = df_fdist.sort_values(by='Number of apparitions', ascending=True)

    most_freq_words = df_fdesc['Word'][:n_most]
    least_freq_words = df_fasc['Word'][:n_least]

    # Use a set so each membership test is O(1). The name avoids shadowing the
    # `stopwords` corpus object imported from nltk at the top of the notebook.
    excluded_words = set(most_freq_words) | set(least_freq_words)
    return list_to_text(filtered_text, excluded_words)

In [None]:
def processing(text):
    """
        Combine all the processing steps for one document.

        Steps: replace non-alphanumeric runs with a space, lower-case, tokenize,
        drop English stop words, lemmatize as nouns, then drop the 6% most and
        6% least frequent distinct words.

        Args:
            text (str): the raw document.

        Returns:
            str: the cleaned, lemmatized document as a space-separated string.
    """
    text = re.sub('[^A-Za-z0-9]+', ' ', text)
    text = text.lower()
    stop_words = set(stopwords.words('english'))
    word_tokens = word_tokenize(text)
    filtered_text = [w for w in word_tokens if w.lower() not in stop_words]
    lemmas = [wordnet_lemmatizer.lemmatize(word, pos='n') for word in filtered_text]
    # Filter the lemmatized tokens themselves. The frequency list is built from
    # lemmas, so filtering the unlemmatized tokens would discard the lemmatization
    # and let inflected forms of removed words slip through.
    return remove(lemmas, lemmas, 0.06, 0.06)

In [None]:
# Clean every loaded article with `processing`.
# `texts` is overwritten in place, so re-running this cell would process the
# already-processed strings again.
texts = list(map(processing, texts))

## LSA

# Define the number of topics or components
num_components = 5

# Build the TF-IDF document-term matrix from the processed articles.
# NOTE(review): `vectorizer` and `X_tfidf` were used below without being defined
# anywhere in the notebook. Default TfidfVectorizer settings are an assumption —
# confirm against the original experiment.
vectorizer = TfidfVectorizer()
X_tfidf = vectorizer.fit_transform(texts)

# Create SVD object
lsa = TruncatedSVD(n_components=num_components, n_iter=100, random_state=42)

# Fit SVD model on data and keep the per-document projection
X_lsa = lsa.fit_transform(X_tfidf)

# Get Singular values and Components
Sigma = lsa.singular_values_
V_transpose = lsa.components_.T

# Vocabulary terms, in column order of X_tfidf.
# get_feature_names() was removed in scikit-learn 1.2; fall back to it only on
# old versions that lack get_feature_names_out().
if hasattr(vectorizer, "get_feature_names_out"):
    terms = list(vectorizer.get_feature_names_out())
else:
    terms = vectorizer.get_feature_names()

# Print the topics with their terms: each row of `components_` is one topic
for topic_idx, component in enumerate(lsa.components_):
    top_terms_key = sorted(zip(terms, component), key=lambda pair: pair[1], reverse=True)[:15]
    top_terms_list = [term for term, _ in top_terms_key]
    print("Topic " + str(topic_idx) + ": ", top_terms_list)

## Randomized SVD

from sklearn.utils.extmath import randomized_svd

# Randomized truncated SVD of the TF-IDF matrix.
# A fixed seed (the same value given to TruncatedSVD in this notebook) keeps
# the decomposition reproducible across runs.
U, Sigma, VT = randomized_svd(X_tfidf,
                              n_components=15,
                              n_iter=5,
                              random_state=42)

# First two rows of U
U[0:2,]

### Dimensionality reduction and clustering before topic extraction

# NOTE(review): `X_FinB` is not defined anywhere in the visible notebook —
# confirm which cell produces it before running this section.
embeddings = X_FinB

import umap.umap_ as umap

reducer = umap.UMAP()

# Reduce the embeddings to 15 dimensions before clustering.
# UMAP is stochastic, so seed it to keep the clusters stable across runs.
umap_embeddings = umap.UMAP(n_components=15,
                            n_neighbors=15,
                            metric='cosine',
                            random_state=42).fit_transform(embeddings)
cluster = hdbscan.HDBSCAN(min_cluster_size=5,
                          metric='euclidean',
                          cluster_selection_method='eom').fit(umap_embeddings)

# HDBSCAN labels noise points -1; that label is not a cluster, so leave it out.
n_clusters = len(set(cluster.labels_) - {-1})
print('Number of clusters/topics ', n_clusters)

# Prepare data: 2-D projection used for plotting only.
# Seeded so the figure is reproducible across runs.
umap_data = umap.UMAP(n_components=2,
                      min_dist=0.0,
                      metric='cosine',
                      random_state=42).fit_transform(embeddings)
result = pd.DataFrame(umap_data, columns=['x', 'y'])
result['labels'] = cluster.labels_

# Visualize clusters: noise points (label -1) in grey, clustered points coloured by label
fig, ax = plt.subplots(figsize=(20, 10))
outliers = result.loc[result.labels == -1, :]
clustered = result.loc[result.labels != -1, :]
ax.scatter(outliers.x, outliers.y, color='#BDBDBD', s=0.5)
clustered_points = ax.scatter(clustered.x, clustered.y, c=clustered.labels, s=3, cmap='hsv_r')
fig.colorbar(clustered_points, ax=ax, label='Cluster label')
ax.set(title='HDBSCAN clusters on the 2-D UMAP projection',
       xlabel='UMAP dimension 1',
       ylabel='UMAP dimension 2');

In [None]:
len(texts)

# One row per document: its text, its cluster label and a running id.
docs_df = pd.DataFrame({"Doc": texts})
docs_df["Topic"] = cluster.labels_
docs_df["Doc_ID"] = range(len(docs_df))

# Concatenate all documents of the same topic into one long string per topic.
topic_groups = docs_df.groupby(['Topic'], as_index = False)
docs_per_topic = topic_groups.agg({'Doc': ' '.join})

def c_tf_idf(documents, m, ngram_range=(1, 1)):
    """
        Compute a class-based TF-IDF matrix.

        Args:
            documents (iterable of str): one string per class/topic.
            m (int): numerator of the idf ratio (the caller passes a document count).
            ngram_range (tuple): n-gram range forwarded to ``CountVectorizer``.

        Returns:
            tuple: ``(tf_idf, count)`` where ``tf_idf`` has one row per vocabulary
            term and one column per entry of ``documents``, and ``count`` is the
            fitted ``CountVectorizer``.
    """
    count = CountVectorizer(ngram_range=ngram_range, stop_words="english").fit(documents)
    t = count.transform(documents).toarray()
    # Total token count of each class document.
    w = t.sum(axis=1)
    # Term frequency per class. A class whose document has no tokens left
    # (w == 0) gets an all-zero column instead of 0/0 = NaN.
    tf = np.divide(t.T, w, out=np.zeros(t.T.shape, dtype=float), where=(w != 0))
    # Total frequency of each term across all classes.
    sum_t = t.sum(axis=0)
    idf = np.log(np.divide(m, sum_t)).reshape(-1, 1)
    tf_idf = np.multiply(tf, idf)

    return tf_idf, count
  
# `m` is the total number of documents. The previous `len(texts[:n])` relied on
# an undefined `n` and raised NameError on a fresh kernel.
tf_idf, count = c_tf_idf(docs_per_topic.Doc.values, m=len(texts))

tf_idf.shape

def extract_top_n_words_per_topic(tf_idf, count, docs_per_topic, n=20):
    """
        Pick the ``n`` highest-scoring words of every topic.

        Args:
            tf_idf (ndarray): score matrix with one row per term and one column per topic.
            count: fitted vectorizer providing the vocabulary, in the row order of ``tf_idf``.
            docs_per_topic (DataFrame): its ``Topic`` column gives the topic labels,
                in the column order of ``tf_idf``.
            n (int): number of words to keep per topic.

        Returns:
            dict: topic label -> list of ``(word, score)`` tuples, best score first.
    """
    # get_feature_names() was removed in scikit-learn 1.2; use it only as a
    # fallback for old versions that lack get_feature_names_out().
    if hasattr(count, "get_feature_names_out"):
        words = count.get_feature_names_out()
    else:
        words = count.get_feature_names()
    labels = list(docs_per_topic.Topic)
    scores_per_topic = tf_idf.T
    # Sort ascending, reverse to best-first, then keep the first n columns.
    # Slicing with [:n] (not [-n:]) also gives an empty result for n == 0.
    best_indices = scores_per_topic.argsort()[:, ::-1][:, :n]
    top_n_words = {}
    for row, label in enumerate(labels):
        top_n_words[label] = [(words[j], scores_per_topic[row][j]) for j in best_indices[row]]
    return top_n_words

def extract_topic_sizes(df):
    """
        Count the documents of every topic.

        Args:
            df (DataFrame): one row per document, with ``Topic`` and ``Doc`` columns.

        Returns:
            DataFrame: columns ``Topic`` and ``Size``, largest topic first.
    """
    docs_per_label = df.groupby(['Topic'])['Doc'].count()
    sizes = docs_per_label.reset_index().rename(columns={"Doc": "Size"})
    return sizes.sort_values("Size", ascending=False)

top_n_words = extract_top_n_words_per_topic(tf_idf, count, docs_per_topic, n=20)
topic_sizes = extract_topic_sizes(docs_df); topic_sizes.head(10)

# Print the 10 best words of every real topic, largest topic first.
k=0
for t in topic_sizes.Topic.values:
    # Label -1 is skipped: only the other topics are numbered and printed.
    if t == -1:
        continue
    top_n_words[t].sort(key=lambda x:- x[1])
    k +=1
    print('')
    print('Topic - ',k)
    # Slice instead of indexing 0..9, so a topic with fewer than 10 words does
    # not raise IndexError.
    for word, _score in top_n_words[t][:10]:
        print(word)

## Linear Discriminant Analysis (LDA)
--> supervised learning !

## Latent Dirichlet Allocation (LDA)
#TODO: document how the data was cleaned and which words were removed

!pip install gensim

# NOTE(review): `corpus` was never defined in the visible notebook. The processed
# articles in `texts` (space-separated strings) are assumed to be the intended
# input — confirm.
corpus = texts
final_doc = [document.split() for document in corpus]

# Map every token to an integer id and convert each document to bag-of-words.
dictionary = corpora.Dictionary(final_doc)
DT_matrix = [dictionary.doc2bow(doc) for doc in final_doc]
Lda_object = gensim.models.ldamodel.LdaModel

# Seed the model so the extracted topics are reproducible across runs.
lda_model_1 = Lda_object(DT_matrix, num_topics=2, id2word = dictionary, random_state=42)
print(lda_model_1.show_topics(num_words=10))

## BERT - Test 1

from sentence_transformers import SentenceTransformer
model = SentenceTransformer('distilbert-base-nli-mean-tokens')

# Encode only the first 10 processed articles as a quick test.
embeddings = model.encode(texts[:10], show_progress_bar=True)

# Was `embeddings,shape`: the comma built a tuple from an undefined name `shape`
# and raised NameError.
embeddings.shape