In [140]:
import math
import re
import time
import string
import pandas as pd
import numpy as np
import hdbscan
from umap import UMAP
import networkx as nx
import jellyfish
from segtok.segmenter import split_multi
from segtok.tokenizer import web_tokenizer, split_contractions
from collections import Counter
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import normalize

from fasttext import load_model
fasttext_model = 'wiki/wiki.fr.bin'
fmodel = load_model(fasttext_model)



In [141]:
def txtPreprocessing(txt):
    x = str(txt).lower()
    x = re.sub(r'http\S+', 'URL', x)
    x = re.sub(r'@[^\s]+', 'USER', x)
    x = re.sub(r'#[^\s]+', 'HASHTAG',x)
    x = re.sub('<.*?>', ' ', x)
    x = re.sub(' +', ' ', x)
    x = re.sub("\[.*?\]"," ",x)
    x = re.sub("[()!?',:;!.\"\\n]"," ", x)
    x = re.sub("['\"\\n]"," ", x)
    x = re.sub("www.\S+"," ", x)
    x = re.sub('[{!"#$%&\'()*’+,-./:;<=>?@[\\]^_`{»|«}~}]', ' ',x)
    x = re.sub('\s+', ' ', x)
    x = re.sub("(.)\\1{2,}", "\\1", x)
    x = re.sub(' +', ' ',x)
    x = x.strip()
#     x = " ".join([word for word in x.split(" ") if word not in stopWords])
    return x

In [142]:
#data
data = pd.read_parquet('/data/big-data-collection/OTHERS/data_esg.parquet')

In [143]:
data = data[['TITRE','CONTENU']]
data["txt"] = data['TITRE'] +" "+data['CONTENU']
data["txtClean"] = data.txt.apply(lambda x:txtPreprocessing(x))

In [3]:
data = pd.read_csv('/data/notebooks/others/french_tweets.csv')

In [4]:
data["txtClean"] = data.text.apply(lambda x:txtPreprocessing(x))

In [None]:
# txtVec = []
# for txt in _data.txtClean.to_list():
#     txtVec.append(fmodel.get_sentence_vector(txt))
# _data["txtVec"] = txtVec

In [49]:
stopWords = open('./stopwordFr', 'r').readlines()
for i in range(len(stopWords)):
    stopWords[i] = stopWords[i].replace('\n', '').replace(' ', '')

In [50]:
stopWords = list(set(stopWords))

In [None]:
start_time = time.time()
print("--- %s seconds ---" % (time.time() - start_time))

In [None]:
# umap_model.fit(embeddings)

In [144]:
def getClusers(embeddings):
    fifteen = len(embeddings)*.22
    min_cluster_size, min_samples = 20, 2
    clusters = np.full(embeddings.shape[0], -1)
    _clusters = np.arange(len(clusters))
    umap_model = UMAP(n_neighbors=15, n_components=5, min_dist=0.0, metric='cosine', low_memory=False, verbose=False)
    if len(embeddings)//50000 < 2:
        _umap_embeddings = umap_model.fit_transform(embeddings)
        _embeddings = np.nan_to_num(_umap_embeddings) 
    else:
        umap_model.fit(embeddings[:int(fifteen)])
        chunks = np.array_split(embeddings, len(embeddings)//50000)
        _umap_embeddings = []
        for chunk in chunks:
            _umap_embeddings.extend(umap_model.transform(embeddings))
        _embeddings = np.nan_to_num(_umap_embeddings)    
    
    while(len(_clusters)>fifteen):
        nbrCl = len(set(clusters))-1
        hdbscan_model =hdbscan.HDBSCAN(min_cluster_size=min_cluster_size, min_samples=min_samples, metric='euclidean',
                                       cluster_selection_method='eom', prediction_data=True)
        hdbscan_model.fit(_embeddings)
        indexes = np.argwhere(hdbscan_model.labels_!=-1)
        for i, v in zip(indexes, hdbscan_model.labels_[indexes]):
            clusters[_clusters[i]] = nbrCl + v
        _clusters = np.argwhere(hdbscan_model.labels_==-1)
        _embeddings = _embeddings[_clusters]
        _embeddings = _embeddings.reshape(len(_clusters),-1)
        min_cluster_size, min_samples = 10, 1
    return clusters

In [145]:
50002//50000

1

In [138]:
def getTopcKeywords(doc, nbrKeywords=15):
    tfidf_vectorizer = TfidfVectorizer(ngram_range=(1,1), stop_words=stopWords)
    tfidf = tfidf_vectorizer.fit_transform([doc])
    score = np.argsort(np.asarray(tfidf.sum(axis=0)).ravel())[::-1]
    candidates = np.array(tfidf_vectorizer.get_feature_names_out())[score[:277]]
    return ' '.join(candidates)
#     doc_embedding = fmodel.get_sentence_vector(doc)
#     candidate_embeddings = []
#     for w in candidates:
#         candidate_embeddings.append(fmodel.get_sentence_vector(w))
#     distances = cosine_similarity([doc_embedding], candidate_embeddings)[0]
#     return ' '.join([ candidates[index] for index in distances.argsort()[-nbrKeywords:]][::-1])

In [44]:
STOPWORD_WEIGHT = 'bi'
class DataCore(object):
    
    def __init__(self, text, stopword_set, windowsSize, n, tagsToDiscard = set(['u', 'd']), exclude = set(string.punctuation)):
        self.number_of_sentences = 0
        self.number_of_words = 0
        self.terms = {}
        self.candidates = {}
        self.sentences_obj = []
        self.sentences_str = []
        self.G = nx.DiGraph()
        self.exclude = exclude
        self.tagsToDiscard = tagsToDiscard
        self.freq_ns = {}
        for i in range(n):
            self.freq_ns[i+1] = 0.
        self.stopword_set = stopword_set
        self._build(text, windowsSize, n)

    def build_candidate(self, candidate_string):
        sentences_str = [w for w in split_contractions(web_tokenizer(candidate_string.lower())) if not (w.startswith("'") and len(w) > 1) and len(w) > 0]
        candidate_terms = []
        for (i, word) in enumerate(sentences_str):
            tag = self.getTag(word, i)
            term_obj = self.getTerm(word, save_non_seen=False)
            if term_obj.tf == 0:
                term_obj = None
            candidate_terms.append( (tag, word, term_obj) )
        if len([cand for cand in candidate_terms if cand[2] != None]) == 0:
            invalid_virtual_cand = composed_word(None)
            return invalid_virtual_cand
        virtual_cand = composed_word(candidate_terms)
        return virtual_cand

    # Build the datacore features
    def _build(self, text, windowsSize, n):
        text = self.pre_filter(text)
        self.sentences_str = [ [w for w in split_contractions(web_tokenizer(s)) if not (w.startswith("'") and len(w) > 1) and len(w) > 0] for s in list(split_multi(text)) if len(s.strip()) > 0]
        self.number_of_sentences = len(self.sentences_str)
        pos_text = 0
        block_of_word_obj = []
        sentence_obj_aux = []
        for (sentence_id, sentence) in enumerate(self.sentences_str):
            sentence_obj_aux = []
            block_of_word_obj = []
            for (pos_sent, word) in enumerate(sentence):
                if len([c for c in word if c in self.exclude]) == len(word): # If the word is based on exclude chars
                    if len(block_of_word_obj) > 0:
                        sentence_obj_aux.append( block_of_word_obj )
                        block_of_word_obj = []
                else:
                    tag = self.getTag(word, pos_sent)
                    term_obj = self.getTerm(word)
                    term_obj.addOccur(tag, sentence_id, pos_sent, pos_text)
                    pos_text += 1

                    #Create co-occurrence matrix
                    if tag not in self.tagsToDiscard:
                        word_windows = list(range( max(0, len(block_of_word_obj)-windowsSize), len(block_of_word_obj) ))
                        for w in word_windows:
                            if block_of_word_obj[w][0] not in self.tagsToDiscard: 
                                self.addCooccur(block_of_word_obj[w][2], term_obj)
                    #Generate candidate keyphrase list
                    candidate = [ (tag, word, term_obj) ]
                    cand = composed_word(candidate)
                    self.addOrUpdateComposedWord(cand)
                    word_windows = list(range( max(0, len(block_of_word_obj)-(n-1)), len(block_of_word_obj) ))[::-1]
                    for w in word_windows:
                        candidate.append(block_of_word_obj[w])
                        self.freq_ns[len(candidate)] += 1.
                        cand = composed_word(candidate[::-1])
                        self.addOrUpdateComposedWord(cand)

                    # Add term to the block of words' buffer
                    block_of_word_obj.append( (tag, word, term_obj) )

            if len(block_of_word_obj) > 0:
                sentence_obj_aux.append( block_of_word_obj )

            if len(sentence_obj_aux) > 0:
                self.sentences_obj.append(sentence_obj_aux)

        if len(block_of_word_obj) > 0:
            sentence_obj_aux.append( block_of_word_obj )

        if len(sentence_obj_aux) > 0:
            self.sentences_obj.append(sentence_obj_aux)

        self.number_of_words = pos_text

    def build_single_terms_features(self, features=None):
        validTerms = [ term for term in self.terms.values() if not term.stopword ]
        validTFs = (np.array([ x.tf for x in validTerms ]))

        if len(validTFs) == 0:
            return

        avgTF = validTFs.mean()
        stdTF = validTFs.std()
        maxTF = max([ x.tf for x in self.terms.values()])
        list(map(lambda x: x.updateH(maxTF=maxTF, avgTF=avgTF, stdTF=stdTF, number_of_sentences=self.number_of_sentences, features=features), self.terms.values()))

    def build_mult_terms_features(self, features=None):
        list(map(lambda x: x.updateH(features=features), [cand for cand in self.candidates.values() if cand.isValid()]))

    def pre_filter(self, text):
        prog = re.compile("^(\\s*([A-Z]))")
        parts = text.split('\n')
        buffer = ''
        for part in parts:
            sep = ' '
            if prog.match(part):
                sep = '\n\n'
            buffer += sep + part.replace('\t',' ')
        return buffer

    def getTag(self, word, i):
        try:
            w2 = word.replace(",","")
            float(w2)
            return "d"
        except:
            cdigit = len([c for c in word if c.isdigit()])
            calpha = len([c for c in word if c.isalpha()])
            if ( cdigit > 0 and calpha > 0 ) or (cdigit == 0 and calpha == 0) or len([c for c in word if c in self.exclude]) > 1:
                return "u"
            if len(word) == len([c for c in word if c.isupper()]):
                return "a"
            if len([c for c in word if c.isupper()]) == 1 and len(word) > 1 and word[0].isupper() and i > 0:
                return "n"
        return "p"

    def getTerm(self, str_word, save_non_seen=True):
        unique_term = str_word.lower()
        simples_sto = unique_term in self.stopword_set
        if unique_term.endswith('s') and len(unique_term) > 3:
            unique_term = unique_term[:-1]

        if unique_term in self.terms:
            return self.terms[unique_term]
                
        # Include this part
        simples_unique_term = unique_term
        for pontuation in self.exclude:
            simples_unique_term = simples_unique_term.replace(pontuation, '')
        # until here
        isstopword = simples_sto or unique_term in self.stopword_set or len(simples_unique_term) < 3
        
        term_id = len(self.terms)
        term_obj = single_word(unique_term, term_id, self.G)
        term_obj.stopword = isstopword

        if save_non_seen:
            self.G.add_node(term_id)
            self.terms[unique_term] = term_obj

        return term_obj

    def addCooccur(self, left_term, right_term):
        if right_term.id not in self.G[left_term.id]:
            self.G.add_edge(left_term.id, right_term.id, TF=0.)
        self.G[left_term.id][right_term.id]["TF"]+=1.
        
    def addOrUpdateComposedWord(self, cand):
        if cand.unique_kw not in self.candidates:
            self.candidates[cand.unique_kw] = cand
        else:
            self.candidates[cand.unique_kw].uptadeCand(cand)
        self.candidates[cand.unique_kw].tf += 1.


class composed_word(object):
    def __init__(self, terms): # [ (tag, word, term_obj) ]
        if terms == None:
             self.start_or_end_stopwords = True
             self.tags = set()
             return
        self.tags = set([''.join([ w[0] for w in terms ])])
        self.kw = ' '.join( [ w[1] for w in terms ] )
        self.unique_kw = self.kw.lower()
        self.size = len(terms)
        self.terms = [ w[2] for w in terms if w[2] != None ]
        self.tf = 0.
        self.integrity = 1.
        self.H = 1.
        self.start_or_end_stopwords = self.terms[0].stopword or self.terms[-1].stopword

    def uptadeCand(self, cand):
        for tag in cand.tags:
            self.tags.add( tag )

    def isValid(self):
        isValid = False
        for tag in self.tags:
            isValid = isValid or ( "u" not in tag and "d" not in tag )
        return isValid and not self.start_or_end_stopwords

    def get_composed_feature(self, feature_name, discart_stopword=True):
        list_of_features = [ getattr(term, feature_name) for term in self.terms if ( discart_stopword and not term.stopword ) or not discart_stopword ]
        sum_f  = sum(list_of_features)
        prod_f = np.prod(list_of_features)
        return ( sum_f, prod_f, prod_f /(sum_f + 1) )

    def build_features(self, doc_id=None, keys=None, rel=True, rel_approx=True, isVirtual=False, features=['WFreq', 'WRel', 'tf', 'WCase', 'WPos', 'WSpread'], _stopword=[True, False]):
        columns = []
        seen = set()
        features_cand = []

        if doc_id != None:
            columns.append('doc_id')
            features_cand.append(doc_id)

        if keys != None:
            if rel:
                columns.append('rel')
                if self.unique_kw in keys or isVirtual:
                    features_cand.append(1)
                    seen.add(self.unique_kw)
                else:
                    features_cand.append(0)

            if rel_approx:
                columns.append('rel_approx')
                max_gold_ = ('', 0.)
                for gold_key in keys:
                    dist = 1.-jellyfish.levenshtein_distance(gold_key, self.unique_kw ) / max(len(gold_key), len(self.unique_kw)) # _tL
                    if max_gold_[1] < dist:
                        max_gold_ = ( gold_key, dist )
                features_cand.append(max_gold_[1])

        columns.append('kw')
        features_cand.append(self.unique_kw)
        columns.append('h')
        features_cand.append(self.H)
        columns.append('tf')
        features_cand.append(self.tf)
        columns.append('size')
        features_cand.append(self.size)
        columns.append('isVirtual')
        features_cand.append(int(isVirtual))

        for feature_name in features:

            for discart_stopword in _stopword:
                (f_sum, f_prod, f_sum_prod) = self.get_composed_feature(feature_name, discart_stopword=discart_stopword)
                columns.append('%ss_sum_K%s' % ('n' if discart_stopword else '', feature_name) )
                features_cand.append(f_sum)

                columns.append('%ss_prod_K%s' % ('n' if discart_stopword else '', feature_name) )
                features_cand.append(f_prod)

                columns.append('%ss_sum_prod_K%s' % ('n' if discart_stopword else '', feature_name) )
                features_cand.append(f_sum_prod)

        return (features_cand, columns, seen)

    def updateH(self, features=None, isVirtual=False):
        sum_H  = 0.
        prod_H = 1.

        for (t, term_base) in enumerate(self.terms):
            if not term_base.stopword:
                sum_H += term_base.H
                prod_H *= term_base.H

            else:
                if STOPWORD_WEIGHT == 'bi':
                    prob_t1 = 0.
                    if term_base.G.has_edge(self.terms[t-1].id, self.terms[ t ].id):
                        prob_t1 = term_base.G[self.terms[t-1].id][self.terms[ t ].id]["TF"] / self.terms[t-1].tf

                    prob_t2 = 0.
                    if term_base.G.has_edge(self.terms[ t ].id, self.terms[t+1].id):
                        prob_t2 = term_base.G[self.terms[ t ].id][self.terms[t+1].id]["TF"] / self.terms[t+1].tf

                    prob = prob_t1 * prob_t2
                    prod_H *= (1 + (1 - prob ) )
                    sum_H -= (1 - prob)
                elif STOPWORD_WEIGHT == 'h':
                    sum_H += term_base.H
                    prod_H *= term_base.H
                elif STOPWORD_WEIGHT == 'none':
                    pass

        tf_used = 1.
        if features == None or "KPF" in features:
            tf_used = self.tf

        if isVirtual:
            tf_used = np.mean( [term_obj.tf for term_obj in self.terms] )

        self.H = prod_H / ( ( sum_H + 1 ) * tf_used )

    def updateH_old(self, features=None, isVirtual=False):
        sum_H  = 0.
        prod_H = 1.

        for (t, term_base) in enumerate(self.terms):
            if isVirtual and term_base.tf==0:
                continue

            if term_base.stopword:
                prob_t1 = 0.
                if term_base.G.has_edge(self.terms[t-1].id, self.terms[ t ].id):
                    prob_t1 = term_base.G[self.terms[t-1].id][self.terms[ t ].id]["TF"] / self.terms[t-1].tf

                prob_t2 = 0.
                if term_base.G.has_edge(self.terms[ t ].id, self.terms[t+1].id):
                    prob_t2 = term_base.G[self.terms[ t ].id][self.terms[t+1].id]["TF"] / self.terms[t+1].tf

                prob = prob_t1 * prob_t2
                prod_H *= (1 + (1 - prob ) )
                sum_H -= (1 - prob)
            else:
                sum_H += term_base.H
                prod_H *= term_base.H
        tf_used = 1.
        if features == None or "KPF" in features:
            tf_used = self.tf
        if isVirtual:
            tf_used = np.mean( [term_obj.tf for term_obj in self.terms] )
        self.H = prod_H / ( ( sum_H + 1 ) * tf_used )


class single_word(object):

    def __init__(self, unique, idx, graph):
        self.unique_term = unique
        self.id = idx
        self.tf = 0.
        self.WFreq = 0.0
        self.WCase = 0.0
        self.tf_a = 0.
        self.tf_n = 0.
        self.WRel = 1.0
        self.PL = 0.
        self.PR = 0.
        self.occurs = {}
        self.WPos = 1.0
        self.WSpread = 0.0
        self.H = 0.0
        self.stopword = False
        self.G = graph

        self.pagerank = 1.

    def updateH(self, maxTF, avgTF, stdTF, number_of_sentences, features=None):
        """if features == None or "WRel" in features:
            self.PL = self.WDL / maxTF
            self.PR = self.WDR / maxTF
            self.WRel = ( (0.5 + (self.PWL * (self.tf / maxTF) + self.PL)) + (0.5 + (self.PWR * (self.tf / maxTF) + self.PR)) )"""

        if features == None or "WRel" in features:
            self.PL = self.WDL / maxTF
            self.PR = self.WDR / maxTF
            self.WRel = ( (0.5 + (self.PWL * (self.tf / maxTF))) + (0.5 + (self.PWR * (self.tf / maxTF))) )

        if features == None or "WFreq" in features:
            self.WFreq = self.tf / (avgTF + stdTF)
        
        if features == None or "WSpread" in features:
            self.WSpread = len(self.occurs) / number_of_sentences
        
        if features == None or "WCase" in features:
            self.WCase = max(self.tf_a, self.tf_n) / (1. + math.log(self.tf))
        
        if features == None or "WPos" in features:
            self.WPos = math.log( math.log( 3. + np.median(list(self.occurs.keys())) ) )

        self.H = (self.WPos * self.WRel) / (self.WCase + (self.WFreq / self.WRel) + (self.WSpread / self.WRel))
        
    @property
    def WDR(self):
        return len( self.G.out_edges(self.id) )

    @property
    def WIR(self):
        return sum( [ d['TF'] for (u,v,d) in self.G.out_edges(self.id, data=True) ] )

    @property
    def PWR(self):
        wir = self.WIR
        if wir == 0:
            return 0
        return self.WDR / wir 
    
    @property
    def WDL(self):
        return len( self.G.in_edges(self.id) )

    @property
    def WIL(self):
        return sum( [ d['TF'] for (u,v,d) in self.G.in_edges(self.id, data=True) ] )

    @property
    def PWL(self):
        wil = self.WIL
        if wil == 0:
            return 0
        return self.WDL / wil 

    def addOccur(self, tag, sent_id, pos_sent, pos_text):
        if sent_id not in self.occurs:
            self.occurs[sent_id] = []

        self.occurs[sent_id].append( (pos_sent, pos_text) )
        self.tf += 1.

        if tag == "a":
            self.tf_a += 1.
        if tag == "n":
            self.tf_n += 1.

In [45]:
class Levenshtein(object):

    @staticmethod
    def __ratio(distance, str_length):
        return 1 - float(distance) / float(str_length)

    @staticmethod
    def ratio(seq1, seq2):
        str_distance = Levenshtein.distance(seq1,seq2)
        str_length = max(len(seq1),len(seq2))
        return Levenshtein.__ratio(str_distance,str_length)

    @staticmethod
    def distance(seq1, seq2):  
        size_x = len(seq1) + 1
        size_y = len(seq2) + 1
        matrix = np.zeros ((size_x, size_y))
        for x in range(size_x):
            matrix [x, 0] = x
        for y in range(size_y):
            matrix [0, y] = y

        for x in range(1, size_x):
            for y in range(1, size_y):
                if seq1[x-1] == seq2[y-1]:
                    matrix [x,y] = min(
                        matrix[x-1, y] + 1,
                        matrix[x-1, y-1],
                        matrix[x, y-1] + 1
                    )
                else:
                    matrix [x,y] = min(
                        matrix[x-1,y] + 1,
                        matrix[x-1,y-1] + 1,
                        matrix[x,y-1] + 1
                    )
        return (matrix[size_x - 1, size_y - 1])

In [46]:
class KeywordExtractor(object):

    def __init__(self, n=3, dedupLim=0.9, dedupFunc='seqm', windowsSize=1, top=20, features=None, stopwords=None):

        self.stopword_set = set(stopwords)
        self.n = n
        self.top = top
        self.dedupLim = dedupLim
        self.features = features
        self.windowsSize = windowsSize
        if dedupFunc == 'jaro_winkler' or dedupFunc == 'jaro':
            self.dedu_function = self.jaro
        elif dedupFunc.lower() == 'sequencematcher' or dedupFunc.lower() == 'seqm':
            self.dedu_function = self.seqm
        else:
            self.dedu_function = self.levs

    def jaro(self, cand1, cand2):
        return jellyfish.jaro_winkler(cand1, cand2 )

    def levs(self, cand1, cand2):
        return 1.-jellyfish.levenshtein_distance(cand1, cand2 ) / max(len(cand1),len(cand2))

    def seqm(self, cand1, cand2):
        return Levenshtein.ratio(cand1, cand2)

    def extract_keywords(self, text):
        try:
            if not(len(text) > 0):
                return []

            text = text.replace('\n\t',' ')
            dc = DataCore(text=text, stopword_set=self.stopword_set, windowsSize=self.windowsSize, n=self.n)
            dc.build_single_terms_features(features=self.features)
            dc.build_mult_terms_features(features=self.features)
            resultSet = []
            todedup = sorted([cc for cc in dc.candidates.values() if cc.isValid()], key=lambda c: c.H)

            if self.dedupLim >= 1.:
                return ([ (cand.H, cand.unique_kw) for cand in todedup])[:self.top]

            for cand in todedup:
                toadd = True
                for (h, candResult) in resultSet:
                    dist = self.dedu_function(cand.unique_kw, candResult.unique_kw)
                    if dist > self.dedupLim:
                        toadd = False
                        break
                if toadd:
                    resultSet.append( (cand.H, cand) )
                if len(resultSet) == self.top:
                    break

            return [ cand.kw for (h,cand) in resultSet]
        except Exception as e:
            print(f"Warning! Exception: {e} generated by the following text: '{text}' ")
            return []

In [146]:
def getTopic(df, txt):
    start_time = time.time()
    _data = df.copy()
    txtVec = []
    for txt in _data[txt].to_list():
        txtVec.append(fmodel.get_sentence_vector(txt))
    _data["txtVec"] = txtVec
    
    print("---txt2Vec: %s seconds ---" % (time.time() - start_time))
    start_time = time.time()
    
    
    embeddings = np.array(_data.txtVec.to_list())
    clusters = getClusers(embeddings)
    _data['Topic'] = clusters
    nbrTopic = len(set(clusters))
    _data = _data[_data.Topic != -1]
    
    documents_per_topic = _data.groupby(['Topic'], as_index=False).agg({'txtClean': ' '.join})
    _documents = documents_per_topic.txtClean.values
    
    print("---get initTopic: %s seconds ---" % (time.time() - start_time))
    start_time = time.time()
    
    topicKey50 = []
    for doc in _documents: topicKey50.append(getTopcKeywords(doc,50))
    documents_per_topic['TopicKey50'] = topicKey50
    
    topVec = []
    for txt in documents_per_topic.TopicKey50.to_list():
        topVec.append(fmodel.get_sentence_vector(txt))

    print("---get initTopics Comun word : %s seconds ---" % (time.time() - start_time))
    start_time = time.time()    
    
    hdbscan_model =hdbscan.HDBSCAN(metric='euclidean', cluster_selection_method='eom', prediction_data=True)
    hdbscan_model.fit(topVec)

    _topics = documents_per_topic.Topic.to_list()
    for i, t in enumerate(hdbscan_model.labels_):
        if t !=-1: _topics[i] = nbrTopic + t 

    print("---get Topics : %s seconds ---" % (time.time() - start_time))
    start_time = time.time()  
            
    documents_per_topic['nTopic'] = _topics
    print(documents_per_topic.shape)
    documents_per_topic['nTopic'] = documents_per_topic.nTopic.apply(lambda x: np.argwhere(np.array(list(set(_topics))) == x).item())
    documents_per_topic = documents_per_topic.groupby(['nTopic'], as_index=False).agg({'txtClean': ' '.join})
    _documents = documents_per_topic.txtClean.values
    print(documents_per_topic.shape)    
    topicKey = []
    for doc in _documents:
        k_extractor = KeywordExtractor(stopwords=stopWords, n=1, dedupLim=0.9, dedupFunc='jaro', windowsSize=10, top=20, features=None)
        topicKey.append(k_extractor.extract_keywords(doc))
    documents_per_topic['TopicKey'] = topicKey
    
    
    print("---get Topics Keywords: %s seconds ---" % (time.time() - start_time))
    start_time = time.time()      
    
    nTopic  = {}
    for i,j in zip(documents_per_topic.Topic.to_list(), documents_per_topic.nTopic.to_list()): nTopic[i] = j
    topic = _data.Topic.apply(lambda x: nTopic[x]).to_list()
    
    topicDic = {}
    cc = Counter(topic)
    for i,j in zip(documents_per_topic.nTopic.to_list(),documents_per_topic.TopicKey.to_list()): topicDic[i] = {"topic":j,"f":cc[i]}
    return topic, topicDic

In [147]:
start_time_ = time.time()
td = getTopic(data,'txtClean')
print("---ALL %s seconds ---" % (time.time() - start_time_))

---txt2Vec: 98.83606386184692 seconds ---
---get initTopic: 12.768213272094727 seconds ---
---get initTopics Comun word : 12.644285678863525 seconds ---
---get Topics : 0.5437262058258057 seconds ---
(794, 4)
(460, 2)
---get Topics Keywords: 872.4780995845795 seconds ---


AttributeError: 'DataFrame' object has no attribute 'Topic'

In [88]:
txt = td.txtClean.to_list()

In [None]:
# import tensorflow_hub as hub
# embed = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")
embeddings = embed(txt)
print(embeddings)

2022-07-11 19:41:24.602794: W tensorflow/core/framework/cpu_allocator_impl.cc:82] Allocation of 5874295872 exceeds 10% of free system memory.
2022-07-11 19:41:25.868126: W tensorflow/core/framework/cpu_allocator_impl.cc:82] Allocation of 5873572080 exceeds 10% of free system memory.
2022-07-11 19:41:26.748813: W tensorflow/core/framework/cpu_allocator_impl.cc:82] Allocation of 5874295872 exceeds 10% of free system memory.
2022-07-11 19:41:27.770356: W tensorflow/core/framework/cpu_allocator_impl.cc:82] Allocation of 5874295872 exceeds 10% of free system memory.
2022-07-11 19:41:35.475174: W tensorflow/core/framework/cpu_allocator_impl.cc:82] Allocation of 11748591744 exceeds 10% of free system memory.


In [162]:
pd.DataFrame(td)#[1:2].topic.to_list()

Unnamed: 0,nTopic,txtClean
0,0,abdoulaye sow vice président fsf et directeur ...
1,1,a hambourg les sociaux démocrates prêts à cont...
2,2,niayes moussa baldé souligne les bonds impress...
3,3,la demarche du pdidas expliquee par sa coordin...
4,4,des experts se penchent sur la gestion durable...
...,...,...
173,173,agro industrie un projet de plus d un milliard...
174,174,les activites du projet agri jeunes tekki ndaw...
175,175,75 nouveaux cas et 3 décès dus au coronavirus ...
176,176,communiqué du conseil des ministres du 19 févr...


In [None]:
!free -gh

In [5]:
# start_time = time.time()
_data = data.copy()
txtVec = []
for txt in _data.txtClean.to_list():
    txtVec.append(fmodel.get_sentence_vector(txt))
_data["txtVec"] = txtVec

In [6]:
embeddings = np.array(_data.txtVec.to_list())

In [7]:
len(embeddings)*.3

458017.2

In [9]:
fifteen = 100000
min_cluster_size, min_samples = 20, 2
clusters = np.full(embeddings.shape[0], -1)
_clusters = np.arange(len(clusters))
umap_model = UMAP(n_neighbors=15, n_components=5, min_dist=0.0, metric='cosine', low_memory=False, verbose=False)
umap_model.fit(embeddings[:int(fifteen)])
# _umap_embeddings = umap_model.transform(embeddings)
# _embeddings = np.nan_to_num(_umap_embeddings)

In [21]:
start_time = time.time()
___umap_embeddings = umap_model.transform(embeddings[100000:200000])
print("---txt2Vec: %s seconds ---" % (time.time() - start_time))

---txt2Vec: 211.80362462997437 seconds ---


In [28]:
len(___umap_embeddings)

100000

In [22]:
import faiss

In [31]:
mat = faiss.PCAMatrix (300, 10)
mat.train(embeddings[:int(fifteen)])

In [131]:
start_time = time.time()
k_extractor = KeywordExtractor(stopwords=stopWords, n=1, dedupLim=0.9, dedupFunc='jaro', windowsSize=10, top=20, features=None)
jj1 = k_extractor.extract_keywords(txt[200])
print("---txt2Vec: %s seconds ---" % (time.time() - start_time))
jj1

---txt2Vec: 0.0201570987701416 seconds ---


['journées',
 'contribution',
 'document',
 'agroécologique',
 'sénégal',
 'transition',
 'politique',
 'nationale',
 'édition',
 'consacrée',
 'dynamique',
 'recommandations',
 'plénière',
 'tdr',
 'janvier',
 'grand',
 'groupes',
 'dakar',
 'hautes',
 'autorités']

In [163]:
txt = data.txtClean.to_list()

In [129]:
stopWords.append('nbsp')

In [33]:
from sklearn.decomposition import PCA


In [34]:
start_time = time.time()
tr = pca.transform(embeddings)
print("---txt2Vec: %s seconds ---" % (time.time() - start_time))

---txt2Vec: 0.6962049007415771 seconds ---


In [35]:
faiss.kmeans_clustering(tr)

TypeError: kmeans_clustering() missing 4 required positional arguments: 'n', 'k', 'x', and 'centroids'

In [3]:
from sentence_transformers import SentenceTransformer
model = SentenceTransformer('bigscience/bloom')

Downloading:   0%|          | 0.00/4.93G [00:00<?, ?B/s]

KeyboardInterrupt: 