## This notebook:
* Calculates the topic coherence of a GSDMM (Gibbs Sampling Dirichlet Multinomial Mixture) topic model fitted on course descriptions

In [3]:
import pandas as pd
import neattext.functions as nfx

In [4]:
import numpy as np
import json
import glob

from collections import Counter, defaultdict

#Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

#spacy
import spacy
import nltk
# nltk.download()
from nltk.corpus import stopwords

#vis
import pyLDAvis
import pyLDAvis.gensim_models

import warnings
warnings.filterwarnings("ignore")

pyLDAvis.enable_notebook()

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  EPS = np.finfo(np.float).eps


In [5]:
#pip install git+https://github.com/rwalk/gsdmm.git ## install gsdmm

In [6]:
from gsdmm import MovieGroupProcess

In [7]:
# Load the course tables. Paths are relative, so the notebook must be run from
# the directory that contains `assets/`.
# Only `f_21` is consumed by the cells in this section; `online` and `w_22` are
# loaded but not referenced here.
# NOTE(review): presumably f_21 / w_22 are the Fall 2021 / Winter 2022 course
# listings — confirm against the data source.
online = pd.read_csv('assets/original/2021-10-19-MichiganOnline-courses.csv')
f_21 = pd.read_csv('assets/f_21_merge.csv')
w_22 = pd.read_csv('assets/w_22_merge.csv')

In [8]:
def prepare_text(df, column='description', allowed_postags=("NOUN",), n_common=10):
    """Convert a free-text column into tokenised documents for topic modelling.

    Pipeline, in order:
      1. drop missing values from ``df[column]``;
      2. keep only the lemmas of tokens whose spaCy POS tag is in
         ``allowed_postags``;
      3. remove NLTK English stop words plus the ``n_common`` most frequent
         lemmas of this corpus;
      4. tokenise with ``gensim.utils.simple_preprocess`` (``deacc=True``);
      5. merge frequent collocations into bigram / trigram tokens.

    Parameters
    ----------
    df : pandas.DataFrame
        Table holding the text column.
    column : str, default 'description'
        Name of the column to process.
    allowed_postags : iterable of str, default ("NOUN",)
        spaCy coarse POS tags to keep, e.g. ("NOUN", "ADJ", "VERB", "ADV").
    n_common : int, default 10
        Number of corpus-specific most frequent lemmas treated as stop words.

    Returns
    -------
    list of list of str
        One token list per non-null row of ``df[column]``.

    Notes
    -----
    Needs the spaCy model ``en_core_web_sm`` and the NLTK ``stopwords`` corpus
    to be installed.
    """
    data = df[column].dropna()

    def lemmatization(texts):
        """Return, per document, the lemmas of the tokens with an allowed POS tag."""
        nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])
        keep = set(allowed_postags)
        # nlp.pipe processes the documents in batches instead of one call per text.
        return [
            [token.lemma_ for token in doc if token.pos_ in keep]
            for doc in nlp.pipe(texts)
        ]

    lemmatized_texts = lemmatization(data)

    def stop_word_removal(texts):
        """Drop stop words and re-join each document into one string."""
        # Count lower-cased lemmas: the filter below compares lower-cased
        # tokens, so the frequency list has to be built in the same case or a
        # capitalised frequent lemma would never match and slip through.
        counts = Counter(t.lower() for text in texts for t in text)
        common_words = [word for word, _ in counts.most_common(n_common)]
        stop = set(stopwords.words('english')).union(common_words)
        return [
            " ".join(t for t in text if t.lower() not in stop)
            for text in texts
        ]

    stop_word_removed_texts = stop_word_removal(lemmatized_texts)

    def gen_words(texts):
        """Tokenise each document string (lower-cased, accents stripped)."""
        return [gensim.utils.simple_preprocess(text, deacc=True) for text in texts]

    data_words = gen_words(stop_word_removed_texts)

    # Learn collocations: bigrams on the plain tokens, trigrams on the
    # bigram-merged stream.
    bigram_phrases = gensim.models.Phrases(data_words, min_count=5, threshold=100)
    trigram_phrases = gensim.models.Phrases(bigram_phrases[data_words], threshold=100)

    bigram = gensim.models.phrases.Phraser(bigram_phrases)
    trigram = gensim.models.phrases.Phraser(trigram_phrases)

    # Apply the bigram model exactly once before the trigram model, which is
    # the same input shape the trigram model was trained on.
    data_bigrams_trigrams = [trigram[bigram[doc]] for doc in data_words]

    return data_bigrams_trigrams

# Tokenised corpus built from the f_21 descriptions; the model, dictionary and
# coherence cells below all reuse `texts`.
texts = prepare_text(f_21)

In [12]:
def gsdmm(texts, num_topics, alpha=0.01, beta=0.01, n_iters=30, seed=None):
    """Fit a GSDMM (MovieGroupProcess) model on tokenised documents.

    Parameters
    ----------
    texts : list of list of str
        Tokenised documents.
    num_topics : int
        Upper bound ``K`` on the number of clusters.
    alpha, beta : float, default 0.01
        Hyperparameters forwarded to ``MovieGroupProcess``.
    n_iters : int, default 30
        Number of sampling iterations.
    seed : int or None, default None
        When given, NumPy's global random generator is seeded before fitting
        so that repeated runs produce the same clustering. ``None`` keeps the
        previous, unseeded behaviour.
        NOTE(review): this relies on MovieGroupProcess sampling through
        ``numpy.random`` — confirm for the installed gsdmm version.

    Returns
    -------
    MovieGroupProcess
        The fitted model (the per-document labels returned by ``fit`` are not
        kept).
    """
    if seed is not None:
        np.random.seed(seed)

    mgp = MovieGroupProcess(K=num_topics, alpha=alpha, beta=beta, n_iters=n_iters)

    # fit() needs the vocabulary size, i.e. the number of distinct tokens.
    n_terms = len({term for doc in texts for term in doc})
    mgp.fit(texts, n_terms)

    return mgp

# Fit GSDMM with at most 20 clusters; the fit log printed below reports how
# many clusters stay populated after each iteration.
mgp = gsdmm(texts, 20)

In stage 0: transferred 16131 clusters with 20 clusters populated
In stage 1: transferred 2101 clusters with 20 clusters populated
In stage 2: transferred 557 clusters with 20 clusters populated
In stage 3: transferred 388 clusters with 20 clusters populated
In stage 4: transferred 347 clusters with 20 clusters populated
In stage 5: transferred 330 clusters with 20 clusters populated
In stage 6: transferred 310 clusters with 20 clusters populated
In stage 7: transferred 314 clusters with 20 clusters populated
In stage 8: transferred 308 clusters with 20 clusters populated
In stage 9: transferred 320 clusters with 20 clusters populated
In stage 10: transferred 304 clusters with 20 clusters populated
In stage 11: transferred 305 clusters with 20 clusters populated
In stage 12: transferred 279 clusters with 20 clusters populated
In stage 13: transferred 290 clusters with 20 clusters populated
In stage 14: transferred 299 clusters with 20 clusters populated
In stage 15: transferred 291 clu

In [17]:
def top_words(cluster_word_distribution, top_cluster, values):
    """Print the most frequent words of each selected cluster.

    Parameters
    ----------
    cluster_word_distribution : sequence of dict
        One ``{word: count}`` mapping per cluster, indexable by cluster id.
    top_cluster : iterable of int
        Cluster ids to report, in the order they should be printed.
    values : int
        Number of words to show per cluster.

    Returns
    -------
    None
        The result is printed, one line per cluster.
    """
    for cluster in top_cluster:
        # Read from the argument rather than a notebook-level model, so the
        # function reports whichever distribution the caller passes in.
        sort_dicts = sorted(cluster_word_distribution[cluster].items(), key=lambda k: k[1], reverse=True)[:values]
        print("\nCluster %s : %s"%(cluster,sort_dicts))


# Number of documents assigned to each cluster, as an array so it can be arg-sorted
doc_count = np.array(mgp.cluster_doc_count)
print('Number of documents per topic :', doc_count)
print('*'*20)

# Cluster ids ordered from most to least populated (at most 20 are kept)
top_index = np.argsort(doc_count)[::-1][:20]
print('Most important clusters (by number of docs inside):', top_index)
print('*'*20)

# Show the 20 most frequent words of each of those clusters
top_words(mgp.cluster_word_distribution, top_index, 20)

Number of documents per topic : [ 299  514  510  726  802  590 1504  568 2987 1105  548  860  738 1186
 1352 1188  736  695  616  491]
********************
Most important clusters (by number of docs inside): [ 8  6 14 15 13  9 11  4 12 16  3 17 18  5  7 10  1  2 19  0]
********************

Cluster 8 : [('faculty', 1338), ('engineering', 1276), ('interest', 1204), ('laboratory', 982), ('problem', 883), ('theatre', 852), ('computer', 801), ('research', 735), ('lecture', 685), ('seminar', 685), ('report', 657), ('credit', 637), ('eec', 577), ('hour', 567), ('science', 565), ('supervision', 556), ('area', 548), ('craft', 538), ('work', 534), ('production', 527)]

Cluster 6 : [('practice', 2548), ('situation', 2484), ('function', 1466), ('tool', 1370), ('computer', 1354), ('structure', 1308), ('literature', 1242), ('world', 1242), ('time', 1242), ('understanding', 1242), ('history', 1242), ('issue', 1242), ('lecture', 1242), ('politic', 1242), ('content', 1242), ('textbook', 1242), ('featu

In [18]:
# Map every distinct token of the corpus to an integer id
dictionary = corpora.Dictionary(texts)

# Drop tokens that occur in fewer than 15 documents or in more than half of them
# NOTE(review): the GSDMM model was fitted on the unfiltered vocabulary, so some
# topic words may be absent from this dictionary — confirm how CoherenceModel
# treats out-of-dictionary words.
dictionary.filter_extremes(no_below=15, no_above=0.5)

# Vocabulary size that remains after filtering
vocab_length = len(dictionary)

# Bag-of-words corpus: one list of (token_id, count) pairs per document
bow_corpus = [dictionary.doc2bow(doc) for doc in texts]

In [21]:
# import library from gensim  
from gensim.models import CoherenceModel

# define function to get words in topics
def get_topics_lists(model, top_clusters, n_words):
    '''
    Collect the most frequent words of each requested cluster.

    model: fitted gsdmm instance; its cluster_word_distribution is indexed by
           cluster id and holds one {word: count} mapping per cluster
    top_clusters: iterable (e.g. numpy array) of cluster indices to export
    n_words: number of top words to keep per cluster

    Returns a list with one list of words per cluster, in the order of
    top_clusters, each sorted by descending count.
    '''
    def ranked_words(cluster):
        # Highest counts first; the sort is stable, so ties keep the order in
        # which the words appear in the mapping.
        by_count = sorted(
            model.cluster_word_distribution[cluster].items(),
            key=lambda pair: pair[1],
            reverse=True,
        )
        return [word for word, _count in by_count[:n_words]]

    return [ranked_words(cluster) for cluster in top_clusters]

# Top-20 word lists per cluster, ordered like top_index, as input for the coherence model
topics = get_topics_lists(mgp, top_index, 20)

# Score the GSDMM topics with the c_v topic-coherence measure
cm_gsdmm = CoherenceModel(
    topics=topics,
    dictionary=dictionary,
    corpus=bow_corpus,
    texts=texts,
    coherence='c_v',
)

# Aggregate coherence over all topics
coherence_gsdmm = cm_gsdmm.get_coherence()

print(coherence_gsdmm)

0.5409572911065412
