In [1]:
import pandas as pd
import neattext.functions as nfx

In [2]:
import numpy as np
import json
import glob

from collections import Counter, defaultdict

#Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

#spacy
import spacy
import nltk
# nltk.download()
from nltk.corpus import stopwords

#vis
import pyLDAvis
import pyLDAvis.gensim_models

import warnings
# Silence every warning for the rest of the session.
# NOTE(review): a blanket "ignore" filter hides all warning categories,
# including deprecation notices from the libraries imported above;
# consider narrowing it to specific categories or modules.
warnings.filterwarnings("ignore")

# Notebook-display setup for pyLDAvis (presumably enables inline rendering —
# confirm against the pyLDAvis docs). No pyLDAvis plot is produced in the
# cells visible in this part of the notebook.
pyLDAvis.enable_notebook()

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  EPS = np.finfo(np.float).eps


In [3]:
# One-time setup - install gsdmm from GitHub: %pip install git+https://github.com/rwalk/gsdmm.git

In [4]:
from gsdmm import MovieGroupProcess

In [5]:
# Load the course tables from CSV files under the relative `assets/` directory.
# `online` is read from the Michigan Online courses export dated 2021-10-19.
online = pd.read_csv('assets/original/2021-10-19-MichiganOnline-courses.csv')
# `f_21` / `w_22` are presumably the merged Fall 2021 and Winter 2022 course
# tables — TODO confirm. The topic-model function below reads the
# 'description' column of whichever frame it is given.
f_21 = pd.read_csv('assets/f_21_merge.csv')
w_22 = pd.read_csv('assets/w_22_merge.csv')

In [6]:
def _lemmatize(texts, allowed_postags=("NOUN",)):
    """Return, for each text, the lemmas of tokens whose POS tag is allowed.

    Only nouns are kept by default (the wider set
    ["NOUN", "ADJ", "VERB", "ADV"] was tried earlier in this notebook).
    """
    # Parser and NER are disabled: only token.pos_ and token.lemma_ are read.
    nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])
    # nlp.pipe streams the texts through the pipeline in batches instead of
    # making one pipeline call per document.
    return [
        [token.lemma_ for token in doc if token.pos_ in allowed_postags]
        for doc in nlp.pipe(texts)
    ]


def _remove_stop_words(token_lists, n_common=10):
    """Drop English stop words and the `n_common` most frequent tokens.

    Returns one space-joined string per input token list.
    """
    # Count on lower-cased tokens: the filter below compares `tok.lower()`,
    # so counting case-sensitively would let a frequent capitalised lemma
    # enter the stop set in a form that can never match.
    counts = Counter(tok.lower() for tokens in token_lists for tok in tokens)
    stop = set(stopwords.words('english'))
    stop.update(word for word, _ in counts.most_common(n_common))
    return [
        " ".join(tok for tok in tokens if tok.lower() not in stop)
        for tokens in token_lists
    ]


def _tokenize(texts):
    """Tokenize each string with gensim's simple_preprocess (accents removed)."""
    return [gensim.utils.simple_preprocess(text, deacc=True) for text in texts]


def _merge_phrases(token_lists):
    """Learn bigram/trigram phrase models on the corpus and apply them to it."""
    bigram_phrases = gensim.models.Phrases(token_lists, min_count=5, threshold=100)
    trigram_phrases = gensim.models.Phrases(bigram_phrases[token_lists], threshold=100)

    bigram = gensim.models.phrases.Phraser(bigram_phrases)
    trigram = gensim.models.phrases.Phraser(trigram_phrases)

    # Each document goes through the bigram model exactly once before the
    # trigram model (previously the bigram pass was applied a second time to
    # already-merged documents).
    return [trigram[bigram[doc]] for doc in token_lists]


def _print_top_words(cluster_word_distribution, top_cluster, values):
    """Print the `values` highest-count (word, count) pairs of each cluster."""
    for cluster in top_cluster:
        ranked = sorted(
            cluster_word_distribution[cluster].items(),
            key=lambda item: item[1],
            reverse=True,
        )[:values]
        print("\nCluster %s : %s"%(cluster, ranked))


def gsdmm_model_for_each_df(df, num_topics, alpha=0.01, beta=0.01, n_iters=30, seed=None):
    """Fit a GSDMM (MovieGroupProcess) model on `df['description']` and print a summary.

    Pipeline: drop missing descriptions -> keep noun lemmas -> remove English
    stop words and the 10 most frequent lemmas -> tokenize -> merge
    bigram/trigram phrases -> fit MovieGroupProcess -> print the number of
    documents per cluster and the top words of the largest clusters.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a 'description' column; missing values are dropped.
    num_topics : int
        Upper bound on the number of clusters (passed as `K`).
    alpha, beta : float, optional
        MovieGroupProcess hyper-parameters (defaults match the previous
        hard-coded values).
    n_iters : int, optional
        Number of sampling iterations (default 30, as before).
    seed : int or None, optional
        When given, NumPy's global random state is seeded before fitting so
        repeated runs can be compared. This mutates global RNG state.
        NOTE(review): assumes MovieGroupProcess draws from NumPy's global
        RNG — confirm against the installed gsdmm version.

    Returns
    -------
    None
        Results are printed, not returned.
    """
    # astype(str) guards against non-string cells that survive dropna().
    descriptions = df['description'].dropna().astype(str)

    lemmatized = _lemmatize(descriptions)
    cleaned = _remove_stop_words(lemmatized)
    texts = _merge_phrases(_tokenize(cleaned))

    # The TF-IDF bag-of-words filtering that used to live here was removed:
    # its output was never used, the model is fitted on `texts` directly.

    if seed is not None:
        np.random.seed(seed)

    #mgp topic modeling
    mgp = MovieGroupProcess(K=num_topics, alpha=alpha, beta=beta, n_iters=n_iters)

    # fit() takes the vocabulary size as its second argument.
    n_terms = len({term for doc in texts for term in doc})
    mgp.fit(texts, n_terms)

    doc_count = np.array(mgp.cluster_doc_count)
    print('Number of documents per topic :', doc_count)
    print('*'*20)
    # Up to 10 clusters, ordered by the number of documents assigned to them
    top_index = doc_count.argsort()[-10:][::-1]
    print('Most important clusters (by number of docs inside):', top_index)
    print('*'*20)
    # Show the 20 most frequent words of each of those clusters
    _print_top_words(mgp.cluster_word_distribution, top_index, 20)

In [7]:
# Fit the model on the f_21 table with at most 20 clusters; the function
# prints the fitting progress and the cluster summary shown below.
gsdmm_model_for_each_df(f_21, num_topics=20)

In stage 0: transferred 16164 clusters with 20 clusters populated
In stage 1: transferred 1904 clusters with 20 clusters populated
In stage 2: transferred 710 clusters with 20 clusters populated
In stage 3: transferred 493 clusters with 20 clusters populated
In stage 4: transferred 374 clusters with 20 clusters populated
In stage 5: transferred 357 clusters with 20 clusters populated
In stage 6: transferred 381 clusters with 20 clusters populated
In stage 7: transferred 334 clusters with 20 clusters populated
In stage 8: transferred 330 clusters with 20 clusters populated
In stage 9: transferred 335 clusters with 20 clusters populated
In stage 10: transferred 290 clusters with 20 clusters populated
In stage 11: transferred 324 clusters with 20 clusters populated
In stage 12: transferred 327 clusters with 20 clusters populated
In stage 13: transferred 308 clusters with 20 clusters populated
In stage 14: transferred 314 clusters with 20 clusters populated
In stage 15: transferred 308 clu