In [3]:
import pandas as pd
import neattext.functions as nfx

In [4]:
import numpy as np
import json
import glob

#Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

#spacy
import spacy
import nltk
# nltk.download()
from nltk.corpus import stopwords

#vis
import pyLDAvis
import pyLDAvis.gensim_models

import warnings
warnings.filterwarnings("ignore")

pyLDAvis.enable_notebook()

In [32]:
# Load the three course tables used below; the 'description' column of each
# is what the rest of the notebook reads.
online = pd.read_csv(
    'assets/original/2021-10-19-MichiganOnline-courses.csv'
)
f_21, w_22 = map(
    pd.read_csv,
    ('assets/f_21_merge.csv', 'assets/w_22_merge.csv'),
)

In [37]:
# Preview the description text loaded into `online` (pandas truncates the display).
online['description']

0      Through this course, you will start by address...
1      The third and the last course of the Addressin...
2      Are you concerned about climate change? Would ...
3      This course, Additive Manufacturing, is the th...
4      This course builds upon the fundamental concep...
                             ...                        
516    This module examines the impacts of incarcerat...
517    This third course in the “Good with Words: Wri...
518    This fourth and final course in the “Good with...
519    This second course in the Good with Words: Wri...
520    This course will teach you how to use your wri...
Name: description, Length: 521, dtype: object

In [36]:
# Preview the description text in `w_22`; the displayed tail rows are NaN,
# so missing descriptions have to be dropped before any text processing.
w_22['description']

0        This course seeks to introduce students to eve...
1        This seminar introduces first-year students to...
2        This class explores the fascinating shift in B...
3        This seminar introduces first-year students to...
4        This seminar introduces first-year students to...
                               ...                        
66693                                                  NaN
66694                                                  NaN
66695                                                  NaN
66696                                                  NaN
66697                                                  NaN
Name: description, Length: 66698, dtype: object

In [38]:
def _lemmatize_descriptions(texts, allowed_postags=("NOUN", "ADJ", "VERB", "ADV")):
    """Return each text as a space-joined string of the lemmas whose POS tag is allowed."""
    nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])
    allowed = set(allowed_postags)
    # nlp.pipe streams the texts through spaCy in batches instead of one call per text.
    return [
        " ".join(token.lemma_ for token in doc if token.pos_ in allowed)
        for doc in nlp.pipe(texts)
    ]


def _tokenize_descriptions(texts):
    """Split each text into a list of lowercase, de-accented tokens."""
    return [gensim.utils.simple_preprocess(text, deacc=True) for text in texts]


def _merge_phrases(tokenized_docs):
    """Learn bigram/trigram phrase models on the documents and apply them."""
    bigram_phrases = gensim.models.Phrases(tokenized_docs, min_count=5, threshold=100)
    trigram_phrases = gensim.models.Phrases(bigram_phrases[tokenized_docs], threshold=100)

    bigram = gensim.models.phrases.Phraser(bigram_phrases)
    trigram = gensim.models.phrases.Phraser(trigram_phrases)

    docs_with_bigrams = [bigram[doc] for doc in tokenized_docs]
    # The bigram model is deliberately applied a second time before the trigram
    # model so that the resulting tokens match the original pipeline exactly.
    return [trigram[bigram[doc]] for doc in docs_with_bigrams]


def _drop_low_tfidf_terms(corpus, id2word, low_value=0.03):
    """Return a copy of ``corpus`` without terms that TF-IDF scores below ``low_value``.

    Terms that appear in a bag-of-words but are absent from its TF-IDF
    transform are dropped as well.
    """
    tfidf = gensim.models.TfidfModel(corpus, id2word=id2word)

    filtered_corpus = []
    for bow in corpus:
        scored = tfidf[bow]  # computed once per document
        scored_ids = {term_id for term_id, _ in scored}
        low_ids = {term_id for term_id, weight in scored if weight < low_value}
        filtered_corpus.append(
            [
                (term_id, count)
                for term_id, count in bow
                if term_id in scored_ids and term_id not in low_ids
            ]
        )
    return filtered_corpus


def lda_model_for_each_df(df, num_topics=10):
    """Fit an LDA topic model on ``df['description']`` and prepare a pyLDAvis view.

    Pipeline: drop missing descriptions -> lemmatize (keeping nouns, adjectives,
    verbs and adverbs) -> tokenize -> merge bigrams/trigrams -> build a
    dictionary and bag-of-words corpus -> remove low TF-IDF terms -> train LDA.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a 'description' column; null entries are ignored.
    num_topics : int, default 10
        Number of topics requested from the LDA model.

    Returns
    -------
    The object returned by ``pyLDAvis.gensim_models.prepare`` for the fitted
    model; it renders inline when it is the last expression of a cell.

    Raises
    ------
    ValueError
        If the 'description' column has no non-null entries.
    """
    data = df['description'].dropna()
    if data.empty:
        raise ValueError("df['description'] contains no non-null text to model")

    lemmatized_texts = _lemmatize_descriptions(data)
    data_words = _tokenize_descriptions(lemmatized_texts)
    texts = _merge_phrases(data_words)

    id2word = corpora.Dictionary(texts)
    corpus = [id2word.doc2bow(text) for text in texts]
    corpus = _drop_low_tfidf_terms(corpus, id2word)

    # Train on the whole corpus (the previous code silently skipped the last
    # document) and honour the caller's num_topics instead of a fixed 10.
    lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                               id2word=id2word,
                                               num_topics=num_topics,
                                               random_state=100,  # fixed seed for reproducible topics
                                               update_every=1,
                                               chunksize=100,
                                               passes=10,
                                               alpha="auto")

    return pyLDAvis.gensim_models.prepare(lda_model, corpus, id2word, mds="mmds", R=10)

In [None]:
# Run the topic-model pipeline on the descriptions in `f_21`, requesting 10 topics;
# the returned visualisation is displayed as this cell's output.
lda_model_for_each_df(f_21, num_topics=10)