In [None]:
import re
import numpy as np
import pandas as pd
import tqdm
from collections import defaultdict
from pprint import pprint
#Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
import spacy
import pyLDAvis
import pyLDAvis.gensim_models
import pyLDAvis.sklearn
from sklearn.datasets import fetch_20newsgroups
from sklearn.datasets import make_multilabel_classification
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
%matplotlib inline
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)

In [None]:
from nltk.corpus import stopwords
# NLTK's English stop-word list, plus a few extra tokens that should also be
# dropped from the documents.
extra_stop_words = ['from', 'subject', 're', 'edu', 'use']
stop_words = stopwords.words('english') + extra_stop_words

# Load 20newsgroups Dataset via Scikit-learn

In [None]:
# Load the 20 Newsgroups posts with headers, footers and quoted replies removed.
df = fetch_20newsgroups(remove=('headers', 'footers', 'quotes'))
docs_raw = df.data

# Target split is 80:10:10 (training : validation : testing).
# Step 1: hold out 20% of the documents; the remaining 80% is the training set.
training_data, testing_data = train_test_split(docs_raw, test_size=0.2, random_state=25)
# Step 2: split the held-out 20% in half -> 10% validation and 10% testing.
validation_data, testing_data = train_test_split(testing_data, test_size=0.5, random_state=25)

print(len(docs_raw))
print(len(training_data))
print(len(validation_data))
print(len(testing_data))

# The patterns are raw strings: '\S' and '\s' are not valid Python string
# escapes, so non-raw literals trigger invalid-escape warnings.
data_pre_1 = [re.sub(r'\S*@\S*\s?', '', sent) for sent in training_data]  # drop tokens containing '@'
data_pre_2 = [re.sub(r'\s+', ' ', sent) for sent in data_pre_1]  # collapse whitespace runs to one space
data_pre_3 = [re.sub("'", "", sent) for sent in data_pre_2]  # remove single quotes
pprint(data_pre_3[:2])

# Tokenize and Further Text Clean-up (Convert to Lowercase; Remove Punctuation)

In [None]:
def sent_to_words(sentences):
    """Lazily tokenize documents with gensim's ``simple_preprocess``.

    Every item of ``sentences`` is cast to ``str`` and tokenized with
    ``deacc=True``; one tokenization result is yielded per input item, in order.
    """
    for raw_doc in sentences:
        tokens = gensim.utils.simple_preprocess(str(raw_doc), deacc=True)
        yield tokens
# Tokenize the cleaned training documents and preview the first two token lists.
words_data = list(sent_to_words(data_pre_3))
print(words_data[:2])

# Remove Stopwords

In [None]:
def remove_stopwords(texts):
    """Return ``texts`` with stop words removed from every document.

    Each document is run through ``simple_preprocess(str(doc))`` and the
    resulting tokens are filtered against the module-level ``stop_words``.

    The stop words are copied into a frozenset once per call, so each
    membership test is a hash lookup rather than a linear scan of the list.
    The returned token lists are unchanged by this.
    """
    blocked = frozenset(stop_words)
    return [
        [word for word in simple_preprocess(str(doc)) if word not in blocked]
        for doc in texts
    ]
# Remove stop words from the tokenized training documents.
words_data_nostops = remove_stopwords(words_data)

# Create Dictionary and Corpus

In [None]:
# Dictionary built from the stop-word-free training tokens.
id2word = corpora.Dictionary(words_data_nostops)

# Tokenized training documents used to build the corpus.
texts = words_data_nostops

# Term-document frequency: one doc2bow result per training document.
corpus = list(map(id2word.doc2bow, texts))
print(corpus[:2])

# Train and Build the LDA Topic Model (Gensim)

In [None]:
# Baseline model: 20 topics, alpha='auto' and eta='auto'.
lda_model = gensim.models.LdaModel(
    corpus=corpus,
    id2word=id2word,
    num_topics=20,
    alpha='auto',
    eta='auto',
    passes=10,
    chunksize=100,
    update_every=2,
    random_state=100,
    per_word_topics=True,
)

In [None]:
# Variant 1: 20 topics, alpha='auto' with eta='symmetric'.
lda_model_1 = gensim.models.LdaModel(
    corpus=corpus,
    id2word=id2word,
    num_topics=20,
    alpha='auto',
    eta='symmetric',
    passes=10,
    chunksize=100,
    update_every=2,
    random_state=100,
    per_word_topics=True,
)

In [None]:
# Variant 2: 20 topics, alpha='symmetric' with eta='auto'.
lda_model_2 = gensim.models.LdaModel(
    corpus=corpus,
    id2word=id2word,
    num_topics=20,
    alpha='symmetric',
    eta='auto',
    passes=10,
    chunksize=100,
    update_every=2,
    random_state=100,
    per_word_topics=True,
)

In [None]:
# Variant 3: 20 topics, alpha='asymmetric' with eta='auto'.
lda_model_3 = gensim.models.LdaModel(
    corpus=corpus,
    id2word=id2word,
    num_topics=20,
    alpha='asymmetric',
    eta='auto',
    passes=10,
    chunksize=100,
    update_every=2,
    random_state=100,
    per_word_topics=True,
)

In [None]:
# Variant 4: 20 topics, alpha='symmetric' with eta='symmetric'.
lda_model_4 = gensim.models.LdaModel(
    corpus=corpus,
    id2word=id2word,
    num_topics=20,
    alpha='symmetric',
    eta='symmetric',
    passes=10,
    chunksize=100,
    update_every=2,
    random_state=100,
    per_word_topics=True,
)

In [None]:
# Variant 5: 20 topics, fixed scalar priors alpha=0.01 and eta=0.1.
lda_model_5 = gensim.models.LdaModel(
    corpus=corpus,
    id2word=id2word,
    num_topics=20,
    alpha=0.01,
    eta=0.1,
    passes=10,
    chunksize=100,
    update_every=2,
    random_state=100,
    per_word_topics=True,
)

In [None]:
# Variant 6: 20 topics, fixed scalar priors alpha=0.09 and eta=0.1.
lda_model_6 = gensim.models.LdaModel(
    corpus=corpus,
    id2word=id2word,
    num_topics=20,
    alpha=0.09,
    eta=0.1,
    passes=10,
    chunksize=100,
    update_every=2,
    random_state=100,
    per_word_topics=True,
)

In [None]:
# Variant 7: 20 topics, fixed scalar priors alpha=0.01 and eta=0.9.
lda_model_7 = gensim.models.LdaModel(
    corpus=corpus,
    id2word=id2word,
    num_topics=20,
    alpha=0.01,
    eta=0.9,
    passes=10,
    chunksize=100,
    update_every=2,
    random_state=100,
    per_word_topics=True,
)

# Output

In [None]:
# Print the topics of the baseline model (alpha='auto', eta='auto'), then apply
# the model to the training corpus through its indexing operator.
pprint(lda_model.print_topics())
doc_lda = lda_model[corpus]

In [None]:
# Print the topics of variant 1 (alpha='auto', eta='symmetric'), then apply the
# model to the training corpus.
pprint(lda_model_1.print_topics())
doc_lda_1 = lda_model_1[corpus]

In [None]:
# Print the topics of variant 2 (alpha='symmetric', eta='auto'), then apply the
# model to the training corpus.
pprint(lda_model_2.print_topics())
doc_lda_2 = lda_model_2[corpus]

In [None]:
# Print the topics of variant 3 (alpha='asymmetric', eta='auto'), then apply the
# model to the training corpus.
pprint(lda_model_3.print_topics())
doc_lda_3 = lda_model_3[corpus]

In [None]:
# Print the topics of variant 4 (alpha='symmetric', eta='symmetric'), then apply
# the model to the training corpus.
pprint(lda_model_4.print_topics())
doc_lda_4 = lda_model_4[corpus]

In [None]:
# Print the topics of variant 5 (alpha=0.01, eta=0.1), then apply the model to
# the training corpus.
pprint(lda_model_5.print_topics())
doc_lda_5 = lda_model_5[corpus]

In [None]:
# Print the topics of variant 6 (alpha=0.09, eta=0.1), then apply the model to
# the training corpus.
pprint(lda_model_6.print_topics())
doc_lda_6 = lda_model_6[corpus]

In [None]:
# Print the topics of variant 7 (alpha=0.01, eta=0.9), then apply the model to
# the training corpus.
pprint(lda_model_7.print_topics())
doc_lda_7 = lda_model_7[corpus]

# Preprocessing the Testing Dataset

In [None]:
# Clean the held-out documents with the same three substitutions applied to the
# training data. The patterns are raw strings because '\S' and '\s' are not
# valid Python string escapes.
test_data_pre_1 = [re.sub(r'\S*@\S*\s?', '', sent) for sent in testing_data]
test_data_pre_2 = [re.sub(r'\s+', ' ', sent) for sent in test_data_pre_1]
test_data_pre_3 = [re.sub("'", "", sent) for sent in test_data_pre_2]
words_test_data = list(sent_to_words(test_data_pre_3))
words_test_data_nostops = remove_stopwords(words_test_data)

# The test corpus must be encoded with the TRAINING dictionary. The LDA models
# were fitted on id2word's token ids; a dictionary built from the test set
# assigns different ids to the same words, which makes any held-out score
# computed by those models meaningless. doc2bow skips tokens that are missing
# from the dictionary by default.
id2word_test = id2word  # name kept so existing references keep working
test_texts = words_test_data_nostops
test_corpus = [id2word.doc2bow(text) for text in test_texts]

# Validation and Testing

In [None]:
# Report log_perplexity on the test corpus for every candidate model, in the
# order the models were trained.
# Optional: a perplexity figure can be derived from the bound, e.g.
# 2**abs(lda_model.log_perplexity(test_corpus)).
heldout_runs = [
    ('\nLog Perplexity: ', lda_model),
    ('\nLog Perplexity 1: ', lda_model_1),
    ('\nLog Perplexity 2: ', lda_model_2),
    ('\nLog Perplexity 3: ', lda_model_3),
    ('\nLog Perplexity 4: ', lda_model_4),
    ('\nLog Perplexity 5: ', lda_model_5),
    ('\nLog Perplexity 6: ', lda_model_6),
    ('\nLog Perplexity 7: ', lda_model_7),
]
for heldout_label, heldout_model in heldout_runs:
    print(heldout_label, heldout_model.log_perplexity(test_corpus))

# Calculate Perplexity and Coherence Score for Trained Models

In [None]:
def _report_training_fit(model, perplexity_label, coherence_label):
    """Print the training-corpus log perplexity and c_v coherence of ``model``.

    Returns the ``CoherenceModel`` and its score so the caller can keep both
    under their own names.
    """
    # The variational bound score calculated for each word @Gensim
    print(perplexity_label, model.log_perplexity(corpus))
    # Compute Coherence Score
    scorer = CoherenceModel(model=model, texts=words_data_nostops, dictionary=id2word, coherence='c_v')
    score = scorer.get_coherence()
    print(coherence_label, score)
    return scorer, score


coherence_model_lda, coherence_lda = _report_training_fit(
    lda_model, '\nLog Perplexity: ', '\nCoherence Score: ')

coherence_model_lda_1, coherence_lda_1 = _report_training_fit(
    lda_model_1, '\nLog Perplexity 1: ', '\nCoherence Score 1: ')

coherence_model_lda_2, coherence_lda_2 = _report_training_fit(
    lda_model_2, '\nLog Perplexity 2: ', '\nCoherence Score 2: ')

coherence_model_lda_3, coherence_lda_3 = _report_training_fit(
    lda_model_3, '\nLog Perplexity 3: ', '\nCoherence Score 3: ')

coherence_model_lda_4, coherence_lda_4 = _report_training_fit(
    lda_model_4, '\nLog Perplexity 4: ', '\nCoherence Score 4: ')

coherence_model_lda_5, coherence_lda_5 = _report_training_fit(
    lda_model_5, '\nLog Perplexity 5: ', '\nCoherence Score 5: ')

coherence_model_lda_6, coherence_lda_6 = _report_training_fit(
    lda_model_6, '\nLog Perplexity 6: ', '\nCoherence Score 6: ')

coherence_model_lda_7, coherence_lda_7 = _report_training_fit(
    lda_model_7, '\nLog Perplexity 7: ', '\nCoherence Score 7: ')


# Find the Optimal Number of Topics

# Determine the Coherence Score for Varying Numbers of Topics

In [None]:
# This could be time consuming
def compute_coherence_values(dictionary, corpus, texts, limit, start=5, step=5):
    """Train one LDA model per topic count and score each of them.

    Topic counts are taken from ``range(start, limit, step)``. Every model is
    trained with the same fixed settings (alpha=0.01, eta=0.9, 10 passes).

    The function uses only its arguments: ``dictionary`` is passed to the LDA
    model as ``id2word`` and to the coherence model, and ``texts`` is what the
    coherence model is scored on. No module-level variables are read.

    Parameters
    ----------
    dictionary : token-id mapping used for training and coherence scoring
        (presumably a gensim ``Dictionary``; confirm against the caller).
    corpus : bag-of-words corpus used for training and for ``log_perplexity``.
    texts : tokenized documents used by the ``c_v`` coherence measure.
    limit : exclusive upper bound on the number of topics.
    start : first number of topics to try.
    step : increment between successive topic counts.

    Returns
    -------
    tuple of three lists, aligned by index:
        ``model_list`` (the trained models), ``coherence_values`` (the c_v
        coherence of each model) and ``log_perplexity`` (the value of
        ``model.log_perplexity(corpus)`` for each model).
    """
    coherence_values = []
    model_list = []
    log_perplexity = []
    for num_topics in range(start, limit, step):
        model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                                id2word=dictionary,
                                                num_topics=num_topics,
                                                random_state=100,
                                                update_every=2,
                                                chunksize=100,
                                                passes=10,
                                                alpha=0.01,
                                                eta=0.9,
                                                per_word_topics=True)
        model_list.append(model)
        coherencemodel = CoherenceModel(model=model, texts=texts, dictionary=dictionary, coherence='c_v')
        coherence_values.append(coherencemodel.get_coherence())
        log_perplexity.append(model.log_perplexity(corpus))

    return model_list, coherence_values, log_perplexity

# Topic-count grid, defined once so that the search and the plot's x-axis
# always use the same values.
limit = 45
start = 5
step = 5

model_list, coherence_values, log_perplexity = compute_coherence_values(
    dictionary=id2word, corpus=corpus, texts=words_data_nostops,
    limit=limit, start=start, step=step)

x = range(start, limit, step)
plt.plot(x, coherence_values)
plt.xlabel("Number of Topics")
plt.ylabel("Coherence Score")
# The label is passed in a list: ("coherence_values") is just a parenthesized
# string, which would be iterated character by character and label the line "c".
plt.legend(["coherence_values"], loc='best')
plt.show()

for m, cv, p in zip(x, coherence_values, log_perplexity):
    print("Num Topics =", m, " has Coherence Value of", round(cv, 4), "and Likelihood Bound of", round(p, 4))

# Pick the Final Model

In [None]:
# Final model: 20 topics with the fixed scalar priors alpha=0.01 and eta=0.9.
lda_model_final = gensim.models.LdaModel(
    corpus=corpus,
    id2word=id2word,
    num_topics=20,
    alpha=0.01,
    eta=0.9,
    passes=10,
    chunksize=100,
    update_every=2,
    random_state=100,
    per_word_topics=True,
)
pprint(lda_model_final.print_topics())
doc_lda_final = lda_model_final[corpus]

# Log perplexity on the training corpus and c_v coherence of the final model.
print('\nLog Perplexity: ', lda_model_final.log_perplexity(corpus))
coherence_model_lda_final = CoherenceModel(
    model=lda_model_final,
    texts=words_data_nostops,
    dictionary=id2word,
    coherence='c_v',
)
coherence_lda_final = coherence_model_lda_final.get_coherence()
print('\nCoherence Score Final: ', coherence_lda_final)


# Visualize the LDA Model Using LDAvis

In [None]:
# Enable pyLDAvis output in the notebook, prepare the visualization from the
# final model, the training corpus and the dictionary, and show it as the
# cell's result.
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim_models.prepare(lda_model_final, corpus, id2word)
vis

# Testing

In [None]:
# Report the final model's log perplexity on the test corpus.
print('\nLog Perplexity Final on Test: ', lda_model_final.log_perplexity(test_corpus)) 