In [1]:
import numpy as np
import pandas as pd
import re, nltk, gensim 

In [2]:
# Sklearn
from sklearn.decomposition import LatentDirichletAllocation, TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from pprint import pprint

In [3]:
# Plotting tools
import pyLDAvis
import pyLDAvis.sklearn

<h2>Data loading</h2>

In [4]:
# open title data
data_path = "dataset/"
df_corona = pd.read_json(data_path + 'corona_v3.json')
df_corona

Unnamed: 0,Pmid,Year,Title,Abstract,Keywords,norm_tiabs
0,32857764,2020,"[Translesion synthesis by AMV, HIV, and MMLVre...",[Inosine is ubiquitous and essential in many b...,[],"Translesion synthesis by AMV, HIV, and MMLVrev..."
1,32849762,2020,[LROD: An Overlap Detection Algorithm for Long...,[Third-generation sequencing technologies can ...,"[[alignment], [k-mer distribution], [long read...",LROD: An Overlap Detection Algorithm for Long ...
2,32849463,2020,[Analytical Performance Validation of Next-Gen...,[Next-generation sequencing (NGS) enables clin...,"[[antimicrobial resistance], [human pathogens]...",Analytical Performance Validation of Next-Gene...
3,32843552,2020,[A Genome-Based Model to Predict the Virulence...,[Variation in the genome of Pseudomonas aerugi...,"[[], [genome analysis], [machine learning], [m...",A Genome-Based Model to Predict the Virulence ...
4,32838395,2020,[The world should establish an early warning s...,[With the emergence of several new epidemics o...,[],The world should establish an early warning sy...
...,...,...,...,...,...,...
67963,28309716,2020,[Shallow water meiobenthos of the bermuda plat...,[The distribution and abundance of subtidal me...,[],Shallow water meiobenthos of the bermuda platf...
67964,5504579,1971,[Fats in fresh water crustaceans. I. Fatty aci...,[],[],Fats in fresh water crustaceans. I. Fatty acid...
67965,28304606,2019,[[Patterns of differentiation of medusae buds ...,[1. The normal development of medusa ofPodocor...,[],[Patterns of differentiation of medusae buds a...
67966,14325028,1996,[THE ADULT AND LARVAL MORPHOLOGY AND LIFE HIST...,[],"[[ANATOMY], [BRYOZOA], [EMBRYOLOGY], [EXPERIME...",THE ADULT AND LARVAL MORPHOLOGY AND LIFE HISTO...


In [5]:
# norm_tiabs = normalized title + abstract
df_corona["norm_tiabs"][0]

'Translesion synthesis by AMV, HIV, and MMLVreverse transcriptases using RNA templates containing inosine, guanosine, and their 8-oxo-7,8-dihydropurine derivatives. Inosine is ubiquitous and essential in many biological processes, including RNA-editing. In addition, oxidative stress on RNA has been a topic of increasing interest due, in part, to its potential role in the development/progression of disease. In this work we probed the ability of three reverse transcriptases (RTs) to catalyze the synthesis of cDNA in the presence of RNA templates containing inosine (I), 8-oxo-7,8-dihydroinosine (8oxo-I), guanosine (G), or 8-oxo-7,8-dihydroguanosine (8-oxoG), and explored the impact that these purine derivatives have as a function of position. To this end, we used 29-mers of RNA (as template) containing the modifications at position-18 and reverse transcribed DNA using 17-mers, 18-mers, or 19-mers (as primers). Generally reactivity of the viral RTs, AMV / HIV / MMLV, towards cDNA synthesis

<h2>Cleaning, Stemming</h2>

In [6]:
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     /home/users0/changwn/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [37]:
# These two function include lemmatizing, stemming and short word (3 characters) removing
# import SnowballStemmer in English for stemming
stemmer = SnowballStemmer('english')
def lemmatize_stemming(text):
    return stemmer.stem(WordNetLemmatizer().lemmatize(text, pos='v'))
def preprocess(text):
    result = []
    for token in gensim.utils.simple_preprocess(text):
        if token not in gensim.parsing.preprocessing.STOPWORDS and len(token) > 3:
            result.append(lemmatize_stemming(token))
    return result
lemmatize_stemming("isolation")

'isol'

In [11]:
# Test on a sample document
doc_sample = df_corona["norm_tiabs"][4]

print("original document: ")
words = []
for word in doc_sample.split(" "):
    words.append(word)
print(words)

# tokenized and lemmatized document
print("\n\ntokenized and lemmatized document: ")
print(preprocess(doc_sample))


original document: 


tokenized and lemmatized document: 
['world', 'establish', 'earli', 'warn', 'viral', 'infecti', 'diseas', 'space', 'weather', 'monitor', 'emerg', 'epidem', 'viral', 'infect', 'sar', 'mer', 'ebola', 'zika', 'influenza', 'pandem', 'covid', 'past', 'decad', 'suggest', 'world', 'wide', 'programm', 'stratospher', 'surveil', 'space', 'weather', 'monitor', 'urgent', 'place', 'delay']


<h2>Mapping for creating document-word matrix</h2>

In [12]:
# pandas.Series.map() --> pandas.core.series.Series
# https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.Series.map.html
processed_docs = df_corona["norm_tiabs"].map(preprocess)

In [13]:
print("Type of processed_docs: ", type(processed_docs))
processed_docs[:10]

Type of processed_docs:  <class 'pandas.core.series.Series'>


0    [transles, synthesi, mmlvrevers, transcriptas,...
1    [lrod, overlap, detect, algorithm, long, read,...
2    [analyt, perform, valid, generat, sequenc, bas...
3    [genom, base, model, predict, virul, pseudomon...
4    [world, establish, earli, warn, viral, infecti...
5    [rare, viral, infect, lung, direct, contact, e...
6    [structur, comparison, monoval, cation, exchan...
7    [ardep, rapid, degener, primer, design, pipeli...
8    [care, context, awar, sequenc, read, error, co...
9    [irrevers, multilay, adsorpt, semirigid, mer, ...
Name: norm_tiabs, dtype: object

In [14]:
pd_list = []

for sent in processed_docs:
    #print(' '.join(sent))
    pd_list.append(' '.join(sent))

#pd_list

<h2>Create the Document-Word matrix</h2>

In [15]:
vectorizer = CountVectorizer(analyzer='word',       
                             min_df=10,                        # minimum reqd occurences of a word 
                             stop_words='english',             # remove stop words
                             lowercase=True,                   # convert all words to lowercase
                             token_pattern='[a-zA-Z0-9]{3,}',  # num chars > 3
                             # max_features=50000,             # max number of uniq words
                            )

data_vectorized = vectorizer.fit_transform(pd_list)

<h2>Check the Data Sparsity</h2>

In [16]:
data_dense = data_vectorized.todense()
print("Sparsicity: ", ((data_dense > 0).sum()/data_dense.size)*100, "%")

Sparsicity:  0.4718120846884145 %


<h2>Build LDA Model</h2>

In [17]:
# Build LDA Model in 15 topics
lda_model = LatentDirichletAllocation(n_components=15,           # Number of topics
                                      max_iter=10,               # Max learning iterations
                                      learning_method='online',   
                                      random_state=100,          # Random state
                                      batch_size=128,            # n docs in each learning iter
                                      evaluate_every = -1,       # compute perplexity every n iters, default: Don't
                                      n_jobs = -1,               # Use all available CPUs
                                     )
lda_output = lda_model.fit_transform(data_vectorized)

print(lda_model)  # Model attributes

LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
                          evaluate_every=-1, learning_decay=0.7,
                          learning_method='online', learning_offset=10.0,
                          max_doc_update_iter=100, max_iter=10,
                          mean_change_tol=0.001, n_components=15, n_jobs=-1,
                          perp_tol=0.1, random_state=100, topic_word_prior=None,
                          total_samples=1000000.0, verbose=0)


In [21]:
# save model
import pickle
with open("model/lda_tp15.pk", "wb") as fp:   #Pickling
    pickle.dump(lda_model, fp)

<h2>Diagnose model performance with perplexity and log-likelihood</h2>

In [22]:
# Log Likelyhood: Higher the better
print("Log Likelihood: ", lda_model.score(data_vectorized))

# Perplexity: Lower the better. Perplexity = exp(-1. * log-likelihood per word)
print("Perplexity: ", lda_model.perplexity(data_vectorized))

# See model parameters
pprint(lda_model.get_params())

Log Likelihood:  -35275501.11263594
Perplexity:  1085.599197530923
{'batch_size': 128,
 'doc_topic_prior': None,
 'evaluate_every': -1,
 'learning_decay': 0.7,
 'learning_method': 'online',
 'learning_offset': 10.0,
 'max_doc_update_iter': 100,
 'max_iter': 10,
 'mean_change_tol': 0.001,
 'n_components': 15,
 'n_jobs': -1,
 'perp_tol': 0.1,
 'random_state': 100,
 'topic_word_prior': None,
 'total_samples': 1000000.0,
 'verbose': 0}


<h2>GridSearch the best LDA model</h2>
<ul>
    <li> Grid search only used in subset</li>
    <li> Takes too long to run</li>
</ul>

<h2>See the best topic model and its parameters</h2>

<h2>Topic distribution in each document, assign dominant topic</h2>

In [23]:
# Document - Topics distribution 
data = list(df_corona["norm_tiabs"])
best_lda_model = lda_model

# Create Document - Topic Matrix
lda_output = best_lda_model.transform(data_vectorized)

# column names
topicnames = ["Topic" + str(i) for i in range(best_lda_model.n_components)]

# index names
docnames = ["Doc" + str(i) for i in range(len(data))]

# Make the pandas dataframe
df_document_topic = pd.DataFrame(np.round(lda_output, 2), columns=topicnames, index=docnames)


# Get dominant topic for each document
dominant_topic = np.argmax(df_document_topic.values, axis=1)
df_document_topic['dominant_topic'] = dominant_topic

df_document_topic_title = df_document_topic
df_document_topic_title['title'] = data ##
df_document_topic_title = df_document_topic.head(15)
df_document_topic_title



Unnamed: 0,Topic0,Topic1,Topic2,Topic3,Topic4,Topic5,Topic6,Topic7,Topic8,Topic9,Topic10,Topic11,Topic12,Topic13,Topic14,dominant_topic,title
Doc0,0.0,0.0,0.03,0.19,0.0,0.03,0.05,0.22,0.0,0.0,0.4,0.0,0.03,0.0,0.05,10,"Translesion synthesis by AMV, HIV, and MMLVrev..."
Doc1,0.7,0.0,0.0,0.0,0.0,0.13,0.0,0.12,0.0,0.0,0.0,0.0,0.0,0.0,0.04,0,LROD: An Overlap Detection Algorithm for Long ...
Doc2,0.44,0.0,0.0,0.0,0.0,0.0,0.01,0.24,0.0,0.04,0.0,0.0,0.03,0.0,0.24,0,Analytical Performance Validation of Next-Gene...
Doc3,0.3,0.0,0.05,0.0,0.0,0.0,0.0,0.57,0.0,0.03,0.0,0.0,0.01,0.0,0.04,7,A Genome-Based Model to Predict the Virulence ...
Doc4,0.0,0.29,0.13,0.0,0.08,0.0,0.23,0.0,0.0,0.21,0.0,0.04,0.0,0.0,0.0,1,The world should establish an early warning sy...
Doc5,0.0,0.12,0.0,0.0,0.23,0.02,0.14,0.0,0.0,0.27,0.0,0.0,0.07,0.14,0.0,9,[Rare viral infections of the lungs]. Due to t...
Doc6,0.06,0.0,0.0,0.2,0.03,0.08,0.0,0.02,0.27,0.0,0.06,0.28,0.0,0.0,0.0,11,Structural Characterization and Comparison of ...
Doc7,0.37,0.04,0.0,0.0,0.0,0.02,0.08,0.33,0.0,0.0,0.05,0.11,0.0,0.0,0.0,0,"ARDEP, a Rapid Degenerate Primer Design Pipeli..."
Doc8,0.68,0.0,0.0,0.0,0.0,0.0,0.11,0.16,0.0,0.0,0.0,0.04,0.0,0.0,0.0,0,CARE: Context-Aware Sequencing Read Error Corr...
Doc9,0.05,0.0,0.0,0.02,0.0,0.45,0.01,0.0,0.04,0.0,0.0,0.43,0.0,0.0,0.0,5,Irreversible multilayer adsorption of semirigi...


<h2>Assign topic to each document, export subsets</h2>

In [24]:
df_corona["dominant_topic"] = dominant_topic
df_corona

Unnamed: 0,Pmid,Year,Title,Abstract,Keywords,norm_tiabs,dominant_topic
0,32857764,2020,"[Translesion synthesis by AMV, HIV, and MMLVre...",[Inosine is ubiquitous and essential in many b...,[],"Translesion synthesis by AMV, HIV, and MMLVrev...",10
1,32849762,2020,[LROD: An Overlap Detection Algorithm for Long...,[Third-generation sequencing technologies can ...,"[[alignment], [k-mer distribution], [long read...",LROD: An Overlap Detection Algorithm for Long ...,0
2,32849463,2020,[Analytical Performance Validation of Next-Gen...,[Next-generation sequencing (NGS) enables clin...,"[[antimicrobial resistance], [human pathogens]...",Analytical Performance Validation of Next-Gene...,0
3,32843552,2020,[A Genome-Based Model to Predict the Virulence...,[Variation in the genome of Pseudomonas aerugi...,"[[], [genome analysis], [machine learning], [m...",A Genome-Based Model to Predict the Virulence ...,7
4,32838395,2020,[The world should establish an early warning s...,[With the emergence of several new epidemics o...,[],The world should establish an early warning sy...,1
...,...,...,...,...,...,...,...
67963,28309716,2020,[Shallow water meiobenthos of the bermuda plat...,[The distribution and abundance of subtidal me...,[],Shallow water meiobenthos of the bermuda platf...,11
67964,5504579,1971,[Fats in fresh water crustaceans. I. Fatty aci...,[],[],Fats in fresh water crustaceans. I. Fatty acid...,7
67965,28304606,2019,[[Patterns of differentiation of medusae buds ...,[1. The normal development of medusa ofPodocor...,[],[Patterns of differentiation of medusae buds a...,7
67966,14325028,1996,[THE ADULT AND LARVAL MORPHOLOGY AND LIFE HIST...,[],"[[ANATOMY], [BRYOZOA], [EMBRYOLOGY], [EXPERIME...",THE ADULT AND LARVAL MORPHOLOGY AND LIFE HISTO...,4


In [25]:
# Extract subsets
topic_0 = df_corona.loc[df_corona["dominant_topic"] == 0]
topic_1 = df_corona.loc[df_corona["dominant_topic"] == 1]
topic_2 = df_corona.loc[df_corona["dominant_topic"] == 2]
topic_3 = df_corona.loc[df_corona["dominant_topic"] == 3]
topic_4 = df_corona.loc[df_corona["dominant_topic"] == 4]
topic_5 = df_corona.loc[df_corona["dominant_topic"] == 5]
topic_6 = df_corona.loc[df_corona["dominant_topic"] == 6]
topic_7 = df_corona.loc[df_corona["dominant_topic"] == 7]
topic_8 = df_corona.loc[df_corona["dominant_topic"] == 8]
topic_9 = df_corona.loc[df_corona["dominant_topic"] == 9]
topic_10 = df_corona.loc[df_corona["dominant_topic"] == 10]
topic_11 = df_corona.loc[df_corona["dominant_topic"] == 11]
topic_12 = df_corona.loc[df_corona["dominant_topic"] == 12]
topic_13 = df_corona.loc[df_corona["dominant_topic"] == 13]
topic_14 = df_corona.loc[df_corona["dominant_topic"] == 14]

topic_0

Unnamed: 0,Pmid,Year,Title,Abstract,Keywords,norm_tiabs,dominant_topic
1,32849762,2020,[LROD: An Overlap Detection Algorithm for Long...,[Third-generation sequencing technologies can ...,"[[alignment], [k-mer distribution], [long read...",LROD: An Overlap Detection Algorithm for Long ...,0
2,32849463,2020,[Analytical Performance Validation of Next-Gen...,[Next-generation sequencing (NGS) enables clin...,"[[antimicrobial resistance], [human pathogens]...",Analytical Performance Validation of Next-Gene...,0
7,32824566,2020,"[ARDEP, a Rapid Degenerate Primer Design Pipel...",[The survey of microbial diversity in various ...,"[[analysis platform], [bioinformatic program],...","ARDEP, a Rapid Degenerate Primer Design Pipeli...",0
8,32818262,2020,[CARE: Context-Aware Sequencing Read Error Cor...,"[<AbstractText Label=""MOTIVATION"" NlmCategory=...",[],CARE: Context-Aware Sequencing Read Error Corr...,0
15,32703211,2020,[A performant bridge between fixed-size and va...,"[<AbstractText Label=""BACKGROUND"" NlmCategory=...","[[FMD-index], [High-throughput sequence alignm...",A performant bridge between fixed-size and var...,0
...,...,...,...,...,...,...,...
67845,2745101,1989,[RF currents induced in an anatomically-based ...,[The three-dimensional finite-difference time-...,[],RF currents induced in an anatomically-based m...,0
67850,2712850,1989,[Simple nonperturbing temperature probe for mi...,[We present a simple readout device that fills...,[],Simple nonperturbing temperature probe for mic...,0
67897,3516674,1986,[Structure-activity relationships (SARs) among...,[This review is an introduction to methods for...,[],Structure-activity relationships (SARs) among ...,0
67905,3854056,1986,[Application of a finite-difference technique ...,[A powerful finite-difference numerical techni...,[],Application of a finite-difference technique t...,0


In [26]:
# output json file
topic_0.to_json(r'dataset/subset/corona_topic0.json')
topic_1.to_json(r'dataset/subset/corona_topic1.json')
topic_2.to_json(r'dataset/subset/corona_topic2.json')
topic_3.to_json(r'dataset/subset/corona_topic3.json')
topic_4.to_json(r'dataset/subset/corona_topic4.json')
topic_5.to_json(r'dataset/subset/corona_topic5.json')
topic_6.to_json(r'dataset/subset/corona_topic6.json')
topic_7.to_json(r'dataset/subset/corona_topic7.json')
topic_8.to_json(r'dataset/subset/corona_topic8.json')
topic_9.to_json(r'dataset/subset/corona_topic9.json')
topic_10.to_json(r'dataset/subset/corona_topic10.json')
topic_11.to_json(r'dataset/subset/corona_topic11.json')
topic_12.to_json(r'dataset/subset/corona_topic12.json')
topic_13.to_json(r'dataset/subset/corona_topic13.json')
topic_14.to_json(r'dataset/subset/corona_topic14.json')

<h2>Visualize the LDA model with pyLDAvis</h2>

In [49]:
pyLDAvis.enable_notebook()
# This is apply on sklearn best lda model (select from gesim result), topic = 7
panel = pyLDAvis.sklearn.prepare(best_lda_model, data_vectorized, vectorizer, mds='tsne')
panel

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))


In [50]:
pyLDAvis.save_html(panel, 'lda_corona_topic15_coherence.html')

<h2>Get the top 15 keywords each topic</h2>

In [27]:
# Show top n keywords for each topic
def show_topics(vectorizer=vectorizer, lda_model=lda_model, n_words=20):
    keywords = np.array(vectorizer.get_feature_names())
    topic_keywords = []
    for topic_weights in lda_model.components_:
        top_keyword_locs = (-topic_weights).argsort()[:n_words]
        topic_keywords.append(keywords.take(top_keyword_locs))
    return topic_keywords

topic_keywords = show_topics(vectorizer=vectorizer, lda_model=lda_model, n_words=15)        

# Topic - Keywords Dataframe
df_topic_keywords = pd.DataFrame(topic_keywords)
df_topic_keywords.columns = ['Word '+str(i) for i in range(df_topic_keywords.shape[1])]
df_topic_keywords.index = ['Topic '+str(i) for i in range(df_topic_keywords.shape[0])]
df_topic_keywords

Unnamed: 0,Word 0,Word 1,Word 2,Word 3,Word 4,Word 5,Word 6,Word 7,Word 8,Word 9,Word 10,Word 11,Word 12,Word 13,Word 14
Topic 0,model,base,data,method,predict,analysi,test,studi,estim,approach,perform,develop,screen,evalu,result
Topic 1,case,outbreak,epidem,china,diseas,transmiss,spread,infecti,emerg,control,epidemiolog,report,itali,isol,provinc
Topic 2,covid,patient,diseas,sever,hospit,case,clinic,infect,studi,risk,coronavirus,associ,symptom,report,treatment
Topic 3,protein,activ,structur,sar,bind,compound,inhibitor,domain,target,receptor,interact,peptid,studi,deriv,spike
Topic 4,sar,coronavirus,respiratori,syndrom,infect,acut,sever,virus,mer,human,caus,viral,transmiss,middl,east
Topic 5,mental,studi,psycholog,particip,anxieti,stress,physic,self,relat,dimer,level,behavior,depress,disord,factor
Topic 6,covid,pandem,health,care,public,diseas,provid,coronavirus,manag,impact,emerg,respons,social,need,medic
Topic 7,sequenc,gene,genom,strain,speci,analysi,protein,region,virus,differ,isol,genet,mutat,recombin,studi
Topic 8,quot,cancer,covid,treatment,trial,woman,safeti,exposur,pregnant,home,deliveri,group,effect,oncolog,food
Topic 9,virus,infect,vaccin,immun,viral,drug,diseas,respons,antibodi,effect,treatment,develop,coronavirus,sar,mous


<h2>Try Gensim model and Coherence Score</h2>

In [38]:
data_lemmatized = []
for d in df_corona["norm_tiabs"]:
    lemma_doc = preprocess(d)
    data_lemmatized.append(lemma_doc)

In [39]:
# store data_lemmatized list
import pickle

with open("dataset/data_lemmatized.txt", "wb") as fp:   #Pickling
    pickle.dump(data_lemmatized, fp)


In [40]:
# open pickle file
with open("dataset/data_lemmatized.txt", "rb") as fp:   # Unpickling
    ls = pickle.load(fp)
len(data_lemmatized)

67968

In [41]:
import gensim.corpora as corpora
# Create Dictionary
id2word = corpora.Dictionary(data_lemmatized)
# Create Corpus
texts = data_lemmatized
# Term Document Frequency
corpus = [id2word.doc2bow(text) for text in texts]
# View
print(corpus[:1])

[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1), (8, 1), (9, 1), (10, 1), (11, 1), (12, 1), (13, 1), (14, 5), (15, 1), (16, 1), (17, 6), (18, 1), (19, 1), (20, 1), (21, 2), (22, 1), (23, 1), (24, 1), (25, 1), (26, 1), (27, 1), (28, 2), (29, 1), (30, 1), (31, 1), (32, 1), (33, 1), (34, 1), (35, 1), (36, 1), (37, 1), (38, 2), (39, 1), (40, 1), (41, 1), (42, 2), (43, 3), (44, 1), (45, 2), (46, 2), (47, 1), (48, 1), (49, 5), (50, 1), (51, 1), (52, 2), (53, 2), (54, 5), (55, 2), (56, 1), (57, 3), (58, 1), (59, 1), (60, 1), (61, 2), (62, 2), (63, 3), (64, 3), (65, 2), (66, 1), (67, 3), (68, 1), (69, 5), (70, 1), (71, 1), (72, 1), (73, 1), (74, 5), (75, 1), (76, 1), (77, 2), (78, 1), (79, 1), (80, 2), (81, 1), (82, 1), (83, 1), (84, 2), (85, 1), (86, 1), (87, 1), (88, 1), (89, 1), (90, 6), (91, 6), (92, 1), (93, 1), (94, 1), (95, 2), (96, 1), (97, 1), (98, 1), (99, 1), (100, 2), (101, 1), (102, 1)]]


In [42]:
# store data_lemmatized list
with open("dataset/dict_id2word", "wb") as fp:   #Pickling
    pickle.dump(id2word, fp)


In [43]:
# Build Gensim LDA model
lda_model_gensim = gensim.models.LdaMulticore(corpus=corpus,
                                       id2word=id2word,
                                       num_topics=15, 
                                       random_state=100,
                                       chunksize=100,
                                       passes=10,
                                       per_word_topics=True)

In [44]:
# save gensim model
with open("model/lda_model_gensim.pk", "wb") as fp:   #Pickling
    pickle.dump(lda_model_gensim, fp)

In [45]:
import gensim

# Compute Perplexity
print('\nPerplexity: ', lda_model_gensim.log_perplexity(corpus))  # a measure of how good the model is. lower the better.


Perplexity:  -7.82402787739186


In [46]:
from gensim.models import CoherenceModel

# Compute Coherence Score
coherence_model_lda = CoherenceModel(model=lda_model_gensim, texts=data_lemmatized, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)


Coherence Score:  0.5646561484722131
