In [1]:
import numpy as np
import pandas as pd
import gensim
import gensim.corpora as corpora
import tm_preprocessor
from tm_preprocessor import Preprocessor
from gensim.models import CoherenceModel
import nltk
import pickle

In [2]:
def preprocessor(file, min_freq=0, max_freq=1, min_len=2, num_topic=10,
                 max_docs=200, output_path="topic_name_with_score.txt"):
    """Fit an LDA topic model on the ``abstract`` column of a CSV and score it.

    Pipeline: read the CSV, keep the first ``max_docs`` rows, clean the
    abstracts with the project's ``Preprocessor``, build a gensim dictionary
    and bag-of-words corpus, train an ``LdaModel``, then compute the
    log-perplexity bound and the ``c_v`` coherence. The result is pickled to
    ``output_path`` and also returned.

    Parameters
    ----------
    file : str or path-like
        CSV file handed to ``pd.read_csv``; it must contain an ``abstract``
        column.
    min_freq, max_freq, min_len
        Forwarded unchanged to ``Preprocessor.normalize``.
        NOTE(review): presumably document-frequency bounds and a minimum
        token length -- confirm against ``tm_preprocessor``.
    num_topic : int
        Number of LDA topics to fit and to report.
    max_docs : int or None
        Only the first ``max_docs`` rows of the CSV are used (default 200,
        the value that used to be hard-coded). ``None`` uses every row.
    output_path : str or path-like
        Where the pickled result is written. The default keeps the original
        file name; note that the content is a binary pickle, not text.

    Returns
    -------
    list
        ``[topics, ['Perplexity:', bound], ['Coherence Score:', coherence]]``
        where ``topics`` is the value of
        ``lda_model.print_topics(num_topic, 10)``.
    """
    # --- load and clean the documents ---------------------------------
    df = pd.read_csv(file)
    df = df[:max_docs]
    data = df['abstract'].astype(str).values.tolist()

    # Named `text_cleaner` so the local does not shadow this function's name.
    text_cleaner = Preprocessor(data)
    text_cleaner.remove_digits_punctuactions()
    text_cleaner.remove_stopwords()
    lemmatizer = nltk.WordNetLemmatizer()
    text_cleaner.normalize(lemmatizer, min_freq, max_freq, min_len)
    data_preprocessed = text_cleaner.corpus

    # --- dictionary and bag-of-words corpus ----------------------------
    id2word = corpora.Dictionary(data_preprocessed)
    corpus = [id2word.doc2bow(text) for text in data_preprocessed]

    # --- LDA model ------------------------------------------------------
    # random_state is fixed so repeated runs on the same input are comparable.
    lda_model = gensim.models.ldamodel.LdaModel(
        corpus=corpus,
        id2word=id2word,
        num_topics=num_topic,
        random_state=100,
        update_every=1,
        chunksize=100,
        passes=10,
        alpha='auto',
        per_word_topics=True,
    )

    # Top 10 words for each of the `num_topic` topics.
    topic_save = lda_model.print_topics(num_topic, 10)

    # log_perplexity() returns the per-word likelihood bound, not the
    # perplexity itself (perplexity = 2 ** -bound), so a HIGHER value here
    # means a better fit. The 'Perplexity:' label is kept as-is because it is
    # part of the stored/returned data.
    perplexity_score = ['Perplexity:', lda_model.log_perplexity(corpus)]

    # Compute the c_v coherence score of the fitted topics.
    coherence_model_lda = CoherenceModel(
        model=lda_model,
        texts=data_preprocessed,
        dictionary=id2word,
        coherence='c_v',
    )
    coherence_lda = coherence_model_lda.get_coherence()
    coherence_score = ['Coherence Score:', coherence_lda]

    topic_name_with_score = [topic_save, perplexity_score, coherence_score]

    # Persist the result; every call overwrites the previous file.
    with open(output_path, "wb") as modeltopic:
        pickle.dump(topic_name_with_score, modeltopic)

    return topic_name_with_score


In [3]:
# Fit a 50-topic model on the abstracts; the returned list is the cell output.
preprocessor(file='pub_abstract.csv', num_topic=50)

[[(0,
   '0.029*"method" + 0.016*"scheme" + 0.015*"record" + 0.015*"innovation" + 0.013*"interview" + 0.013*"provide" + 0.013*"study" + 0.013*"describe" + 0.013*"patient" + 0.012*"process"'),
  (1,
   '0.057*"data" + 0.040*"interaction" + 0.029*"rfid" + 0.024*"base" + 0.017*"learn" + 0.017*"predictive" + 0.017*"nfc" + 0.015*"large" + 0.015*"analytics" + 0.015*"field"'),
  (2,
   '0.039*"support" + 0.033*"communication" + 0.028*"market" + 0.027*"model" + 0.018*"difficulty" + 0.018*"positive" + 0.017*"signal" + 0.017*"usefulness" + 0.017*"product" + 0.016*"view"'),
  (3,
   '0.037*"hp" + 0.032*"normal" + 0.031*"fgf" + 0.031*"opn" + 0.025*"expression" + 0.022*"nsclc" + 0.022*"yf" + 0.019*"gastrin" + 0.017*"correlation" + 0.016*"pylori"'),
  (4,
   '0.085*"problem" + 0.038*"open" + 0.027*"batch" + 0.027*"machine" + 0.022*"version" + 0.017*"polynomially" + 0.017*"solvable" + 0.017*"scheduling" + 0.017*"serial" + 0.016*"elsevier"'),
  (5,
   '0.033*"service" + 0.024*"framework" + 0.019*"cust

In [16]:
# Reload the result that preprocessor() pickled to disk.
# The file is overwritten by every preprocessor() call, so this reflects the
# most recent run, which may not be the run displayed earlier in the notebook.
# Only unpickle files produced by this notebook: pickle.load can execute
# arbitrary code from an untrusted file.
with open("topic_name_with_score.txt", mode="rb") as modeltopic:
    model_topics = pickle.load(modeltopic)

In [17]:
# Display the unpickled [topics, perplexity entry, coherence entry] list.
model_topics

[[(0,
   '0.079*"medium" + 0.022*"delivery" + 0.019*"approach" + 0.019*"administrative" + 0.016*"substitute" + 0.015*"obtain" + 0.015*"past" + 0.014*"choice" + 0.011*"supplement" + 0.011*"tanzania"'),
  (1,
   '0.023*"consumer" + 0.021*"product" + 0.017*"security" + 0.017*"individual" + 0.016*"tool" + 0.015*"data" + 0.014*"approach" + 0.013*"policy" + 0.013*"risk" + 0.013*"organization"'),
  (2,
   '0.032*"communication" + 0.027*"market" + 0.023*"product" + 0.018*"theory" + 0.018*"support" + 0.015*"model" + 0.014*"difficulty" + 0.014*"view" + 0.014*"impact" + 0.013*"finding"'),
  (3,
   '0.026*"measurement" + 0.022*"algorithm" + 0.021*"simulation" + 0.020*"clock" + 0.020*"network" + 0.019*"normal" + 0.019*"opn" + 0.019*"fgf" + 0.018*"strength" + 0.015*"expression"'),
  (4,
   '0.030*"transfer" + 0.028*"process" + 0.024*"source" + 0.020*"problem" + 0.019*"cognitive" + 0.016*"attention" + 0.014*"xa" + 0.014*"benefit" + 0.014*"study" + 0.013*"project"'),
  (5,
   '0.043*"system" + 0.026*"