In [1]:
import nltk; nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/yihuiwang/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
import re
import numpy as np
import pandas as pd
from pprint import pprint

#Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

#Spacy for lemmatization
import spacy

In [3]:
# NLTK stop words: load the English stop-word collection used later by
# remove_stopwords (the 'stopwords' corpus is downloaded in the first cell).
from nltk.corpus import stopwords
stop_words = stopwords.words('english')

In [4]:
# Read the abstracts file and keep only the first 200 rows.
df = pd.read_csv('pub_abstract.csv').head(200)

In [5]:
# Extract the abstracts as a plain list of strings.
# Missing abstracts are replaced with '' first: astype(str) alone would turn
# NaN into the literal text 'nan', which would later be tokenized as a word.
data = df['abstract'].fillna('').astype(str).values.tolist()

In [6]:
def sent_to_words(sentences):
    """Lazily yield one token list per input document.

    Every item of ``sentences`` is coerced to ``str`` and tokenized with
    gensim's ``simple_preprocess``; ``deacc=True`` strips accent marks
    from the tokens.
    """
    tokenize = gensim.utils.simple_preprocess
    yield from (tokenize(str(text), deacc=True) for text in sentences)


data_words = list(sent_to_words(data))

In [7]:
print(data_words[:1])

[['within', 'the', 'field', 'of', 'information', 'systems', 'is', 'user', 'involvement', 'generally', 'refers', 'to', 'participation', 'in', 'the', 'systems', 'development', 'process', 'by', 'potential', 'users', 'or', 'their', 'representatives', 'and', 'is', 'measured', 'as', 'set', 'of', 'behaviors', 'or', 'activities', 'that', 'such', 'individuals', 'perform', 'this', 'article', 'argues', 'for', 'separation', 'of', 'the', 'constructs', 'of', 'user', 'participation', 'set', 'of', 'behaviors', 'or', 'activities', 'performed', 'by', 'users', 'in', 'the', 'system', 'development', 'process', 'and', 'user', 'involvement', 'subjective', 'psychological', 'state', 'reflecting', 'the', 'importance', 'and', 'personal', 'relevance', 'of', 'system', 'to', 'the', 'user', 'such', 'distinction', 'is', 'not', 'only', 'more', 'consistent', 'with', 'of', 'involvement', 'found', 'in', 'other', 'disciplines', 'but', 'it', 'also', 'leads', 'to', 'number', 'of', 'new', 'and', 'interesting', 'hypotheses', 

In [8]:
# Create the bigram model from the tokenized documents.
# A higher threshold produces fewer phrases.
bigram = gensim.models.Phrases(
    data_words,
    min_count=5,
    threshold=100,
)

# Wrap the trained model in a Phraser, which is what gets applied to token lists.
bigram_mod = gensim.models.phrases.Phraser(bigram)

# Show the first document with detected bigrams joined by '_'.
print(bigram_mod[data_words[0]])

['within', 'the', 'field', 'of', 'information', 'systems', 'is', 'user_involvement', 'generally', 'refers', 'to', 'participation', 'in', 'the', 'systems', 'development', 'process', 'by', 'potential', 'users', 'or', 'their', 'representatives', 'and', 'is', 'measured', 'as', 'set', 'of', 'behaviors', 'or', 'activities', 'that', 'such', 'individuals', 'perform', 'this', 'article', 'argues', 'for', 'separation', 'of', 'the', 'constructs', 'of', 'user', 'participation', 'set', 'of', 'behaviors', 'or', 'activities', 'performed', 'by', 'users', 'in', 'the', 'system', 'development', 'process', 'and', 'user_involvement', 'subjective', 'psychological', 'state', 'reflecting', 'the', 'importance', 'and', 'personal', 'relevance', 'of', 'system', 'to', 'the', 'user', 'such', 'distinction', 'is', 'not', 'only', 'more', 'consistent', 'with', 'of', 'involvement', 'found', 'in', 'other', 'disciplines', 'but', 'it', 'also', 'leads', 'to', 'number', 'of', 'new', 'and', 'interesting', 'hypotheses', 'these'

In [9]:
def remove_stopwords(texts):
    """Re-tokenize each document and drop every token found in ``stop_words``.

    Args:
        texts: iterable of documents; each one is coerced with ``str`` and
            tokenized with ``simple_preprocess``.

    Returns:
        A list containing, for each document, the list of remaining tokens.
    """
    # Build the lookup set once per call: testing membership against the
    # module-level ``stop_words`` sequence is a linear scan for every token.
    stop_set = set(stop_words)
    return [
        [word for word in simple_preprocess(str(doc)) if word not in stop_set]
        for doc in texts
    ]

In [10]:
def make_bigrams(texts):
    """Apply the module-level ``bigram_mod`` to every tokenized document."""
    merged = []
    for tokens in texts:
        merged.append(bigram_mod[tokens])
    return merged

def lemmatization(texts, allowed_postags = ['NOUN','ADJ','VERB','ADV']):
    """Lemmatize tokenized documents, keeping only selected parts of speech.

    Each document's tokens are joined with single spaces and run through the
    module-level spaCy pipeline ``nlp``. The lemma of every token whose
    ``pos_`` is in ``allowed_postags`` is kept.

    Reference: https://spacy.io/api/annotation

    Returns:
        A list with one list of lemmas per input document.
    """
    def keep_lemmas(parsed):
        # Filter on the POS tag, emit the lemma.
        return [tok.lemma_ for tok in parsed if tok.pos_ in allowed_postags]

    return [keep_lemmas(nlp(" ".join(tokens))) for tokens in texts]

In [11]:
# Drop stop words from the tokenized documents.
data_words_nostops = remove_stopwords(data_words)

# Merge detected bigrams into single tokens.
data_words_bigrams = make_bigrams(data_words_nostops)

# Load the small English spaCy pipeline by its full package name.
# The bare 'en' shortcut only resolves on spaCy 2.x (shortcut links were
# removed in spaCy 3), whereas the package name works on both.
# Install with: python3 -m spacy download en_core_web_sm
# The parser and NER components are disabled because only POS tags and
# lemmas are read downstream.
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

data_lemmatized = lemmatization(data_words_bigrams, allowed_postags=['NOUN','ADJ','VERB','ADV'])

print(data_lemmatized[:1])

[['field', 'information', 'system', 'user_involvement', 'generally', 'refer', 'participation', 'system', 'development', 'process', 'potential', 'user', 'representative', 'measure', 'set', 'behavior', 'activity', 'individual', 'perform', 'article', 'argue', 'separation', 'construct', 'user', 'participation', 'set', 'behavior', 'activity', 'perform', 'user', 'system', 'development', 'process', 'user_involvement', 'subjective', 'psychological', 'state', 'reflect', 'importance', 'personal', 'relevance', 'system', 'user', 'distinction', 'consistent', 'involvement', 'find', 'discipline', 'also', 'lead', 'number', 'new', 'interesting', 'hypothesis', 'hypothesis', 'promise', 'rich', 'theoretical', 'network', 'describe', 'role', 'importance', 'participation', 'involvement', 'implementation', 'process']]


In [12]:
# Create the dictionary from the lemmatized documents
id2word = corpora.Dictionary(data_lemmatized)

# Create the corpus source texts
texts = data_lemmatized

# Term-document frequency: one bag-of-words list of (token_id, count) per document
corpus = list(map(id2word.doc2bow, texts))

# View the bag-of-words for the first document
print(corpus[:1])

[[(0, 2), (1, 1), (2, 1), (3, 1), (4, 2), (5, 1), (6, 1), (7, 1), (8, 2), (9, 1), (10, 1), (11, 1), (12, 1), (13, 1), (14, 2), (15, 1), (16, 2), (17, 1), (18, 1), (19, 1), (20, 2), (21, 1), (22, 1), (23, 1), (24, 1), (25, 1), (26, 3), (27, 2), (28, 1), (29, 1), (30, 3), (31, 1), (32, 1), (33, 1), (34, 1), (35, 1), (36, 1), (37, 1), (38, 1), (39, 1), (40, 2), (41, 1), (42, 1), (43, 4), (44, 1), (45, 4), (46, 2)]]


In [13]:
# word in given id
id2word[0]

'activity'

In [22]:
# Build an LDA model for each candidate number of topics and report its
# topics, log-perplexity bound and c_v coherence.

import time

# time.clock() was removed in Python 3.8; perf_counter() is the supported
# high-resolution timer. Note it measures wall-clock time, whereas clock()
# reported CPU time on Unix.
start = time.perf_counter()
num_topics_range = list(range(10,60,10))

for num_topics in num_topics_range:
    lda_model = gensim.models.ldamodel.LdaModel(
        corpus=corpus,
        id2word=id2word,
        num_topics=num_topics,
        random_state=100,   # fixed seed so runs are repeatable
        update_every=1,
        chunksize=100,
        passes=10,
        alpha='auto',
        per_word_topics=True,
    )
    print(num_topics)
    pprint(lda_model.print_topics())

    # log_perplexity returns the per-word likelihood bound on a log scale,
    # not the perplexity itself: values closer to zero indicate a better fit
    # (perplexity = 2 ** (-bound), for which lower is better).
    print('\nPerplexity:', lda_model.log_perplexity(corpus))

    # Compute the c_v coherence score of the topics against the lemmatized texts.
    coherence_model_lda = CoherenceModel(model=lda_model, texts=data_lemmatized, dictionary=id2word, coherence='c_v')
    coherence_lda = coherence_model_lda.get_coherence()
    print('\nCoherence Score:', coherence_lda)

# Total elapsed time for the whole sweep, in seconds.
end = (time.perf_counter() - start)
print(end)

  """


10
[(0,
  '0.020*"service" + 0.015*"web" + 0.014*"propose" + 0.012*"emr" + '
  '0.010*"application" + 0.009*"use" + 0.009*"mobile" + 0.009*"signature" + '
  '0.009*"quality" + 0.008*"base"'),
 (1,
  '0.009*"knowledge" + 0.007*"disease" + 0.007*"use" + 0.007*"bdl" + '
  '0.007*"intention" + 0.007*"technology" + 0.007*"problem" + 0.006*"research" '
  '+ 0.006*"application" + 0.006*"useful"'),
 (2,
  '0.023*"research" + 0.014*"system" + 0.013*"use" + 0.010*"paper" + '
  '0.010*"service" + 0.010*"literature" + 0.008*"review" + 0.008*"innovation" '
  '+ 0.008*"technology" + 0.008*"analysis"'),
 (3,
  '0.021*"knowledge" + 0.014*"key" + 0.012*"domain" + 0.011*"management" + '
  '0.011*"developer" + 0.009*"community" + 0.008*"organizational" + '
  '0.008*"logical" + 0.008*"happiness" + 0.007*"social"'),
 (4,
  '0.013*"knowledge" + 0.013*"performance" + 0.013*"sme" + 0.012*"transfer" + '
  '0.011*"route" + 0.009*"practice" + 0.008*"cognitive" + 0.008*"model" + '
  '0.007*"study" + 0.007*"find"'


Perplexity: -10.47116061177928

Coherence Score: 0.34016903319100494
14.492186999999998




### END

'import time
start = time.clock()
Build LDA model
'lda_model = gensim.models.ldamodel.LdaModel(corpus = corpus,
                                                                   id2word = id2word,
                                                                   num_topics= 15,
                                                                   random_state=100,
                                                                   update_every=1,
                                                                   chunksize=100,
                                                                   passes=10,
                                                                   alpha='auto',
                                                                   per_word_topics=True)

elapsed = (time.clock() - start)
print(elapsed)