In [1]:
# Toy corpus: five short documents, gathered in order into a single list.
doc_complete = [
    "Sugar is bad to consume. My sister likes to have sugar, but not my father.",
    "My father spends a lot of time driving my sister around to dance practice.",
    "Doctors suggest that driving may cause increased stress and blood pressure.",
    "Sometimes I feel pressure to perform well at school, but my father never seems to drive my sister to do better.",
    "Health experts say that Sugar is not good for your lifestyle.",
]

# Keep the individual documents reachable under their own names as well.
doc1, doc2, doc3, doc4, doc5 = doc_complete

In [19]:
#Cleaning and Preprocessing
#Cleaning is an important step before any text mining task. In this step, we remove punctuation and
#stopwords, and normalize the corpus.
from nltk.corpus import stopwords 
from nltk.stem.wordnet import WordNetLemmatizer
import string



In [51]:
# Shared preprocessing resources, built once and read by clean().
lemma = WordNetLemmatizer()

# Every character of string.punctuation; clean() drops these characters.
exclude = {ch for ch in string.punctuation}

# NLTK's English stop-word list, held as a set for membership tests.
stop = {*stopwords.words('english')}
def clean(doc):
    """Normalize one raw document into a space-separated string of lemmas.

    The document is lower-cased and split on whitespace. For each token,
    the characters in ``exclude`` (punctuation) are removed, the token is
    dropped if it is a stop word or ends up empty, and the remainder is
    lemmatized with ``lemma``.

    Stop words are matched against both the raw token and the
    punctuation-stripped token. Matching only the raw token (the previous
    behaviour) let stop words that touch punctuation, such as ``"it."`` or
    ``"but,"``, slip through into the output.

    Args:
        doc: Raw document text (``str``).

    Returns:
        The cleaned document as a single string of lemmatized tokens
        separated by single spaces; empty string if nothing survives.
    """
    kept = []
    for raw in doc.lower().split():
        token = "".join(ch for ch in raw if ch not in exclude)
        # Token consisted only of punctuation: nothing left to keep.
        if not token:
            continue
        # The raw form is still checked so stop-list entries that themselves
        # contain punctuation (presumably contractions such as "don't") are
        # removed exactly as before; the stripped form catches "it." etc.
        if raw in stop or token in stop:
            continue
        kept.append(lemma.lemmatize(token))
    return " ".join(kept)

# Clean every document, then split each cleaned string into its token list.
doc_clean = list(map(str.split, map(clean, doc_complete)))

In [54]:
#Preparing Document-Term Matrix
# Importing Gensim
import gensim
from gensim import corpora

# Build the term dictionary of our corpus; every unique term is assigned an index.
dictionary = corpora.Dictionary(doc_clean)

# Turn each tokenized document into its doc2bow representation using the
# dictionary above; the resulting list is the document-term matrix.
doc_term_matrix = list(map(dictionary.doc2bow, doc_clean))

In [60]:
#Running LDA Model
#The next step is to create an object for the LDA model and train it on the document-term matrix. Training also requires a few
#parameters as input, which are explained in the section above. The gensim module
#allows both LDA model estimation from a training corpus and inference of topic distributions on new, unseen documents.

# Short alias for gensim's LdaModel class. No model object is created here;
# the alias is instantiated (and the model trained) where Lda(...) is called.
Lda = gensim.models.ldamodel.LdaModel


In [61]:
# Run and train the LDA model on the document-term matrix
# (3 topics, 50 passes, terms resolved through the dictionary).
ldamodel = Lda(
    corpus=doc_term_matrix,
    id2word=dictionary,
    num_topics=3,
    passes=50,
)

In [64]:
# Now let's see the results. Here 0, 1, 2, ... are the topic ids, and "sugar", "bad" and so on are the words of that topic.
print(ldamodel.print_topics(num_topics=3, num_words=3))
# Each entry is a topic with its individual topic terms and weights. A weight represents how important that word is for the topic.
# Topic1 can be termed as Bad Health, and Topic3 can be termed as Family.
# NOTE(review): these labels do not line up with the sample output shown below (there topic 1 is
# father/sister/pressure and topic 2 is sugar/good/health), so they presumably describe a different
# run -- re-check the labels against the output you actually get.

[(0, '0.029*"sugar" + 0.029*"like" + 0.029*"consume"'), (1, '0.069*"father" + 0.069*"sister" + 0.048*"pressure"'), (2, '0.076*"sugar" + 0.075*"good" + 0.075*"health"')]


In [66]:
#Tips to improve results of topic modeling
#(1) Frequency Filter - 
    #Arrange every term according to its frequency.
    #Terms with higher frequencies are more likely to appear in the results than ones with low frequency.
    #The low-frequency terms are essentially weak features of the corpus, hence it is good practice to get rid 
        #of all those weak features.
    #An exploratory analysis of terms and their frequencies can help decide what frequency value should be used 
        #as the threshold.

#(2) Part of Speech Tag Filter
    #A POS tag filter is more about the context of the features than about their frequencies.
    #So keep only the POS tags that are sensible for the task.
#(3) Batch Wise LDA –
    #In order to retrieve the most important topic terms, a corpus can be divided into batches of a fixed size.
    #Running LDA multiple times on these batches will give different results; however, the best topic
    #terms will be those in the intersection of all batches.
    
# Look up the current working directory of this session. As the last bare
# expression of the cell, its value is what the notebook displays.
import os
os.getcwd()



'C:\\Users\\sunny'