In [3]:
#Referred link
#https://towardsdatascience.com/topic-modeling-and-latent-dirichlet-allocation-in-python-9bf156893c24
import pandas as pd

In [4]:
#This contains data of news headlines published over a period of 15 years.

data = pd.read_csv('F:\\Data Science\\JigSaw\\Fast revision\\NLP\\Topic Modelling\\abcnews-date-text.csv', error_bad_lines=False);


In [6]:
data.shape

(1103663, 2)

In [7]:
data_text = data[['headline_text']]
data_text['index'] = data_text.index
documents = data_text

In [11]:

import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
import numpy as np
np.random.seed(2018)
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\sunny\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [15]:
#Write a function to perform lemmatize and stem preprocessing steps on the data set.
def lemmatize_stemming(text):
    return WordNetLemmatizer().lemmatize(text, pos='v')
def preprocess(text):
    result = []
    for token in gensim.utils.simple_preprocess(text):
        if token not in gensim.parsing.preprocessing.STOPWORDS and len(token) > 3:
            result.append(lemmatize_stemming(token))
    return result

In [16]:
#Select a document to preview after preprocessing.
doc_sample = documents[documents['index'] == 4310].values[0][0]
print('original document: ')
words = []
for word in doc_sample.split(' '):
    words.append(word)
print(words)
print('\n\n tokenized and lemmatized document: ')
print(preprocess(doc_sample))

original document: 
['rain', 'helps', 'dampen', 'bushfires']


 tokenized and lemmatized document: 
['rain', 'help', 'dampen', 'bushfires']


In [17]:
#Now Preprocess the headline text, saving the results as ‘processed_docs’
processed_docs = documents['headline_text'].map(preprocess)
processed_docs[:10]

0              [decide, community, broadcast, licence]
1                         [witness, aware, defamation]
2           [call, infrastructure, protection, summit]
3                          [staff, aust, strike, rise]
4             [strike, affect, australian, travellers]
5               [ambitious, olsson, win, triple, jump]
6               [antic, delight, record, break, barca]
7    [aussie, qualifier, stosur, waste, memphis, ma...
8             [aust, address, security, council, iraq]
9                         [australia, lock, timetable]
Name: headline_text, dtype: object

In [18]:
#Create a dictionary from ‘processed_docs’ containing the number of times a word appears in the training set.
dictionary = gensim.corpora.Dictionary(processed_docs)
count = 0
for k, v in dictionary.iteritems():
    print(k, v)
    count += 1
    if count > 10:
        break

0 broadcast
1 community
2 decide
3 licence
4 aware
5 defamation
6 witness
7 call
8 infrastructure
9 protection
10 summit


In [22]:
#Filter out tokens that appear in
    #less than 15 documents (absolute number) or
    #more than 0.5 documents (fraction of total corpus size, not absolute number).
    #after the above two steps, keep only the first 100000 most frequent tokens.
dictionary.filter_extremes(no_below=15, no_above=0.5, keep_n=100000)

In [23]:
#Gensim doc2bow
    #For each document we create a dictionary reporting how many
    #words and how many times those words appear. Save this to ‘bow_corpus’, then check our selected document earlier.
bow_corpus = [dictionary.doc2bow(doc) for doc in processed_docs]
bow_corpus[4310]

[(112, 1), (489, 1), (970, 1), (4427, 1)]

In [24]:
#Preview Bag Of Words for our sample preprocessed document.
bow_doc_4310 = bow_corpus[4310]
for i in range(len(bow_doc_4310)):
    print("Word {} (\"{}\") appears {} time.".format(bow_doc_4310[i][0], 
                                               dictionary[bow_doc_4310[i][0]], 
bow_doc_4310[i][1]))

Word 112 ("help") appears 1 time.
Word 489 ("rain") appears 1 time.
Word 970 ("bushfires") appears 1 time.
Word 4427 ("dampen") appears 1 time.


In [25]:
#TF-IDF
#Create tf-idf model object using models.TfidfModel on ‘bow_corpus’ and save it to ‘tfidf’,
#then apply transformation to the entire corpus and call it ‘corpus_tfidf’.
#Finally we preview TF-IDF scores for our first document.
from gensim import corpora, models
tfidf = models.TfidfModel(bow_corpus)
corpus_tfidf = tfidf[bow_corpus]
from pprint import pprint
for doc in corpus_tfidf:
    pprint(doc)
    break

[(0, 0.595557507312157),
 (1, 0.3948343404468723),
 (2, 0.4976240764382213),
 (3, 0.4917187993528624)]


In [26]:
#Train our lda model using gensim.models.LdaMulticore and save it to ‘lda_model’
lda_model = gensim.models.LdaMulticore(bow_corpus, num_topics=10, id2word=dictionary, passes=2, workers=2)

In [27]:
#For each topic, we will explore the words occuring in that topic and its relative weight.
for idx, topic in lda_model.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(idx, topic))
#Can you distinguish different topics using the words in each topic and their corresponding weights?    

Topic: 0 
Words: 0.031*"charge" + 0.021*"murder" + 0.019*"face" + 0.016*"interview" + 0.016*"jail" + 0.015*"shoot" + 0.014*"accuse" + 0.013*"tasmania" + 0.012*"police" + 0.012*"life"
Topic: 1 
Words: 0.028*"court" + 0.028*"sydney" + 0.014*"tell" + 0.014*"news" + 0.014*"case" + 0.013*"children" + 0.013*"hear" + 0.012*"water" + 0.012*"violence" + 0.012*"continue"
Topic: 2 
Words: 0.025*"adelaide" + 0.016*"power" + 0.014*"final" + 0.014*"lose" + 0.012*"royal" + 0.011*"beat" + 0.011*"darwin" + 0.010*"young" + 0.010*"commission" + 0.010*"station"
Topic: 3 
Words: 0.033*"police" + 0.025*"attack" + 0.023*"kill" + 0.020*"melbourne" + 0.016*"china" + 0.015*"arrest" + 0.014*"people" + 0.013*"league" + 0.013*"dead" + 0.012*"death"
Topic: 4 
Words: 0.021*"school" + 0.020*"help" + 0.019*"country" + 0.018*"government" + 0.018*"say" + 0.014*"family" + 0.014*"minister" + 0.013*"state" + 0.012*"need" + 0.012*"flood"
Topic: 5 
Words: 0.019*"change" + 0.018*"call" + 0.016*"plan" + 0.015*"council" + 0.011

In [28]:
#Running LDA using TF-IDF
lda_model_tfidf = gensim.models.LdaMulticore(corpus_tfidf, num_topics=10, id2word=dictionary, passes=2, workers=4)
for idx, topic in lda_model_tfidf.print_topics(-1):
    print('Topic: {} Word: {}'.format(idx, topic))

Topic: 0 Word: 0.014*"tasmania" + 0.013*"drum" + 0.007*"december" + 0.006*"celebrate" + 0.006*"social" + 0.005*"obama" + 0.004*"nation" + 0.004*"media" + 0.004*"detention" + 0.004*"stream"
Topic: 1 Word: 0.015*"rural" + 0.011*"news" + 0.010*"weather" + 0.007*"michael" + 0.007*"national" + 0.007*"christmas" + 0.007*"climate" + 0.007*"friday" + 0.006*"rain" + 0.006*"wednesday"
Topic: 2 Word: 0.010*"government" + 0.008*"health" + 0.008*"fund" + 0.007*"plan" + 0.005*"sport" + 0.005*"indigenous" + 0.005*"farm" + 0.005*"federal" + 0.005*"people" + 0.005*"service"
Topic: 3 Word: 0.018*"market" + 0.011*"price" + 0.010*"share" + 0.009*"rise" + 0.008*"business" + 0.008*"guilty" + 0.007*"australian" + 0.007*"commission" + 0.007*"plead" + 0.006*"bank"
Topic: 4 Word: 0.018*"police" + 0.013*"crash" + 0.011*"kill" + 0.009*"shoot" + 0.008*"miss" + 0.008*"search" + 0.008*"attack" + 0.008*"dead" + 0.008*"arrest" + 0.007*"grandstand"
Topic: 5 Word: 0.021*"interview" + 0.009*"john" + 0.008*"david" + 0.007

In [29]:
#Again, can you distinguish different topics using the words in each topic and their corresponding weights?
processed_docs[4310]

['rain', 'help', 'dampen', 'bushfires']

In [30]:
for index, score in sorted(lda_model[bow_corpus[4310]], key=lambda tup: -1*tup[1]):
    print("\nScore: {}\t \nTopic: {}".format(score, lda_model.print_topic(index, 10)))


Score: 0.42051535844802856	 
Topic: 0.021*"school" + 0.020*"help" + 0.019*"country" + 0.018*"government" + 0.018*"say" + 0.014*"family" + 0.014*"minister" + 0.013*"state" + 0.012*"need" + 0.012*"flood"

Score: 0.22016997635364532	 
Topic: 0.026*"queensland" + 0.021*"north" + 0.018*"election" + 0.017*"live" + 0.016*"rural" + 0.016*"years" + 0.014*"labor" + 0.013*"service" + 0.013*"national" + 0.013*"health"

Score: 0.21913816034793854	 
Topic: 0.031*"charge" + 0.021*"murder" + 0.019*"face" + 0.016*"interview" + 0.016*"jail" + 0.015*"shoot" + 0.014*"accuse" + 0.013*"tasmania" + 0.012*"police" + 0.012*"life"

Score: 0.02002524398267269	 
Topic: 0.044*"australia" + 0.031*"south" + 0.017*"fight" + 0.013*"game" + 0.013*"hobart" + 0.012*"program" + 0.012*"korea" + 0.011*"question" + 0.011*"reveal" + 0.010*"east"

Score: 0.020025203004479408	 
Topic: 0.028*"court" + 0.028*"sydney" + 0.014*"tell" + 0.014*"news" + 0.014*"case" + 0.013*"children" + 0.013*"hear" + 0.012*"water" + 0.012*"violence"

In [31]:
#Performance evaluation by classifying sample document using LDA TF-IDF model.
for index, score in sorted(lda_model_tfidf[bow_corpus[4310]], key=lambda tup: -1*tup[1]):
    print("\nScore: {}\t \nTopic: {}".format(score, lda_model_tfidf.print_topic(index, 10)))
#Our test document has the highest probability to be part of the topic that our model assigned, which is the accurate classification.


Score: 0.5584179162979126	 
Topic: 0.021*"interview" + 0.009*"john" + 0.008*"david" + 0.007*"october" + 0.007*"north" + 0.007*"korea" + 0.006*"thursday" + 0.005*"april" + 0.005*"quiz" + 0.005*"cyclone"

Score: 0.2815326452255249	 
Topic: 0.015*"rural" + 0.011*"news" + 0.010*"weather" + 0.007*"michael" + 0.007*"national" + 0.007*"christmas" + 0.007*"climate" + 0.007*"friday" + 0.006*"rain" + 0.006*"wednesday"

Score: 0.02000921405851841	 
Topic: 0.010*"government" + 0.008*"health" + 0.008*"fund" + 0.007*"plan" + 0.005*"sport" + 0.005*"indigenous" + 0.005*"farm" + 0.005*"federal" + 0.005*"people" + 0.005*"service"

Score: 0.020006341859698296	 
Topic: 0.024*"country" + 0.021*"hour" + 0.012*"podcast" + 0.011*"donald" + 0.010*"turnbull" + 0.006*"malcolm" + 0.006*"say" + 0.005*"pacific" + 0.005*"liberal" + 0.005*"labor"

Score: 0.020006325095891953	 
Topic: 0.018*"police" + 0.013*"crash" + 0.011*"kill" + 0.009*"shoot" + 0.008*"miss" + 0.008*"search" + 0.008*"attack" + 0.008*"dead" + 0.008*

In [32]:
#Testing model on unseen document
unseen_document = 'How a Pentagon deal became an identity crisis for Google'
bow_vector = dictionary.doc2bow(preprocess(unseen_document))
for index, score in sorted(lda_model[bow_vector], key=lambda tup: -1*tup[1]):
    print("Score: {}\t Topic: {}".format(score, lda_model.print_topic(index, 5)))

Score: 0.6582505702972412	 Topic: 0.021*"school" + 0.020*"help" + 0.019*"country" + 0.018*"government" + 0.018*"say"
Score: 0.2083785980939865	 Topic: 0.037*"australian" + 0.019*"open" + 0.018*"market" + 0.016*"brisbane" + 0.014*"year"
Score: 0.016673319041728973	 Topic: 0.019*"change" + 0.018*"call" + 0.016*"plan" + 0.015*"council" + 0.011*"concern"
Score: 0.016671499237418175	 Topic: 0.025*"adelaide" + 0.016*"power" + 0.014*"final" + 0.014*"lose" + 0.012*"royal"
Score: 0.01667100004851818	 Topic: 0.031*"charge" + 0.021*"murder" + 0.019*"face" + 0.016*"interview" + 0.016*"jail"
Score: 0.01667100004851818	 Topic: 0.028*"court" + 0.028*"sydney" + 0.014*"tell" + 0.014*"news" + 0.014*"case"
Score: 0.01667100004851818	 Topic: 0.033*"police" + 0.025*"attack" + 0.023*"kill" + 0.020*"melbourne" + 0.016*"china"
Score: 0.01667100004851818	 Topic: 0.044*"australia" + 0.031*"south" + 0.017*"fight" + 0.013*"game" + 0.013*"hobart"
Score: 0.01667100004851818	 Topic: 0.026*"queensland" + 0.021*"north