In [1]:
import pandas as pd

data = pd.read_csv("abcnews-date-text.csv", error_bad_lines=False)
data_text = data[ : 300000][['headline_text']]
data_text['index'] = data_text.index
documents = data_text

In [2]:
data_text

Unnamed: 0,headline_text,index
0,aba decides against community broadcasting lic...,0
1,act fire witnesses must be aware of defamation,1
2,a g calls for infrastructure protection summit,2
3,air nz staff in aust strike for pay rise,3
4,air nz strike to affect australian travellers,4
...,...,...
299995,broughton hall audit reveals serious breaches,299995
299996,broughton hall fails key standards,299996
299997,broughton hall safe for residents govt says,299997
299998,burn off at conservation park aims to prevent,299998


In [3]:
# Load gensim and NLTK libraries

import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
import numpy as np
np.random.seed(400)

In [4]:
import nltk
nltk.data.path.append('../../../../Python Libs/')

In [5]:
print(WordNetLemmatizer().lemmatize('went', pos='v'))

go


In [7]:
stemmer = SnowballStemmer("english")
original_words = ['caresses', 'flies', 'dies', 'mules', 'denied','died', 'agreed', 'owned', 
           'humbled', 'sized','meeting', 'stating', 'siezing', 'itemization','sensational', 
           'traditional', 'reference', 'colonizer','plotted']
singles = [stemmer.stem(plural) for plural in original_words]
pd.DataFrame(data={"original_words": original_words, "singles": singles})

Unnamed: 0,original_words,singles
0,caresses,caress
1,flies,fli
2,dies,die
3,mules,mule
4,denied,deni
5,died,die
6,agreed,agre
7,owned,own
8,humbled,humbl
9,sized,size


In [9]:
def lemmatize_string(text):
    return stemmer.stem(WordNetLemmatizer().lemmatize(text, pos='v'))

def preprocess(text):
    result = []
    for token in simple_preprocess(text) :
        if token not in STOPWORDS and len(token) > 3:
            result.append(lemmatize_string(token))
    return result


In [10]:
document_num = 4310
doc_sample = documents[documents['index'] == document_num].values[0][0]

print("Original Document : ")
words = []
for word in doc_sample.split(' '):
    words.append(word)
print(words)
print("\n\n Tokenized and Lemmatized document : ")
print(preprocess(text=doc_sample))

Original Document : 
['rain', 'helps', 'dampen', 'bushfires']


 Tokenized and Lemmatized document : 
['rain', 'help', 'dampen', 'bushfir']


In [16]:
documents[documents['index'] == document_num].values[0][0]

'rain helps dampen bushfires'

In [17]:
preprocessed_docs = documents['headline_text'].map(preprocess)

In [18]:
preprocessed_docs[:10]

0            [decid, communiti, broadcast, licenc]
1                               [wit, awar, defam]
2           [call, infrastructur, protect, summit]
3                      [staff, aust, strike, rise]
4             [strike, affect, australian, travel]
5               [ambiti, olsson, win, tripl, jump]
6           [antic, delight, record, break, barca]
7    [aussi, qualifi, stosur, wast, memphi, match]
8            [aust, address, secur, council, iraq]
9                         [australia, lock, timet]
Name: headline_text, dtype: object

##### Bag of Words

In [19]:
from gensim.corpora import Dictionary

In [20]:
dictionary = Dictionary(preprocessed_docs)

In [21]:
count = 0 
for (k, v) in dictionary.iteritems():
    print(k, v)
    count += 1
    if count > 10:
        break

0 broadcast
1 communiti
2 decid
3 licenc
4 awar
5 defam
6 wit
7 call
8 infrastructur
9 protect
10 summit


In [32]:
dictionary.filter_extremes(no_below=15, no_above=0.1, keep_n=100000)

In [33]:
bow_corpus = [dictionary.doc2bow(doc) for doc in preprocessed_docs]

In [34]:
bow_corpus[document_num]

[(71, 1), (107, 1), (462, 1), (3530, 1)]

In [35]:
bow_doc_4310 = bow_corpus[document_num]

for i in range(0, len(bow_doc_4310)):
    print("Word {} (\"{}\") appears {} times".format(bow_doc_4310[i][0], dictionary[bow_doc_4310[i][0]], bow_doc_4310[i][1]))

Word 71 ("bushfir") appears 1 times
Word 107 ("help") appears 1 times
Word 462 ("rain") appears 1 times
Word 3530 ("dampen") appears 1 times


#### TFIDF on doc set

In [36]:
from gensim import models, corpora

In [41]:
tf_idf = models.TfidfModel(bow_corpus)

In [44]:
corpus_tfidf = tf_idf[bow_corpus]

In [45]:
from pprint import pprint

for doc in corpus_tfidf:
    pprint(doc)
    break

[(0, 0.5959813347777092),
 (1, 0.39204529549491984),
 (2, 0.48531419274988147),
 (3, 0.5055461098578569)]


In [58]:
train = True
#Run only first time
if train:
    lda_model = models.LdaMulticore(corpus=bow_corpus,
                            num_topics=10,
                            id2word=dictionary,
                            passes=2
                           )
    lda_model.save("./model/lda_model")

In [59]:

lda_model = models.LdaModel.load("./model/lda_model")

In [60]:
for idx, topic in lda_model.print_topics(-1):
    print("Topic : {} \nWords : {}".format(topic, idx))
    print("\n")

Topic : 0.015*"murder" + 0.012*"jail" + 0.010*"year" + 0.010*"drought" + 0.009*"plan" + 0.009*"charg" + 0.009*"report" + 0.009*"releas" + 0.009*"clear" + 0.008*"court" 
Words : 0


Topic : 0.043*"polic" + 0.019*"kill" + 0.017*"investig" + 0.012*"attack" + 0.012*"probe" + 0.011*"bomb" + 0.011*"secur" + 0.009*"victim" + 0.008*"seek" + 0.008*"accid" 
Words : 1


Topic : 0.018*"miss" + 0.013*"search" + 0.012*"polic" + 0.010*"take" + 0.008*"guilti" + 0.008*"toll" + 0.007*"win" + 0.007*"death" + 0.007*"head" + 0.006*"tiger" 
Words : 2


Topic : 0.020*"council" + 0.015*"polic" + 0.015*"drug" + 0.013*"court" + 0.012*"elect" + 0.011*"seek" + 0.009*"water" + 0.009*"urg" + 0.008*"green" + 0.008*"charg" 
Words : 3


Topic : 0.011*"protest" + 0.008*"market" + 0.008*"melbourn" + 0.007*"prompt" + 0.007*"rais" + 0.007*"gold" + 0.006*"bush" + 0.006*"urg" + 0.005*"concern" + 0.005*"price" 
Words : 4


Topic : 0.021*"fund" + 0.019*"boost" + 0.019*"govt" + 0.011*"offer" + 0.011*"consid" + 0.010*"indigen" 

In [61]:
if train:
    lda_model_tfidf = models.LdaMulticore(corpus=corpus_tfidf,
                                      num_topics=10,
                                      id2word=dictionary,
                                      passes=2
                                     )
    lda_model_tfidf.save("./model/lda_model_tfidf")
lda_model_tfidf = models.LdaModel.load("./model/lda_model_tfidf")

In [62]:
for idx, topic in lda_model_tfidf.print_topics(-1):
    print("Topic : {} \nWords : {}".format(topic, idx))
    print("\n")

Topic : 0.008*"iraq" + 0.007*"kill" + 0.007*"troop" + 0.006*"attack" + 0.006*"plan" + 0.006*"soldier" + 0.006*"govt" + 0.004*"polic" + 0.004*"test" + 0.004*"afghanistan" 
Words : 0


Topic : 0.006*"polic" + 0.006*"charg" + 0.005*"care" + 0.005*"climat" + 0.005*"child" + 0.004*"seek" + 0.004*"violenc" + 0.004*"govt" + 0.004*"health" + 0.004*"bail" 
Words : 1


Topic : 0.006*"polic" + 0.005*"assault" + 0.005*"govt" + 0.005*"station" + 0.004*"school" + 0.004*"council" + 0.004*"plan" + 0.004*"nurs" + 0.004*"stand" + 0.004*"candid" 
Words : 2


Topic : 0.018*"crash" + 0.014*"polic" + 0.013*"kill" + 0.007*"investig" + 0.007*"charg" + 0.007*"court" + 0.007*"accid" + 0.006*"fatal" + 0.006*"bomb" + 0.006*"death" 
Words : 3


Topic : 0.007*"charg" + 0.007*"polic" + 0.006*"drive" + 0.006*"drink" + 0.004*"nuclear" + 0.004*"plan" + 0.004*"face" + 0.004*"govt" + 0.004*"kangaroo" + 0.004*"right" 
Words : 4


Topic : 0.006*"murder" + 0.006*"coast" + 0.006*"guilti" + 0.006*"govt" + 0.005*"plead" + 0.00

In [63]:
preprocessed_docs[document_num]

['rain', 'help', 'dampen', 'bushfir']

In [64]:
for index, score in sorted(lda_model[bow_corpus[document_num]], key=lambda tup: -1*tup[1]):
    print("\nScore: {}\t \nTopic: {}".format(score, lda_model.print_topic(index, 10)))


Score: 0.8199232220649719	 
Topic: 0.020*"plan" + 0.017*"water" + 0.015*"council" + 0.012*"centr" + 0.009*"claim" + 0.009*"begin" + 0.007*"union" + 0.007*"worker" + 0.006*"rise" + 0.006*"land"

Score: 0.020015785470604897	 
Topic: 0.011*"protest" + 0.008*"market" + 0.008*"melbourn" + 0.007*"prompt" + 0.007*"rais" + 0.007*"gold" + 0.006*"bush" + 0.006*"urg" + 0.005*"concern" + 0.005*"price"

Score: 0.020013196393847466	 
Topic: 0.021*"fund" + 0.019*"boost" + 0.019*"govt" + 0.011*"offer" + 0.011*"consid" + 0.010*"indigen" + 0.010*"school" + 0.009*"push" + 0.009*"child" + 0.008*"worker"

Score: 0.0200100876390934	 
Topic: 0.020*"council" + 0.015*"polic" + 0.015*"drug" + 0.013*"court" + 0.012*"elect" + 0.011*"seek" + 0.009*"water" + 0.009*"urg" + 0.008*"green" + 0.008*"charg"

Score: 0.020007522776722908	 
Topic: 0.023*"urg" + 0.017*"govt" + 0.013*"plan" + 0.011*"chang" + 0.010*"iraq" + 0.010*"troop" + 0.010*"support" + 0.009*"talk" + 0.008*"stand" + 0.008*"nuclear"

Score: 0.020007511600

In [65]:
for index, score in sorted(lda_model_tfidf[bow_corpus[document_num]], key=lambda tup: -1*tup[1]):
    print("\nScore: {}\t \nTopic: {}".format(score, lda_model_tfidf.print_topic(index, 10)))


Score: 0.632253110408783	 
Topic: 0.019*"closer" + 0.011*"water" + 0.008*"rain" + 0.007*"drought" + 0.006*"govt" + 0.005*"council" + 0.005*"farmer" + 0.005*"hick" + 0.004*"restrict" + 0.004*"plan"

Score: 0.20766673982143402	 
Topic: 0.009*"blaze" + 0.006*"plan" + 0.006*"firefight" + 0.006*"council" + 0.005*"govt" + 0.005*"polic" + 0.005*"crew" + 0.004*"call" + 0.004*"titl" + 0.004*"develop"

Score: 0.020014846697449684	 
Topic: 0.008*"iraq" + 0.007*"kill" + 0.007*"troop" + 0.006*"attack" + 0.006*"plan" + 0.006*"soldier" + 0.006*"govt" + 0.004*"polic" + 0.004*"test" + 0.004*"afghanistan"

Score: 0.02001177705824375	 
Topic: 0.005*"say" + 0.005*"govt" + 0.004*"fund" + 0.004*"council" + 0.004*"cancer" + 0.004*"open" + 0.004*"australia" + 0.004*"patient" + 0.004*"lead" + 0.004*"urg"

Score: 0.020010005682706833	 
Topic: 0.007*"miss" + 0.006*"market" + 0.006*"search" + 0.005*"push" + 0.005*"public" + 0.005*"rise" + 0.005*"record" + 0.005*"govt" + 0.004*"plan" + 0.004*"price"

Score: 0.020

In [66]:

unseen_document = "My favorite sports activities are running and swimming."

# Data preprocessing step for the unseen document
bow_vector = dictionary.doc2bow(preprocess(unseen_document))

for index, score in sorted(lda_model[bow_vector], key=lambda tup: -1*tup[1]):
    print("Score: {}\t Topic: {}".format(score, lda_model.print_topic(index, 5)))

Score: 0.5590149760246277	 Topic: 0.011*"protest" + 0.008*"market" + 0.008*"melbourn" + 0.007*"prompt" + 0.007*"rais"
Score: 0.2809218466281891	 Topic: 0.016*"health" + 0.014*"govt" + 0.012*"warn" + 0.010*"closer" + 0.009*"england"
Score: 0.02001125179231167	 Topic: 0.020*"plan" + 0.017*"water" + 0.015*"council" + 0.012*"centr" + 0.009*"claim"
Score: 0.020009081810712814	 Topic: 0.014*"hospit" + 0.014*"final" + 0.013*"test" + 0.010*"charg" + 0.009*"play"
Score: 0.02000868134200573	 Topic: 0.020*"council" + 0.015*"polic" + 0.015*"drug" + 0.013*"court" + 0.012*"elect"
Score: 0.020007474347949028	 Topic: 0.043*"polic" + 0.019*"kill" + 0.017*"investig" + 0.012*"attack" + 0.012*"probe"
Score: 0.020007459446787834	 Topic: 0.015*"murder" + 0.012*"jail" + 0.010*"year" + 0.010*"drought" + 0.009*"plan"
Score: 0.020007353276014328	 Topic: 0.018*"miss" + 0.013*"search" + 0.012*"polic" + 0.010*"take" + 0.008*"guilti"
Score: 0.020005952566862106	 Topic: 0.023*"urg" + 0.017*"govt" + 0.013*"plan" + 0.