In [1]:
import pandas as pd
import gensim
import numpy as np
import nltk

nltk.download('wordnet')

from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
from gensim import corpora, models
from pprint import pprint
from tqdm.notebook import tqdm

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/beijiayu/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


---
## Step 1: Load the dataset

In [2]:
data = pd.read_csv('./data/abcnews-date-text.csv', error_bad_lines=False)

data_text = data[:300000][['headline_text']];
data_text['index'] = data_text.index

documents = data_text

In [3]:
'''
Get the total number of documents
'''
print(len(documents))

300000


In [4]:
documents[:5]

Unnamed: 0,headline_text,index
0,aba decides against community broadcasting lic...,0
1,act fire witnesses must be aware of defamation,1
2,a g calls for infrastructure protection summit,2
3,air nz staff in aust strike for pay rise,3
4,air nz strike to affect australian travellers,4


--- 
## Step 2: Data Preprocessing

In [5]:
np.random.seed(400)

In [6]:
print(WordNetLemmatizer().lemmatize('went', pos = 'v')) # past tense to present tense

go


In [7]:
stemmer = SnowballStemmer("english")
original_words = ['caresses', 'flies', 'dies', 'mules', 'denied','died', 'agreed', 'owned', 
           'humbled', 'sized','meeting', 'stating', 'siezing', 'itemization','sensational', 
           'traditional', 'reference', 'colonizer','plotted']
singles = [stemmer.stem(plural) for plural in original_words]

pd.DataFrame(data={'original word':original_words, 'stemmed':singles })

Unnamed: 0,original word,stemmed
0,caresses,caress
1,flies,fli
2,dies,die
3,mules,mule
4,denied,deni
5,died,die
6,agreed,agre
7,owned,own
8,humbled,humbl
9,sized,size


In [8]:
'''
Write a function to perform the pre processing steps on the entire dataset
'''
def lemmatize_stemming(text):
    return stemmer.stem(WordNetLemmatizer().lemmatize(text, pos='v'))

# Tokenize and lemmatize
def preprocess(text):
    result=[]
    for token in gensim.utils.simple_preprocess(text) :
        if token not in gensim.parsing.preprocessing.STOPWORDS and len(token) > 3:
            result.append(lemmatize_stemming(token))
    return result

In [9]:
'''
Preview a document after preprocessing
'''
document_num = 4310
doc_sample = documents[documents['index'] == document_num].values[0][0]

print("Original document: ")
words = []
for word in doc_sample.split(' '):
    words.append(word)
print(words)
print("\n\nTokenized and lemmatized document: ")
print(preprocess(doc_sample))

Original document: 
['rain', 'helps', 'dampen', 'bushfires']


Tokenized and lemmatized document: 
['rain', 'help', 'dampen', 'bushfir']


In [10]:
documents

Unnamed: 0,headline_text,index
0,aba decides against community broadcasting lic...,0
1,act fire witnesses must be aware of defamation,1
2,a g calls for infrastructure protection summit,2
3,air nz staff in aust strike for pay rise,3
4,air nz strike to affect australian travellers,4
...,...,...
299995,broughton hall audit reveals serious breaches,299995
299996,broughton hall fails key standards,299996
299997,broughton hall safe for residents govt says,299997
299998,burn off at conservation park aims to prevent,299998


In [11]:
processed_docs = [preprocess(text) for text in tqdm(documents['headline_text'])]

HBox(children=(FloatProgress(value=0.0, max=300000.0), HTML(value='')))




In [12]:
processed_docs[:10]

[['decid', 'communiti', 'broadcast', 'licenc'],
 ['wit', 'awar', 'defam'],
 ['call', 'infrastructur', 'protect', 'summit'],
 ['staff', 'aust', 'strike', 'rise'],
 ['strike', 'affect', 'australian', 'travel'],
 ['ambiti', 'olsson', 'win', 'tripl', 'jump'],
 ['antic', 'delight', 'record', 'break', 'barca'],
 ['aussi', 'qualifi', 'stosur', 'wast', 'memphi', 'match'],
 ['aust', 'address', 'secur', 'council', 'iraq'],
 ['australia', 'lock', 'timet']]

---
## Step 3.1: Bag of words on the dataset

In [13]:
'''
Create a dictionary from 'processed_docs' containing the number of times a word appears 
in the training set using gensim.corpora.Dictionary and call it 'dictionary'
'''
dictionary = gensim.corpora.Dictionary(processed_docs)

In [14]:
'''
Checking dictionary created
'''
count = 0
for k, v in dictionary.iteritems():
    print(k, v)
    count += 1
    if count > 10:
        break

0 broadcast
1 communiti
2 decid
3 licenc
4 awar
5 defam
6 wit
7 call
8 infrastructur
9 protect
10 summit


In [15]:
dictionary.filter_extremes(no_below=15, no_above=0.1)

In [16]:
'''
Create the Bag-of-words model for each document i.e for each document we create a dictionary reporting how many
words and how many times those words appear. Save this to 'bow_corpus'
'''
bow_corpus = [dictionary.doc2bow(doc) for index, doc in enumerate(tqdm(processed_docs))]

HBox(children=(FloatProgress(value=0.0, max=300000.0), HTML(value='')))




In [17]:
'''
Checking Bag of Words corpus for our sample document --> (token_id, token_count)
'''
bow_corpus[document_num]

[(71, 1), (107, 1), (462, 1), (3530, 1)]

In [18]:
'''
Preview BOW for our sample preprocessed document
'''
# Here document_num is document number 4310 which we have checked in Step 2
bow_doc_4310 = bow_corpus[document_num]

for i in range(len(bow_doc_4310)):
    print("Word {} (\"{}\") appears {} time.".format(bow_doc_4310[i][0], 
                                                     dictionary[bow_doc_4310[i][0]], 
                                                     bow_doc_4310[i][1]))

Word 71 ("bushfir") appears 1 time.
Word 107 ("help") appears 1 time.
Word 462 ("rain") appears 1 time.
Word 3530 ("dampen") appears 1 time.


---
## Step 3.2: TF-IDF on our document set

In [19]:
tfidf = models.TfidfModel(bow_corpus)

In [20]:
corpus_tfidf = [tfidf[element] for element in tqdm(bow_corpus)]

HBox(children=(FloatProgress(value=0.0, max=300000.0), HTML(value='')))




In [21]:
for doc in corpus_tfidf:
    pprint(doc)
    break

[(0, 0.5959813347777092),
 (1, 0.39204529549491984),
 (2, 0.48531419274988147),
 (3, 0.5055461098578569)]


---
## Step 4.1: Running LDA using Bag of Words

In [22]:
lda_model = gensim.models.LdaMulticore(bow_corpus, 
                                   num_topics = 10, 
                                   id2word = dictionary,                                    
                                   passes = 10,
                                   workers = 2)

In [23]:
'''
For each topic, we will explore the words occuring in that topic and its relative weight
'''
for idx, topic in lda_model.print_topics(-1):
    print("Topic: {} \nWords: {}".format(topic, idx ))
    print("\n")

Topic: 0.025*"open" + 0.021*"test" + 0.018*"world" + 0.017*"win" + 0.015*"lead" + 0.014*"south" + 0.012*"aussi" + 0.012*"take" + 0.012*"timor" + 0.011*"strike" 
Words: 0


Topic: 0.035*"report" + 0.030*"help" + 0.018*"deal" + 0.017*"blaze" + 0.015*"inquiri" + 0.013*"close" + 0.013*"firefight" + 0.012*"bushfir" + 0.012*"trade" + 0.011*"river" 
Words: 1


Topic: 0.039*"crash" + 0.022*"closer" + 0.021*"road" + 0.017*"die" + 0.016*"coast" + 0.014*"train" + 0.014*"dead" + 0.012*"kill" + 0.012*"gold" + 0.012*"north" 
Words: 2


Topic: 0.041*"plan" + 0.036*"govt" + 0.033*"council" + 0.029*"water" + 0.022*"urg" + 0.018*"fund" + 0.016*"group" + 0.012*"boost" + 0.012*"chang" + 0.011*"concern" 
Words: 3


Topic: 0.023*"hospit" + 0.022*"labor" + 0.019*"defend" + 0.018*"elect" + 0.016*"govt" + 0.016*"protest" + 0.015*"minist" + 0.013*"chief" + 0.013*"work" + 0.013*"begin" 
Words: 4


Topic: 0.045*"warn" + 0.020*"fight" + 0.017*"nuclear" + 0.017*"england" + 0.017*"year" + 0.014*"action" + 0.014*"thr

---
## Step 4.2 Running LDA using TF-IDF

In [24]:
lda_model_tfidf = gensim.models.LdaMulticore(corpus_tfidf, 
                                   num_topics = 10, 
                                   id2word = dictionary,                                    
                                   passes = 10,
                                   workers = 2)

In [25]:
'''
For each topic, we will explore the words occuring in that topic and its relative weight
'''
for idx, topic in lda_model_tfidf.print_topics(-1):
    print("Topic: {} Word: {}".format(idx, topic))
    print("\n")

Topic: 0 Word: 0.040*"closer" + 0.009*"cyclon" + 0.009*"tiger" + 0.008*"market" + 0.008*"england" + 0.008*"victori" + 0.007*"blue" + 0.006*"season" + 0.006*"open" + 0.006*"lead"


Topic: 1 Word: 0.023*"kill" + 0.021*"crash" + 0.012*"die" + 0.011*"timor" + 0.010*"troop" + 0.010*"iraq" + 0.010*"south" + 0.009*"injur" + 0.009*"dead" + 0.008*"bomb"


Topic: 2 Word: 0.023*"miss" + 0.019*"search" + 0.013*"rudd" + 0.011*"rate" + 0.008*"airport" + 0.008*"polic" + 0.008*"level" + 0.008*"escap" + 0.007*"post" + 0.007*"profit"


Topic: 3 Word: 0.014*"fatal" + 0.009*"qanta" + 0.008*"hill" + 0.008*"anti" + 0.007*"break" + 0.007*"law" + 0.007*"terror" + 0.006*"iemma" + 0.006*"speed" + 0.006*"plant"


Topic: 4 Word: 0.023*"charg" + 0.022*"polic" + 0.017*"court" + 0.015*"murder" + 0.012*"investig" + 0.012*"jail" + 0.011*"blaze" + 0.011*"face" + 0.010*"assault" + 0.010*"drug"


Topic: 5 Word: 0.012*"gold" + 0.010*"award" + 0.009*"win" + 0.008*"coast" + 0.008*"takeov" + 0.008*"world" + 0.007*"honour" + 

---
## Step 5.1: Performance evaluation by classifying sample document using LDA Bag of Words model

In [27]:
processed_docs[4310]

['rain', 'help', 'dampen', 'bushfir']

In [28]:
document_num = 4310

In [29]:
for index, score in sorted(lda_model[bow_corpus[document_num]], key=lambda tup: -1*tup[1]):
    print("\nScore: {}\t \nTopic: {}".format(score, lda_model.print_topic(index, 10)))


Score: 0.619979202747345	 
Topic: 0.035*"report" + 0.030*"help" + 0.018*"deal" + 0.017*"blaze" + 0.015*"inquiri" + 0.013*"close" + 0.013*"firefight" + 0.012*"bushfir" + 0.012*"trade" + 0.011*"river"

Score: 0.2200065404176712	 
Topic: 0.020*"school" + 0.019*"rise" + 0.019*"farmer" + 0.019*"drought" + 0.017*"price" + 0.015*"high" + 0.014*"market" + 0.013*"rain" + 0.012*"feder" + 0.011*"safeti"

Score: 0.020001783967018127	 
Topic: 0.025*"open" + 0.021*"test" + 0.018*"world" + 0.017*"win" + 0.015*"lead" + 0.014*"south" + 0.012*"aussi" + 0.012*"take" + 0.012*"timor" + 0.011*"strike"

Score: 0.020001783967018127	 
Topic: 0.039*"crash" + 0.022*"closer" + 0.021*"road" + 0.017*"die" + 0.016*"coast" + 0.014*"train" + 0.014*"dead" + 0.012*"kill" + 0.012*"gold" + 0.012*"north"

Score: 0.020001783967018127	 
Topic: 0.041*"plan" + 0.036*"govt" + 0.033*"council" + 0.029*"water" + 0.022*"urg" + 0.018*"fund" + 0.016*"group" + 0.012*"boost" + 0.012*"chang" + 0.011*"concern"

Score: 0.0200017839670181

---
## Step 5.2: Performance evaluation by classifying sample document using LDA TF-IDF model

In [30]:
'''
Check which topic our test document belongs to using the LDA TF-IDF model.
'''
# Our test document is document number 4310
for index, score in sorted(lda_model_tfidf[bow_corpus[document_num]], key=lambda tup: -1*tup[1]):
    print("\nScore: {}\t \nTopic: {}".format(score, lda_model_tfidf.print_topic(index, 10)))


Score: 0.8199693560600281	 
Topic: 0.017*"water" + 0.015*"govt" + 0.012*"plan" + 0.011*"council" + 0.010*"fund" + 0.009*"urg" + 0.008*"drought" + 0.008*"boost" + 0.007*"farmer" + 0.007*"hick"

Score: 0.020007479935884476	 
Topic: 0.023*"charg" + 0.022*"polic" + 0.017*"court" + 0.015*"murder" + 0.012*"investig" + 0.012*"jail" + 0.011*"blaze" + 0.011*"face" + 0.010*"assault" + 0.010*"drug"

Score: 0.020004011690616608	 
Topic: 0.009*"govt" + 0.008*"action" + 0.008*"grower" + 0.007*"wind" + 0.007*"councillor" + 0.007*"farm" + 0.007*"teacher" + 0.006*"council" + 0.006*"rat" + 0.006*"urg"

Score: 0.020003804937005043	 
Topic: 0.009*"climat" + 0.009*"cancer" + 0.007*"light" + 0.007*"approv" + 0.006*"export" + 0.006*"price" + 0.006*"coal" + 0.006*"patient" + 0.006*"petrol" + 0.006*"green"

Score: 0.020002786070108414	 
Topic: 0.023*"miss" + 0.019*"search" + 0.013*"rudd" + 0.011*"rate" + 0.008*"airport" + 0.008*"polic" + 0.008*"level" + 0.008*"escap" + 0.007*"post" + 0.007*"profit"

Score: 0.

---
## Step 6: Testing model on unseen documen

In [31]:
unseen_document = "My favorite sports activities are running and swimming."

# Data preprocessing step for the unseen document
bow_vector = dictionary.doc2bow(preprocess(unseen_document))

for index, score in sorted(lda_model[bow_vector], key=lambda tup: -1*tup[1]):
    print("Score: {}\t Topic: {}".format(score, lda_model.print_topic(index, 5)))

Score: 0.42001497745513916	 Topic: 0.025*"open" + 0.021*"test" + 0.018*"world" + 0.017*"win" + 0.015*"lead"
Score: 0.22000519931316376	 Topic: 0.028*"iraq" + 0.018*"talk" + 0.016*"hold" + 0.016*"australia" + 0.015*"troop"
Score: 0.21993423998355865	 Topic: 0.023*"hospit" + 0.022*"labor" + 0.019*"defend" + 0.018*"elect" + 0.016*"govt"
Score: 0.020015157759189606	 Topic: 0.045*"warn" + 0.020*"fight" + 0.017*"nuclear" + 0.017*"england" + 0.017*"year"
Score: 0.02000507153570652	 Topic: 0.035*"report" + 0.030*"help" + 0.018*"deal" + 0.017*"blaze" + 0.015*"inquiri"
Score: 0.02000507153570652	 Topic: 0.039*"crash" + 0.022*"closer" + 0.021*"road" + 0.017*"die" + 0.016*"coast"
Score: 0.02000507153570652	 Topic: 0.041*"plan" + 0.036*"govt" + 0.033*"council" + 0.029*"water" + 0.022*"urg"
Score: 0.02000507153570652	 Topic: 0.020*"school" + 0.019*"rise" + 0.019*"farmer" + 0.019*"drought" + 0.017*"price"
Score: 0.02000507153570652	 Topic: 0.073*"polic" + 0.031*"charg" + 0.027*"court" + 0.026*"face" 

In [32]:
unseen_document = "My favorite sports activities are running and swimming."

# Data preprocessing step for the unseen document
bow_vector = dictionary.doc2bow(preprocess(unseen_document))

for index, score in sorted(lda_model_tfidf[bow_vector], key=lambda tup: -1*tup[1]):
    print("Score: {}\t Topic: {}".format(score, lda_model.print_topic(index, 5)))

Score: 0.6199879050254822	 Topic: 0.045*"warn" + 0.020*"fight" + 0.017*"nuclear" + 0.017*"england" + 0.017*"year"
Score: 0.21997198462486267	 Topic: 0.073*"polic" + 0.031*"charg" + 0.027*"court" + 0.026*"face" + 0.020*"kill"
Score: 0.02001383528113365	 Topic: 0.041*"plan" + 0.036*"govt" + 0.033*"council" + 0.029*"water" + 0.022*"urg"
Score: 0.02000511810183525	 Topic: 0.025*"open" + 0.021*"test" + 0.018*"world" + 0.017*"win" + 0.015*"lead"
Score: 0.02000470645725727	 Topic: 0.028*"iraq" + 0.018*"talk" + 0.016*"hold" + 0.016*"australia" + 0.015*"troop"
Score: 0.020003294572234154	 Topic: 0.035*"report" + 0.030*"help" + 0.018*"deal" + 0.017*"blaze" + 0.015*"inquiri"
Score: 0.020003294572234154	 Topic: 0.039*"crash" + 0.022*"closer" + 0.021*"road" + 0.017*"die" + 0.016*"coast"
Score: 0.020003294572234154	 Topic: 0.023*"hospit" + 0.022*"labor" + 0.019*"defend" + 0.018*"elect" + 0.016*"govt"
Score: 0.020003294572234154	 Topic: 0.020*"school" + 0.019*"rise" + 0.019*"farmer" + 0.019*"drought"