# Topic Modelling using LDA - Latent Drichilet Allocation

**Dependenices**

In [149]:
#Dependencies
import pandas as pd
import gensim #the library for Topic modelling
from gensim.models.ldamulticore import LdaMulticore
from gensim import corpora, models
import pyLDAvis.gensim #LDA visualization library

from nltk.corpus import stopwords
import string
from nltk.stem.wordnet import WordNetLemmatizer

import warnings
warnings.simplefilter('ignore')
from itertools import chain

**Read News articles pickle file**

In [150]:
df = pd.read_pickle('news_topic_modelling.pkl')
print(df.shape)
df

(10, 2)


Unnamed: 0,topic,text
0,corona,\n E-commerce companies can deliver essential ...
1,corona,\nGoa Chief Minister Pramod Sawant on Sunday s...
2,corona,\n Tamil Nadu has extended the coronavirus loc...
3,corona,\nSports complexes and stadia were on Sunday p...
4,corona,"\nMaharashtra on Sunday reported 2,347 coronav..."
5,iphone,\nThe news comes from Front Page Tech’s Jon Pr...
6,iphone,"\nYou can count on death, taxes, and a steady ..."
7,iphone,\nApple had been hoping to move past its recen...
8,iphone,\nThe iPhone 12 is supposed to deliver a small...
9,iphone,"\nThanks to a flurry of exciting leaks, Apple’..."


**Clean the data**

In [151]:
#clean the data
stop = set(stopwords.words('english'))
exclude = set(string.punctuation)
lemma = WordNetLemmatizer()

def clean(text):
    stop_free = ' '.join([word for word in text.lower().split() if word not in stop])
    punc_free = ''.join(ch for ch in stop_free if ch not in exclude)
    normalized = ' '.join([lemma.lemmatize(word) for word in punc_free.split()])
    return normalized.split()

In [152]:
df['text_clean']=df['text'].apply(clean)

**Dataframe after cleaning**

In [153]:
df

Unnamed: 0,topic,text,text_clean
0,corona,\n E-commerce companies can deliver essential ...,"[ecommerce, company, deliver, essential, nones..."
1,corona,\nGoa Chief Minister Pramod Sawant on Sunday s...,"[goa, chief, minister, pramod, sawant, sunday,..."
2,corona,\n Tamil Nadu has extended the coronavirus loc...,"[tamil, nadu, extended, coronavirus, lockdown,..."
3,corona,\nSports complexes and stadia were on Sunday p...,"[sport, complex, stadium, sunday, permitted, o..."
4,corona,"\nMaharashtra on Sunday reported 2,347 coronav...","[maharashtra, sunday, reported, 2347, coronavi..."
5,iphone,\nThe news comes from Front Page Tech’s Jon Pr...,"[news, come, front, page, tech’s, jon, prosser..."
6,iphone,"\nYou can count on death, taxes, and a steady ...","[count, death, tax, steady, stream, shiny, new..."
7,iphone,\nApple had been hoping to move past its recen...,"[apple, hoping, move, past, recent, run, secur..."
8,iphone,\nThe iPhone 12 is supposed to deliver a small...,"[iphone, 12, supposed, deliver, smaller, notch..."
9,iphone,"\nThanks to a flurry of exciting leaks, Apple’...","[thanks, flurry, exciting, leak, apple’s, ipho..."


**Create Dictionary from the articles**

In [170]:
#create dictionary
dictionary = corpora.Dictionary(df['text_clean'])
#Total number of non-zeroes in the BOW matrix (sum of the number of unique words per document over the entire corpus).
print(dictionary.num_nnz)

2130


**Create document term matrix**

In [171]:
#create document term matrix
doc_term_matrix = [dictionary.doc2bow(doc) for doc in df['text_clean'] ]
print(len(doc_term_matrix))

10


**Instantiate LDA model**

In [156]:
lda = gensim.models.ldamodel.LdaModel

**Fit LDA model on the dataset**

In [175]:
num_topics=3
%time ldamodel = lda(doc_term_matrix,num_topics=num_topics,id2word=dictionary,passes=50,minimum_probability=0)

Wall time: 641 ms


**Print the topics identified by LDA model**

In [176]:
ldamodel.print_topics(num_topics=num_topics)

[(0,
  '0.018*"iphone" + 0.016*"apple" + 0.009*"ipad" + 0.009*"new" + 0.008*"12" + 0.008*"io" + 0.008*"security" + 0.007*"user" + 0.006*"device" + 0.006*"tap"'),
 (1,
  '0.026*"iphone" + 0.015*"apple" + 0.008*"charging" + 0.007*"prosser" + 0.007*"portless" + 0.007*"lockdown" + 0.007*"would" + 0.007*"port" + 0.007*"zone" + 0.007*"new"'),
 (2,
  '0.013*"state" + 0.012*"said" + 0.010*"case" + 0.006*"covid19" + 0.006*"government" + 0.006*"lockdown" + 0.006*"allowed" + 0.006*"may" + 0.006*"district" + 0.006*"train"')]

**Visualize the LDA model results**

In [177]:
lda_display = pyLDAvis.gensim.prepare(ldamodel, doc_term_matrix, dictionary, sort_topics=False, mds='mmds')
pyLDAvis.display(lda_display)

**Find which articles were marked in which cluster**

In [160]:
# Assigns the topics to the documents in corpus
lda_corpus = ldamodel[doc_term_matrix]

In [161]:
[doc for doc in lda_corpus]

[[(0, 0.0018114314), (1, 0.99638116), (2, 0.0018074277)],
 [(0, 0.0021974982), (1, 0.0027093685), (2, 0.99509317)],
 [(0, 0.62201446), (1, 0.3768041), (2, 0.0011814386)],
 [(0, 0.0017273024), (1, 0.9965288), (2, 0.0017438647)],
 [(0, 0.0023968231), (1, 0.9952025), (2, 0.0024006674)],
 [(0, 0.9979319), (1, 0.0009849896), (2, 0.0010830966)],
 [(0, 0.0005029842), (1, 0.00048812397), (2, 0.9990089)],
 [(0, 0.99837416), (1, 0.00080488826), (2, 0.00082096114)],
 [(0, 0.00135423), (1, 0.0011218124), (2, 0.99752396)],
 [(0, 0.9986003), (1, 0.0006869496), (2, 0.00071279914)]]

In [162]:
scores = list(chain(*[[score for topic_id,score in topic] \
                      for topic in [doc for doc in lda_corpus]]))

threshold = sum(scores)/len(scores)
print(threshold)

0.3333333344722632


In [163]:
cluster1 = [j for i,j in zip(lda_corpus,df.index) if i[0][1] > threshold]
cluster2 = [j for i,j in zip(lda_corpus,df.index) if i[1][1] > threshold]
cluster3 = [j for i,j in zip(lda_corpus,df.index) if i[2][1] > threshold]
# cluster4 = [j for i,j in zip(lda_corpus,df.index) if i[3][1] > threshold]
# cluster5 = [j for i,j in zip(lda_corpus,df.index) if i[4][1] > threshold]

print(len(cluster1))
print(len(cluster2))
print(len(cluster3))
# print(len(cluster4))
# print(len(cluster5))

4
4
3


In [164]:
df.iloc[cluster1]

Unnamed: 0,topic,text,text_clean
2,corona,\n Tamil Nadu has extended the coronavirus loc...,"[tamil, nadu, extended, coronavirus, lockdown,..."
5,iphone,\nThe news comes from Front Page Tech’s Jon Pr...,"[news, come, front, page, tech’s, jon, prosser..."
7,iphone,\nApple had been hoping to move past its recen...,"[apple, hoping, move, past, recent, run, secur..."
9,iphone,"\nThanks to a flurry of exciting leaks, Apple’...","[thanks, flurry, exciting, leak, apple’s, ipho..."


In [165]:
df.iloc[cluster2]

Unnamed: 0,topic,text,text_clean
0,corona,\n E-commerce companies can deliver essential ...,"[ecommerce, company, deliver, essential, nones..."
2,corona,\n Tamil Nadu has extended the coronavirus loc...,"[tamil, nadu, extended, coronavirus, lockdown,..."
3,corona,\nSports complexes and stadia were on Sunday p...,"[sport, complex, stadium, sunday, permitted, o..."
4,corona,"\nMaharashtra on Sunday reported 2,347 coronav...","[maharashtra, sunday, reported, 2347, coronavi..."


In [166]:
df.iloc[cluster3]

Unnamed: 0,topic,text,text_clean
1,corona,\nGoa Chief Minister Pramod Sawant on Sunday s...,"[goa, chief, minister, pramod, sawant, sunday,..."
6,iphone,"\nYou can count on death, taxes, and a steady ...","[count, death, tax, steady, stream, shiny, new..."
8,iphone,\nThe iPhone 12 is supposed to deliver a small...,"[iphone, 12, supposed, deliver, smaller, notch..."
