**Load Libraries**

In [116]:
!pip install --upgrade gensim

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [117]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.ticker import FuncFormatter

%matplotlib inline
plt.style.use('ggplot')
plt.rcParams['figure.figsize'] = (14.0, 8.7)
#warnings.filterwarnings('ignore')
pd.options.display.float_format = '{:,.2f}'.format


In [118]:
!pip install pyLDAvis

import pyLDAvis
import pyLDAvis.gensim_models

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [119]:
import nltk
from gensim import corpora
from gensim.corpora import Dictionary
from nltk.stem import WordNetLemmatizer
from nltk.stem import WordNetLemmatizer
import re
from nltk.corpus import stopwords

import glob
import os

# For displaying images in ipython
from IPython.display import HTML, display

**Load Sample Data**

In [120]:
sample = ["He should resign as CM and not continue in the hope of getting elected within six months",
          "Deaths during manual cleaning of sewage are unacceptable",
          "Even countries that don’t join the coalition will benefit from the transparency, says U.S. Treasury Secretary",
          "The drop in the reserves during the week ended August 19 was due to a fall in the Foreign Currency Assets (FCA) and the gold reserves",
          "Murder in the sewer","To allot the task of removing excreta and cleaning sewers to humans when machines are able to do the work is a gross violation of rights."]

In [121]:
sample=pd.DataFrame(sample)
sample.reset_index(inplace=True)
sample.columns=['index','Heading']
sample

Unnamed: 0,index,Heading
0,0,He should resign as CM and not continue in the...
1,1,Deaths during manual cleaning of sewage are un...
2,2,Even countries that don’t join the coalition w...
3,3,The drop in the reserves during the week ended...
4,4,Murder in the sewer
5,5,To allot the task of removing excreta and clea...


**Extract Raw Corpus**

In [122]:
sample_1 = sample.Heading.tolist()

In [123]:
print(type(sample_1))
print(sample_1[0:2])

<class 'list'>
['He should resign as CM and not continue in the hope of getting elected within six months', 'Deaths during manual cleaning of sewage are unacceptable']


In [124]:
wordnet_lemmatizer = WordNetLemmatizer()

In [125]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [126]:
from nltk.corpus import stopwords

In [127]:
stopwords = stopwords.words('english')

**Preprocessing of Raw Text**

In [128]:
# Method to preprocess my raw data
def preprocessText(x):
    temp = x.lower()
    temp = re.sub(r'[^\w]', ' ', temp)
    temp = nltk.word_tokenize(temp)
    temp = [wordnet_lemmatizer.lemmatize(w) for w in temp] # refind version of steeming, will remove true ing from root word
    temp = [word for word in temp if word not in stopwords ]
    return temp

In [129]:
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [130]:
articles_final = [preprocessText(Heading) for Heading in sample_1] # list compherension over each sample in list of sample

In [131]:
articles_final[0:2]

[['resign',
  'cm',
  'continue',
  'hope',
  'getting',
  'elected',
  'within',
  'six',
  'month'],
 ['death', 'manual', 'cleaning', 'sewage', 'unacceptable']]

Thses are bag of words or individual token

bag of words is tokenize word

**Transformation of Preprocessed text into Vector form using Gensim**

In [132]:
# Create a dictionary representation of the documents.
dictionary = Dictionary(articles_final)

# Filter out words that occur less than 20 documents, or more than 50% of the documents.
#dictionary.filter_extremes(no_below=20, no_above=0.5)

In [133]:
print(dictionary)

Dictionary<51 unique tokens: ['cm', 'continue', 'elected', 'getting', 'hope']...>


In [104]:
# Bag-of-words representation of the documents.
corpus = [dictionary.doc2bow(doc) for doc in articles_final]

Bag of word will create index and each word in the vocabulary , it will give a numeric value to it, basically like LabelEncoding, and it will store in valriable know as corpus.

In [150]:
corpus

[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1), (8, 1)],
 [(9, 1), (10, 1), (11, 1), (12, 1), (13, 1)],
 [(14, 1),
  (15, 1),
  (16, 1),
  (17, 1),
  (18, 1),
  (19, 1),
  (20, 1),
  (21, 1),
  (22, 1),
  (23, 1)],
 [(24, 1),
  (25, 1),
  (26, 1),
  (27, 1),
  (28, 1),
  (29, 1),
  (30, 1),
  (31, 1),
  (32, 1),
  (33, 1),
  (34, 1),
  (35, 2),
  (36, 1),
  (37, 1)],
 [(38, 1), (39, 1)],
 [(9, 1),
  (39, 1),
  (40, 1),
  (41, 1),
  (42, 1),
  (43, 1),
  (44, 1),
  (45, 1),
  (46, 1),
  (47, 1),
  (48, 1),
  (49, 1),
  (50, 1)]]

In [151]:
print('Number of unique tokens: %d' % len(dictionary))
print('Number of documents: %d' % len(corpus))

Number of unique tokens: 51
Number of documents: 6


**Train LDA model using Gensim**

In [152]:
#Train LDA Model
from gensim.models import LdaModel
#setting training parameters
num_topics=4
chunksize=2000
passes=10
eval_every = None  # Don't evaluate model perplexity, takes too much time.

# Make a index to word dictionary.
temp = dictionary[0]  # This is only to "load" the dictionary.
id2word = dictionary.id2token
# print(id2word)

model=LdaModel(
    corpus=corpus,
    id2word=id2word,
    chunksize=chunksize,
    alpha='auto',
    eta='auto',
    #iterations=iterations,
    num_topics=num_topics,
    passes=passes,
    eval_every=eval_every)

**Model exploration: Top K words in each topic**

In [137]:
import pprint

In [138]:
# Print the Keyword in the 3 topics
pprint.pprint(model.print_topics(num_words= 10))
doc_lda = model[corpus]

[(0,
  '0.059*"reserve" + 0.033*"u" + 0.033*"join" + 0.033*"coalition" + '
  '0.033*"even" + 0.033*"benefit" + 0.033*"secretary" + 0.033*"treasury" + '
  '0.033*"say" + 0.033*"transparency"'),
 (1,
  '0.036*"sewer" + 0.036*"cleaning" + 0.036*"right" + 0.036*"hope" + '
  '0.036*"task" + 0.036*"able" + 0.036*"excreta" + 0.036*"machine" + '
  '0.036*"violation" + 0.036*"human"'),
 (2,
  '0.085*"murder" + 0.085*"sewer" + 0.017*"unacceptable" + 0.017*"cleaning" + '
  '0.017*"treasury" + 0.017*"say" + 0.017*"benefit" + 0.017*"transparency" + '
  '0.017*"country" + 0.017*"secretary"'),
 (3,
  '0.070*"death" + 0.070*"sewage" + 0.070*"manual" + 0.070*"cleaning" + '
  '0.070*"unacceptable" + 0.014*"sewer" + 0.014*"murder" + 0.014*"resign" + '
  '0.014*"getting" + 0.014*"transparency"')]


**Model Visualization using PyLDAvis**

In [139]:
import pyLDAvis.gensim_models as gensimvis
pyLDAvis.enable_notebook()
lda_viz = gensimvis.prepare(model, corpus, dictionary)

  by='saliency', ascending=False).head(R).drop('saliency', 1)


In [140]:
vis = gensimvis.prepare(model, corpus, dictionary=dictionary)
vis


**Assign Topic Model Numbers to original Data Frame as Column**

In [141]:
# Assigns the topics to the documents in corpus
lda_corpus = model[corpus]

In [142]:
lda_corpus

<gensim.interfaces.TransformedCorpus at 0x7f824829a310>

In [143]:
topics = []

for doc in lda_corpus:
    temp_id = []
    temp_score = []
    for doc_tuple in doc:
        temp_id.append(doc_tuple[0])
        temp_score.append(doc_tuple[1])
    index = np.argmax(temp_score)
    topics.append(temp_id[index])

In [144]:
sample["Topic_num"] = topics

In [146]:
sample.head(n= 4)

Unnamed: 0,index,Heading,Topic_num
0,0,He should resign as CM and not continue in the...,1
1,1,Deaths during manual cleaning of sewage are un...,3
2,2,Even countries that don’t join the coalition w...,0
3,3,The drop in the reserves during the week ended...,0


In [147]:
sample

Unnamed: 0,index,Heading,Topic_num
0,0,He should resign as CM and not continue in the...,1
1,1,Deaths during manual cleaning of sewage are un...,3
2,2,Even countries that don’t join the coalition w...,0
3,3,The drop in the reserves during the week ended...,0
4,4,Murder in the sewer,2
5,5,To allot the task of removing excreta and clea...,1
