In [58]:
# Reference tutorial: https://stackabuse.com/python-for-nlp-topic-modeling/

## Latent Dirichlet Allocation (LDA) algorithm

In [4]:
import pandas as pd
import numpy as np

# Load the cleaned September 2019 headlines.
# NOTE(review): this is an absolute, machine-specific path; consider moving it
# to a configurable data directory so the notebook runs elsewhere.
reviews_datasets = pd.read_excel('/Users/Suwani/Desktop/Moodys Project/Cleaned data/2019/sep19_cleaned.xlsx')

# DataFrame.dropna() returns a new frame instead of modifying the original, so
# the result must be assigned back. Without the assignment, rows with missing
# values stay in `reviews_datasets` and are passed on to every later cell.
reviews_datasets = reviews_datasets.dropna()
reviews_datasets

Unnamed: 0.1,Unnamed: 0,Headline,Date,Year,Month,Day
0,0,“Sri Lanka is running out of time to agree on ...,2019-09-29,2019,9,29
1,1,A 4th coal power plant for Sri Lanka?,2019-09-29,2019,9,29
2,2,Sri Lanka’s UN delegation meets with Under Sec...,2019-09-28,2019,9,28
3,3,Sri Lanka welcomes proposal for Maldives’ re-a...,2019-09-28,2019,9,28
4,4,Sri Lanka celebrates World Tourism Day 2019 wi...,2019-09-27,2019,9,27
5,5,Sri Lanka arrives in Karachi for SL vs Pak series,2019-09-25,2019,9,25
6,6,Yoshitha Rajapaksa reinstated as Lieutenant of...,2019-09-25,2019,9,25
7,7,PM admits his involvement in the collapse of S...,2019-09-24,2019,9,24
8,8,Sri Lanka Administrative Service Association t...,2019-09-24,2019,9,24
9,9,Sisu Dham Sewana will help Buddhism flourish i...,2019-09-23,2019,9,23


In [45]:
import nltk
from nltk.corpus import stopwords
# Extra tokens to ignore on top of NLTK's English list: spelling and case
# variants of "Sri Lanka".
newStopWords = ['Sri','Lanka','sri','lanka','lankan','sl','SL']

# Note: this rebinds `stopwords` from the imported corpus reader to a plain
# list of words; later cells use it as that list.
stopwords = nltk.corpus.stopwords.words('english')
stopwords += newStopWords




In [46]:
'''Only keep words that appear in no more than 80% of the documents
and in at least 2 documents.'''

#Creating a vocabulary for all the words in the dataset and removing stop words
from sklearn.feature_extraction.text import CountVectorizer

# Headlines as a unicode string array, which is the input the vectorizer reads.
headline_texts = reviews_datasets['Headline'].values.astype('U')

# Bag-of-words vocabulary: skip the stop words, skip terms found in more than
# 80% of the headlines, and skip terms found in fewer than 2 headlines.
count_vect = CountVectorizer(
    stop_words=stopwords,
    min_df=2,
    max_df=0.8,
)
doc_term_matrix = count_vect.fit_transform(headline_texts)

In [47]:
# Show the sparse matrix summary (documents x vocabulary terms).
doc_term_matrix

<1135x1163 sparse matrix of type '<class 'numpy.int64'>'
	with 4507 stored elements in Compressed Sparse Row format>

In [48]:
# n_components specifies the number of topics the model extracts


from sklearn.decomposition import LatentDirichletAllocation

# Five topics; the fixed seed keeps the fitted topics repeatable across runs.
lda_settings = {'n_components': 5, 'random_state': 42}
LDA = LatentDirichletAllocation(**lda_settings)
LDA.fit(doc_term_matrix)

LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
                          evaluate_every=-1, learning_decay=0.7,
                          learning_method='batch', learning_offset=10.0,
                          max_doc_update_iter=100, max_iter=10,
                          mean_change_tol=0.001, n_components=5, n_jobs=None,
                          perp_tol=0.1, random_state=42, topic_word_prior=None,
                          total_samples=1000000.0, verbose=0)

In [49]:
#fetches 10 words from our vocabulary:
import random

# Peek at 10 randomly chosen vocabulary words.
# The feature list is built once instead of twice per iteration, and
# random.choice picks the entry. random.randint(0, n) includes n itself, so
# the previous index could land one past the end of the list and raise
# IndexError.
feature_names = count_vect.get_feature_names()
for _ in range(10):
    print(random.choice(feature_names))


progress
house
bestowed
complexes
almost
expressway
tea
railways
summit
beedi


In [50]:
# Get the first topic: row 0 of the fitted model's components_ array,
# which holds one weight per vocabulary term.
first_topic = LDA.components_[0]

In [51]:
# argsort orders indices from lowest to highest weight, so the final ten
# entries are the indices of the ten highest-weighted words for this topic.
top_topic_words = np.argsort(first_topic)[-10:]

In [52]:
# Vocabulary indices of the top words, in ascending order of weight.
top_topic_words

array([ 661,  912,  162, 1069,  714,   99, 1004, 1058, 1009,  818])

In [53]:
# Map each top index back to its word, from lowest to highest weight.
vocabulary = count_vect.get_feature_names()
for word_index in top_topic_words:
    print(vocabulary[word_index])

management
school
business
tower
national
awards
state
today
strike
president


In [54]:
# Resolve the vocabulary once: get_feature_names() was previously re-run for
# every word of every topic even though its result never changes.
feature_names = count_vect.get_feature_names()

for topic_index, topic in enumerate(LDA.components_):
    print(f'Top 10 words for topic #{topic_index}:')
    # argsort is ascending, so the last ten indices are the highest-weighted
    # words. The word index has its own name so that it no longer shadows the
    # topic index.
    print([feature_names[word_index] for word_index in topic.argsort()[-10:]])
    print('\n')

Top 10 words for topic #0:
['management', 'school', 'business', 'tower', 'national', 'awards', 'state', 'today', 'strike', 'president']


Top 10 words for topic #1:
['host', 'two', 'group', 'srilankan', 'wins', 'asia', 'opens', 'airlines', 'trade', 'indian']


Top 10 words for topic #2:
['country', 'bank', 'sajith', 'minister', 'new', 'india', 'president', 'pm', 'cricket', 'pakistan']


Top 10 words for topic #3:
['asia', 'wins', 'launches', 'presidential', '2019', 'slfp', 'candidate', 'series', 'world', 'first']


Top 10 words for topic #4:
['top', 'support', 'presidential', '2019', 'new', 'award', 'year', 'wins', 'colombo', 'awards']




In [55]:
# Per-document topic distribution for every row of the document-term matrix.
topic_values = LDA.transform(doc_term_matrix)
# Expect (number of documents, number of topics).
topic_values.shape

(1135, 5)

In [56]:
# Label each headline with the index of its highest-scoring topic.
reviews_datasets['Topic'] = np.argmax(topic_values, axis=1)

In [57]:
# Preview the first rows, now including the assigned 'Topic' column.
reviews_datasets.head()

Unnamed: 0.1,Unnamed: 0,Headline,Date,Year,Month,Day,Topic
0,0,“Sri Lanka is running out of time to agree on ...,2019-09-29,2019,9,29,4
1,1,A 4th coal power plant for Sri Lanka?,2019-09-29,2019,9,29,2
2,2,Sri Lanka’s UN delegation meets with Under Sec...,2019-09-28,2019,9,28,0
3,3,Sri Lanka welcomes proposal for Maldives’ re-a...,2019-09-28,2019,9,28,1
4,4,Sri Lanka celebrates World Tourism Day 2019 wi...,2019-09-27,2019,9,27,3
