In [1]:
# Reference: https://medium.com/ml2vec/topic-modeling-is-an-unsupervised-learning-approach-to-clustering-documents-to-discover-topics-fdfbf30e27df

In [1]:
import pandas as pd;
import numpy as np;
import scipy as sp;
import sklearn;
import sys;
from nltk.corpus import stopwords;

import nltk;
from gensim.models import ldamodel
import gensim.corpora;
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer;
from sklearn.decomposition import NMF;
from sklearn.preprocessing import normalize;
import pickle;

In [2]:
# Read the headlines workbook (file name suggests the cleaned August 2019 export).
data = pd.read_excel(
    '/Users/Suwani/Desktop/Moodys Project/Cleaned data/2019/aug19_cleaned.xlsx'
)
# Topic modelling only uses the headline text, so keep just that column
# (double brackets keep the result a one-column DataFrame).
data_text = data[['Headline']]

In [3]:
# NOTE(review): this cell is a bare triple-quoted string, so none of the code
# inside it is executed; the cell only echoes the string back as its output.
# The custom `stop_words` list it would build is not referenced by any other
# cell visible in this notebook.
'''# NLTK Stop words
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
stop_words.extend(['from', 'subject', 're', 'edu', 'use','sri'])'''

"# NLTK Stop words\nfrom nltk.corpus import stopwords\nstop_words = stopwords.words('english')\nstop_words.extend(['from', 'subject', 're', 'edu', 'use','sri'])"

In [4]:
# Tokenise every headline on single spaces and drop NLTK stop words.
data_text = data_text.astype('str')

# Build the stop-word lookup once, as a set. stopwords.words() with no argument
# returns the stop words of every language in the NLTK corpus; calling it for
# each token (as this cell used to) re-reads that list every time and is what
# made the cell so slow.
# NOTE(review): the comparison is case-sensitive, so capitalised stop words
# such as "A" or "The" are kept - lower-case the tokens first if that is not intended.
stop_word_set = set(stopwords.words())

total_rows = len(data_text)
tokenised_headlines = []
for idx, headline in enumerate(data_text['Headline']):
    tokenised_headlines.append(
        [word for word in headline.split(' ') if word not in stop_word_set]
    )

    # print logs to monitor output
    if idx % 1000 == 0:
        sys.stdout.write('\rc = ' + str(idx) + ' / ' + str(total_rows))

# Write the whole column in one assignment. The previous chained form
# `data_text.iloc[idx]['Headline'] = ...` can assign into a temporary copy of the
# row, in which case the DataFrame is silently left unchanged.
data_text['Headline'] = pd.Series(tokenised_headlines, index=data_text.index)

# Cache the tokenised frame so the cleaning step does not have to be repeated;
# the context manager makes sure the file handle is flushed and closed.
with open('data_text.dat', 'wb') as cache_file:
    pickle.dump(data_text, cache_file)

# get the words as a list of token lists for lda input
train_headlines = data_text['Headline'].tolist()

c = 1000 / 1171

In [5]:
# Number of topics requested from the LDA and NMF models below.
num_topics = 3

In [6]:
# Map every distinct token to an integer id.
id2word = gensim.corpora.Dictionary(train_headlines)
# Bag-of-words representation: one list of (token_id, count) pairs per headline.
corpus = [id2word.doc2bow(text) for text in train_headlines]
# LDA training is stochastic; fixing random_state makes the topics reproducible
# across kernel restarts instead of changing on every run.
lda = ldamodel.LdaModel(
    corpus=corpus,
    id2word=id2word,
    num_topics=num_topics,
    random_state=42,
)

In [7]:
def get_lda_topics(model, num_topics, topn=20):
    """Tabulate the top words of each LDA topic.

    Parameters
    ----------
    model : object exposing ``show_topic(topic_id, topn=...)``
        A fitted topic model; ``show_topic`` must return a sequence of
        entries whose first element is the word (e.g. ``(word, weight)``).
    num_topics : int
        Number of topics to read from the model (topic ids ``0..num_topics-1``).
    topn : int, optional
        How many top words to list per topic. Defaults to 20, the value that
        was previously hard-coded.

    Returns
    -------
    pandas.DataFrame
        One column per topic, named ``'Topic # 01'``, ``'Topic # 02'``, ...,
        with the topic's words listed from most to least relevant as returned
        by ``show_topic``.
    """
    word_dict = {}
    for topic_idx in range(num_topics):
        top_terms = model.show_topic(topic_idx, topn=topn)
        column = 'Topic # ' + '{:02d}'.format(topic_idx + 1)
        # Keep only the word from each entry; the weight is not displayed.
        word_dict[column] = [entry[0] for entry in top_terms]
    return pd.DataFrame(word_dict)


In [8]:
# Display the top words per LDA topic as a table (one column per topic).
get_lda_topics(lda, num_topics)

Unnamed: 0,Topic # 01,Topic # 02,Topic # 03
0,Sri,Sri,Lanka
1,Lanka,Lanka,Sri
2,President,-,-
3,–,Lanka’s,arrested
4,-,Bank,Lankan
5,A,celebrates,today
6,new,new,–
7,rupee,today,Colombo
8,2019,World,South
9,Lankan,anniversary,Minister


## Implementing NMF

In [10]:
# CountVectorizer expects one string per document rather than a list of tokens,
# so glue each headline's tokens back together with single spaces (cheap to do).
train_headlines_sentences = list(map(' '.join, train_headlines))

In [11]:
# Word-level term counts, with the vocabulary capped at 5000 features.
vectorizer = CountVectorizer(
    analyzer='word',
    max_features=5000,
)
# Learn the vocabulary and build the document-term count matrix in one pass.
x_counts = vectorizer.fit_transform(train_headlines_sentences)

In [12]:
# Re-weight the raw counts with TF-IDF; IDF smoothing is switched off.
transformer = TfidfTransformer(
    smooth_idf=False,
)
x_tfidf = transformer.fit_transform(x_counts)

In [13]:
# Rescale every row (axis=1, i.e. every headline) of the TF-IDF matrix to unit L1 norm.
xtfidf_norm = normalize(x_tfidf, norm='l1', axis=1)

In [14]:
# Build an NMF model with one component per topic, initialised with NNDSVD.
# NOTE(review): the saved output of this cell reports n_components=10 while
# num_topics is set to 3 earlier in the notebook - the stored output appears to
# come from a different run; re-run the notebook top to bottom to refresh it.
model = NMF(
    n_components=num_topics,
    init='nndsvd',
)
# Fit on the L1-normalised TF-IDF matrix (the fitted estimator is echoed as output).
model.fit(xtfidf_norm)

NMF(alpha=0.0, beta_loss='frobenius', init='nndsvd', l1_ratio=0.0, max_iter=200,
    n_components=10, random_state=None, shuffle=False, solver='cd', tol=0.0001,
    verbose=0)

In [15]:
def get_nmf_topics(model, n_top_words, feature_names=None):
    """Tabulate the highest-weighted words of each NMF topic.

    Parameters
    ----------
    model : object with a ``components_`` array
        Fitted decomposition; ``components_`` holds one row of word weights
        per topic.
    n_top_words : int
        Number of words to list per topic. (This argument used to be ignored
        in favour of a hard-coded 20.)
    feature_names : sequence of str, optional
        Maps a column index of ``components_`` to its word. When omitted, the
        names are read from the notebook-level ``vectorizer``.

    Returns
    -------
    pandas.DataFrame
        One column per topic, named ``'Topic # 01'``, ``'Topic # 02'``, ...,
        with words ordered from highest to lowest weight.
    """
    if feature_names is None:
        # the word ids obtained need to be reverse-mapped to the words so we can
        # print the topic names. get_feature_names() was removed in newer
        # scikit-learn releases in favour of get_feature_names_out(); support both.
        if hasattr(vectorizer, 'get_feature_names_out'):
            feature_names = vectorizer.get_feature_names_out()
        else:
            feature_names = vectorizer.get_feature_names()

    word_dict = {}
    # Iterate over the model's own topics rather than the global num_topics, so
    # the table always matches the model that was actually passed in.
    for topic_idx, weights in enumerate(model.components_):
        # argsort is ascending: reverse it and keep the first n_top_words ids.
        top_ids = weights.argsort()[::-1][:n_top_words]
        column = 'Topic # ' + '{:02d}'.format(topic_idx + 1)
        word_dict[column] = [feature_names[word_id] for word_id in top_ids]

    return pd.DataFrame(word_dict)

In [16]:
# Display the NMF topics as a table, requesting 20 words per topic.
get_nmf_topics(model, 20)

Unnamed: 0,Topic # 01,Topic # 02,Topic # 03,Topic # 04,Topic # 05,Topic # 06,Topic # 07,Topic # 08,Topic # 09,Topic # 10
0,sri,aqua,legacy,beat,arrested,today,slfp,president,tomorrow,2019
1,lanka,right,two,hope,drivers,gmoa,candidate,new,the,youth
2,rupee,wellness,navy,beacon,drunk,strike,sept,sirisena,perahera,best
3,shares,health,held,ray,jmi,thursday,slpp,envoy,randoli,awards
4,economic,website,nabs,lankan,members,monday,unp,peace,for,colombo
5,lankan,silver,killed,alisher,activist,schools,convention,meets,better,employer
6,tour,receives,illegal,overpowers,9399,hajj,talks,maithripala,parliament,presidential
7,development,best,heroin,turkmenistan,8488,celebrated,presidential,interpol,adjourned,concludes
8,bangladesh,heart,arrests,shreemal,6659,psc,announce,medal,final,hopefuls
9,low,place,traffickers,ashen,ntj,convene,directly,calls,kumbal,salute


In [17]:
# Transform the normalised TF-IDF matrix with the fitted NMF model.
# presumably one row per headline and one column per topic - the next cell
# takes the argmax along axis=1 to pick a topic for each row.
topic_values = model.transform(xtfidf_norm)

In [18]:
# Label each headline with the index of its highest-scoring NMF topic,
# then show the annotated frame.
data['Topic'] = np.argmax(topic_values, axis=1)
data

Unnamed: 0,Headline,Date,Year,Month,Day,Topic
0,Constiution of proposed Sri Lanka Podujana all...,2019-08-31,2019,8,31,0
1,“Sajith is the the guardian of all Sri Lankans...,2019-08-30,2019,8,30,0
2,"Sri Lanka to import 70,000 MT of corn due to l...",2019-08-30,2019,8,30,0
3,Ministry of Education joins notorious list of ...,2019-08-27,2019,8,27,0
4,Sri Lanka is yet to comprehend the need for Ca...,2019-08-27,2019,8,27,0
5,I will steer Sri Lanka to digitalization: Mini...,2019-08-26,2019,8,26,0
6,Elephant population in Sri Lankan to be survey...,2019-08-26,2019,8,26,0
7,PM lays foundation stone for third largest res...,2019-08-25,2019,8,25,0
8,"Sri Lankan shot dead in Oakland, USA",2019-08-25,2019,8,25,0
9,Sri Lanka is unable to protect its maritime bo...,2019-08-25,2019,8,25,0
