# Topic modelling using LDA

### 2017 Dec Shilpa Jain

## Read the input file: a CSV containing 5 documents, one per row

In [1]:
# The code was removed by DSX for sharing.

Unnamed: 0,Text
0,"The ""Big Brother"" of Singapore football will b..."
1,Mahfizur Rahman watched his friends turn to cr...
2,"The going has been tough, but the Football Ass..."
3,Having pushed reigning world and European cham...
4,SINGAPORE - Registration for the Standard Char...


## Convert each document into a list of tokens and append it to a list of documents

In [2]:
import nltk
# Tokenize every article in the 'Text' column: each document becomes one list
# of word/punctuation tokens, and `docs` is the list of those token lists.
# Iterating the column directly avoids the per-row overhead of iterrows(), and
# the unused nltk.Text wrapper that was built for every row has been dropped.
docs = [nltk.word_tokenize(article) for article in df_data_1['Text']]
print(docs)
    

[['The', '``', 'Big', 'Brother', "''", 'of', 'Singapore', 'football', 'will', 'be', 'back', ',', 'but', 'not', 'immediately', ',', 'and', 'not', 'for', 'long', '.', 'In', 'an', 'exclusive', 'interview', 'with', 'The', 'New', 'Paper', ',', 'Persib', 'Bandung', 'striker', 'Noh', 'Alam', 'Shah', 'said', 'he', 'has', 'agreed', 'to', 'sign', 'a', 'short-term', 'deal', 'with', 'former', 'club', 'Tampines', 'Rovers', 'until', 'the', 'end', 'of', 'the', 'season', '.', 'But', 'the', '31-year-old', 'said', ':', '``', 'Beyond', 'that', ',', 'I', 'feel', 'my', 'future', 'is', 'still', 'in', 'Indonesia', '.', '``', 'I', 'feel', 'really', 'appreciated', 'here', '.', 'Four', 'Indo', 'clubs', 'already', 'made', 'me', 'offers', 'for', 'the', 'next', 'season', ',', 'which', 'starts', 'next', 'January', '.', "''", 'The', 'move', 'to', 'Singapore', 'still', 'hinges', 'on', 'whether', 'Tampines', 'can', 'secure', 'his', 'medical', 'documents', 'and', 'International', 'Transfer', 'Certificate', 'from', 'the

In [3]:
from nltk.corpus import stopwords
from nltk.stem.porter import *
# Single shared PorterStemmer instance; stemwords() calls stemmer.stem on every token.
stemmer=PorterStemmer()
import gensim
from gensim import corpora
from gensim import similarities
from gensim import models


def tolower(docs):
    """Return a new list of documents with every token lower-cased.

    ``docs`` is an iterable of token sequences; the input is not modified.
    """
    lowered = []
    for doc in docs:
        lowered.append([token.lower() for token in doc])
    return lowered
    
def fetchdictionary(docs):
    """Build and return a gensim ``corpora.Dictionary`` from the tokenized ``docs``."""
    return corpora.Dictionary(docs)

def removestop(docs):
    """Remove NLTK English stop words from every document.

    Parameters
    ----------
    docs : iterable of token sequences (tokens are strings).

    Returns
    -------
    list of lists: the same documents, in order, with every token that
    appears in ``stopwords.words('english')`` dropped. The comparison is
    exact, so tokens should already be lower-cased by the caller.
    """
    # A set gives constant-time membership tests; scanning the stop-word list
    # for every single token made the filter needlessly slow.
    stop_words = frozenset(stopwords.words('english'))
    return [[w for w in doc if w not in stop_words] for doc in docs]

def stemwords(docs):
    """Return the documents with every token replaced by its stem.

    Uses the module-level ``stemmer`` object (``stemmer.stem``) on each token.
    """
    stemmed_docs = []
    for doc in docs:
        stemmed_docs.append([stemmer.stem(token) for token in doc])
    return stemmed_docs

def convertToVec(docs,dictionary):
    """Convert each tokenized document to a vector via ``dictionary.doc2bow``.

    Returns a list with one ``doc2bow`` result per document, in input order.
    """
    bow_vectors = []
    for doc in docs:
        bow_vectors.append(dictionary.doc2bow(doc))
    return bow_vectors

def buildindex(docs, num_features=None):
    """Build a gensim ``SparseMatrixSimilarity`` index over bag-of-words vectors.

    Parameters
    ----------
    docs : iterable of bag-of-words vectors, each a sequence of
        ``(term_id, weight)`` pairs.
    num_features : int, optional
        Width of the term space passed to ``SparseMatrixSimilarity``. When
        omitted it is derived from the corpus as ``max(term_id) + 1``.

    Returns
    -------
    The ``similarities.SparseMatrixSimilarity`` index.

    Notes
    -----
    The width used to be hard-coded to 110, which is smaller than the
    vocabulary this notebook builds; term ids at or above the declared width
    fall outside the index's term space.
    """
    # Materialize once: the corpus is scanned here and again by gensim.
    docs = list(docs)
    if num_features is None:
        highest_id = max((term_id for doc in docs for term_id, _ in doc), default=-1)
        num_features = highest_id + 1
    return similarities.SparseMatrixSimilarity(docs, num_features=num_features)

def createtdif(docs):
    """Fit and return a gensim ``models.TfidfModel`` on the vectorized corpus ``docs``."""
    return models.TfidfModel(docs)

In [5]:
# Text normalization pipeline: lower-case -> stop-word removal -> stemming.
# NOTE: `docs` is rebound at every step, so re-running this cell feeds the
# already-processed tokens back through the pipeline instead of the raw ones.
docs=tolower(docs)
# Remove stop words
docs=removestop(docs)
# Perform stemming
docs=stemwords(docs)

# Create the gensim dictionary from the normalized documents
dictionary=fetchdictionary(docs)
print (dictionary)
# token -> id mapping exposed by the dictionary (not used again in this cell)
token_to_id=dictionary.token2id
# Convert each document to a vector with dictionary.doc2bow
vecs=convertToVec(docs,dictionary)
print (vecs)
# Build index for finding similarity
index=buildindex(vecs)

# Fit a TF-IDF model on the document vectors
tdif=createtdif(vecs)
print (tdif)

Dictionary(735 unique tokens: ['main', 'better', 'menac', 'averag', 'thailand']...)
[[(0, 3), (1, 1), (2, 2), (3, 1), (4, 1), (5, 1), (6, 2), (7, 3), (8, 1), (9, 1), (10, 1), (11, 2), (12, 1), (13, 1), (14, 2), (15, 14), (16, 1), (17, 1), (18, 1), (19, 1), (20, 1), (21, 2), (22, 1), (23, 4), (24, 6), (25, 1), (26, 1), (27, 1), (28, 1), (29, 3), (30, 1), (31, 1), (32, 1), (33, 1), (34, 1), (35, 3), (36, 1), (37, 1), (38, 1), (39, 2), (40, 1), (41, 1), (42, 1), (43, 1), (44, 1), (45, 1), (46, 1), (47, 1), (48, 1), (49, 3), (50, 1), (51, 1), (52, 5), (53, 1), (54, 1), (55, 1), (56, 8), (57, 3), (58, 1), (59, 1), (60, 1), (61, 1), (62, 2), (63, 2), (64, 1), (65, 1), (66, 1), (67, 2), (68, 1), (69, 1), (70, 1), (71, 1), (72, 1), (73, 2), (74, 1), (75, 1), (76, 1), (77, 1), (78, 2), (79, 1), (80, 1), (81, 1), (82, 1), (83, 2), (84, 1), (85, 1), (86, 1), (87, 1), (88, 1), (89, 1), (90, 1), (91, 1), (92, 1), (93, 4), (94, 1), (95, 7), (96, 1), (97, 1), (98, 1), (99, 30), (100, 1), (101, 1), (1

## Perform LDA on the document vectors `vecs`

In [6]:
# LDA training is stochastic; fixing the seed makes Restart & Run All
# reproduce the same topics on every run.
LDA_RANDOM_STATE = 42
ldamodel=gensim.models.ldamodel.LdaModel(
    corpus=vecs,
    id2word=dictionary,
    num_topics=2,
    random_state=LDA_RANDOM_STATE,
)

## The cell below retrieves 2 topics from the LDA results (here, both topics, since the model was trained with `num_topics=2`) and, for each topic, its 20 most probable words, using `show_topics(2, 20)`. We can then display the topics one by one. For each topic, we see a list of words together with their probabilities.

In [8]:
# Ask the model for 2 topics with 20 words each, then display the first one.
topics = ldamodel.show_topics(num_topics=2, num_words=20)
topics[0]

(0,
 '0.039*"." + 0.036*"," + 0.019*"``" + 0.009*"\'\'" + 0.009*"s-leagu" + 0.008*"fa" + 0.008*"said" + 0.007*"singapor" + 0.007*"club" + 0.006*"year" + 0.006*"\'s" + 0.006*")" + 0.006*":" + 0.005*"(" + 0.005*"footbal" + 0.005*"player" + 0.004*"leagu" + 0.004*"play" + 0.004*"match" + 0.004*"also"')