# TOPIC MODELLING
### Name: Nur Syaida Firzana
### Id  : 66474666


In [1]:
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models.wrappers import LdaMallet
from gensim.models.coherencemodel import CoherenceModel
from gensim import similarities

import os.path
import re
import glob

import pandas as pd
import matplotlib as plt

import nltk
nltk.download('stopwords')
mallet_path = '/opt/mallet-2.0.8/bin/mallet' # this should be the correct path for the DIGI405 lab workrooms

[nltk_data] Downloading package stopwords to
[nltk_data]     /homedirs/nsm91/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords

In [3]:
### functions to load a corpus from a directory of text files, preprocess the corpus and create the bag of words document-term matrix. 

In [4]:
# create document list:
documents_list = []
doc_register = {}

def load_data_from_dir(path):
    file_list = glob.glob(path + '/*.txt')
    
    for idx, filename in enumerate(file_list):
        with open(filename, 'r', encoding='utf8') as f:
            text = f.read()
            documents_list.append(text)
            doc_register[filename.split('/')[1]] = idx
    print("Total Number of Documents:",len(documents_list))
    return documents_list

In [5]:
def preprocess_data(doc_set,extra_stopwords = {}):
    # adapted from https://www.datacamp.com/community/tutorials/discovering-hidden-topics-python
    # replace all newlines or multiple sequences of spaces with a standard space
    doc_set = [re.sub('\s+', ' ', doc) for doc in doc_set]
    # initialize regex tokenizer
    tokenizer = RegexpTokenizer(r'\w+')
    # create English stop words list
    en_stop = set(stopwords.words('english'))
    # add any extra stopwords
    if (len(extra_stopwords) > 0):
        en_stop = en_stop.union(extra_stopwords)
    
    # list for tokenized documents in loop
    texts = []
    # loop through document list
    for i in doc_set:
        # clean and tokenize document string
        raw = i.lower()
        tokens = tokenizer.tokenize(raw)
        # remove stop words from tokens
        stopped_tokens = [i for i in tokens if not i in en_stop]
        # add tokens to list
        texts.append(stopped_tokens)
    return texts

In [6]:
def prepare_corpus(doc_clean):
    # adapted from https://www.datacamp.com/community/tutorials/discovering-hidden-topics-python
    # Creating the term dictionary of our courpus, where every unique term is assigned an index. dictionary = corpora.Dictionary(doc_clean)
    dictionary = corpora.Dictionary(doc_clean)
    
    dictionary.filter_extremes(no_below=5, no_above=0.5)
    # Converting list of documents (corpus) into Document Term Matrix using dictionary prepared above.
    doc_term_matrix = [dictionary.doc2bow(doc) for doc in doc_clean]

    return dictionary,doc_term_matrix

In [7]:
### Load and pre-process the corpus

In [8]:
# adjust the path below to wherever you have the transcripts2018 folder
document_list = load_data_from_dir("transcripts")

# I've added extra stopwords here in addition to NLTK's stopword list - you could look at adding others.
doc_clean = preprocess_data(document_list,{'laughter','applause'})

dictionary, doc_term_matrix = prepare_corpus(doc_clean)

Total Number of Documents: 3603


In [9]:
number_of_topics=20 # adjust this to alter the number of topics
words=20 #adjust this to alter the number of words output for the topic below

In [10]:
### LDA model with 20 topics

In [11]:
number_of_topics=20
words=20

In [12]:
ldamallet20 = LdaMallet(mallet_path, corpus=doc_term_matrix, num_topics=number_of_topics, id2word=dictionary)

In [13]:
# output the topics
ldamallet20.show_topics(num_topics=number_of_topics,num_words=words)

[(0,
  '0.014*"human" + 0.014*"history" + 0.012*"century" + 0.008*"god" + 0.006*"culture" + 0.005*"age" + 0.005*"future" + 0.005*"story" + 0.005*"religion" + 0.005*"compassion" + 0.005*"past" + 0.005*"death" + 0.004*"modern" + 0.004*"sense" + 0.004*"ancient" + 0.004*"humanity" + 0.004*"religious" + 0.004*"man" + 0.004*"beings" + 0.003*"power"'),
 (1,
  '0.018*"story" + 0.016*"book" + 0.016*"language" + 0.016*"words" + 0.013*"stories" + 0.013*"word" + 0.013*"read" + 0.009*"write" + 0.008*"books" + 0.008*"writing" + 0.007*"english" + 0.007*"wrote" + 0.006*"film" + 0.006*"talk" + 0.006*"voice" + 0.006*"love" + 0.005*"reading" + 0.005*"speak" + 0.005*"idea" + 0.005*"written"'),
 (2,
  '0.033*"school" + 0.030*"children" + 0.028*"kids" + 0.016*"students" + 0.014*"child" + 0.012*"education" + 0.011*"learn" + 0.011*"learning" + 0.010*"young" + 0.009*"parents" + 0.009*"high" + 0.007*"schools" + 0.007*"teach" + 0.007*"college" + 0.006*"class" + 0.006*"teachers" + 0.006*"family" + 0.006*"learned"

In [14]:
### Convert to Gensim model format

In [15]:
gensimmodel20 = gensim.models.wrappers.ldamallet.malletmodel2ldamodel(ldamallet20)

In [16]:
### Get a coherence score

In [17]:
coherencemodel = CoherenceModel(model=gensimmodel20, texts=doc_clean, dictionary=dictionary, coherence='c_v')
print (coherencemodel.get_coherence())

0.544693353469589


In [18]:
ldamallet30 = LdaMallet(mallet_path, corpus=doc_term_matrix, num_topics=30, id2word=dictionary)
gensimmodel30 = gensim.models.wrappers.ldamallet.malletmodel2ldamodel(ldamallet30)
coherencemodel = CoherenceModel(model=gensimmodel30, texts=doc_clean, dictionary=dictionary, coherence='c_v')
print (coherencemodel.get_coherence())

0.5383706283393228


In [19]:
### Test a range of topic sizes and plot the results

In [20]:
# supply values for k and the interval, eg 20, 60, 10 will train models for 20, 30, 40, 50, and 60 topics
min_k = 5
max_k = 40
intervals = 5

coherences = {}

for i in range(min_k, max_k, intervals):
    ldamalletmodel = LdaMallet(mallet_path, corpus=doc_term_matrix, num_topics=i, id2word=dictionary)
    gensimmodel = gensim.models.wrappers.ldamallet.malletmodel2ldamodel(ldamalletmodel)
    coherences[i] = CoherenceModel(model=gensimmodel, texts=doc_clean, dictionary=dictionary, coherence='c_v').get_coherence()


In [21]:
### Convert the coherence scores to pandas dataframe

In [22]:
# convert the coherence scores to a pandas dataframe
df = pd.DataFrame.from_dict(coherences, orient='index', columns=['Coherence'])
df['Topics'] = df.index

In [23]:
### Plot Result

In [24]:
df.plot(kind='line', x='Topics', y='Coherence')


<matplotlib.axes._subplots.AxesSubplot at 0x7f4a35c48150>

In [40]:
x = range(min_k, max_k, intervals)
for m, i in zip(x, coherences):
    print("Num Topics =",m, "has coherence value of", round(coherences[i], 5))

Num Topics = 5 has coherence value of 0.41437
Num Topics = 10 has coherence value of 0.46368
Num Topics = 15 has coherence value of 0.51783
Num Topics = 20 has coherence value of 0.53277
Num Topics = 25 has coherence value of 0.55145
Num Topics = 30 has coherence value of 0.54598
Num Topics = 35 has coherence value of 0.559


In [25]:
### Preview a document

In [26]:
doc_register['2012-09-14-timothy_bartik_the_economic_case_for_preschool.txt']

136

In [41]:
doc_id = 1117 # index of document to explore
print(re.sub('\s+', ' ', document_list[doc_id])) 

You probably don't know me, but I am one of those .01 percenters that you hear about and read about, and I am by any reasonable definition a plutocrat. And tonight, what I would like to do is speak directly to other plutocrats, to my people, because it feels like it's time for us all to have a chat. Like most plutocrats, I too am a proud and unapologetic capitalist. I have founded, cofounded or funded over 30 companies across a range of industries. I was the first non-family investor in Amazon.com. I cofounded a company called aQuantive that we sold to Microsoft for 6.4 billion dollars. My friends and I, we own a bank. I tell you this — (Laughter) — unbelievable, right? I tell you this to show that my life is like most plutocrats. I have a broad perspective on capitalism and business, and I have been rewarded obscenely for that with a life that most of you all can't even imagine: multiple homes, a yacht, my own plane, etc., etc., etc. But let's be honest: I am not the smartest person y

In [36]:
## Output the distribution of topics for the document


In [37]:
document_topics = gensimmodel20.get_document_topics(doc_term_matrix[doc_id]) # substitute other models here
document_topics = sorted(document_topics, key=lambda x: x[1], reverse=True) # sorts document topics

for topic, prop in document_topics:
    topic_words = [word[0] for word in gensimmodel20.show_topic(topic, 10)]
    print ("%.2f" % prop, topic, topic_words)

0.42 13 ['money', 'business', 'dollars', 'company', 'companies', 'market', 'percent', 'buy', 'pay', 'jobs']
0.17 2 ['school', 'children', 'kids', 'students', 'child', 'education', 'learn', 'learning', 'young', 'parents']
0.15 5 ['question', 'social', 'problem', 'important', 'understand', 'wrong', 'questions', 'thinking', 'answer', 'change']
0.11 10 ['countries', 'africa', 'country', 'global', 'china', 'india', 'percent', 'change', 'population', 'states']
0.04 11 ['000', 'percent', 'number', '10', 'car', 'million', '1', 'half', 'times', '2']
0.02 19 ['war', 'political', 'government', 'power', 'country', 'states', 'police', 'law', 'united', 'rights']


In [38]:
### Find similar documents


In [39]:
model_doc_topics = gensimmodel20.get_document_topics(doc_term_matrix) # substitute other models here
lda_index = similarities.MatrixSimilarity(model_doc_topics.corpus)
    
# query for our doc_id from above
similarity_index = lda_index[doc_term_matrix[doc_id]]
# Sort the similarity index
similarity_index = sorted(enumerate(similarity_index), key=lambda item: -item[1])
file_list = glob.glob('./transcripts/*.txt')
for i in range(1,4): 
    document_id, similarity_score = similarity_index[i]
    print('Document Index: ',document_id)
    print('Filename:', file_list[document_id])
    print('Similarity Score',similarity_score)
    print(re.sub('\s+', ' ', document_list[document_id][:500]), '...') # preview first 500 characters
    
    

Document Index:  2046
Filename: ./transcripts/2013-06-26-mariana_mazzucato_government_investor_risk_taker_innovator.txt
Similarity Score 0.3059151
Have you ever asked yourselves why it is that companies, the really cool companies, the innovative ones, the creative, new economy-type companies — Apple, Google, Facebook — are coming out of one particular country, the United States of America? Usually when I say this, someone says, "Spotify! That's Europe." But, yeah. It has not had the impact that these other companies have had. Now what I do is I'm an economist, and I actually study the relationship between innovation and economic growth at ...
Document Index:  2855
Filename: ./transcripts/2019-04-15-claudia_miner_a_new_way_to_get_every_child_ready_for_kindergarten.txt
Similarity Score 0.29231533
I'm an historian. And what I love about being an historian is it gives you perspective. Today, I'd like to bring that perspective to education in the United States. About the only thing people c