# Dynamic Topic Model 1
Update: 03.05.2021<br>
Mai Vu

In [3]:
#Basic libraries
import random
import time
import math
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
import numpy as np
from collections import defaultdict

import warnings
warnings.filterwarnings("ignore", category = DeprecationWarning)

#Libraries for lemmatization
import nltk
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

#Libraries for (dynamic) topic modeling
import gensim
from gensim import corpora
import pprint
from gensim.models import CoherenceModel
from gensim.models.ldamodel import LdaModel
from gensim.models import ldaseqmodel
from gensim.test.utils import datapath

#For plotting
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis
pyLDAvis.enable_notebook()
import plotly.graph_objs as go
import plotly.express as ex
from plotly.offline import iplot, init_notebook_mode
import cufflinks
cufflinks.go_offline(connected = True)
init_notebook_mode(connected = True)

# 1. Init steps

In [4]:
def CalculateCoherenceRange(texts, bow, dictionary, start = 1, end = 20):
    """Function to calculate the coherence scores of different LDA models with the number of topics in the given range.
    Abstracts are divided into smaller subsets so that the coherence curve will be smoother.
    
    Args:
        (list) texts: abstracts in text.
        (list) bow: bag of words of abstracts.
        (Dictionary) dictionary: a mapping between words in texts and their integer ids.
        (integers) start, end: number of topics' range.

    Returns:
        (lists) u_mass, c_v, c_uci, c_npmi: coherence scores of different LDA models with the number of topics in the given range.
    """
    u_mass = []
    c_v = []
    c_uci = []
    c_npmi = []
    n = len(bow)
    
    for num_topic in range(start, end + 1):
        np.random.seed(111)
        u_mass_temp = []
        c_v_temp = []
        c_uci_temp = []
        c_npmi_temp = []
        
        for i in range(10):
            index = np.random.choice(n, int(0.80 * n))
            t = [texts[x] for x in index]
            c = [bow[x] for x in index]
            LDA_model = LdaModel(c, num_topics = num_topic, id2word = dictionary, passes = 10, random_state = 111)
            
            #u_mass
            temp = CoherenceModel(model = LDA_model, texts = t, corpus = c, dictionary = dictionary, coherence = 'u_mass')
            u_mass_temp.append(temp.get_coherence())
            #c_v
            temp = CoherenceModel(model = LDA_model, texts = t, corpus = c, dictionary = dictionary, coherence = 'c_v')
            c_v_temp.append(temp.get_coherence())
            #c_uci
            temp = CoherenceModel(model = LDA_model, texts = t, corpus = c, dictionary = dictionary, coherence = 'c_uci')
            c_uci_temp.append(temp.get_coherence())
            #c_npmi
            temp = CoherenceModel(model = LDA_model, texts = t, corpus = c, dictionary = dictionary, coherence = 'c_npmi')
            c_npmi_temp.append(temp.get_coherence())
            
        u_mass.append(np.mean(u_mass_temp))
        c_v.append(np.mean(c_v_temp))
        c_uci.append(np.mean(c_uci_temp))
        c_npmi.append(np.mean(c_npmi_temp))
        print('.', end = " ")
        
    print()
    return u_mass, c_v, c_uci, c_npmi

In [5]:
def CalculateCoherence(LDA_model, texts, bow, dictionary):
    """Function to calculate the coherence scores of a LDA model.
    
    Args:
        (LdaModel) LDA_model: the given LDA model.
        (list) texts: abstracts in text.
        (list) bow: bag of words of abstracts.
        (Dictionary) dictionary: a mapping between words in texts and their integer ids.

    Returns:
        (floats) u_mass, c_v, c_uci, c_npmi: coherence scores of the given LDA model.
    """
    #u_mass
    temp = CoherenceModel(model = LDA_model, texts = texts, corpus = bow, dictionary = dictionary, coherence = 'u_mass')
    u_mass = temp.get_coherence()
    #c_v
    temp = CoherenceModel(model = LDA_model, texts = texts, corpus = bow, dictionary = dictionary, coherence = 'c_v')
    c_v = temp.get_coherence()
    #c_uci
    temp = CoherenceModel(model = LDA_model, texts = texts, corpus = bow, dictionary = dictionary, coherence = 'c_uci')
    c_uci = temp.get_coherence()
    #c_npmi
    temp = CoherenceModel(model = LDA_model, texts = texts, corpus = bow, dictionary = dictionary, coherence = 'c_npmi')
    c_npmi = temp.get_coherence()

    return u_mass, c_v, c_uci, c_npmi

In [6]:
def PlotKeywords(LDA_model, num_topic, num_word = 8):
    """Function to plot important keywords.
    
    Args:
        (LdaModel) LDA_model: the given LDA model.
        (integer) num_topic: number of topics of the model.
        (integer) num_word: number of words to plot.

    Returns:
        None.
    """
    keywords = [[word for word, _ in LDA_model.show_topic(topicid, topn = 20)] for topicid in range(LDA_model.num_topics)]
    keywords_prob = [[beta for _, beta in LDA_model.show_topic(topicid, topn = 20)] for topicid in range(LDA_model.num_topics)]

    gs = gridspec.GridSpec(round(math.sqrt(num_topic)) + 1, round(math.sqrt(num_topic)) + 1)
    gs.update(wspace = 0.5, hspace = 0.5)
    plt.figure(figsize = (20, 15))
    for i in range(num_topic):
        ax = plt.subplot(gs[i])
        plt.barh(range(num_word), keywords_prob[i][:num_word])
        ax.invert_yaxis()
        ax.set_yticks(range(num_word))
        ax.set_yticklabels(keywords[i][:num_word])
        plt.grid()
        plt.title("Topic " + str(i + 1))

In [7]:
def PlotCoherenceScores(u_mass, c_v, c_uci, c_npmi):
    """Function to plot coherence scores using Plotly.
    
    Args:
        (lists) u_mass, c_v, c_uci, c_npmi: coherence scores of different LDA models with the number of topics.

    Returns:
        None.
    """
    fig = go.Figure()
    fig.add_trace(go.Scatter(x = list(range(1, len(u_mass) + 1)), y = u_mass, mode = 'lines + markers', name = 'u_mass'))
    fig.add_trace(go.Scatter(x = list(range(1, len(c_v) + 1)), y = c_v, mode = 'lines + markers', name = 'c_v'))
    fig.add_trace(go.Scatter(x = list(range(1, len(c_uci) + 1)), y = c_uci, mode = 'lines + markers', name = 'c_uci'))
    fig.add_trace(go.Scatter(x = list(range(1, len(c_npmi) + 1)), y = c_npmi, mode = 'lines + markers', name = 'c_npmi'))
    fig.update_layout(title = 'Plot Coherence Scores', xaxis_title = 'Number of topics', legend_title = 'Metrics', xaxis_dtick = 1)
    fig.show()

# 2. Read and preprocess the dataset

In [8]:
#Read the data
eng_data = pd.read_csv('eng_abstracts_date+organization.csv', encoding = 'utf-8')

#Drop a special case
eng_data.drop([eng_data.index[10147]], inplace = True)

#Print samples
eng_data.sample(5)

Unnamed: 0,date,abstract_en,contributor_organization
10523,2017,This case study examined Circular Economy`s po...,Lahti University of Applied Sciences
14939,2017,This thesis presents a process that supports w...,Metropolia University of Applied Sciences
14374,2015,The purpose of this research was to study lead...,Oulu University of Applied Sciences
14562,2016,The objective of this study was to define the ...,Metropolia University of Applied Sciences
15420,2018,The purpose of this thesis is to provide the c...,Seinäjoki University of Applied Sciences


In [9]:
#sort data according to the 'date' column
eng_data.sort_values(by = ['date'])

#Count the number of abstracts in each year
_, time_slices = np.unique(eng_data['date'], return_counts = True)
# 2009 + 2010 + 2011 | 2012 + 2013 | 2014 + 2015 | 2016 + 2017 | 2018 + 2019
time_slices_2years_interval = [time_slices[0] + time_slices[1] + time_slices[2], time_slices[3] + time_slices[4],
                               time_slices[5] + time_slices[6], time_slices[7] + time_slices[8],
                               time_slices[9] + time_slices[10]]
print('Number of documents per time slice:', time_slices_2years_interval)

Number of documents per time slice: [2204, 2993, 3795, 4245, 2319]


In [10]:
#Create eng_stopwords set and lemmatizer from NLTK library
eng_stopwords = set(stopwords.words('english')).union(gensim.parsing.preprocessing.STOPWORDS)
lemmatizer = WordNetLemmatizer()

In [11]:
start_time = time.time() #Start count time

#Tokenization and delete punctuation, number, short words and stop words
allwords_abstracts = []
for abstract in eng_data['abstract_en']:
    tokens = []
    for token in nltk.word_tokenize(abstract.lower()):
        if token.isalpha() and token not in eng_stopwords and len(token) > 3:
            tokens.append(lemmatizer.lemmatize(token))
    allwords_abstracts.append(tokens)

print('- Finish preprocessing data (all words) in', round((time.time() - start_time)/60), 'min(s)')

- Finish preprocessing data (all words) in 1 min(s)


In [12]:
# Build the bigram
bigram = gensim.models.Phrases(allwords_abstracts, min_count = 10, threshold = 10)

In [13]:
for idx in range(len(allwords_abstracts)):
    allwords_abstracts[idx] = bigram[allwords_abstracts[idx]]

In [14]:
#Create dictionary for the given texts
allwords_dictionary = corpora.Dictionary(allwords_abstracts)
allwords_dictionary.filter_extremes(no_below = 10, no_above = 0.25) #Filter words that appear less than 10 documents and more than 25% of all documents
print(allwords_dictionary)

#Create the bag of words for all documents
allwords_bow = [allwords_dictionary.doc2bow(abstract) for abstract in allwords_abstracts]

Dictionary(8923 unique tokens: ['activity', 'actual', 'analyse', 'area', 'book']...)


# 3. Dynamic Topic Modeling

In [15]:
start_time = time.time() #Start count time
ldaseq = ldaseqmodel.LdaSeqModel(corpus = allwords_bow, id2word = allwords_dictionary, 
                                 time_slice = time_slices_2years_interval, num_topics = 8)
print('- Finish in', round((time.time() - start_time)/60), 'min(s)')


divide by zero encountered in double_scalars



- Finish in 1003 min(s)


In [16]:
path = datapath('dynamic_model_new')
ldaseq.save(path)

In [17]:
doc_topic, topic_term, doc_lengths, term_frequency, vocab = ldaseq.dtm_vis(time = 0, corpus = allwords_bow)
vis_dtm = pyLDAvis.prepare(topic_term_dists = topic_term, doc_topic_dists = doc_topic, doc_lengths = doc_lengths, vocab = vocab, term_frequency = term_frequency)
pyLDAvis.display(vis_dtm)

In [19]:
ldaseq.print_topics(time = 3)

[[('customer', 0.028090092885420345),
  ('marketing', 0.02467477261640307),
  ('service', 0.01531662566167501),
  ('social_medium', 0.012383229662995683),
  ('product', 0.011541589471619353),
  ('brand', 0.009289270459456916),
  ('online', 0.008274774707807051),
  ('consumer', 0.006901081776289524),
  ('market', 0.006445183147656417),
  ('content', 0.005366411123480049),
  ('case_company', 0.005170015918601344),
  ('theory', 0.005143688785996941),
  ('survey', 0.005114086702558055),
  ('strategy', 0.004840678757680298),
  ('plan', 0.0047864341554036465),
  ('tool', 0.004650182475486216),
  ('order', 0.0043746705179359),
  ('objective', 0.004316164609559812),
  ('author', 0.004300618405139299),
  ('restaurant', 0.004190300315523909)],
 [('application', 0.026885607922179977),
  ('user', 0.013137510665782802),
  ('technology', 0.012920932805255978),
  ('network', 0.012310465403626436),
  ('software', 0.01081413641895797),
  ('solution', 0.00852902132634374),
  ('security', 0.0079999258309