<h1> Dynamic Topic Modelling using LDA </h1>

In [None]:
# Install pyLDAvis
#!pip install pyLDAvis

In [None]:
import pandas as pd
import numpy
import pickle
import time
import gensim
import matplotlib.pyplot as plt
import pyLDAvis
import pyLDAvis.gensim_models
import csv

#from sklearn.decomposition import LatentDirichletAllocation
#from sklearn.feature_extraction.text import CountVectorizer
#from gensim.models.coherencemodel import CoherenceModel

# setting up our imports
from gensim.models import ldaseqmodel
from gensim.corpora import Dictionary, bleicorpus
from gensim.matutils import hellinger
from gensim.models.coherencemodel import CoherenceModel
#from gensim.models.wrappers.dtmmodel import DtmModel

<h1> 1. Load and prepare the data </h1>

In [None]:
# Load the pickled corpus that holds the final tokens; the cells below treat the
# result as a pandas DataFrame.
# Alternative input (commented out in the original notebook):
#   /project/biocomplexity/sdad/projects_data/ncses/prd/Tech-Report/FR_meta_and_final_tokens_21SEPT14.pkl
# NOTE: pickle.load can execute arbitrary code -- only load files from a trusted source.
# The context manager guarantees the handle is closed even if unpickling fails.
with open('/project/biocomplexity/sdad/projects_data/ncses/prd/Tech-Report/case_studies/coronavirus_corpus.pkl', 'rb') as f:
    df = pickle.load(f)
df.head()

In [None]:
# Inspect the available columns (shown as the cell output).
# Original note: PROJECT_START_DATE has no missing values, so it could be used to
# count the abstracts per year for the time slice.
# NOTE(review): the time-slice cell further down groups on 'FY', not
# PROJECT_START_DATE -- confirm which column is intended.
df.columns

In [None]:
# Report how many abstracts the coronavirus sample contains
n_abstracts = len(df)
print(n_abstracts, "Abstracts")

In [None]:
# Function to compute the corpus and dictionary for the DTM LDA
def createLDAvars(docs, no_below=20, no_above=0.6, stop_words=('research', 'project')):
    """Build the gensim dictionary and bag-of-words corpus used by the DTM/LDA.

    Args:
        docs: Tokenised documents, each an iterable of string tokens. The
            collection is traversed twice (dictionary, then corpus), so it
            must be re-iterable (e.g. a list or a pandas Series), not a
            one-shot generator.
        no_below: Keep only tokens found in at least this many documents.
        no_above: Keep only tokens found in at most this fraction of the
            documents.
        stop_words: Tokens removed from the dictionary after the frequency
            filtering. Tokens that are not in the dictionary are ignored.

    Returns:
        A tuple ``(id2word, corpus)``: the filtered ``Dictionary`` and the
        list of bag-of-words vectors, one per document, in the order of
        ``docs``.
    """
    # Create Dictionary
    id2word = gensim.corpora.Dictionary(docs)

    # Filter words on their document frequency
    id2word.filter_extremes(no_below=no_below, no_above=no_above)

    # Drop the stop words that survived the frequency filter. Looking the ids
    # up unconditionally raises KeyError when a stop word never occurred or
    # was already removed by filter_extremes.
    bad_ids = [
        id2word.token2id[word] for word in stop_words if word in id2word.token2id
    ]
    if bad_ids:
        id2word.filter_tokens(bad_ids=bad_ids)

    # Create Corpus (Term Document Frequency): each document becomes a list of
    # (word_id, count) pairs. The corpus is not needed for c_v coherence.
    corpus = [id2word.doc2bow(doc) for doc in docs]

    return id2word, corpus

In [None]:
# Tokenised abstracts -> gensim dictionary (id2word) and bag-of-words corpus
docs = df["final_tokens"]
dictionary, corpus = createLDAvars(docs)

In [None]:
# Create the time slice using the fiscal year: number of abstracts per year,
# indexed by year (sorted ascending by groupby).
import warnings

df['Year'] = df['FY']
time_slice = df['PROJECT_ID'].groupby(df['Year']).count()

# LdaSeqModel assigns documents to slices purely by position: the first
# time_slice[0] documents belong to the first slice, and so on. Surface the two
# situations where that silently goes wrong instead of training a skewed model.
if int(time_slice.sum()) != len(df):
    warnings.warn(
        "time_slice sums to %d but the corpus has %d documents "
        "(missing PROJECT_ID or FY values?)" % (int(time_slice.sum()), len(df))
    )
if not df['Year'].is_monotonic_increasing:
    warnings.warn(
        "df is not sorted by Year; sort it before building the corpus so the "
        "documents line up with the time slices"
    )

<h2> 3. Solve the dynamic topic model </h2>

In [None]:
# Run the DTM. Pre-training model (default chain_variance=0.005)
ldaseq = ldaseqmodel.LdaSeqModel(corpus=corpus, id2word=dictionary, time_slice=time_slice, num_topics=30)

# Save to file. The context manager flushes and closes the handle, so the
# pickle is complete on disk before any later cell reads it back.
with open('/project/biocomplexity/sdad/projects_data/ncses/prd/Dynamic_Topics_Modelling/LDA/DTM_LDA_30.pkl', 'wb') as model_file:
    pickle.dump(ldaseq, model_file)

<h3> 3.1 Load the pickle file from slurm </h3>

In [None]:
# Load the pickle result from slurm.
# NOTE: pickle.load can execute arbitrary code -- only load files from a trusted source.
# The context manager closes the handle even if unpickling fails.
with open('/project/biocomplexity/sdad/projects_data/ncses/prd/Dynamic_Topics_Modelling/LDA/DTM_LDA_30.pkl', 'rb') as f:
    ldaseq = pickle.load(f)

In [None]:
# Topics of the model at a given time slice (here time=0)
topic_list = ldaseq.print_topics(time=0)

# Keep only the first element (the word) of every entry of every topic
topic_words = [[entry[0] for entry in topic] for topic in topic_list]

# Tabular view of the topics, one row per topic.
# NOTE: not the last expression of the cell, so a notebook does not render it.
pd.DataFrame(topic_words)

# Save the result in csv, one topic per row
with open('Topic_list_0.csv', 'w', newline='') as csvfile:
    csv.writer(csvfile).writerows(topic_words)

In [None]:
# Time evolution of one topic, combined in a single DataFrame with one row per
# word and one probability column per year.
# NOTE(review): topic=1 is the second topic if topics are 0-indexed (the cell
# above uses time=0 for the first slice) -- confirm this is the intended topic.
time_topic = ldaseq.print_topic_times(topic=1)

# Named `years` rather than `time` so the imported `time` module is not shadowed.
years = list(time_slice.index)

topic_evol = pd.DataFrame(time_topic[0])
topic_evol.columns = ['WORDS', 'PROB_' + str(years[0])]  # same prefix as the other years

# Outer merge keeps words that enter or leave the topic's top words over time;
# years where a word is absent are left as NaN.
for year, year_words in zip(years[1:], time_topic[1:]):
    year_topic = pd.DataFrame(year_words)
    year_topic.columns = ['WORDS', 'PROB_' + str(year)]
    topic_evol = topic_evol.merge(year_topic, how='outer', on=['WORDS'])

# Preview of the topic evolution (first 30 rows) to spot new words.
# NOTE: not the last expression of the cell, so a notebook does not render it.
topic_evol.head(30)

# Save the result in csv
topic_evol.to_csv('Topic_evol_1.csv')

In [90]:
# Choose the best DTM using chain variance.

In [9]:
# Interactive pyLDAvis view of the DTM at time = 0
(
    doc_topic,
    topic_term,
    doc_lengths,
    term_frequency,
    vocab,
) = ldaseq.dtm_vis(time=0, corpus=corpus)

# Hand the five arrays returned by dtm_vis to pyLDAvis by keyword
vis_wrapper = pyLDAvis.prepare(
    topic_term_dists=topic_term,
    doc_topic_dists=doc_topic,
    doc_lengths=doc_lengths,
    vocab=vocab,
    term_frequency=term_frequency,
)
pyLDAvis.display(vis_wrapper)

In [10]:
#ldaseq_chain.print_topic_times(0)

In [None]:
# Run the DTM model
# NOTE(review): as written this cell raises NameError. `DtmModel` is never
# imported (its import, `from gensim.models.wrappers.dtmmodel import DtmModel`,
# is commented out in the imports cell) and `dtm_path` is not defined anywhere
# in this notebook. Presumably dtm_path is the path to an external DTM binary
# -- TODO confirm and restore both before running.
dtm_model = DtmModel(dtm_path, corpus, time_slice, num_topics=5, id2word=dictionary, initialize_lda=True)
# Persist the fitted model under the name 'dtm_news'.
dtm_model.save('dtm_news')

In [None]:
# Topic coherence
# we just have to specify the time-slice we want to find coherence for.
# NOTE(review): both topic sets come from the same `ldaseq` model (time slices 0
# and 2), so the "Wrapper" / "DTM Python" labels printed below do not describe
# two different implementations -- confirm the intended comparison.
topics_wrapper = ldaseq.dtm_coherence(time=0)
topics_dtm = ldaseq.dtm_coherence(time=2)

# running u_mass coherence on our models
cm_wrapper = CoherenceModel(topics=topics_wrapper, corpus=corpus, dictionary=dictionary, coherence='u_mass')
cm_DTM = CoherenceModel(topics=topics_dtm, corpus=corpus, dictionary=dictionary, coherence='u_mass')

print("U_mass topic coherence")
print("Wrapper coherence is ", cm_wrapper.get_coherence())
print("DTM Python coherence is", cm_DTM.get_coherence())

# to use 'c_v' we need texts, which we have saved to disk.
# The context manager closes the handle even if unpickling fails.
# NOTE: pickle.load can execute arbitrary code -- only load files from a trusted source.
with open('Corpus/texts', 'rb') as texts_file:
    texts = pickle.load(texts_file)
cm_wrapper = CoherenceModel(topics=topics_wrapper, texts=texts, dictionary=dictionary, coherence='c_v')
cm_DTM = CoherenceModel(topics=topics_dtm, texts=texts, dictionary=dictionary, coherence='c_v')

print("C_v topic coherence")
print("Wrapper coherence is ", cm_wrapper.get_coherence())
print("DTM Python coherence is", cm_DTM.get_coherence())