# Topic Modeling using Theses' Abstracts
## 6 Further Development and Conclusion
Mai Vu<br>
Helsinki Metropolia University of Applied Sciences<br>
Bachelor’s Thesis<br>
October 2021

In [1]:
#Import libraries
import os 
import re
import pickle
import math
import pandas as pd
import numpy as np
from random import randint
from operator import itemgetter
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec

#Libraries for lemmatization
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

#Libraries for topic modeling
import gensim
from gensim import corpora
from gensim.models.ldamodel import LdaModel
from gensim.models import LdaSeqModel
from gensim.test.utils import datapath
from gensim.matutils import hellinger

In [2]:
#Prepare the corpus for plotting
file = open(os.getcwd() + "\\may27\\data\\final_data", 'rb')
eng_data = pickle.load(file)
file.close()

###Preprocess data
eng_data.sort_values(by = ['year'], inplace = True) #Sort data according to the 'year' column
dates = list(eng_data['year'])

###Tokenization and delete punctuation, number, short words and stop words
eng_stopwords = set(stopwords.words('english')).union(gensim.parsing.preprocessing.STOPWORDS)
lemmatizer = WordNetLemmatizer()
abstracts = []
for abstract in eng_data['en']:
    tokens = []
    for token in nltk.word_tokenize(abstract):
        #Keep abbreviations
        if token.upper() == token and token.isalpha() and len(token) > 1:
            tokens.append(token)
        #Or keep valid words
        else:
            token = token.lower()
            if token.isalpha() and token not in eng_stopwords and len(token) > 3:
                tokens.append(lemmatizer.lemmatize(token))
    abstracts.append(tokens)

del eng_data #Free some data

###Build the bigram
bigram = gensim.models.Phrases(abstracts, min_count = 50, threshold = 10)
for idx in range(len(abstracts)):
    abstracts[idx] = bigram[abstracts[idx]]

###Create dictionary for the given texts
dictionary = corpora.Dictionary(abstracts)
dictionary.filter_extremes(no_below = 50, no_above = 0.25) #Filter words that appear less than 50 documents and more than 25% of all documents

###Create the bag of words for all documents
bag_of_words = [dictionary.doc2bow(abstract) for abstract in abstracts]

#Remove abstracts with too few tokens
token_count = [len(x) for i, x in enumerate(bag_of_words)]
assert len(dates) == len(bag_of_words) # sanity checking...
dates, bag_of_words = zip(*[(dates[i], bag_of_words[i]) for i, x in enumerate(range(len(token_count))) if token_count[i] > 4])

In [3]:
#Get the topic
lda_cwd = os.getcwd() + "\\may27"
num_topic = 8
path = lda_cwd + "\\results\\lda_model_" + str(num_topic) + "topics.dat"
lda = LdaModel.load(datapath(path), mmap = 'r')

In [4]:
#Generate 4 random supervisors for 4 chosen topics
supervisor_construction = []
supervisor_healthcare = []
supervisor_education = []
supervisor_it = []

for i in range(len(abstracts)):
    if (randint(0, 1) == 1):
        new_doc_bow = dictionary.doc2bow(abstracts[i])
        topic = max(lda.get_document_topics(new_doc_bow), key = itemgetter(1))[0]
        if topic == 0:
            supervisor_construction.append(abstracts[i])
        elif topic == 3:
            supervisor_healthcare.append(abstracts[i])
        elif topic == 5:
            supervisor_education.append(abstracts[i])
        elif topic == 7:
            supervisor_it.append(abstracts[i])
            
#Apply the LDA model
supervisor_construction = lda.get_document_topics(dictionary.doc2bow([token for abstract in supervisor_construction for token in abstract]))
supervisor_healthcare = lda.get_document_topics(dictionary.doc2bow([token for abstract in supervisor_healthcare for token in abstract]))
supervisor_education = lda.get_document_topics(dictionary.doc2bow([token for abstract in supervisor_education for token in abstract]))
supervisor_it = lda.get_document_topics(dictionary.doc2bow([token for abstract in supervisor_it for token in abstract]))

In [5]:
#Generate student example
student_sample = "The control software is written in Python and runs on Raspberry Pi. It consists of 4 threads running simultaneously. The first and second ones are for reading weather data from the FMI Open Data service, the sun intensity data from pre-downloaded files from another FMI service called Ilmanet, and for transferring it to the simulator through REST API. Another thread is to apply AI algorithms to calculate the setpoint and send it to the simulator. The last thread is simply for the simulator to read and update the data. Each thread runs its own loop to do the task and sleeps until its next cycle. The length of the thread's cycle can be modified as well."

tokens = []
for token in nltk.word_tokenize(student_sample):
        if token.upper() == token and token.isalpha() and len(token) > 1:
            tokens.append(token)
        else:
            token = token.lower()
            if token.isalpha() and token not in eng_stopwords and len(token) > 3:
                tokens.append(lemmatizer.lemmatize(token))

tokens = bigram[tokens]
student_sample = lda.get_document_topics(dictionary.doc2bow(tokens)) #Apply the LDA model

In [6]:
#Build the dataframe for better visualization 
label = pd.Series(['supervisor_construction', 'supervisor_healthcare', 'supervisor_education', 'supervisor_it', 'student_sample'])
df = pd.DataFrame(columns = [1, 2, 3, 4, 5, 6, 7, 8])
df = df.append(pd.DataFrame({(x[0] + 1):x[1:] for x in supervisor_construction}))
df = df.append(pd.DataFrame({(x[0] + 1):x[1:] for x in supervisor_healthcare}))
df = df.append(pd.DataFrame({(x[0] + 1):x[1:] for x in supervisor_education}))
df = df.append(pd.DataFrame({(x[0] + 1):x[1:] for x in supervisor_it}))
df = df.append(pd.DataFrame({(x[0] + 1):x[1:] for x in student_sample}))
df.set_index([label], inplace = True)
df

Unnamed: 0,1,2,3,4,5,6,7,8
supervisor_construction,0.606338,0.058659,0.081363,0.015419,0.01347,0.018685,0.039093,0.166972
supervisor_healthcare,0.015448,0.031961,0.066,0.604735,0.125504,0.086463,0.019664,0.050225
supervisor_education,0.022969,0.076663,0.066236,0.079346,0.129593,0.525506,0.054068,0.045619
supervisor_it,0.115435,0.051179,0.061772,0.020447,,0.034713,0.055763,0.651389
student_sample,,,0.033828,0.027934,,0.105605,,0.822192


In [7]:
#Calculate the Hellinger distance
supervisors = [supervisor_construction, supervisor_healthcare, supervisor_education, supervisor_it, student_sample]
distant = df.T.corr()

for i in range(len(supervisors)):
    for j in range(len(supervisors)):
        distant.iloc[i][j] = hellinger(supervisors[i], supervisors[j])
        distant.iloc[j][i] = hellinger(supervisors[i], supervisors[j])
        
distant

Unnamed: 0,supervisor_construction,supervisor_healthcare,supervisor_education,supervisor_it,student_sample
supervisor_construction,0.0,0.699617,0.657584,0.430575,0.711775
supervisor_healthcare,0.699617,0.0,0.474689,0.684728,0.720271
supervisor_education,0.657584,0.474689,0.0,0.64391,0.686382
supervisor_it,0.430575,0.684728,0.64391,0.0,0.357975
student_sample,0.711775,0.720271,0.686382,0.357975,0.0
