# First Topic Modeling with Gensim
Update: 12.04.2021<br>
Mai Vu

In [1]:
#Basic libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import numpy as np
from collections import defaultdict
import random

#Libraries for lemmatization
import nltk
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

#Lirary to check the language in text
from langdetect import detect

#Libraries for topic modeling
import gensim
from gensim import corpora
import pprint
from gensim import models
from gensim.models.ldamodel import LdaModel
from gensim.test.utils import datapath

## 1. Read the data

In [2]:
eng_data = pd.read_csv('eng_theseus_abstract.csv')
print('Number of rows before deleting non-English abstracts: ', len(eng_data))

#Data was extracted from the raw data with conditions such as 'language' = 'en'.
#But still there are Finnish, Swedish, even Chinese abstracts in the dataset.
#Here, those are deleted using langdetect library
eng_data = eng_data[eng_data['abstract_en'].map(detect) == 'en']
print('Number of rows after: ', len(eng_data))

Number of rows before deleting non-English abstracts:  11528
Number of rows after:  11506


## 2. Preprocess the data

In [3]:
#Create eng_stopwords set and lemmatizer from NLTK library
eng_stopwords = set(stopwords.words('english'))
eng_stopwords.update('also', 'used')
lemmatizer = WordNetLemmatizer()

#Tokenization and delete punctuation, number, and stopwords
abstracts = [[lemmatizer.lemmatize(word) for word in nltk.word_tokenize(abstract.lower())
                                     if word.isalpha() and word not in eng_stopwords]
              for abstract in eng_data['abstract_en']]

#Some words appear very few times. If we count all words, there will be 32k unique tokens.
#Here, we delete words that appear less than 3 times. Thus, unique tokens are nearly 16k.
count = defaultdict(int) #Count word frequencies
for abstract in abstracts:
    for token in abstract:
        count[token] += 1
processed_abstracts = [[token for token in abstract if count[token] > 2] for abstract in abstracts]

## 3. Converting corpus into list of vectors

In [4]:
dictionary = corpora.Dictionary(processed_abstracts)
print(dictionary)

Dictionary(15746 unique tokens: ['aaltonen', 'activity', 'actual', 'aim', 'also']...)


In [5]:
#Create the bag of words for all documents
bow_corpus = [dictionary.doc2bow(abstract) for abstract in processed_abstracts]

#Print first 20 words in a random abstract
pprint.pprint(bow_corpus[random.randint(0, 1000)][:20])

[(19, 2),
 (24, 1),
 (40, 1),
 (45, 1),
 (60, 2),
 (65, 1),
 (73, 1),
 (149, 1),
 (155, 3),
 (167, 2),
 (224, 1),
 (228, 1),
 (252, 1),
 (299, 2),
 (301, 1),
 (316, 1),
 (342, 2),
 (346, 1),
 (467, 1),
 (471, 1)]


## 4. LDA Model with 10 topics

In [6]:
LDA_model = LdaModel(bow_corpus, num_topics = 10, id2word = dictionary, passes = 30)

In [7]:
LDA_model.print_topics()

[(0,
  '0.012*"study" + 0.011*"research" + 0.010*"country" + 0.008*"risk" + 0.007*"financial" + 0.007*"data" + 0.006*"economic" + 0.006*"impact" + 0.006*"economy" + 0.005*"also"'),
 (1,
  '0.020*"research" + 0.017*"tourism" + 0.012*"survey" + 0.010*"consumer" + 0.010*"thesis" + 0.010*"study" + 0.010*"result" + 0.009*"hotel" + 0.008*"questionnaire" + 0.007*"restaurant"'),
 (2,
  '0.021*"company" + 0.021*"process" + 0.020*"management" + 0.017*"study" + 0.014*"research" + 0.014*"case" + 0.013*"thesis" + 0.012*"project" + 0.010*"employee" + 0.009*"organization"'),
 (3,
  '0.027*"health" + 0.024*"care" + 0.020*"study" + 0.015*"patient" + 0.015*"nurse" + 0.010*"research" + 0.010*"nursing" + 0.009*"review" + 0.009*"literature" + 0.009*"used"'),
 (4,
  '0.019*"student" + 0.012*"study" + 0.010*"thesis" + 0.009*"university" + 0.009*"work" + 0.009*"education" + 0.008*"research" + 0.008*"finland" + 0.007*"also" + 0.007*"group"'),
 (5,
  '0.019*"energy" + 0.011*"waste" + 0.010*"thesis" + 0.010*"wat

- Some topics are quite well-defined, such as topic 1 (Tourism), topic 3 (Healthcare), topic 8 (Information Technology), topic 9 (Marketing), 
- Some topics, such as 0, 2, and 4, are not so clear.
- Some meaningless keywords appear, such as "thesis", "used", "also", "part", "work". Might need to delete in the preprocessing stage.