### _This is the topic modelling Python program of the workgroup No.4, for the final project of the Digital Humanities Lab course at the Universiteit van Amsterdam.

In [None]:
# Universiteit van Amsterdam
# Digital Humanities Lab, WG04
# Final project - Topic modelling program
# project.ipynb

### _Importing all the necessary tools and libraries

In [1]:
import random
import pandas as pd
from nltk.stem.porter import *
from nltk.corpus import stopwords
from gensim import corpora, models
from nltk.tokenize import word_tokenize
import os, fitz, string, nltk, PyPDF2, gensim
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
import functions
from functions import readPDF, processPDF, lemmatizeAndStem, toDataFrame, listOfWords, wordsPerArticle, tfidfCorpus

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Miller\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


### _Reading all the PDF files that can be found in the given folder, preprocessing, tokenizing, lemmatizing and stemming them, and then turning them into a DataFrame for further use.

In [2]:
# Relative path of the folder that holds the source PDF files.
PDF_FOLDER = 'open access articles'

# Step 1: read the PDF files found in the folder and extract their text.
articles = readPDF(PDF_FOLDER)

# Step 2: basic text processing - tokenizing and filtering the extracted text.
processedArticles = processPDF(articles)

# Step 3: collect the processed articles into a DataFrame, which the
# dictionary, corpus and LDA steps further down take as their input.
dataFrame = toDataFrame(processedArticles)

### _A DataFrame containing the PDF file names and their contents, already processed.
#### _For further information on how they are processed, please check the processPDF function in the functions.py file, or use "pydoc functions.functions".

In [3]:
# Render the processed articles as a rich table: one row per PDF, with its
# file name and the list of processed tokens extracted from it.
display(dataFrame)

Unnamed: 0,file_name,content
0,1-s2.0-S0550321316301341-main.pdf,"[avail, onlin, www.sciencedirect.com, scienced..."
1,43-Article Text-125-1-10-20161227.pdf,"[review, critic, view, polem, rudolf, siebert,..."
2,8.8.pdf,"[compar, analysi, methodolog, islam, jurist, u..."
3,8037-Article Text-26428-1-10-20130627.pdf,"[mediekultur, journal, media, communic, resear..."
4,Acta_acu-201102-0003.pdf,"[acta, universitati, agricultura, silvicultura..."
5,applsci-10-05589-v3.pdf,"[appli, scienc, articl, space, alloc, method, ..."
6,Cao2018_Article_StudyOnPM25PollutionAndTheMort...,"[research, articl, open, access, studi, pm2.5,..."
7,energies-08-03882.pdf,"[energi, 2015, 3882-3902, doi:10.3390/en805388..."
8,fpsyg-04-00479.pdf,"[origin, research, articl, publish, juli, 2013..."
9,ijerph-13-01268.pdf,"[intern, journal, environment, research, publi..."


In [4]:
# Each result is stored under the same name as the helper that produced it,
# and the later cells rely on those names. The helpers are therefore called
# through the `functions` module: calling the bare names only works once,
# because after the first run they refer to the results, so re-running this
# cell would call a result instead of the helper.

# Vocabulary of the processed articles; passed to the LDA models as `id2word`.
listOfWords = functions.listOfWords(dataFrame)

# Per-article word representation; used as the corpus of the simple LDA model
# and indexed by article number in the evaluation cells.
wordsPerArticle = functions.wordsPerArticle(dataFrame)

# TF-IDF corpus built from the vocabulary and the articles; used as the
# corpus of the TF-IDF LDA model.
tfidfCorpus = functions.tfidfCorpus(listOfWords, dataFrame)

### _Generating a random number of topics with a simple LDA model, using the list of words per article.

In [5]:
# Draw the training settings at random: 10-25 topics and 5-10 passes.
# The topic count is drawn first, then the number of passes.
# NOTE(review): no seed is set in this notebook, so every run draws different
# settings and trains a different model - confirm that this is intended.
simpleTopicCount = random.randrange(10, 26)
simplePassCount = random.randrange(5, 11)

# Train the simple LDA model on the per-article word corpus, using 3 workers.
lda_model = gensim.models.LdaMulticore(
    wordsPerArticle,
    num_topics=simpleTopicCount,
    id2word=listOfWords,
    passes=simplePassCount,
    workers=3,
)

# Passing -1 asks gensim for every topic of the model.
for topicId, topicWords in lda_model.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(topicId, topicWords))

Topic: 0 
Words: 0.001*"studi" + 0.001*"control" + 0.000*"group" + 0.000*"religion" + 0.000*"medicin" + 0.000*"field" + 0.000*"patient" + 0.000*"result" + 0.000*"trial" + 0.000*"target"
Topic: 1 
Words: 0.001*"group" + 0.001*"studi" + 0.001*"medicin" + 0.000*"patient" + 0.000*"land" + 0.000*"control" + 0.000*"result" + 0.000*"trial" + 0.000*"method" + 0.000*"effect"
Topic: 2 
Words: 0.014*"health" + 0.010*"pm2.5" + 0.010*"studi" + 0.010*"china" + 0.008*"cancer" + 0.008*"theori" + 0.008*"lung" + 0.006*"servic" + 0.006*"mortal" + 0.006*"field"
Topic: 3 
Words: 0.000*"medicin" + 0.000*"patient" + 0.000*"chang" + 0.000*"control" + 0.000*"studi" + 0.000*"land" + 0.000*"model" + 0.000*"china" + 0.000*"cancer" + 0.000*"method"
Topic: 4 
Words: 0.001*"medicin" + 0.001*"studi" + 0.001*"patient" + 0.001*"result" + 0.001*"health" + 0.001*"effect" + 0.001*"group" + 0.000*"complementari" + 0.000*"field" + 0.000*"control"
Topic: 5 
Words: 0.001*"studi" + 0.001*"patient" + 0.001*"medicin" + 0.000*"me

### _Generating a random number of topics with a TF-IDF LDA model, using the TF-IDF model's corpus of words

In [6]:
# Training settings for the TF-IDF variant. The dict entries are evaluated in
# order, so the topic count (10-25) is drawn before the passes (5-10).
tfidfModelOptions = dict(
    num_topics=random.randrange(10, 26),
    id2word=listOfWords,
    passes=random.randrange(5, 11),
    workers=3,
)

# Same LDA training as the simple model, but fed with the TF-IDF corpus.
lda_model_tfidf = gensim.models.LdaMulticore(tfidfCorpus, **tfidfModelOptions)

# Passing -1 asks gensim for every topic of the model.
for topicNumber, topicDescription in lda_model_tfidf.print_topics(-1):
    print(f'Topic: {topicNumber} Word: {topicDescription}')

Topic: 0 Word: 0.000*"chandranigahpur" + 0.000*"chan1" + 0.000*"characteris-" + 0.000*"chao-yang" + 0.000*"chantal" + 0.000*"chang1" + 0.000*"charité" + 0.000*"chalder" + 0.000*"chan-" + 0.000*"charli"
Topic: 1 Word: 0.001*"medicin" + 0.001*"complementari" + 0.001*"patient" + 0.000*"therapi" + 0.000*"suppl" + 0.000*"treatment" + 0.000*"acupunctur" + 0.000*"altern" + 0.000*"clinic" + 0.000*"pain"
Topic: 2 Word: 0.001*"religion" + 0.001*"religi" + 0.000*"teacher" + 0.000*"danger" + 0.000*"teach" + 0.000*"respond" + 0.000*"essenti" + 0.000*"educ" + 0.000*"question" + 0.000*"2018"
Topic: 3 Word: 0.001*"land" + 0.000*"grassland" + 0.000*"map" + 0.000*"1932" + 0.000*"spatial" + 0.000*"topograph" + 0.000*"1954" + 0.000*"reconstruct" + 0.000*"settlement" + 0.000*"unus"
Topic: 4 Word: 0.000*"chandranigahpur" + 0.000*"chan1" + 0.000*"characteris-" + 0.000*"chao-yang" + 0.000*"chantal" + 0.000*"chang1" + 0.000*"charité" + 0.000*"chalder" + 0.000*"chan-" + 0.000*"charli"
Topic: 5 Word: 0.001*"pm2.

### _Evaluating the performance and accuracy of the simple LDA model, with a randomly chosen article's index.

In [7]:
# Pick one article at random. The upper bound is taken from the corpus itself
# instead of a hard-coded 15, so the index is valid for any number of PDFs.
# (assumes wordsPerArticle is a sized sequence such as a list - it is already
# indexed by integer below)
randomArticle = random.randrange(len(wordsPerArticle))
print("The random generated article's index: ", randomArticle)

# Ask the simple LDA model for the topic mixture of that article, then list
# the topics from the highest to the lowest score, each described by its
# 10 highest-weighted words.
articleTopics = lda_model[wordsPerArticle[randomArticle]]
for index, score in sorted(articleTopics, key=lambda pair: pair[1], reverse=True):
    print("\nScore: {}\t \nTopic: {}".format(score, lda_model.print_topic(index, 10)))

The random generated article's index:  2

Score: 0.8808422684669495	 
Topic: 0.007*"عم��ا" + 0.005*"لوسر" + 0.005*"jurist" + 0.004*"حيج��لاو" + 0.004*"contradict" + 0.004*"conflict" + 0.003*"ام�دحأ" + 0.003*"ن�ضراعتﳌا" + 0.003*"نو�ي" + 0.003*"اذ�و"

Score: 0.11812471598386765	 
Topic: 0.028*"religion" + 0.022*"media" + 0.014*"mediat" + 0.008*"nordic" + 0.008*"book" + 0.007*"studi" + 0.006*"field" + 0.006*"theori" + 0.006*"cultur" + 0.006*"perspect"


### _Evaluating the performance and accuracy of the TF-IDF LDA model, with a randomly chosen article's index.

In [8]:
# Pick one article at random. The upper bound is taken from the corpus itself
# instead of a hard-coded 15, so the index is valid for any number of PDFs.
# (assumes wordsPerArticle is a sized sequence such as a list - it is already
# indexed by integer below)
randomArticle_tfidf = random.randrange(len(wordsPerArticle))
print("The random generated article's index: ", randomArticle_tfidf)

# Ask the TF-IDF LDA model for the topic mixture of that article.
# NOTE(review): the model was trained on tfidfCorpus but is queried here with
# the article's entry from wordsPerArticle - confirm that this is intended.
articleTopics_tfidf = lda_model_tfidf[wordsPerArticle[randomArticle_tfidf]]

# List the topics from the highest to the lowest score, each described by its
# 10 highest-weighted words.
for index, score in sorted(articleTopics_tfidf, key=lambda pair: pair[1], reverse=True):
    print("\nScore: {}\t \nTopic: {}".format(score, lda_model_tfidf.print_topic(index, 10)))

The random generated article's index:  7

Score: 0.9302291870117188	 
Topic: 0.001*"land" + 0.000*"grassland" + 0.000*"map" + 0.000*"1932" + 0.000*"spatial" + 0.000*"topograph" + 0.000*"1954" + 0.000*"reconstruct" + 0.000*"settlement" + 0.000*"unus"

Score: 0.055066995322704315	 
Topic: 0.001*"pm2.5" + 0.001*"health" + 0.001*"cancer" + 0.001*"lung" + 0.001*"agent" + 0.001*"biolog" + 0.001*"pest" + 0.000*"hospit" + 0.000*"household" + 0.000*"chronic"

Score: 0.010687386617064476	 
Topic: 0.001*"navig" + 0.001*"channel" + 0.001*"gray" + 0.000*"index" + 0.000*"gray-fuzzi" + 0.000*"first-level" + 0.000*"weight" + 0.000*"evalu" + 0.000*"set-valu" + 0.000*"safeti"


## _Acknowledgements and citations

#### _To build our topic modelling tool for the final project of this course, Susan Li's article and GitHub repository were a great help. We implemented some of her ideas and methods - such as the lemmatizing and stemming of words - in our program, and followed the pipeline of her program.
#### _The article: https://towardsdatascience.com/topic-modeling-and-latent-dirichlet-allocation-in-python-9bf156893c24
#### _The repository: https://github.com/susanli2016/NLP-with-Python/blob/master/LDA_news_headlines.ipynb

#### _Every other part of the project was coded by ourselves, using only the documentation of the imported modules.


### _Universiteit van Amsterdam
### _Digital Humanities Lab - WG04
### _Final project

#### _This Python program for the project was coded entirely by Tamás Molnár

