In [None]:
## Vaishu Myadam (vmyadam1208@gmail.com)
## January - May, 2020

In [None]:
import glob, os
import nltk
from nltk.corpus import stopwords 
from nltk.stem.wordnet import WordNetLemmatizer
import string
import re
import gensim
from gensim import corpora
import pyLDAvis.gensim
import warnings

In [None]:
# Read every .txt file found (recursively) under the current working directory.
# glob returns paths in arbitrary, filesystem-dependent order, so the paths are
# sorted: a fixed document order keeps everything derived from this list
# comparable from one run (or machine) to the next.

doc_complete = []
for path in sorted(glob.glob('**/*.txt', recursive=True)):
    # NOTE(review): open() uses the platform default encoding here — confirm
    # the corpus encoding and pass it explicitly if it is known.
    with open(path) as handle:
        doc_complete.append(handle.read())
print(len(doc_complete))

In [None]:
# Resources used for text cleaning: NLTK corpora, the punctuation set,
# the stop-word set and a lemmatizer.

for resource in ('stopwords', 'wordnet'):
    nltk.download(resource)

exclude = set(string.punctuation)

# Note: actual words in 'new_words' were replaced with dummy words due to privacy reasons
new_words = ('word1', 'word2', 'word3', 'word4')

# English stop words, extended with the project-specific words above.
stop = set(stopwords.words('english'))
stop.update(new_words)

lemma = WordNetLemmatizer()  # single lemmatizer instance reused for every document

def clean(doc):
    """Normalize one raw document into a space-separated string of tokens.

    Steps, in order:
      1. lower-case the text and split it on whitespace;
      2. drop stop words, testing each token both as written and with its
         leading/trailing punctuation trimmed, so "the," or "(and)" are
         removed as well as "the" and "and";
      3. delete every character contained in ``exclude``;
      4. lemmatize each remaining token with ``lemma``;
      5. blank out standalone numbers and words of one or two characters.

    Args:
        doc: Raw document text.

    Returns:
        The cleaned text as one string. Removed words can leave extra spaces
        behind, so callers should ``.split()`` the result.
    """
    kept = []
    for token in doc.lower().split():
        # A stop word glued to punctuation ("the.", "and,") fails a plain
        # membership test and would survive once the punctuation is stripped
        # below. The untrimmed test is kept as well so that every token the
        # plain test used to remove is still removed.
        if token in stop or token.strip(string.punctuation) in stop:
            continue
        kept.append(token)
    stop_free = " ".join(kept)
    punc_free = ''.join(ch for ch in stop_free if ch not in exclude)
    normalized = " ".join(lemma.lemmatize(word) for word in punc_free.split())
    normalized = re.sub(r"\b\d+\b", " ", normalized)  # standalone numbers only; digits inside words are kept
    normalized = re.sub(r'\b\w{1,2}\b', '', normalized)  # drop words of one or two characters
    return normalized

# Clean every document, then split the cleaned text into its list of tokens.
doc_clean = [cleaned.split() for cleaned in map(clean, doc_complete)]

In [None]:
# Build the token dictionary from the cleaned documents, then convert each
# document to its bag-of-words representation.
dictionary = corpora.Dictionary(doc_clean)
doc_term_matrix = list(map(dictionary.doc2bow, doc_clean))

In [None]:
# LDA model
#
# LDA training is stochastic: without a fixed seed, every Restart & Run All
# produces different topics. Pinning random_state makes the fit repeatable.

RANDOM_SEED = 42

Lda = gensim.models.ldamodel.LdaModel
ldamodel = Lda(
    doc_term_matrix,
    num_topics=15,
    id2word=dictionary,
    passes=50,
    random_state=RANDOM_SEED,
)

In [None]:
# Raw dump of all 15 topics, 10 words each.
topic_summaries = ldamodel.print_topics(num_topics=15, num_words=10)
print(topic_summaries)

In [None]:
# One topic per paragraph: 15 topics, 10 words each.

topic_rows = ldamodel.show_topics(num_topics=15, num_words=10, formatted=True)
for topic_id, description in topic_rows:
    # The extra newline leaves a blank line between consecutive topics.
    print(f"{topic_id}: {description}", end="\n\n")

In [None]:
# Visualization setup: switch pyLDAvis to notebook display mode.

pyLDAvis.enable_notebook()

# From here on, DeprecationWarning messages are ignored.
warnings.filterwarnings(action="ignore", category=DeprecationWarning)

In [None]:
# Build the interactive topic visualization from the trained model, the
# bag-of-words corpus it was fitted on (doc_term_matrix) and its dictionary.
pyLDAvis.gensim.prepare(ldamodel, doc_term_matrix, dictionary)
