# Natural Language Processing Group Project
## Topic Modelling - Latent Dirchelet Allocation Implementation

Name: Zoe Tagboto

A list of the necessary libraries necessary to run this implementation

In [1]:
import numpy as np 
import re 
import string
import pandas as pd
from nltk.corpus import stopwords
import spacy
from pprint import pprint
import codecs
import gensim
from gensim import corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

# Plotting tools
import pyLDAvis
import pyLDAvis.gensim  # don't skip this
import matplotlib.pyplot as plt
%matplotlib inline

# Enable logging for gensim - optional
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)

import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)





Using panda dataframes to read in the Data. This will make our lives way easier.

In [2]:
def read_data():
    question_data = pd.read_csv('Questions.txt', sep='\t', names=['questions'], index_col= None, header = 0 )
    topics_data = pd.read_csv('Topics.txt', sep='\t', names=['topics'], index_col= None, header = 0 )
        
    #After getting the data I combine them into 1 data frame to get questions and topics
    q_t_data = pd.concat([question_data,topics_data],axis=1)#.iloc[:].values
    
    return q_t_data

In [3]:
def clean_data(q_t_data):
    question_data = q_t_data.questions.values.tolist()
    question_data = [re.sub('\s+', ' ', sent) for sent in question_data]
    question_data = [re.sub("\'", "", sent) for sent in question_data]
    
    for sentence in question_data:
        yield(gensim.utils.simple_preprocess(str(sentence),deacc=True))
        

In [4]:
def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """https://spacy.io/api/annotation"""
    texts_out = []
    nlp = spacy.load('en', disable=['parser', 'ner'])
    for sent in texts:
        doc = nlp(" ".join(sent)) 
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return texts_out

In [5]:
def data_normalize(question):
    bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100)
    trigram = gensim.models.Phrases(bigram[data_words], threshold=100) 
    stop_words = stopwords.words('english')
    remove_stop =[[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in question]
    bigram_mod = gensim.models.phrases.Phraser(bigram)
    #trigram_mod = gensim.models.phrases.Phraser(trigram)
    bigrams = [bigram_mod[doc] for doc in remove_stop]
    #trigrams = [trigram_mod[bigram_mod[doc]] for doc in remove_stop]
    
    
    data_lemmatized = lemmatization(bigrams, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])
    

    return data_lemmatized
    
   

In [6]:
def dict_and_corpus(normalized_text):
    id2word = corpora.Dictionary(normalized_text)
    texts = normalized_text
    corpus = [id2word.doc2bow(text) for text in texts]
    
   
    return id2word, corpus
    

In [7]:
def lda_model(id2word, corpus):
    lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=20, 
                                           random_state=100,
                                           update_every=1,
                                           chunksize=100,
                                           passes=10,
                                           alpha='auto',
                                           per_word_topics=True)
    
    #pprint(lda_model.print_topics())
    doc_lda = lda_model[corpus]
    
    return(doc_lda)
    

In [8]:
def model_coherence(corpus, doc_lda, normalized_data, id2word):
    print('\nPerplexity: ', lda_model.log_perplexity(corpus))  # a measure of how good the model is. lower the better.

    coherence_model_lda = CoherenceModel(model=lda_model, texts=data_lemmatized, dictionary=id2word, coherence='c_v')
    coherence_lda = coherence_model_lda.get_coherence()
    print('\nCoherence Score: ', coherence_lda)
    

In [9]:
def visualization(lda_model, corpus, id2word):
    pyLDAvis.enable_notebook()
    vis = pyLDAvis.gensim.prepare(lda_model, corpus, id2word)
    vis

In [11]:
#q_t = read_data()
q_t_data = read_data()
data_words = list(clean_data(q_t_data))
normalized_data = data_normalize(data_words)

id2word, corpus = dict_and_corpus(normalized_data)
doc_lda = lda_model(id2word,corpus)

visualization(lda_model, corpus, id2word)
model_coherence(corpus, doc_lda, normalized_data, id2word)






AttributeError: 'function' object has no attribute 'num_topics'