In [17]:
import pandas as pd
import re
import gensim
from gensim.utils import simple_preprocess
import nltk
import gensim.corpora as corpora

#Get stopword list from file
def stopwords():
	stopwordList =[]
	with open("stopwords.txt",'r') as File:
		for line in File:
			for word in line.split():
				stopwordList.append(word.lower())
	return stopwordList

stop_words = stopwords()
stop_words.extend(['br'])

def sent_to_words(sentences):
    for sentence in sentences:
        # deacc=True removes punctuations
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))

def remove_stopwords(texts):
    return [[word for word in simple_preprocess(str(doc)) 
             if word not in stop_words] for doc in texts]

def remove_wordsgreaterthan(texts,length):
     return [[word for word in simple_preprocess(str(doc)) 
             if len(word)>length] for doc in texts]

    
data = pd.read_csv('FoodReviews.csv',low_memory = False)# Print head
print(data)

# Remove all columns exclusing the text
data.filter(items=['Text']) #'Id','Summary'

#Preprocess data
# Convert the text to lowercase and get rid of nonalphanumerics and get rid of numbers
data['Text'] = data['Text'].map(lambda x: re.sub('[,.!?@$-<>]', '', x))
data['Text'] = data['Text'].map(lambda x: x.lower())

#filter rows based on length of the review and then choose a sample size
data = data.loc[data['Text'].str.len() > 60].sample(1000)
data.filter('Text')
print(data['Text'])

data = data.Text.values.tolist()
data_words = list(sent_to_words(data))
data_words = remove_stopwords(data_words)
data_words = remove_wordsgreaterthan(data_words,2)

#Using Gensim LDA Model
id2word = corpora.Dictionary(data_words)
texts = data_words
corpus = [id2word.doc2bow(text) for text in texts]# View
print(corpus[:1][0][:30])

from pprint import pprint
num_topics = 10
lda_model = gensim.models.LdaMulticore(corpus=corpus,
                                       id2word=id2word,
                                       num_topics=num_topics)# Print the Keyword in the 10 topics
pprint(lda_model.print_topics())


            Id   ProductId          UserId                      ProfileName  \
0            1  B001E4KFG0  A3SGXH7AUHU8GW                       delmartian   
1            2  B00813GRG4  A1D87F6ZCVE5NK                           dll pa   
2            3  B000LQOCH0   ABXLMWJIXXAIN  Natalia Corres "Natalia Corres"   
3            4  B000UA0QIQ  A395BORC6FGVXV                             Karl   
4            5  B006K2ZZ7K  A1UQRSCLF8GW1T    Michael D. Bigham "M. Wassir"   
...        ...         ...             ...                              ...   
568449  568450  B001EO7N10  A28KG5XORO54AY                 Lettie D. Carter   
568450  568451  B003S1WTCU  A3I8AFVPEE8KI5                        R. Sawyer   
568451  568452  B004I613EE  A121AA1GQV751Z                    pksd "pk_007"   
568452  568453  B004I613EE   A3IBEVCTXKNOH          Kathy A. Welch "katwel"   
568453  568454  B001LR2CU2  A3LGQPJCZVL9UC                         srfell17   

        HelpfulnessNumerator  HelpfulnessDenominato

In [18]:
import pyLDAvis.gensim
import pickle 
import pyLDAvis

In [19]:
pyLDAvis.enable_notebook()
p = pyLDAvis.gensim.prepare(lda_model, corpus, id2word)
p