In [1]:
import pandas as pd
import nltk;
from tqdm import tqdm

In [2]:
import re
import numpy as np
import pandas as pd
from pprint import pprint

# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

# spacy for lemmatization
import spacy

# Plotting tools
#import pyLDAvis
#import pyLDAvis.gensim  # don't skip this
#import matplotlib.pyplot as plt
#%matplotlib inline

# Enable logging for gensim - optional
# At INFO level gensim writes dictionary-building and LDA training progress
# into the cell output, formatted as "timestamp : level : message".
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
# Also set the root logger's level directly, in addition to basicConfig above.
logging.root.level = logging.INFO
import warnings
# Hide DeprecationWarning messages for the rest of the session.
warnings.filterwarnings("ignore",category=DeprecationWarning)

In [3]:
# Fetch the NLTK resources used further down: the stop-word lists read by
# stopwords.words('english') and the Punkt sentence models that
# sent_tokenize() relies on. Without the second download a fresh machine
# fails at the sentence-splitting step.
# NOTE(review): recent NLTK releases may ask for 'punkt_tab' instead of
# 'punkt' - confirm against the installed version.
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /home/ubuntu/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [4]:
# Load only the first 50,000 reviews. Passing nrows lets pandas stop parsing
# after that many rows instead of reading the whole CSV and discarding the rest.
df = pd.read_csv("restaurants_nc.csv", nrows=50000)

In [5]:
# Preview the first rows to check which columns were loaded.
df.head()

Unnamed: 0,review_id,user_id,business_id,stars,date,text,useful,funny,cool
0,uBEfxb94q-wuWhpCixW0-A,v1kej913GYIN8FUSrkQ5hQ,LPhFORzrfa5_c6o2YXLBUA,1,2016-02-24,If I could give this less than a star I would....,0,0,0
1,bYu_NqO6gHzZYWjI7YLwgQ,dgsD6rM5wC3lsP46-yuedw,GqfGoFMC3cn_AZI7iUOWKg,1,2016-03-16,Terrible terrible service. I waited 40 mins fo...,0,0,0
2,jJ2Nhfa5MJp1F3kjL2KHMQ,GH7V7DMCF2RIsJAjvIHqUQ,ZdFl7MqnF09dcifG1j7l_w,1,2016-03-24,If I could give this particular Applebee's 0 s...,0,0,0
3,wdrPJmXQMvsEGAbsCeyoXA,fYBkpsjPhxXhkxD2ep4Vyg,Ch7NAhB_MWSDwcNbcptEKg,1,2016-05-16,Went here for my birthday and was so disappoin...,0,0,0
4,1O5J04Hvt_JlRd_kRbH3Pw,Pm-f2MVlywZUM6mqU88O_Q,zaLFPpm-t89qSDOaFVgwDQ,1,2016-05-24,They offer a Kobe Beef burger on their menu fo...,0,0,0


In [6]:
# NLTK Stop words
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize
# English stop-word list; remove_stopwords() filters tokens against it.
stop_words = stopwords.words('english')

In [7]:
# Pull the review bodies out of the frame as a plain Python list.
reviews = df['text'].tolist()


In [8]:
def review_to_words(reviews):
    """Lazily tokenise reviews, one sentence at a time.

    Each review is split into sentences with ``sent_tokenize`` and every
    sentence is passed through ``gensim.utils.simple_preprocess`` with
    ``deacc=True`` (accent stripping, per gensim's documentation).  One token
    list is yielded per sentence rather than per review.  Progress is
    reported per review through ``tqdm``.
    """
    for text in tqdm(reviews):
        yield from (
            gensim.utils.simple_preprocess(str(chunk), deacc=True)
            for chunk in sent_tokenize(text)
        )

# Materialise the generator: one token list per sentence across all reviews.
review_sentences = list(review_to_words(reviews))

100%|██████████| 50000/50000 [00:23<00:00, 2095.42it/s]


In [9]:
def remove_stopwords(texts):
    """Drop stop words from every document in ``texts``.

    Each document is converted with ``str()`` and re-tokenised by
    ``simple_preprocess`` before filtering, so a document may be a raw string
    or an already-tokenised list.

    Parameters
    ----------
    texts : iterable
        Documents to filter.

    Returns
    -------
    list of list of str
        One list of surviving tokens per input document, in input order.
    """
    # Build the lookup once: set membership is constant-time, whereas scanning
    # the module-level ``stop_words`` list for every single token is not.
    stop_set = frozenset(stop_words)
    return [
        [word for word in simple_preprocess(str(doc)) if word not in stop_set]
        for doc in texts
    ]


def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """Lemmatise tokenised texts, keeping only selected parts of speech.

    Every token list is joined with single spaces, run through the
    module-level spaCy pipeline ``nlp``, and reduced to the lemmas of the
    tokens whose ``pos_`` is in ``allowed_postags``.
    Tag names: https://spacy.io/api/annotation

    Returns a list holding one list of lemmas per input text, in input order.
    """
    return [
        [tok.lemma_ for tok in nlp(" ".join(words)) if tok.pos_ in allowed_postags]
        for words in tqdm(texts)
    ]

In [10]:
# Remove Stop Words
# Input is the per-sentence token lists; output keeps one list per sentence.
review_words_nostops = remove_stopwords(review_sentences)


In [11]:
# Load the English spaCy pipeline by its full package name; the bare 'en'
# shortcut is no longer accepted by spaCy v3 and later. The parser and NER
# components are disabled because lemmatization() only reads lemma_ and pos_.
# NOTE(review): assumes the 'en' shortcut pointed at en_core_web_sm - confirm
# which English model this environment had linked.
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

In [12]:
# Lemmatise every stop-word-filtered sentence, keeping nouns, adjectives,
# verbs and adverbs. Needs `nlp` from the previous cell.
review_lemmatized = lemmatization(review_words_nostops, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

100%|██████████| 330134/330134 [24:56<00:00, 220.62it/s]


In [13]:
# Spot-check: first three sentences after stop-word removal.
review_words_nostops[0:3]

[['could', 'give', 'less', 'star', 'would'],
 ['week', 'ago', 'coworkers', 'mine', 'went', 'brazwells'],
 ['working',
  'service',
  'industry',
  'expect',
  'get',
  'exceptional',
  'service',
  'night',
  'expect',
  'get',
  'kind',
  'service']]

In [14]:
# Spot-check: the same three sentences after lemmatisation.
review_lemmatized[0:3]

[['could', 'give', 'less', 'star', 'would'],
 ['week', 'ago', 'coworker', 'go', 'brazwell'],
 ['work',
  'service',
  'industry',
  'expect',
  'get',
  'exceptional',
  'service',
  'night',
  'expect',
  'get',
  'kind',
  'service']]

In [15]:
# Alias only (no copy): the modelling cells below refer to the lemmatised
# sentences as `data_lemmatized`.
data_lemmatized = review_lemmatized

In [None]:
# Map every distinct lemma to an integer id.
id2word = corpora.Dictionary(data_lemmatized)

# The lemmatised sentences are the documents of the corpus.
texts = data_lemmatized

# Bag-of-words encoding: each document becomes a list of (token_id, count) pairs.
corpus = list(map(id2word.doc2bow, texts))

# Show the encoding of the first document.
print(corpus[:1])

2018-04-30 12:04:26,746 : INFO : adding document #0 to Dictionary(0 unique tokens: [])
2018-04-30 12:04:26,917 : INFO : adding document #10000 to Dictionary(5604 unique tokens: ['could', 'give', 'less', 'star', 'would']...)
2018-04-30 12:04:27,069 : INFO : adding document #20000 to Dictionary(7957 unique tokens: ['could', 'give', 'less', 'star', 'would']...)
2018-04-30 12:04:27,242 : INFO : adding document #30000 to Dictionary(9610 unique tokens: ['could', 'give', 'less', 'star', 'would']...)
2018-04-30 12:04:27,390 : INFO : adding document #40000 to Dictionary(10779 unique tokens: ['could', 'give', 'less', 'star', 'would']...)
2018-04-30 12:04:27,535 : INFO : adding document #50000 to Dictionary(12063 unique tokens: ['could', 'give', 'less', 'star', 'would']...)
2018-04-30 12:04:27,690 : INFO : adding document #60000 to Dictionary(13007 unique tokens: ['could', 'give', 'less', 'star', 'would']...)
2018-04-30 12:04:27,859 : INFO : adding document #70000 to Dictionary(13885 unique token

[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1)]]


In [None]:
# Development-time lookup of the LdaModel source. Left commented out so that
# "Run All" does not stop to open the IPython pager; uncomment to inspect.
# ??gensim.models.ldamodel.LdaModel

In [None]:
# Build LDA model
# Online LDA with 10 topics: 10 passes over the corpus, with the model updated
# after every chunk of 100 documents (update_every=1, chunksize=100).
# alpha='auto' lets gensim tune the document-topic prior during training, and
# random_state fixes the seed.
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=10, 
                                           random_state=100,
                                           update_every=1,
                                           chunksize=100,
                                           passes=10,
                                           alpha='auto',
                                           per_word_topics=True)

2018-04-30 12:04:35,367 : INFO : using autotuned alpha, starting with [0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1]
2018-04-30 12:04:35,368 : INFO : using symmetric eta at 0.1
2018-04-30 12:04:35,376 : INFO : using serial LDA version on this node
2018-04-30 12:04:35,428 : INFO : running online (multi-pass) LDA training, 10 topics, 10 passes over the supplied corpus of 330134 documents, updating model once every 100 documents, evaluating perplexity every 1000 documents, iterating 50x with a convergence threshold of 0.001000
2018-04-30 12:04:35,429 : INFO : PROGRESS: pass 0, at document #100/330134
2018-04-30 12:04:35,474 : INFO : optimized alpha [0.09297006, 0.08605023, 0.08498056, 0.084059194, 0.08715798, 0.07797873, 0.093827076, 0.08468829, 0.08994305, 0.089433916]
2018-04-30 12:04:35,481 : INFO : merging changes from 100 documents into a model of 330134 documents
2018-04-30 12:04:35,543 : INFO : topic #5 (0.078): 0.217*"chicken" + 0.072*"order" + 0.072*"server" + 0.072*"tell" + 

In [None]:
# Persist the trained model to 'lda.model' in the working directory.
lda_model.save('lda.model')

In [None]:
# Development-time docstring lookup. Left commented out so that "Run All"
# does not stop to open the IPython pager; uncomment to inspect.
# ?lda_model.show_topics

In [None]:
# Display up to 10 topics with the 10 highest-weighted words of each.
lda_model.show_topics( num_topics=10, num_words=10)

In [None]:
# Compute Perplexity
# log_perplexity() returns the per-word likelihood bound (a log-scale value),
# not the perplexity itself; gensim defines perplexity as 2 ** (-bound).
# A higher bound, i.e. a lower perplexity, indicates a better fit.
per_word_bound = lda_model.log_perplexity(corpus)
print('\nPer-word likelihood bound: ', per_word_bound)
print('Perplexity: ', 2 ** (-per_word_bound))

# Compute Coherence Score
coherence_model_lda = CoherenceModel(model=lda_model, texts=data_lemmatized, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)


In [None]:
# Make sure the target directory exists before writing; nothing earlier in
# the notebook creates models/lda/, and pickling into a missing directory fails.
import os
os.makedirs('models/lda', exist_ok=True)

# Persist the model together with the corpus and dictionary it was trained on.
gensim.utils.pickle(lda_model,'models/lda/lda_model.pkl')
gensim.utils.pickle(corpus,'models/lda/corpus.pkl')
gensim.utils.pickle(id2word,'models/lda/id2word.pkl')

In [None]:
# Reload the three objects from the pickle files under models/lda/.
# NOTE(review): unpickling can execute code embedded in the file; only load
# pickles that this notebook itself produced.
lda_model = gensim.utils.unpickle('models/lda/lda_model.pkl')
corpus    = gensim.utils.unpickle('models/lda/corpus.pkl')
id2word   = gensim.utils.unpickle('models/lda/id2word.pkl')

In [None]:
# Interactive topic visualisation of the LDA model.
# NOTE(review): newer pyLDAvis releases expose this module as
# pyLDAvis.gensim_models - confirm against the installed version.
import pyLDAvis.gensim as gensimvis
import pyLDAvis
# Build the visualisation data from the model, corpus and dictionary, then render it inline.
vis_data = gensimvis.prepare(lda_model, corpus, id2word)
pyLDAvis.display(vis_data)


In [None]:
# Create the HDP output directory (parents included, no error if it already
# exists) with the standard library instead of a shell call, so the cell does
# not depend on a POSIX `mkdir` being available.
import os
os.makedirs('models/hdp/', exist_ok=True)

In [None]:
# Fit a Hierarchical Dirichlet Process model on the same bag-of-words corpus
# (T=10), then persist it next to the other saved models.
hdp = gensim.models.hdpmodel.HdpModel(corpus=corpus, id2word=id2word, T=10)
hdp.save('models/hdp/hdp_model.pkl')

In [None]:
# Same visualisation for the HDP model. Relies on `gensimvis` / `pyLDAvis`
# imported in the LDA visualisation cell, and overwrites `vis_data`.
vis_data = gensimvis.prepare(hdp, corpus, id2word)
pyLDAvis.display(vis_data)

In [28]:
def compute_coherence_values(dictionary, corpus, texts, limit, start=2, step=3):
    """
    Compute c_v coherence for various number of topics

    One LDA model is trained for every topic count in
    ``range(start, limit, step)`` and scored with a c_v CoherenceModel.

    Parameters:
    ----------
    dictionary : Gensim dictionary
    corpus : Gensim corpus
    texts : List of input texts
    limit : Max num of topics (exclusive upper bound of the range)
    start : First number of topics to try
    step : Increment between successive numbers of topics

    Returns:
    -------
    model_list : List of LDA topic models
    coherence_values : Coherence values corresponding to the LDA model with respective number of topics
    """
    coherence_values = []
    model_list = []
    for num_topics in tqdm(range(start, limit, step)):
        # Train with the dictionary that was passed in. Reading the
        # notebook-level `id2word` here would ignore the argument and tie the
        # function to kernel state.
        model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                                id2word=dictionary,
                                                num_topics=num_topics,
                                                random_state=100,
                                                update_every=1,
                                                chunksize=1000,
                                                passes=10,
                                                alpha='auto',
                                                per_word_topics=True)

        model_list.append(model)
        coherencemodel = CoherenceModel(model=model, texts=texts, dictionary=dictionary, coherence='c_v')
        coherence_values.append(coherencemodel.get_coherence())

    return model_list, coherence_values

In [None]:
# Can take a long time to run.
# Trains one LDA model for each topic count in range(2, 20, 2) and scores each
# with c_v coherence.
model_list, coherence_values = compute_coherence_values(dictionary=id2word, corpus=corpus, texts=data_lemmatized, start=2, limit=20, step=2)

  0%|          | 0/9 [00:00<?, ?it/s]2018-04-30 23:21:10,465 : INFO : using autotuned alpha, starting with [0.5, 0.5]
2018-04-30 23:21:10,467 : INFO : using symmetric eta at 0.5
2018-04-30 23:21:10,473 : INFO : using serial LDA version on this node
2018-04-30 23:21:10,484 : INFO : running online (multi-pass) LDA training, 2 topics, 10 passes over the supplied corpus of 330134 documents, updating model once every 1000 documents, evaluating perplexity every 10000 documents, iterating 50x with a convergence threshold of 0.001000
2018-04-30 23:21:10,486 : INFO : PROGRESS: pass 0, at document #1000/330134
2018-04-30 23:21:11,071 : INFO : optimized alpha [0.696585, 0.6938135]
2018-04-30 23:21:11,072 : INFO : merging changes from 1000 documents into a model of 330134 documents
2018-04-30 23:21:11,083 : INFO : topic #0 (0.697): 0.020*"get" + 0.017*"order" + 0.016*"go" + 0.014*"come" + 0.014*"wait" + 0.013*"good" + 0.012*"food" + 0.012*"minute" + 0.012*"back" + 0.010*"place"
2018-04-30 23:21:11

In [None]:
# Topic counts that were evaluated. `x` was never defined in the notebook, so
# this cell raised NameError; the range must mirror the start/limit/step
# passed to compute_coherence_values() (start=2, limit=20, step=2).
x = range(2, 20, 2)

# Print the coherence scores
for m, cv in zip(x, coherence_values):
    print("Num Topics =", m, " has Coherence Value of", round(cv, 4))

In [None]:
# Select the model and print the topics
# Index 3 is the fourth model trained; with topic counts range(2, 20, 2) that
# is the 8-topic model. The index is hard-coded, not derived from the scores.
optimal_model = model_list[3]
# NOTE(review): model_topics is not used again in the visible cells.
model_topics = optimal_model.show_topics(formatted=False)
pprint(optimal_model.print_topics(num_words=10))