In [1]:
import pandas as pd
import nltk;
from tqdm import tqdm

In [39]:
import pyLDAvis
import pyLDAvis.sklearn
pyLDAvis.enable_notebook()

In [2]:
import re
import numpy as np
import pandas as pd
from pprint import pprint

# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

# spacy for lemmatization
import spacy

# Plotting tools
#import pyLDAvis
#import pyLDAvis.gensim  # don't skip this
import matplotlib.pyplot as plt
#%matplotlib inline

# Enable logging for gensim - optional
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)
logging.root.level = logging.ERROR
import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)

In [3]:
df = pd.read_csv("restaurants_nc.csv").head(50000)

In [4]:
# NLTK Stop words
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize
stop_words = stopwords.words('english')

In [5]:
reviews = list(df['text'])

In [6]:
def review_to_words(reviews):
    for review in tqdm(reviews):
        for sentence in sent_tokenize(review):
            res = gensim.utils.simple_preprocess(str(sentence), deacc=True)  # deacc=True removes punctuations
            yield " ".join(res)

review_sentences = list(review_to_words(reviews))

100%|██████████| 50000/50000 [00:23<00:00, 2148.94it/s]


In [7]:
documents = review_sentences

In [8]:
documents[0:2]

['if could give this less than star would',
 'about week ago coworkers of mine and myself went in to brazwells']

In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer


In [10]:


# NMF is able to use tf-idf
tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=10, stop_words='english')
tfidf = tfidf_vectorizer.fit_transform(documents)
tfidf_feature_names = tfidf_vectorizer.get_feature_names()



In [11]:
# LDA can only use raw term counts for LDA because it is a probabilistic graphical model
tf_vectorizer = CountVectorizer(max_df=0.95, min_df=10, stop_words='english')
tf = tf_vectorizer.fit_transform(documents)
tf_feature_names = tf_vectorizer.get_feature_names()

In [12]:
from sklearn.decomposition import NMF, LatentDirichletAllocation



In [15]:
no_topics = 10

# Run NMF
nmf = NMF(n_components=no_topics, random_state=1, alpha=.1
          , l1_ratio=.5, init='nndsvd', max_iter=2000, verbose=1).fit(tfidf)


violation: 1.0
violation: 0.38424632875444403
violation: 0.15280355799430018
violation: 0.07279311467850261
violation: 0.05745231521106491
violation: 0.052806936910882195
violation: 0.049950656516617155
violation: 0.047966735630536404
violation: 0.04634241496486815
violation: 0.04524966560902011
violation: 0.04460339499054783
violation: 0.043893038638914035
violation: 0.04236775514960409
violation: 0.04126013112594195
violation: 0.0404913265757598
violation: 0.04003222148931994
violation: 0.03971732530760909
violation: 0.039666167455954714
violation: 0.03928406013313108
violation: 0.03791765933931086
violation: 0.03472224477776626
violation: 0.03113584985743151
violation: 0.028569657024812532
violation: 0.026317750979084047
violation: 0.02445384599405364
violation: 0.022904596308639013
violation: 0.021610411768164292
violation: 0.020541001511041365
violation: 0.019667677600579074
violation: 0.018951847015464433
violation: 0.018355771190815336
violation: 0.017818360582633717
violation: 

violation: 0.0002648180319054424
violation: 0.0002594853135326295
violation: 0.00025425953524130484
violation: 0.00024913867240498134
violation: 0.0002441205246292624
violation: 0.00023920313221897127
violation: 0.0002343845854704804
violation: 0.00022966281987346683
violation: 0.00022503588912845899
violation: 0.00022050200585869585
violation: 0.00021605926860420173
violation: 0.00021170588064309357
violation: 0.0002074400402839353
violation: 0.00020326012376341873
violation: 0.00019916429918679404
violation: 0.00019515082791497166
violation: 0.00019121818479636977
violation: 0.00018736470111080535
violation: 0.00018358873683340022
violation: 0.00017988884699649823
violation: 0.00017626354844138824
violation: 0.00017271130337888734
violation: 0.00016923057431964304
violation: 0.0001658201169622515
violation: 0.00016247835557609754
violation: 0.0001592038989239264
violation: 0.00015599537498840272
violation: 0.00015285163075768948
violation: 0.0001497712567713683
violation: 0.000146753

In [16]:
# Run LDA
lda = LatentDirichletAllocation(n_topics=no_topics
                                , n_jobs=-1
                                , verbose =1
                                , max_iter=10, learning_method='online', learning_offset=50.,random_state=0).fit(tf)


iteration: 1 of max_iter: 10
iteration: 2 of max_iter: 10
iteration: 3 of max_iter: 10
iteration: 4 of max_iter: 10
iteration: 5 of max_iter: 10
iteration: 6 of max_iter: 10
iteration: 7 of max_iter: 10
iteration: 8 of max_iter: 10
iteration: 9 of max_iter: 10
iteration: 10 of max_iter: 10


In [19]:
import pickle
from sklearn.externals import joblib

In [21]:
lda_filename = 'lda_topics=10_iter=10.pkl'
joblib.dump(lda,lda_filename ) 

['lda_topics=10_iter=10.pkl']

In [24]:
!gsutil cp lda_topics=10_iter=10.pkl gs://np-training-public/

Copying file://lda_topics=10_iter=10.pkl [Content-Type=application/octet-stream]...
/ [1 files][  1.2 MiB/  1.2 MiB]                                                
Operation completed over 1 objects/1.2 MiB.                                      


In [None]:
lda.sa

In [25]:
def display_topics(model, feature_names, no_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print ("Topic %d:" % (topic_idx) )
        print (" ".join([feature_names[i]
                        for i in topic.argsort()[:-no_top_words - 1:-1]]) )

no_top_words = 10
display_topics(nmf, tfidf_feature_names, no_top_words)


Topic 0:
great atmosphere experience selection beer spot prices location lunch overall
Topic 1:
good really pretty overall selection beer pizza experience price prices
Topic 2:
place recommend amazing awesome nice eat try like really favorite
Topic 3:
food amazing excellent fresh awesome quality fast ok decent tasty
Topic 4:
service excellent customer slow fast quick friendly terrible bad horrible
Topic 5:
definitely recommend come ll try return going coming worth highly
Topic 6:
time just best ve like chicken order try restaurant really
Topic 7:
delicious fresh absolutely chicken salad pizza ordered fries hot cheese
Topic 8:
friendly staff nice wait attentive super helpful atmosphere clean fast
Topic 9:
love absolutely atmosphere restaurant location pizza coming chicken place decor


In [27]:
display_topics(lda, tf_feature_names, no_top_words)

Topic 0:
restaurant just sauce dinner make bad tried small wings worth
Topic 1:
chicken ordered wait fresh definitely people fried shrimp got rice
Topic 2:
food order come night minutes came hot drink tasty spot
Topic 3:
great service salad eat atmosphere beer awesome new stars selection
Topic 4:
love lunch did table went server got way didn right
Topic 5:
time best ve charlotte pretty place times going visit long
Topic 6:
food really try friendly staff service better experience like place
Topic 7:
delicious little food area bar sure sandwich ll day lot
Topic 8:
good like food cheese meal fries just flavor sweet chicken
Topic 9:
place menu don pizza nice amazing burger recommend favorite know


In [32]:
?pyLDAvis.sklearn.prepare(lda, tf, tf_vectorizer, mds='tsne')


In [37]:
import pyLDAvis.sklearn

In [40]:

pyLDAvis.sklearn.sklearn(lda, tf, tf_vectorizer, mds='tsne')


AttributeError: module 'pyLDAvis.sklearn' has no attribute 'sklearn'

In [50]:
def get_dominant_topic(model,data_vectorized,data):
    # Create Document - Topic Matrix
    lda_output = model.transform(data_vectorized)

    # column names
    topicnames = ["Topic" + str(i) for i in range(model.n_topics)]

    # index names
    docnames = ["Doc" + str(i) for i in range(len(data))]

    # Make the pandas dataframe
    df_document_topic = pd.DataFrame(np.round(lda_output, 2), columns=topicnames, index=docnames)

    # Get dominant topic for each document
    dominant_topic = np.argmax(df_document_topic.values, axis=1)
    df_document_topic['dominant_topic'] = dominant_topic

    # Styling
    def color_green(val):
        color = 'green' if val > .1 else 'black'
        return 'color: {col}'.format(col=color)

    def make_bold(val):
        weight = 700 if val > .1 else 400
        return 'font-weight: {weight}'.format(weight=weight)

    # Apply Style
    #df_document_topics = df_document_topic.head(15).style.applymap(color_green).applymap(make_bold)
    #df_document_topics
    
    return df_document_topic


In [51]:
top_docs_df = get_dominant_topic(model=lda, data_vectorized = tf, data=review_sentences)


In [52]:
top_docs_df.head()

Unnamed: 0,Topic0,Topic1,Topic2,Topic3,Topic4,Topic5,Topic6,Topic7,Topic8,Topic9,dominant_topic
Doc0,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.55,9
Doc1,0.02,0.02,0.02,0.02,0.42,0.02,0.02,0.02,0.42,0.02,4
Doc2,0.01,0.01,0.08,0.55,0.16,0.01,0.01,0.01,0.01,0.16,3
Doc3,0.01,0.01,0.91,0.01,0.01,0.01,0.01,0.01,0.01,0.01,2
Doc4,0.01,0.01,0.01,0.01,0.92,0.01,0.01,0.01,0.01,0.01,4


In [102]:
def get_most_relevant_docs( top_docs_df ,review_sentences,topic=0):
    topic_name = f"Topic{topic}"
    _tdf = top_docs_df.sort_values([topic_name], ascending = False).head(10)
    top_docs = list(_tdf.index.get_values())
    top_docs = [ int(t.replace("Doc","")) for t in top_docs]
    
    relevant_reviews = []
    
    for d_idx in top_docs:
        relevant_reviews.append(review_sentences[d_idx])
    return relevant_reviews

In [110]:
top_reviews = get_most_relevant_docs(top_docs_df,review_sentences,topic=7)
for sent in top_reviews:
    print (sent+"\n")

mal servicio no se los recomiendo es siempre que venido veces no es solo hoy pence que havian mejorado

the only positives we found were accessible parking in noda and neatly designed restrooms

one little lot the valet parking is only for tupelo honey

first and last time coming here they have dj but no dance floor

it was very delicious smooth and buttery but am still on the fence as to if it was delicious

the food was also very delicious and reasonably priced

this is perfect little neighborhood bar

food was delicious and reasonably priced

the food is certainly reasonably priced

for sure we ll be back and dine in next time



In [89]:
list(_t_df.index.get_values())

['Doc279050',
 'Doc69761',
 'Doc206080',
 'Doc148797',
 'Doc161366',
 'Doc8135',
 'Doc116692',
 'Doc214913',
 'Doc203904',
 'Doc304534']

In [54]:
top_docs_df.sort_values(['Topic9'], ascending = False)

Unnamed: 0,Topic0,Topic1,Topic2,Topic3,Topic4,Topic5,Topic6,Topic7,Topic8,Topic9,dominant_topic
Doc80,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.89,9
Doc150423,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.87,9
Doc299399,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.87,9
Doc56840,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.87,9
Doc240177,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.87,9
Doc42873,0.02,0.02,0.02,0.02,0.02,0.02,0.02,0.02,0.02,0.85,9
Doc255,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.08,0.85,9
Doc158448,0.02,0.02,0.02,0.02,0.02,0.02,0.02,0.02,0.02,0.85,9
Doc487,0.02,0.02,0.02,0.02,0.02,0.02,0.02,0.02,0.02,0.85,9
Doc1545,0.02,0.02,0.02,0.02,0.02,0.02,0.02,0.02,0.02,0.85,9


In [None]:
top_docs_df.head()

In [None]:
# Log Likelyhood: Higher the better
print("Log Likelihood: ", lda.score(tf))

# Perplexity: Lower the better. Perplexity = exp(-1. * log-likelihood per word)
print("Perplexity: ", lda.perplexity(tf))

# See model parameters
pprint(lda_model.get_params())

In [None]:
# Define Search Param
search_params = {'n_components': [2,3, 5, 6,7,8,9,10], 'learning_decay': [.5, .7, .9]}

# Init the Model
lda = LatentDirichletAllocation()

# Init Grid Search Class
model = GridSearchCV(lda, param_grid=search_params)

# Do the Grid Search
model.fit(data_vectorized)

In [None]:
# Best Model
best_lda_model = model.best_estimator_

# Model Parameters
print("Best Model's Params: ", model.best_params_)

# Log Likelihood Score
print("Best Log Likelihood Score: ", model.best_score_)

# Perplexity
print("Model Perplexity: ", best_lda_model.perplexity(data_vectorized))

In [None]:
# Get Log Likelyhoods from Grid Search Output
n_topics = [2,3, 5, 6,7,8,9,10]
log_likelyhoods_5 = [round(gscore.mean_validation_score) for gscore in model.grid_scores_ if gscore.parameters['learning_decay']==0.5]
log_likelyhoods_7 = [round(gscore.mean_validation_score) for gscore in model.grid_scores_ if gscore.parameters['learning_decay']==0.7]
log_likelyhoods_9 = [round(gscore.mean_validation_score) for gscore in model.grid_scores_ if gscore.parameters['learning_decay']==0.9]

# Show graph
plt.figure(figsize=(12, 8))
plt.plot(n_topics, log_likelyhoods_5, label='0.5')
plt.plot(n_topics, log_likelyhoods_7, label='0.7')
plt.plot(n_topics, log_likelyhoods_9, label='0.9')
plt.title("Choosing Optimal LDA Model")
plt.xlabel("Num Topics")
plt.ylabel("Log Likelyhood Scores")
plt.legend(title='Learning decay', loc='best')
plt.show()

In [None]:
import pyLDAvis

In [30]:
pyLDAvis.enable_notebook()
panel = pyLDAvis.sklearn.prepare(best_lda_model, data_vectorized, vectorizer, mds='tsne')
panel

NameError: name 'best_lda_model' is not defined

In [None]:
# Topic-Keyword Matrix
df_topic_keywords = pd.DataFrame(best_lda_model.components_)

# Assign Column and Index
df_topic_keywords.columns = vectorizer.get_feature_names()
df_topic_keywords.index = topicnames

# View
df_topic_keywords.head()

In [None]:
# https://www.machinelearningplus.com/nlp/topic-modeling-python-sklearn-examples/