In [1]:
# import packages
import json, re, nltk
import numpy as np
import pandas as pd

import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from nltk.stem.wordnet import WordNetLemmatizer
from tmtoolkit.topicmod.evaluate import metric_coherence_gensim

import pyLDAvis
import pyLDAvis.sklearn
pyLDAvis.enable_notebook()

In [2]:
# Load the review data from the local 'yelp.csv' file and preview the first rows.
# `df` is read as a module-level global by pipeline() further down.
# NOTE(review): the 'Unnamed: 0' column in the preview looks like a row index that
# was written into the CSV — confirm, and consider reading with index_col=0.
df = pd.read_csv('yelp.csv')
df.head()

Unnamed: 0,review_id,user_id,business_id,stars,useful,funny,cool,text,date
0,J4a2TuhDasjn2k3wWtHZnQ,RNm_RWkcd02Li2mKPRe7Eg,xGXzsc-hzam-VArK6eTvtw,1.0,2,0,0,"This place used to be a cool, chill place. Now...",2018-01-21 04:41:03
1,FdoBFTjXXMn4hVnJ59EtiQ,eLAYHxHUutiXswy-CfeiUw,WQFn1A7-UAA4JT5YWiop_w,1.0,0,0,0,"They NEVER seem to get our \norder correct, se...",2017-09-08 23:26:10
2,m1GlqFGIN5eayrbb2IbRZg,B7YSV6r1ePAXc69FkDDuZw,wZgUAuDuEGPEzKK-PsngKQ,1.0,0,0,0,I wish I could give them zero stars. The call ...,2014-06-27 22:06:55
3,ucFOnqgaV40oQ2YNyz5ddQ,JHXQEayrDHOWGexs0dCviA,KXCXaF5qimmtKKqnPc_LQA,1.0,0,0,0,Great coffee and pastries. Baristas are excell...,2018-03-03 23:45:25
4,-QpNdU_p44GR0NcRxDRyNQ,ffJp-ZN80M4sSkDL8Ra18w,WDGeeyeK7bG0cvq_ZglAdA,1.0,0,0,0,Almost desolate restaurant and dingy evironmen...,2009-03-01 01:05:50


In [3]:
# initial stopwords: NLTK's English list, applied by text_process() BEFORE lemmatization
# NOTE(review): needs the NLTK 'stopwords' corpus to be installed locally; run
# nltk.download('stopwords') once if this line raises LookupError.
stopwords = nltk.corpus.stopwords.words('english')
# more stopwords: corpus-specific filler words, applied by text_process() AFTER
# lemmatization, so entries should be given in their lemmatized (verb) form.
# 'nt' is presumably the residue of an "n't" token once text_process() strips the
# apostrophe — verify against the tokenizer output.
extra_stopwords = ['nt', 'good', 'bad', 'tell', 'say', 'ask', 'use', 'come', 'make', 
                   'company', 'stay', 'want', 'know', 'like', 'anymore', 'lot', 'great', 
                   'ok', 'nice', 'dish', 'really', 'look', 'better', 'guy', 'dr', 'try',
                   'let', 'need', 'alright', 'place', 'food', 'buy', 'eat']

# tokenizer used by the TF-IDF vectorizer
def text_process(text):
    """Turn one raw review string into a list of cleaned, lemmatized tokens.

    Steps, in order:
      1. split ``text`` with ``nltk.word_tokenize``;
      2. apply the literal contraction replacements, then delete every character
         that is not an ASCII letter, digit or space;
      3. keep the lower-cased token only if it is not in the module-level
         ``stopwords`` list and consists purely of letters;
      4. lemmatize each kept token with part-of-speech tag ``'v'``;
      5. discard lemmas found in the module-level ``extra_stopwords`` list.

    Parameters
    ----------
    text : str
        Raw review text.

    Returns
    -------
    list of str
        Surviving lemmas, in their original order.
    """
    lemmatizer = WordNetLemmatizer()

    # first pass: normalise each token and drop stopwords / non-alphabetic tokens
    kept = []
    for raw in nltk.word_tokenize(text):
        expanded = raw.replace("'s", " ").replace("n’t", " not").replace("’ve", " have")
        stripped = re.sub(r'[^a-zA-Z0-9 ]', '', expanded)
        lowered = stripped.lower()
        # a token in which a replacement inserted a space fails isalpha() and is dropped
        if lowered in stopwords or not stripped.isalpha():
            continue
        kept.append(lowered)

    # second pass: lemmatize, then filter the corpus-specific stopwords
    return [
        lemma
        for lemma in [lemmatizer.lemmatize(word, 'v') for word in kept]
        if lemma.lower() not in extra_stopwords
    ]

# construct tfidf matrix
def build_tfidf(data, text_process, max_df, min_df):
    """Fit a TF-IDF vectorizer on ``data`` and return the matrix with the fitted model.

    Parameters
    ----------
    data : iterable of str
        Documents to vectorize (pipeline() passes a list of review texts).
    text_process : callable
        Tokenizer handed to the vectorizer. Note that this parameter shadows the
        module-level function of the same name inside this function.
    max_df, min_df : float or int
        Document-frequency cut-offs, forwarded unchanged to ``TfidfVectorizer``.

    Returns
    -------
    tuple
        ``(tfidf_matrix, tfidf_model)`` — the document-term matrix and the fitted
        vectorizer.

    Side effects
    ------------
    Prints the number of documents and vocabulary terms.
    """
    vectorizer_options = dict(
        max_df=max_df,
        min_df=min_df,
        stop_words='english',
        use_idf=True,
        tokenizer=text_process,
    )
    tfidf_model = TfidfVectorizer(**vectorizer_options)

    # learn the vocabulary from the reviews and vectorize them in one call
    tfidf_matrix = tfidf_model.fit_transform(data)

    n_reviews, n_terms = tfidf_matrix.shape
    print ("In total, there are " + str(n_reviews) + " reviews and " + str(n_terms) + " terms.")
    return tfidf_matrix, tfidf_model

# build lda
def conduct_lda(tfidf_matrix, data, num_components, random_state=42):
    """Fit a Latent Dirichlet Allocation model on a document-term matrix.

    Parameters
    ----------
    tfidf_matrix : array-like or sparse matrix
        Document-term matrix the model is fitted on.
    data : any
        Unused; kept only so existing callers keep working.
    num_components : int
        Number of topics (``n_components`` of ``LatentDirichletAllocation``).
    random_state : int, RandomState instance or None, default 42
        Seed forwarded to ``LatentDirichletAllocation``. The fit is stochastic, so
        a fixed seed is used by default to make the topics reproducible across
        kernel restarts; pass ``None`` to get the previous unseeded behaviour.

    Returns
    -------
    tuple
        ``(lda_output, lda)`` — the per-document topic matrix returned by
        ``fit_transform`` and the fitted model.

    Side effects
    ------------
    Prints the shape of ``lda_output``.
    """
    lda = LatentDirichletAllocation(n_components=num_components,
                                    random_state=random_state)
    lda_output = lda.fit_transform(tfidf_matrix)
    # check shape
    print(lda_output.shape)

    return lda_output, lda

# collect the top n keywords for each topic
def generate_topic_words(tfidf_model, lda_model, n_words):
    """Build a table of the ``n_words`` highest-weighted terms of every topic.

    Parameters
    ----------
    tfidf_model : fitted vectorizer
        Must expose ``get_feature_names_out()`` or the legacy
        ``get_feature_names()``; either must return the vocabulary in column order.
    lda_model : fitted topic model
        Its ``components_`` attribute must hold one row of term weights per topic.
    n_words : int
        Number of keywords to keep per topic. If it exceeds the vocabulary size,
        every term is returned.

    Returns
    -------
    pandas.DataFrame
        One row per topic (index ``'Topic i'``) and one column per rank
        (``'Word j'``), ordered from highest to lowest weight.
    """
    # scikit-learn 1.0 added get_feature_names_out() and 1.2 removed
    # get_feature_names(); prefer the new accessor, fall back for old versions.
    if hasattr(tfidf_model, 'get_feature_names_out'):
        vocabulary = tfidf_model.get_feature_names_out()
    else:
        vocabulary = tfidf_model.get_feature_names()
    words = np.asarray(vocabulary)

    topic_words = []
    # for each topic, we have one weight per vocabulary term
    for topic_words_weights in lda_model.components_:
        # indices of the heaviest terms, in descending order of weight
        top_words = np.asarray(topic_words_weights).argsort()[::-1][:n_words]
        topic_words.append(words.take(top_words))

    # store keywords in a dataframe
    df_topic_words = pd.DataFrame(topic_words)
    df_topic_words.columns = ['Word '+ str(i) for i in range(df_topic_words.shape[1])]
    df_topic_words.index = ['Topic '+ str(i) for i in range(df_topic_words.shape[0])]
    return df_topic_words

# end-to-end run for one star rating
def pipeline(star_level, max_df, min_df, num_components, n_words):
    """Run TF-IDF, LDA and keyword extraction on the reviews of one star rating.

    Reads the module-level ``df`` and uses its ``'stars'`` and ``'text'`` columns.

    Parameters
    ----------
    star_level : number
        Rows whose ``'stars'`` value equals this are selected.
    max_df, min_df : float or int
        Document-frequency cut-offs passed to ``build_tfidf``.
    num_components : int
        Number of topics passed to ``conduct_lda``.
    n_words : int
        Keywords per topic passed to ``generate_topic_words``.

    Returns
    -------
    tuple
        ``(keywords, lda_model, tfidf_matrix, tfidf_model)``.
    """
    # reviews that carry the requested rating
    star_mask = df['stars'] == star_level
    reviews = df.loc[star_mask, 'text'].tolist()

    # vectorize, then fit the topic model (its document-topic output is not needed here)
    tfidf_matrix, tfidf_model = build_tfidf(reviews, text_process, max_df, min_df)
    _, lda_model = conduct_lda(tfidf_matrix, reviews, num_components)

    # top keywords per topic
    keywords = generate_topic_words(tfidf_model, lda_model, n_words)
    return keywords, lda_model, tfidf_matrix, tfidf_model

In [4]:
# star level 1: 3 topics, top 20 keywords per topic
# (positional args are star_level, max_df, min_df, num_components, n_words).
# df1 holds the topic-keyword table, not review data. lda_model, tfidf_matrix and
# tfidf_model are reassigned by every later star-level cell.
df1, lda_model, tfidf_matrix, tfidf_model = pipeline(1, 0.99, 0.01, 3, 20)

In total, there are 8000 reviews and 738 terms.
(8000, 3)


In [5]:
# Visualise the star-level-1 topics. Relies on lda_model / tfidf_matrix /
# tfidf_model still holding the results of the cell directly above.
pyLDAvis.sklearn.prepare(lda_model, tfidf_matrix, tfidf_model)

  default_term_info = default_term_info.sort_values(


In [6]:
# star level 2: 3 topics, top 20 keywords per topic
# (positional args are star_level, max_df, min_df, num_components, n_words).
# Overwrites lda_model, tfidf_matrix and tfidf_model from the previous star level.
df2, lda_model, tfidf_matrix, tfidf_model = pipeline(2, 0.99, 0.01, 3, 20)

In total, there are 8000 reviews and 768 terms.
(8000, 3)


In [7]:
# Visualise the star-level-2 topics. Relies on lda_model / tfidf_matrix /
# tfidf_model still holding the results of the cell directly above.
pyLDAvis.sklearn.prepare(lda_model, tfidf_matrix, tfidf_model)

  default_term_info = default_term_info.sort_values(


In [8]:
# star level 3: 5 topics, top 20 keywords per topic
# (positional args are star_level, max_df, min_df, num_components, n_words).
# Overwrites lda_model, tfidf_matrix and tfidf_model from the previous star level.
df3, lda_model, tfidf_matrix, tfidf_model = pipeline(3, 0.99, 0.01, 5, 20)

In total, there are 8000 reviews and 765 terms.
(8000, 5)


In [9]:
# Visualise the star-level-3 topics. Relies on lda_model / tfidf_matrix /
# tfidf_model still holding the results of the cell directly above.
pyLDAvis.sklearn.prepare(lda_model, tfidf_matrix, tfidf_model)

  default_term_info = default_term_info.sort_values(


In [10]:
# star level 4: 7 topics, top 20 keywords per topic
# (positional args are star_level, max_df, min_df, num_components, n_words).
# Overwrites lda_model, tfidf_matrix and tfidf_model from the previous star level.
df4, lda_model, tfidf_matrix, tfidf_model = pipeline(4, 0.99, 0.01, 7, 20)

In total, there are 8000 reviews and 690 terms.
(8000, 7)


In [11]:
# Visualise the star-level-4 topics. Relies on lda_model / tfidf_matrix /
# tfidf_model still holding the results of the cell directly above.
pyLDAvis.sklearn.prepare(lda_model, tfidf_matrix, tfidf_model)

  default_term_info = default_term_info.sort_values(


In [12]:
# star level 5: 3 topics, top 20 keywords per topic
# (positional args are star_level, max_df, min_df, num_components, n_words).
# Overwrites lda_model, tfidf_matrix and tfidf_model from the previous star level.
df5, lda_model, tfidf_matrix, tfidf_model = pipeline(5, 0.99, 0.01, 3, 20)

In total, there are 8000 reviews and 590 terms.
(8000, 3)


In [13]:
# Visualise the star-level-5 topics. Relies on lda_model / tfidf_matrix /
# tfidf_model still holding the results of the cell directly above.
pyLDAvis.sklearn.prepare(lda_model, tfidf_matrix, tfidf_model)

  default_term_info = default_term_info.sort_values(
