In [2]:
# Run in terminal or command prompt
# python3 -m spacy download en
import numpy as np
import pandas as pd
import re, nltk, spacy, gensim
from nltk.corpus import stopwords
# Sklearn
from sklearn.decomposition import LatentDirichletAllocation, TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.feature_extraction import text
from sklearn.model_selection import GridSearchCV
from sklearn import metrics
# gensim
import gensim
from gensim import corpora
import pyLDAvis.gensim
# Plotting tools
from pprint import pprint
import pyLDAvis
import pyLDAvis.sklearn
import matplotlib.pyplot as plt
import glob
%matplotlib inline

In [15]:
# 1. data cleaning
def cleaning(df, sentiment = False):
    '''
    Build one cleaned text string per review and return them as a list.

    The title (``title_x``) and body (``review``) columns are joined with a
    space; leading/trailing whitespace, newlines and ``*`` characters are
    removed; whitespace runs are collapsed to one space and single quotes
    are dropped.

    Side effect: the combined text is stored in ``df['text']``, so the
    DataFrame passed in is modified in place.

    df: DataFrame with ``title_x`` and ``review`` columns (and a
        ``sentiment`` column when filtering by sentiment)
    sentiment: default is False, return all data
    sentiment = 1: return positive data
    sentiment = 0: return negative data
    any other value (e.g. None): return all data

    returns: list of str, one entry per selected row
    '''
    combined = df['title_x'] + ' ' + df['review']
    df['text'] = (combined
                  .replace({r'\s+$': '', r'^\s+': ''}, regex=True)
                  .replace({r'\n': ' ', r'\*': ''}, regex=True))

    # bool is a subclass of int, so `False == 0` is True in Python. The
    # "no filter" default therefore has to be recognised by identity before
    # the label comparisons, otherwise it selects the negative reviews.
    if sentiment is False:
        selected = df['text']
    elif sentiment == 1:
        selected = df.loc[df['sentiment'] == 1, 'text']
    elif sentiment == 0:
        selected = df.loc[df['sentiment'] == 0, 'text']
    else:
        selected = df['text']

    # Collapse whitespace runs (including any remaining newlines), then drop
    # single quotes; str() also turns missing values into the text 'nan'.
    return [re.sub(r"\'", "", re.sub(r'\s+', ' ', str(sent)))
            for sent in selected.values.tolist()]

In [10]:
# 2. tokenize
def tokenize(sentences):
    '''
    Lazily yield one token list per sentence.

    Each item of ``sentences`` is converted with ``str()`` and passed to
    ``gensim.utils.simple_preprocess``. Nothing is computed until the
    generator is consumed.
    '''
    # deacc=True asks gensim to strip accent marks from the tokens
    # (see the gensim docs); it is not a punctuation switch.
    yield from (
        gensim.utils.simple_preprocess(str(raw), deacc=True)
        for raw in sentences
    )


In [11]:
# 3. lemmatization
def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    '''
    Lemmatize tokenized documents, keeping only selected parts of speech.

    texts: iterable of token lists (one list per document)
    allowed_postags: ``token.pos_`` values to keep

    Uses the notebook-level ``nlp`` pipeline, which must be loaded before
    this function is called.

    returns: list of str, one space-joined string of lemmas per document
    '''
    def _lemmas_for(tokens):
        parsed = nlp(" ".join(tokens))
        kept = []
        for tok in parsed:
            if tok.pos_ not in allowed_postags:
                continue
            # The '-PRON-' placeholder lemma is replaced by an empty string.
            kept.append('' if tok.lemma_ in ['-PRON-'] else tok.lemma_)
        return " ".join(kept)

    return [_lemmas_for(sent) for sent in texts]

# Grid search using sklearn.decomposition.LatentDirichletAllocation

In [13]:
# Load the review data used by the cells below.
# NOTE(review): this is an absolute, machine-specific path, so the cell only
# runs where that file exists — consider a configurable data directory.
df = pd.read_csv('/Users/shelly/Google Drive/BIA660/final/word2vec/review_join_cluster_6000.csv')

In [16]:
# Preprocess the sentiment = 1 reviews and build the document-term matrix
# that the grid-search cell below consumes.
# 1. cleaning (also adds a 'text' column to df as a side effect)
data = cleaning(df, sentiment = 1)
# 2. tokenize (tokenize() is a generator, so materialise it)
data_words = list(tokenize(data))
# 3. lemmatization -- `nlp` is read as a global inside lemmatization();
#    the parser and NER components are disabled at load time
nlp = spacy.load('en', disable=['parser', 'ner'])
data_lemmatized = lemmatization(data_words)

# 4. Vectorization: sklearn's English stop words plus domain words, and
#    min_df=5 so that rare terms are dropped
my_stop_words = text.ENGLISH_STOP_WORDS.union(["cook", 'cooking', 'cooker','cookbook','cookers','book','food','recipes', 'recipe'])
vectorizer = CountVectorizer(analyzer='word',min_df=5,
                             stop_words=my_stop_words,
                             lowercase=True)                  
                            # disabled option: token_pattern='[a-zA-Z0-9]{3,}'  (num chars > 3)
                            # disabled option: max_features=50000
data_vectorized = vectorizer.fit_transform(data_lemmatized)

  return _unpackb(packed, **kwargs)
  return _unpackb(packed, **kwargs)


In [17]:
# 6. Grid search for the best number of topics
# Candidate topic counts to compare
search_params = {'n_components': [2,3,4]}
# Base LDA model; random_state is fixed so repeated runs give the same result
lda = LatentDirichletAllocation(max_iter=25, learning_method='online', learning_offset=50.,random_state=0, learning_decay=0.9)
# 5-fold cross-validated search; no `scoring` is passed, so candidates are
# presumably ranked by the estimator's own score() -- see the GridSearchCV docs
model = GridSearchCV(lda, param_grid=search_params, cv =5)
# Run the search on the document-term matrix built in the previous cell
model.fit(data_vectorized)
# Best estimator found by the search
best_lda_model = model.best_estimator_
# Winning parameter setting
print("Best Model's Params: ", model.best_params_)
# Score of the winning setting as reported by the search
print("Best Log Likelihood Score: ", model.best_score_)
print("# Log Likelyhood: Higher the better")
# Perplexity of the best model, evaluated on the full matrix
print("Model Perplexity: ", best_lda_model.perplexity(data_vectorized))
print("# Perplexity: Lower the better. Perplexity = exp(-1. * log-likelihood per word")



Best Model's Params:  {'n_components': 2}
Best Log Likelihood Score:  -299852.3912165296
# Log Likelyhood: Higher the better
Model Perplexity:  1133.164856253257
# Perplexity: Lower the better. Perplexity = exp(-1. * log-likelihood per word


# For consistency with the K-Means clustering, we first tried to use n_components=3

In [19]:
def lda_model(df, sent):
    '''
    Preprocess the reviews for one sentiment value and fit a 3-topic
    scikit-learn LDA model on them.

    df: DataFrame accepted by cleaning() (``title_x``, ``review`` and
        ``sentiment`` columns)
    sent: sentiment selector forwarded to cleaning() (1 = positive,
        0 = negative)

    Prints the sentiment value, the model log likelihood (higher is better)
    and the perplexity (lower is better).

    Requires the notebook-level spaCy pipeline ``nlp`` to be loaded, because
    lemmatization() reads it as a global.

    returns: (sent, data_vectorized, vectorizer, fitted LDA model)
    '''
    print('sentiment =',sent)
    # 1. cleaning -- use cleaning(), which is defined before this cell;
    #    cleaning_2 is only defined further down the notebook, so calling it
    #    here fails on a fresh top-to-bottom run.
    data = cleaning(df, sentiment = sent)
    # 2. tokenize
    data_words = list(tokenize(data))
    # 3. lemmatization -- no spaCy reload here: lemmatization() looks up the
    #    global `nlp`, so a pipeline assigned to a local name is never used.
    data_lemmatized = lemmatization(data_words)

    # 4. Vectorization
    my_stop_words = text.ENGLISH_STOP_WORDS.union(["cook", 'cooking', 'cooker','cookbook','cookers','book','food','recipes', 'recipe'])
    vectorizer = CountVectorizer(analyzer='word',
                                 min_df=5,
                                 stop_words=my_stop_words,
                                 lowercase=True)
    data_vectorized = vectorizer.fit_transform(data_lemmatized)

    # 5. Build Model (named `model` so it does not shadow this function)
    model = LatentDirichletAllocation(n_components=3,
                                      max_iter=25,
                                      learning_method='online',
                                      random_state=0,
                                      batch_size=128,
                                      learning_decay=0.9,
                                      evaluate_every=-1,
                                      n_jobs=-1)
    # Only the fitted model is needed; the document-topic output was unused.
    model.fit(data_vectorized)

    # 6. Model Results
    # Log Likelihood: higher is better
    print("Log Likelihood: ", model.score(data_vectorized))
    # Perplexity: lower is better. Perplexity = exp(-1. * log-likelihood per word)
    print("Perplexity: ", model.perplexity(data_vectorized))
    return sent,data_vectorized,vectorizer,model

# Topic Modeling for Positive Reviews vs Negative Reviews

In [20]:
# Fit one topic model per sentiment class: suffix 1 holds the sentiment = 0
# (negative) results and suffix 2 the sentiment = 1 (positive) results.
if __name__ == "__main__":

    sent1,data_vectorized1,vectorizer1,lda_model1 = lda_model(df,0)
    sent2,data_vectorized2,vectorizer2,lda_model2 = lda_model(df,1)

sentiment = 0


  return _unpackb(packed, **kwargs)
  return _unpackb(packed, **kwargs)


Log Likelihood:  -534225.3677480974
Perplexity:  951.3055199806768
sentiment = 1


  return _unpackb(packed, **kwargs)
  return _unpackb(packed, **kwargs)


Log Likelihood:  -1406031.5339347455
Perplexity:  1129.4542925765927


In [28]:
# Create Document — Topic Matrix
def doc_topic_matrix(data_vectorized, best_lda_model):
    lda_output = best_lda_model.transform(data_vectorized)
    topicnames = ["Topic" + str(i) for i in range(best_lda_model.n_components)]
    docnames = ["Doc" + str(i) for i in range(data_vectorized.shape[0])]
    df_document_topic = pd.DataFrame(np.round(lda_output, 2), columns=topicnames, index=docnames)
    dominant_topic = np.argmax(df_document_topic.values, axis=1)
    df_document_topic['dominant_topic'] = dominant_topic

    def color_green(val):
        color = 'green' if val > .1 else 'black'
        return 'color: {col}'.format(col=color)
    def make_bold(val):
        weight = 700 if val > .1 else 400
        return 'font-weight: {weight}'.format(weight=weight)
    df_document_topics = df_document_topic.head(15).style.applymap(color_green).applymap(make_bold)
    
    return df_document_topics


# Document — Topic Matrix for Negative Reviews

In [29]:
doc_topic_matrix(data_vectorized1, lda_model1)

Unnamed: 0,Topic0,Topic1,Topic2,dominant_topic
Doc0,0.88,0.06,0.06,0
Doc1,0.25,0.08,0.66,2
Doc2,0.83,0.1,0.07,0
Doc3,0.43,0.14,0.43,0
Doc4,0.28,0.01,0.7,2
Doc5,0.44,0.09,0.47,2
Doc6,0.2,0.24,0.56,2
Doc7,0.67,0.3,0.03,0
Doc8,0.04,0.35,0.61,2
Doc9,0.42,0.01,0.57,2


# Document — Topic Matrix for Positive Reviews

In [30]:
doc_topic_matrix(data_vectorized2, lda_model2)

Unnamed: 0,Topic0,Topic1,Topic2,dominant_topic
Doc0,0.64,0.23,0.13,0
Doc1,0.12,0.88,0.0,1
Doc2,0.02,0.97,0.01,1
Doc3,0.46,0.54,0.0,1
Doc4,0.4,0.6,0.0,1
Doc5,0.43,0.38,0.19,0
Doc6,0.73,0.26,0.0,0
Doc7,0.16,0.8,0.05,1
Doc8,0.01,0.98,0.01,1
Doc9,0.54,0.21,0.25,0


In [32]:
# Topic-Keyword Matrix
def topic_keyword_Matrix(best_lda_model, vectorizer):
    '''
    Build the topic-keyword matrix of a fitted LDA model.

    best_lda_model: fitted model exposing ``components_`` and ``n_components``
    vectorizer: fitted vectorizer whose vocabulary labels the columns

    Prints the first rows of the frame and returns it.

    returns: DataFrame with one row per topic (``Topic<i>``) and one column
        per vocabulary term, holding ``components_``.
    '''
    # Newer scikit-learn exposes the vocabulary via get_feature_names_out();
    # fall back to the legacy get_feature_names() on older installs.
    if hasattr(vectorizer, 'get_feature_names_out'):
        feature_names = vectorizer.get_feature_names_out()
    else:
        feature_names = vectorizer.get_feature_names()

    topicnames = ["Topic" + str(i) for i in range(best_lda_model.n_components)]
    df_topic_keywords = pd.DataFrame(best_lda_model.components_,
                                     index=topicnames,
                                     columns=list(feature_names))

    print(df_topic_keywords.head())
    return df_topic_keywords

# Topic-keyword Matrix for Negative Reviews

In [33]:
topic_keyword_Matrix(lda_model1, vectorizer1)

         abandon   ability       able  absolute  absolutely    absorb  \
Topic0  0.391052  1.850515  19.728016  1.723496   15.888694  1.307169   
Topic1  5.335001  8.462816  14.946376  3.213465    8.583035  5.954141   
Topic2  1.953887  2.082978  33.130470  0.774530   10.359178  0.341466   

           abuse    accept  acceptable    access  ...       year      yeast  \
Topic0  0.353764  0.372342    0.405559  5.611417  ...  32.704900  13.306048   
Topic1  7.931157  5.921926    7.085069  8.231335  ...  90.900180   0.364787   
Topic2  0.350621  0.346606    2.862542  7.949045  ...  64.880642   3.613603   

        yesterday    yogurt      young   youtube     yuck     yummy      zone  \
Topic0   5.956542  6.240245   1.460158  0.975363  3.68605  8.202321  1.284301   
Topic1   0.339651  0.475116  13.105843  6.835962  0.78484  0.917890  1.833220   
Topic2   0.361488  6.583357   3.397252  2.697658  1.24831  0.397247  3.463198   

        zucchini  
Topic0  2.570807  
Topic1  0.352198  
Topic2  

Unnamed: 0,abandon,ability,able,absolute,absolutely,absorb,abuse,accept,acceptable,access,...,year,yeast,yesterday,yogurt,young,youtube,yuck,yummy,zone,zucchini
Topic0,0.391052,1.850515,19.728016,1.723496,15.888694,1.307169,0.353764,0.372342,0.405559,5.611417,...,32.7049,13.306048,5.956542,6.240245,1.460158,0.975363,3.68605,8.202321,1.284301,2.570807
Topic1,5.335001,8.462816,14.946376,3.213465,8.583035,5.954141,7.931157,5.921926,7.085069,8.231335,...,90.90018,0.364787,0.339651,0.475116,13.105843,6.835962,0.78484,0.91789,1.83322,0.352198
Topic2,1.953887,2.082978,33.13047,0.77453,10.359178,0.341466,0.350621,0.346606,2.862542,7.949045,...,64.880642,3.613603,0.361488,6.583357,3.397252,2.697658,1.24831,0.397247,3.463198,9.417074


In [34]:
# Topic-keyword Matrix for Positive Reviews

In [35]:
topic_keyword_Matrix(lda_model2, vectorizer2)

            aaron    ability        able     abroad   absolute  absolutely  \
Topic0  33.754656   6.394348   33.059154   1.135119   0.536296    6.806265   
Topic1   0.379901  17.659882  101.029136   0.338419  14.786811   58.899354   
Topic2   3.740012  18.374718  114.974303  10.136922  11.772264  123.376667   

           absorb  absorption      abuse    accept  ...      young     youth  \
Topic0   4.172250    5.178223   1.577660  4.026365  ...   7.272599  2.580188   
Topic1  10.111249    0.368057  10.881928  5.206997  ...  35.123070  5.956988   
Topic2   0.408844    0.348617   1.141194  1.417404  ...  26.394739  0.334178   

          youtube        yr       yum       yummy      zest      zone  \
Topic0  17.465211  1.729481  0.344550    5.323107  5.850337  0.406900   
Topic1   0.565320  1.116928  0.339512    1.392383  0.338291  2.464305   
Topic2   2.242771  2.901906  9.945345  112.104650  0.632938  8.828968   

           zoodle   zucchini  
Topic0   0.343143   0.481363  
Topic1   0.

Unnamed: 0,aaron,ability,able,abroad,absolute,absolutely,absorb,absorption,abuse,accept,...,young,youth,youtube,yr,yum,yummy,zest,zone,zoodle,zucchini
Topic0,33.754656,6.394348,33.059154,1.135119,0.536296,6.806265,4.17225,5.178223,1.57766,4.026365,...,7.272599,2.580188,17.465211,1.729481,0.34455,5.323107,5.850337,0.4069,0.343143,0.481363
Topic1,0.379901,17.659882,101.029136,0.338419,14.786811,58.899354,10.111249,0.368057,10.881928,5.206997,...,35.12307,5.956988,0.56532,1.116928,0.339512,1.392383,0.338291,2.464305,0.334087,0.372357
Topic2,3.740012,18.374718,114.974303,10.136922,11.772264,123.376667,0.408844,0.348617,1.141194,1.417404,...,26.394739,0.334178,2.242771,2.901906,9.945345,112.10465,0.632938,8.828968,10.942629,52.256772


In [50]:
# Show top n keywords for each topic
def show_topics(vectorizer=None, lda_model=None, n_words=15):
    '''
    Return the ``n_words`` highest-weighted keywords of every topic.

    vectorizer: fitted vectorizer providing the vocabulary. When omitted,
        the notebook-level ``vectorizer`` is used if one exists.
    lda_model: fitted model exposing ``components_``. It has no usable
        default: at notebook level the name ``lda_model`` refers to the
        training function, not to a fitted model.
    n_words: number of keywords to list per topic

    Prints the shape of the resulting frame.

    returns: DataFrame with one row per topic (``Topic <i>``) and one
        column per rank (``Word <j>``), most important word first.
    raises: ValueError when no vectorizer or no model is available.
    '''
    # Resolve the vectorizer at call time rather than freezing whatever the
    # global happened to be when this cell was executed.
    if vectorizer is None:
        vectorizer = globals().get('vectorizer')
    if vectorizer is None or lda_model is None:
        raise ValueError("show_topics needs a fitted vectorizer and a fitted LDA model; "
                         "pass vectorizer=... and lda_model=...")

    # Newer scikit-learn exposes the vocabulary via get_feature_names_out();
    # fall back to the legacy get_feature_names() on older installs.
    if hasattr(vectorizer, 'get_feature_names_out'):
        keywords = np.array(vectorizer.get_feature_names_out())
    else:
        keywords = np.array(vectorizer.get_feature_names())

    topic_keywords = []
    for topic_weights in lda_model.components_:
        # Sorting the negated weights ranks terms in descending weight order.
        top_keyword_locs = (-topic_weights).argsort()[:n_words]
        topic_keywords.append(keywords.take(top_keyword_locs))

    df_topic_keywords = pd.DataFrame(topic_keywords)
    print(df_topic_keywords.shape)
    df_topic_keywords.columns = ['Word '+str(i) for i in range(df_topic_keywords.shape[1])]
    df_topic_keywords.index = ['Topic '+str(i) for i in range(df_topic_keywords.shape[0])]

    return df_topic_keywords
show_topics(vectorizer=vectorizer1, lda_model=lda_model1)

(3, 15)


Unnamed: 0,Word 0,Word 1,Word 2,Word 3,Word 4,Word 5,Word 6,Word 7,Word 8,Word 9,Word 10,Word 11,Word 12,Word 13,Word 14
Topic 0,good,make,use,ingredient,just,look,time,great,buy,try,lot,star,page,really,think
Topic 1,read,good,write,story,just,buy,know,author,say,information,life,really,want,think,people
Topic 2,wheat,diet,eat,alternative,non,just,product,bread,good,try,free,plan,day,weight,make


In [51]:
show_topics(vectorizer=vectorizer2, lda_model=lda_model2)

(3, 15)


Unnamed: 0,Word 0,Word 1,Word 2,Word 3,Word 4,Word 5,Word 6,Word 7,Word 8,Word 9,Word 10,Word 11,Word 12,Word 13,Word 14
Topic 0,diet,eat,weight,lose,good,day,work,lot,plan,healthy,pound,make,just,week,fat
Topic 1,read,life,good,eat,feel,make,diet,love,year,change,just,way,time,really,know
Topic 2,make,good,great,love,use,easy,try,just,buy,ingredient,time,bread,delicious,keto,look


## It seems pointless to separate positive reviews from negative reviews and train a separate topic model on each, since the top words for negative reviews also include many positive words.

# Gensim LDA Model based on Book Clusters

In [43]:
# Re-read the same CSV that was loaded earlier, which gives a fresh frame
# without the 'text' column that the cleaning step adds.
# NOTE(review): absolute, machine-specific path — consider a configurable data directory.
df = pd.read_csv('/Users/shelly/Google Drive/BIA660/final/word2vec/review_join_cluster_6000.csv')

In [44]:
df.shape

(6053, 24)

In [45]:
# 1. data cleaning
def cleaning_2(df, sentiment = False):
    '''
    Build one cleaned text string per review and return them as a list.

    The title (``title_x``) and body (``review``) columns are joined with a
    space; leading/trailing whitespace, newlines and ``*`` characters are
    removed; whitespace runs are collapsed to one space and single quotes
    are dropped.

    Side effect: the combined text is stored in ``df['text']``, so the
    DataFrame passed in is modified in place.

    df: DataFrame with ``title_x`` and ``review`` columns (and a
        ``sentiment`` column when filtering by sentiment)
    sentiment: default is False, return all data
    sentiment = 1: return positive data
    sentiment = 0: return negative data
    any other value (e.g. None): return all data

    returns: list of str, one entry per selected row
    '''
    combined = df['title_x'] + ' ' + df['review']
    df['text'] = (combined
                  .replace({r'\s+$': '', r'^\s+': ''}, regex=True)
                  .replace({r'\n': ' ', r'\*': ''}, regex=True))

    # bool is a subclass of int, so `False == 0` is True in Python. The
    # "no filter" default therefore has to be recognised by identity before
    # the label comparisons, otherwise it selects the negative reviews.
    if sentiment is False:
        selected = df['text']
    elif sentiment == 1:
        selected = df.loc[df['sentiment'] == 1, 'text']
    elif sentiment == 0:
        selected = df.loc[df['sentiment'] == 0, 'text']
    else:
        selected = df['text']

    # Collapse whitespace runs (including any remaining newlines), then drop
    # single quotes; str() also turns missing values into the text 'nan'.
    return [re.sub(r"\'", "", re.sub(r'\s+', ' ', str(sent)))
            for sent in selected.values.tolist()]


In [46]:
def cluster_lda_gensim(tf, tf_feature_names,num_topics, random_state=0):
    '''
    Fit a gensim LDA model on a document-term matrix and prepare the
    pyLDAvis visualisation data for it.

    tf: sparse document-term matrix with documents as rows (it is converted
        with ``documents_columns=False``)
    tf_feature_names: vocabulary terms, ordered like the columns of ``tf``
    num_topics: number of topics to fit
    random_state: seed forwarded to the gensim model so repeated runs give
        the same topics; pass None for an unseeded run

    returns: the object produced by ``pyLDAvis.gensim.prepare``, ready for
        ``pyLDAvis.display`` or ``pyLDAvis.save_html``
    '''
    corpus = gensim.matutils.Sparse2Corpus(tf, documents_columns=False)
    # Map column index -> term so gensim can label the topics.
    id2word = dict(enumerate(tf_feature_names))
    dictionary = corpora.Dictionary.from_corpus(corpus, id2word=id2word)

    # NOTE(review): eval_every=1 requests a perplexity estimate on every
    # update, which the gensim docs describe as slowing training -- confirm
    # that it is wanted here.
    ldamodel = gensim.models.ldamodel.LdaModel(corpus, alpha='auto', eval_every=1,
                                               num_topics=num_topics,
                                               id2word=id2word,
                                               iterations=25,
                                               random_state=random_state)
    # sort_topics=False keeps pyLDAvis from reordering the topics.
    lda_display = pyLDAvis.gensim.prepare(ldamodel, corpus, dictionary, sort_topics=False)
    return lda_display

In [47]:
# 1. cleaning -- all reviews, no sentiment filter.
# Pass None rather than False as the "no filter" value: False == 0 in Python,
# so an equality check against the negative label 0 also matches False.
data = cleaning_2(df, sentiment = None)
# 2. tokenize
data_words = list(tokenize(data))
# 3. lemmatization (lemmatization() reads `nlp` as a global)
nlp = spacy.load('en', disable=['parser', 'ner'])
data_lemmatized = lemmatization(data_words)
# 4. Vectorization
my_stop_words = text.ENGLISH_STOP_WORDS.union(["cook", 'cooking', 'cooker','cookbook','cookers','book','food','recipes', 'recipe'])

vectorizer = CountVectorizer(analyzer='word',min_df=5,
                             stop_words=my_stop_words,
                             lowercase=True)
                            # disabled option: token_pattern='[a-zA-Z0-9]{3,}'  (num chars > 3)
                            # disabled option: max_features=50000
data_vectorized = vectorizer.fit_transform(data_lemmatized)

  return _unpackb(packed, **kwargs)
  return _unpackb(packed, **kwargs)


In [48]:
# Cluster 0, Sentiment = False
# Fit a 3-topic gensim LDA on the matrix built in the previous cell and
# render the interactive pyLDAvis view inline.
lda_display = cluster_lda_gensim(data_vectorized, vectorizer.get_feature_names(), 3)
# (pyLDAvis.gensim.prepare is already called inside cluster_lda_gensim.)
pyLDAvis.display(lda_display)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))


In [49]:
pyLDAvis.save_html(lda_display, 'lda_display_6000_reviews.html')