Here we take the previously trained LDA model (5 topics, nouns only) and prepare it for visualization in the Dash app. We take the top 50 words associated with each topic and calculate the fraction of reviews in which each word appears for each coffee shop. Additionally, for each coffee shop, we store the topic component values.

In [1]:
import numpy as np
import pandas as pd


import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

import spacy
from spacy.lemmatizer import Lemmatizer
from spacy.lang.en.stop_words import STOP_WORDS
#import en_core_web_sm

from tqdm import tqdm_notebook as tqdm
from pprint import pprint

In [2]:
# Load the per-shop table and the per-review table that carries the 5-topic LDA features.
shops = pd.read_csv('./ProcessedData/coffeeshops_withcfcutoff.csv')
reviews = (
    pd.read_csv('./ProcessedData/reviews_withlda5topicfeatures.csv')
    # Rename the review-level rating so it cannot be confused with a shop-level one.
    # (rename silently ignores the key if the file has no 'rating' column.)
    .rename(columns={'rating': 'revrating'})
)
# Inner join on `alias`: keeps only rows whose alias occurs in both tables.
merged = shops.merge(reviews, how='inner', on=['alias'])

In [3]:
# List the column names of the review table to see which features are available.
reviews.columns

Index(['index', 'id', 'name', 'alias', 'is_closed', 'review_count', 'price',
       'rating_x', 'transactions', 'latitude', 'longitude', 'geometry',
       'index_right', 'boro_code', 'boro_name', 'county_fip', 'ntacode',
       'ntaname', 'shape_area', 'shape_leng', 'catlist', 'numcoffeemen',
       'numreviews', 'fraccof', 'idx', 'date', 'rating_y', 'reviewtxt',
       'mreviewtxt', 't0s', 't1s', 't2s', 't3s', 't4s', 't0senlen', 't1senlen',
       't2senlen', 't3senlen', 't4senlen', 'sentiment', 'avgsentiment', 't0',
       't1', 't2', 't3', 't4', 'reviewlen'],
      dtype='object')

In [4]:
# Load the previously trained artifacts: the gensim dictionary, the pickled
# bag-of-words corpus, and the 5-topic nouns-only LDA model.
import pickle
from gensim.test.utils import datapath
from gensim.models import LdaModel
from gensim import corpora

dictionary = gensim.corpora.Dictionary.load('dictionary_allreviews_nouns.gensim')

# NOTE: pickle.load can execute arbitrary code; only load corpus files that
# were produced by this project. The context manager guarantees the file
# handle is closed once the corpus has been read.
with open('corpus_allreviews_nouns.pkl', 'rb') as corpus_file:
    corpus = pickle.load(corpus_file)

# datapath() resolves the name inside gensim's bundled test-data directory, so
# the model is expected to have been saved there by the training notebook.
temp_file = datapath("lda_nounsonly_5topics.gensim")
lda = gensim.models.ldamodel.LdaModel.load(temp_file)


In [5]:
# A simple example of applying the trained LDA model to new token lists.
other_texts = [
    ['computer', 'time', 'table', '-'],
    ['survey', 'response', 'eps'],
    ['human', 'system', 'coffee'],
]
# Convert every token list into the bag-of-words form built from `dictionary`.
other_corpus = list(map(dictionary.doc2bow, other_texts))

# Indexing the model with a bag-of-words document returns its inferred topics.
vector1 = lda[other_corpus[0]]
vector2 = lda[other_corpus[2]]

# Only the last expression of a cell is echoed, so this displays vector2.
vector2

([(0, 0.05003922),
  (1, 0.050024386),
  (2, 0.3000926),
  (3, 0.54981965),
  (4, 0.050024122)],
 [(31, [3]), (419, [2]), (734, [3])],
 [(31, [(3, 0.9999051)]), (419, [(2, 0.9995777)]), (734, [(3, 0.998249)])])

In [6]:
# A simple example of scoring the sentiment of a piece of text with VADER.
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

analyzer = SentimentIntensityAnalyzer()
sample_text = 'In some ways, they are totally opposite cataclysms. Good Awesome Terrific'
# polarity_scores returns a dict with 'neg', 'neu', 'pos' and 'compound' entries.
vs = analyzer.polarity_scores(sample_text)
print(vs)

{'neg': 0.0, 'neu': 0.436, 'pos': 0.564, 'compound': 0.885}


Preparing the review text to be processed by the trained LDA model

In [7]:
# Read the per-review LDA feature table (the same file that was loaded into
# `reviews` earlier) into its own frame for aggregation.
revldaweights = pd.read_csv('./ProcessedData/reviews_withlda5topicfeatures.csv')


In [8]:
# Average the per-review topic columns for every shop (one row per `alias`).
# t0..t4 are presumably the topic weights and t0s..t4s the matching per-topic
# sentiment scores -- TODO confirm against the notebook that wrote the CSV.
topic_cols = ['t{}'.format(k) for k in range(5)]
suffixed_cols = ['t{}s'.format(k) for k in range(5)]
per_shop = revldaweights.groupby('alias')
meanldaweights = per_shop[topic_cols + suffixed_cols].mean()

In [9]:
# Attach the shop metadata to the per-shop averages. `alias` is the group-by
# index of `meanldaweights`; the inner join keeps only aliases found on both sides.
# This rebinds `merged`, replacing the shop/review join built earlier.
merged = pd.merge(meanldaweights, shops, on='alias', how='inner')

In [10]:
# Persist the per-shop table (mean topic columns + shop metadata) for the Dash app;
# index=False keeps the positional row index out of the file.
merged.to_csv('./DataForDash/shopswithlda5topic.csv',index=False)

In [40]:
# Inspect the last 250 rows after an ascending sort on `t3s`, i.e. the shops with
# the largest mean `t3s` values (NaNs, if any, sort last and would appear here too).
merged[['alias','t3s']].sort_values(by='t3s').tail(250)

Unnamed: 0,alias,t3s
302,oliver-coffee-new-york,0.373311
180,gregorys-coffee-new-york-22,0.373608
303,one-girl-cookies-brooklyn-3,0.373632
482,takahachi-bakery-new-york,0.373690
171,gregorys-coffee-brooklyn-2,0.374421
...,...,...
4,787-coffee-new-york-5,0.703245
261,lülü-coffee-co-new-york-23,0.706704
229,kaigo-coffee-room-brooklyn-4,0.720201
35,blue-bottle-coffee-new-york-22,0.723296


In [41]:
#Now getting the top words for each topic

In [42]:
# Print topic 0's (term id, probability) pairs, then translate term id 12 --
# the first id in that list in the recorded output -- back to its word.
print(lda.get_topic_terms(0))
print(dictionary[12])

[(12, 0.052774273), (80, 0.030384272), (154, 0.030074613), (11, 0.024772303), (18, 0.02101725), (189, 0.019616796), (50, 0.01771511), (25, 0.016095303), (183, 0.015675712), (398, 0.014464904)]
place


In [21]:
# Build a long-format table of the top words of every LDA topic: one row per
# (topic, word) holding the word's probability within that topic and its
# 1-based rank in the order gensim returns the terms.
# NOTE(review): the notebook intro mentions the top 50 words, but this cell has
# always stored gensim's default of 10 per topic -- raise TOP_N_WORDS if the
# Dash app really needs 50.
TOP_N_WORDS = 10

results = [
    [topic_id, dictionary[term_id], prob, rank]
    # Ask the model how many topics it has instead of hard-coding 5.
    for topic_id in range(lda.num_topics)
    for rank, (term_id, prob) in enumerate(
        lda.get_topic_terms(topic_id, topn=TOP_N_WORDS), start=1
    )
]

# Passing the column names to the constructor also yields a correctly labelled
# (empty) frame when there are no rows, where assigning `.columns` afterwards
# would raise a length-mismatch error.
topwordsbytopic = pd.DataFrame(
    results, columns=['ldatopic', 'word', 'probability', 'rank']
)

In [25]:
# Preview the first five rows of the topic/word table.
topwordsbytopic.head(5)

Unnamed: 0,ldatopic,word,probability,rank
0,0,place,0.052774,1
1,0,table,0.030384,2
2,0,work,0.030075,3
3,0,people,0.024772,4
4,0,area,0.021017,5


In [23]:
# Persist the topic/word table for the Dash app; index=False keeps the
# positional row index out of the file.
topwordsbytopic.to_csv('./DataForDash/ldatopwordsfortopics.csv',index=False)