What this notebook does:

I explore LDA topic scores for a list of manually selected coffee product ('./Data/coffeeterms.csv'), using the 5 topic LDA model (see './LDA_Fitting/LDA_onreviews_nouns_withhptuning.ipynb').  

Although most coffee drinks are in the coffee topic, some are in other topics. For example flat-whites are in the food topic, while affogatos are in the sweets topic.

In [None]:
#Step one directory up to access the yelp scraping function in the helper_functions module
import os
print(os.getcwd())
os.chdir('../')
os.getcwd()

In [1]:
import numpy as np
import pandas as pd

import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

import spacy
from spacy.lemmatizer import Lemmatizer
from spacy.lang.en.stop_words import STOP_WORDS
#import en_core_web_sm

from tqdm import tqdm_notebook as tqdm
from pprint import pprint

import pickle
from gensim.test.utils import datapath
from gensim.models import LdaModel
from gensim import corpora



In [2]:
shops = pd.read_csv('./ProcessedData/coffeeshops_withcfcutoff.csv')
reviews = pd.read_csv('./ProcessedData/allreviews_txtprocessed.csv')
reviews.rename(columns = {'rating':'review_rating'},inplace=True)
merged = pd.merge(shops,reviews,how='inner',on = ['alias'])

In [3]:
print(merged.columns)
print(merged.shape)

Index(['id', 'name', 'alias', 'is_closed', 'review_count', 'price', 'rating',
       'transactions', 'latitude', 'longitude', 'geometry', 'index_right',
       'boro_code', 'boro_name', 'county_fip', 'ntacode', 'ntaname',
       'shape_area', 'shape_leng', 'catlist', 'numcoffeemen', 'numreviews',
       'fraccof', 'name_top5count', 'reviewidx', 'shopidx', 'date',
       'review_rating', 'reviewtxt', 'mreviewtxt'],
      dtype='object')
(21594, 30)


In [4]:
#Loading the previously trained LDA model


dictionary = gensim.corpora.Dictionary.load('dictionary_allreviews_nouns.gensim')
corpus = pickle.load(open('corpus_allreviews_nouns.pkl', 'rb'))

temp_file = datapath("lda_nounsonly_5topics.gensim")
lda = gensim.models.ldamodel.LdaModel.load(temp_file)


In [5]:
#A simple example of applying the trained lda model to a new text
other_texts = [
['computer', 'time', 'table','-'],
['survey', 'response', 'eps'],
['human', 'system', 'coffee']]
other_corpus = [dictionary.doc2bow(text) for text in other_texts]
vector1 = lda[other_corpus[0]]
vector1

vector2 = lda[other_corpus[2]]
vector2

([(0, 0.050016645),
  (1, 0.54981506),
  (2, 0.050021127),
  (3, 0.05001687),
  (4, 0.3001303)],
 [(4, [4]), (584, [1]), (1316, [1])],
 [(4, [(4, 0.9999962)]), (584, [(1, 0.99983275)]), (1316, [(1, 0.99871284)])])

In [16]:
lda[dictionary.doc2bow(['flat_white'])]

([(0, 0.100028254),
  (1, 0.10002753),
  (2, 0.59988797),
  (3, 0.100028664),
  (4, 0.10002759)],
 [(1927, [2])],
 [(1927, [(2, 0.9994534)])])

In [55]:
lda.get_document_topics(dictionary.doc2bow(['flat_white']))
lda.get_document_topics(dictionary.doc2bow(['pour_over']))
lda.get_document_topics(dictionary.doc2bow(['single_origin']))
lda.get_document_topics(dictionary.doc2bow(['slow']))
lda.get_document_topics(dictionary.doc2bow(['steak']))

[(0, 0.10002845),
 (1, 0.100028075),
 (2, 0.5998867),
 (3, 0.10002877),
 (4, 0.10002798)]

In [34]:
lda.show_topic(1)

[('time', 0.04610378),
 ('customer', 0.038744505),
 ('service', 0.037352882),
 ('line', 0.028180756),
 ('staff', 0.028090455),
 ('barista', 0.02004683),
 ('employee', 0.019347046),
 ('morning', 0.018592017),
 ('minute', 0.018041112),
 ('day', 0.016217884)]

### Getting LDA 5-topic importances for the manually researched coffee terms

In [42]:
cofterms = pd.read_csv('./Data/coffeeterms.csv')

In [45]:
coftermlist = cofterms['term'].to_list()
topicass = []
for term in coftermlist:
    ldavec = lda.get_document_topics(dictionary.doc2bow([term]))
    topicass.append([i[1] for i in ldavec])


In [46]:
topicassdf = pd.DataFrame(topicass)
topicassdf.columns = ['t0','t1','t2','t3','t4']
topicassdf = pd.concat([cofterms,topicassdf],axis=1)

In [47]:
topicassdf

Unnamed: 0,term,t0,t1,t2,t3,t4
0,affogato,0.599767,0.100058,0.100058,0.100059,0.100058
1,americano,0.100024,0.100023,0.100024,0.100024,0.599905
2,barista,0.100001,0.597834,0.100001,0.100001,0.102164
3,cappuccino,0.100001,0.100001,0.100001,0.100001,0.599995
4,coffee,0.1,0.1,0.1,0.1,0.6
5,chemex,0.2,0.2,0.2,0.2,0.2
6,clover,0.100009,0.100009,0.100009,0.599963,0.100009
7,cold_brew,0.100078,0.100078,0.100078,0.100079,0.599687
8,cold_drip,0.2,0.2,0.2,0.2,0.2
9,cortado,0.100083,0.59967,0.100082,0.100084,0.100081
