Here we engineer several sets of features for predicting Yelp ratings from reviews. The features are derived from the results of LDA on the pooled reviews, which identified 5 topics. First, the words in each review are tokenized and grouped by sentence. The 5-topic LDA model is then applied to each sentence. A sentence is assigned to a topic if that topic makes up more than 50% of that sentence. (Note: the feature loop below does not apply a hard 50% cutoff; it weights each sentence's contribution by the topic probabilities returned by the model.) A sentiment analyzer (VADER) is also applied to each sentence. The sentiment score of a topic for a review is the sum of the sentiment scores of the sentences assigned to that topic. The number of words assigned to each topic is also used as a feature. Finally, we measure the number of words used to describe each topic in the review.

In [2]:
import numpy as np
import pandas as pd


import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

import spacy
from spacy.lemmatizer import Lemmatizer
from spacy.lang.en.stop_words import STOP_WORDS
#import en_core_web_sm

from tqdm import tqdm_notebook as tqdm
from pprint import pprint

In [3]:
# Read the shop table and the review table from disk, then pair every review
# with its shop: an inner join on the shared `alias` column keeps only the
# aliases that appear in both files.
shops = pd.read_csv('./ProcessedData/coffeeshops_withcfcutoff.csv')
reviews = pd.read_csv('./ProcessedData/allreviews_txtprocessed.csv')
merged = shops.merge(reviews, how='inner', on='alias')

In [4]:
# Show the review text stored under row label 0 to check what the input looks like.
merged['mreviewtxt'][0]

"i can't believe i have never left a review for this place considering the amount of times i stop by here. and all the times i have been here not once has aone been rude to me. i always make sure to get a latter art so maybe the taro or red velvet or matcha to be honest i like them all  just the cuteness of it makes me love it. understandable that ma people don't wanna pay   -  for such small cups of latte but it is definitely based on preference  second -- the bingsoo here is so so good  the one i get the most has to be the mango cheesecake it literally has lives of cheesecakes and has condensed milk and all that good stuff - it's very sweet incase u don't like that  i really don't think it's a miss if you come here   they have individual and large bingsoos-- i get individual because i don't like sharing the goodness ...."

In [5]:
# Loading the previously trained LDA model together with its dictionary and corpus
import pickle
from gensim.test.utils import datapath
from gensim.models import LdaModel
from gensim import corpora

# Token <-> id mapping used to turn token lists into bag-of-words vectors.
dictionary = gensim.corpora.Dictionary.load('dictionary_allreviews_nouns.gensim')

# Use a context manager so the pickle file handle is closed as soon as the
# corpus has been read (the bare open() call previously left it dangling).
with open('corpus_allreviews_nouns.pkl', 'rb') as corpus_file:
    corpus = pickle.load(corpus_file)

# NOTE(review): gensim.test.utils.datapath resolves the name inside gensim's
# own test-data directory, so this presumably relies on the model having been
# saved there by an earlier notebook -- confirm.
temp_file = datapath("lda_nounsonly_5topics.gensim")
lda = gensim.models.ldamodel.LdaModel.load(temp_file)


In [6]:
# Sanity check: apply the trained LDA model to a few hand-written token lists.
other_texts = [
    ['computer', 'time', 'table', '-'],
    ['survey', 'response', 'eps'],
    ['human', 'system', 'coffee'],
]

# Convert each token list into the bag-of-words form produced by the dictionary.
other_corpus = list(map(dictionary.doc2bow, other_texts))

# Score the first and the third example document with the model.
vector1 = lda[other_corpus[0]]
vector2 = lda[other_corpus[2]]

# Only the last expression of a notebook cell is displayed, so show vector2.
vector2

([(0, 0.05003922),
  (1, 0.050024386),
  (2, 0.3000926),
  (3, 0.54981965),
  (4, 0.050024122)],
 [(31, [3]), (419, [2]), (734, [3])],
 [(31, [(3, 0.9999051)]), (419, [(2, 0.9995777)]), (734, [(3, 0.998249)])])

In [7]:
# Minimal VADER example: score the sentiment of a short piece of text.
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# This analyzer instance is reused by the feature-extraction loop further down.
analyzer = SentimentIntensityAnalyzer()

sample_text = 'In some ways, they are totally opposite cataclysms. Good Awesome Terrific'
# polarity_scores returns a dict; the printed result has the keys
# 'neg', 'neu', 'pos' and 'compound'.
vs = analyzer.polarity_scores(sample_text)
print(vs)

{'neg': 0.0, 'neu': 0.436, 'pos': 0.564, 'compound': 0.885}


In [35]:
#Divide up the reviews

In [23]:
# Split every review into rough "sentences" on the period character.
# Iterating over the column values directly replaces the positional
# `merged.mreviewtxt[i]` lookup, which is label-based and therefore only
# correct when the frame's index is exactly 0..n-1.
splitrev = [text.split('.') for text in merged.mreviewtxt]
len(splitrev)

19590

In [24]:
# Go through the reviews sentence by sentence and build per-topic features.
#
# Results (one entry per review, in the same order as `splitrev`):
#   sentenceldabyr     - the LDA topic vector of every sentence in the review
#   sentimentspertopic - per topic, sum of (sentence compound sentiment * topic weight)
#   senlenpertopic     - per topic, sum of (sentence word count * topic weight)
#
# NOTE: a sentence contributes to every topic the model returns for it,
# weighted by that topic's probability (soft assignment); no hard
# probability threshold is applied here.

sentenceldabyr = []
sentimentspertopic = []
senlenpertopic = []

# Size the accumulators from the model instead of hard-coding five zeros.
num_topics = lda.num_topics

for review in splitrev:
    ldavectors = []
    reviewsbytopic = [0] * num_topics
    reviewlbytopic = [0] * num_topics
    for sentence in review:
        # VADER's 'compound' value is used as the single sentiment score.
        vs = analyzer.polarity_scores(sentence)['compound']
        splits = sentence.split()
        slen = len(splits)
        # Named `bow` (not `corpus`) so the corpus loaded from the pickle
        # earlier in the notebook is no longer overwritten on every sentence.
        bow = dictionary.doc2bow(splits)
        # The model returns a tuple for each document; element 0 holds the
        # (topic index, probability) pairs.
        ldav = lda[bow][0]
        ldavectors.append(ldav)
        # Updating the sentiment score and word count for each topic in the review
        for (index, ldaf) in ldav:
            reviewsbytopic[index] += vs * ldaf
            reviewlbytopic[index] += slen * ldaf

    sentimentspertopic.append(reviewsbytopic)
    senlenpertopic.append(reviewlbytopic)
    sentenceldabyr.append(ldavectors)

In [32]:
# Turn the per-review feature lists into DataFrames with one column per topic.
# Sentiment columns are named t<k>s, weighted-length columns t<k>senlen.
prefix = ['t0','t1','t2','t3','t4']

sentimentspertopicdf = pd.DataFrame(sentimentspertopic)
sentimentspertopicdf.columns = [pref + 's' for pref in prefix]

senlenpertopicdf = pd.DataFrame(senlenpertopic)
senlenpertopicdf.columns = [pref + 'senlen' for pref in prefix]



In [33]:
# Preview the first five rows of both feature tables, then confirm their shapes.
for feature_frame in (sentimentspertopicdf, senlenpertopicdf):
    print(feature_frame.head(5))
for feature_frame in (sentimentspertopicdf, senlenpertopicdf):
    print(feature_frame.shape)

        t0s       t1s       t2s       t3s       t4s
0 -0.010556  0.180251  0.012068  0.043956  0.890080
1 -0.018792  0.144005  0.593450  0.760924  0.426514
2  0.224952  0.406399  0.133696  0.774286  0.641366
3  0.824146  0.814918  0.587349  0.898249  2.142437
4 -0.037538  0.057878  0.631118  0.835730  0.167957
    t0senlen   t1senlen   t2senlen   t3senlen   t4senlen
0   1.687048  10.689785  47.067797  18.965432  85.014230
1  13.649553   5.976947  23.739573  29.660751  13.973175
2   4.881982   9.142167   4.172507  15.323400  38.479940
3  34.537300  19.361009  26.728325  18.182833  48.190533
4  33.489670  23.210480  58.604848  79.005441  18.135223
(19590, 5)
(19590, 5)


In [42]:
# Place the original rows and the two per-topic feature tables side by side.
# reset_index() is called without drop=True, so the previous index is also
# kept as an `index` column in the combined table.
feature_frames = [merged.reset_index(), sentimentspertopicdf, senlenpertopicdf]
merged1 = pd.concat(feature_frames, axis='columns')

In [43]:
# Inspect the combined table: its dimensions, its first five rows and its column names.
for summary in (merged1.shape, merged1.head(5), merged1.columns):
    print(summary)

(19590, 32)
   index                      id          name                  alias  \
0      0  UZViRVpxNZvOM5KarmbT1g  Sweet Moment  sweet-moment-new-york   
1      1  UZViRVpxNZvOM5KarmbT1g  Sweet Moment  sweet-moment-new-york   
2      2  UZViRVpxNZvOM5KarmbT1g  Sweet Moment  sweet-moment-new-york   
3      3  UZViRVpxNZvOM5KarmbT1g  Sweet Moment  sweet-moment-new-york   
4      4  UZViRVpxNZvOM5KarmbT1g  Sweet Moment  sweet-moment-new-york   

   is_closed                                         categories  review_count  \
0      False  [{'alias': 'coffee', 'title': 'Coffee & Tea'},...           822   
1      False  [{'alias': 'coffee', 'title': 'Coffee & Tea'},...           822   
2      False  [{'alias': 'coffee', 'title': 'Coffee & Tea'},...           822   
3      False  [{'alias': 'coffee', 'title': 'Coffee & Tea'},...           822   
4      False  [{'alias': 'coffee', 'title': 'Coffee & Tea'},...           822   

  price  rating_x transactions  ...       t0s       t1s       

In [45]:
# Write the combined table to disk; index=False leaves the row index out of the CSV.
output_path = './ProcessedData/reviews_withlda5topicfeatures.csv'
merged1.to_csv(output_path, index=False)