In [3]:
import gensim, operator
from scipy import spatial
import numpy as np
import pandas as pd
from gensim.models import KeyedVectors
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from nltk.stem.wordnet import WordNetLemmatizer

In [2]:
# Load the Yelp reviews from a CSV in the current working directory and preview the first rows.
# NOTE(review): the preview shows an 'Unnamed: 0' column, which suggests the CSV was saved
# together with its index; passing index_col=0 would avoid it -- confirm before changing.
df = pd.read_csv('yelp.csv')
df.head()

Unnamed: 0,review_id,user_id,business_id,stars,useful,funny,cool,text,date
0,J4a2TuhDasjn2k3wWtHZnQ,RNm_RWkcd02Li2mKPRe7Eg,xGXzsc-hzam-VArK6eTvtw,1.0,2,0,0,"This place used to be a cool, chill place. Now...",2018-01-21 04:41:03
1,FdoBFTjXXMn4hVnJ59EtiQ,eLAYHxHUutiXswy-CfeiUw,WQFn1A7-UAA4JT5YWiop_w,1.0,0,0,0,"They NEVER seem to get our \norder correct, se...",2017-09-08 23:26:10
2,m1GlqFGIN5eayrbb2IbRZg,B7YSV6r1ePAXc69FkDDuZw,wZgUAuDuEGPEzKK-PsngKQ,1.0,0,0,0,I wish I could give them zero stars. The call ...,2014-06-27 22:06:55
3,ucFOnqgaV40oQ2YNyz5ddQ,JHXQEayrDHOWGexs0dCviA,KXCXaF5qimmtKKqnPc_LQA,1.0,0,0,0,Great coffee and pastries. Baristas are excell...,2018-03-03 23:45:25
4,-QpNdU_p44GR0NcRxDRyNQ,ffJp-ZN80M4sSkDL8Ra18w,WDGeeyeK7bG0cvq_ZglAdA,1.0,0,0,0,Almost desolate restaurant and dingy evironmen...,2009-03-01 01:05:50


In [4]:
# Prefix prepended verbatim to the model file name by load_wordvec_model.
# It is joined by plain string concatenation, so a non-empty value must end with a path separator.
model_path = ''

def load_wordvec_model(modelName, modelFile, flagBin):
    """Load word vectors stored in word2vec format, printing progress messages.

    Args:
        modelName: Human-readable model name used only in the progress messages.
        modelFile: File name of the vectors; the module-level ``model_path``
            prefix is prepended to it verbatim.
        flagBin: Passed through as ``binary=`` to the loader.

    Returns:
        The object returned by ``KeyedVectors.load_word2vec_format``.
    """
    suffix = modelName + ' model...'
    vectors_file = model_path + modelFile

    print('Loading ' + suffix)
    loaded = KeyedVectors.load_word2vec_format(vectors_file, binary=flagBin)
    print('Finished loading ' + suffix)
    return loaded

# Load the GoogleNews vectors from a binary word2vec file (flagBin=True).
# NOTE(review): the file name suggests 300-dimensional vectors, matching the 300 used in vec_similarity -- confirm.
model_word2vec = load_wordvec_model('Word2Vec', 'GoogleNews-vectors-negative300.bin', True)

Loading Word2Vec model...
Finished loading Word2Vec model...


In [6]:
# Keep only businesses that have more than 20 reviews.
counts = df['business_id'].value_counts()
df20 = df[df['business_id'].isin(counts[counts > 20].index)]
# first business - The Burren
business1 = df20['business_id'].unique()[0]
# all customer reviews of The Burren
review_1 = df20[df20['business_id'] == business1]['text'].tolist()
# Review counts per remaining business; this was fused onto the previous line
# (a SyntaxError) and is now the cell's displayed value.
df20['business_id'].value_counts()

In [44]:
# Star ratings of the selected business (The Burren) and their mean.
is_business1 = df20['business_id'] == business1
score_1 = df20.loc[is_business1, 'stars']
score_1.mean()

2.951219512195122

In [7]:
from nltk.tokenize import sent_tokenize

# Split every review into sentences, flatten them into one list,
# then join the sentences into a single space-separated string.
sentences = [sentence for review in review_1 for sentence in sent_tokenize(review)]
text = " ".join(sentences)

In [63]:
# Two-level taxonomy consumed by classify_topics.
#   outer key   : space-separated category keywords
#   inner key   : label returned to the caller for that category
#   inner value : space-separated keywords describing the label
# classify_topics scores an input against "<outer key> <inner value>" for every label.
#
# NOTE(review): the labels "Buiness Service Category" and "Resonable Price" are misspelled,
# but they are returned verbatim in the classifier output, so renaming them changes results
# that downstream code or readers may rely on.
# NOTE(review): keywords such as "recipt", "crowdy" and "$" are only used if they exist in the
# word-vector vocabulary (vocab_check drops anything else) -- verify they have any effect.
topic_taxonomy = {
    "service customer customers experience":
    {
        "Good Service Feedback": "great help good surprise love excellent nice amazing",
        "Bad Service Feedback": "bad worst rude wait time minutes wait walk line disappoint horrible suck terrible attitude slow rush",
        "Subjective Service Feedback": "leave think" 
    },
    "food meal":
    {
        "Good Taste": "tasty good nice delicious yummy",
        "Bad Taste": "bland bad worst disgust dry flavour less flavourless disappoint horrible suck terrible",
        "Food Category": "fast food dessert breakfast thai vegan asian healthy salad mexican chinese italian korean japanese",
        "Food Item": "chicken meal rice salad burger meat drink bread sandwiches pizza cheese coffee sauce cream breakfast cake wing tea seafood sushi beer",
        "Cooking Methods": "fry pan boil saute cook steam bake",
    },
     "category restaurant auto service beauty store health business":
    {
        "Business Type": "restaurant bar store spa pharmacy salon financial hotel education pets entertainment medical shopping home service rental",
        "Buiness Service Category": "nail polish wash hair snacks tea drink food car gas repair book art movie fitness baby flower alcohol",

    },
     "price pay payment charge money tip check fee recipt":
    {
        "Payment Type": "credit card cash",
        "Pricy": "overprice expensive over priced $",
        "Resonable Price": "cheap worth",
        "Deal": "groupon discount"
    },
     "infrastructure location room table wall bed bathroom area seat park delivery":
    {
        "Infrastructure Good Feedback": "clean",
        "Infrastructure Bad Feedback": "disgust dirty crowd crowdy smell filthy",
        "Location Neighborhood": "airport"
    }
}

In [11]:
def vec_similarity(input1, input2, vectors):
    """Cosine similarity between the summed word vectors of two strings.

    Each input is split on single spaces, the vectors of the tokens found in
    ``vectors`` are summed, and the cosine similarity of the two sums is
    returned.

    Args:
        input1: First space-separated string.
        input2: Second space-separated string.
        vectors: Mapping from token to vector, indexable as ``vectors[token]``
            and raising ``KeyError`` for unknown tokens. If it exposes a
            ``vector_size`` attribute that dimensionality is used; otherwise
            300 is assumed.

    Returns:
        The cosine similarity, or 0 when either input has no known tokens
        (the similarity is undefined for an all-zero vector).
    """
    # Fall back to the historical hard-coded size for plain mappings.
    dim = getattr(vectors, 'vector_size', 300)

    term_vectors = []
    for term in (input1, input2):
        total = np.zeros(dim)
        for token in term.split(' '):
            try:
                total += vectors[token]
            except KeyError:
                # Out-of-vocabulary tokens contribute nothing to the sum.
                continue
        term_vectors.append(total)

    # Guard explicitly: the cosine distance of an all-zero vector is a 0/0
    # division and does not yield a meaningful similarity.
    if not term_vectors[0].any() or not term_vectors[1].any():
        return 0

    result = 1 - spatial.distance.cosine(term_vectors[0], term_vectors[1])
    # The original compared against the string 'nan', which can never match a float.
    if np.isnan(result):
        result = 0

    return result

In [34]:
# Filter a list of words down to those present in the model's vocabulary.
def vocab_check(vectors, words):
    """Return the words found in ``vectors.key_to_index``, in input order.

    Membership is tested on each word as given; surrounding whitespace is
    stripped only from the words that are kept.
    """
    return [word.strip() for word in words if word in vectors.key_to_index]

In [13]:
# Similarity between two strings using a particular word-vector model.
def calc_similarity(input1, input2, vectors):
    """Similarity between the in-vocabulary word sets of two strings.

    Both strings are split on whitespace, filtered with ``vocab_check`` and
    de-duplicated before being passed to ``vectors.n_similarity``.

    Args:
        input1: First string.
        input2: Second string.
        vectors: Word-vector model providing ``key_to_index`` and
            ``n_similarity``.

    Returns:
        The value of ``vectors.n_similarity`` for the two word lists, or 0.0
        when either string has no in-vocabulary words (``n_similarity``
        cannot handle an empty list).
    """
    # Sorted lists instead of sets: a fixed order keeps the floating-point
    # result identical between runs, and lists are what n_similarity documents.
    s1words = sorted(set(vocab_check(vectors, input1.split())))
    s2words = sorted(set(vocab_check(vectors, input2.split())))

    if not s1words or not s2words:
        return 0.0

    return vectors.n_similarity(s1words, s2words)

In [41]:
# Scores an input string against every label in topic_taxonomy, keeps the best
# label per category, and returns the highest-scoring labels.
def classify_topics(input, vectors, top_n=5):
    """Rank taxonomy labels by similarity to ``input``.

    For every category in ``topic_taxonomy`` each label is scored with
    ``calc_similarity`` against "<category keywords> <label keywords>". Only
    the best-scoring label of each category is kept (the first one wins on
    ties).

    Args:
        input: Text to classify.
        vectors: Word-vector model forwarded to ``calc_similarity``.
        top_n: Maximum number of (label, score) pairs to return. Defaults to
            5, the previously hard-coded cut-off.

    Returns:
        A list of ``(label, score)`` tuples sorted by descending score, at
        most ``top_n`` long.
    """
    feed_score = dict()
    for key, value in topic_taxonomy.items():
        if not value:
            # A category without labels has nothing to contribute.
            continue

        label_scores = {
            label: float(calc_similarity(input, (key + ' ' + keywords).strip(), vectors))
            for label, keywords in value.items()
        }

        # max() returns the first maximal label, matching the stable
        # descending sort used previously.
        best_label = max(label_scores, key=label_scores.get)
        feed_score[best_label] = label_scores[best_label]

    ranked = sorted(feed_score.items(), key=operator.itemgetter(1), reverse=True)
    return ranked[:top_n]

In [55]:
# Classify topics across all customer reviews of The Burren, using `text`
# (every review sentence joined into one string).
output1 = classify_topics(text, model_word2vec)
print(output1)

[('Bad Service Feedback', 0.7216174602508545), ('Bad Taste', 0.6662274599075317), ('Buiness Service Category', 0.6246830224990845), ('Infrastructure Bad Feedback', 0.5886725187301636), ('Price', 0.46098247170448303)]


In [64]:
# Use one sample customer review from The Burren to test the model.
# The backslash continuations embed each following line's leading spaces in the string;
# this is harmless because calc_similarity tokenizes with str.split().
output2 = classify_topics('There are so many better places in davis square where they are glad you are visiting \
                            their business. Sad that the burren is now the worst place in davis. \
                            So what exactly does the Burren think it is? $6 for a pint of Guiness at a dingy Irish bar in Davis Sq? \
                            Outrageous, its not Fanueil Hall! Went with a few friends, got fish and chips. \
                            Over priced for dry, flavor less fish and chips', model_word2vec)
print(output2)

[('Bad Service Feedback', 0.6423662304878235), ('Bad Taste', 0.5662879347801208), ('Infrastructure Bad Feedback', 0.5158231854438782), ('Buiness Service Category', 0.5130825042724609), ('Pricy', 0.4023929536342621)]
