<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Aspect-Based-Opinion-Mining-of-Hotel-Reviews" data-toc-modified-id="Aspect-Based-Opinion-Mining-of-Hotel-Reviews-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Aspect-Based Opinion Mining of Hotel Reviews</a></span><ul class="toc-item"><li><span><a href="#1.-Get-hotel-that-has-more-than-100-reviews" data-toc-modified-id="1.-Get-hotel-that-has-more-than-100-reviews-1.1"><span class="toc-item-num">1.1&nbsp;&nbsp;</span>1. Get hotel that has more than 100 reviews</a></span></li><li><span><a href="#3.-Generate-aspect-rate-based-on-sentiment-analysis" data-toc-modified-id="3.-Generate-aspect-rate-based-on-sentiment-analysis-1.2"><span class="toc-item-num">1.2&nbsp;&nbsp;</span>3. Generate aspect rate based on sentiment analysis</a></span></li></ul></li></ul></div>

## Hotel Rank based on customer preference

### 1. Using hotel in San Diego for example

In [1]:
import os
from os import listdir
import pandas as pd
from nltk.corpus import sentiwordnet as swn
df = pd.read_csv('C:/Users/xyyao/Documents/GitHub/Topics-Extraction-Hotel-Reviews - local/hotel-reviews/Datafiniti_Hotel_Reviews_Jun19.csv', header=0)
df.columns

Index(['id', 'dateAdded', 'dateUpdated', 'address', 'categories',
       'primaryCategories', 'city', 'country', 'keys', 'latitude', 'longitude',
       'name', 'postalCode', 'province', 'reviews.date', 'reviews.dateAdded',
       'reviews.dateSeen', 'reviews.rating', 'reviews.sourceURLs',
       'reviews.text', 'reviews.title', 'reviews.userCity',
       'reviews.userProvince', 'reviews.username', 'sourceURLs', 'websites'],
      dtype='object')

In [2]:
df.drop(columns=['dateAdded', 'dateUpdated', 'address', 'categories',
       'primaryCategories', 'keys', 'reviews.date', 'reviews.dateAdded',
       'reviews.dateSeen', 'reviews.userCity',
       'reviews.userProvince', 'reviews.username', 'sourceURLs', 'websites'], inplace = True)
df = df.rename(index=str, columns={'reviews.rating':'rating', 'reviews.sourceURLs': 'reviews_sourceURLs', 'reviews.text':'reviews_text', 'reviews.title': 'reviews_title'})

In [3]:
df1 = df[df.groupby(['id'])['id'].transform('count') > 50]

In [4]:
df_sd = df1[df1['city'] == 'San Diego']

In [5]:
import numpy as np
from collections import Counter
dic_sd = {}
for name in np.unique(df_sd['name']):
    dic_sd[name] = {'num_reviews':0, 'term_dict':Counter()}

### 2. Generate term dict with sentiment score

In [7]:
import numpy as np
import pickle
from collections import Counter
import re

# nlp libraries/api
import en_core_web_lg
from spacy import displacy
import gensim
#from neuralcoref import Coref

spacy = en_core_web_lg.load()
#coref = Coref(nlp=spacy)

In [8]:
# Uncomment below if running for first time.
# Setup nltk corpora path and Google Word2Vec location
#google_vec_file = 'GoogleNews-vectors-negative300.bin'
#word2vec = gensim.models.KeyedVectors.load_word2vec_format(google_vec_file, binary=True)
#pickle.dump(word2vec, open("word2vec_google.pkl", 'wb'))

# If above script has been run, load saved word embedding
word2vec = pickle.load(open("word2vec_google.pkl", 'rb'))

In [9]:
def check_similarity(aspects, word):
    '''
    checks for word2vec similarity values between category word and the term
    returns most similar word
    '''
    similarity = []
    for aspect in aspects:
        sim = [word2vec.n_similarity([aspect], word.split())]
        for splited in word.split():
            sim.append(word2vec.n_similarity([aspect], [splited]))
        similarity.append(max(sim))
    # set threshold for max value
    if max(similarity) > 0.2:
        return aspects[np.argmax(similarity)]
    else:
        return None


def modify_sentiment(token, sentiment):
    for child in token.children:
        # if there's a adj modifier (i.e. very, pretty, etc.) add more weight to sentiment
        # This could be better updated for modifiers that either positively or negatively emphasize
        # can't catch "there are nowhere near enough loungers"
        if ((child.dep_ == "amod") or (child.dep_ == "advmod") or (child.dep_ == "neg")):
            try:
                senti = list(swn.senti_synsets(child, 'r'))[0]
                pos = senti.pos_score()
                neg = senti.neg_score()
                if child.dep_ == "neg":
                    sentiment *= -(neg + 1)
                else:
                    sentiment *= pos + 1
            except:
                continue
    return sentiment


def add_sentiment(token, sent_dict, sentiment):
    if (token.dep_ == "advmod"):
        return sent_dict
    elif (token.dep_ == "acomp"):
        sent_dict = for_acomp(token, sent_dict, sentiment)
    elif (token.dep_ == "amod"):
        sent_dict = for_amod(token, sent_dict, sentiment)
    elif (token.dep_ == "compound"):
        sent_dict = for_compound(token, sent_dict, sentiment)
    elif (token.dep_ == "conj"):
        sent_dict = add_sentiment(token.head, sent_dict, sentiment)
    elif (token.dep_ == "dobj"):
        sent_dict = for_dobj(token, sent_dict, sentiment)
    elif (token.dep_ == "relcl"):
        if token.head.pos_ == "NOUN" or token.head.pos_ == "PROPN":
            sent_dict[token.head.lemma_] += sentiment
    else:
        # if verb, check if there's a direct object
        if token.head.dep_ == "advcl" or token.head.dep_ == "ROOT":
            sent_dict = for_advcl(token.head, sent_dict, sentiment, token)
        for child in token.children:
            if (token.pos_ == "VERB") & (child.dep_ == "dobj") & (child.pos_ != 'PRON'):
                sent_dict = check_compound(child, sent_dict, sentiment, token)
            # check for nouns
            elif child.pos_ == "NOUN" or child.pos_ == "PROPN":
                sent_dict = check_compound(child, sent_dict, sentiment, token)
            if ((token.pos_ == "VERB") or (token.pos_ == "ADJ")) & (child.dep_ == "prep"):
                sent_dict = for_prep(child, sent_dict, sentiment)
    return sent_dict


def add_subchild(child, sent_dict, sentiment, token, added):
    if child.children == []:
        return sent_dict, added
    else:
        conj = 0
        for subchild in child.children:
            if (subchild.dep_ in ["compound", "amod", "nmod"]) & (subchild != token):
                sent_dict[subchild.lemma_ + ' ' + child.lemma_] += sentiment
                for sub in subchild.children:
                    if sub.pos_ in ["NOUN", "PROPN"]:
                        sent_dict[sub.lemma_ + ' ' + child.lemma_] += sentiment
                added = True
            # check for conjugates (a AND b), then add both to dictionary
            elif subchild.text == "and" or subchild.text == ",":
                conj += 1
        for subchild in child.children:
            if (conj > 0) & (subchild.pos_ == "NOUN" or subchild.pos_ == "PROPN"):
                sent_dict = check_compound(subchild, sent_dict, sentiment, child)
                conj -= 1
    return sent_dict, added


def check_compound(child, sent_dict, sentiment, token=None):
    added = False
    sent_dict, added = add_subchild(child, sent_dict, sentiment, token, added)
    if not added:
        sent_dict[child.lemma_] += sentiment
    return sent_dict


def for_compound(token, sent_dict, sentiment):
    if token.head.dep_ == "compound":
        sent_dict = for_compound(token.head, sent_dict, sentiment)
    elif token.head.pos_ in ["NOUN", "PROPN"]:
        sent_dict = check_compound(token.head, sent_dict, sentiment)
    return sent_dict


def for_acomp(token, sent_dict, sentiment):
    for ancestor in token.ancestors:
        if ancestor.dep_ == "attr":
            sent_dict[ancestor] += sentiment
        if ancestor.dep_ in ["advcl", "ROOT", "conj", "ccomp"]:
            sent_dict = for_advcl(ancestor, sent_dict, sentiment, token)
    return sent_dict


def for_advcl(ancestor, sent_dict, sentiment, token):
    for child in ancestor.children:
        if (child.dep_ in ["nsubj", "compound"]) & (child.pos_ in ["NOUN", "PROPN"]):
            sent_dict = check_compound(child, sent_dict, sentiment, token)
    return sent_dict


def for_amod(token, sent_dict, sentiment):
    if token.head.dep_ == "appos":
        sent_dict = check_compound(token.head, sent_dict, sentiment, token)
    elif token.head.dep_ == "pobj":
        if token.head.head.head.dep_ in ["advcl", "ROOT", "conj"]:
            sent_dict = for_advcl(token.head.head.head, sent_dict, sentiment, token)
    if token.head.pos_ == "NOUN" or token.head.pos_ == "PROPN":  # token.head.dep_ != "compound", so "free room delivery" not added twice
        sent_dict = check_compound(token.head, sent_dict, sentiment, token)
    return sent_dict


def for_dobj(token, sent_dict, sentiment):
    if token.head.dep_ == "conj":
        head = token.head.head
        head_child = token.head
    else:
        head = token.head
        head_child = token
    for child in head.children:
        if (child.dep_ == "nsubj") & (child.pos_ == "NOUN" or child.pos_ == "PROPN"):
            sent_dict = check_compound(child, sent_dict, sentiment, head_child)
    return sent_dict


def for_prep(token, sent_dict, sentiment):
    for child in token.children:
        if child.pos_ == "NOUN" or child.pos_ == "PROPN":
            sent_dict = check_compound(child, sent_dict, sentiment, token)
    return sent_dict


def feature_sentiment(sentence, sent_dict):
    '''
    input: dictionary and sentence
    function: appends dictionary with new features if the feature did not exist previously,
              then updates sentiment to each of the new or existing features
    output: updated dictionary
    '''
    sentence = spacy(sentence)
    for token in sentence:
        # check if the word is an opinion word, then assign sentiment
        try:
            senti = list(swn.senti_synsets(token.text, 'a'))[0]
            pos = senti.pos_score()
            neg = senti.neg_score()
            if (pos != 0) or (neg != 0):
                sentiment = pos if pos > neg else -neg
            else:
                continue
            # if target is an adverb modifier (i.e. pretty, highly, etc.)
            # but happens to be an opinion word, ignore and pass
            sentiment = modify_sentiment(token, sentiment)
            sent_dict = add_sentiment(token, sent_dict, sentiment)
        except:
            continue
    #print(sent_dict)
    return sent_dict


from nltk.tokenize import sent_tokenize


def split_sentence(text):
    '''
    splits review into a list of sentences using spacy's sentence parser
    '''
    review = spacy(text)
    bag_sentence = []
    start = 0
    for token in review:
        if token.sent_start:
            bag_sentence.append(review[start:(token.i - 1)])
            start = token.i
        if token.i == len(review) - 1:
            bag_sentence.append(review[start:(token.i + 1)])
    return bag_sentence


# Remove special characters using regex
def remove_special_char(sentence):
    return re.sub(r"[^a-zA-Z0-9.',:;?]+", ' ', sentence)


# "ROOMS", "CLEANLINESS", "VALUE", "SERVICE", "LOCATION", "CHECKIN", "BUSINESS", "FOOD", "BUILDING", "OTHER", "NOTRELATED"
def review_pipe(file, dict_sd):
    with open(file) as fp:
        Lines = fp.readlines()
        for line in Lines:
            name = line.split('|')[0]
            review = line.split('|')[1].split('Thank you for your')[0]

            # review = replace_pronouns(review)
            sentences = split_sentence(review)
            for sentence in sentences:
                sentence = remove_special_char(str(sentence))
                dict_sd[name]['term_dict'] = feature_sentiment(sentence.lower(), dict_sd[name]['term_dict'])
            dict_sd[name]['num_reviews'] += 1
    for name in dict_sd:
        for term in dict_sd[name]['term_dict']:
            dict_sd[name]['term_dict'][term] /= dict_sd[name]['num_reviews']/100
    return dict_sd

In [10]:
term_dict_sd = review_pipe('san_diego.txt', dic_sd)
term_dict_sd

{'Best Western Mission Bay': {'num_reviews': 69,
  'term_dict': Counter({'western mission': 32.608695652173914,
           'floor': 1.4492753623188408,
           'room': 9.96376811594203,
           'star hotel': 0.1811594202898551,
           'breakfast': 18.297101449275363,
           'people': 1.6304347826086958,
           'sea': -2.536231884057971,
           'town': 3.2608695652173916,
           'parking': 4.166666666666667,
           'hotel': 8.695652173913045,
           'man': 1.2681159420289856,
           'pool': 4.528985507246377,
           'abd spa': 2.536231884057971,
           'afternoon': 21.73913043478261,
           'good afternoon': -0.1811594202898551,
           'stay': 13.224637681159422,
           'ordinary hotel': -1.0869565217391306,
           'ugly hotel': -1.2681159420289856,
           'musty hotel': -1.2681159420289856,
           'standard': 0.1811594202898551,
           'toilet': -0.3623188405797102,
           'football team': -0.3623188405797102

In [47]:
from collections import defaultdict
def rank_hotel(term):
    rank_hotel = defaultdict(int)
    for name in term_dict_sd:
        for t in term_dict_sd[name]['term_dict']:
            try:
                sim = word2vec.n_similarity(t.split(), [term])
                if sim > 0.3:
                    print(t)
                    rank_hotel[name] += term_dict_sd[name]['term_dict'][t]
            except:
                continue
    return rank_hotel

In [48]:
rank_hotel = rank_hotel('breakfast')

breakfast
hotel
ordinary hotel
breakfast course
bed
breakfast lot
good fare
day
free breakfast
hotel room
night
evening
breakfast burrito
waffle
morning
breakfast room
continental food
breakfast food
continental breakfast
breakfast entree
continental entree
fast food
comfortable bed
entire breakfast
good morning
queen bed
dining
breakfast special
full breakfast
guest smoking
room bathroom
hotel staff
good breakfast
hotel employee
shower
breakfast area
spacious buffet
incredible buffet
breakfast buffet
bed room
nice shower
easy shower
sleep
cereal
yogurt
hot beverage
fabulous breakfast
breakfast gal
coffee
clean hotel
basic hotel
good hotel
bayside inn
hotel
donut
muffin
dinner
breakfast buffet
bed
breakfast
comfortable bed
evening meal
connect restaurant
great shower
lovely room
night
comfy bed
inclusive breakfast
nice bed
hotel staff
good inn
complementary breakfast
clean hotel
affordable hotel
downtown hotel
bay breakfast
complimentary breakfast
biscuit
cheese
day
complimentary buffe

In [46]:
sorted(rank_hotel.items(), key=lambda v: v[1], reverse=True)

[('Best Western Plus Bayside Inn', 72.4609375),
 ('The Pearl Hotel', 67.07746478873241),
 ('Best Western Plus Hacienda Hotel Old Town', 62.109375),
 ('Best Western San Diego/Miramar Hotel', 60.00000000000001),
 ('Quality Suites San Diego SeaWorld Area', 59.72222222222224),
 ('Best Western Yacht Harbor Hotel', 59.65909090909087),
 ('Best Western Mission Bay', 51.99275362318842),
 ('Hampton Inn San Diego Del Mar', 45.875),
 ('Best Western Seven Seas', 31.81818181818182),
 ('Ocean Park Inn', 22.222222222222232)]