In [1]:
import numpy as np
import pickle
from collections import Counter
import re

# nlp libraries/api
import en_core_web_lg
from spacy import displacy
import gensim
#from neuralcoref import Coref

spacy = en_core_web_lg.load()
#coref = Coref(nlp=spacy)


In [3]:
# Load opinion lexicon
neg_file = open("neg_words.txt",encoding = "ISO-8859-1")
pos_file = open("pos_words.txt",encoding = "ISO-8859-1")
neg = [line.strip() for line in neg_file.readlines()]
pos = [line.strip() for line in pos_file.readlines()]
opinion_words = neg + pos


In [11]:

def check_similarity(aspects, word):
    '''
    checks for word2vec similarity values between category word and the term
    returns most similar word
    '''
    similarity = []
    for aspect in aspects:
        similarity.append(word2vec.n_similarity([aspect], word.split()))
    # set threshold for max value
    if max(similarity) > 0.2:
        return aspects[np.argmax(similarity)]
    else:
        return None


def assign_term_to_aspect(aspect_sent, terms_dict, sent_dict, pred):
    '''
    function: takes in a sentiment dictionary and appends the aspect dictionary
    inputs: sent_dict is a Counter in the form Counter(term:sentiment value)
            aspect_sent is total sentiment tally
            terms_dict is dict with individual aspect words associated with sentiment
    output: return two types of aspect dictionaries:
            updated terms_dict and aspect_sent
    '''
    aspects = ['location', 'checkin', 'food', 'building', 'room', 'cleanliness', 'value', 'service', 'business']

    # First, check word2vec
    # Note: the .split() is used for the term because word2vec can't pass compound nouns
    for term in sent_dict:
        try:
            # The conditions for when to use the NB classifier as default vs word2vec
            aspect = check_similarity(aspects, term)
            if aspect:
                terms_dict[aspect.upper()][term] += sent_dict[term]
                if sent_dict[term] > 0:
                    aspect_sent[aspect.upper()]["pos"] += sent_dict[term]
                else:
                    aspect_sent[aspect.upper()]["neg"] += abs(sent_dict[term])
            elif pred:
                aspect = str(pred[0]).strip("(',)")
                terms_dict[aspect.upper()][term] += sent_dict[term]
                if sent_dict[term] > 0:
                    aspect_sent[aspect.upper()]["pos"] += sent_dict[term]
                else:
                    aspect_sent[aspect.upper()]["neg"] += abs(sent_dict[term])
            # if unable to classify via NB or word2vec, then put them in misc. bucket
            else:
                terms_dict["OTHER"][term] += sent_dict[term]
                if sent_dict[term] > 0:
                    aspect_sent["OTHER"]["pos"] += sent_dict[term]
                else:
                    aspect_sent["OTHER"]["neg"] += abs(sent_dict[term])
        except:
            print(term, "not in vocab")
            continue
    return aspect_sent, terms_dict


def modify_sentiment(token, sentiment):
    for child in token.children:
        # if there's a adj modifier (i.e. very, pretty, etc.) add more weight to sentiment
        # This could be better updated for modifiers that either positively or negatively emphasize
        # can't catch "there are nowhere near enough loungers"
        if ((child.dep_ == "amod") or (child.dep_ == "advmod")) and (child.text in opinion_words):
            sentiment *= 1.5
        # check for negation words and flip the sign of sentiment
        if child.dep_ == "neg":
            sentiment *= -1
    return sentiment


def add_subchild(child, sent_dict, sentiment, token, added):
    if child.children == []:
        return sent_dict, added
    else:
        conj = 0
        for subchild in child.children:
            if (subchild.dep_ in ["compound", "amod", "nmod"]) & (subchild != token):
                sent_dict[subchild.lemma_ + ' ' + child.lemma_] += sentiment
                for sub in subchild.children:
                    if sub.pos_ in ["NOUN", "PROPN"]:
                        sent_dict[sub.lemma_ + ' ' + child.lemma_] += sentiment
                added = True
            # check for conjugates (a AND b), then add both to dictionary
            elif subchild.text == "and" or subchild.text == ",":
                conj += 1
        for subchild in child.children:
            if (conj > 0) & (subchild.pos_ == "NOUN" or subchild.pos_ == "PROPN"):
                sent_dict = check_compound(subchild, sent_dict, sentiment, child)
                conj -= 1
    return sent_dict, added


def check_compound(child, sent_dict, sentiment, token=None):
    added = False
    sent_dict, added = add_subchild(child, sent_dict, sentiment, token, added)
    if not added:
        sent_dict[child.lemma_] += sentiment
    return sent_dict


def for_compound(token, sent_dict, sentiment):
    if token.head.dep_ == "compound":
        sent_dict = for_compound(token.head, sent_dict, sentiment)
    elif token.head.pos_ in ["NOUN", "PROPN"]:
        sent_dict = check_compound(token.head, sent_dict, sentiment)
    return sent_dict


def for_acomp(token, sent_dict, sentiment):
    for ancestor in token.ancestors:
        if ancestor.dep_ == "attr":
            sent_dict[ancestor] += sentiment
        if ancestor.dep_ in ["advcl", "ROOT", "conj"]:
            sent_dict = for_advcl(ancestor, sent_dict, sentiment, token)
    return sent_dict


def for_advcl(ancestor, sent_dict, sentiment, token):
    for child in ancestor.children:
        if (child.dep_ in ["nsubj", "compound"] ) & (child.pos_ in ["NOUN", "PROPN"]):
            sent_dict = check_compound(child, sent_dict, sentiment, token)
    return sent_dict


def for_amod(token, sent_dict, sentiment):
    if token.head.dep_ == "appos":
        sent_dict = check_compound(token.head, sent_dict, sentiment, token)
    elif token.head.dep_ == "pobj":
        if token.head.head.head.dep_ in ["advcl", "ROOT", "conj"]:
            sent_dict = for_advcl(token.head.head.head, sent_dict, sentiment, token)
    if token.head.pos_ == "NOUN" or token.head.pos_ == "PROPN":  # token.head.dep_ != "compound", so "free room delivery" not added twice
        sent_dict = check_compound(token.head, sent_dict, sentiment, token)
    return sent_dict


def for_dobj(token, sent_dict, sentiment):
    if token.head.dep_ == "conj":
        head = token.head.head
        head_child = token.head
    else:
        head = token.head
        head_child = token
    for child in head.children:
        if (child.dep_ == "nsubj") & (child.pos_ == "NOUN" or child.pos_ == "PROPN"):
            sent_dict = check_compound(child, sent_dict, sentiment, head_child)
    return sent_dict


def for_prep(token, sent_dict, sentiment):
    for child in token.children:
        if child.pos_ == "NOUN" or child.pos_ == "PROPN":
            sent_dict = check_compound(child, sent_dict, sentiment, token)
    return sent_dict


def feature_sentiment(sentence):
    '''
    input: dictionary and sentence
    function: appends dictionary with new features if the feature did not exist previously,
              then updates sentiment to each of the new or existing features
    output: updated dictionary
    '''
    sent_dict = Counter()
    sentence = spacy(sentence)
    for token in sentence:
        # check if the word is an opinion word, then assign sentiment
        if (token.lemma_ in opinion_words) or (token.text in opinion_words):
            sentiment = 1 if ((token.text in pos) or (token.lemma_ in pos)) else -1
            # if target is an adverb modifier (i.e. pretty, highly, etc.)
            # but happens to be an opinion word, ignore and pass
            sentiment = modify_sentiment(token, sentiment)
            if (token.dep_ == "advmod"):
                continue
            elif (token.dep_ == "acomp"):
                sent_dict = for_acomp(token, sent_dict, sentiment)
            elif (token.dep_ == "amod"):
                sent_dict = for_amod(token, sent_dict, sentiment)
            elif (token.dep_ == "compound"):
                sent_dict = for_compound(token, sent_dict, sentiment)
            elif (token.dep_ == "dobj"):
                sent_dict = for_dobj(token, sent_dict, sentiment)
            elif (token.dep_ == "relcl"):
                if token.head.pos_ == "NOUN" or token.head.pos_ == "PROPN":
                    sent_dict[token.head.lemma_] += sentiment
            else:
                    # if verb, check if there's a direct object
                if token.head.dep_ == "advcl" or token.head.dep_ == "ROOT":
                    sent_dict = for_advcl(token.head, sent_dict, sentiment, token)
                for child in token.children:
                    if (token.pos_ == "VERB") & (child.dep_ == "dobj"):
                        sent_dict = check_compound(child, sent_dict, sentiment, token)
                    # check for nouns
                    elif child.pos_ == "NOUN" or child.pos_ == "PROPN":
                        sent_dict = check_compound(child, sent_dict, sentiment, token)
                    if ((token.pos_ == "VERB") or (token.pos_ == "ADJ")) & (child.dep_ == "prep"):
                        sent_dict = for_prep(child, sent_dict, sentiment)
    return sent_dict


def classify_and_sent(sentence, aspect_sent, terms_dict):
    '''
    function: classify the sentence into a category, and assign sentiment
    note: aspect_dict is a parent dictionary with all the aspects
    input: sentence & aspect dictionary, which is going to be updated
    output: updated aspect dictionary
    '''
    # classify sentence with NB classifier
    predicted = svm_model.predict([sentence])
    pred = mlb.inverse_transform(predicted)
    if "('OTHER')," in pred:
        pred.remove("('OTHER'),")
    if "('NOTRELATED')," in pred:
        pred.remove("('NOTRELATED'),")

    # get aspect names and their sentiment in a dictionary form
    sent_dict = feature_sentiment(sentence)

    # try to categorize the aspect names into the 4 aspects in aspect_dict
    aspect_sent, terms_dict = assign_term_to_aspect(aspect_sent, terms_dict, sent_dict, pred)
    return aspect_sent, terms_dict


#def replace_pronouns(text):
 #   coref.one_shot_coref(text)
  #  return coref.get_resolved_utterances()[0]

from nltk.tokenize import sent_tokenize

def split_sentence(text):
    '''
    splits review into a list of sentences using spacy's sentence parser
    '''
    review = spacy(text)
    bag_sentence = []
    start = 0
    for token in review:
        if token.sent_start:
            bag_sentence.append(review[start:(token.i - 1)])
            start = token.i
        if token.i == len(review) - 1:
            bag_sentence.append(review[start:(token.i + 1)])
    return bag_sentence

# Remove special characters using regex
def remove_special_char(sentence):
    return re.sub(r"[^a-zA-Z0-9.',:;?]+", ' ', sentence)
# "ROOMS", "CLEANLINESS", "VALUE", "SERVICE", "LOCATION", "CHECKIN", "BUSINESS", "FOOD", "BUILDING", "OTHER", "NOTRELATED"
def review_pipe(review, aspect_sent, terms_dict):
    #review = replace_pronouns(review)
    sentences = split_sentence(review)
    for sentence in sentences:
        sentence = remove_special_char(str(sentence))
        aspect_sent, terms_dict = classify_and_sent(sentence.lower(), aspect_sent, terms_dict)
    return aspect_sent, terms_dict

In [5]:
# Uncomment below if running for first time.
# Setup nltk corpora path and Google Word2Vec location
#google_vec_file = 'GoogleNews-vectors-negative300.bin'
#word2vec = gensim.models.KeyedVectors.load_word2vec_format(google_vec_file, binary=True)
#pickle.dump(word2vec, open("word2vec_google.pkl", 'wb'))

# If above script has been run, load saved word embedding
word2vec = pickle.load(open("word2vec_google.pkl", 'rb'))

# load the Multi-label binarizer from previous notebook
mlb = pickle.load(open("mlb.pkl", 'rb'))

# load the fitted naive bayes model from previous notebook
svm_model = pickle.load(open("svm_model.pkl", 'rb'))

In [6]:
with open("hotel_reviewVA.txt") as fp: 
    reviews = []
    Lines = fp.readlines() 
    for line in Lines: 
        reviews.append(line)

In [12]:
aspect_sent={'LOCATION':Counter(), 'CHECKIN':Counter(), 'FOOD':Counter(), 'BUILDING':Counter(), 'ROOM':Counter(), 'CLEANLINESS':Counter(), 'VALUE':Counter(), 'SERVICE':Counter(), 'BUSINESS':Counter(), 'OTHER':Counter()}
term_dict={'LOCATION':Counter(), 'CHECKIN':Counter(), 'FOOD':Counter(), 'BUILDING':Counter(), 'ROOM':Counter(), 'CLEANLINESS':Counter(), 'VALUE':Counter(), 'SERVICE':Counter(), 'BUSINESS':Counter(), 'OTHER':Counter()}
for review in reviews:
    aspect_sent, term_dict = review_pipe(review, aspect_sent, term_dict)

attention not in vocab
rimmed glass not in vocab
-PRON- not in vocab
-PRON- not in vocab
order not in vocab
-PRON- not in vocab
-PRON- not in vocab
. location not in vocab
n.o location not in vocab
-PRON- not in vocab
-PRON- not in vocab
-PRON- not in vocab
-PRON- not in vocab
-PRON- not in vocab
-PRON- not in vocab
-PRON- not in vocab
-PRON- not in vocab
callaisgeneral manager not in vocab
air con not in vocab
-PRON- not in vocab
russells not in vocab
-PRON- not in vocab
-PRON- not in vocab
-PRON- not in vocab
-PRON- not in vocab
-PRON- not in vocab
callaisgeneral manager not in vocab
advice not in vocab
-PRON- not in vocab
-PRON- not in vocab
callaisgeneral manager not in vocab
-PRON- not in vocab
-PRON- not in vocab
-PRON- not in vocab
-PRON- not in vocab
-PRON- not in vocab
-PRON- not in vocab
-PRON- not in vocab
-PRON- not in vocab
-PRON- not in vocab
-PRON- not in vocab
sheet not in vocab
addition not in vocab
-PRON- not in vocab
-PRON- not in vocab
-PRON- not in vocab
-PRON- not

In [27]:
import pandas as pd
aspect, pos, neg = [], [], []
for k, v in aspect_sent.items():
    aspect.append(k)
    pos.append(v['pos'])
    neg.append(v['neg'])
df = pd.DataFrame({'aspect': aspect, 'pos': pos, 'neg': neg})
df['rate'] = (df['pos'] - df['neg']) * 5 / (df['pos'] + df['neg'])
df.to_csv('VA_aspect_sent.csv')

In [37]:
from collections import defaultdict
lowest = defaultdict()
highest = defaultdict()
for i in term_dict:
    if term_dict[i]:
        highest[i] = term_dict[i].most_common()[:10]
        lowest[i] = term_dict[i].most_common()[-10:]
highest_df = pd.DataFrame(highest)
lowest_df = pd.DataFrame(lowest)
lowest_df = lowest_df.drop(["OTHER"],1)
highest_df = highest_df.drop(["OTHER"],1)

In [35]:
highest_df

Unnamed: 0,LOCATION,CHECKIN,FOOD,BUILDING,ROOM,CLEANLINESS,VALUE,SERVICE,BUSINESS
0,"(location, 21)","(feedback, 16.5)","(breakfast, 13)","(city, 4)","(room, 31.5)","(clean parking, 3)","(value, 6)","(staff, 43.0)","(bank, 2)"
1,"(visit, 13)","(kind word, 6)","(food, 7)","(property, 3.5)","(hotel, 24.5)","(fitness center, 2.5)","(price, 4)","(service, 15)","(continue patronage, 2)"
2,"(close location, 4)","(casey manager, 4)","(restaurant, 7.0)","(efficient parking, 3)","(bed, 21.5)","(luxurious amenity, 2.5)","(overall value, 3)","(experience, 7)","(job, 2)"
3,"(place, 3)","(valet, 4)","(dinner, 4)","(hvac, 3.0)","(bathroom, 5.5)","(cleanliness, 2)","(great value, 2)","(review, 6)","(office manager, 2)"
4,"(view, 3)","(wi fi, 4)","(coffee, 4)","(bank vault, 2)","(pillow, 5)","(amenity, 2)","(correct price, 1.5)","(customer service, 5)","(vibrant art, 1)"
5,"(walk, 2)","(wi, 4)","(drink, 4)","(park condos, 2)","(hour, 5)","(noise, 1.5)","(pricing, 1)","(a. manager, 4)","(prestigious bank, 1)"
6,"(living area, 2)","(free fi, 3)","(variety, 2.5)","(architectural detail, 1)","(suite, 4)","(ambience, 1)","(knowledge, 1)","(free, 4)","(new experience, 1)"
7,"(choice, 2)","(fun, 2)","(continental breakfast, 2)","(fountain, 1)","(towel, 4)","(high level, 1)","(future expectation, 1)","(game traffic, 2.5)","(marketplace, 1)"
8,"(great location, 2)","(honest feedback, 2)","(new restaurant, 2)","(street, 1)","(desk staff, 4)","(safe, 1)","(side, 1)","(care, 2)","(transaction, 1)"
9,"(size, 1.5)","(orleans experience, 1)","(many restaurant, 2)","(parking lot, 1)","(valet parking, 4)","(kind regard, 1)","(cost, 1)","(excellent service, 2)","(work bank, 1)"


In [38]:
lowest_df

Unnamed: 0,LOCATION,CHECKIN,FOOD,BUILDING,ROOM,CLEANLINESS,VALUE,SERVICE,BUSINESS
0,"(neighborhood, 1)","(new arrival, 1)","(friendly atmosphere, 0)","(street, 1)","(low room, -2)","(high level, 1)","(side, 1)","(maintenance guy, -1)","(vibrant art, 1)"
1,"(surround area, 1)","(heather gordon, 1)","(good atmosphere, 0)","(parking lot, 1)","(light room, -2)","(safe, 1)","(cost, 1)","(woman, -1)","(prestigious bank, 1)"
2,"(light selection, 1)","(denny, 1)","(item, -1)","(entrance, -1)","(carpeting, -2)","(kind regard, 1)","(rate, 1)","(note, -1)","(new experience, 1)"
3,"(many place, 0)","(diego tip, 1)","(water glass, -1)","(exercise equipment, -1)","(floor room, -2)","(housekeeping staff, 1)","(what, 1)","(child, -1)","(marketplace, 1)"
4,"(other place, 0)","(san tip, 1)","(only disappointment, -1)","(major road, -1)","(bed room, -2)","(decor, 1)","(review, 1)","(surprise, -1)","(transaction, 1)"
5,"(town atmosphere, 0)","(thompson river, 0)","(old bedding, -1)","(part, -1)","(king room, -2)","(amenity caddy, 1)","(option, 1)","(courtesy shuttle, -1)","(work bank, 1)"
6,"(attractive place, -1)","(wifi, 0)","(meal, -1)","(parking, -1)","(door, -2)","(pride, 1)","(amount, -1)","(only inconvenience, -2)","(business district, 1)"
7,"(facility, -1)","(katrina, -1)","(big river, -1)","(mountain park, -1)","(kitchen area, -2)","(traffic noise, -1)","(monetary limit, -1)","(traffic, -2)","(central, 1)"
8,"(only problem, -1)","(jared, -1)","(pasta, -1)","(the, -1)","(window, -2)","(noise level, -1)","(several opportunity, -1)","(ac, -2)","(great job, 1)"
9,"(difficult access, -2)","(tv, -2)","(water, -3)","(condo, -2)","(room service, -3)","(smallness inconvenience, -2)","(expectation, -4)","(c unit, -2)","(business, -1)"
