## Key Topics Extraction and Contextual Sentiment of Hotel Reviews

## 1. Preprocess

In [3]:
import os
from os import listdir
import pandas as pd
#!pip install plotly
import plotly.graph_objs as go
# Load the Datafiniti hotel-review export; row 0 of the CSV supplies the column names.
reviews_csv_path = 'hotel-reviews/Datafiniti_Hotel_Reviews_Jun19.csv'
df = pd.read_csv(reviews_csv_path, header=0)
# Show the available columns so the next cell can decide which ones to keep.
df.columns

Index(['id', 'dateAdded', 'dateUpdated', 'address', 'categories',
       'primaryCategories', 'city', 'country', 'keys', 'latitude', 'longitude',
       'name', 'postalCode', 'province', 'reviews.date', 'reviews.dateAdded',
       'reviews.dateSeen', 'reviews.rating', 'reviews.sourceURLs',
       'reviews.text', 'reviews.title', 'reviews.userCity',
       'reviews.userProvince', 'reviews.username', 'sourceURLs', 'websites'],
      dtype='object')

In [4]:
# Drop the bookkeeping / reviewer-identity columns and give the remaining
# "reviews.*" columns underscore names so they can be used as attributes
# (e.g. df.reviews_text). Written as one method chain instead of mixing
# inplace=True with reassignment; the resulting `df` is the same.
# Note: index=str converts the row labels to strings.
df = (
    df.drop(columns=['dateAdded', 'dateUpdated', 'address', 'categories',
                     'primaryCategories', 'keys', 'reviews.date', 'reviews.dateAdded',
                     'reviews.dateSeen', 'reviews.userCity',
                     'reviews.userProvince', 'reviews.username', 'sourceURLs', 'websites'])
      .rename(index=str, columns={'reviews.rating':'rating', 'reviews.sourceURLs': 'reviews_sourceURLs', 'reviews.text':'reviews_text', 'reviews.title': 'reviews_title'})
)

In [14]:
# Preview the first five rows of the frame.
df.head()

Unnamed: 0,id,city,country,latitude,longitude,name,postalCode,province,rating,reviews_sourceURLs,reviews_text,reviews_title
0,AWE2FvX5RxPSIh2RscTK,Goleta,US,34.44178,-119.81979,Best Western Plus South Coast Inn,93117,CA,3,https://www.tripadvisor.com/Hotel_Review-g3243...,"This hotel was nice and quiet. Did not know, t...",Best Western Plus Hotel
1,AVwcj_OhkufWRAb5wi9T,Carmel by the Sea,US,36.55722,-121.92194,Best Western Carmel's Town House Lodge,93921,CA,4,https://www.tripadvisor.com/Hotel_Review-g3217...,We stayed in the king suite with the separatio...,Clean rooms at solid rates in the heart of Carmel
2,AVwcj_OhkufWRAb5wi9T,Carmel by the Sea,US,36.55722,-121.92194,Best Western Carmel's Town House Lodge,93921,CA,3,https://www.tripadvisor.com/Hotel_Review-g3217...,"Parking was horrible, somebody ran into my ren...",Business
3,AVwcj_OhkufWRAb5wi9T,Carmel by the Sea,US,36.55722,-121.92194,Best Western Carmel's Town House Lodge,93921,CA,5,https://www.tripadvisor.com/Hotel_Review-g3217...,Not cheap but excellent location. Price is som...,Very good
4,AVwcj_OhkufWRAb5wi9T,Carmel by the Sea,US,36.55722,-121.92194,Best Western Carmel's Town House Lodge,93921,CA,2,https://www.tripadvisor.com/Hotel_Review-g3217...,If you get the room that they advertised on th...,Low chance to come back here


In [5]:
# Write every review body to hotel_review.txt, one review per line.
# The context manager closes the file even if a write raises.
with open("hotel_review.txt", "w+") as f:
    # Iterate over the values directly rather than df.reviews_text[i]: the row
    # labels were converted to strings by rename(index=str), so integer lookups
    # only work through pandas' positional fallback.
    for review_text in df.reviews_text:
        # A missing review (e.g. NaN) is not a str and would make write() raise;
        # emit an empty line so line numbers still line up with the frame's rows.
        f.write(review_text if isinstance(review_text, str) else "")
        f.write("\n")

In [6]:
# Read the dumped reviews back and keep the first five lines as a small sample.
with open("hotel_review.txt") as fp:
    Lines = fp.readlines()
# Each entry still carries its trailing newline.
reviews = list(Lines[:5])

In [7]:
# Display the sample of review lines read back from hotel_review.txt.
reviews

['This hotel was nice and quiet. Did not know, there was train track near by. But it was only few train passed during our stay. Best Western changed hotel classification. The Plus category are not the same as before.\n',
 "We stayed in the king suite with the separation between the bedroom and the living space. The sofa bed wasn't very good I had back discomfort by the day we left on our three night stay. The room is clean, and the king bed very comfortable. This hotel is located within walking distance to most places you will want to... More\n",
 "Parking was horrible, somebody ran into my rental car while staying there. I didn't get to try the breakfast, I was there for business so the restaurant opened to late for the business world to enjoy, I had to asked for coffee for my room, And the items in the vending machine were stale.\n",
 'Not cheap but excellent location. Price is somewhat standard for not hacing reservations. But room was nice and clean. They offer good continental bre

## 2. Amenity Extraction

In [24]:
from os import listdir
from os.path import isfile, join

import numpy as np
import nltk.data
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import TreebankWordTokenizer
from nltk.util import bigrams

# Shared NLP helpers, created once and reused by the text-processing code:
# a Punkt sentence splitter, a Treebank word tokenizer and a WordNet lemmatizer.
sentence_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
treebank_tokenizer = TreebankWordTokenizer()
wordnet_lemmatizer = WordNetLemmatizer()
#stop_words = stopwords.words('english')

In [25]:
#check if the element passed in is a float
def isfloat(value):
    """Return True if ``float(value)`` succeeds, False otherwise.

    Parameters
    ----------
    value : object
        Anything; typically a token string.

    Returns
    -------
    bool
        True when the value can be cast to a float. Note that ``float`` also
        accepts strings such as "nan", "inf" and "infinity", and booleans.
    """
    try:
        float(value)
    except (TypeError, ValueError, OverflowError):
        # ValueError: unparsable string; TypeError: unsupported type such as
        # None or a list; OverflowError: an int too large for a float.
        return False
    return True

In [208]:
stop_words = ['in','of','at','a','the']
def process_text(text):
    """Tokenize a review into cleaned word tokens and their adjacent bigrams.

    Parameters
    ----------
    text : str
        Raw review text.

    Returns
    -------
    tuple(list[str], list[tuple[str, str]])
        The lemmatized tokens (punctuation, stopwords and numbers removed,
        original casing kept) and the list of consecutive token pairs.
    """
    #tokenize the text as sentences
    punkt_sentences = sentence_tokenizer.tokenize(text)
    #tokenize as lists of sentence lists of words
    sentences_words = [treebank_tokenizer.tokenize(sentence) for sentence in punkt_sentences]
    #get a list of words without punctuations ('A/C' is kept explicitly because
    #isalpha() would reject it)
    tokens = [word for sentence in sentences_words for word in sentence if word == 'A/C' or word.isalpha()]
    #lemmatization
    tokens = [wordnet_lemmatizer.lemmatize(word) for word in tokens]
    #remove the stopwords; compare in lower case so that sentence-initial
    #"The"/"A"/"In" are dropped too (stop_words is all lower case)
    tokens = [word for word in tokens if word.lower() not in stop_words]
    #remove numbers. The previous isfloat(word == False) always evaluated
    #float(False) and therefore never removed anything. A digit is required so
    #that alphabetic words float() happens to accept ("nan", "inf", "infinity"
    #as in "infinity pool") are not dropped.
    tokens = [word for word in tokens
              if not (any(ch.isdigit() for ch in word) and isfloat(word))]
    #generate bigrams (no local named `bigrams`, to avoid shadowing the nltk import)
    token_pairs = list(nltk.bigrams(tokens))
    return tokens, token_pairs

### Train a sentiment classifier

In [134]:
import nltk.classify.util
from nltk.classify import NaiveBayesClassifier
from nltk.corpus import movie_reviews
 
def word_feats(words):
    """Build a bag-of-words feature dict mapping every word in ``words`` to True."""
    return dict.fromkeys(words, True)
 
# File ids of the movie_reviews corpus, split by sentiment label.
negids, posids = (movie_reviews.fileids(label) for label in ('neg', 'pos'))

# One (bag-of-words features, label) pair per review file.
negfeats = [(word_feats(movie_reviews.words(fileids=[fileid])), 'neg') for fileid in negids]
posfeats = [(word_feats(movie_reviews.words(fileids=[fileid])), 'pos') for fileid in posids]

# Use the first three quarters of each class for training, the rest for testing.
negcutoff = len(negfeats) * 3 // 4
poscutoff = len(posfeats) * 3 // 4

#print(negcutoff)
trainfeats = [*negfeats[:negcutoff], *posfeats[:poscutoff]]
testfeats = [*negfeats[negcutoff:], *posfeats[poscutoff:]]
#print('train on %d instances, test on %d instances' % (len(trainfeats), len(testfeats)))

# Fit a Naive Bayes model on the training split.
classifier = NaiveBayesClassifier.train(trainfeats)
#print('accuracy:', nltk.classify.util.accuracy(classifier, testfeats))
#classifier.show_most_informative_features()

In [203]:
import pickle
# Persist the trained classifier to disk. The context manager guarantees the
# file is closed even if pickling fails part-way through.
with open("naivebayes.pickle", "wb") as save_classifier:
    pickle.dump(classifier, save_classifier)

In [204]:
# Reload the classifier saved to naivebayes.pickle.
# NOTE: pickle.load can execute arbitrary code from the file; only load a
# pickle that this notebook (or another trusted source) produced.
with open("naivebayes.pickle", "rb") as classifier_f:
    classifier = pickle.load(classifier_f)

In [214]:
def senti(bigram):
    """Return True if the classifier labels the given text 'pos'.

    Parameters
    ----------
    bigram : str
        Whitespace-separated word(s); despite the name, callers may pass a
        single word. The text is split on whitespace and turned into
        bag-of-words features via ``word_feats``.

    Returns
    -------
    bool
        True when ``classifier.classify`` returns 'pos', False otherwise.
    """
    return classifier.classify(word_feats(bigram.split())) == 'pos'
# Quick sanity check (was `sent('good')`, an undefined name).
senti('good')

False

In [216]:
amenity_list = ['A/C','fan','parking','coffee','breakfast','pool','clean',
             'kitchen','internet','computer','recreation','exercise',
             'vending','locker','dryer','laundry']
def amenity_extraction_hotel(hotel):
    """Score the amenities mentioned in one hotel's reviews.

    Every token that matches an entry of ``amenity_list`` (case-insensitively)
    is scored by the sentiment of the token immediately before it: +1 when
    ``senti`` says positive, -1 otherwise. An amenity that opens a review has
    no preceding token and only gets registered with a score of 0.

    Parameters
    ----------
    hotel : str
        Value of the ``name`` column selecting the hotel's reviews in ``df``.

    Returns
    -------
    dict[str, int]
        Net score per amenity, keyed by the token as it appeared in the text
        (so 'Parking' and 'parking' are separate keys).
    """
    # Lower-cased lookup set, built once instead of scanning the list per token.
    known_amenities = {amenity.lower() for amenity in amenity_list}
    amenities = {}
    for text in df[df['name'] == hotel]['reviews_text']:
        # Skip missing reviews (e.g. NaN); the tokenizer only accepts strings.
        if not isinstance(text, str):
            continue
        tokens, _bigrams = process_text(text)
        for position, word in enumerate(tokens):
            if word.lower() not in known_amenities:
                continue
            amenities.setdefault(word, 0)
            if position == 0:
                # No preceding word to judge the amenity by.
                continue
            # Score this occurrence exactly once, by the word right before it.
            # The previous code compared the bound methods `.lower` instead of
            # calling them, and rescanned every bigram for every token.
            if senti(tokens[position - 1]):
                amenities[word] += 1
            else:
                amenities[word] -= 1
    return amenities

In [212]:
#amenity_extraction_hotel(df['name'][4])

is clean
False
try breakfast
True
for coffee
True
item vending
False
and clean
True
continental breakfast
False
no A/C
False
have fan
False
near pool
True


{'clean': 0,
 'Parking': 0,
 'breakfast': 0,
 'coffee': 1,
 'vending': -1,
 'A/C': -1,
 'fan': -1,
 'pool': 1}

### Extract amenities for all the hotels

In [217]:
# Map every distinct hotel name to its amenity-score dict.
hotel_amenities = dict()
for hotel in df['name'].unique():
    hotel_amenities[hotel] = amenities = amenity_extraction_hotel(hotel)

## 3. More: Negation Detection with negspacy

In [131]:
#!pip install negspacy
import spacy
from negspacy.negation import Negex

# Load the small English spaCy pipeline and append negspacy's Negex component.
nlp = spacy.load("en_core_web_sm")
# ent_types takes a list of separate label strings. The previous value
# ["NOUN','PRODUCT"] was a single malformed string because of mismatched quotes.
# NOTE(review): "NOUN" looks like a part-of-speech tag rather than an entity
# label - confirm which labels Negex should be restricted to.
negex = Negex(nlp, ent_types=["NOUN", "PRODUCT"])
nlp.add_pipe(negex, last=True)

# Print each detected entity together with its negation flag.
doc = nlp("Like Carmel, no air-conditioner in rooms but they have a fan for air circulation. Nice pool, nice food, bad car service.")
for e in doc.ents:
    print(e.text, e._.negex)

Carmel False
