In [2]:
import collections
import itertools
import json
import os
import pickle
import random
import time

import numpy as np
import pandas as pd
import requests
import spacy
from nltk.stem.lancaster import LancasterStemmer
from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize import sent_tokenize
from sklearn.decomposition import NMF, TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# 0. Pulling data
# The function to retrieve 50 reviews from a single hotel

In [2]:
# read hotel reviews via API
# hotelId should be a text string
def get_hotel_review(hotelId, timeout=30):
    """Fetch the reviews of one hotel, newest first, from the hotels4 RapidAPI service.

    Parameters
    ----------
    hotelId : str
        Hotel identifier, sent as the ``hotelId`` query parameter.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 30).

    Returns
    -------
    pandas.DataFrame
        One row per entry of the ``items`` lists found in the review groups
        of the response.

    Raises
    ------
    requests.HTTPError
        If the service answers with a 4xx/5xx status.
    KeyError
        If the response body does not have the expected nested layout.
    """
    url = "https://hotels4.p.rapidapi.com/reviews/v2/list"

    querystring = {"hotelId": hotelId, "reviewOrder": "date_newest_first", "tripTypeFilter": "all"}

    headers = {
        'x-rapidapi-host': "hotels4.p.rapidapi.com",
        # Take the key from the environment when it is set; otherwise keep the
        # placeholder so the notebook behaves as before.
        'x-rapidapi-key': os.environ.get("RAPIDAPI_KEY", "YOUR_KEY"),
    }

    # Without a timeout a stalled connection would block the notebook forever.
    response = requests.get(url, headers=headers, params=querystring, timeout=timeout)

    # Fail here with the HTTP status instead of a KeyError further down.
    response.raise_for_status()

    data = response.json()

    # turn the json response into a dataframe
    groups = data['data']['reviews']['body']['reviewContent']['reviews']['hermes']['groups']
    df = pd.json_normalize(groups, record_path=['items'])

    return df

In [5]:
df = get_hotel_review('292776')

In [74]:
#df.description

# The function to get a list of hotels with a keyword

In [8]:
# search hotel by keyword, the "query" argument

def get_hotels(keyword, timeout=30):
    """Search the hotels4 RapidAPI location endpoint and return the hotel matches.

    Parameters
    ----------
    keyword : str
        Free-text search term, sent as the ``query`` parameter.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 30).

    Returns
    -------
    pandas.DataFrame
        The suggested entities whose ``group`` is ``'HOTEL_GROUP'``.

    Raises
    ------
    requests.HTTPError
        If the service answers with a 4xx/5xx status.
    KeyError
        If the response body has no ``suggestions`` entry.
    """
    url = "https://hotels4.p.rapidapi.com/locations/v2/search"

    querystring = {"query": keyword, "locale": "en_US", "currency": "USD"}

    headers = {
        'x-rapidapi-host': "hotels4.p.rapidapi.com",
        # Take the key from the environment when it is set; otherwise keep the
        # placeholder so the notebook behaves as before.
        'x-rapidapi-key': os.environ.get("RAPIDAPI_KEY", "YOUR_KEY"),
    }

    # Without a timeout a stalled connection would block the notebook forever.
    response = requests.get(url, headers=headers, params=querystring, timeout=timeout)

    # Fail here with the HTTP status instead of a KeyError further down.
    response.raise_for_status()

    data = response.json()

    # Flatten every suggestion group into one row per entity, keeping the
    # group name so the hotel entries can be selected.
    df_loc = pd.json_normalize(data['suggestions'],
                               record_path=['entities'],
                               meta=['group'])

    return df_loc[df_loc['group'] == 'HOTEL_GROUP']

In [67]:
df_loc = get_hotels('times square')

In [72]:
df_loc[['name', 'destinationId']]

Unnamed: 0,name,destinationId
2,Times Square Home,1068236352
3,TBA Times Square,1478700992
4,Sheraton New York Times Square Hotel,105816
5,The Times Square EDITION,962253664
6,The Westin New York at Times Square,226121
7,W New York - Times Square,199889
8,Moxy NYC Times Square,574581888
9,Pod Times Square,630803872
10,Cassa Times Square Hotel,287688288
11,M Social Hotel Times Square New York,114036


In [80]:
for name, destinationId in zip(df_loc.name, df_loc.destinationId):
    print(name, destinationId)

Times Square Home 1068236352
TBA Times Square 1478700992
Sheraton New York Times Square Hotel 105816
The Times Square EDITION 962253664
The Westin New York at Times Square 226121
W New York - Times Square 199889
Moxy NYC Times Square 574581888
Pod Times Square 630803872
Cassa Times Square Hotel 287688288
M Social Hotel Times Square New York 114036
Hampton Inn Times Square North 163790
The Hotel @ Times Square 292800


# Pipeline - getting the latest 50 reviews from 50 hotels around Times Square

In [13]:
# read the list of hotels and their IDs
hotels = pd.read_csv('hotels.csv', dtype='object')

In [18]:
hotels.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Hotel   50 non-null     object
 1   ID      50 non-null     object
dtypes: object(2)
memory usage: 928.0+ bytes


In [19]:
keyword = ['times square']

raw_review = []

for area in keyword:
    # Alternative hotel source (disabled): look the hotels up by keyword
    # instead of reading them from the `hotels` table.
    #df_id = get_hotels(area)
    #for name, destinationId in zip(df_id.name, df_id.destinationId):
    for name, destinationId in zip(hotels['Hotel'], hotels['ID']):
        print('Getting review from ', name, ' with ID: ', destinationId)
        hotel_reviews = get_hotel_review(destinationId)
        # tag every review row with the hotel it belongs to
        hotel_reviews['hotel'] = name
        raw_review.append(hotel_reviews[['hotel', 'description']])
        # pause 1-3 seconds (randomised) between two requests
        time.sleep(1 + 2 * random.random())

print('Process done.')

Getting review from  Room Mate Grace Boutique Hotel  with ID:  225936
Getting review from  Hotel St. James  with ID:  217460
Getting review from  The Bryant Park Hotel  with ID:  192073
Getting review from  Millennium Hotel Broadway Times Square  with ID:  150387
Getting review from  LUMA Hotel Times Square  with ID:  474171296
Getting review from  Executive Hotel Le Soleil New York  with ID:  513814
Getting review from  Radio City Apartments  with ID:  444302
Getting review from  Artel Hotel Times Square  with ID:  867133152
Getting review from  Arlo Midtown  with ID:  1558419520
Getting review from  The Gotham Hotel  with ID:  367406
Getting review from  Hotel Mela Times Square  with ID:  244926
Getting review from  Hotel Riu Plaza New York Times Square  with ID:  536247
Getting review from  City Club Hotel  with ID:  198706
Getting review from  Club Quarters, Times Square - Midtown  with ID:  163809
Getting review from  Park Terrace Hotel on Bryant Park  with ID:  541480064
Getting 

In [3]:
# pickle the result
# The quoted block below is the dump step, disabled by wrapping it in a string
# literal: the string is evaluated and discarded, so nothing is written.
'''
with open('raw_review', 'wb') as to_write:
    pickle.dump(raw_review, to_write)
'''

# Reload the previously saved list of per-hotel review frames.
# NOTE(review): pickle.load can execute arbitrary code contained in the file;
# only load a 'raw_review' file that you produced yourself.
with open('raw_review', "rb") as to_read:
    raw_review = pickle.load(to_read)


In [7]:
hotel_review = pd.concat(raw_review, axis=0).reset_index(drop=True)
review = hotel_review.description

In [9]:
# each row is 1 review
review

0       The hotel is great and so were the staff, only...
1       Had to call for service with television and li...
2       •Cute & clean rooms!\n•Very unique wallpaper o...
3                                                        
4       The hotel was the perfect location for walking...
                              ...                        
2492    The TV was nothing but static. I finally figur...
2493    Great hotel. Location close to Times Square. W...
2494    The location was perfect right off Time Square...
2495    Great!  Being NYC I was surprised how large th...
2496    I stayed for three nights while attending a co...
Name: description, Length: 2497, dtype: object

In [10]:
# Split every review into sentences and collect them in one flat list
# (one entry per sentence), keeping only sentences longer than 3 characters.
sentences = [
    sentence
    for text in review
    for sentence in sent_tokenize(text)
    if len(sentence) > 3
]

In [11]:
print('# of sentences: ', len(sentences))

# of sentences:  8391


In [12]:
sentences[:3]

['The hotel is great and so were the staff, only thing missing from this great establishment is a microwave that the guest can use!',
 'Had to call for service with television and lighting \nI expected much more when I asked for parking validation unable to accommodate due to only working with Meyers parking garage.',
 '•Cute & clean rooms!']

# 1. Vectorizer with stemmer
## 1.1 Count Vectorizer

In [13]:
# Apply Stemmer on Vectorizer
#stemmer = LancasterStemmer()
stemmer = SnowballStemmer(language='english')

# Create a Stemmed vectorizer
class StemmedCountVectorizer(CountVectorizer):
    """CountVectorizer whose analyzer stems every token it produces."""

    def build_analyzer(self):
        """Return a callable that maps a document to its list of stemmed tokens.

        Tokens come from the parent class's analyzer; each one is then passed
        through the module-level ``stemmer``.
        """
        tokenize = super().build_analyzer()

        def stemmed_tokens(doc):
            return [stemmer.stem(token) for token in tokenize(doc)]

        return stemmed_tokens

vectorizer_s = StemmedCountVectorizer(min_df=3, analyzer="word", stop_words='english', max_df=0.8)
X = vectorizer_s.fit_transform(sentences)

# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# when the installed version has it and fall back to the old method otherwise.
if hasattr(vectorizer_s, 'get_feature_names_out'):
    feature_names = vectorizer_s.get_feature_names_out()
else:
    feature_names = vectorizer_s.get_feature_names()

# dense document-term counts: one row per sentence, one column per stemmed term
cv_matrix = pd.DataFrame(X.toarray(), columns=feature_names)
cv_matrix

Unnamed: 0,00,10,100,11,11th,12,14,15,15th,17,...,ya,yard,year,yellow,yes,yogurt,york,young,zero,água
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8386,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8387,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8388,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
8389,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## 1.2 Tf-idf Vectorizer

In [14]:
# Apply Stemmer on Vectorizer
#stemmer = LancasterStemmer()
stemmer = SnowballStemmer(language='english')

# Create a Stemmed vectorizer
class StemmedTfidfVectorizer(TfidfVectorizer):
    """TfidfVectorizer whose analyzer stems every token it produces."""

    def build_analyzer(self):
        """Return a callable that maps a document to its list of stemmed tokens.

        Tokens come from the parent class's analyzer; each one is then passed
        through the module-level ``stemmer``.
        """
        tokenize = super().build_analyzer()

        def stemmed_tokens(doc):
            return [stemmer.stem(token) for token in tokenize(doc)]

        return stemmed_tokens

vectorizer_s = StemmedTfidfVectorizer(min_df=3, analyzer="word", stop_words='english', max_df=0.8)
X = vectorizer_s.fit_transform(sentences)

# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# when the installed version has it and fall back to the old method otherwise.
if hasattr(vectorizer_s, 'get_feature_names_out'):
    feature_names = vectorizer_s.get_feature_names_out()
else:
    feature_names = vectorizer_s.get_feature_names()

# dense tf-idf weights: one row per sentence, one column per stemmed term
tfidf_matrix = pd.DataFrame(X.toarray(), columns=feature_names)
tfidf_matrix

Unnamed: 0,00,10,100,11,11th,12,14,15,15th,17,...,ya,yard,year,yellow,yes,yogurt,york,young,zero,água
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0000,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0000,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0000,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0000,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0000,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8386,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0000,0.0,0.0,0.0,0.0
8387,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0000,0.0,0.0,0.0,0.0
8388,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.4125,0.0,0.0,0.0,0.0
8389,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0000,0.0,0.0,0.0,0.0


# 2. Topic Modeling
## 2.1 NMF - Non-Negative Matrix Factorization with Count Vectorizer

In [15]:
topics = 4

# Pass an explicit seed so repeated runs give the same factorization: later
# cells and the recommender refer to the topics by their index.
nmf = NMF(n_components=topics, random_state=42)
doc_topic = nmf.fit_transform(cv_matrix)



## The function to return the top n words of an algorithm

In [16]:
# Function to return top n words of a topic
def get_top_terms(topic, n_terms, algo, terms):
    """Print the highest-weighted terms of one topic of a fitted decomposition.

    Parameters
    ----------
    topic : int
        Row index into ``algo.components_``.
    n_terms : int
        How many terms to show; zero or a negative value prints an empty list.
    algo : object
        Fitted model exposing ``components_`` (one row of term weights per topic).
    terms : sequence
        Term labels, indexable by the column positions of ``components_``.

    Returns
    -------
    None
        The terms are only printed, strongest first.
    """
    # term weights of the requested topic
    weights = algo.components_[topic]

    if n_terms > 0:
        # argsort is ascending, so reverse it and keep the first n_terms positions
        top_indices = np.argsort(weights)[::-1][:n_terms]
    else:
        # guard: a slice such as [-0:] would select every term instead of none
        top_indices = []

    # use the `terms` sequence to get the actual top terms
    top_words = [terms[i] for i in top_indices]
    print('Topic ', topic, ':')
    print(top_words)
    return None

In [17]:
# Take the term labels from the matrix the model was fitted on: `vectorizer_s`
# has been re-assigned to the tf-idf vectorizer by the time this cell runs.
for n in range(topics):
    get_top_terms(n, 20, nmf, cv_matrix.columns)

Topic  0 :
['room', 'clean', 'small', 'bed', 'comfort', 'nice', 'bathroom', 'like', 'check', 'didn', 'view', 'work', 'need', 'size', 'day', 'stay', 'did', 'tini', 'night', 'door']
Topic  1 :
['locat', 'great', 'time', 'squar', 'walk', 'close', 'perfect', 'conveni', 'excel', 'good', 'park', 'clean', 'central', 'place', 'servic', 'broadway', 'properti', 'subway', 'restaur', 'near']
Topic  2 :
['hotel', 'stay', 'time', 'night', 'squar', 'definit', 'nyc', 'nice', 'love', 'good', 'recommend', 'new', 'just', 'clean', 'book', 'day', 'place', 'like', 'area', 'close']
Topic  3 :
['staff', 'friend', 'help', 'clean', 'nice', 'desk', 'super', 'check', 'accommod', 'extrem', 'excel', 'welcom', 'great', 'kind', 'amaz', 'profession', 'breakfast', 'pleasant', 'wonder', 'courteous']


* Topic 0: Room/size/bathroom/view
* Topic 1: Location/Times Square/convenience
* Topic 2: Hotel/Overall experience/Times Square
* Topic 3: Staff/friendly/front desk/breakfast

In [18]:
# classified each sentence with a topic based on the highest score
classified_topic = np.argmax(nmf.transform(cv_matrix), axis=1)
classified_topic

array([1, 1, 0, ..., 3, 3, 2])

In [19]:
collections.Counter(classified_topic)

Counter({1: 1793, 0: 2579, 2: 2591, 3: 1428})

In [20]:
nmf.transform(cv_matrix)

array([[0.        , 0.17966858, 0.15985054, 0.14945153],
       [0.01471315, 0.02001792, 0.01848832, 0.01000658],
       [0.14079954, 0.0013117 , 0.        , 0.01618069],
       ...,
       [0.00237684, 0.00281849, 0.0034599 , 0.00358461],
       [0.        , 0.        , 0.        , 0.05384601],
       [0.00200505, 0.        , 0.05852906, 0.        ]])

In [21]:
np_sentences = np.array(sentences)

In [22]:
print(sum(classified_topic == 0))
np_sentences[classified_topic == 0][:100]

2579


array(['•Cute & clean rooms!',
       '•Very unique wallpaper on each floor & in each room!',
       'Be prepared to get stuck in one of their old elevators and have the NYFD rescue you after being trapped.',
       'Comfortable bed.',
       'The only problem is that the bed was on a high platform with open wedge that you can sit on.',
       'But I had to climb up to get into bed.',
       'the closet door handles are about 1/8 big!',
       'very hard to grasp to open the doors.',
       'One hint:  skip the $15 breakfast and go next door to the Pub next door.',
       'Fun room.', 'Love Room Mate,  close to Broadway, great price.',
       'Room was well decorated with a fantastic shower.',
       'Room was larger than it looks in picture.',
       'Small space used very efficiently.',
       'But it was nicely decorated and the bed was comfortable and everything was clean and organized.',
       'The room was clean and very conveniently located.',
       'Requested bath towels and 

In [23]:
print(sum(classified_topic == 1))
np_sentences[classified_topic == 1][:100]

1793


array(['The hotel is great and so were the staff, only thing missing from this great establishment is a microwave that the guest can use!',
       'Had to call for service with television and lighting \nI expected much more when I asked for parking validation unable to accommodate due to only working with Meyers parking garage.',
       '•Right next to an Irish restaurant & bar!',
       '•You step outside and Time Square is just a half black away!',
       '(Great Location!)',
       'The hotel was the perfect location for walking around times square and it was right near all of the theaters for broadway.',
       'Great atmosphere.', 'Not easy with 2 knee replacements.',
       'Very great location.',
       'The location is awesome, so close to everything if your seeing a show.',
       'The location was AMAZING!!!', 'Great location.',
       'Just steps away from Times Square.', 'Breakfast was great.',
       'The location is perfect for broadway shows.',
       'Pool and bar close

In [24]:
print(sum(classified_topic == 2))
np_sentences[classified_topic == 2][:100]

2591


array(['It was a cute little hotel and it was perfect for me and my two friends.',
       'The only issue we had was the shower did not have much hot water but for the most part ee enjoyed our stay!',
       "This was an excellent hotel when you're looking for a comfortable, clean and convenient Manhattan hotel.",
       'We have found our "go-to" hotel for our frequent trips into New York City.',
       'Loved the bunk/cubbies.',
       'we loved the hotel everything was great except the shower didn’t have any hot water nor any pressure',
       'This is the only place I stay in NYC !!', 'Highly recommend!',
       'Room size was tiny but that is the norm when you stay at a Times Square boutique hotel.',
       'They have a partnership with a parking lot a block and a half away,but you have to pay.',
       'So it was inconvenient to have to lug all your luggage from the parking lot.',
       'This is a wonderful European style hotel.',
       'I will definitely go back.', 'We had a v

In [25]:
print(sum(classified_topic == 3))
np_sentences[classified_topic == 3][:100]

1428


array(['Amazing very friendly front desk', 'Very rude staff.',
       'very friendly staff.', 'Awesome breakfast.',
       'Very convenient to Broadway, nice vibe, funky motif',
       'The employees were very friendly and helpful.', 'Friendly staff.',
       'Staff was very helpful\nand welcoming.',
       'Excellent staff and service',
       'Staff was friendly and they went out thier way to make sure we were ok. Only drawback was the kack of parking.',
       'It was amazing', 'Staff were always helpful and friendly.',
       'Thank you.', 'Great location and friendly staff',
       'The bathroom is spacious with nice amenities.',
       'And the staff I interacted with were lovely.',
       'My friend and I were in NY for a concert.',
       'Very clean and tidy.', 'Staff friendly, decor inviting.',
       'Staff was over the top helpful.',
       'James was extremely helpful with all our needs.',
       'We received immediate responses anytime we called the front desk for informa

## 2.2 NMF - Non-Negative Matrix Factorization with Tf-idf Vectorizer

In [26]:
topics = 4

# Pass an explicit seed so repeated runs give the same factorization and the
# topic numbering used in the notes below stays valid.
nmf_tfidf = NMF(n_components=topics, random_state=42)
doc_topic = nmf_tfidf.fit_transform(tfidf_matrix)



In [27]:
# Take the term labels from the matrix the model was fitted on; this also
# avoids get_feature_names(), which was removed in scikit-learn 1.2.
for n in range(topics):
    get_top_terms(n, 20, nmf_tfidf, tfidf_matrix.columns)

Topic  0 :
['great', 'locat', 'hotel', 'excel', 'perfect', 'good', 'squar', 'time', 'conveni', 'servic', 'close', 'staff', 'properti', 'love', 'amaz', 'walk', 'central', 'place', 'restaur', 'view']
Topic  1 :
['staff', 'friend', 'help', 'nice', 'super', 'desk', 'accommod', 'welcom', 'excel', 'extrem', 'profession', 'amaz', 'wonder', 'kind', 'pleasant', 'attent', 'check', 'courteous', 'hotel', 'awesom']
Topic  2 :
['clean', 'room', 'nice', 'small', 'hotel', 'comfort', 'bed', 'bathroom', 'modern', 'spacious', 'good', 'quiet', 'view', 'tini', 'need', 'servic', 'size', 'like', 'love', 'check']
Topic  3 :
['stay', 'definit', 'hotel', 'recommend', 'night', 'time', 'enjoy', 'place', 'good', 'love', 'overal', 'nyc', 'high', 'squar', 'realli', 've', 'nice', 'day', 'perfect', 'new']


* Topic 0: Location, Times Square, convenience, services, staff, property
* Topic 1: Staff, front desk
* Topic 2: Room, room size, bathroom, amenities, stay
* Topic 3: Stay, overall experience

### It generates pretty much the same result as NMF + count vectorizer.

In [28]:
# classified each sentence with a topic based on the highest score
classified_topic = np.argmax(nmf_tfidf.transform(tfidf_matrix), axis=1)
classified_topic

array([0, 2, 2, ..., 2, 1, 3])

In [29]:
collections.Counter(classified_topic)

Counter({0: 1537, 2: 4017, 3: 1753, 1: 1084})

In [30]:
print(sum(classified_topic == 0))
np_sentences[classified_topic == 0][:10]

1537


array(['The hotel is great and so were the staff, only thing missing from this great establishment is a microwave that the guest can use!',
       '•Right next to an Irish restaurant & bar!', '(Great Location!)',
       'The hotel was the perfect location for walking around times square and it was right near all of the theaters for broadway.',
       'Great atmosphere.', 'Very great location.',
       'The location is awesome, so close to everything if your seeing a show.',
       'Love Room Mate,  close to Broadway, great price.',
       'we loved the hotel everything was great except the shower didn’t have any hot water nor any pressure',
       'The location was AMAZING!!!'], dtype='<U590')

In [31]:
print(sum(classified_topic == 1))
np_sentences[classified_topic == 1][:10]

1084


array(['It was a cute little hotel and it was perfect for me and my two friends.',
       'Amazing very friendly front desk', 'Very rude staff.',
       'very friendly staff.', 'Awesome breakfast.',
       'The employees were very friendly and helpful.', 'Friendly staff.',
       'Staff was very helpful\nand welcoming.',
       'Excellent staff and service',
       'Staff was friendly and they went out thier way to make sure we were ok. Only drawback was the kack of parking.'],
      dtype='<U590')

In [32]:
print(sum(classified_topic == 2))
np_sentences[classified_topic == 2][:10]

4017


array(['Had to call for service with television and lighting \nI expected much more when I asked for parking validation unable to accommodate due to only working with Meyers parking garage.',
       '•Cute & clean rooms!',
       '•Very unique wallpaper on each floor & in each room!',
       'Be prepared to get stuck in one of their old elevators and have the NYFD rescue you after being trapped.',
       "This was an excellent hotel when you're looking for a comfortable, clean and convenient Manhattan hotel.",
       'Comfortable bed.',
       'The only problem is that the bed was on a high platform with open wedge that you can sit on.',
       'But I had to climb up to get into bed.',
       'Not easy with 2 knee replacements.',
       'the closet door handles are about 1/8 big!'], dtype='<U590')

In [33]:
print(sum(classified_topic == 3))
np_sentences[classified_topic == 3][:10]

1753


array(['•You step outside and Time Square is just a half black away!',
       'The only issue we had was the shower did not have much hot water but for the most part ee enjoyed our stay!',
       'We have found our "go-to" hotel for our frequent trips into New York City.',
       'Loved the bunk/cubbies.',
       'This is the only place I stay in NYC !!', 'Highly recommend!',
       'Just steps away from Times Square.',
       'Room size was tiny but that is the norm when you stay at a Times Square boutique hotel.',
       'This is a wonderful European style hotel.',
       'I will definitely go back.'], dtype='<U590')

## 2.3 LSA - Latent Semantic Analysis with Tf-idf Vectorizer

In [34]:
n_topics = 3
lsa = TruncatedSVD(n_topics)
lsa.fit(tfidf_matrix)

TruncatedSVD(n_components=3)

In [35]:
# Take the term labels from the matrix the model was fitted on; this also
# avoids get_feature_names(), which was removed in scikit-learn 1.2.
for n in range(n_topics):
    get_top_terms(n, 15, lsa, tfidf_matrix.columns)

Topic  0 :
['great', 'locat', 'staff', 'hotel', 'friend', 'clean', 'room', 'stay', 'nice', 'help', 'good', 'excel', 'time', 'perfect', 'squar']
Topic  1 :
['staff', 'friend', 'help', 'clean', 'room', 'nice', 'small', 'super', 'desk', 'extrem', 'comfort', 'welcom', 'accommod', 'bed', 'profession']
Topic  2 :
['room', 'clean', 'stay', 'hotel', 'small', 'nice', 'definit', 'comfort', 'bed', 'good', 'night', 'bathroom', 'time', 'place', 'love']


* Topic 0: Staff/location/room/Times Square
* Topic 1: Staff/room
* Topic 2: Room/stay

In my opinion, the result is inferior to that of NMF.

## 2.4 LSA with Count Vectorizer

In [36]:
n_topics = 3
lsa = TruncatedSVD(n_topics)
lsa.fit(cv_matrix)

TruncatedSVD(n_components=3)

In [37]:
# Take the term labels from the count matrix this model was fitted on:
# `vectorizer_s` currently refers to the tf-idf vectorizer.
for n in range(n_topics):
    get_top_terms(n, 15, lsa, cv_matrix.columns)

Topic  0 :
['room', 'hotel', 'clean', 'locat', 'great', 'staff', 'stay', 'small', 'time', 'nice', 'bed', 'friend', 'comfort', 'squar', 'check']
Topic  1 :
['locat', 'hotel', 'great', 'staff', 'time', 'squar', 'friend', 'stay', 'help', 'excel', 'close', 'walk', 'good', 'perfect', 'conveni']
Topic  2 :
['hotel', 'stay', 'time', 'squar', 'night', 'definit', 'nyc', 'new', 'recommend', 'book', 'love', 'just', 'walk', 'day', 'close']


* Topic 0: Room/location/Times Square/staff
* Topic 1: Location/staff
* Topic 2: Location/Times Square

LSA is definitely not working well for this particular application.
## After comparing the different combinations, I think NMF + Count Vectorizer is the best solution for our problem.

# 3. Sentiment Analysis on each sentence

In [38]:
analyzer = SentimentIntensityAnalyzer()

In [39]:
score = [analyzer.polarity_scores(x) for x in sentences]

In [40]:
hotel_review

Unnamed: 0,hotel,description
0,Room Mate Grace Boutique Hotel,"The hotel is great and so were the staff, only..."
1,Room Mate Grace Boutique Hotel,Had to call for service with television and li...
2,Room Mate Grace Boutique Hotel,•Cute & clean rooms!\n•Very unique wallpaper o...
3,Room Mate Grace Boutique Hotel,
4,Room Mate Grace Boutique Hotel,The hotel was the perfect location for walking...
...,...,...
2492,Hotel 46 Times Square,The TV was nothing but static. I finally figur...
2493,Hotel 46 Times Square,Great hotel. Location close to Times Square. W...
2494,Hotel 46 Times Square,The location was perfect right off Time Square...
2495,Hotel 46 Times Square,Great! Being NYC I was surprised how large th...


In [41]:
# break all reviews to sentences
hotel_sentences = [(hotel, sent_tokenize(text)) for hotel, text in zip(hotel_review.hotel, hotel_review.description)]

In [42]:
# Create a df with one row per (hotel, sentence).
# explode() turns each list of sentences into rows directly, without building
# a wide intermediate frame; reviews with no sentence yield a NaN row, which
# is dropped explicitly.
hotel_sentences = (
    pd.DataFrame(hotel_sentences, columns=['hotel', 'review'])
    .explode('review')
    .dropna(subset=['review'])
    .reset_index(drop=True)
)

In [43]:
# keep only sentences longer than 3 characters
# reset_index() returns a new frame, so columns assigned later do not write
# into a filtered slice of the previous one.
hotel_sentences = hotel_sentences[hotel_sentences['review'].str.len() > 3].reset_index(drop=True)

In [44]:
# merge the sentiment analysis to the hotel sentences df
hotel_sentences['score'] = [x['compound'] for x in score]
hotel_sentences.head()

Unnamed: 0,hotel,review,score
0,Room Mate Grace Boutique Hotel,"The hotel is great and so were the staff, only...",0.807
1,Room Mate Grace Boutique Hotel,Had to call for service with television and li...,0.0
2,Room Mate Grace Boutique Hotel,•Cute & clean rooms!,0.4574
3,Room Mate Grace Boutique Hotel,•Very unique wallpaper on each floor & in each...,0.0
4,Room Mate Grace Boutique Hotel,•Right next to an Irish restaurant & bar!,0.0


# 4. Recommender

In [45]:
# Merge the topic scores
topic_score = nmf.transform(cv_matrix)
topic_score = pd.DataFrame(topic_score, columns=['topic_0', 'topic_1', 'topic_2', 'topic_3'])
hotel_sentences = pd.concat([hotel_sentences, topic_score], axis=1)

In [46]:
hotel_sentences.head(10)

Unnamed: 0,hotel,review,score,topic_0,topic_1,topic_2,topic_3
0,Room Mate Grace Boutique Hotel,"The hotel is great and so were the staff, only...",0.807,0.0,0.179669,0.159851,0.149452
1,Room Mate Grace Boutique Hotel,Had to call for service with television and li...,0.0,0.014713,0.020018,0.018488,0.010007
2,Room Mate Grace Boutique Hotel,•Cute & clean rooms!,0.4574,0.1408,0.001312,0.0,0.016181
3,Room Mate Grace Boutique Hotel,•Very unique wallpaper on each floor & in each...,0.0,0.120942,0.0,0.0,0.0
4,Room Mate Grace Boutique Hotel,•Right next to an Irish restaurant & bar!,0.0,0.001426,0.009722,0.008341,0.001243
5,Room Mate Grace Boutique Hotel,•You step outside and Time Square is just a ha...,0.0,0.004105,0.061171,0.038846,0.0
6,Room Mate Grace Boutique Hotel,(Great Location!),0.6588,0.0,0.214358,0.0,0.0
7,Room Mate Grace Boutique Hotel,The hotel was the perfect location for walking...,0.5719,0.0,0.205501,0.19629,0.0
8,Room Mate Grace Boutique Hotel,It was a cute little hotel and it was perfect ...,0.8689,0.0,0.0,0.170198,0.076121
9,Room Mate Grace Boutique Hotel,The only issue we had was the shower did not h...,0.7301,0.015265,0.0,0.066044,0.0


In [75]:
# Merge the classified topic for each sentence
classified_topic = np.argmax(nmf.transform(cv_matrix), axis=1)
hotel_sentences['classified_topic'] = classified_topic
hotel_sentences

Unnamed: 0,hotel,review,score,topic_0,topic_1,topic_2,topic_3,classified_topic,topic_sum
0,Room Mate Grace Boutique Hotel,"The hotel is great and so were the staff, only...",0.8070,0.000000,0.179669,0.159851,0.149452,1,0.488971
1,Room Mate Grace Boutique Hotel,Had to call for service with television and li...,0.0000,0.014713,0.020018,0.018488,0.010007,1,0.063226
2,Room Mate Grace Boutique Hotel,•Cute & clean rooms!,0.4574,0.140800,0.001312,0.000000,0.016181,0,0.158292
3,Room Mate Grace Boutique Hotel,•Very unique wallpaper on each floor & in each...,0.0000,0.120942,0.000000,0.000000,0.000000,0,0.120942
4,Room Mate Grace Boutique Hotel,•Right next to an Irish restaurant & bar!,0.0000,0.001426,0.009722,0.008341,0.001243,1,0.020731
...,...,...,...,...,...,...,...,...,...
8386,Hotel 46 Times Square,"The room included a TV with HBO, a safe in the...",0.4404,0.121487,0.000000,0.000000,0.000000,0,0.121487
8387,Hotel 46 Times Square,No refrigerator or microwave.,-0.2960,0.001343,0.000150,0.000414,0.000000,0,0.001907
8388,Hotel 46 Times Square,"The breakfast was basic (toast, bagels, yogurt...",0.0000,0.002377,0.002818,0.003460,0.003585,3,0.012240
8389,Hotel 46 Times Square,The workers were gracious and helpful.,0.7506,0.000000,0.000000,0.000000,0.053846,3,0.053846


Recall:
* Topic 0: Room/size/bathroom/view
* Topic 1: Location/Times Square/convenience
* Topic 2: Hotel impression/Overall experience/Times Square
* Topic 3: Staff/friendly/front desk/breakfast

In [76]:
# Sanity check on the topics, looking at the reviews of top 10 scores for each topic
hotel_sentences.sort_values('topic_3', ascending=False).head(10)

Unnamed: 0,hotel,review,score,topic_0,topic_1,topic_2,topic_3,classified_topic,topic_sum
5876,The Time New York,We will come back because everyone on the staf...,0.7424,0.016113,0.0,0.001516,0.475644,3,0.493272
8068,The New Yorker A Wyndham Hotel,All the staff were friendly from the front des...,0.4939,0.013439,0.0,0.0,0.412912,3,0.42635
7119,"InterContinental - New York Times Square, an I...","Restaurant staff, housekeeping and hotel staff...",0.875,0.0,0.0,0.163115,0.392185,3,0.5553
5947,The Time New York,Guest staff at checkin were friendly however t...,0.0516,0.0,0.026718,0.022888,0.386065,3,0.435671
3425,Millennium Premier New York Times Square,Some of the staff was friendly but for the maj...,0.2732,0.0,0.0,0.0,0.378303,3,0.378303
6008,The Time New York,The staff was very nice only complaint besides...,0.8414,0.030654,0.220231,0.003979,0.36462,3,0.619484
4804,"Staypineapple, An Artful Hotel, Midtown","Clean, quiet, friendly staff, economical, and ...",0.8442,0.01448,0.0,0.0,0.334852,3,0.349332
866,Executive Hotel Le Soleil New York,"clean, excellent location, the staff is very f...",0.9162,0.013592,0.126852,0.0,0.318624,3,0.459069
6131,Sanctuary Hotel New York,Staff were exceptional especially our housekee...,0.0,0.0,0.0,0.0,0.313179,3,0.313179
710,LUMA Hotel Times Square,The staff was amazing- excellent service from ...,0.8176,0.0,0.009506,0.004427,0.312445,3,0.326378


In [77]:
# Taking the average score of each topic for each hotel
hotel_score = pd.DataFrame(hotel_sentences.groupby(['hotel', 'classified_topic']).score.mean())
hotel_score

Unnamed: 0_level_0,Unnamed: 1_level_0,score
hotel,classified_topic,Unnamed: 2_level_1
3 West Club,0,0.079400
3 West Club,1,0.438670
3 West Club,2,0.361102
3 West Club,3,0.550715
Ameritania at Times Square,0,0.147820
...,...,...
Wingate by Wyndham New York Midtown South/5th Ave,3,0.568127
citizenM New York Times Square,0,0.254058
citizenM New York Times Square,1,0.469003
citizenM New York Times Square,2,0.402100


In [78]:
# get the names of all hotels
hotels = hotel_sentences.hotel.unique()
len(hotels)

50

In [79]:
def preference(first, second, third, fourth, scores=None, top_n=10):
    '''
    Recommend hotels from a ranking of the four review topics.

    Each of the first four arguments is a topic id (int), given from the most
    important topic to the least important one:
    * Topic 0: Room/size/bathroom/view
    * Topic 1: Location/Times Square/convenience
    * Topic 2: Hotel impression/Overall experience/Times Square
    * Topic 3: Staff/friendly/front desk/breakfast

    A hotel's weighted score is
    4 * score[first] + 3 * score[second] + 2 * score[third] + 1 * score[fourth],
    where score[t] is the hotel's value in the `score` column for topic t.

    Parameters
    ----------
    first, second, third, fourth : int
        Four distinct topic ids, in decreasing order of importance.
    scores : pandas.DataFrame, optional
        Frame with a (hotel, topic) MultiIndex and a `score` column.
        Defaults to the notebook-level `hotel_score` table.
    top_n : int, default 10
        Number of best-scoring hotels to return.

    Returns
    -------
    pandas.DataFrame
        One `weighted_score` column indexed by hotel name, sorted from the
        highest score down, limited to `top_n` rows. Hotels that have no score
        for one of the topics cannot be ranked and are left out.

    Raises
    ------
    ValueError
        If the same topic id is passed more than once.
    KeyError
        If a topic id does not occur in `scores` at all.
    '''
    ranking = [first, second, third, fourth]
    # A repeated topic would silently be counted several times and push another
    # topic out of the ranking, so reject it up front.
    if len(set(ranking)) != len(ranking):
        raise ValueError(f"topics must be four distinct ids, got {ranking}")

    if scores is None:
        scores = hotel_score

    # Pivot to one row per hotel and one column per topic.
    per_topic = scores['score'].unstack()
    unknown = [topic for topic in ranking if topic not in per_topic.columns]
    if unknown:
        raise KeyError(f"no scores available for topic(s): {unknown}")

    # Weights 4, 3, 2, 1 are applied in ranking order; sum() starts from 0, so
    # the additions happen in the same order as an explicit running total.
    weighted_score = sum(per_topic[topic] * weight
                         for topic, weight in zip(ranking, (4, 3, 2, 1)))

    return (weighted_score
            .dropna()              # hotels missing a topic score are unranked
            .rename_axis(None)     # keep the result's index unnamed
            .to_frame('weighted_score')
            .sort_values('weighted_score', ascending=False)
            .head(top_n))

In [131]:
# Ranking: rooms (0) first, then overall impression (2), staff (3), location (1)
preference(first=0, second=2, third=3, fourth=1)

Unnamed: 0,weighted_score
CIVILIAN Hotel,3.893164
Library Hotel by Library Hotel Collection,3.83458
Arlo Midtown,3.826522
citizenM New York Times Square,3.808906
The Iroquois New York,3.793241
LUMA Hotel Times Square,3.630871
Wingate by Wyndham New York Midtown South/5th Ave,3.619088
"The Jewel, a Club Quarters Hotel, Opposite Rockefeller Center",3.603768
Hotel Riu Plaza New York Times Square,3.538354
Executive Hotel Le Soleil New York,3.358773


In [94]:
# Ranking: rooms (0) first, then location (1), overall impression (2), staff (3)
preference(first=0, second=1, third=2, fourth=3)

Unnamed: 0,weighted_score
CIVILIAN Hotel,4.172897
Library Hotel by Library Hotel Collection,4.153596
Arlo Midtown,3.972568
The Iroquois New York,3.943714
citizenM New York Times Square,3.786125
The Bryant Park Hotel,3.64034
"The Jewel, a Club Quarters Hotel, Opposite Rockefeller Center",3.628474
LUMA Hotel Times Square,3.622235
Hotel Riu Plaza New York Times Square,3.603578
Wingate by Wyndham New York Midtown South/5th Ave,3.587663


In [81]:
# Print one hotel's entry in hotel_score for each of the four topics (0-3)
hotel_to_check = 'Crowne Plaza HY36 Midtown Manhattan, an IHG Hotel'
for i in range(4):
    print(i, ": ", hotel_score.loc[(hotel_to_check, i)])

0 :  score    0.135223
Name: (Crowne Plaza HY36 Midtown Manhattan, an IHG Hotel, 0), dtype: float64
1 :  score    0.515282
Name: (Crowne Plaza HY36 Midtown Manhattan, an IHG Hotel, 1), dtype: float64
2 :  score    0.339894
Name: (Crowne Plaza HY36 Midtown Manhattan, an IHG Hotel, 2), dtype: float64
3 :  score    0.627979
Name: (Crowne Plaza HY36 Midtown Manhattan, an IHG Hotel, 3), dtype: float64


# 5. Polish the topic classifier
Rationale: Some sentences just don't belong to any topic.

In [82]:
# Sentences classified as topic 0, weakest topic_0 weight first
hotel_sentences.loc[hotel_sentences['classified_topic'].eq(0)].sort_values(by='topic_0').head(50)

Unnamed: 0,hotel,review,score,topic_0,topic_1,topic_2,topic_3,classified_topic,topic_sum
5296,Hyatt Place New York City/Time Square,Never again !,0.0,0.0,0.0,0.0,0.0,0,0.0
1185,Artel Hotel Times Square,Our too.,0.0,0.0,0.0,0.0,0.0,0,0.0
3543,Embassy Suites by Hilton New York Manhattan Ti...,Was fraudulent.,-0.4939,0.0,0.0,0.0,0.0,0,0.0
6895,Pestana CR7 Times Square,They were an announce and bothersome.,-0.3182,0.0,0.0,0.0,0.0,0,0.0
5200,Hyatt Place New York City/Time Square,"Perfecto ubication,",0.3182,0.0,0.0,0.0,0.0,0,0.0
3306,Millennium Premier New York Times Square,Mary Marinucci,0.0,0.0,0.0,0.0,0.0,0,0.0
7741,Doubletree by Hilton New York Times Square South,It was hilarious!,0.4574,0.0,0.0,0.0,0.0,0,0.0
5332,Doxie Hotel,"There are rats everywhere.""",0.0,0.0,0.0,0.0,0.0,0,0.0
504,Millennium Hotel Broadway Times Square,Nothing,0.0,0.0,0.0,0.0,0.0,0,0.0
1564,The Gotham Hotel,I would repeat!,0.0,0.0,0.0,0.0,0.0,0,0.0


### Notice that there are many sentences which have a zero score on all four topics, yet they are all assigned to topic 0. I think we should remove them from our hotel_sentences dataframe.

In [83]:
# Topic weights that `nmf` produces for row 5200 of cv_matrix.
# NOTE(review): assumes row positions in cv_matrix line up with the
# hotel_sentences index (5200 is "Perfecto ubication," there) — confirm.
nmf.transform(cv_matrix)[5200]

array([0., 0., 0., 0.])

In [84]:
# Row-wise total of the topic weight columns from topic_0 through topic_3
hotel_sentences['topic_sum'] = hotel_sentences.loc[:, 'topic_0':'topic_3'].sum(axis='columns')

In [85]:
# Number of sentences whose topic weights add up to exactly zero
int((hotel_sentences['topic_sum'] == 0).sum())

75

We have 75 sentences whose topic scores sum to 0.

In [86]:
# Drop the sentences with a zero topic_sum and renumber the rows from 0
hotel_sentences_v2 = hotel_sentences.loc[hotel_sentences['topic_sum'].ne(0)].reset_index(drop=True)
hotel_sentences_v2

Unnamed: 0,hotel,review,score,topic_0,topic_1,topic_2,topic_3,classified_topic,topic_sum
0,Room Mate Grace Boutique Hotel,"The hotel is great and so were the staff, only...",0.8070,0.000000,0.179669,0.159851,0.149452,1,0.488971
1,Room Mate Grace Boutique Hotel,Had to call for service with television and li...,0.0000,0.014713,0.020018,0.018488,0.010007,1,0.063226
2,Room Mate Grace Boutique Hotel,•Cute & clean rooms!,0.4574,0.140800,0.001312,0.000000,0.016181,0,0.158292
3,Room Mate Grace Boutique Hotel,•Very unique wallpaper on each floor & in each...,0.0000,0.120942,0.000000,0.000000,0.000000,0,0.120942
4,Room Mate Grace Boutique Hotel,•Right next to an Irish restaurant & bar!,0.0000,0.001426,0.009722,0.008341,0.001243,1,0.020731
...,...,...,...,...,...,...,...,...,...
8311,Hotel 46 Times Square,"The room included a TV with HBO, a safe in the...",0.4404,0.121487,0.000000,0.000000,0.000000,0,0.121487
8312,Hotel 46 Times Square,No refrigerator or microwave.,-0.2960,0.001343,0.000150,0.000414,0.000000,0,0.001907
8313,Hotel 46 Times Square,"The breakfast was basic (toast, bagels, yogurt...",0.0000,0.002377,0.002818,0.003460,0.003585,3,0.012240
8314,Hotel 46 Times Square,The workers were gracious and helpful.,0.7506,0.000000,0.000000,0.000000,0.053846,3,0.053846


### Back to digging around in the data.

In [87]:
# Topic-3 sentences in the filtered frame, weakest topic_3 weight first
hotel_sentences_v2.loc[hotel_sentences_v2['classified_topic'].eq(3)].sort_values(by='topic_3').head(10)

Unnamed: 0,hotel,review,score,topic_0,topic_1,topic_2,topic_3,classified_topic,topic_sum
3687,CIVILIAN Hotel,"Petit bémol, pas assez de patères pour accroch...",0.0,2e-06,0.0,4e-06,3.2e-05,3,3.8e-05
2304,Park Terrace Hotel on Bryant Park,It was a mistake and she apologized.,-0.0258,5.6e-05,0.0,0.0,8.5e-05,3,0.000141
1807,Hotel Mela Times Square,So figure it out.,0.0,0.000125,0.0,5.9e-05,0.000127,3,0.000311
5388,Doxie Hotel,At first I was so worried.,-0.4341,0.0,0.0,5.4e-05,0.000157,3,0.000211
374,The Bryant Park Hotel,So I let it go but lesson learned.,0.0,2e-06,7.6e-05,0.00034,0.00041,3,0.000827
3688,CIVILIAN Hotel,"L’hôtel est très agréable, la déco est soignée...",0.0,0.0,3.1e-05,0.000333,0.000489,3,0.000853
4491,M Social Hotel Times Square New York,No one answered.,-0.296,2.2e-05,0.000148,0.0,0.000526,3,0.000696
6623,Best Western Premier Herald Square,"(It's your job, not mine.)",0.0,0.000257,0.000163,0.0,0.000531,3,0.000951
5815,The Time New York,"If you’re dairy free, diabetic or have celiac ...",0.743,0.000397,0.000477,0.000498,0.000609,3,0.001981
1173,Artel Hotel Times Square,It was free!,0.5562,0.000402,0.000484,0.00036,0.000611,3,0.001857


### Even though these sentences are classified as topic 3, they don't actually provide much insight. Let's see if a minimum topic score of 0.01 helps.

In [88]:
# Topic-3 sentences whose topic_3 weight is at least 0.01, weakest first
(hotel_sentences_v2
 .query('classified_topic == 3 and topic_3 >= 0.01')
 .sort_values(by='topic_3')
 .head(20))

Unnamed: 0,hotel,review,score,topic_0,topic_1,topic_2,topic_3,classified_topic,topic_sum
7288,Row NYC,Elevator super sketchy.,0.5994,0.003565,0.000997,0.001541,0.01003,3,0.016132
7852,Archer Hotel New York,Even disregarding COVID that’s like super gross.,0.34,0.007673,0.001502,0.005341,0.010053,3,0.024568
7547,Doubletree by Hilton New York Times Square South,The front desk didn't have any sense of humor ...,0.6249,0.0078,0.0,0.003812,0.010154,3,0.021765
892,Executive Hotel Le Soleil New York,Not sure why the lady would say no with out ev...,-0.4874,0.00735,4e-06,0.006438,0.010202,3,0.023994
4473,M Social Hotel Times Square New York,"Once I got to check in, it was quick and easy.",0.4404,0.008779,0.00433,0.005987,0.010307,3,0.029402
2607,RIU Plaza Manhattan Times Square,I call front desk all they said is that it was...,0.0,0.004116,0.0,0.00335,0.010335,3,0.017801
5566,Pod Times Square,When I arrived they wanted me to wait 4 hours ...,0.0,0.009024,0.002681,0.007999,0.010351,3,0.030054
4376,3 West Club,It had a desk and chair.,0.0,0.003692,0.0,0.000934,0.010413,3,0.015039
6339,"Crowne Plaza HY36 Midtown Manhattan, an IHG Hotel",We arrived early and they allowed us an early ...,0.0,0.009025,0.0,0.007167,0.01045,3,0.026642
6131,Citadines Connect Fifth Avenue New York,Under the desk.,0.0,0.002877,0.0,0.000861,0.010471,3,0.014209


### Conclusion: I think it's possible to tune a minimum topic score that a sentence must reach to be included in the score-based recommender. However, this parameter might differ from model to model. For the simplicity of the project, I am leaving it as is.
### I also noticed that the sentiment analysis is not that accurate when looking at the comments one by one. For example, comments #7288 and #7852, the first two comments in the above dataframe, are both complaints but received positive sentiment scores (0.5994 and 0.34).

# Appendix: for project presentation

In [89]:
# Topic-0 sentences with the strongest topic_0 weight
hotel_sentences.loc[hotel_sentences['classified_topic'].eq(0)].sort_values(by='topic_0', ascending=False).head(10)

Unnamed: 0,hotel,review,score,topic_0,topic_1,topic_2,topic_3,classified_topic,topic_sum
4866,Homewood Suites New York/Midtown Manhattan Tim...,"The rooms are not like the pictures, I booked ...",0.1415,0.417971,0.0,0.0,0.02565,0,0.443621
6775,Square Hotel in Times Square,Got there at 5:00 pm check in was 3:00 howeve...,0.1855,0.400918,0.0,0.043122,0.017676,0,0.461716
6810,Square Hotel in Times Square,"Room was heated by a radiator, which I didn't ...",0.765,0.389402,0.003771,0.005483,0.0,0,0.398656
6200,Citadines Connect Fifth Avenue New York,Room with two queens didn’t leave a lot of ext...,0.4854,0.372296,0.0,0.164673,0.0,0,0.536969
7576,Library Hotel by Library Hotel Collection,"Having ""themed"" rooms with books on shelves in...",0.624,0.364018,0.0,0.0,0.0,0,0.364018
6723,Square Hotel in Times Square,"The room was freezing, there was not anything ...",-0.5688,0.35862,0.0,0.0,0.0,0,0.35862
6712,Square Hotel in Times Square,Bathroom was so small that I had hard time mov...,0.7747,0.356713,0.154429,0.23407,0.009697,0,0.75491
444,The Bryant Park Hotel,Other then the conference room that was attach...,0.4404,0.354066,0.0,0.0,0.0,0,0.354066
869,Executive Hotel Le Soleil New York,I was treated poorly I’m not sure why I tipped...,-0.0683,0.296944,0.013612,0.085733,0.020841,0,0.41713
835,Executive Hotel Le Soleil New York,"The room is spacious,new and clean, The bed wa...",0.807,0.293573,0.002994,0.008764,0.032767,0,0.338098


In [93]:
# Full text of row 835 among the ten strongest topic-0 sentences
(hotel_sentences
 .loc[hotel_sentences['classified_topic'].eq(0)]
 .sort_values(by='topic_0', ascending=False)
 .head(10)
 .loc[835, 'review'])

'The room is spacious,new and clean, The bed was really nice I slept like a baby.There is a mini fridge in the room too.'

In [95]:
# Topic-1 sentences with the strongest topic_1 weight
hotel_sentences.loc[hotel_sentences['classified_topic'].eq(1)].sort_values(by='topic_1', ascending=False).head(10)

Unnamed: 0,hotel,review,score,topic_0,topic_1,topic_2,topic_3,classified_topic,topic_sum
4329,3 West Club,"Great location, great history, great place.",0.9231,0.0,0.408964,0.0,0.0,1,0.408964
478,The Bryant Park Hotel,"great hotel , great locations, great bar in th...",0.9387,0.119019,0.395713,0.204824,0.0,1,0.719556
962,Radio City Apartments,Stayed 1 week and although this is not a fancy...,0.6216,0.019888,0.371269,0.046919,0.00951,1,0.447585
4828,Homewood Suites New York/Midtown Manhattan Tim...,I will continue to stay at the Homewood Suites...,0.0,0.0,0.362244,0.042157,0.0,1,0.404401
107,Room Mate Grace Boutique Hotel,Our stay was awesome...it truly is all about l...,0.4404,0.0,0.360598,0.036738,0.0,1,0.397336
6776,Square Hotel in Times Square,LOCATION - Location - Location!,0.0,0.0,0.360385,0.0,0.0,1,0.360385
7771,Archer Hotel New York,"Location, location, location!!!",0.0,0.0,0.360385,0.0,0.0,1,0.360385
6380,"Crowne Plaza HY36 Midtown Manhattan, an IHG Hotel","Location, location & location.",0.0,0.0,0.360385,0.0,0.0,1,0.360385
8308,Hotel 46 Times Square,"Location, location and location.",0.0,0.0,0.360385,0.0,0.0,1,0.360385
3611,Embassy Suites by Hilton New York Manhattan Ti...,"Location, location, location!!",0.0,0.0,0.360385,0.0,0.0,1,0.360385


In [101]:
# Full hotel name of row 4828 among the ten strongest topic-1 sentences
(hotel_sentences
 .loc[hotel_sentences['classified_topic'].eq(1)]
 .sort_values(by='topic_1', ascending=False)
 .head(10)
 .loc[4828, 'hotel'])

'Homewood Suites New York/Midtown Manhattan Times Square'

In [102]:
# Topic-2 sentences with the strongest topic_2 weight
hotel_sentences.loc[hotel_sentences['classified_topic'].eq(2)].sort_values(by='topic_2', ascending=False).head(10)

Unnamed: 0,hotel,review,score,topic_0,topic_1,topic_2,topic_3,classified_topic,topic_sum
2564,RIU Plaza Manhattan Times Square,Gave hotel address to cab driver was dropped o...,-0.5684,0.0,0.0,0.692059,0.0,2,0.692059
8205,Hotel 46 Times Square,"The hotel was ok, but not quite as it appears ...",0.0,0.0,0.0,0.687699,0.138269,2,0.825968
5449,Doxie Hotel,"Terrible, dirty, not safe, smell weed Marihuan...",-0.9301,0.000803,0.0,0.537305,0.0,2,0.538107
2694,RIU Plaza Manhattan Times Square,We booked this hotel and we’re informed 12 hou...,0.0,0.0,0.0,0.518391,0.0,2,0.518391
184,Hotel St. James,The hotel was clean and staff very courteous b...,0.8292,0.003584,0.0,0.509291,0.169097,2,0.681972
5473,Doxie Hotel,The website for the hotel and on hotels.com ga...,0.0,0.0,0.0,0.505028,0.0,2,0.505028
1031,Radio City Apartments,The hotel is an older hotel with rooms smaller...,0.0,0.100946,0.0,0.493764,0.0,2,0.59471
2587,RIU Plaza Manhattan Times Square,I was told when they informed me that my 1 nig...,0.4404,0.0,0.0,0.416865,0.0,2,0.416865
7395,Row NYC,"Overall, it was a beautiful hotel in a perfect...",0.9196,0.0,0.113832,0.413822,0.0,2,0.527654
2108,City Club Hotel,"She refused to stay in this hotel, do had to m...",-0.296,0.0,0.0,0.405574,0.0,2,0.405574


In [107]:
# Full text of row 1031 among the ten strongest topic-2 sentences
(hotel_sentences
 .loc[hotel_sentences['classified_topic'].eq(2)]
 .sort_values(by='topic_2', ascending=False)
 .head(10)
 .loc[1031, 'review'])

"The hotel is an older hotel with rooms smaller than today's hotels and motels, but because of its age, it has character."

In [110]:
# Topic-3 sentences with the strongest topic_3 weight
hotel_sentences.loc[hotel_sentences['classified_topic'].eq(3)].sort_values(by='topic_3', ascending=False).head(20)

Unnamed: 0,hotel,review,score,topic_0,topic_1,topic_2,topic_3,classified_topic,topic_sum
5876,The Time New York,We will come back because everyone on the staf...,0.7424,0.016113,0.0,0.001516,0.475644,3,0.493272
8068,The New Yorker A Wyndham Hotel,All the staff were friendly from the front des...,0.4939,0.013439,0.0,0.0,0.412912,3,0.42635
7119,"InterContinental - New York Times Square, an I...","Restaurant staff, housekeeping and hotel staff...",0.875,0.0,0.0,0.163115,0.392185,3,0.5553
5947,The Time New York,Guest staff at checkin were friendly however t...,0.0516,0.0,0.026718,0.022888,0.386065,3,0.435671
3425,Millennium Premier New York Times Square,Some of the staff was friendly but for the maj...,0.2732,0.0,0.0,0.0,0.378303,3,0.378303
6008,The Time New York,The staff was very nice only complaint besides...,0.8414,0.030654,0.220231,0.003979,0.36462,3,0.619484
4804,"Staypineapple, An Artful Hotel, Midtown","Clean, quiet, friendly staff, economical, and ...",0.8442,0.01448,0.0,0.0,0.334852,3,0.349332
866,Executive Hotel Le Soleil New York,"clean, excellent location, the staff is very f...",0.9162,0.013592,0.126852,0.0,0.318624,3,0.459069
6131,Sanctuary Hotel New York,Staff were exceptional especially our housekee...,0.0,0.0,0.0,0.0,0.313179,3,0.313179
710,LUMA Hotel Times Square,The staff was amazing- excellent service from ...,0.8176,0.0,0.009506,0.004427,0.312445,3,0.326378


In [111]:
# Full text of row 5947 among the ten strongest topic-3 sentences
(hotel_sentences
 .loc[hotel_sentences['classified_topic'].eq(3)]
 .sort_values(by='topic_3', ascending=False)
 .head(10)
 .loc[5947, 'review'])

'Guest staff at checkin were friendly however the restaurant staff that I went one time to visit to get a morning coffee were rude.'

In [114]:
# Full hotel name of row 7119 among the ten strongest topic-3 sentences
(hotel_sentences
 .loc[hotel_sentences['classified_topic'].eq(3)]
 .sort_values(by='topic_3', ascending=False)
 .head(10)
 .loc[7119, 'hotel'])

'InterContinental - New York Times Square, an IHG Hotel'

In [115]:
# Ten sentences with the highest `score`
hotel_sentences.sort_values(by='score', ascending=False).iloc[:10]

Unnamed: 0,hotel,review,score,topic_0,topic_1,topic_2,topic_3,classified_topic,topic_sum
4199,"The Jewel, a Club Quarters Hotel, Opposite Roc...","Very clean, very friendly staff (let me check ...",0.9806,0.047018,0.222469,0.009852,0.27244,3,0.55178
5112,citizenM New York Times Square,Hey place to stay not a lot of frills but the ...,0.9783,0.015939,0.10203,0.061246,0.064242,1,0.243457
4665,"Staypineapple, An Artful Hotel, Midtown",The decor is indeed fun (nice change from seda...,0.967,0.097507,0.025262,0.027498,0.081916,0,0.232182
8168,The New Yorker A Wyndham Hotel,"Staff was great, very clean, felt safe with al...",0.9641,0.018689,0.228535,0.0,0.172642,1,0.419865
6790,Square Hotel in Times Square,Great location is what brought us to this Hote...,0.9637,0.11305,0.314479,0.166574,0.217426,1,0.81153
2521,RIU Plaza Manhattan Times Square,"It was clean, rooms good size for NYC, staff n...",0.9617,0.156307,0.229552,0.012089,0.211194,1,0.609143
525,Millennium Hotel Broadway Times Square,Pros: \n•location is AMAZING \n•staff members ...,0.9592,0.080447,0.134976,0.020444,0.257357,3,0.493225
7996,Wingate by Wyndham New York Midtown South/5th Ave,Great Location | Clean Facilities | Updated Ro...,0.9584,0.17116,0.221188,0.0,0.267992,3,0.66034
1882,Hotel Riu Plaza New York Times Square,"Extremely clean, friendly staff, gorgeous deco...",0.957,0.137854,0.0,0.0,0.253764,3,0.391618
2428,Park Terrace Hotel on Bryant Park,They have a beautiful room on the 6th floor th...,0.9565,0.134562,0.097641,0.000869,0.0,0,0.233072


In [120]:
# Ten highest-scoring sentences among those with a `score` below 0.4
hotel_sentences.loc[hotel_sentences['score'].lt(0.4)].sort_values(by='score', ascending=False).head(10)

Unnamed: 0,hotel,review,score,topic_0,topic_1,topic_2,topic_3,classified_topic,topic_sum
3117,Hotel Shocard at Times Square,Very recommended!!,0.3973,0.0,0.0,0.007643,0.000236,2,0.007878
5684,Ameritania at Times Square,No smoking room but it smelled like cigarettes,0.3919,0.126551,0.0,0.0,0.0,0,0.126551
2844,The Iroquois New York,Rooms are NY small but well appointed.,0.3919,0.138479,0.0,0.0,0.0,0,0.138479
4657,"Staypineapple, An Artful Hotel, Midtown",Tiny room but everything that we needed was we...,0.3919,0.125593,0.0,0.0,0.0,0,0.125593
7851,Archer Hotel New York,Room was small but cozy and well appointed.,0.3919,0.138639,0.0,0.0,0.0,0,0.138639
5144,citizenM New York Times Square,Small thing to some but my last stay was well ...,0.3919,0.028128,0.0,0.131027,0.0,2,0.159155
1481,Arlo Midtown,Room is small but well designed with storage u...,0.3919,0.151665,0.0,0.0,0.0,0,0.151665
2810,The Iroquois New York,Do NOT get taken in with touristy eating spots...,0.3903,0.001568,0.006084,0.007549,0.000505,2,0.015706
3359,Millennium Premier New York Times Square,The Hotel was wonderful but we were traveling ...,0.3903,0.011831,0.0,0.171032,0.00085,2,0.183713
4121,Hotel Edison Times Square,Room was a lot smaller than expected but that ...,0.3903,0.130913,0.02646,0.018094,0.0,0,0.175467


In [123]:
# Full text of row 4121 among the ten highest-scoring sentences below 0.4
(hotel_sentences
 .loc[hotel_sentences['score'].lt(0.4)]
 .sort_values(by='score', ascending=False)
 .head(10)
 .loc[4121, 'review'])

"Room was a lot smaller than expected but that was okay considering we didn't spend much time in there!"

In [124]:
# Ten sentences with the lowest `score`
hotel_sentences.sort_values(by='score', ascending=True).iloc[:10]

Unnamed: 0,hotel,review,score,topic_0,topic_1,topic_2,topic_3,classified_topic,topic_sum
1966,Hotel Riu Plaza New York Times Square,Was a horrible hotel to stay in just not worth...,-0.9646,0.12871,0.021183,0.247808,0.0,2,0.397701
5449,Doxie Hotel,"Terrible, dirty, not safe, smell weed Marihuan...",-0.9301,0.000803,0.0,0.537305,0.0,2,0.538107
2036,City Club Hotel,"This hotel was a fraud, nothing of VIP\n\nExp...",-0.9143,0.109675,0.0,0.335322,0.0,2,0.444997
1173,Artel Hotel Times Square,"NO hot water, AWFUL water pressure, and the st...",-0.9106,0.004462,0.0,0.174918,0.0,2,0.17938
6121,Sanctuary Hotel New York,The Haven restaurant website says they close a...,-0.9055,0.017315,0.021838,0.272972,0.004333,2,0.316458
5914,The Time New York,"The AC was dirty, there was no parking, the ph...",-0.8977,0.033402,0.009789,0.018176,0.001745,0,0.063112
1864,Hotel Mela Times Square,The staff was unfriendly and unhelpful from ch...,-0.8957,0.266678,0.0,0.0,0.150925,0,0.417603
500,Millennium Hotel Broadway Times Square,View is great.. but dirty not good maintenance...,-0.8836,0.01736,0.107724,0.003227,0.003193,1,0.131504
7725,Doubletree by Hilton New York Times Square South,"The blinds were broken, bathroom door broken, ...",-0.8834,0.017294,0.001812,0.002991,0.0,0,0.022096
3228,Hotel Shocard at Times Square,When I told the front desk he didn’t know what...,-0.8777,0.155007,0.0002,0.024522,0.030091,0,0.20982


In [125]:
# Full text of row 1966 among the ten lowest-scoring sentences
hotel_sentences.sort_values(by='score', ascending=True).head(10).loc[1966, 'review']

'Was a horrible hotel to stay in just not worth the time the room design is horrible and annoying the bathroom is terrible no privacy at all the breakfast is so bad also'

In [126]:
# Ten lowest-scoring sentences among those with a `score` above -0.4
hotel_sentences.loc[hotel_sentences['score'].gt(-0.4)].sort_values(by='score').head(10)

Unnamed: 0,hotel,review,score,topic_0,topic_1,topic_2,topic_3,classified_topic,topic_sum
6178,Citadines Connect Fifth Avenue New York,Really strange!!,-0.3973,0.002152,0.000866,0.004195,0.002487,2,0.0097
4220,"The Jewel, a Club Quarters Hotel, Opposite Roc...",The attendant couldn't figure out how to fix e...,-0.3971,0.008164,0.002128,0.004668,0.003684,0,0.018645
4987,Homewood Suites New York/Midtown Manhattan Tim...,Check in was at 4PM but the room was not ready...,-0.395,0.12464,0.0,0.0,0.0,0,0.12464
48,Room Mate Grace Boutique Hotel,So it was inconvenient to have to lug all your...,-0.3947,0.002865,0.008204,0.008224,0.000352,2,0.019646
8203,Hotel 46 Times Square,Less than a block from Times Square.,-0.3892,0.0,0.059519,0.029428,0.0,1,0.088947
3326,Millennium Premier New York Times Square,"As far as convenience of location, less than a...",-0.3892,0.0,0.189292,0.024097,0.0,1,0.213389
3889,The Gallivant Times Square,"The elevators are very small, not covid friendly.",-0.3875,0.019966,0.0,0.002198,0.07924,3,0.101405
2661,RIU Plaza Manhattan Times Square,Chick in staff not friendly at all.,-0.3875,0.0,0.0,0.0,0.228297,3,0.228297
4593,M Social Hotel Times Square New York,Not friendly front desk.,-0.3875,0.000298,0.0,0.0,0.090029,3,0.090327
5232,Hyatt Place New York City/Time Square,"Need to work on customer service skills, the w...",-0.3875,0.015326,0.003782,0.012138,0.093485,3,0.124731


In [130]:
# Full text of row 48 among the ten lowest-scoring sentences above -0.4
(hotel_sentences
 .loc[hotel_sentences['score'].gt(-0.4)]
 .sort_values(by='score')
 .head(10)
 .loc[48, 'review'])

'So it was inconvenient to have to lug all your luggage from the parking lot.'

In [139]:
# `description` of row 1053 among the hotel_review rows for CIVILIAN Hotel
hotel_review.loc[hotel_review['hotel'].eq('CIVILIAN Hotel')].loc[1053, 'description']

'I’m so glad that I chose this hotel! We loved the unique decor and the room was very comfortable. We went to the bar where they had an hour of complimentary drinks! Again, the ambiance was fantastic. It’s a hotel that you can hang out at and enjoy yourselves. We want to go again. '