In [14]:
import pandas as pd
import pickle
import matplotlib.pyplot as plt
import gensim
import numpy as np
import spacy
import re

from gensim.models import CoherenceModel, LdaMulticore, LsiModel, HdpModel
from gensim.models.wrappers import LdaMallet
from gensim.corpora import Dictionary
from gensim.test.utils import datapath
import pyLDAvis.gensim

import warnings
warnings.filterwarnings('ignore')

%matplotlib inline

# Import all_reviews_clean_sentiment

In [3]:
# bring in all_reviews df with cleaned reviews
all_reviews_clean_sentiment = pd.read_pickle('./Data/all_reviews_clean_sentiment.pkl')
all_reviews_clean_sentiment.head()

Unnamed: 0,date_of_stay,hotel_response,month_of_stay,nationality,review,source,trip_type,year_of_stay,review_clean,TextBlob_Polarity,vader_score
0,2010-01,,January,Malaysia,Brunei” overall am satisfied as the hotel is ...,Agoda,Couple,2010,brunei overall be satisfy a the hotel be very ...,0.275694,0.9708
1,2010-01,,January,Indonesia,great week-end at Mandarin Meritus” It's been ...,Agoda,Group,2010,great week end at mandarin meritus it s be alm...,0.256061,0.903
2,2010-01,,January,Malaysia,Favourite Hotel !” Mandarin Meritus has always...,Agoda,Business traveler,2010,favourite hotel mandarin meritus have always b...,0.466077,0.9642
3,2010-01,,January,Malaysia,nice” very nice hotel. nice food and great roo...,Agoda,Couple,2010,nice very nice hotel nice food and great room ...,0.162812,0.4507
4,2010-01,,January,Indonesia,"meritus orchard” Very nice hotel,location real...",Agoda,Couple,2010,meritus orchard very nice hotel location reall...,0.408462,0.9777


## Use spacy to prep data for topic modeling

In [4]:
nlp = spacy.load("en")

In [5]:
my_stop_words = [u'hotel', u'orchard', u'road', u'shop', u'singapore', u'mandarin', u'meritus', u'location', u'locate',\
                 u'good', u'great', u'excellent', u'business', u'trip', u'day', u'night', u'star', u'view',\
                 u'trip', u'center', u'centre', u'station', u'city', u'rd', u'lot', u'say', u'\'s', u'Mr',\
                 u'be', u'said', u'says', u'saying']
for stopword in my_stop_words:
    lexeme = nlp.vocab[stopword]
    lexeme.is_stop = True

In [6]:
all_reviews_clean_sentiment['review'] = all_reviews_clean_sentiment['review'].str.replace(".", ' ')

In [7]:
def clean_up(text): 
    text_out = []
    doc= nlp(text)
    for token in doc:
        if token.is_stop == False and token.like_num == False and token.is_punct == False and token.pos_ == 'NOUN':
            text_out.append(token.lemma_)
    return text_out

In [8]:
all_reviews_clean_sentiment['review_spacyclean'] = all_reviews_clean_sentiment['review']\
.apply(lambda x: clean_up(x))

In [9]:
dictionary = Dictionary(all_reviews_clean_sentiment['review_spacyclean'])

In [10]:
corpus = [dictionary.doc2bow(text) for text in all_reviews_clean_sentiment['review_spacyclean']]

In [40]:
# hdpmodel = HdpModel(corpus=corpus, id2word=dictionary)
# [print(topic,'\n') for topic in hdpmodel.show_topics()]

In [25]:
ldamodel = LdaMulticore(corpus=corpus, num_topics=4, id2word=dictionary, passes=500, workers=5)

In [26]:
[print(topic,'\n') for topic in ldamodel.show_topics()]

(0, '0.068*"room" + 0.045*"check" + 0.043*"time" + 0.037*"staff" + 0.036*"service" + 0.016*"guest" + 0.015*"hour" + 0.013*"luggage" + 0.012*"stay" + 0.011*"reception"') 

(1, '0.144*"room" + 0.035*"bed" + 0.019*"bathroom" + 0.019*"floor" + 0.015*"shower" + 0.013*"stay" + 0.013*"water" + 0.011*"service" + 0.011*"tower" + 0.010*"wing"') 

(2, '0.067*"breakfast" + 0.051*"staff" + 0.051*"service" + 0.046*"room" + 0.032*"food" + 0.031*"stay" + 0.018*"club" + 0.017*"time" + 0.016*"buffet" + 0.014*"restaurant"') 

(3, '0.111*"shopping" + 0.048*"room" + 0.037*"mall" + 0.029*"place" + 0.028*"staff" + 0.026*"food" + 0.026*"restaurant" + 0.026*"area" + 0.019*"heart" + 0.018*"service"') 



[None, None, None, None]

In [27]:
ldacoherence = CoherenceModel(model=ldamodel, corpus=corpus, dictionary=dictionary, coherence='u_mass')
print(ldacoherence.get_coherence())

-2.455665549625013


## VIZ

In [28]:
pyLDAvis.enable_notebook()
vis_data = pyLDAvis.gensim.prepare(ldamodel, corpus, dictionary)

In [29]:
vis_data

In [30]:
pyLDAvis.save_html(vis_data, './lda_topic_viz.html')

## save model

In [None]:
# # load
# lda = LdaModel.load(temp_file)

In [31]:
ldamodel.save('./Model/lda_4topics_500.model')

# assign topic to each review

In [32]:
corpus_lda = ldamodel[corpus]

lda_topic = []

for doc in corpus_lda:
    lda_topic.append(doc)

In [33]:
lda_topic_frame = pd.DataFrame(lda_topic, columns=['topic_0', 'topic_1', 'topic_2', 'topic_3'])
lda_topic_frame.head(3)

Unnamed: 0,topic_0,topic_1,topic_2,topic_3
0,"(0, 0.38113126)","(1, 0.025298346)","(2, 0.35928825)","(3, 0.23428214)"
1,"(0, 0.90150136)","(1, 0.032604907)","(2, 0.03301195)","(3, 0.032881808)"
2,"(0, 0.032682512)","(1, 0.036005903)","(2, 0.032777984)","(3, 0.89853364)"


In [34]:
topic0_prob = pd.DataFrame(lda_topic_frame['topic_0'].tolist(), index=lda_topic_frame.index).iloc[:,1].rename(columns = {"1": "topic0_probability"})
topic1_prob = pd.DataFrame(lda_topic_frame['topic_1'].tolist(), index=lda_topic_frame.index).iloc[:,1].rename(columns = {"1": "topic1_probability"})
topic2_prob = pd.DataFrame(lda_topic_frame['topic_2'].tolist(), index=lda_topic_frame.index).iloc[:,1].rename(columns = {"1": "topic2_probability"})
topic3_prob = pd.DataFrame(lda_topic_frame['topic_3'].tolist(), index=lda_topic_frame.index).iloc[:,1].rename(columns = {"1": "topic3_probability"})

In [35]:
all_reviews_final = pd.concat([all_reviews_clean_sentiment, topic0_prob, topic1_prob, topic2_prob, topic3_prob], axis=1)

In [36]:
all_reviews_final['CustSvc/Checkin'] = np.where(all_reviews_final[0] >= 0.25, 1, 0)
all_reviews_final['Room'] = np.where(all_reviews_final[1] >= 0.25, 1, 0)
all_reviews_final['F&B/Breakfast'] = np.where(all_reviews_final[2] >= 0.25, 1, 0)
all_reviews_final['Gallery/Shopping'] = np.where(all_reviews_final[3] >= 0.25, 1, 0)

In [37]:
# export to csv
all_reviews_final.to_csv('./Data/all_reviews_final.csv')

In [38]:
all_reviews_final.head()

Unnamed: 0,date_of_stay,hotel_response,month_of_stay,nationality,review,source,trip_type,year_of_stay,review_clean,TextBlob_Polarity,vader_score,review_spacyclean,0,1,2,3,CustSvc/Checkin,Room,F&B/Breakfast,Gallery/Shopping
0,2010-01,,January,Malaysia,Brunei” overall am satisfied as the hotel is ...,Agoda,Couple,2010,brunei overall be satisfy a the hotel be very ...,0.275694,0.9708,"[service, ppl, time, shopping, service, people...",0.381131,0.025298,0.359288,0.234282,1,0,1,0
1,2010-01,,January,Indonesia,great week-end at Mandarin Meritus” It's been ...,Agoda,Group,2010,great week end at mandarin meritus it s be alm...,0.256061,0.903,"[week, end, year, face, point, hotel, time]",0.901501,0.032605,0.033012,0.032882,1,0,0,0
2,2010-01,,January,Malaysia,Favourite Hotel !” Mandarin Meritus has always...,Agoda,Business traveler,2010,favourite hotel mandarin meritus have always b...,0.466077,0.9642,"[pick, thousand, shopping, access, room, decor...",0.032683,0.036006,0.032778,0.898534,0,0,0,1
3,2010-01,,January,Malaysia,nice” very nice hotel nice food and great roo...,Agoda,Couple,2010,nice very nice hotel nice food and great room ...,0.162812,0.4507,"[food, room, service, delicacy, distant, shopp...",0.017106,0.501469,0.017598,0.463827,0,1,0,1
4,2010-01,,January,Indonesia,"meritus orchard” Very nice hotel,location real...",Agoda,Couple,2010,meritus orchard very nice hotel location reall...,0.408462,0.9777,"[shopping, condition, bedroom, bathroom, swimm...",0.013625,0.48179,0.167047,0.337538,0,1,0,1
