https://www.kaggle.com/code/sukhadadharangaonkar/hotel-reviews-topic-modeling

In [47]:
# !pip install pyLDAvis==2.1.2


In [41]:
import numpy as np
import pandas as pd

# Visualization
import matplotlib.pyplot as plt
from matplotlib.ticker import FuncFormatter
import seaborn as sns

# Text Preprocessing and model building
from gensim.corpora import Dictionary
import nltk
from nltk.stem import WordNetLemmatizer
import re
# Iteratively read files
import glob
import os
import pyLDAvis.gensim

# For displaying images in ipython
from IPython.display import HTML, display
from nltk.corpus import stopwords
import nltk
import pprint
from gensim.models import LdaModel



In [25]:
%matplotlib inline
plt.style.use('ggplot')
plt.rcParams['figure.figsize'] = (14.0, 8.7)
#warnings.filterwarnings('ignore')
pd.options.display.float_format = '{:,.2f}'.format


In [26]:
df = pd.read_csv("../../../DataSets/tripadvisor_hotel_reviews.csv")
print("dataset loaded...")

dataset loaded...


In [27]:
df.head()


Unnamed: 0,Review,Rating
0,nice hotel expensive parking got good deal sta...,4
1,ok nothing special charge diamond member hilto...,2
2,nice rooms not 4* experience hotel monaco seat...,3
3,"unique, great stay, wonderful time hotel monac...",5
4,"great stay great stay, went seahawk game aweso...",5


In [28]:
df.columns


Index(['Review', 'Rating'], dtype='object')

In [29]:
reviews = df.Review.tolist()


In [30]:
wordnet_lemmatizer = WordNetLemmatizer()


In [31]:
stopwords = stopwords.words('english')


In [32]:
def preprocessText(x):
    temp = x.lower()
    temp = re.sub(r'[^\w]', ' ', temp)
    temp = nltk.word_tokenize(temp)
    temp = [wordnet_lemmatizer.lemmatize(w) for w in temp]
    temp = [word for word in temp if word not in stopwords]
    return temp


In [33]:
reviews_final = [preprocessText(review) for review in reviews]


In [34]:
# Create a dictionary representation of the documents.
dictionary = Dictionary(reviews_final)

# Filter out words that occur less than 20 documents, or more than 50% of the documents.
dictionary.filter_extremes(no_below=20, no_above=0.5)


In [35]:
corpus = [dictionary.doc2bow(doc) for doc in reviews_final]


In [36]:
print('Number of unique tokens: %d' % len(dictionary))
print('Number of documents: %d' % len(corpus))


Number of unique tokens: 5854
Number of documents: 20491


In [37]:
# Train LDA model.

# Set training parameters.
num_topics = 5
chunksize = 2000
passes = 10
# iterations = 400
eval_every = None  # Don't evaluate model perplexity, takes too much time.

# Make a index to word dictionary.
temp = dictionary[0]  # This is only to "load" the dictionary.
id2word = dictionary.id2token
# print(id2word)

model = LdaModel(
    corpus=corpus,
    id2word=id2word,
    chunksize=chunksize,
    alpha='auto',
    eta='auto',
    #     iterations=iterations,
    num_topics=num_topics,
    passes=passes,
    eval_every=eval_every
)


In [38]:
pprint.pprint(model.print_topics(num_words=10))
doc_lda = model[corpus]


[(0,
  '0.019*"location" + 0.016*"good" + 0.015*"breakfast" + 0.012*"clean" + '
  '0.010*"night" + 0.010*"walk" + 0.010*"nice" + 0.009*"stayed" + '
  '0.009*"helpful" + 0.008*"excellent"'),
 (1,
  '0.014*"n" + 0.012*"bed" + 0.011*"night" + 0.010*"nice" + 0.010*"floor" + '
  '0.010*"bathroom" + 0.008*"good" + 0.007*"small" + 0.007*"like" + '
  '0.007*"area"'),
 (2,
  '0.021*"service" + 0.010*"stayed" + 0.010*"wonderful" + 0.009*"time" + '
  '0.009*"best" + 0.008*"restaurant" + 0.008*"beautiful" + 0.008*"pool" + '
  '0.007*"view" + 0.007*"experience"'),
 (3,
  '0.016*"n" + 0.013*"day" + 0.008*"time" + 0.008*"got" + 0.007*"told" + '
  '0.007*"check" + 0.007*"service" + 0.007*"night" + 0.007*"hour" + '
  '0.006*"desk"'),
 (4,
  '0.022*"beach" + 0.019*"resort" + 0.015*"pool" + 0.015*"food" + 0.013*"good" '
  '+ 0.011*"time" + 0.011*"n" + 0.010*"day" + 0.009*"restaurant" + '
  '0.009*"people"')]


In [42]:
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(model, corpus, dictionary=dictionary)
vis


  return pd.DataFrame({'Term': vocab[term_ix], \


In [43]:
lda_corpus = model[corpus]

In [44]:
topics = []

for doc in lda_corpus:
    temp_id = []
    temp_score = []
    for doc_tuple in doc:
        temp_id.append(doc_tuple[0])
        temp_score.append(doc_tuple[1])
    index = np.argmax(temp_score)
    topics.append(temp_id[index])


In [45]:
df["Topic_num"] = topics


In [46]:
df.head(n=10)


Unnamed: 0,Review,Rating,Topic_num
0,nice hotel expensive parking got good deal sta...,4,1
1,ok nothing special charge diamond member hilto...,2,3
2,nice rooms not 4* experience hotel monaco seat...,3,1
3,"unique, great stay, wonderful time hotel monac...",5,1
4,"great stay great stay, went seahawk game aweso...",5,3
5,love monaco staff husband stayed hotel crazy w...,5,1
6,"cozy stay rainy city, husband spent 7 nights m...",5,0
7,"excellent staff, housekeeping quality hotel ch...",4,0
8,"hotel stayed hotel monaco cruise, rooms genero...",5,1
9,excellent stayed hotel monaco past w/e delight...,5,0
