# Topic Modeling for hotel207reviews data
Filename: topic_modeling_hotel207reviews.ipynb
- Load data (207 records are stored in 223 rows as some reviews were entered in multiple rows)   
  Data source: https://<span></span>monkeylearn.com/blog/introduction-to-topic-modeling/
- Compute term-document matrix with lemmatization
- Latent semantic analysis (LSA)
- Non-negative matrix factorization (MNF)
- Latent Dirichlet allocation (LDA)
- Grid search for the best number of topics (and other parameters) for LDA

In [1]:
import pandas as pd
import numpy as np
from matplotlib import style
from matplotlib import pyplot as plt
#import graphviz as gr
%matplotlib inline
style.use("fivethirtyeight")
import matplotlib.pyplot as plt

#from google.colab import drive
#drive.mount('/content/drive')
import warnings
warnings.filterwarnings('ignore')
pd.set_option("display.max_columns", 60)
pd.set_option('display.max_rows', 50)
pd.set_option('display.width', 1000)

In [2]:
# Load data (207 reviews are stored in 223 rows as some reviews were entered in multiple rows)
import pandas as pd
import numpy as np
from time import time

textdf = pd.read_csv('hotel207reviews.csv')
textdf.info()
print('\nSentiment values: ', textdf.Sentiment.unique())
print('Topic values: ', textdf.Topic.unique())
textdf.head(18)  # row 15 "Corridors filthy\r\nRoom...." includes 4 rows of data in the CSV file

# import csv
# reviews = [row for row in csv.reader(open('C:/Courses/MIST.7060(63.706)/Datasets/hotel223reviews.csv'))]
# reviews

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 207 entries, 0 to 206
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   Text       207 non-null    object
 1   Sentiment  207 non-null    object
 2   Topic      207 non-null    object
dtypes: object(3)
memory usage: 5.0+ KB

Sentiment values:  ['negative' 'positive']
Topic values:  ['Comfort' 'Facilities' 'Cleanliness']


Unnamed: 0,Text,Sentiment,Topic
0,"The rooms are extremely small, practically onl...",negative,Comfort
1,Room safe did not work.,negative,Facilities
2,Mattress very comfortable.,positive,Comfort
3,"Very uncomfortable, thin mattress, with plasti...",negative,Comfort
4,No bathroom in room,negative,Facilities
5,The bed was soooo comfy.,positive,Comfort
6,someone must have been smoking in the room nex...,negative,Cleanliness
7,The bed is very comfortable.,positive,Comfort
8,"Very spacious rooms, quiet and very comfortable.",positive,Comfort
9,For 3 people in a bedroom the sofa bed is a bi...,negative,Comfort


In [3]:
import nltk
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\fromx\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\fromx\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\fromx\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [4]:
# Compute term-document matrix with lemmatization
import nltk
import string
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer

def lemma_tokenizer(corpus):   # a method to lemmatize corpus
    corpus = ''.join([ch for ch in corpus if ch not in string.punctuation])  # remove punctuation
    tokens = nltk.word_tokenize(corpus)
    lemmatizer = WordNetLemmatizer()
    return [lemmatizer.lemmatize(token) for token in tokens]

tf_vec = CountVectorizer(tokenizer=lemma_tokenizer, stop_words='english')  # default lowercase
tf_sparse = tf_vec.fit_transform(textdf.Text)
tf_dictionary = tf_vec.get_feature_names()
print(tf_dictionary)
tf_sparse

['1', '12am', '15', '15th', '1990s', '2', '240', '3', '30', '302', '4', '40', '4th', '5', '58', '6', '650', '7', 'abundant', 'ac', 'access', 'actually', 'added', 'adjust', 'adult', 'advertised', 'advised', 'agreed', 'ahead', 'air', 'aircon', 'allow', 'amazing', 'ambience', 'amenity', 'amenityi', 'ample', 'anda', 'andor', 'anymore', 'apart', 'appeal', 'area', 'arent', 'art', 'available', 'avenue', 'awesome', 'bag', 'bar', 'barely', 'basic', 'bath', 'bathroom', 'bathtub', 'bed', 'bedding', 'bedroom', 'best', 'big', 'bigger', 'bit', 'blanket', 'blind', 'boil', 'book', 'bookingcom', 'bothering', 'bright', 'broken', 'building', 'bunting', 'cable', 'called', 'came', 'cap', 'card', 'carpet', 'center', 'challenge', 'change', 'changed', 'channel', 'character', 'charge', 'city', 'classy', 'clean', 'cleaned', 'cleaning', 'closed', 'closest', 'closet', 'cm', 'cockroach', 'coffee', 'cold', 'come', 'comfortable', 'comfy', 'coming', 'common', 'compared', 'condition', 'conditioning', 'connected', 'con

<207x510 sparse matrix of type '<class 'numpy.int64'>'
	with 1077 stored elements in Compressed Sparse Row format>

In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf_vec = TfidfVectorizer(tokenizer=lemma_tokenizer, stop_words='english')  # default lowercase
tfidf_sparse = tfidf_vec.fit_transform(textdf.Text)
tfidf_dictionary = tfidf_vec.get_feature_names()
tfidf_sparse

<207x510 sparse matrix of type '<class 'numpy.float64'>'
	with 1077 stored elements in Compressed Sparse Row format>

In [6]:
# Latent semantic analysis (LSA)
from sklearn.decomposition import TruncatedSVD
lsa = TruncatedSVD(n_components=3)
lsa

TruncatedSVD(n_components=3)

In [7]:
lsa_tf_topics = lsa.fit_transform(tf_sparse)
lsa_tf_topics.shape

(207, 3)

In [8]:
lsa.components_.shape

(3, 510)

In [9]:
# print top terms for each topic
def print_top_terms(model, vocabulary, n_top_terms):
    for topic_idx, topic in enumerate(model.components_):
        message = "Topic #%d: " % topic_idx
        message += " ".join([vocabulary[i]
                             for i in topic.argsort()[:-n_top_terms - 1:-1]])
        print(message)
    print()

print('LSA topics based on term-document matrix:')
print_top_terms(lsa, tf_dictionary, 10)

LSA topics based on term-document matrix:
Topic #0: wa room bed small shower comfortable clean cold bathroom hotel
Topic #1: room small far didnt bit safe filthy old screen book
Topic #2: bed comfortable queen 2 requested comfy terrible far size 1



In [10]:
lsa.fit_transform(tfidf_sparse)
print('LSA topics based on tfidf matrix:')
print_top_terms(lsa, tfidf_dictionary, 10)

LSA topics based on tfidf matrix:
Topic #0: wa room clean bed comfortable small spacious cold bathroom comfy
Topic #1: bed comfortable comfy terrible mattress super lovely minimalist pillow selection
Topic #2: small room extra extremely bathroom open suitcase spacious practically cold



In [11]:
# Non-negative matrix factorization (MNF)
from sklearn.decomposition import NMF
nmf = NMF(n_components=3, random_state=1, alpha=.1, l1_ratio=.5)  # alpha and l1 related to regularization
nmf

NMF(alpha=0.1, l1_ratio=0.5, n_components=3, random_state=1)

In [12]:
nmf.fit_transform(tf_sparse)
print('NMF topics based on term-document matrix:')
print_top_terms(nmf, tf_dictionary, 10)

NMF topics based on term-document matrix:
Topic #0: wa shower cold way bathroom clean water poor pressure night
Topic #1: room small didnt spacious far bit old screen filthy safe
Topic #2: bed comfortable queen 2 wa requested comfy terrible size sleeping



In [13]:
nmf.fit_transform(tfidf_sparse)
print('NMF topics based on tfidf matrix:')
print_top_terms(nmf, tfidf_dictionary, 10)

NMF topics based on tfidf matrix:
Topic #0: clean wa hotel floor facility bathtub room renovation hip tidy
Topic #1: bed comfortable wa comfy terrible lovely super minimalist spacious mattress
Topic #2: room small wa extremely extra cold spacious bathroom especially inviting



In [14]:
# Latent Dirichlet allocation (LDA)
from sklearn.decomposition import LatentDirichletAllocation
lda = LatentDirichletAllocation(
    n_components=3, random_state=1, learning_method='online', learning_offset=50.)
lda

LatentDirichletAllocation(learning_method='online', learning_offset=50.0,
                          n_components=3, random_state=1)

In [15]:
lda.fit_transform(tf_sparse)
print('LDA topics based on term-document matrix:')
print_top_terms(lda, tf_dictionary, 10)

LDA topics based on term-document matrix:
Topic #0: window clean ha towel just air 5 loud lift bed
Topic #1: wa room bed shower small comfortable clean cold noisy great
Topic #2: room didnt bathroom hotel door way sound use work mattress



In [16]:
lda.fit_transform(tfidf_sparse)
print('LDA topics based on tfidf matrix:')
print_top_terms(lda, tfidf_dictionary, 10)

LDA topics based on tfidf matrix:
Topic #0: clean wa comfortable bed room elevator window air towel shower
Topic #1: wa room great cold facility quiet bed spacious shower noisy
Topic #2: room small bathroom wa extra sound dirty poor extremely didnt



In [17]:
# Grid search for the best number of topics (and other parameters) for LDA
t0 = time()
from sklearn.model_selection import GridSearchCV
param_grid = {'n_components': [2,3,4,5,6,7,8,9,10]}
lda = LatentDirichletAllocation(random_state=1, learning_method='online', learning_offset=50.)
lda_grid = GridSearchCV(lda, param_grid)
lda_grid.fit(tf_sparse)
print("Computing time: %0.3f seconds." % (time() - t0))

Computing time: 10.317 seconds.


In [18]:
print("Best model's params: ", lda_grid.best_params_)
print("Best log likelihood score: ", lda_grid.best_score_)
print("Model perplexity: ", lda_grid.best_estimator_.perplexity(tf_sparse))

Best model's params:  {'n_components': 2}
Best log likelihood score:  -1702.5776377326442
Model perplexity:  469.31129973020967


In [19]:
lda = LatentDirichletAllocation(
    n_components=2, random_state=1, learning_method='online', learning_offset=50.)
lda

LatentDirichletAllocation(learning_method='online', learning_offset=50.0,
                          n_components=2, random_state=1)

In [20]:
lda.fit_transform(tf_sparse)
print('LDA topics based on term-document matrix:')
print_top_terms(lda, tf_dictionary, 10)

LDA topics based on term-document matrix:
Topic #0: bathroom hotel door didnt use window mattress glass clean work
Topic #1: wa room bed shower small clean comfortable noisy bathroom cold



In [21]:
lda.fit_transform(tfidf_sparse)
print('LDA topics based on tfidf matrix:')
print_top_terms(lda, tfidf_dictionary, 10)

LDA topics based on tfidf matrix:
Topic #0: comfortable bed wa clean hotel bathroom door elevator didnt window
Topic #1: room wa small clean bed shower noisy great cold facility

