## Key Topics Extraction and Contextual Sentiment of Hotel Reviews

### 1. Preprocess

In [21]:
import os
from os import listdir
import pandas as pd
import plotly.graph_objs as go
df = pd.read_csv('svm_pred.csv', header=0)
df.columns

Index(['text', 'pred_category', 'true_label'], dtype='object')

In [22]:
df.drop(columns=['pred_category', 'true_label'], inplace = True)

In [23]:
f= open("hotel_review.txt","w+")
for i in range(df.shape[0]):
    f.write(df.text[i])
    f.write("\n")
f.close()

In [24]:
with open("hotel_review.txt") as fp: 
    reviews = []
    Lines = fp.readlines() 
    for line in Lines[:5]: 
        reviews.append(line)

In [25]:
reviews

['pleasant enough Stayed at the\n',
 'Singel for 2 nights for a football trip, the place is easy to find, in a good location, near the station,on the Singel canal, next to a church.\n',
 'Room was comfy and very well heated if a little basic, breakfast was good a variety of cereals, breads, cheese, meats, etc, boiled eggs, juice, teacoffeeAll in all a pleasant enough place for a couple of days, mind you we didnt get to bed until 3:30 the first night and well after 4 on the second.\n',
 'The guy I roomed with works in pest control and was a stickler about vermin bed bugs etc before we went and he had no complaints unlike some of our other guys who stayed elsewhere in the city so read into that what you will\n',
 'Exceeded our expectations!\n']

In [26]:
from gensim import corpora
from nltk.corpus import stopwords 
from nltk.stem.wordnet import WordNetLemmatizer
import string
stop = set(stopwords.words('english'))
exclude = set(string.punctuation) 
lemma = WordNetLemmatizer()
def clean(reviews):
    stop_free = " ".join([i for i in reviews.lower().split() if i not in stop])
    punc_free = ''.join(ch for ch in stop_free if ch not in exclude)
    normalized = " ".join(lemma.lemmatize(word) for word in punc_free.split())
    return normalized

reviews_clean = [clean(reviews).split() for reviews in reviews]    

dictionary = corpora.Dictionary(reviews_clean)

In [27]:
reviews_clean

[['pleasant', 'enough', 'stayed'],
 ['singel',
  '2',
  'night',
  'football',
  'trip',
  'place',
  'easy',
  'find',
  'good',
  'location',
  'near',
  'stationon',
  'singel',
  'canal',
  'next',
  'church'],
 ['room',
  'comfy',
  'well',
  'heated',
  'little',
  'basic',
  'breakfast',
  'good',
  'variety',
  'cereal',
  'bread',
  'cheese',
  'meat',
  'etc',
  'boiled',
  'egg',
  'juice',
  'teacoffeeall',
  'pleasant',
  'enough',
  'place',
  'couple',
  'day',
  'mind',
  'didnt',
  'get',
  'bed',
  '330',
  'first',
  'night',
  'well',
  '4',
  'second'],
 ['guy',
  'roomed',
  'work',
  'pest',
  'control',
  'stickler',
  'vermin',
  'bed',
  'bug',
  'etc',
  'went',
  'complaint',
  'unlike',
  'guy',
  'stayed',
  'elsewhere',
  'city',
  'read'],
 ['exceeded', 'expectation']]

In [28]:
dictionary

<gensim.corpora.dictionary.Dictionary at 0x1a28684f28>

In [29]:
import gensim
from gensim import corpora

# Creating the term dictionary of our courpus, where every unique term is assigned an index. dictionary = corpora.Dictionary(doc_clean)

reviews_term_matrix = [dictionary.doc2bow(reviews) for reviews in reviews_clean]

In [30]:
print(reviews_term_matrix)
for i in reviews_term_matrix:
    print(i)

[[(0, 1), (1, 1), (2, 1)], [(3, 1), (4, 1), (5, 1), (6, 1), (7, 1), (8, 1), (9, 1), (10, 1), (11, 1), (12, 1), (13, 1), (14, 1), (15, 2), (16, 1), (17, 1)], [(0, 1), (1, 1), (9, 1), (13, 1), (14, 1), (18, 1), (19, 1), (20, 1), (21, 1), (22, 1), (23, 1), (24, 1), (25, 1), (26, 1), (27, 1), (28, 1), (29, 1), (30, 1), (31, 1), (32, 1), (33, 1), (34, 1), (35, 1), (36, 1), (37, 1), (38, 1), (39, 1), (40, 1), (41, 1), (42, 1), (43, 1), (44, 2)], [(2, 1), (21, 1), (32, 1), (45, 1), (46, 1), (47, 1), (48, 1), (49, 1), (50, 2), (51, 1), (52, 1), (53, 1), (54, 1), (55, 1), (56, 1), (57, 1), (58, 1)], [(59, 1), (60, 1)]]
[(0, 1), (1, 1), (2, 1)]
[(3, 1), (4, 1), (5, 1), (6, 1), (7, 1), (8, 1), (9, 1), (10, 1), (11, 1), (12, 1), (13, 1), (14, 1), (15, 2), (16, 1), (17, 1)]
[(0, 1), (1, 1), (9, 1), (13, 1), (14, 1), (18, 1), (19, 1), (20, 1), (21, 1), (22, 1), (23, 1), (24, 1), (25, 1), (26, 1), (27, 1), (28, 1), (29, 1), (30, 1), (31, 1), (32, 1), (33, 1), (34, 1), (35, 1), (36, 1), (37, 1), (38, 

In [31]:
# Creating the object for LDA model using gensim library
Lda = gensim.models.ldamodel.LdaModel

# Running and Trainign LDA model on the document term matrix.
ldamodel = Lda(reviews_term_matrix, num_topics=10, id2word = dictionary, passes=7)

In [32]:
print(ldamodel.print_topics(num_topics=10, num_words=7))

[(0, '0.016*"exceeded" + 0.016*"expectation" + 0.016*"stayed" + 0.016*"enough" + 0.016*"pleasant" + 0.016*"singel" + 0.016*"guy"'), (1, '0.016*"exceeded" + 0.016*"expectation" + 0.016*"enough" + 0.016*"pleasant" + 0.016*"stayed" + 0.016*"place" + 0.016*"night"'), (2, '0.018*"well" + 0.017*"day" + 0.017*"4" + 0.017*"second" + 0.017*"get" + 0.017*"basic" + 0.017*"comfy"'), (3, '0.016*"exceeded" + 0.016*"enough" + 0.016*"expectation" + 0.016*"pleasant" + 0.016*"stayed" + 0.016*"singel" + 0.016*"place"'), (4, '0.095*"singel" + 0.050*"church" + 0.050*"location" + 0.050*"good" + 0.050*"stationon" + 0.050*"find" + 0.050*"next"'), (5, '0.016*"exceeded" + 0.016*"expectation" + 0.016*"enough" + 0.016*"stayed" + 0.016*"pleasant" + 0.016*"singel" + 0.016*"work"'), (6, '0.136*"expectation" + 0.136*"exceeded" + 0.012*"enough" + 0.012*"pleasant" + 0.012*"stayed" + 0.012*"singel" + 0.012*"place"'), (7, '0.037*"guy" + 0.037*"bed" + 0.037*"etc" + 0.037*"well" + 0.019*"roomed" + 0.019*"control" + 0.019*"