# SENTIMENT ANALYSIS OF HOTEL REVIEWS

In [1]:
import numpy as np
import pandas as pd
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from keras.preprocessing.sequence import pad_sequences
import warnings
warnings.filterwarnings('ignore')

Using TensorFlow backend.


In [2]:

from numpy.random import seed
seed(1)
from tensorflow import set_random_seed
set_random_seed(2)
from numpy.random import seed
seed(1)
from tensorflow import set_random_seed
set_random_seed(2)
colnames=['REVIEW','Label'] 
df= pd.read_csv('dataset.csv', names=colnames, header=None)
print(df.head())
X=df['REVIEW']
y=df['Label']
X.isnull()
df['Label'] = df['Label'].apply(np.int64)

                       REVIEW  Label
0                   Nice view      1
1           Excellent Service      1
2                  Beautiful       1
3  About food and hospitality      1
4                Good service      1


In [3]:
from sklearn.model_selection import train_test_split

In [4]:
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.3,random_state=42)

# TF IDF VECTORIZER

In [5]:
vect = TfidfVectorizer()
#vect = CountVectorizer(stop_words='english', ngram_range = (1,1), max_df = .80, min_df = 4)
X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=1, test_size= 0.2)
#Using training data to transform text into counts of features for each message
vect.fit(X_train)
X_train_dtm = vect.transform(X_train) 
X_test_dtm = vect.transform(X_test)

# K NEIGHBORS CLASSIFIER

In [6]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics
krange=range(1,26)
scores={}
scorelist=[]
for k in krange:
    knn=KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train_dtm,y_train)
    y_pred=knn.predict(X_test_dtm)
    scores[k]=metrics.accuracy_score(y_test,y_pred)
    scorelist.append(metrics.accuracy_score(y_test,y_pred))

In [7]:
print(scorelist)

[0.782608695652174, 0.6521739130434783, 0.6956521739130435, 0.6521739130434783, 0.782608695652174, 0.782608695652174, 0.782608695652174, 0.8260869565217391, 0.8260869565217391, 0.782608695652174, 0.7391304347826086, 0.7391304347826086, 0.6956521739130435, 0.5217391304347826, 0.6086956521739131, 0.6521739130434783, 0.6956521739130435, 0.6521739130434783, 0.6956521739130435, 0.6521739130434783, 0.6086956521739131, 0.6086956521739131, 0.4782608695652174, 0.5217391304347826, 0.43478260869565216]


# HIGHEST ACCURACY  OF KNN AT K=8

In [8]:
KNN = KNeighborsClassifier(n_neighbors = 8)
KNN.fit(X_train_dtm, y_train)
y_pred = KNN.predict(X_test_dtm)
print('\nK Nearest Neighbors (NN = 8)')
print('Accuracy Score: ',metrics.accuracy_score(y_test,y_pred)*100,'%',sep='')
print('Confusion Matrix: ',metrics.confusion_matrix(y_test,y_pred), sep = '\n')


K Nearest Neighbors (NN = 8)
Accuracy Score: 82.6086956521739%
Confusion Matrix: 
[[10  4]
 [ 0  9]]


# MULTINOMIAL NB

In [9]:
NB = MultinomialNB()
NB.fit(X_train_dtm, y_train)
y_pred = NB.predict(X_test_dtm)
print('\nNaive Bayes')
print('Accuracy Score: ',metrics.accuracy_score(y_test,y_pred)*100,'%',sep='')
print('Confusion Matrix: ',metrics.confusion_matrix(y_test,y_pred), sep = '\n')


Naive Bayes
Accuracy Score: 69.56521739130434%
Confusion Matrix: 
[[7 7]
 [0 9]]


# LOGISTIC REGRESSION

In [10]:
LR = LogisticRegression()
LR.fit(X_train_dtm, y_train)
y_pred = LR.predict(X_test_dtm)
print('\nLogistic Regression')
print('Accuracy Score: ',metrics.accuracy_score(y_test,y_pred)*100,'%',sep='')
print('Confusion Matrix: ',metrics.confusion_matrix(y_test,y_pred), sep = '\n')


Logistic Regression
Accuracy Score: 60.86956521739131%
Confusion Matrix: 
[[5 9]
 [0 9]]


# SVM

In [11]:
SVM = LinearSVC()
SVM.fit(X_train_dtm, y_train)
y_pred = SVM.predict(X_test_dtm)
print('\nSupport Vector Machine')
print('Accuracy Score: ',metrics.accuracy_score(y_test,y_pred)*100,'%',sep='')
print('Confusion Matrix: ',metrics.confusion_matrix(y_test,y_pred), sep = '\n')


Support Vector Machine
Accuracy Score: 82.6086956521739%
Confusion Matrix: 
[[10  4]
 [ 0  9]]


# Using Keras and Deep Learning

In [12]:
import keras
from keras.models import Sequential
from keras.layers import Dense, LSTM
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence
from tensorflow.python.keras.preprocessing.text import Tokenizer
import collections

In [13]:
X1 = np.array(X)
Y1 = np.array(y)

max_fatures = 2000
tokenizer = Tokenizer(num_words=max_fatures, split=' ')
tokenizer.fit_on_texts(df['REVIEW'].values)
X1 = tokenizer.texts_to_sequences(df['REVIEW'].values)
X1 = pad_sequences(X1)

In [14]:

max_words = 5000


embedding_vector_length = 32

# Define the layers in the model
model = Sequential()
model.add(Embedding(max_words, embedding_vector_length))
model.add(LSTM(200))

model.add(Dense(1, activation='sigmoid'))

print("Model created.")

Instructions for updating:
Colocations handled automatically by placer.
Model created.


In [15]:
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

print("Model compiled, ready to be fit to the training data.")


Model compiled, ready to be fit to the training data.


In [16]:
print(model.summary())

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, None, 32)          160000    
_________________________________________________________________
lstm_1 (LSTM)                (None, 200)               186400    
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 201       
Total params: 346,601
Trainable params: 346,601
Non-trainable params: 0
_________________________________________________________________
None


In [17]:
X_train,X_test,y_train,y_test=train_test_split(X1,y,test_size=0.25,)

In [18]:
model.fit(X_train, y_train, epochs=50, batch_size=64)


Instructions for updating:
Use tf.cast instead.
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0x7fd0410fa1d0>

In [19]:

model_scores = model.evaluate(X_test, y_test, verbose=0)

# Print out the accuracy of the model on the test set
print("Model accuracy on the test dataset: {0:.2f}%".format(model_scores[1]*100))

Model accuracy on the test dataset: 79.31%
