# Welcome to Restaurant Review Sentiment Analysis Project

In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score

In [2]:
df=pd.read_csv('restaurant_review.csv', encoding = "ISO-8859-1")

In [3]:
df.head()

Unnamed: 0,Restaurant,Reviewer,Review,Rating,Metadata,Time,Pictures
0,Beyond Flavours,Rusha Chakraborty,"The ambience was good, food was quite good . h...",5,"1 Review , 2 Followers",5/25/2019 15:54,0
1,Beyond Flavours,Anusha Tirumalaneedi,Ambience is too good for a pleasant evening. S...,5,"3 Reviews , 2 Followers",5/25/2019 14:20,0
2,Beyond Flavours,Ashok Shekhawat,A must try.. great food great ambience. Thnx f...,5,"2 Reviews , 3 Followers",5/24/2019 22:54,0
3,Beyond Flavours,Swapnil Sarkar,Soumen das and Arun was a great guy. Only beca...,5,"1 Review , 1 Follower",5/24/2019 22:11,0
4,Beyond Flavours,Dileep,Food is good.we ordered Kodi drumsticks and ba...,5,"3 Reviews , 2 Followers",5/24/2019 21:37,0


In [4]:
df.shape

(10000, 7)

### Cleaning & Preparing Data

In [5]:
df = df.drop(columns=["Restaurant","Reviewer","Metadata","Time","Pictures"])

In [6]:
df.head()

Unnamed: 0,Review,Rating
0,"The ambience was good, food was quite good . h...",5
1,Ambience is too good for a pleasant evening. S...,5
2,A must try.. great food great ambience. Thnx f...,5
3,Soumen das and Arun was a great guy. Only beca...,5
4,Food is good.we ordered Kodi drumsticks and ba...,5


In [7]:
y = df["Rating"]
X = df.drop(columns=["Rating"])
X.shape

(10000, 1)

In [8]:
y.isnull().sum()

38

In [9]:
y = y.replace({'Like':3})

In [10]:
y = pd.to_numeric(y)

In [11]:
y = y.fillna(y.median())

In [12]:
for i in range(0,len(y)):
    y.iloc[i] = round(y.iloc[i],0)

In [13]:
# for i in range(0,len(y)):
#     if (y[i]>=3):
#         y[i] = "Positive"
#     else:
#         y[i] = "Negative"

In [14]:
#Apply this for LSTM then move to Stemming
for i in range(0,len(y)):
    if (y[i]>=3):
        y[i] = 1
    else:
        y[i] = 0

In [15]:
y.unique()

array([1., 0.])

In [16]:
y.shape

(10000,)

### Applying NLP Processes

In [17]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()
corpus = []
for i in range(0, len(X)):
    review = re.sub('[^a-zA-Z]',' ', str(X['Review'][i]))
    review = review.lower() #Lowering the words is very important in avoiding classifying same words as different words.
    review = review.split()
    
    review = [ps.stem(word) for word in review if not word in stopwords.words('english')] #Eleminating words that do not put much value in sentences.
    review = ' '.join(review) #Reconstructing sentences
    corpus.append(review)

In [18]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features=9000) #After experimenting with 7500, 5000, 2500 ...9000 worked best.
X = cv.fit_transform(corpus).toarray()

### Train-Test-Split

In [19]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)

### Appling all algorithms and to decide which is best

#### Appling MultinomialNB

In [20]:
from sklearn.naive_bayes import MultinomialNB
restaurant_review_model = MultinomialNB().fit(X_train, y_train)
restaurant_review_model.fit(X_train, y_train)
y_pred = restaurant_review_model.predict(X_test)

from sklearn.metrics import confusion_matrix
confusion_m = confusion_matrix(y_test, y_pred)

print(confusion_m)

from sklearn.metrics import accuracy_score
accuracy = accuracy_score(y_test, y_pred)
print(accuracy)

[[ 457  127]
 [ 102 1814]]
0.9084


#### Appling Random Forest

In [21]:
from sklearn.ensemble import RandomForestClassifier
randomclassifier=RandomForestClassifier(n_estimators=200,criterion='entropy')
randomclassifier.fit(X_train,y_train)

y_pred = randomclassifier.predict(X_test)

from sklearn.metrics import confusion_matrix
confusion_m = confusion_matrix(y_test, y_pred)
print(confusion_m)

from sklearn.metrics import accuracy_score
accuracy = accuracy_score(y_test, y_pred)
print(accuracy)

[[ 387  197]
 [  66 1850]]
0.8948


#### Appling SVM

In [22]:
from sklearn import svm
clf = svm.SVC()
clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)

from sklearn.metrics import confusion_matrix
confusion_m = confusion_matrix(y_test, y_pred)
print(confusion_m)

from sklearn.metrics import accuracy_score
accuracy = accuracy_score(y_test, y_pred)
print(accuracy)

[[ 365  219]
 [  52 1864]]
0.8916


#### Appling KNN

In [23]:
from sklearn.neighbors import KNeighborsClassifier
classifier = KNeighborsClassifier(n_neighbors = 3, metric = 'minkowski', p = 2) 
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)

from sklearn.metrics import confusion_matrix
confusion_m = confusion_matrix(y_test, y_pred)
print(confusion_m)

from sklearn.metrics import accuracy_score
accuracy = accuracy_score(y_test, y_pred)

print(accuracy)

[[ 308  276]
 [ 189 1727]]
0.814


#### Among above four algorithms ,MultinomialNB has higher accuracy...

### Applying Bi-Directional LSTM with WordEmbedding

In [24]:
from tensorflow.keras.layers import Embedding
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Dense

In [25]:
### Vocabulary size
voc_size=5000

In [26]:
onehot_repr=[one_hot(words,voc_size)for words in corpus] 
onehot_repr

[[1772,
  3283,
  81,
  2145,
  3283,
  4718,
  3120,
  3294,
  255,
  3283,
  1100,
  4181,
  427,
  1227,
  1665,
  552,
  4929,
  4193,
  2512,
  2231,
  2253,
  2377,
  4147,
  1620],
 [1772,
  3283,
  2906,
  925,
  1196,
  3771,
  81,
  3283,
  3283,
  4460,
  2231,
  2253,
  2840,
  1196],
 [2744,
  1311,
  1641,
  81,
  1641,
  1772,
  4535,
  1196,
  2861,
  8,
  4829,
  4493,
  3696,
  4388,
  1402,
  1665,
  1757,
  4361,
  2111],
 [2231,
  2253,
  2566,
  1641,
  2811,
  1568,
  3600,
  3283,
  81,
  1022,
  4654,
  3959,
  32,
  1100],
 [81,
  3283,
  3472,
  2243,
  3217,
  123,
  1821,
  689,
  3283,
  2684,
  2861,
  4880,
  4393,
  1479,
  1772,
  1665,
  3283],
 [308,
  3283,
  1196,
  3283,
  81,
  1697,
  2789,
  1206,
  1196,
  81,
  3283,
  4589,
  3283,
  1375,
  3180,
  1365,
  3283,
  3383,
  3238],
 [2239,
  1100,
  1772,
  505,
  81,
  3472,
  596,
  1196,
  1665,
  2903,
  4621,
  32,
  1345,
  4393,
  2377,
  2744,
  32,
  1100],
 [4393,
  2735,
  4317,
  9

In [27]:
sent_length=40
embedded_docs=pad_sequences(onehot_repr,padding='pre',maxlen=sent_length)
print(embedded_docs)

[[   0    0    0 ... 2377 4147 1620]
 [   0    0    0 ... 2253 2840 1196]
 [   0    0    0 ... 1757 4361 2111]
 ...
 [ 435 4271 4916 ...  306 2778  279]
 [   0    0 4829 ...  105 1196 2803]
 [ 619 2426 3176 ... 3032  176 1772]]


In [28]:
embedded_docs[0]

array([   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0, 1772, 3283,   81, 2145, 3283, 4718,
       3120, 3294,  255, 3283, 1100, 4181,  427, 1227, 1665,  552, 4929,
       4193, 2512, 2231, 2253, 2377, 4147, 1620])

In [29]:
import numpy as np
X_final=np.array(embedded_docs)
y_final=np.array(y)

In [30]:
X_final.shape,y_final.shape

((10000, 40), (10000,))

In [31]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_final, y_final, test_size=0.25, random_state=0)

In [32]:
import tensorflow
from tensorflow.keras.layers import Bidirectional
import keras
from tensorflow.keras.layers import Dropout
## Creating model
embedding_vector_features=40
model=tensorflow.keras.Sequential()
model.add(Embedding(voc_size,embedding_vector_features,input_length=sent_length))
model.add(Dropout(0.3))
model.add(Bidirectional(LSTM(150)))
model.add(Dropout(0.3))
model.add(Dense(1,activation='sigmoid'))
model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])

Using TensorFlow backend.


In [33]:
model.fit(X_train,y_train,validation_data=(X_test,y_test),epochs=5,batch_size=64)

Train on 7500 samples, validate on 2500 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x1eae52bac18>

In [34]:
y_pred=model.predict_classes(X_test)

In [35]:
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test,y_pred)

array([[ 433,  151],
       [ 110, 1806]], dtype=int64)

In [36]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test,y_pred)

0.8956

## Still MultinomialNB has higher Accuracy as compared to all other algorithms

In [38]:
import pickle
pickle.dump(restaurant_review_model,open('model.pkl','wb'))
pickle.dump(cv,open('cv-model.pkl','wb'))