In [1]:
import numpy as np
import pandas as pd

In [2]:
dataset = pd.read_csv("Restaurant_Reviews.tsv", delimiter='\t', quoting=3)

In [3]:
dataset.head()

Unnamed: 0,Review,Liked
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1


In [4]:
dataset.count()

Review    1000
Liked     1000
dtype: int64

In [5]:
import re
import nltk

In [6]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Vaish\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [7]:
from nltk.corpus import stopwords

In [8]:
from nltk.stem.porter import PorterStemmer

In [9]:
corpus = []
for i in range(0,1000):
    review = re.sub('[^a-zA-Z]', ' ', dataset['Review'][i])
    review = review.split()
    ps = PorterStemmer()
    all_stopwords = stopwords.words('english')
    all_stopwords.remove('not')
    review = [ps.stem(word) for word in review if not word in set(all_stopwords)]
    review = ' '.join(review)
    corpus.append(review)

#### Creating Bag of Words

In [10]:
from sklearn.feature_extraction.text import CountVectorizer

In [11]:
cv = CountVectorizer(max_features=1500)
X = cv.fit_transform(corpus).toarray()
y = dataset.iloc[:,1].values

#### Splitting the dataset into the Training Set and Test Set

In [12]:
from sklearn.model_selection import train_test_split

In [13]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)
text_train, text_test, senti_train, senti_test = train_test_split(dataset.iloc[:,0].values, y, test_size = 0.2, random_state = 0)

#### Training the Naive Bayes model on Training set

In [14]:
from sklearn.naive_bayes import GaussianNB

In [15]:
classifier = GaussianNB()

In [16]:
classifier.fit(X_train, y_train)

GaussianNB()

In [17]:
y_pred = classifier.predict(X_test)

#### Confusion Matrix 

In [18]:
from sklearn import metrics

In [19]:
cm = metrics.confusion_matrix(y_test, y_pred)
ac = metrics.accuracy_score(y_test, y_pred)
pr = metrics.precision_score(y_test, y_pred)

In [20]:
print('Confusion Matrix')
print(cm)

Confusion Matrix
[[55 42]
 [11 92]]


In [21]:
print('Accuracy Score is ', ac)

Accuracy Score is  0.735


In [22]:
print('Precision Score is ', pr)

Precision Score is  0.6865671641791045


#### Incorrect Predictions

In [23]:
print("******[0-Negative, 1-Positive]******")
n = 0
for document, label, predicted_label in zip(text_test, y_test, y_pred):
    for j in range(0,2):
        if label == j and predicted_label != j:
            for i in range(0, len(document[0])):
                n += 1
                print("Incorrect Review :", n)
                print("Actual Label:", +label)
                print("Predicted Label:",+predicted_label)
                print("Review:", document)
                print('\n')

******[0-Negative, 1-Positive]******
Incorrect Review : 1
Actual Label: 0
Predicted Label: 1
Review: the presentation of the food was awful.


Incorrect Review : 2
Actual Label: 0
Predicted Label: 1
Review: Worst food/service I've had in a while.


Incorrect Review : 3
Actual Label: 0
Predicted Label: 1
Review: Never again will I be dining at this place!


Incorrect Review : 4
Actual Label: 0
Predicted Label: 1
Review: The ambiance isn't much better.


Incorrect Review : 5
Actual Label: 0
Predicted Label: 1
Review: If the food isn't bad enough for you, then enjoy dealing with the world's worst/annoying drunk people.


Incorrect Review : 6
Actual Label: 0
Predicted Label: 1
Review: Will never, ever go back.


Incorrect Review : 7
Actual Label: 0
Predicted Label: 1
Review: The chains, which I'm no fan of, beat this place easily.


Incorrect Review : 8
Actual Label: 1
Predicted Label: 0
Review: This place is a jewel in Las Vegas, and exactly what I've been hoping to find in nearly ten yea