In [2]:
#Importing Libraries
import numpy as np
import pandas as pd

In [3]:
# Load the tab-separated reviews file.
# quoting=3 is csv.QUOTE_NONE: quote characters inside a review stay plain text.
dataset = pd.read_csv("Restaurant_Reviews.tsv", sep="\t", quoting=3)
dataset.head()


Unnamed: 0,Review,Liked
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1


In [4]:
#Inspecting Data
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Review  1000 non-null   object
 1   Liked   1000 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 15.8+ KB


In [5]:
dataset.dtypes

Review    object
Liked      int64
dtype: object

- The dataset has only two columns: one is textual (`Review`) and the other is a binary categorical label (`Liked`, 0 or 1).

In [7]:
dataset.groupby('Liked').size()

Liked
0    500
1    500
dtype: int64

- The dataset is balanced (500 reviews in each class).
- It doesn't have any null values.

In [8]:
dataset.isnull().sum()

Review    0
Liked     0
dtype: int64

- Clean and process a single review first, then create a for loop to clean all 1000 reviews.

In [9]:
#first review
dataset['Review'][0]

'Wow... Loved this place.'

In [27]:
# Strip digits and punctuation: every character that is not a letter becomes a space
import re
non_letters = re.compile('[^a-zA-Z]')
review = non_letters.sub(' ', dataset['Review'][0])
print(review)

Wow    Loved this place 


- Convert each word to a single case, lower or upper (lower case is preferred).
- It is pointless to treat the same word in different cases as different words (e.g. 'LOVE' and 'LoVe').

In [28]:
# Convert the string to lower 

review = review.lower()
review

'wow    loved this place '

In [12]:
import nltk 
nltk.download('stopwords')#download stopwords
from nltk.corpus import stopwords


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\vivek\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


In [21]:
# stopwords.words('english')
len(stopwords.words('english'))

179

- NLTK's English stop-word list contains 179 words in total.

In [29]:
review = review.split()
review

['wow', 'loved', 'this', 'place']

In [30]:
# Drop stop words from the tokenised review.
# The stop-word set is built once up front; rebuilding it inside the
# comprehension would reload and re-hash the whole list for every word.
english_stopwords = set(stopwords.words('english'))

review1 = [word for word in review if word not in english_stopwords]
review1

['wow', 'loved', 'place']

# Stemming

- convert word to its root word

Example:loved--love, stopped--stop


In [31]:
# Reduce every remaining word to its root form with the Porter stemmer (e.g. loved -> love)

from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()

review1 = list(map(ps.stem, review1))
review1

['wow', 'love', 'place']

In [32]:
# Convert list to string 

review2 = ' '.join(review1)
review2

'wow love place'

# Count-Vectorizer()

- This will construct the vocabulary of the bag-of-words model and transform the sentences into sparse feature vectors.

In [33]:
from sklearn.feature_extraction.text import CountVectorizer

# Demo on the single cleaned review: keep at most the 3 most frequent terms
cv = CountVectorizer(max_features=3)

# One-document corpus holding just the cleaned review
corpus1 = [review2]
print(review2)
print(corpus1)

# Learn the vocabulary and encode the document as per-term counts
X = cv.fit_transform(corpus1)
print(X.toarray())

wow love place
['wow love place']
[[1 1 1]]


- Now the textual data is preprocessed and converted into a numerical format, which we can use for an ML model.

In [34]:
dataset.shape

(1000, 2)

In [39]:
dataset.tail()

Unnamed: 0,Review,Liked
995,I think food should have flavor and texture an...,0
996,Appetite instantly gone.,0
997,Overall I was not impressed and would not go b...,0
998,"The whole experience was underwhelming, and I ...",0
999,"Then, as if I hadn't wasted enough of my life ...",0


In [40]:
# Preprocess every review: keep letters only, lower-case, tokenise,
# drop stop words, stem, and re-join into one cleaned string per review.
#
# NOTE: the stock NLTK stop-word list also removes negations, so
# "Crust is not good." is reduced to "crust good". Consider keeping words
# such as "not" if the lost negation hurts the classifier.

# Built once for the whole loop instead of once per word.
english_stopwords = set(stopwords.words('english'))

corpus = []
# Iterate over the column itself rather than a hard-coded range(0, 1000),
# so the cell keeps working if the number of rows changes.
for raw_review in dataset['Review']:
    review = re.sub('[^a-zA-Z]', ' ', raw_review)
    review = review.lower()
    review = review.split()
    review = [ps.stem(word) for word in review if word not in english_stopwords]
    review = ' '.join(review)
    corpus.append(review)

In [41]:
# Sanity check: after the loop `review` holds the last cleaned review (a joined string)
# and `corpus` is the list that collected every cleaned review.
print("Review Type: ",type(review))
print("Corpus Type: ",type(corpus))

Review Type:  <class 'str'>
Corpus Type:  <class 'list'>


In [42]:
# Wrap the cleaned reviews in a DataFrame, one row per review.
# No column name is given, so the single column gets the default label 0.
corpus_dataset = pd.DataFrame(corpus)
corpus_dataset.head()

Unnamed: 0,0
0,wow love place
1,crust good
2,tasti textur nasti
3,stop late may bank holiday rick steve recommen...
4,select menu great price


In [43]:
# Give the single default-named column (0) a meaningful name.
# rename() returns a new frame and ignores labels that are absent, so this
# cell can be re-run safely; the previous copy-then-drop approach raised a
# KeyError on a second run because column 0 no longer existed.
corpus_dataset = corpus_dataset.rename(columns={0: 'corpus'})
corpus_dataset.head()

Unnamed: 0,corpus
0,wow love place
1,crust good
2,tasti textur nasti
3,stop late may bank holiday rick steve recommen...
4,select menu great price


In [45]:
# Save the pre-processed reviews for future reference.
# index=False: the default RangeIndex carries no information and would
# otherwise be written as an unnamed column ("Unnamed: 0" when read back).
corpus_dataset.to_csv("corpus.dataset.csv", index=False)


# Bag of words Model for Whole data

In [46]:
# Create a Bag of Words Model
# `cv` is rebound here: this vectorizer replaces the 3-feature demo instance
# created earlier and caps the vocabulary at 1500 terms.

from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features=1500)

In [47]:
X = cv.fit_transform(corpus).toarray()
X[0]

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [49]:
X

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

- A sparse (mostly zero) document-term matrix is created for the top 1500 terms.

In [50]:
# Number of terms the vectorizer kept.
# `vocabulary_` (term -> column index) exists in every scikit-learn release,
# whereas `get_feature_names()` was removed in newer versions.
# To list the selected terms themselves: sorted(cv.vocabulary_, key=cv.vocabulary_.get)
len(cv.vocabulary_)

1500

In [51]:
# X is a NumPy array, so take the target as a NumPy array as well.
# The column is selected by name rather than by position, so the label stays
# correct even if the column order of the file changes.
y = dataset['Liked'].values

# Splitting data into 80-20 ratio

In [52]:
# Hold out 20% of the rows as a test set; random_state is fixed so the split is repeatable.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=0)

# Naive Bayes

In [53]:
# Fit a Bernoulli Naive Bayes classifier on the training split using the
# library defaults (the echoed repr shows alpha=1.0 and binarize=0.0).
from sklearn.naive_bayes import BernoulliNB
classifier = BernoulliNB()
classifier.fit(X_train, y_train)

BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True)

In [54]:
y_pred = classifier.predict(X_test)

In [55]:
from sklearn.metrics import confusion_matrix,accuracy_score

In [56]:
confusion_matrix(y_test,y_pred)

array([[73, 24],
       [22, 81]], dtype=int64)

In [57]:
accuracy_score(y_test,y_pred)

0.77

- The model is neither too good nor too bad: it is 77% accurate in predicting whether a review is positive or negative.

# Check it on unseen Data

In [58]:
Review = "nice service"

# Apply the same cleaning used to build the training corpus (letters only,
# lower-case, stop-word removal, Porter stemming). The vectorizer's
# vocabulary holds stemmed tokens (e.g. "servic"), so raw words such as
# "service" would otherwise be silently ignored by cv.transform.
english_stopwords = set(stopwords.words('english'))
tokens = re.sub('[^a-zA-Z]', ' ', Review).lower().split()
cleaned_review = ' '.join(ps.stem(word) for word in tokens if word not in english_stopwords)
input1 = [cleaned_review]

# Encode with the already-fitted vectorizer (transform only, no refit)
input_data = cv.transform(input1).toarray()

input_pred = classifier.predict(input_data)

if input_pred[0]==1:
    print("Review is Positive")
else:
    print("Review is Negative")

Review is Positive


In [59]:
Review = "long waiting time"

# Apply the same cleaning used to build the training corpus (letters only,
# lower-case, stop-word removal, Porter stemming). The vectorizer's
# vocabulary holds stemmed tokens (e.g. "wait"), so raw words such as
# "waiting" would otherwise be silently ignored by cv.transform.
english_stopwords = set(stopwords.words('english'))
tokens = re.sub('[^a-zA-Z]', ' ', Review).lower().split()
cleaned_review = ' '.join(ps.stem(word) for word in tokens if word not in english_stopwords)
input1 = [cleaned_review]

# Encode with the already-fitted vectorizer (transform only, no refit)
input_data = cv.transform(input1).toarray()

input_pred = classifier.predict(input_data)

if input_pred[0]==1:
    print("Review is Positive")
else:
    print("Review is Negative")

Review is Negative
