# Import Necessary Library

In [1]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from nltk.stem import PorterStemmer
import re
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

# Reading The Data

In [2]:
df = pd.read_csv('/kaggle/input/restaurant-reviewstsv/Restaurant_Reviews.tsv', sep='\t')

# Explore The Data

In [3]:
df.head()

Unnamed: 0,Review,Liked
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1


In [4]:
df.shape

(1000, 2)

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Review  1000 non-null   object
 1   Liked   1000 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 15.8+ KB


In [6]:
df.duplicated().sum()

4

In [7]:
df.isnull().sum()

Review    0
Liked     0
dtype: int64

In [8]:
df['Liked'].value_counts()

Liked
1    500
0    500
Name: count, dtype: int64

**then we can say this data are balanced data**

# Cleaning the Data

In [9]:
df = df.drop_duplicates()

In [10]:
df.duplicated().sum()

0

**Make this column to know how many char at each review**

In [11]:
df['reviewcharcount'] = df['Review'].apply(len)

In [12]:
df

Unnamed: 0,Review,Liked,reviewcharcount
0,Wow... Loved this place.,1,24
1,Crust is not good.,0,18
2,Not tasty and the texture was just nasty.,0,41
3,Stopped by during the late May bank holiday of...,1,87
4,The selection on the menu was great and so wer...,1,59
...,...,...,...
995,I think food should have flavor and texture an...,0,66
996,Appetite instantly gone.,0,24
997,Overall I was not impressed and would not go b...,0,50
998,"The whole experience was underwhelming, and I ...",0,91


In [13]:
df['reviewcharcount'].max()

149

In [14]:
df.iloc[df['reviewcharcount'].idxmax()]

Review             I would avoid this place if you are staying in...
Liked                                                              0
reviewcharcount                                                   58
Name: 989, dtype: object

# Preprocessing the Data using NLTK

**Remove Undeeded Words ans stopwords from all sentences**

In [15]:
corpus = []
ps = PorterStemmer()
cv = CountVectorizer()
df = df.reset_index(drop=True)
for i in range(len(df)):
    sent = re.sub('[^a-zA-Z]'," ", df['Review'][i])
    sent = sent.lower()
    sent = [word for word in sent if word not in stopwords.words('english')]
    sent = ''.join(sent)
    sent = ps.stem(sent)
    corpus.append(sent)

In [16]:
corpus

['ww    lve h plce ',
 'cru  n g ',
 'n  n he exure w ju n ',
 'ppe b urng he le  bnk hl ff rck eve recenn n lve  ',
 'he elecn n he enu w gre n  were he prce ',
 'nw   geng ngr n  wn  n ph ',
 'hnel  n  e h freh  ',
 'he pe were lke rubber n u cul ell he h been e up he f e beng kep uner  wrer ',
 'he fre were gre  ',
 ' gre uch ',
 'ervce w ver prp ',
 'wul n g bck ',
 'he cher h n cre wh  ever n wh  h    ll ene up beng w verprce ',
 ' re he cpe c rvl  chcken  wh crnberr    ',
 ' w gue becue  w pre ure h w hun hr ',
 ' w hcke becue n gn nce ch nl ',
 'hghl recene ',
 'wre w  lle lw n ervce ',
 'h plce  n wrh ur e  le lne veg ',
 ' n lke  ll ',
 'he burr blh ',
 'he f  n zng ',
 'ervce  l cue ',
 ' cul cre le    he nerr  ju beuful ',
 ' he perfre ',
 'h  rgh    he re velve cke     hhh h uff   g ',
 '  he never brugh  l we ke fr ',
 'h hle n he wll h gre excn ree c  n frenl ff ',
 'k n hur  ge ur f nl   ble n reurn  f w luke wr  ur ever w runnng run lke he w ll verwhele ',
 'he wr w he 

In [17]:
cv.fit_transform(corpus).toarray().shape

(996, 1183)

# Splitting the Date

In [18]:
X = cv.fit_transform(corpus).toarray()
y = df['Liked']

In [19]:
X

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [20]:
y

0      1
1      0
2      0
3      1
4      1
      ..
991    0
992    0
993    0
994    0
995    0
Name: Liked, Length: 996, dtype: int64

In [21]:
X_train, x_test, Y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=43)

In [22]:
X_train.shape

(796, 1183)

In [23]:
x_test.shape

(200, 1183)

In [24]:
Y_train.shape

(796,)

In [25]:
y_test.shape

(200,)

# Build ML Model

**Naive Bayes Model**

In [26]:
from sklearn.naive_bayes import MultinomialNB

In [27]:
model = MultinomialNB()

In [28]:
model.fit(X_train,Y_train)

# Model Prediction

In [29]:
y_pred = model.predict(x_test)

In [30]:
y_pred

array([0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1,
       1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1,
       0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0,
       1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1,
       0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0,
       0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0,
       0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1,
       1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0,
       1, 0])

In [31]:
y_test.values

array([0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0,
       0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1,
       0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0,
       0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0,
       0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1,
       0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0,
       0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1,
       1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0,
       1, 0])

# Model Evalution

In [32]:
print(confusion_matrix(y_test,y_pred))

[[78 24]
 [23 75]]


In [33]:
print(accuracy_score(y_test,y_pred))

0.765


In [34]:
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.77      0.76      0.77       102
           1       0.76      0.77      0.76        98

    accuracy                           0.77       200
   macro avg       0.76      0.77      0.76       200
weighted avg       0.77      0.77      0.77       200

