In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt


In [9]:
# Read the tab-separated reviews file. quoting=3 is csv.QUOTE_NONE, so quote
# characters inside a review are kept as ordinary text instead of being parsed.
df = pd.read_csv(
    "Restaurant_Reviews.tsv",
    sep='\t',
    quoting=3,
)

In [11]:
# Preview the first rows to confirm the Review and Liked columns parsed correctly
df.head()

Unnamed: 0,Review,Liked
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1


In [12]:
# Class distribution of the target column 'Liked'
df['Liked'].value_counts()

0    500
1    500
Name: Liked, dtype: int64

### Cleaning the Data

In [13]:
import nltk
import re

In [14]:
# Fetch the NLTK stop-word corpus used by nltk.corpus.stopwords in the cells that follow
nltk.download("stopwords")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Manju\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [15]:
from nltk.corpus import stopwords

In [17]:
# Take the first raw review as a worked example for the cleaning steps
df['Review'][0]

'Wow... Loved this place.'

In [19]:
# Replace every character that is not an ASCII letter with a space
re.sub("[^a-zA-Z]", ' ', df['Review'][0])

'Wow    Loved this place '

In [23]:
# Apply the same substitution, then lowercase so that 'Wow' and 'wow' become one token
review = re.sub("[^a-zA-Z]", ' ', df['Review'][0]).lower()
review

'wow    loved this place '

In [24]:
# Tokenize: split on runs of whitespace, which also discards the padding spaces left by re.sub.
# NOTE: this rebinds `review` from str to list, so the cell is not idempotent --
# running it a second time fails because a list has no .split().
review = review.split()
review

['wow', 'loved', 'this', 'place']

In [26]:
# Display the English stop-word list provided by NLTK
stopwords.words('english')

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [29]:
# Remove stop words using a list comprehension.
# The stop-word list is loaded once into a set: calling stopwords.words('english')
# inside the comprehension would re-evaluate it for every token, and a set gives
# constant-time membership tests instead of a linear scan of the list.
stop_words = set(stopwords.words('english'))
review_sw = [word for word in review if word not in stop_words]
review_sw

['wow', 'loved', 'place']

In [30]:
# Stemming: reduce each word to its stem with the Porter stemmer (e.g. 'loved' -> 'love')
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()

In [32]:
# Stem every token that survived stop-word removal
review_stem = list(map(ps.stem, review_sw))
review_stem

['wow', 'love', 'place']

In [35]:
# Re-join the stemmed tokens into a single space-separated string
review = " ".join(review_stem)
review

'wow love place'

In [37]:
# Build the cleaned corpus: one normalized string per review, in row order.
# Per review: keep letters only -> lowercase -> tokenize -> drop stop words
# -> stem -> re-join with single spaces.
#
# NOTE(review): stop-word removal also drops negations such as "not", so
# "Crust is not good." is reduced to 'crust good'. Consider keeping negations
# if sentiment polarity matters for the model.

# Load the stop words once; re-reading the list for every token of every
# review is needlessly slow, and a set gives constant-time membership tests.
stop_words = set(stopwords.words('english'))

corpus = []
for text in df['Review']:  # iterate values directly, independent of index labels
    review = re.sub("[^a-zA-Z]", ' ', text).lower()
    review = review.split()
    review_sw = [word for word in review if word not in stop_words]
    review_stem = [ps.stem(word) for word in review_sw]
    review = " ".join(review_stem)
    corpus.append(review)

In [39]:
# Spot-check the first five cleaned reviews
print(corpus[:5])

['wow love place', 'crust good', 'tasti textur nasti', 'stop late may bank holiday rick steve recommend love', 'select menu great price']


### Bag Of Words

![title](bag_of_words.jpg)

Image source: https://www.quora.com/What-is-the-bag-of-words-algorithm
Credit: Quora

The illustration above uses only two documents (records), whereas our dataset contains 1,000 documents, one per review.

In [40]:
from sklearn.feature_extraction.text import CountVectorizer

In [41]:
# Bag-of-words vectorizer with the vocabulary capped at 1500 terms
cv = CountVectorizer(max_features=1500)

In [42]:
# Learn the vocabulary from the corpus and build a dense document-term count matrix.
# NOTE(review): the vocabulary is fitted on all reviews before the train/test split,
# so words that occur only in the test rows still shape the feature space.
x = cv.fit_transform(corpus).toarray()

In [44]:
# (number of documents, number of vocabulary terms)
x.shape

(1000, 1500)

In [45]:
# Target vector. Select the label column by name rather than by position so the
# code keeps working (and fails loudly otherwise) if columns are added or reordered.
y = df['Liked'].values

In [51]:
# One label per document
y.shape

(1000,)

### Applying the Naive Bayes Algorithm

In [54]:
# Split the dataset: hold out 20% of the rows for testing.
# random_state pins the shuffle so the split is reproducible.
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=0,
)

In [56]:
# Sanity-check the feature-matrix shapes of the train and test splits
x_train.shape, x_test.shape

((800, 1500), (200, 1500))

In [57]:
# Label vectors must have the same row counts as the matching feature matrices
y_train.shape, y_test.shape

((800,), (200,))

In [58]:
from sklearn.naive_bayes import GaussianNB

In [59]:
# Gaussian Naive Bayes with default settings.
# NOTE(review): the features are word counts; MultinomialNB is the Naive Bayes
# variant intended for count data and is worth comparing against this baseline.
classifier = GaussianNB()

In [60]:
# Fit the model on the training split only
classifier.fit(x_train,y_train)

GaussianNB()

In [61]:
# Predict labels for the held-out reviews
y_pred = classifier.predict(x_test)

In [62]:
from sklearn.metrics import accuracy_score

In [63]:
# Fraction of held-out reviews whose predicted label matches the true label
accuracy_score(y_test,y_pred)

0.73