In [219]:
# importing the necessary libraries

In [201]:
import numpy as np
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

In [220]:
# importing the dataset

In [203]:
# Tab-separated training reviews. quoting=3 is csv.QUOTE_NONE, so quote
# characters inside a review are kept as literal text.
df = pd.read_csv('train.tsv', sep='\t', quoting=3)

In [221]:
# checking the size

In [204]:
# (rows, columns) of the training frame
df.shape

(900, 2)

In [205]:
# Preview the first rows to confirm the columns parsed as expected
df.head()

Unnamed: 0,Review,Liked
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1


In [222]:
# data preprocessing

In [206]:
# Fetch the NLTK English stop-word list (skipped by NLTK when already up to date).
nltk.download('stopwords')

# `stopwords` and `PorterStemmer` are already imported in the imports cell at
# the top of the notebook, so they are not imported a second time here.
ps = PorterStemmer()

# Keep "not" out of the stop-word list: it flips the meaning of a review
# ("not good"), so removing it from the text would discard sentiment signal.
# Filtering (rather than list.remove) cannot raise if the word is absent.
all_stopwords = [word for word in stopwords.words('english') if word != 'not']

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\tarun\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [208]:
corpus = []

# Build the lookup set once; the original rebuilt it for every word of every review.
stopword_set = set(all_stopwords)

# Iterate over the column itself instead of range(0, 900) so the loop always
# covers exactly the rows that were loaded, whatever the file size.
for text in df['Review']:
  review = re.sub('[^a-zA-Z]', ' ', text)  # anything that is not a letter becomes a space
  review = review.lower()
  review = review.split()
  # drop stop words, then reduce each remaining word to its Porter stem
  review = [ps.stem(word) for word in review if word not in stopword_set]
  review = ' '.join(review)
  corpus.append(review)

In [209]:
# Spot-check the first five cleaned reviews
corpus[0:5]

['wow love place',
 'crust not good',
 'not tasti textur nasti',
 'stop late may bank holiday rick steve recommend love',
 'select menu great price']

In [198]:
from sklearn.feature_extraction.text import CountVectorizer

# Upper bound on the bag-of-words vocabulary size (most frequent terms are kept).
MAX_FEATURES = 1420
cv = CountVectorizer(max_features=MAX_FEATURES)

In [210]:
# Learn the vocabulary from the cleaned reviews and build the dense count matrix.
# NOTE(review): the vectorizer is fitted on all reviews before the train/test
# split further down, so the vocabulary also reflects held-out rows — confirm
# this is acceptable for the reported scores.
term_counts = cv.fit_transform(corpus)
x = term_counts.toarray()

# Labels are taken from the last column of the training frame.
labels = df.iloc[:, -1]
y = labels.values

In [211]:
from sklearn.model_selection import train_test_split

In [212]:
# Keep 80% of the reviews for training; the fixed seed makes the split repeatable.
xtrain, xtest, ytrain, ytest = train_test_split(
    x,
    y,
    train_size=0.80,
    random_state=0,
)

In [231]:
# using bernoulli naive bayes

In [213]:
from sklearn.naive_bayes import BernoulliNB

In [214]:
# Bernoulli naive Bayes with default settings, trained on the training split.
bn = BernoulliNB()
bn.fit(X=xtrain, y=ytrain)

In [215]:
# Bernoulli predictions for the held-out reviews.
ypred = bn.predict(X=xtest)

In [216]:
from sklearn.metrics import classification_report

In [217]:
# Per-class precision / recall / F1 on the held-out split.
report = classification_report(ytest, ypred)
print(report)

              precision    recall  f1-score   support

           0       0.79      0.63      0.70        78
           1       0.75      0.87      0.81       102

    accuracy                           0.77       180
   macro avg       0.77      0.75      0.75       180
weighted avg       0.77      0.77      0.76       180



In [218]:
from sklearn.metrics import confusion_matrix

# Rows are the true labels, columns are the predicted labels.
confusion_matrix(y_true=ytest, y_pred=ypred)

array([[49, 29],
       [13, 89]], dtype=int64)

In [230]:
# using gaussian naive bayes

In [225]:
from sklearn.naive_bayes import GaussianNB

In [226]:
# Gaussian naive Bayes with default settings, used as the comparison model.
gb = GaussianNB()

In [227]:
# Train the Gaussian model on the same training split as the Bernoulli model.
gb.fit(X=xtrain, y=ytrain)

In [228]:
# NOTE(review): this rebinds `ypred`, replacing the Bernoulli predictions.
# Re-running an earlier Bernoulli metric cell after this one would silently
# score the Gaussian model instead.
ypred = gb.predict(xtest)

In [229]:
# Per-class precision / recall / F1 for the current `ypred` on the held-out split.
report = classification_report(ytest, ypred)
print(report)

              precision    recall  f1-score   support

           0       0.58      0.87      0.69        78
           1       0.84      0.51      0.63       102

    accuracy                           0.67       180
   macro avg       0.71      0.69      0.66       180
weighted avg       0.72      0.67      0.66       180



In [232]:
# Comparing the two models: Bernoulli naive Bayes reaches higher accuracy (0.77)
# than Gaussian naive Bayes (0.67), so we proceed with the Bernoulli model.

In [None]:
# loading the unlabeled test dataset that we need predictions for

In [223]:
# Tab-separated reviews to be labelled. quoting=3 is csv.QUOTE_NONE, so quote
# characters inside a review are kept as literal text.
df1 = pd.read_csv('test.tsv', sep='\t', quoting=3)

In [224]:
# Display the reviews loaded from test.tsv
df1

Unnamed: 0,Review
0,Spend your money elsewhere.
1,Their regular toasted bread was equally satisf...
2,The Buffet at Bellagio was far from what I ant...
3,"And the drinks are WEAK, people!"
4,-My order was not correct.
...,...
95,I think food should have flavor and texture an...
96,Appetite instantly gone.
97,Overall I was not impressed and would not go b...
98,"The whole experience was underwhelming, and I ..."


In [234]:
# NOTE: this deliberately rebinds `corpus`; the cells that follow read it as
# the cleaned test reviews.
corpus = []

# Build the lookup set once; the original rebuilt it for every word of every review.
stopword_set = set(all_stopwords)

# Same cleaning steps as for the training reviews, so the tokens match the
# vocabulary the vectorizer was fitted on. Iterating over the column directly
# avoids label-based lookups, which assume the index is exactly 0..n-1.
for text in df1['Review']:
  review = re.sub('[^a-zA-Z]', ' ', text)  # anything that is not a letter becomes a space
  review = review.lower()
  review = review.split()
  # drop stop words, then reduce each remaining word to its Porter stem
  review = [ps.stem(word) for word in review if word not in stopword_set]
  review = ' '.join(review)
  corpus.append(review)

In [236]:
# Spot-check the first five cleaned test reviews
corpus[:5]

['spend money elsewher',
 'regular toast bread equal satisfi occasion pat butter mmmm',
 'buffet bellagio far anticip',
 'drink weak peopl',
 'order not correct']

In [237]:
# transform (not fit_transform): encode the test reviews with the vocabulary
# the vectorizer already learned, so the columns match what the model was trained on.
# NOTE(review): this rebinds `x`, which previously held the training matrix.
x = cv.transform(corpus).toarray()

In [256]:
# predicting the required target variable

In [239]:
# Label the test reviews with the fitted Bernoulli model.
# NOTE(review): this rebinds `y`, which previously held the training labels.
y = bn.predict(x)

In [248]:
# Wrap the predictions in a DataFrame.
# NOTE(review): self-referential rebinding — `y` changes type here, so this
# cell is order-dependent with the prediction cell before it.
y = pd.DataFrame(y)

In [254]:
# Attach the predictions by position. Flattening to a 1-D array works whether
# `y` is the raw prediction array or a single-column DataFrame, and avoids
# index alignment, which would insert NaN if df1's index were not 0..n-1.
df1['y'] = np.asarray(y).ravel()

In [255]:
# Display the test reviews together with the predicted label column `y`
df1

Unnamed: 0,Review,y
0,Spend your money elsewhere.,0
1,Their regular toasted bread was equally satisf...,1
2,The Buffet at Bellagio was far from what I ant...,1
3,"And the drinks are WEAK, people!",1
4,-My order was not correct.,0
...,...,...
95,I think food should have flavor and texture an...,0
96,Appetite instantly gone.,1
97,Overall I was not impressed and would not go b...,0
98,"The whole experience was underwhelming, and I ...",0
