In [47]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")

In [53]:
data = pd.read_csv("Restaurant_Reviews (2) (1).tsv",sep='/t',quoting=3)
# quoting parameter will avoid any
data

Unnamed: 0,Review\tLiked
0,Wow... Loved this place.\t1
1,Crust is not good.\t0
2,Not tasty and the texture was just nasty.\t0
3,Stopped by during the late May bank holiday of...
4,The selection on the menu was great and so wer...
...,...
995,I think food should have flavor and texture an...
996,Appetite instantly gone.\t0
997,Overall I was not impressed and would not go b...
998,"The whole experience was underwhelming, and I ..."


In [4]:
data.head()

Unnamed: 0,Review\tLiked
0,Wow... Loved this place.\t1
1,Crust is not good.\t0
2,Not tasty and the texture was just nasty.\t0
3,Stopped by during the late May bank holiday of...
4,The selection on the menu was great and so wer...


In [5]:
data.tail()

Unnamed: 0,Review\tLiked
995,I think food should have flavor and texture an...
996,Appetite instantly gone.\t0
997,Overall I was not impressed and would not go b...
998,"The whole experience was underwhelming, and I ..."
999,"Then, as if I hadn't wasted enough of my life ..."


In [6]:
data.info

<bound method DataFrame.info of                                          Review\tLiked
0                          Wow... Loved this place.\t1
1                                Crust is not good.\t0
2         Not tasty and the texture was just nasty.\t0
3    Stopped by during the late May bank holiday of...
4    The selection on the menu was great and so wer...
..                                                 ...
995  I think food should have flavor and texture an...
996                        Appetite instantly gone.\t0
997  Overall I was not impressed and would not go b...
998  The whole experience was underwhelming, and I ...
999  Then, as if I hadn't wasted enough of my life ...

[1000 rows x 1 columns]>

In [7]:
data.shape

(1000, 1)

In [8]:
data.isnull().sum()

Review\tLiked    0
dtype: int64

In [9]:
data.dtypes

Review\tLiked    object
dtype: object

In [10]:
data.describe()

Unnamed: 0,Review\tLiked
count,1000
unique,996
top,I would not recommend this place.\t0
freq,2


# Text Cleaning

# Remove number and punchuations

In [13]:
import re # regular expression library
import nltk
nltk.download('stopword')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

[nltk_data] Error loading stopword: Package 'stopword' not found in
[nltk_data]     index


In [14]:
data['Review\tLiked'][0] # First review

'Wow... Loved this place.\t1'

In [15]:
Review = re.sub('[^a-zA-Z]',' ',data['Review\tLiked'][0])


In [16]:
print(Review)

Wow    Loved this place   


# Convert all the letters in lowercase

In [18]:
Review = re.sub('[^a-zA-Z]',' ',data['Review\tLiked'][0])
Review = Review.lower()

In [19]:
print(Review)

wow    loved this place   


# Removing the stopwords

In [21]:
# stopwords are the words
# that are non revelant in predicting wether a preview is positive or negative
# e.g the, and, in, a

In [22]:
# to remove the stopwords we will use the for loop

In [23]:
Review = re.sub('[^a-zA-Z]',' ',data['Review\tLiked'][0])
Review = Review.split()

In [24]:
Review

['Wow', 'Loved', 'this', 'place']

In [25]:
Review = re.sub('[^a-zA-Z]',' ',data['Review\tLiked'][0])
Review = Review.lower()
Review = Review.split()
Review = [word for word in Review if not word in set(stopwords.words('english'))]

In [26]:
print(Review)

['wow', 'loved', 'place']


# Stemming

In [28]:
Review = re.sub('[^a-zA-Z]',' ',data['Review\tLiked'][0])
Review = Review.lower()
Review = Review.split()
ps= PorterStemmer()
Review = [ps.stem(words) for words in Review if not words in set(stopwords.words('english'))]

In [29]:
print(Review)

['wow', 'love', 'place']


# Join the tokens

In [31]:
Review = re.sub('[^a-zA-Z]',' ',data['Review\tLiked'][0])
Review = Review.lower()
Review = Review.split()
ps= PorterStemmer()
Review = [ps.stem(words) for words in Review if not words in set(stopwords.words('english'))]
Review = ' '.join(Review)

In [32]:
print(Review)

wow love place


In [33]:
type(Review)

str

# Apply text cleaning to all the reviews in dataset

In [35]:
corpus = []
for i in range(0, 10):
    Review = re.sub('[^a-zA-Z]', ' ', data['Review\tLiked'][i])  # Remove non-letter characters
    Review = Review.lower()  # Convert to lowercase
    Review = Review.split()  # Split into words
    
    ps = PorterStemmer()  # Create stemmer instance
    Review = [ps.stem(word) for word in Review if word not in set(stopwords.words('english'))]  # Remove stopwords and stem
    Review = ' '.join(Review)  # Join back to a single string
    corpus.append(Review)

In [36]:
print(corpus)

['wow love place', 'crust good', 'tasti textur nasti', 'stop late may bank holiday rick steve recommend love', 'select menu great price', 'get angri want damn pho', 'honeslti tast fresh', 'potato like rubber could tell made ahead time kept warmer', 'fri great', 'great touch']


# Bag of words mode

In [38]:
# In Bag of words model we will take all the words in 1000 redviews
# and here we will select unique works, no repeatition

In [39]:
# then we will create one column to each unique word
# after that we will out all these columns in a table, where rows = 1000 reviews and columns = words

In [40]:
from sklearn.feature_extraction.text import CountVectorizer

In [41]:
cv = CountVectorizer(max_features=1500) #keep 500 hundred frequent word
x = cv.fit_transform(corpus).toarray()

In [42]:
data = pd.read_csv("Restaurant_Reviews (2) (1).tsv", sep="\t")
data = pd.read_csv("Restaurant_Reviews (2) (1).tsv", delimiter="\t")
data.columns = data.columns.str.strip()


In [43]:
print(data.shape)
print(data.columns.tolist())
print(data.head())

(1000, 2)
['Review', 'Liked']
                                              Review  Liked
0                           Wow... Loved this place.      1
1                                 Crust is not good.      0
2          Not tasty and the texture was just nasty.      0
3  Stopped by during the late May bank holiday of...      1
4  The selection on the menu was great and so wer...      1


In [44]:
y = data.iloc[:, 1].values

# Splitting the dataset into the Train and Test set

In [72]:

# Make sure x and y have the same number of samples
min_len = min(len(x), len(y))
x = x[:min_len]
y = y[:min_len]

# Now split the data
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=0)


In [74]:
print(x_train.shape, x_test.shape, y_train.shape, y_test.shape)

(8, 40) (2, 40) (8,) (2,)


# Train the model(Navie Bayes)

In [76]:
from sklearn.naive_bayes import GaussianNB

classifier = GaussianNB()
classifier.fit(x_train, y_train)

# Predict the Test set results

In [78]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [80]:
y_pred = classifier.predict(x_test)

In [88]:
from sklearn.metrics import confusion_matrix

# Assuming y_test and y_pred are already defined
conf_matrix = confusion_matrix(y_test, y_pred)
print(conf_matrix)



[[0 1]
 [0 1]]


In [90]:
# classification_report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00         1
           1       0.50      1.00      0.67         1

    accuracy                           0.50         2
   macro avg       0.25      0.50      0.33         2
weighted avg       0.25      0.50      0.33         2



In [92]:
accuracy_score(y_test, y_pred)

0.5