# Restaurant Reviews Text Classification (Project)

In [4]:
# Importing necessary Libraries

In [5]:
import numpy as np
import pandas as pd
import seaborn as sb
import matplotlib.pyplot as plt

In [6]:
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings('ignore')
import re
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

In [7]:
# Loading the dataset

In [8]:
# Load the raw restaurant dataset.
# A raw string keeps the Windows backslashes literal; the previous non-raw
# literal relied on invalid escape sequences such as "\D", which newer Python
# versions warn about. The resulting path is unchanged.
# NOTE(review): this is an absolute, machine-specific path - consider a
# configurable data directory so the notebook runs elsewhere.
data = pd.read_csv(r"D:\Data science\Datasets\restaurant_data.csv")

In [9]:
data.head() 

Unnamed: 0,name,online_order,book_table,rate,votes,rest_type,cuisines,approx_cost(for two people),reviews_list,menu_item,listed_in(type)
0,Eat & Joy Restaurant,Yes,No,3.0/5,7,,"Fast Food, Finger Food",300,"[('Rated 1.0', 'RATED\n I had ordered for one...",[],Delivery
1,Corner Huts,Yes,No,3.8/5,30,Quick Bites,"North Indian, Chinese, Biryani, South Indian",300,"[('Rated 4.0', 'RATED\n I was at this place f...","['Tandoori Chicken', 'Paneer Tikka Masala', 'B...",Dine-out
2,Oregano Soul Food,Yes,No,4.1/5,142,Cafe,Cafe,700,"[('Rated 5.0', 'RATED\n Loved the place. Awes...",[],Dine-out
3,Momo Time,Yes,No,,0,Quick Bites,"Momos, Chinese",300,[],[],Dine-out
4,Pepperwood,Yes,No,3.7 /5,136,Quick Bites,"Biryani, Rolls",300,"[('Rated 3.0', 'RATED\n Got delivery from thi...",[],Dine-out


In [10]:
data.shape

(25151, 11)

In [11]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25151 entries, 0 to 25150
Data columns (total 11 columns):
 #   Column                       Non-Null Count  Dtype 
---  ------                       --------------  ----- 
 0   name                         25151 non-null  object
 1   online_order                 25151 non-null  object
 2   book_table                   25151 non-null  object
 3   rate                         21428 non-null  object
 4   votes                        25151 non-null  int64 
 5   rest_type                    25051 non-null  object
 6   cuisines                     25129 non-null  object
 7   approx_cost(for two people)  24984 non-null  object
 8   reviews_list                 25151 non-null  object
 9   menu_item                    25151 non-null  object
 10  listed_in(type)              25151 non-null  object
dtypes: int64(1), object(10)
memory usage: 2.1+ MB


In [12]:
data.describe()

Unnamed: 0,votes
count,25151.0
mean,282.028031
std,778.0383
min,0.0
25%,7.0
50%,41.0
75%,198.0
max,16345.0


In [13]:
data.columns

Index(['name', 'online_order', 'book_table', 'rate', 'votes', 'rest_type',
       'cuisines', 'approx_cost(for two people)', 'reviews_list', 'menu_item',
       'listed_in(type)'],
      dtype='object')

### Data Cleaning 

In [14]:
# Checking for NA values

In [15]:
data.isna().sum()

name                              0
online_order                      0
book_table                        0
rate                           3723
votes                             0
rest_type                       100
cuisines                         22
approx_cost(for two people)     167
reviews_list                      0
menu_item                         0
listed_in(type)                   0
dtype: int64

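#### The 'rate' column, which is a very important variable, has many missing values, so rows with missing values are dropped (note: the `dropna` call below removes rows that have a missing value in any column, not only in 'rate')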

In [16]:
data.dropna(axis = 0, inplace = True)

In [17]:
data.isna().sum()

name                           0
online_order                   0
book_table                     0
rate                           0
votes                          0
rest_type                      0
cuisines                       0
approx_cost(for two people)    0
reviews_list                   0
menu_item                      0
listed_in(type)                0
dtype: int64

In [18]:
data.shape

(21240, 11)

In [19]:
# Removing the duplicate values if there are any

In [20]:
data.duplicated().sum()

1753

In [21]:
data.drop_duplicates(inplace= True)

In [22]:
data.shape

(19487, 11)

In [23]:
data.columns

Index(['name', 'online_order', 'book_table', 'rate', 'votes', 'rest_type',
       'cuisines', 'approx_cost(for two people)', 'reviews_list', 'menu_item',
       'listed_in(type)'],
      dtype='object')

In [24]:
# Reading Rate of dataset

In [25]:
data['rate'].unique()

array(['3.8/5', '4.1/5', '3.7 /5', '3.1/5', '4.4 /5', 'NEW', '3.6/5',
       '3.5/5', '4.3 /5', '3.8 /5', '3.4 /5', '3.4/5', '4.2/5', '4.0/5',
       '3.1 /5', '3.9 /5', '4.7/5', '3.6 /5', '2.9/5', '4.3/5', '4.1 /5',
       '3.9/5', '3.5 /5', '3.3/5', '3.7/5', '3.2 /5', '3.2/5', '4.0 /5',
       '4.2 /5', '2.8 /5', '2.3 /5', '4.4/5', '4.5 /5', '3.3 /5', '4.5/5',
       '3.0/5', '3.0 /5', '2.5/5', '2.5 /5', '2.7 /5', '2.3/5', '2.8/5',
       '4.6 /5', '2.9 /5', '2.7/5', '2.6/5', '-', '2.6 /5', '4.6/5',
       '4.7 /5', '4.9/5', '2.4 /5', '2.4/5', '4.9 /5', '1.8/5', '2.2/5',
       '4.8 /5', '4.8/5', '2.1/5', '2.2 /5', '2.0 /5', '2.1 /5', '2.0/5'],
      dtype=object)

In [26]:
# Removing the non-numeric placeholder ratings ('NEW' and '-')

In [27]:
data = data.loc[data.rate !='NEW']

In [28]:
data = data.loc[data.rate !='-'].reset_index(drop=True)

In [29]:
# Removing slash

In [30]:
def remove_slash(x):
    """Strip the '/5' suffix from a rating string.

    Parameters
    ----------
    x : object
        A raw rating value such as '3.8/5' or '3.7 /5'.

    Returns
    -------
    object
        The string with every '/5' occurrence removed (surrounding whitespace
        is left as-is); non-string values are returned unchanged.
    """
    # `np.str` was only an alias of the builtin `str` and has been removed from
    # NumPy (AttributeError on 1.24+), so test against the builtin directly.
    return x.replace('/5', '') if isinstance(x, str) else x

In [31]:
data.rate = data.rate.apply(remove_slash).str.strip().astype('float')

In [32]:
data['rate'].head()

0    3.8
1    4.1
2    3.7
3    3.1
4    4.4
Name: rate, dtype: float64

In [33]:
# Computing Mean Rating

In [34]:
restaurants = list(data['name'].unique())

In [35]:
data['Mean Rating'] = 0

In [36]:
# Per-restaurant mean of 'rate', broadcast back onto every row of that restaurant.
# A single vectorised assignment replaces the per-restaurant loop: the loop used
# chained indexing (data[col][mask] = ...), which raises SettingWithCopyWarning
# and may write to a temporary copy, and it rescanned the frame once per name.
data['Mean Rating'] = data.groupby('name')['rate'].transform('mean')

In [37]:
data.head()

Unnamed: 0,name,online_order,book_table,rate,votes,rest_type,cuisines,approx_cost(for two people),reviews_list,menu_item,listed_in(type),Mean Rating
0,Corner Huts,Yes,No,3.8,30,Quick Bites,"North Indian, Chinese, Biryani, South Indian",300,"[('Rated 4.0', 'RATED\n I was at this place f...","['Tandoori Chicken', 'Paneer Tikka Masala', 'B...",Dine-out,3.8
1,Oregano Soul Food,Yes,No,4.1,142,Cafe,Cafe,700,"[('Rated 5.0', 'RATED\n Loved the place. Awes...",[],Dine-out,4.1
2,Pepperwood,Yes,No,3.7,136,Quick Bites,"Biryani, Rolls",300,"[('Rated 3.0', 'RATED\n Got delivery from thi...",[],Dine-out,3.75
3,Dawat Midnight Express,Yes,No,3.1,29,"Takeaway, Delivery","North Indian, Chinese",450,"[('Rated 4.0', 'RATED\n Loved the chicken fri...","['Paneer Tikka', 'Paneer Chilli', 'Paneer Manc...",Delivery,3.1
4,RockSalt,Yes,Yes,4.4,498,"Bar, Casual Dining",Modern Indian,1900,"[('Rated 3.0', 'RATED\n Visited this place wi...",[],Pubs and bars,4.4


In [38]:
data['Mean Rating'].describe()

count    18563.000000
mean         3.718117
std          0.419172
min          1.800000
25%          3.444828
50%          3.700000
75%          4.000000
max          4.900000
Name: Mean Rating, dtype: float64

In [39]:
# Normalizing the rating values

In [40]:
from sklearn.preprocessing import MinMaxScaler

In [41]:
scaler = MinMaxScaler(feature_range = (1,5))

In [42]:
data[['Mean Rating']] = scaler.fit_transform(data[['Mean Rating']]).round(2)

In [43]:
data.sample(3)

Unnamed: 0,name,online_order,book_table,rate,votes,rest_type,cuisines,approx_cost(for two people),reviews_list,menu_item,listed_in(type),Mean Rating
1142,Donne Biriyani Mane,Yes,No,3.4,63,Quick Bites,"Biryani, Kebab",400,"[('Rated 3.0', 'RATED\n Visited here after sh...","['Mutton Combo', 'Biryani Rice with Kabab Comb...",Dine-out,3.17
5762,Tea Brew,Yes,No,4.1,1156,"Cafe, Casual Dining","Cafe, Continental, Momos, Italian, North Indian",500,"[('Rated 4.0', 'RATED\n We went here today ev...",[],Delivery,3.97
2794,The Egg Factory,Yes,No,4.0,1310,Casual Dining,"Continental, American",750,"[('Rated 4.0', 'RATED\n If you love eggs u wi...","['Street Style Egg Curry with Paratha', 'Spicy...",Delivery,3.9


In [44]:
data['Mean Rating'].describe()

count    18563.000000
mean         3.475005
std          0.540867
min          1.000000
25%          3.120000
50%          3.450000
75%          3.840000
max          5.000000
Name: Mean Rating, dtype: float64

In [45]:
# Keeping only the useful columns and removing the unnecessary columns

In [46]:
data.columns

Index(['name', 'online_order', 'book_table', 'rate', 'votes', 'rest_type',
       'cuisines', 'approx_cost(for two people)', 'reviews_list', 'menu_item',
       'listed_in(type)', 'Mean Rating'],
      dtype='object')

In [47]:
# Keep only the columns used for the text-classification task.
# .copy() makes data_new an independent frame: it is renamed and given new
# columns in later cells, and mutating a selection of `data` would otherwise
# raise SettingWithCopyWarning.
data_new = data[['name','rate','reviews_list','Mean Rating']].copy()

In [48]:
data_new.head()

Unnamed: 0,name,rate,reviews_list,Mean Rating
0,Corner Huts,3.8,"[('Rated 4.0', 'RATED\n I was at this place f...",3.58
1,Oregano Soul Food,4.1,"[('Rated 5.0', 'RATED\n Loved the place. Awes...",3.97
2,Pepperwood,3.7,"[('Rated 3.0', 'RATED\n Got delivery from thi...",3.52
3,Dawat Midnight Express,3.1,"[('Rated 4.0', 'RATED\n Loved the chicken fri...",2.68
4,RockSalt,4.4,"[('Rated 3.0', 'RATED\n Visited this place wi...",4.35


In [49]:
# Renaming the columns

In [50]:
# Rename to the shorter column names used in the rest of the notebook.
# Rebinding the result (instead of inplace=True) avoids mutating a frame that
# was derived from `data` and keeps the cell safe to re-run.
data_new = data_new.rename(columns={'rate':'ratings', 'reviews_list' : 'reviews'})

In [51]:
data_new.head()

Unnamed: 0,name,ratings,reviews,Mean Rating
0,Corner Huts,3.8,"[('Rated 4.0', 'RATED\n I was at this place f...",3.58
1,Oregano Soul Food,4.1,"[('Rated 5.0', 'RATED\n Loved the place. Awes...",3.97
2,Pepperwood,3.7,"[('Rated 3.0', 'RATED\n Got delivery from thi...",3.52
3,Dawat Midnight Express,3.1,"[('Rated 4.0', 'RATED\n Loved the chicken fri...",2.68
4,RockSalt,4.4,"[('Rated 3.0', 'RATED\n Visited this place wi...",4.35


### Text Preprocessing

Some of the common text preprocessing / cleaning steps are:

 - Lower casing
 - Removal of Punctuations
 - Removal of Stopwords
 - Removal of URLs
 - Spelling correction

In [52]:
# Transforming into lower case

In [53]:
data_new["reviews"] = data_new["reviews"].str.lower()

In [54]:
# Removal of Punctuation

In [55]:
import string

In [56]:
PUNCT_TO_REMOVE = string.punctuation

In [57]:
def remove_punctuation(text):
    """Return ``text`` with every character listed in PUNCT_TO_REMOVE deleted."""
    # Mapping a character to None in a translation table deletes it.
    deletion_table = str.maketrans(dict.fromkeys(PUNCT_TO_REMOVE))
    return text.translate(deletion_table)

In [58]:
data_new["reviews"] = data_new["reviews"].apply(lambda text: remove_punctuation(text))

In [59]:
# Removal of Stopwords

In [60]:
from nltk.corpus import stopwords

In [61]:
STOPWORDS = set(stopwords.words('english'))

In [62]:
def remove_stopwords(text):
    """Drop every whitespace-separated token of ``text`` that is in STOPWORDS.

    The input is coerced with ``str()`` first, and the surviving tokens are
    re-joined with single spaces.
    """
    kept_tokens = (token for token in str(text).split() if token not in STOPWORDS)
    return " ".join(kept_tokens)

In [63]:
data_new["reviews"] = data_new["reviews"].apply(lambda text: remove_stopwords(text))

In [64]:
# Removal of URLS

In [65]:
def remove_urls(text):
    """Return ``text`` with ``http://`` / ``https://`` and ``www.`` links deleted.

    A link is matched up to the next whitespace character.
    """
    # NOTE(review): in this notebook the function is applied after punctuation
    # has already been stripped, so '://' and '.' may no longer be present in
    # the reviews - confirm the intended order of the cleaning steps.
    return re.sub(r'https?://\S+|www\.\S+', r'', text)

In [66]:
data_new["reviews"] = data_new["reviews"].apply(lambda text: remove_urls(text))

In [67]:
# Cleaned Reviews

In [68]:
data_new['reviews'].head(10)

0    rated 40 ratedn place dinner husband sure expl...
1    rated 50 ratedn loved place awesome food great...
2    rated 30 ratedn got delivery place food good s...
3    rated 40 ratedn loved chicken fried rice nothi...
4    rated 30 ratedn visited place friends tried 3 ...
5    rated 50 ratedn order pancakes nd loved nice p...
6    rated 50 ratedn food delicious delivery time m...
7    rated 40 ratedn place located jaynagar talking...
8    rated 30 ratedn beautiful place contemporary p...
9    rated 50 ratedn staying citrine hotel first da...
Name: reviews, dtype: object

In [69]:
# Restaurant Names

In [70]:
restaurant_names = list(data_new['name'].unique())

In [71]:
restaurant_names

['Corner Huts',
 'Oregano Soul Food',
 'Pepperwood',
 'Dawat Midnight Express',
 'RockSalt',
 'Bite Me Cupcakes',
 'Guns N Roses',
 'Fishland Deluxe',
 'Aubree',
 'Cinnamon',
 'Wudfyr',
 'Arabian Bites',
 'Ghee Positive',
 'Thindi Mane',
 'Hotel Kadamba Veg',
 'Ascharya Hotel',
 'Bhairava Deluxe Hindu Military Hotel',
 'Salut',
 'Thai House',
 'New Surya Sweets & Snacks',
 'Ober Cafe',
 'Alfresco by Bene -Sheraton Grand Bangalore Hotel at...',
 'Skyye',
 'Shalimar',
 'C Kosila Kitchen',
 'Coffee@Arens',
 'Cheesiano Pizza',
 'The Bridge Walk Cafe',
 'Sri Raghavendra Davanagere Benne Dose Hotel',
 'Xpress Chai',
 "Ruh's Cafe",
 'Mountain Spice',
 'Biergarten',
 'Vyshali Food Corner',
 'Chaai Resto',
 'The Chariot',
 'Al Noor',
 'Sree Lakshmi Venkateshwara Andhra Mess',
 'Hoppipola',
 "Saara's Grill n Spice - SS Lumina Hotel",
 'Kaaram',
 'Krispy Kreme',
 'Nandhanam Restaurant',
 'Sreeraj Lassi Bar',
 'Juicemaker',
 'Fresh Pressery Cafe',
 'Art Cafe',
 'Odisha Kitchen',
 "Nanda's",
 'Ambu

In [72]:
len(restaurant_names)

5710

## Sentiment Analysis

In [73]:
from textblob import TextBlob

In [74]:
# Adding 'length', 'word_count' and 'polarity' columns to dataset

In [75]:
data_new['length'] = data_new["reviews"].apply(lambda x: len(x))

In [76]:
data_new['word_count'] = data_new['reviews'].apply(lambda x: len(x.split()))

In [77]:
data_new['polarity'] = data_new['reviews'].apply(lambda x: TextBlob(x).sentiment.polarity)

In [78]:
data_new.head()

Unnamed: 0,name,ratings,reviews,Mean Rating,length,word_count,polarity
0,Corner Huts,3.8,rated 40 ratedn place dinner husband sure expl...,3.58,423,61,0.216667
1,Oregano Soul Food,4.1,rated 50 ratedn loved place awesome food great...,3.97,1609,229,0.308167
2,Pepperwood,3.7,rated 30 ratedn got delivery place food good s...,3.52,1975,198,0.144174
3,Dawat Midnight Express,3.1,rated 40 ratedn loved chicken fried rice nothi...,2.68,181,27,-0.1
4,RockSalt,4.4,rated 30 ratedn visited place friends tried 3 ...,4.35,19476,2070,0.311511


In [79]:
data_new.describe()

Unnamed: 0,ratings,Mean Rating,length,word_count,polarity
count,18563.0,18563.0,18563.0,18563.0,18563.0
mean,3.718117,3.475005,8572.784571,1100.188709,0.227832
std,0.441481,0.540867,21557.415376,2622.689372,0.17668
min,1.8,1.0,0.0,0.0,-1.0
25%,3.4,3.12,489.0,68.0,0.140417
50%,3.8,3.45,1316.0,175.0,0.247543
75%,4.0,3.84,4695.5,603.0,0.329261
max,4.9,5.0,421585.0,47221.0,1.0


In [80]:
data_new.loc[(data_new['ratings'] > 3) & (data_new['polarity'] > 0.1)]

Unnamed: 0,name,ratings,reviews,Mean Rating,length,word_count,polarity
0,Corner Huts,3.8,rated 40 ratedn place dinner husband sure expl...,3.58,423,61,0.216667
1,Oregano Soul Food,4.1,rated 50 ratedn loved place awesome food great...,3.97,1609,229,0.308167
2,Pepperwood,3.7,rated 30 ratedn got delivery place food good s...,3.52,1975,198,0.144174
4,RockSalt,4.4,rated 30 ratedn visited place friends tried 3 ...,4.35,19476,2070,0.311511
5,Bite Me Cupcakes,4.1,rated 50 ratedn order pancakes nd loved nice p...,4.01,1354,184,0.412500
...,...,...,...,...,...,...,...
18557,Skyline Family Restaurant,3.4,rated 50 ratedn awesome taste rated 20 ratedn ...,2.98,1910,304,0.127804
18558,The 3 Musketeers,3.2,rated 50 ratedn lot flavoursgood service quali...,2.81,674,99,0.355758
18559,Navarang Fast Food,3.6,rated 50 ratedn services good foods quality re...,3.32,141,22,0.516667
18561,Cafe Mozaic - Taj MG Road Bengaluru,4.1,rated 50 ratedn continental indian restaurantn...,3.97,1325,186,0.284099


In [81]:
data_new["Review_category"] = None

In [82]:
data_new.head()

Unnamed: 0,name,ratings,reviews,Mean Rating,length,word_count,polarity,Review_category
0,Corner Huts,3.8,rated 40 ratedn place dinner husband sure expl...,3.58,423,61,0.216667,
1,Oregano Soul Food,4.1,rated 50 ratedn loved place awesome food great...,3.97,1609,229,0.308167,
2,Pepperwood,3.7,rated 30 ratedn got delivery place food good s...,3.52,1975,198,0.144174,
3,Dawat Midnight Express,3.1,rated 40 ratedn loved chicken fried rice nothi...,2.68,181,27,-0.1,
4,RockSalt,4.4,rated 30 ratedn visited place friends tried 3 ...,4.35,19476,2070,0.311511,


In [83]:
data_new["Review_category"]  = np.where((data_new['ratings'] > 3) & (data_new['polarity'] >= 0.1) , 'Good', 'Bad')

In [84]:
data_new.head()

Unnamed: 0,name,ratings,reviews,Mean Rating,length,word_count,polarity,Review_category
0,Corner Huts,3.8,rated 40 ratedn place dinner husband sure expl...,3.58,423,61,0.216667,Good
1,Oregano Soul Food,4.1,rated 50 ratedn loved place awesome food great...,3.97,1609,229,0.308167,Good
2,Pepperwood,3.7,rated 30 ratedn got delivery place food good s...,3.52,1975,198,0.144174,Good
3,Dawat Midnight Express,3.1,rated 40 ratedn loved chicken fried rice nothi...,2.68,181,27,-0.1,Bad
4,RockSalt,4.4,rated 30 ratedn visited place friends tried 3 ...,4.35,19476,2070,0.311511,Good


In [85]:
data_new['Review_category'].unique()

array(['Good', 'Bad'], dtype=object)

In [86]:
data_new['Review_category'].value_counts()

Good    14265
Bad      4298
Name: Review_category, dtype: int64

## Building Bag of Words

In [87]:
from sklearn.feature_extraction.text import CountVectorizer

In [88]:
cv = CountVectorizer(max_features=5000)

In [89]:
cv.get_params()

{'analyzer': 'word',
 'binary': False,
 'decode_error': 'strict',
 'dtype': numpy.int64,
 'encoding': 'utf-8',
 'input': 'content',
 'lowercase': True,
 'max_df': 1.0,
 'max_features': 5000,
 'min_df': 1,
 'ngram_range': (1, 1),
 'preprocessor': None,
 'stop_words': None,
 'strip_accents': None,
 'token_pattern': '(?u)\\b\\w\\w+\\b',
 'tokenizer': None,
 'vocabulary': None}

In [90]:
# Independent variables

In [91]:
# Build the bag-of-words feature matrix from the cleaned reviews using the
# CountVectorizer configured above (max_features=5000), then densify it.
# NOTE(review): the vectorizer is fit on the full corpus before the train/test
# split performed later, so the vocabulary is learned from test rows as well.
# NOTE(review): .toarray() materialises a dense (n_reviews x 5000) int64 array;
# consider keeping the sparse matrix if memory becomes a concern.
X = cv.fit_transform(data_new["reviews"]).toarray()

In [92]:
X.shape

(18563, 5000)

In [93]:
X[0:10]

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [94]:
# Dependent variable

In [95]:
# One-hot encode the label and keep only the 'Good' indicator as the binary
# target (1 = Good, 0 = Bad). The dtype is pinned so the target stays an
# integer array on pandas versions where get_dummies defaults to bool.
class_values = pd.get_dummies(data_new['Review_category'], dtype='uint8')
class_values = class_values.drop(columns="Bad")
# The remaining column is 'Good'; the previous rename targeted the
# already-dropped 'Bad' column and therefore did nothing.
class_values = class_values.rename(columns={"Good":"Class"})

In [96]:
y = class_values.values.ravel()

In [97]:
y

array([1, 1, 1, ..., 0, 1, 1], dtype=uint8)

In [98]:
# Splitting the train test data

In [99]:
from sklearn.model_selection import train_test_split

In [100]:
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2, random_state=0)

In [101]:
print(X_train.shape,'\n',
     X_test.shape)

(14850, 5000) 
 (3713, 5000)


In [102]:
# Bag of words

In [103]:
# Label the training matrix columns with the vocabulary terms.
# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# get_feature_names_out() when available and fall back on older releases.
bow_feature_names = (cv.get_feature_names_out()
                     if hasattr(cv, "get_feature_names_out")
                     else cv.get_feature_names())
count_df = pd.DataFrame(X_train, columns=bow_feature_names)

In [104]:
count_df.head()

Unnamed: 0,05,10,100,1000,100ft,1010,1015,1030,10mins,10pm,...,â²ã,â³ã,âµã,â¹,â¹ã,âºã,â¼,â¼ã,â½ã,â¾ã
0,0,8,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,2,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Model Building

### Naive Bayes Model

In [105]:
from sklearn.naive_bayes import MultinomialNB

In [106]:
nb_classifier = MultinomialNB()

In [107]:
nb_classifier.fit(X_train, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [108]:
# Predicting the values

In [109]:
y_pred_train = nb_classifier.predict(X_train)
y_pred_test = nb_classifier.predict(X_test)

In [110]:
# Confusion matrix

In [111]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score, precision_score, recall_score

In [112]:
confusion_matrix(y_train, y_pred_train)

array([[2436, 1021],
       [4170, 7223]], dtype=int64)

In [113]:
confusion_matrix(y_test, y_pred_test)

array([[ 577,  264],
       [1025, 1847]], dtype=int64)

In [114]:
# Checking accuracy, precision and recall

In [115]:
# For training

In [116]:
accuracy = accuracy_score(y_train, y_pred_train)
precision = precision_score(y_train, y_pred_train)
recall = recall_score(y_train, y_pred_train)

In [117]:
print("Accuracy for train: ", accuracy)
print("Precision for train: ", precision)
print("Recall for train: ", recall)

Accuracy for train:  0.6504377104377105
Precision for train:  0.876152353226589
Recall for train:  0.6339857807425612


In [118]:
# For testing

In [119]:
accuracy = accuracy_score(y_test, y_pred_test)
precision = precision_score(y_test, y_pred_test)
recall = recall_score(y_test, y_pred_test)

In [120]:
print("Accuracy for test: ", accuracy)
print("Precision for test: ", precision)
print("Recall for test: ", recall)

Accuracy for test:  0.6528413681659035
Precision for test:  0.8749407863571766
Recall for test:  0.6431058495821727


### Using Bernoulli NB

In [121]:
from sklearn.naive_bayes import BernoulliNB

In [122]:
brnb = BernoulliNB()

In [124]:
brnb.fit(X_train,y_train)

BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True)

In [125]:
# Predicting the target values

In [126]:
pred_train_brnb = brnb.predict(X_train)
pred_brnb = brnb.predict(X_test)

In [127]:
# Probability values

In [128]:
brnb.predict_proba(X_test)

array([[1.00000000e+000, 3.72372629e-051],
       [1.00000000e+000, 8.15301936e-047],
       [1.00000000e+000, 1.60107179e-043],
       ...,
       [1.56199290e-131, 1.00000000e+000],
       [6.69967431e-181, 1.00000000e+000],
       [1.00000000e+000, 3.92760933e-034]])

In [129]:
# Confusion matrix

In [130]:
confusion_matrix(y_train,pred_train_brnb)

array([[2904,  553],
       [5905, 5488]], dtype=int64)

In [131]:
confusion_matrix(y_test,pred_brnb)

array([[ 697,  144],
       [1480, 1392]], dtype=int64)

In [132]:
# Accuracy

In [133]:
train_accuracy = accuracy_score(y_train,pred_train_brnb)
train_accuracy

0.5651178451178451

In [134]:
test_accuracy = accuracy_score(y_test,pred_brnb)
test_accuracy

0.562617829248586

### Hyperparameter tuning of Naive Bayes model

In [135]:
# Tuning the parameter 'alpha' to improve the accuracy

In [136]:
classifier=MultinomialNB(alpha=0.1)

In [137]:
# Sweep the smoothing parameter alpha over 0.1 .. 1.0 and keep the model with
# the highest accuracy in `classifier`.
# NOTE(review): the sweep is scored on the test split, so the selected alpha is
# tuned on the same data used for the final evaluation - a validation split or
# cross-validation would give an unbiased estimate.
previous_score=0
for alpha in np.arange(0.1,1.1,0.1):
    sub_classifier=MultinomialNB(alpha=alpha)
    sub_classifier.fit(X_train,y_train)
    y_pred=sub_classifier.predict(X_test)
    score = accuracy_score(y_test, y_pred)
    if score>previous_score:
        classifier=sub_classifier
        # Remember the best score so far; without this update every model beat
        # the initial 0 and `classifier` always ended up as the last one fitted.
        previous_score=score
    print("Alpha: {}, Score : {}".format(round(alpha,1),score))

Alpha: 0.1, Score : 0.6566119041206572
Alpha: 0.2, Score : 0.656342580123889
Alpha: 0.3, Score : 0.6547266361432804
Alpha: 0.4, Score : 0.6544573121465123
Alpha: 0.5, Score : 0.6544573121465123
Alpha: 0.6, Score : 0.6536493401562079
Alpha: 0.7, Score : 0.6533800161594399
Alpha: 0.8, Score : 0.6533800161594399
Alpha: 0.9, Score : 0.6531106921626717
Alpha: 1.0, Score : 0.6528413681659035


#### Highest accuracy is obtained for the value alpha = 0.1

### Logistic Regression

In [138]:
from sklearn.linear_model import LogisticRegression

In [139]:
lr_classifier = LogisticRegression(max_iter=500, random_state=0)

In [140]:
lr_classifier.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=500,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=0, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [141]:
# Predicting the values

In [142]:
y_pred_train = lr_classifier.predict(X_train)

In [143]:
y_pred_test = lr_classifier.predict(X_test)

In [144]:
# Confusion matrix

In [145]:
confusion_matrix(y_train, y_pred_train)

array([[ 3338,   119],
       [   85, 11308]], dtype=int64)

In [146]:
confusion_matrix(y_test, y_pred_test)

array([[ 761,   80],
       [  98, 2774]], dtype=int64)

In [147]:
# Accuracy score, precision and recall

In [148]:
# For training

In [149]:
Accuracy = accuracy_score(y_train, y_pred_train)
Precision = precision_score(y_train, y_pred_train)
Recall = recall_score(y_train, y_pred_train)

In [150]:
print("Accuracy for train: ", Accuracy)
print("Precision for train: ", Precision)
print("Recall for train: ", Recall)

Accuracy for train:  0.9862626262626263
Precision for train:  0.9895860680843616
Recall for train:  0.9925392785043448


In [151]:
# For testing

In [152]:
Accuracy = accuracy_score(y_test, y_pred_test)
Precision = precision_score(y_test, y_pred_test)
Recall = recall_score(y_test, y_pred_test)

In [153]:
print("Accuracy for test: ", Accuracy)
print("Precision for test: ", Precision)
print("Recall for test: ", Recall)

Accuracy for test:  0.952060328575276
Precision for test:  0.9719691660826909
Recall for test:  0.9658774373259053


### Hyperparameter tuning for logistic regression

In [154]:
classifier=LogisticRegression(C=1)

In [155]:
# Sweep the inverse regularisation strength C over 0.1 .. 1.0 and keep the
# model with the highest accuracy in `classifier`.
# NOTE(review): the sweep is scored on the test split, so the selected C is
# tuned on the same data used for the final evaluation - a validation split or
# cross-validation would give an unbiased estimate.
previous_score=0
for i in np.arange(0.1,1.1,0.1):
    sub_classifier=LogisticRegression(max_iter=500, C=i, random_state=0)
    sub_classifier.fit(X_train,y_train)
    y_pred=sub_classifier.predict(X_test)
    score = accuracy_score(y_test, y_pred)
    if score>previous_score:
        classifier=sub_classifier
        # Remember the best score so far; without this update every model beat
        # the initial 0 and `classifier` always ended up as the last one fitted.
        previous_score=score
    print("C: {}, Score : {}".format(round(i,1),score))

C: 0.1, Score : 0.9482897926205225
C: 0.2, Score : 0.9490977646108268
C: 0.3, Score : 0.949636412604363
C: 0.4, Score : 0.9499057366011312
C: 0.5, Score : 0.9501750605978992
C: 0.6, Score : 0.9507137085914354
C: 0.7, Score : 0.9501750605978992
C: 0.8, Score : 0.9501750605978992
C: 0.9, Score : 0.9528683005655804
C: 1.0, Score : 0.952060328575276


#### Highest accuracy is obtained for the value of C=0.9

### TF-IDF Vectorizer

In [156]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [157]:
tfidf_v=TfidfVectorizer(max_features=5000,ngram_range=(1,1))

In [158]:
tfidf_v.get_params()

{'analyzer': 'word',
 'binary': False,
 'decode_error': 'strict',
 'dtype': numpy.float64,
 'encoding': 'utf-8',
 'input': 'content',
 'lowercase': True,
 'max_df': 1.0,
 'max_features': 5000,
 'min_df': 1,
 'ngram_range': (1, 1),
 'norm': 'l2',
 'preprocessor': None,
 'smooth_idf': True,
 'stop_words': None,
 'strip_accents': None,
 'sublinear_tf': False,
 'token_pattern': '(?u)\\b\\w\\w+\\b',
 'tokenizer': None,
 'use_idf': True,
 'vocabulary': None}

In [159]:
# Independent variables

In [160]:
# Build the TF-IDF feature matrix from the cleaned reviews using the
# TfidfVectorizer configured above (max_features=5000), then densify it.
# NOTE(review): the vectorizer is fit on the full corpus before the train/test
# split performed later, so vocabulary and IDF weights are learned from test
# rows as well.
# NOTE(review): .toarray() materialises a dense (n_reviews x 5000) float64
# array; consider keeping the sparse matrix if memory becomes a concern.
X = tfidf_v.fit_transform(data_new["reviews"]).toarray()

In [161]:
X.shape

(18563, 5000)

In [162]:
# Preview 50 vocabulary terms (positions 500-549).
# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# get_feature_names_out() when available and fall back on older releases.
tfidf_feature_names = (tfidf_v.get_feature_names_out()
                       if hasattr(tfidf_v, "get_feature_names_out")
                       else tfidf_v.get_feature_names())
# list() keeps the displayed result a plain Python list on every version.
list(tfidf_feature_names[500:550])

['begin',
 'beginning',
 'behave',
 'behaved',
 'behavior',
 'behaviour',
 'behind',
 'bel',
 'belgian',
 'belgium',
 'believe',
 'bell',
 'bellandur',
 'belly',
 'benches',
 'benedict',
 'bengal',
 'bengali',
 'bengalis',
 'bengaluru',
 'benne',
 'berries',
 'berry',
 'beside',
 'besides',
 'best',
 'bet',
 'better',
 'beverage',
 'beverages',
 'beware',
 'beyond',
 'bhagini',
 'bhaja',
 'bhaji',
 'bhajji',
 'bhapa',
 'bhara',
 'bharta',
 'bhatti',
 'bhatura',
 'bhature',
 'bhel',
 'bhetki',
 'bhindi',
 'bhuna',
 'bhurji',
 'biere',
 'biergarten',
 'big']

In [163]:
# Dependent variable

In [164]:
# One-hot encode the label and keep only the 'Good' indicator as the binary
# target (1 = Good, 0 = Bad). The dtype is pinned so the target stays an
# integer array on pandas versions where get_dummies defaults to bool.
class_values = pd.get_dummies(data_new['Review_category'], dtype='uint8')
class_values = class_values.drop(columns="Bad")
# The remaining column is 'Good'; the previous rename targeted the
# already-dropped 'Bad' column and therefore did nothing.
class_values = class_values.rename(columns={"Good":"Class"})

In [165]:
y = class_values.values.ravel()

In [166]:
# Splitting the train test data

In [167]:
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2, random_state=0)

In [168]:
# TF-IDF vectorized words

In [169]:
# Label the TF-IDF training matrix columns with the vocabulary terms.
# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# get_feature_names_out() when available and fall back on older releases.
tfidf_feature_names = (tfidf_v.get_feature_names_out()
                       if hasattr(tfidf_v, "get_feature_names_out")
                       else tfidf_v.get_feature_names())
count_df = pd.DataFrame(X_train, columns=tfidf_feature_names)
count_df.head()

Unnamed: 0,05,10,100,1000,100ft,1010,1015,1030,10mins,10pm,...,â²ã,â³ã,âµã,â¹,â¹ã,âºã,â¼,â¼ã,â½ã,â¾ã
0,0.0,0.037358,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.031002,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.03085,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Naive Bayes Model

In [170]:
from sklearn.naive_bayes import MultinomialNB

In [171]:
nb_classifier = MultinomialNB()

In [172]:
nb_classifier.fit(X_train, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [173]:
# Predicting the values

In [174]:
y_pred_train = nb_classifier.predict(X_train)
y_pred_test = nb_classifier.predict(X_test)

In [175]:
# Confusion matrix

In [176]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score, precision_score, recall_score

In [177]:
confusion_matrix(y_train, y_pred_train)

array([[ 1322,  2135],
       [  316, 11077]], dtype=int64)

In [178]:
confusion_matrix(y_test, y_pred_test)

array([[ 308,  533],
       [  95, 2777]], dtype=int64)

In [179]:
# Checking accuracy, precision and recall

In [180]:
# For training

In [181]:
accuracy = accuracy_score(y_train, y_pred_train)
precision = precision_score(y_train, y_pred_train)
recall = recall_score(y_train, y_pred_train)

In [182]:
print("Accuracy for train: ", accuracy)
print("Precision for train: ", precision)
print("Recall for train: ", recall)

Accuracy for train:  0.834949494949495
Precision for train:  0.838404480775053
Recall for train:  0.9722636706749759


In [183]:
# For testing

In [184]:
accuracy = accuracy_score(y_test, y_pred_test)
precision = precision_score(y_test, y_pred_test)
recall = recall_score(y_test, y_pred_test)

In [185]:
print("Accuracy for test: ", accuracy)
print("Precision for test: ", precision)
print("Recall for test: ", recall)

Accuracy for test:  0.8308645300296257
Precision for test:  0.8389728096676737
Recall for test:  0.9669220055710307


### Using Bernoulli NB

In [186]:
from sklearn.naive_bayes import BernoulliNB

In [187]:
brnb = BernoulliNB()

In [188]:
brnb.fit(X_train,y_train)

BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True)

In [189]:
# Predicting the target values

In [190]:
pred_train_brnb = brnb.predict(X_train)
pred_brnb = brnb.predict(X_test)

In [191]:
# Probability values

In [193]:
brnb.predict_proba(X_test)

array([[1.00000000e+000, 3.72372629e-051],
       [1.00000000e+000, 8.15301936e-047],
       [1.00000000e+000, 1.60107179e-043],
       ...,
       [1.56199290e-131, 1.00000000e+000],
       [6.69967431e-181, 1.00000000e+000],
       [1.00000000e+000, 3.92760933e-034]])

In [194]:
# Confusion matrix

In [195]:
confusion_matrix(y_train,pred_train_brnb)

array([[2904,  553],
       [5905, 5488]], dtype=int64)

In [196]:
confusion_matrix(y_test,pred_brnb)

array([[ 697,  144],
       [1480, 1392]], dtype=int64)

In [197]:
# Accuracy

In [198]:
train_accuracy = accuracy_score(y_train,pred_train_brnb)
train_accuracy

0.5651178451178451

In [199]:
test_accuracy = accuracy_score(y_test,pred_brnb)
test_accuracy

0.562617829248586

### Hyperparameter tuning of Naive Bayes model

In [200]:
# Tuning the parameter 'alpha' to improve the accuracy

In [201]:
classifier=MultinomialNB(alpha=0.1)

In [202]:
# Sweep the smoothing parameter alpha over 0.1 .. 1.0 on the TF-IDF features
# and keep the model with the highest accuracy in `classifier`.
# NOTE(review): the sweep is scored on the test split, so the selected alpha is
# tuned on the same data used for the final evaluation - a validation split or
# cross-validation would give an unbiased estimate.
previous_score=0
for alpha in np.arange(0.1,1.1,0.1):
    sub_classifier=MultinomialNB(alpha=alpha)
    sub_classifier.fit(X_train,y_train)
    y_pred=sub_classifier.predict(X_test)
    score = accuracy_score(y_test, y_pred)
    if score>previous_score:
        classifier=sub_classifier
        # Remember the best score so far; without this update every model beat
        # the initial 0 and `classifier` always ended up as the last one fitted.
        # With the strict '>' the first of several tied alphas is kept.
        previous_score=score
    print("Alpha: {}, Score : {}".format(round(alpha,1),score))

Alpha: 0.1, Score : 0.8308645300296257
Alpha: 0.2, Score : 0.8308645300296257
Alpha: 0.3, Score : 0.8308645300296257
Alpha: 0.4, Score : 0.8305952060328575
Alpha: 0.5, Score : 0.8284406140587126
Alpha: 0.6, Score : 0.8287099380554808
Alpha: 0.7, Score : 0.827093994074872
Alpha: 0.8, Score : 0.8279019660651764
Alpha: 0.9, Score : 0.8308645300296257
Alpha: 1.0, Score : 0.8308645300296257


#### Highest accuracy (0.8309) is tied across alpha = 0.1, 0.2, 0.3, 0.9 and 1.0

### Logistic regression model

In [203]:
lr_classifier = LogisticRegression(max_iter=500, random_state=0)

In [204]:
lr_classifier.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=500,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=0, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [205]:
# Predicting the values

In [206]:
y_pred_train = lr_classifier.predict(X_train)

In [207]:
y_pred_test = lr_classifier.predict(X_test)

In [208]:
# Confusion matrix

In [209]:
confusion_matrix(y_train, y_pred_train)

array([[ 2571,   886],
       [  168, 11225]], dtype=int64)

In [210]:
confusion_matrix(y_test, y_pred_test)

array([[ 608,  233],
       [  64, 2808]], dtype=int64)

In [211]:
# Accuracy score, precision and recall

In [212]:
# For training

In [213]:
Accuracy = accuracy_score(y_train, y_pred_train)
Precision = precision_score(y_train, y_pred_train)
Recall = recall_score(y_train, y_pred_train)

In [214]:
print("Accuracy for train: ", Accuracy)
print("Precision for train: ", Precision)
print("Recall for train: ", Recall)

Accuracy for train:  0.929023569023569
Precision for train:  0.9268433655354636
Recall for train:  0.9852541033968226


In [215]:
# For testing

In [216]:
Accuracy = accuracy_score(y_test, y_pred_test)
Precision = precision_score(y_test, y_pred_test)
Recall = recall_score(y_test, y_pred_test)

In [217]:
print("Accuracy for test: ", Accuracy)
print("Precision for test: ", Precision)
print("Recall for test: ", Recall)

Accuracy for test:  0.9200107729598708
Precision for test:  0.9233804669516607
Recall for test:  0.9777158774373259


### Hyperparameter tuning for logistic regression

In [218]:
classifier=LogisticRegression(C=1)

In [219]:
# Sweep the inverse regularisation strength C over 0.1 .. 1.0 on the TF-IDF
# features and keep the model with the highest accuracy in `classifier`.
# NOTE(review): the sweep is scored on the test split, so the selected C is
# tuned on the same data used for the final evaluation - a validation split or
# cross-validation would give an unbiased estimate.
previous_score=0
for i in np.arange(0.1,1.1,0.1):
    sub_classifier=LogisticRegression(max_iter=500, C=i, random_state=0)
    sub_classifier.fit(X_train,y_train)
    y_pred=sub_classifier.predict(X_test)
    score = accuracy_score(y_test, y_pred)
    if score>previous_score:
        classifier=sub_classifier
        # Remember the best score so far; without this update every model beat
        # the initial 0 and `classifier` always ended up as the last one fitted.
        previous_score=score
    print("C: {}, Score : {}".format(round(i,1),score))

C: 0.1, Score : 0.8747643415028279
C: 0.2, Score : 0.8884998653380016
C: 0.3, Score : 0.8973875572313493
C: 0.4, Score : 0.9068138971182332
C: 0.5, Score : 0.9103151090762187
C: 0.6, Score : 0.9143549690277404
C: 0.7, Score : 0.9162402370051171
C: 0.8, Score : 0.9175868569889577
C: 0.9, Score : 0.9181255049824939
C: 1.0, Score : 0.9200107729598708


#### Highest accuracy is obtained for the value of C=1