<a href="https://colab.research.google.com/github/vinaykumargummadi/DSML/blob/main/Deep%20Learning/notebooks/NLP%20/Restaurant_Review_Classification_with_NLTK.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.metrics import classification_report,confusion_matrix

import warnings
warnings.filterwarnings("ignore")

from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
path = "https://docs.google.com/spreadsheets/d/1Skesacp9tuyKZfgMy-thD95_GdWyumOMg0f_z91k84g/export?format=csv&gid=1202734447"
df=pd.read_csv(path)

In [3]:
df.head()

Unnamed: 0,Review,Liked
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1


In [4]:
df.isnull().sum()

Review    0
Liked     0
dtype: int64

In [5]:
df[df.Review == ''] # Every row is having review

Unnamed: 0,Review,Liked


In [6]:
df.Liked.value_counts() #Balanced data sets

1    500
0    500
Name: Liked, dtype: int64

### Text Preprocessing - cleaing data

In [7]:
import re
import nltk

In [8]:
df.Review=df.Review.apply(lambda x: x.lower())

In [9]:
df.head()

Unnamed: 0,Review,Liked
0,wow... loved this place.,1
1,crust is not good.,0
2,not tasty and the texture was just nasty.,0
3,stopped by during the late may bank holiday of...,1
4,the selection on the menu was great and so wer...,1


In [10]:
df.Review.sample(1).values

array(["furthermore, you can't even find hours of operation on the website!"],
      dtype=object)

**Remove the number & special characters other than letters**

In [11]:
#testing the regex for a sample string
text = 'wow..loved this place.'
res = re.sub(r'[^a-zA-Z]',' ',text)
res = re.sub(r'\d',' ',res)
print(res)

wow  loved this place 


In [12]:
def remove_number_sp_chars(text):
  cleaned_text = re.sub(r'[^a-zA-Z]',' ',text)
  res = re.sub(r'\d',' ',cleaned_text)
  return res
df.Review=df.Review.apply(remove_number_sp_chars)

In [13]:
df.tail()

Unnamed: 0,Review,Liked
995,i think food should have flavor and texture an...,0
996,appetite instantly gone,0
997,overall i was not impressed and would not go b...,0
998,the whole experience was underwhelming and i ...,0
999,then as if i hadn t wasted enough of my life ...,0


**Removing STOP WORDS**

In [14]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [15]:
from nltk.corpus import stopwords

In [16]:
review=df.Review[0]

In [17]:
review

'wow    loved this place '

In [18]:
def remove_stop_words(review):
  l=[]
  for word in review.split():
    if word not in stopwords.words('english'):
      l.append(word)
  return ' '.join(l)

In [19]:
remove_stop_words("i think food should have flavor and texture an")

'think food flavor texture'

In [20]:
df.Review=df.Review.apply(remove_stop_words)

In [21]:
df.head()

Unnamed: 0,Review,Liked
0,wow loved place,1
1,crust good,0
2,tasty texture nasty,0
3,stopped late may bank holiday rick steve recom...,1
4,selection menu great prices,1


**Stemming:** Finding the root form of the words

In [22]:
from nltk.stem import PorterStemmer
stemmer=PorterStemmer()

In [23]:
def stemming_poter(review):
  return ' '.join([stemmer.stem(word) for word in review.split()])

In [24]:
stemming_poter("wow loved")

'wow love'

In [25]:
df.Review=df.Review.apply(stemming_poter)

**Bag of words:**

In [26]:
from sklearn.feature_extraction.text import CountVectorizer

In [27]:
cv = CountVectorizer(max_features=1500)

In [33]:
X=cv.fit_transform(df.Review).toarray()

In [34]:
X.shape

(1000, 1500)

In [36]:
y=df.Liked

**Navie Bayes Model**

In [47]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score,classification_report

In [39]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=0)

In [40]:
X_train.shape,X_test.shape

((800, 1500), (200, 1500))

In [41]:
y_train.shape,y_test.shape

((800,), (200,))

In [42]:
classifier=GaussianNB()

In [43]:
classifier.fit(X_train,y_train)

In [45]:
y_pred=classifier.predict(X_test)

In [46]:
accuracy_score(y_test,y_pred)

0.73

In [48]:
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.82      0.57      0.67        97
           1       0.68      0.88      0.77       103

    accuracy                           0.73       200
   macro avg       0.75      0.73      0.72       200
weighted avg       0.75      0.73      0.72       200



**Identifying Wrongly Predicted labels**

In [61]:
wrong_predicted_indices=(y_pred != y_test.to_numpy()).nonzero()[0]

In [66]:
df.iloc[wrong_predicted_indices,:]

Unnamed: 0,Review,Liked
0,wow love place,1
1,crust good,0
2,tasti textur nasti,0
5,get angri want damn pho,0
7,potato like rubber could tell made ahead time ...,0
8,fri great,1
12,cashier care ever say still end wayyy overpr,0
21,food amaz,1
23,could care less interior beauti,1
24,perform,1
