<a href="https://colab.research.google.com/github/sunilgandipadala/NLP-Projects/blob/main/NLP_Project2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Restaurant Reviews Classification with NLTK

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
# quoting=3 is csv.QUOTE_NONE: quote characters inside the review text are
# kept as ordinary characters instead of being parsed as field delimiters.
df = pd.read_csv(
    "Restaurant_Reviews.tsv",
    sep="\t",
    quoting=3,
)

In [None]:
# Peek at the first row; `df` must already exist, so run the read_csv cell first.
df.head(1)

NameError: ignored

In [None]:
# Show the last rows of the frame (columns: Review, Liked).
df.tail()

Unnamed: 0,Review,Liked
995,I think food should have flavor and texture an...,0
996,Appetite instantly gone.,0
997,Overall I was not impressed and would not go b...,0
998,"The whole experience was underwhelming, and I ...",0
999,"Then, as if I hadn't wasted enough of my life ...",0


In [None]:
# Count how many reviews fall in each "Liked" class to check the class balance.
df["Liked"].value_counts()

1    500
0    500
Name: Liked, dtype: int64

# Cleaning Text Data

In [None]:
# we use NLTK
import nltk
import re

In [None]:
# Fetch the NLTK stop-word corpus so nltk.corpus.stopwords can be used below.
nltk.download("stopwords")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
#to remove stops words from the data we use stopwords modules
from nltk.corpus import stopwords

In [None]:
# Preview the first few raw review texts before cleaning.
df["Review"].head()

0                             Wow... Loved this place.
1                                   Crust is not good.
2            Not tasty and the texture was just nasty.
3    Stopped by during the late May bank holiday of...
4    The selection on the menu was great and so wer...
Name: Review, dtype: object

In [None]:
# Replace every character that is not an ASCII letter (punctuation, digits, ...) with a space.
# re.sub(pattern, repl, string): `pattern` is what to look for, `repl` is what
# to put in its place, and `string` is the text being processed.
review = re.sub('[^a-zA-Z]', " ", df['Review'][0])
print(review)
# Collapse each run of spaces into a single space. A plain ' +' is used on
# purpose: a character class such as '[" "]+' would also match double quotes.
review = re.sub(' +', " ", review)


Wow    Loved this place 


In [None]:
# str.lower() returns a new string and leaves the original untouched, so the
# result has to be assigned back for the lowercase text to be used afterwards.
review = review.lower()
review

'wow loved this place '

In [None]:
# Tokenize: split the cleaned text on whitespace into a list of words.
review=review.split()

In [None]:
# Inspect the token list produced by split().
review

['Wow', 'Loved', 'this', 'place']

In [None]:
# Now that we have tokens, drop the ones found in NLTK's English stop-word list.
# The list is turned into a set once, up front: calling stopwords.words('english')
# inside the comprehension would rebuild it for every single token.
english_stopwords = set(stopwords.words('english'))
# The stop-word list is lowercase, so compare the lowercased token; otherwise
# capitalized stop words such as "The" would slip through.
review = [word for word in review if word.lower() not in english_stopwords]
review

['Wow', 'Loved', 'place']

In [None]:
# Stem the filtered tokens with NLTK's Porter stemmer.
from nltk.stem.porter import PorterStemmer
# `ps` is the stemmer instance; it is reused later when the whole corpus is processed.
ps = PorterStemmer()
review = list(map(ps.stem, review))

In [None]:
# Inspect the stemmed tokens.
review

['wow', 'love', 'place']

In [None]:
# Glue the tokens back into one space-separated string.
review = " ".join(review)

In [None]:
# Inspect the fully cleaned review string.
review

'wow love place'

In [None]:
# Run the whole cleaning pipeline on every review to build the corpus:
# keep letters only -> lowercase -> tokenize -> drop stop words -> stem -> re-join.
# The stop-word set is built once, outside the loop, instead of per word.
# NOTE(review): NLTK's English list contains negations such as "not"; for
# sentiment work consider keeping them - confirm the intended behavior.
english_stopwords = set(stopwords.words('english'))
corpus = []
# Iterate over the column values directly rather than indexing by label,
# so the loop does not depend on the frame having a 0..n-1 index.
for raw_review in df['Review']:
  review = re.sub('[^a-zA-Z]', " ", raw_review)
  # str.lower() returns a new string; it must be assigned to take effect.
  review = review.lower()
  review = review.split()
  # Filter stop words before stemming, as in the single-review walkthrough.
  review = [ps.stem(word) for word in review if word not in english_stopwords]
  review = " ".join(review)
  corpus.append(review)
print(corpus)

['wow love place', 'crust good', 'not tasti textur nasti', 'stop late may bank holiday rick steve recommend love', 'the select menu great price', 'now i get angri i want damn pho', 'honeslti tast that fresh', 'the potato like rubber could tell made ahead time kept warmer', 'the fri great', 'a great touch', 'servic prompt', 'would go back', 'the cashier care ever i say still end wayyy overpr', 'i tri cape cod ravoli chicken cranberri mmmm', 'i disgust i pretti sure human hair', 'i shock sign indic cash', 'highli recommend', 'waitress littl slow servic', 'thi place worth time let alon vega', 'like', 'the burritto blah', 'the food amaz', 'servic also cute', 'i could care less the interior beauti', 'so perform', 'that right red velvet cake ohhh stuff good', 'they never brought salad ask', 'thi hole wall great mexican street taco friendli staff', 'took hour get food tabl restaur food luke warm our sever run around like total overwhelm', 'the worst salmon sashimi', 'also combo like burger fr

# Bag of Words (converting the text into a numeric form the model can understand)

In [None]:
# Bag-of-words vectorizer; max_features=1500 caps the vocabulary size at 1500 terms.
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features=1500)

In [None]:
# Learn the vocabulary from the corpus and count term occurrences per review,
# then expand the result into a dense array for the classifier.
bag_of_words = cv.fit_transform(corpus)
x = bag_of_words.toarray()
# Labels come from the second column of the frame (position 1).
y = df.iloc[:, 1].values

# Applying Classification

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the samples for testing; the fixed random_state makes the
# shuffle (and therefore the split) repeatable between runs.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=0,
)

In [None]:
# Only the last expression of a cell is displayed automatically, so the
# shapes are printed explicitly; otherwise they would be silently discarded.
print(x_train.shape, y_train.shape)

# Test labels, shown as the cell's output.
y_test

array([0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1,
       1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0,
       1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0,
       1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1,
       1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1,
       0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1,
       0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1,
       0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1,
       0, 1])

In [None]:
# Import the Gaussian Naive Bayes classifier and create an untrained instance.
from sklearn.naive_bayes import GaussianNB
classifier = GaussianNB()


In [None]:
# Train the model: fit() learns from the training features and their labels.
classifier.fit(x_train,y_train)

GaussianNB()

In [None]:
# The model is trained; predict labels for the held-out test features
# so they can be compared against y_test.
y_pred = classifier.predict(x_test)

In [None]:
# Fraction of test reviews whose predicted label matches the true label.
from sklearn.metrics import accuracy_score
accuracy_score(y_test,y_pred)

0.735

Okay... if we want to classify a review entered by a user, we can call classifier.predict(input), but the input must first be preprocessed with all the same steps that were applied before training (cleaning, stop-word removal, stemming, and vectorizing with the already-fitted CountVectorizer).

In [None]:
import spacy
# Load spaCy's small English pipeline by its package name.
nlp = spacy.load("en_core_web_sm")