## Aim: Let's find the positive and negative reviews of a restaurant

## 1. Import Libraries

In [None]:
# import Necessary libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import re

import nltk
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

# Stemmer and lemmatizer instances created once for the cells below.
# NOTE(review): `ps` is not referenced in the visible cells; only `lm` is used for cleaning.
ps=PorterStemmer()
lm=WordNetLemmatizer()

from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix,classification_report

# Vectorizers: `tf` (TF-IDF restricted to unigrams) is the one fitted and used in the visible cells.
# NOTE(review): `cv` is not referenced in the visible cells.
cv=CountVectorizer()
tf=TfidfVectorizer(ngram_range = (1,1))

## 2. Read the data from the TSV file

In [None]:
# Load the tab-separated review file into a DataFrame and preview the first five rows.
data = pd.read_csv(
    'Restaurant_Reviews.tsv',
    sep='\t',
)
data.head(5)

In [None]:
# (rows, columns) of the loaded DataFrame.
data.shape

In [None]:
# Number of missing values in each column.
data.isnull().sum()

In [None]:
# Displays the 'Liked' column cast to object dtype. The result is not assigned,
# so `data` itself is left unchanged by this cell.
data['Liked'].astype('object')

In [None]:
# Number of rows for each distinct value of 'Liked'.
data['Liked'].value_counts()

In [None]:
# Bar chart of the 'Liked' value counts.
data['Liked'].value_counts().plot(kind='bar')
plt.show()

## Approach 1: Normal procedure with only Lexical processing

### Lemmatisation

In [None]:
# The 'Liked' column is the target and is stored in y.
y=data['Liked']
y.head()

In [None]:
# The 'Review' column (raw text) is stored in x.
x=data['Review']
x.head()

In [None]:
# Build the English stop-word set once, up front, instead of calling
# stopwords.words('english') and constructing a new set for every token.
stop_words = set(stopwords.words('english'))

corpus = []  # cleaned review strings, one per entry of x, in the same order

# Clean each review and collect the result in corpus.
for sent in x:

    # 1. Lower-case the review, then replace every character that is not an
    #    ASCII letter or digit with a space.
    sent = re.sub('[^a-zA-Z0-9]', ' ', sent.lower())

    # 2. Tokenise on whitespace (split() with no argument also collapses
    #    repeated spaces), drop stop words and lemmatise what remains.
    tokens = [lm.lemmatize(word) for word in sent.split() if word not in stop_words]

    # 3. Re-join the tokens into a single cleaned sentence.
    corpus.append(' '.join(tokens))

In [None]:
# The corpus now holds the cleaned, lemmatised reviews with stop words removed.
corpus


In [None]:
# Add the cleaned reviews to the DataFrame as a new 'Review_cleaned' column.
data['Review_cleaned']=corpus
data.head()

In [None]:
# Use the 'Review_cleaned' column as the model input from here on.
x=data['Review_cleaned']
x.head()

## Bag-of-words model creation (TF-IDF)

In [None]:
# TF-IDF: term frequency-inverse document frequency.
# Fit the vectorizer on the cleaned reviews and convert the result to an array.
# NOTE(review): the vectorizer is fitted on every row before the train/test
# split further down, so test rows contribute to the vocabulary and IDF
# weights - confirm this is intended.
x=tf.fit_transform(x).toarray()
x

## Train-Test-Split

In [None]:
from sklearn.model_selection import train_test_split


In [None]:
# Hold out 20 percent of the rows for testing; the fixed random_state keeps
# the split the same on every run.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

### Naive Bayes Model

In [None]:
# Applying Multinomial Naive Bayes
from sklearn.naive_bayes import MultinomialNB

In [None]:
# Create the Multinomial Naive Bayes model and fit it on the training split.
nbc=MultinomialNB()
nbc.fit(x_train,y_train)

In [None]:
# Predict labels for the held-out test split with the Naive Bayes model.
y_pred=nbc.predict(x_test)
y_pred

In [None]:
# Accuracy of the current y_pred against the true test labels.
print(accuracy_score(y_test, y_pred))

In [None]:
# Confusion matrix of the current y_pred against the true test labels.
print(confusion_matrix(y_test,y_pred))

In [None]:
# Classification report for the current y_pred against the true test labels.
print(classification_report(y_test, y_pred))

## Insights:
- We have obtained 79 percent accuracy using Naive Bayes.

## Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Random forest classifier with default settings.
rfc=RandomForestClassifier()

In [None]:
# Fit the random forest on the training split.
rfc.fit(x_train,y_train)

In [None]:
# Score the test split with the random forest (rfc), not the Naive Bayes model,
# so that the metrics printed in this section describe the random forest.
y_pred=rfc.predict(x_test)
y_pred

In [None]:
# Accuracy of the current y_pred against the true test labels.
print(accuracy_score(y_test, y_pred))

In [None]:
# Confusion matrix of the current y_pred against the true test labels.
print(confusion_matrix(y_test,y_pred))

In [None]:
# Classification report for the current y_pred against the true test labels.
print(classification_report(y_test, y_pred))

## Insights:
- We have obtained 79 percent accuracy using both the Random Forest classifier and Naive Bayes. Let's test the model on user-entered input.

### Validate the user input

In [None]:
def _clean_review(text):
    """Return *text* cleaned for vectorisation.

    Lower-cases the text, replaces every character that is not an ASCII
    letter or digit with a space, splits on whitespace, drops English stop
    words and lemmatises the remaining tokens with the module-level ``lm``.
    """
    # Build the stop-word set once per call instead of once per token.
    stop_words = set(stopwords.words('english'))
    text = re.sub('[^a-zA-Z0-9]', ' ', text.lower())
    # split() with no argument also collapses runs of spaces.
    return ' '.join(lm.lemmatize(word) for word in text.split() if word not in stop_words)


def predict_review(message):
    """Classify a single review string as positive or negative.

    The message is cleaned with ``_clean_review``, vectorised with the
    already-fitted TF-IDF vectorizer ``tf`` and scored with the fitted
    Naive Bayes model ``nbc``.

    Args:
        message: Raw review text.

    Returns:
        'Positive Review' when the model predicts label 1, otherwise
        'Negative Review'.
    """
    features = tf.transform([_clean_review(message)]).toarray()
    label = nbc.predict(features)[0]
    if label == 1:
        return 'Positive Review'
    return "Negative Review"

In [None]:
# User input 1

predict_review("Wow... Loved this place")

In [None]:
# User input 2

predict_review("Crust is not good.")

In [None]:
# User input 3
# Sarcasm

predict_review("Food was good and i will never vsiit here again.")

In [None]:
# User input 4

predict_review("Honeslty it didn't taste THAT fresh.)")

### We tried unigrams, bigrams and trigrams in the TF-IDF vectorizer `(tf=TfidfVectorizer(ngram_range = (1,1)))` and got good results with unigrams

## Insights:
- We have obtained similar accuracy using Naive Bayes Technique and Random Forest Algorithm.
- Our model is able to identify Positive and Negative Reviews correctly as shown above.
