# Importing the Datasets

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re

In [2]:
# this is the food review dataset
data = pd.read_csv("C:\\Users\\sures\\OneDrive\\Desktop\\yelp_labelled.txt",sep="\t")

In [3]:
data.columns = ['Reviews','Sentiment']
data.head()

Unnamed: 0,Reviews,Sentiment
0,Crust is not good.,0
1,Not tasty and the texture was just nasty.,0
2,Stopped by during the late May bank holiday of...,1
3,The selection on the menu was great and so wer...,1
4,Now I am getting angry and I want my damn pho.,0


In [4]:
data.shape

(999, 2)

In [5]:
# value_counts function will tell us that how many time one and how many time 0 occurring in the dataset
data['Sentiment'].value_counts()

0    500
1    499
Name: Sentiment, dtype: int64

In [6]:
data.isnull().sum()

Reviews      0
Sentiment    0
dtype: int64

In [7]:
x_review = data['Reviews']
y = data['Sentiment']

In [8]:
x_review

0                                     Crust is not good.
1              Not tasty and the texture was just nasty.
2      Stopped by during the late May bank holiday of...
3      The selection on the menu was great and so wer...
4         Now I am getting angry and I want my damn pho.
                             ...                        
994    I think food should have flavor and texture an...
995                             Appetite instantly gone.
996    Overall I was not impressed and would not go b...
997    The whole experience was underwhelming, and I ...
998    Then, as if I hadn't wasted enough of my life ...
Name: Reviews, Length: 999, dtype: object

# Data Cleaning

In [9]:
from nltk.corpus import stopwords
import nltk
from nltk.stem  import WordNetLemmatizer

In [10]:
# stopwords=stopwords.words('english')
lemmatizer=WordNetLemmatizer()

In [11]:
x_review[4].split()

['Now',
 'I',
 'am',
 'getting',
 'angry',
 'and',
 'I',
 'want',
 'my',
 'damn',
 'pho.']

In [12]:
def clean_lammatize(x):
    corpus = []
    for i in range(0,len(x)):
        data = re.sub('[^a-zA-Z]',' ',x[i])
        data = data.lower()
        #split funcion is just converting string into list
        data = data.split()
        #applying the stemming on the message
        data = [lemmatizer.lemmatize(word) for word in data if word not in set(stopwords.words('english'))]
        #join function add all the list  item into one string separeted by space.
        data= ' '.join(data)
        corpus.append(data)
    return corpus

In [13]:
cleaned_data=clean_lammatize(x_review)

In [14]:
cleaned_data

['crust good',
 'tasty texture nasty',
 'stopped late may bank holiday rick steve recommendation loved',
 'selection menu great price',
 'getting angry want damn pho',
 'honeslty taste fresh',
 'potato like rubber could tell made ahead time kept warmer',
 'fry great',
 'great touch',
 'service prompt',
 'would go back',
 'cashier care ever say still ended wayyy overpriced',
 'tried cape cod ravoli chicken cranberry mmmm',
 'disgusted pretty sure human hair',
 'shocked sign indicate cash',
 'highly recommended',
 'waitress little slow service',
 'place worth time let alone vega',
 'like',
 'burrittos blah',
 'food amazing',
 'service also cute',
 'could care le interior beautiful',
 'performed',
 'right red velvet cake ohhh stuff good',
 'never brought salad asked',
 'hole wall great mexican street taco friendly staff',
 'took hour get food table restaurant food luke warm sever running around like totally overwhelmed',
 'worst salmon sashimi',
 'also combo like burger fry beer decent de

# Vectorization Feature Engineering (TF-IDF)

In [15]:
from sklearn.feature_extraction.text import TfidfVectorizer
cv=TfidfVectorizer()

In [16]:
# import pickle
# file=open(r'C:\Users\sures\OneDrive\Documents\ML_projects\tfidf.pkl','wb')
# pickle.dump(rfc,file)

In [17]:
x = cv.fit_transform(cleaned_data)
x = cv.transform(cleaned_data)

In [18]:
x.shape

(999, 1766)

In [19]:
def tfidf(s):
    x = cv.transform(s)
    return x



In [20]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.3,random_state=42)

In [21]:
x_train

<699x1766 sparse matrix of type '<class 'numpy.float64'>'
	with 3823 stored elements in Compressed Sparse Row format>

# Model building

In [22]:
from sklearn.naive_bayes import MultinomialNB
model=MultinomialNB()
model.fit(x_train,y_train)

In [23]:
y_pred = model.predict(x_test)

In [24]:
y_pred[0:20]

array([0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1],
      dtype=int64)

In [25]:
np.array(y_test[0:20])

array([0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1],
      dtype=int64)

In [26]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test,y_pred)

0.7366666666666667

In [27]:
import pickle
file=open(r'C:\Users\sures\OneDrive\Documents\ML_projects\sentiment.pkl','wb')
pickle.dump(model,file)

# Example

In [36]:
short=clean_lammatize([" service is bed and food is  also worst"])
print(short)

['service bed food also worst']


In [37]:
z=tfidf(short)
print(z)

  (0, 1743)	0.6193943436830255
  (0, 1375)	0.4151492730317161
  (0, 601)	0.37338207035096543
  (0, 26)	0.5518945167829977


In [38]:
z.shape

(1, 1766)

In [39]:
model.predict(z)

array([0], dtype=int64)