# CLASSIFYING AMAZON FOOD REVIEWS USING NAIVE BAYES

In [1]:
# Importing the libraries required for the dataset

import sqlite3
import pandas as pd
import numpy as np
import nltk
import string
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import confusion_matrix
from sklearn import metrics
from sklearn.metrics import roc_curve, auc
from nltk.stem.porter import PorterStemmer


In [2]:
# At this stage we will be importing the required sqlite dataset

con = sqlite3.connect('database.sqlite')

'''
Our objective is to check whether the review is positive or negative.

The dataset originally consists of reviews from 1 to 5. We will consider reviews which are rated as 4 and 5 to be positive, reviews 
which are rated 1 and 2 to be negative. As, we cannot draw any conclusions from review which is rated 3 star we will eliminate
all the reviews rated 3 star.

'''

# Filtering out the data w/o the 3 star reviews.

filtered_data = pd.read_sql_query("""SELECT * FROM Reviews WHERE Score != 3""", con) 

In [3]:
# Checking whether the data has been filtered properly

filtered_data['Score'].unique()

# We can see that there is no 3 star review in the data

array([5, 1, 4, 2], dtype=int64)

In [4]:
# As we have eliminated the data with 3 start reviews, we will label the remaining data(4 and 5 scores) as positive and negative(1 and 2 scores).

# Creating a function to label the data
def partition(x):
    if x < 3:
        return '0'
    return '1'


# Applying the labels to the data

actualScore = filtered_data['Score']
positiveNegative = actualScore.map(partition) 
filtered_data['Score'] = positiveNegative

In [5]:
# Checking the first few rows of the data

filtered_data.head(10)

# We can see that the score has been changed to positive and negative instead of 5,4,1,2

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,1,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,0,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,1,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...
3,4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,0,1307923200,Cough Medicine,If you are looking for the secret ingredient i...
4,5,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,1,1350777600,Great taffy,Great taffy at a great price. There was a wid...
5,6,B006K2ZZ7K,ADT0SRK1MGOEU,Twoapennything,0,0,1,1342051200,Nice Taffy,I got a wild hair for taffy and ordered this f...
6,7,B006K2ZZ7K,A1SP2KVKFXXRU1,David C. Sullivan,0,0,1,1340150400,Great! Just as good as the expensive brands!,This saltwater taffy had great flavors and was...
7,8,B006K2ZZ7K,A3JRGQVEQN31IQ,Pamela G. Williams,0,0,1,1336003200,"Wonderful, tasty taffy",This taffy is so good. It is very soft and ch...
8,9,B000E7L2R4,A1MZYO9TZK0BBI,R. James,1,1,1,1322006400,Yay Barley,Right now I'm mostly just sprouting this so my...
9,10,B00171APVA,A21BT40VZCCYT4,Carol A. Reed,0,0,1,1351209600,Healthy Dog Food,This is a very healthy dog food. Good for thei...


In [6]:
# Let us look at the shape of the data

filtered_data.shape

(525814, 10)

In [7]:
# Dropping any duplicates if they are present in the data

duplicates_dropped=filtered_data.drop_duplicates(subset={"UserId","ProfileName","Time","Text"}, keep='first', inplace=False)
duplicates_dropped.shape

(364173, 10)

In [8]:
# Eliminating the rows where helpfulness numerator is greator than the helfulness denominator

final=duplicates_dropped[duplicates_dropped.HelpfulnessNumerator<=duplicates_dropped.HelpfulnessDenominator]

In [9]:
# Taking a look at the shape of the data

final.shape

(364171, 10)

In [10]:
# We will select only the required columns of which we will filter 100000 reviews and assign them their labels and sort them wrt time

final_data = final[['ProductId','Time','Text','Score']]


In [11]:
# Checking the dimension of the data
final_data.shape


(364171, 4)

In [12]:
# Randomly selecting only 15000 reviews from the final_data

# First we will extract the values from the given dataframe

X=final_data.iloc[:,:].values

# randomly extracting 15000 reviews from the dataset

import random

n = 364171
m = 15000
p = m/n

sampled_data = [];

for i in range(0,n):
    if random.random() <= p:
        sampled_data.append(X[i,:])



In [13]:
# Assigning the extracted data to a dataframe

names = ['ProductId','Time','Text','Score']

sample = pd.DataFrame(sampled_data,columns= names)

In [14]:
# Checking the dimensions of the sampled data

sample.shape

(14754, 4)

In [15]:
# Checking the first few rows of the data

sample.head()

Unnamed: 0,ProductId,Time,Text,Score
0,B001GVISJM,1304467200,I love this candy. After weight watchers I ha...,1
1,B001EO5TPM,1215302400,Arrived in 6 days and were so stale i could no...,0
2,B0019CW0HE,1333929600,This food is great - all ages dogs. I have a ...,1
3,B003SE19UK,1324166400,I started buying this after I noticed my 1 yea...,1
4,B003SE19UK,1330992000,This cat food was recommended by my vet becaus...,1


In [16]:
# Now we will sort the data according to timestamp

sorted_data=sample.sort_values('Time', axis=0, ascending=True, inplace=False, kind='quicksort', na_position='last')

In [17]:
# Checking if the data has been sorted or not

sorted_data.head(15)

Unnamed: 0,ProductId,Time,Text,Score
4694,B00004S1C6,965779200,"These are easy to use, they do not make a mess...",1
6779,B00004RYGX,966297600,"This is such a great film, I don't even know h...",1
4689,B00006L2ZT,1036627200,"Well, maybe not &quot;the&quot; greatest, but ...",1
7175,B001O8NLV2,1038009600,"For me, when the days get colder nothing is as...",1
6480,B00005IX98,1046044800,After years of using Starbuck's decaf and regu...,1
11603,B0000CA4TK,1068422400,I live in exile in Denver now - I couldn't sur...,1
8988,B0000DIYUK,1069286400,"I like this flavor, but it kind of reminds me ...",1
11693,B0000DBN2F,1070582400,Blend a little apple cider mix into your Tazo ...,1
8775,B0000DIVUR,1070755200,My brother-in-law was thrilled with this gift ...,1
10245,B0000DBN1Q,1073433600,"""There is a garden overlooking the Yangtze Riv...",1


In [18]:
# We can see that the data has been sorted wrt time, now we will only consider the 'Text' and 'Score' columns henceforth

sorted_final = sorted_data[['Text','Score']]


In [19]:
# Checking the first few rows of the data to ensure we have the right data

sorted_final.head()

Unnamed: 0,Text,Score
4694,"These are easy to use, they do not make a mess...",1
6779,"This is such a great film, I don't even know h...",1
4689,"Well, maybe not &quot;the&quot; greatest, but ...",1
7175,"For me, when the days get colder nothing is as...",1
6480,After years of using Starbuck's decaf and regu...,1


In [20]:
# The next task is to clean the text data so that it can be fed to the model

# Checking if there are unknown elements in the data

# find sentences containing HTML tags

import re

i=0;
for sent in sorted_final['Text'].values:
    if (len(re.findall('<.*?>', sent))):
        print(i)
        print(sent)
        break;
    i += 1;   
    
# We can see that the data contains html tags, we will need to remove those

3
For me, when the days get colder nothing is as rewarding as a simple cup of hot tea. And for it's claimed immunity benefits, a basic green tea is a common pick for maintaining a healthy natural balance during the flu season. From previous experiences in tasting the Tazo brand, both of the bottled and boxed products, they have proven to be unsurpassed for quality and flavor. Once I've tried their teas they immediately became my drink of choice. <p>The Zen Green Tea Blend is a wonderful one that has only a few ingredients with no artificial anything. And thankfully, doesn't boast the addition of fortified vitamins in some senseless amount. It truly is an enlightening blend of green tea, spearmint, lemongrass and lemon verbena. Thus making it versatile refreshment for anytime of the day, whether it's right after meals or between meals, or just before bedtime. Generally light and mild tasting, but that will depend upon how long you steep it and if you add a sweetener of some form.<p>Inte

In [21]:
'''
We will perform the data cleaning steps on the text data.
For that we will import some packages for stopwords removal, word stemmatization and cleaning html and punctuation marks.
'''
import re
from nltk.corpus import stopwords

nltk.download('stopwords')

stop = set(stopwords.words('english')) #set of stopwords
sno = nltk.stem.SnowballStemmer('english') #initialising the snowball stemmer

def cleanhtml(sentence): #function to clean the word of any html-tags
    cleanr = re.compile('<.*?>')
    cleantext = re.sub(cleanr, ' ', sentence)
    return cleantext
def cleanpunc(sentence): #function to clean the word of any punctuation or special characters
    cleaned = re.sub(r'[?|!|\'|"|#]',r'',sentence)
    cleaned = re.sub(r'[.|,|)|(|\|/]',r' ',cleaned)
    return  cleaned

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\kulkarni\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [22]:
#Code for implementing step-by-step the checks mentioned in the pre-processing phase

i=0
str1=' '
final_string=[]
all_positive_words=[] # store words from +ve reviews here
all_negative_words=[] # store words from -ve reviews here.
s=''
for sent in sorted_final['Text'].values:
    filtered_sentence=[]
    #print(sent);
    sent=cleanhtml(sent) # remove HTMl tags
    for w in sent.split():
        for cleaned_words in cleanpunc(w).split():
            if((cleaned_words.isalpha()) & (len(cleaned_words)>2)):    
                if(cleaned_words.lower() not in stop):
                    s=(sno.stem(cleaned_words.lower())).encode('utf8')
                    filtered_sentence.append(s)
                    if (final['Score'].values)[i] == 'positive': 
                        all_positive_words.append(s) #list of all words used to describe positive reviews
                    if(final['Score'].values)[i] == 'negative':
                        all_negative_words.append(s) #list of all words used to describe negative reviews reviews
                else:
                    continue
            else:
                continue 
    #print(filtered_sentence)
    str1 = b" ".join(filtered_sentence) #final string of cleaned words
    #print("***********************************************************************")
    
    final_string.append(str1)
    i+=1

In [23]:
sorted_final['Cleaned_Text'] = final_string

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


# We will do the classification based on various representations of the text data i.e BoW, Tf-Idf


# Using the Bag of Words text representation to create our first Naive Bayes models

In [28]:
# Creating the BoW representation of the text

count_vect = CountVectorizer() #in scikit-learn
final_counts = count_vect.fit_transform(sorted_final['Cleaned_Text'].values).todense()

In [29]:
# Assigning the final_tf_idf data to 'X' variable

X = final_counts

In [30]:
# Assinging the score to 'y' variable

y = np.array(sorted_final['Score'])

# We will first split the data into train and test sets, then split the train data in to train and cross-validation sets

In [31]:
# Splitting the data into train test sets

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3) # Splitting train test with 70:30 ratio

# Splitting the train data into cross validation and test datasets

X_tr, X_CV, y_tr, y_CV = train_test_split(X_train, y_train, test_size=0.3) # Splitting train cross-val with 70:30 ratio


# We will be using the multinomialNB algorithm to classify the data as positive and negative. Here, we will use  'alpha' as the hyperparameter which will take various values to determine the best classifier depending upon the hyperparameter.

# We will detemine the best value of the hyperparameter based on the cross validation accuracy

In [33]:
# Importing the multinomial naive bayes algorithm
# Importing the library required to calcukate the accuracy


from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score


# We have taken the following values for 'alpha'
alpha_values = [0.00001, 0.0001, 0.001, 0.1, 1, 10, 100,1000]

for i in alpha_values:
    clf = MultinomialNB(alpha= i)
    clf.fit(X_tr, y_tr)
    pred = clf.predict(X_CV)
    
    # evaluate CV accuracy
    acc = accuracy_score(y_CV, pred, normalize=True) * float(100)
    print('\nCV accuracy for alpha = %.5f is %d%%' % (i, acc))





CV accuracy for alpha = 0.00001 is 86%

CV accuracy for alpha = 0.00010 is 86%

CV accuracy for alpha = 0.00100 is 86%

CV accuracy for alpha = 0.10000 is 87%

CV accuracy for alpha = 1.00000 is 87%

CV accuracy for alpha = 10.00000 is 84%

CV accuracy for alpha = 100.00000 is 84%

CV accuracy for alpha = 1000.00000 is 84%


# We can see that for the alpha value of 0.1 we get the maximum cross validation accuracy.

# Creating a naive bayes model with BoW text representation with an alpha value of 0.1

In [34]:
nb = MultinomialNB(alpha= 0.1)
nb.fit(X_tr,y_tr)
pred_acc = nb.predict(X_test)
acc = accuracy_score(y_test, pred_acc, normalize=True) * float(100)
print('\nTest accuracy for alpha = 1.0 is %d%%' % (acc))


Test accuracy for alpha = 1.0 is 87%


# We can see that we get a test accuracy of 87% for BoW representation of text data with alpha value of 0.1 for the naive bayes classifier.

# In the next stage we will use model evaluation techniques such as confusion matrix, precision, recall and f1-score to further evaluate the model

In [35]:
# Plotting the confusion matrix for our model

print(metrics.confusion_matrix(y_test, pred_acc))

[[ 425  281]
 [ 267 3454]]


# From the above confusion matrix we can conclude that:
# 425 values are True Negative
# 3454 values are True Positive
# 281 values are False Positive
# 267 values are False Negative

In [69]:
# Importing the libraries required to obtain the values of precision, recall and f1 score based on class
from sklearn.metrics import classification_report

print(classification_report(y_test,pred_acc))

             precision    recall  f1-score   support

          0       0.62      0.53      0.57       478
          1       0.91      0.94      0.93      2498

avg / total       0.87      0.87      0.87      2976



# From the above precision, recall and f1-score table we can draw the following conclusions:

# For Class 0(negative):

From the precision score we can say that of all the values that were predicted to belong to negative class, 62% actually belong to the negative class.

From the recall score we can say that of all the points that belong to negative class, 53% were classified belonging to negative class.

# For Class 1(positive):

From the precision score we can say that of all the values that were predicted to belong to positive class, 91% actually belong to positive class.

From the recall score we can say that of all the points that belong to positive class, 94% were classified belongin to positive class.

# The following conclusions can be drawn from the precision, recall and f1-score
# 
#
#

# In this section we will get the most important features depending upon the class which help us predict the class whether it be positive or negative

In [48]:
neg_class_prob_sorted = nb.feature_log_prob_[0, :].argsort()
pos_class_prob_sorted = nb.feature_log_prob_[1, :].argsort()

print('Important features for negative class prediction:\n')
print(np.take(count_vect.get_feature_names(), neg_class_prob_sorted[:150])) # This will show the top 150 features for negative class prediction

print('\nImportant features for positive class prediction:\n')
print(np.take(count_vect.get_feature_names(), pos_class_prob_sorted[:150])) # This will show the top 150 features for positive class prediction

Important features for negative class prediction:

['aahh' 'kee' 'keebler' 'keefer' 'keemun' 'keen' 'keenan' 'sooon' 'keeper'
 'keester' 'kefir' 'keg' 'keifir' 'kellog' 'sooohhh' 'kelp' 'kemper' 'ken'
 'kebab' 'kennel' 'keaton' 'kcal' 'karissa' 'karla' 'karlsson' 'soooooooo'
 'karo' 'soooooo' 'kashmir' 'kasia' 'kasugai' 'kat' 'katchup' 'kathryn'
 'katrina' 'katz' 'kauai' 'kavli' 'kazakhstan' 'sooooo' 'karen' 'kentucki'
 'kenyan' 'somet' 'kicker' 'someday' 'kickn' 'som' 'kiddi' 'kiddo' 'solv'
 'kielbasa' 'kikkoman' 'kili' 'kilimanjaro' 'solubl' 'solomon' 'kiln'
 'kilo' 'kim' 'somethign' 'kenya' 'someway' 'keyword' 'sooner' 'kerala'
 'kerig' 'kern' 'kernal' 'soo' 'kerrygold' 'sonni' 'kestekidi' 'soni'
 'song' 'sone' 'kevin' 'somth' 'keyboard' 'keychain' 'sommeli' 'kgharri'
 'kimbo' 'kaplowi' 'kanteen' 'juan' 'sorrento' 'judah' 'sorguhm'
 'judgement' 'judgment' 'judi' 'judici' 'jug' 'juggl' 'juguet' 'sorghum'
 'juicer' 'jule' 'julep' 'sorbitol' 'julia' 'jtg' 'sorbitiol' 'jsz'
 'sorta' 'so

# Using the TF-IDF text representation to create our Naive Bayes models

In [37]:
# In this stage we have initialized the tf-idf vectorizer and applied it to the text data which has been stored in the final_tf_idf vriable

tf_idf_vect = TfidfVectorizer()
final_tf_idf = tf_idf_vect.fit_transform(sorted_final['Cleaned_Text'].values).todense()

In [38]:
# Assigning the final_tf_idf data to 'X1' variable

X1 = final_tf_idf

In [39]:
# Assinging the score to 'y1' variable

y1 = np.array(sorted_final['Score'])

# We will first split the data into train and test sets, then split the train data in to train and cross-validation sets

In [40]:
# Splitting the data into train test sets

from sklearn.model_selection import train_test_split

X_train1, X_test1, y_train1, y_test1 = train_test_split(X1, y1, test_size=0.3) # Splitting train test with 70:30 ratio

# Splitting the train data into train and cross validation sets

X_tr1, X_CV1, y_tr1, y_CV1 = train_test_split(X_train1, y_train1, test_size=0.3) # Splitting train cross-val with 70:30 ratio

In [41]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

alpha = [0.00001, 0.0001, 0.001, 0.1, 1, 10, 100,1000]

for i in alpha:
    clf1 = MultinomialNB(alpha= i)
    clf1.fit(X_tr, y_tr)
    pred1 = clf1.predict(X_CV)
    
    # evaluate CV accuracy
    acc1 = accuracy_score(y_CV1, pred1, normalize=True) * float(100)
    print('\nCV accuracy for i = %.5f is %d%%' % (i, acc1))




CV accuracy for i = 0.00001 is 76%

CV accuracy for i = 0.00010 is 75%

CV accuracy for i = 0.00100 is 75%

CV accuracy for i = 0.10000 is 73%

CV accuracy for i = 1.00000 is 78%

CV accuracy for i = 10.00000 is 83%

CV accuracy for i = 100.00000 is 83%

CV accuracy for i = 1000.00000 is 83%


# We can see that for the alpha value of 10 we get the maximum cross validation accuracy.

In [42]:
nb1 = MultinomialNB(alpha= 10)
nb1.fit(X_tr1,y_tr1)
pred2 = nb.predict(X_test1)
acc2 = accuracy_score(y_test1, pred2, normalize=True) * float(100)
print('\nTest accuracy for alpha = 10 is %d%%' % (acc2))


Test accuracy for alpha = 0.1 is 89%


# We can see that we get a test accuracy of 89% for TF-IDF representation of text data with alpha value of 10 for the naive bayes classifier.

# In the next stage we will use model evaluation techniques such as confusion matrix, precision, recall and f1-score to further evaluate the model

In [43]:
from sklearn.metrics import classification_report

print(classification_report(y_test1,pred2))

             precision    recall  f1-score   support

          0       0.89      0.42      0.57       705
          1       0.90      0.99      0.94      3722

avg / total       0.90      0.90      0.88      4427



# From the above precision, recall and f1-score table we can draw the following conclusions:

# For Class 0(negative):

From the precision score we can say that of all the values that were predicted to belong to negative class, 89% actually belong to negative class.

From the recall score we can say that of all the points that belong to negative class, 42% were classified belonging to negative class.

# For Class 1(positive):

From the precision score we can say that of all the values that were predicted to belong to positive class, 90% actually belong to positive class.

From the recall score we can say that of all the points that belong to positive class, 99% were classified belonging to positive class.


In [44]:
# Confusion matrix

print(metrics.confusion_matrix(y_test1, pred2))

[[ 296  409]
 [  37 3685]]


# From the above confusion matrix we can conclude that:
# 296 values are True Negative
# 3685 values are True Positive
# 409 values are False Positive
# 37 values are False Negative

# In this section we will get the most important features depending upon the class which help us predict the class whether it be positive or negative

In [50]:
neg_class_prob_sorted1 = nb1.feature_log_prob_[0, :].argsort()
pos_class_prob_sorted1 = nb1.feature_log_prob_[1, :].argsort()

print('Important features for negative class prediction:\n')
print(np.take(tf_idf_vect.get_feature_names(), neg_class_prob_sorted1[:150])) # This will show the top 150 features for negative class prediction

print('\nImportant features for positive class prediction:\n')
print(np.take(tf_idf_vect.get_feature_names(), pos_class_prob_sorted1[:150])) # This will show the top 150 features for positive class prediction

Important features for negative class prediction:

['aahh' 'odin' 'odorless' 'odour' 'odyssey' 'oem' 'oetker' 'offal'
 'offbeat' 'offens' 'odifer' 'officinal' 'ofter' 'ogan' 'oh' 'ohara' 'ohh'
 'ohio' 'oiko' 'oild' 'ointment' 'oftentim' 'oder' 'odel' 'odea' 'oblong'
 'obnoxi' 'obscen' 'obsolet' 'obsolut' 'obssess' 'ocass' 'ocassion'
 'occais' 'occasiona' 'occassion' 'occate' 'occuoi' 'occupi' 'occuppi'
 'oceanfish' 'ocha' 'octan' 'octav' 'octavia' 'octob' 'oist' 'okc' 'okla'
 'oklahoma' 'omnipres' 'ona' 'oncologist' 'onecup' 'oneil' 'ongo' 'onhand'
 'oniion' 'onlyrec' 'onscreen' 'onth' 'ontim' 'oohh' 'oomph' 'oonc' 'ooo'
 'oooh' 'ooooh' 'oooooo' 'oopen' 'opertun' 'omni' 'oblig' 'omlett'
 'omgood' 'okonomiyaki' 'olak' 'olbus' 'olday' 'oldi' 'ole' 'olean'
 'oleoresin' 'olio' 'oliven' 'oliveri' 'olivio' 'olli' 'oma' 'omaha'
 'omami' 'omega' 'omelet' 'omelett' 'omerga' 'omgi' 'omit' 'opiat'
 'objection' 'obey' 'noy' 'nozzl' 'nsa' 'nuanc' 'nubbi' 'nuclear' 'nudg'
 'nueva' 'nuf' 'noxious' 'n

# For this problem we were only used 2 types representation of the text data i.e Bow and TF-IDF. 

# We can see that for TF_IDF  representation of the data for alpha value of 10 we get maximum accuracy of 89%.