In [1]:
#Importing all the neccessary libraries
%matplotlib inline
import sqlite3
import pandas as pd
import numpy as np
import nltk
import string
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import TfidfTransformer, TfidfVectorizer, CountVectorizer
import sklearn.metrics as metrics
from sklearn.metrics import confusion_matrix, roc_curve, auc, precision_score, recall_score, f1_score
from nltk.stem.porter import PorterStemmer
import string
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from gensim.models import Word2Vec, KeyedVectors
import pickle
import warnings
warnings.filterwarnings("ignore")
from sklearn import datasets, neighbors
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from collections import Counter
from matplotlib.colors import ListedColormap
#import scikitplot.metrics as sciplot
from sklearn.metrics import accuracy_score
import math
import nltk
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import TimeSeriesSplit, GridSearchCV

##### Use below link to download the dataset
##### https://www.kaggle.com/datasets/snap/amazon-fine-food-reviews

#### The immediate code block below does the following things :

1. Load the Amazon dataset.
2. Classify the reviews initially based on their score rating and give them a 'Positve' or a 'Negative' tag.
3. Remove duplicate/redundant datas.
4. Get an idea of how much percentage data were actually duplicates.
5. Plot a histogram which will display the distribution of the number of positive and negative reviews after de-duplication.

###### NOTE : If we dont' clean the data and feed them to an ML system, it basically means we are throwing in a lot of garbage data to the ML system. If we give it garbage, it will give us garbage back. So it's utmost important to clean the data before proceeding.

In [4]:
filtered_data=pd.read_csv("D:\\NLP\\SENTIMENT ANALYSIS\\Reviews.csv")

In [5]:
filtered_data.head()

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,5,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,1,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,4,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...
3,4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,2,1307923200,Cough Medicine,If you are looking for the secret ingredient i...
4,5,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,5,1350777600,Great taffy,Great taffy at a great price. There was a wid...


In [6]:
#Give reviews with Score > 3 a 'Positive' tag, and reviews with a score < 3 a 'Negative' tag.
filtered_data['SentimentPolarity'] = filtered_data['Score'].apply(lambda x : 'Positive' if x > 3 else 'Negative')
filtered_data['Class_Labels'] = filtered_data['SentimentPolarity'].apply(lambda x : 1 if x == 'Positive' else 0)

In [7]:
filtered_data.head()

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text,SentimentPolarity,Class_Labels
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,5,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...,Positive,1
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,1,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...,Negative,0
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,4,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...,Positive,1
3,4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,2,1307923200,Cough Medicine,If you are looking for the secret ingredient i...,Negative,0
4,5,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,5,1350777600,Great taffy,Great taffy at a great price. There was a wid...,Positive,1


In [8]:
filtered_data.shape

(568454, 12)

In [9]:
print("The number of positive and negative reviews before the removal of duplicate data.")
print(filtered_data["SentimentPolarity"].value_counts())

The number of positive and negative reviews before the removal of duplicate data.
SentimentPolarity
Positive    443777
Negative    124677
Name: count, dtype: int64


In [10]:
#Removing duplicate entries based on past knowledge.
filtered_duplicates=filtered_data.drop_duplicates(subset={"UserId","ProfileName","Time","Text"}, keep='first', inplace=True)


In [11]:
print("The number of positive and negative reviews after the removal of duplicate data.")
print(filtered_data["SentimentPolarity"].value_counts())

The number of positive and negative reviews after the removal of duplicate data.
SentimentPolarity
Positive    307056
Negative     86877
Name: count, dtype: int64


In [12]:
#Removing the entries where HelpfulnessNumerator > HelpfulnessDenominator.
final_data=filtered_data[filtered_data.HelpfulnessNumerator <= filtered_data.HelpfulnessDenominator]

In [13]:
final_data["SentimentPolarity"].value_counts()

SentimentPolarity
Positive    307054
Negative     86877
Name: count, dtype: int64

#### In this code block :

1. I am creating a copy of the final_data dataset called 'sampled_dataset' by dropping the unwanted columns that we don't need for this problem.
2. Sorting the data according to time, such that the oldest reviews are displayed at the top and the latest reviews are displayed at the bottom.
3. Displaying information about the number of postive and negative reviews in the sampled dataset, using a Histogram.

In [15]:
#Dropping unwanted columns for now.
sampled_dataset=final_data.drop(labels=['Id','ProductId', 'UserId', 'Score', 'ProfileName','HelpfulnessNumerator', 'HelpfulnessDenominator','Summary'], axis=1)
print("The shape of the sampled dataset after dropping unwanted columns : ", sampled_dataset.shape)
sampled_dataset.head()

The shape of the sampled dataset after dropping unwanted columns :  (393931, 4)


Unnamed: 0,Time,Text,SentimentPolarity,Class_Labels
0,1303862400,I have bought several of the Vitality canned d...,Positive,1
1,1346976000,Product arrived labeled as Jumbo Salted Peanut...,Negative,0
2,1219017600,This is a confection that has been around a fe...,Positive,1
3,1307923200,If you are looking for the secret ingredient i...,Negative,0
4,1350777600,Great taffy at a great price. There was a wid...,Positive,1


In [16]:
#Sorting data according to Time in ascending order => Time Based Splitting Step 1.
sampled_dataset=sampled_dataset.sort_values('Time', axis=0, ascending=False, inplace=False, kind='quicksort', na_position='last')
sampled_dataset = sampled_dataset.reset_index()
sampled_dataset=sampled_dataset.drop(labels=['index'], axis=1)

In [17]:
#Display distribution of Postive and Negative reviews in a bar graph
sampled_dataset["SentimentPolarity"].value_counts().plot(kind='bar',color=['green','red'],title='Distribution Of Positive and Negative reviews after De-Duplication.',figsize=(5,5))

<Axes: title={'center': 'Distribution Of Positive and Negative reviews after De-Duplication.'}, xlabel='SentimentPolarity'>

#### In this code block :

1. We define two functions which will remove the HTML tags and punctuations from each review.
2. At the end of this code block, each review will contain texts which will only contain alphabetical strings. 
3. We will apply techniques such as stemming and stopwords removal.
3. We will create two columns in the sampled dataset - 'CleanedText' and 'RemovedHTML'.
4. 'CleanedText' column will basically contain the data corpus after stemming the each reviews and removing stopwords from each review. We will use this for our Bag of Word model.
5. 'RemovedHTML' column will contain the data corpus from which only the HTML tags and punctuations are removed. We will use this column for our TF-IDF model, Average Word2Vec model and TF-IDF weighted average Word2Vec model.
6. Store the final table in a dataset called 'sampled_dataset' for future use.

In [19]:
'''Data Cleaning Stage. Clean each review from the sampled Amazon Dataset.'''
#Data Cleaning Stage. Clean each review from the sampled Amazon Dataset

#Function to clean html tags from a sentence
def removeHtml(sentence): 
    pattern = re.compile('<.*?>')
    cleaned_text = re.sub(pattern,' ',sentence)
    return cleaned_text


In [20]:
#Function to keep only words containing letters A-Z and a-z. This will remove all punctuations, special characters etc.
def removePunctuations(sentence):
    cleaned_text  = re.sub('[^a-zA-Z]',' ',sentence)
    return cleaned_text


In [21]:
#Stemming and stopwords removal
import re
from nltk.stem.snowball import SnowballStemmer
sno = SnowballStemmer(language='english')

In [22]:
#Removing the word 'not' from stopwords
default_stopwords = set(stopwords.words('english'))
remove_not = set(['not'])
custom_stopwords = default_stopwords - remove_not

In [23]:
#custom_stopwords

In [24]:
#Building a data corpus by removing all stopwords except 'not'. Because 'not' can be an important estimator to differentiate between positive and negative reviews.    
count=0                   #Iterator to iterate through the list of reviews and check if a given review belongs to the positive or negative class
string=' '    
data_corpus=[]
all_positive_words=[] #Store all the relevant words from Positive reviews
all_negative_words=[] #Store all the relevant words from Negative reviews
stemed_word=''
for review in sampled_dataset['Text'].values:
    filtered_sentence=[]
    sentence=removeHtml(review) #Remove HTMl tags
    for word in sentence.split():
        for cleaned_words in removePunctuations(word).split():
            if((cleaned_words.isalpha()) & (len(cleaned_words)>2)): #Checking if a word consists of only alphabets + word length is greater than 2.    
                if(cleaned_words.lower() not in custom_stopwords):
                    stemed_word=(sno.stem(cleaned_words.lower()))
                    filtered_sentence.append(stemed_word)
                    if (sampled_dataset['SentimentPolarity'].values)[count] == 'Positive': 
                        all_positive_words.append(stemed_word) #List of all the relevant words from Positive reviews
                    if(sampled_dataset['SentimentPolarity'].values)[count] == 'Negative':
                        all_negative_words.append(stemed_word) #List of all the relevant words from Negative reviews
                else:
                    continue
            else:
                continue 
    string = " ".join(filtered_sentence) #Final string of cleaned words    
    data_corpus.append(string) #Data corpus contaning cleaned reviews from the whole dataset
    count+=1
    
    
print("The length of the data corpus is : {}".format(len(data_corpus)))

The length of the data corpus is : 393931


In [25]:
#data_corpus

In [26]:
#Adding a column of CleanedText to the table final which stores the data_corpus after pre-processing the reviews 
sampled_dataset['CleanedText']=data_corpus 

In [27]:
sampled_dataset.head()

Unnamed: 0,Time,Text,SentimentPolarity,Class_Labels,CleanedText
0,1351209600,Nespresso makes GREAT coffee and GREAT machine...,Negative,0,nespresso make great coffe great machin nespre...
1,1351209600,"I love these ginger candy, tastes like ginger ...",Positive,1,love ginger candi tast like ginger noth ad nat...
2,1351209600,This product is a great alternative to peanut ...,Positive,1,product great altern peanut butter butter like...
3,1351209600,This is the best coffee ever! Wish I could ord...,Positive,1,best coffe ever wish could order box time thru...
4,1351209600,You can taste the butter. The peanuts are fre...,Positive,1,tast butter peanut fresh piec see peanut britt...


In [28]:
# Finding most frequently occuring Positive and Negative words 
freq_positive=nltk.FreqDist(all_positive_words)
freq_negative=nltk.FreqDist(all_negative_words)
print("Most Common Positive Words : ",freq_positive.most_common(20))
print("Most Common Negative Words : ",freq_negative.most_common(20))

Most Common Positive Words :  [('not', 147653), ('like', 141045), ('tast', 131300), ('good', 113838), ('flavor', 111637), ('love', 107726), ('great', 104572), ('use', 104341), ('one', 97734), ('product', 92278), ('tri', 87155), ('tea', 84870), ('coffe', 79785), ('make', 75417), ('get', 72271), ('food', 65614), ('time', 56218), ('would', 55796), ('buy', 54320), ('realli', 52813)]
Most Common Negative Words :  [('not', 84839), ('tast', 55662), ('like', 53877), ('product', 41117), ('flavor', 36292), ('one', 32078), ('would', 29054), ('good', 28613), ('tri', 27443), ('use', 25931), ('coffe', 25255), ('get', 22638), ('buy', 20036), ('tea', 20026), ('food', 19221), ('order', 17998), ('much', 16861), ('make', 16592), ('realli', 16441), ('box', 16081)]


In [29]:
sampled_dataset = sampled_dataset[['Time','CleanedText','Class_Labels']]

In [30]:
print(sampled_dataset.shape)
sampled_dataset.head()

(393931, 3)


Unnamed: 0,Time,CleanedText,Class_Labels
0,1351209600,nespresso make great coffe great machin nespre...,0
1,1351209600,love ginger candi tast like ginger noth ad nat...,1
2,1351209600,product great altern peanut butter butter like...,1
3,1351209600,best coffe ever wish could order box time thru...,1
4,1351209600,tast butter peanut fresh piec see peanut britt...,1


In [31]:
sampled_dataset['Class_Labels'].value_counts()

Class_Labels
1    307054
0     86877
Name: count, dtype: int64

In [32]:
#Split the data set into train and test sets. The test set should be unseen. Time Based Splitting Step 2.
#The top old 80% data will get into the train set. The latest 20% data will get into the test set.
def splitting_data(data):
    

    X = sampled_dataset['CleanedText']
    y = sampled_dataset['Class_Labels']
    
    return X

In [33]:
#X

In [34]:
data1 = splitting_data(data1)

NameError: name 'data1' is not defined

In [None]:
#y

In [None]:
split = math.floor(0.8*len(X))
X_train = X[0:split,] ; y_train = y[0:split,]

X_test = X[split:,] ; y_test = y[split:,]

In [None]:
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

In [None]:
#Initializing the BOW constructor
cv_object = CountVectorizer().fit(X_train)

In [None]:
#Creating the BOW matrix from cleaned data corpus. Only 'not' is preserved from stopwords. This is done for both train and test Vectors.
print("\nCreating the BOW vectors using the cleaned corpus")
X_train_vectors = cv_object.transform(X_train)
X_test_vectors = cv_object.transform(X_test)

In [None]:
X_train_vectors.shape

In [None]:
X_train

In [None]:
X_test_vectors.shape

In [None]:
X_train_vectors

In [None]:
model = MultinomialNB()

In [None]:
# Train the model using the training sets
model.fit(X_train_vectors,y_train)

In [None]:
#Predict the response for test dataset
y_pred = model.predict(X_test_vectors)

In [None]:
# Model Accuracy, how often is the classifier correct?
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))

### next steps to try tfidf and build the model
#### keras tokenizer and build the Lstm model