In [1]:
import pandas as pd
import numpy as np
import nltk
nltk.download('wordnet', quiet = True)
import re
from bs4 import BeautifulSoup
import contractions
import warnings
warnings.filterwarnings('ignore')
nltk.download('averaged_perceptron_tagger', quiet = True)

True

In [None]:
!pip install 

In [60]:
from platform import python_version

print(python_version())

3.8.8


## Read Data

In [2]:
# Read the data while skipping bad lines.

dataframe = pd.read_table("amazon_reviews_us_Beauty_v1_00.tsv", error_bad_lines = False, warn_bad_lines=False)

In [3]:
dataframe

Unnamed: 0,marketplace,customer_id,review_id,product_id,product_parent,product_title,product_category,star_rating,helpful_votes,total_votes,vine,verified_purchase,review_headline,review_body,review_date
0,US,1797882,R3I2DHQBR577SS,B001ANOOOE,2102612,The Naked Bee Vitmin C Moisturizing Sunscreen ...,Beauty,5,0.0,0.0,N,Y,Five Stars,"Love this, excellent sun block!!",2015-08-31
1,US,18381298,R1QNE9NQFJC2Y4,B0016J22EQ,106393691,"Alba Botanica Sunless Tanning Lotion, 4 Ounce",Beauty,5,0.0,0.0,N,Y,Thank you Alba Bontanica!,The great thing about this cream is that it do...,2015-08-31
2,US,19242472,R3LIDG2Q4LJBAO,B00HU6UQAG,375449471,"Elysee Infusion Skin Therapy Elixir, 2oz.",Beauty,5,0.0,0.0,N,Y,Five Stars,"Great Product, I'm 65 years old and this is al...",2015-08-31
3,US,19551372,R3KSZHPAEVPEAL,B002HWS7RM,255651889,"Diane D722 Color, Perm And Conditioner Process...",Beauty,5,0.0,0.0,N,Y,GOOD DEAL!,I use them as shower caps & conditioning caps....,2015-08-31
4,US,14802407,RAI2OIG50KZ43,B00SM99KWU,116158747,Biore UV Aqua Rich Watery Essence SPF50+/PA+++...,Beauty,5,0.0,0.0,N,Y,this soaks in quick and provides a nice base f...,This is my go-to daily sunblock. It leaves no ...,2015-08-31
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5094302,US,50113639,RZ7RZ02MTP4SL,B000050B70,185454094,Conair NE150NSCS Cordless Nose and Ear Hair Tr...,Beauty,5,10.0,10.0,N,N,Great Little Grooming Tool,After watching my Dad struggle with his scisso...,2000-11-12
5094303,US,52940456,R2IRC0IZ8YCE5T,B000050FF2,678848064,Homedics Envirascape Sound Spa Alarm Clock Radio,Beauty,3,23.0,23.0,N,N,Not bad for the price,"Like most sound machines, the sounds choices a...",2000-11-07
5094304,US,47587881,R1U4ZSXOD228CZ,B000050B6U,862195513,Conair Instant Heat Curling Iron,Beauty,5,89.0,97.0,N,N,Best Curling Iron Ever,I bought this product because it indicated 30 ...,2000-11-02
5094305,US,53047750,R3SFJLZE09URWM,B000050FDE,195242894,Oral-B Professional Care 1000 Power Toothbrush,Beauty,5,10.0,10.0,N,N,"The best electric toothbrush ever, REALLY!",We have used Oral-B products for 15 years; thi...,2000-11-01


## Keep Reviews and Ratings

In [4]:
reviews_and_ratings = dataframe[["review_body", "star_rating"]]

In [5]:
# Convert string to numerical values ("1.0" -> 1.0) and mark as None if the value is invalid.

reviews_and_ratings.loc[: , "star_rating"] = pd.to_numeric(reviews_and_ratings["star_rating"], errors = 'coerce', downcast = 'integer')

In [6]:
# Drop null values.

reviews_and_ratings = reviews_and_ratings.dropna(how = "any")

In [7]:
reviews_and_ratings

Unnamed: 0,review_body,star_rating
0,"Love this, excellent sun block!!",5.0
1,The great thing about this cream is that it do...,5.0
2,"Great Product, I'm 65 years old and this is al...",5.0
3,I use them as shower caps & conditioning caps....,5.0
4,This is my go-to daily sunblock. It leaves no ...,5.0
...,...,...
5094302,After watching my Dad struggle with his scisso...,5.0
5094303,"Like most sound machines, the sounds choices a...",3.0
5094304,I bought this product because it indicated 30 ...,5.0
5094305,We have used Oral-B products for 15 years; thi...,5.0


 ## We form three classes and select 20000 reviews randomly from each class.



In [8]:
ratings_dict = {1:1, 2:1, 3:2, 4:3, 5:3}

In [9]:
# Map the ratings to appropriate classes.

reviews_and_ratings.loc[:,"class"] = reviews_and_ratings["star_rating"].map(ratings_dict)

In [10]:
# Randomly sample 20000 rows from each class.

class_1 = reviews_and_ratings[reviews_and_ratings["class"] == 1].sample(20000)
class_2 = reviews_and_ratings[reviews_and_ratings["class"] == 2].sample(20000)
class_3 = reviews_and_ratings[reviews_and_ratings["class"] == 3].sample(20000)

In [11]:
balanced_dataset = pd.concat([class_1, class_2, class_3], axis = 0)

In [12]:
balanced_dataset

Unnamed: 0,review_body,star_rating,class
4965866,You'll need heavy duty shears to open packages...,1.0,1
2225032,It didn't take the brassiness out of my hair a...,1.0,1
3826255,I have a suspicion that this brush may be the ...,2.0,1
4812345,I was so thrilled to find a cologne that has b...,1.0,1
2084719,While the toothbrush does an excellent job cle...,1.0,1
...,...,...,...
3386104,The picture is deceptive in that you only get ...,4.0,3
4127620,Most of these colors are very natural. A lot o...,4.0,3
2404233,it does work.,4.0,3
279257,Exactly what I expected,5.0,3


In [13]:
balanced_dataset["review_length"] = [len(str(x)) for x in balanced_dataset["review_body"]]

In [14]:
# Calculate the average length of the reviews before cleaning.

length_before_cleaning = balanced_dataset["review_length"].mean()

# Data Cleaning



In [15]:
# Convert text to lowercase
balanced_dataset["review_body"] = balanced_dataset["review_body"].str.lower()

In [16]:
# Remove HTML tags
balanced_dataset["review_body"] = [re.sub('<[^<]+?>', '', str(x)) for x in balanced_dataset["review_body"]]

In [17]:
# Remove URLs
balanced_dataset["review_body"] =  [re.sub(r"http\S+","", str(x)) for x in balanced_dataset["review_body"]]

In [18]:
# Expand contractions
balanced_dataset["review_body"] = [contractions.fix(str(x)) for x in balanced_dataset["review_body"]]

In [19]:
# Remove non-alphabetical characters
balanced_dataset["review_body"] = [re.sub(r"[^a-zA-Z ]", "", str(x)) for x in balanced_dataset["review_body"]]

In [20]:
# Remove excess spaces
balanced_dataset["review_body"] = balanced_dataset["review_body"].replace("\s+", " ", regex = True).str.strip()

In [21]:
balanced_dataset["review_length"] = [len(str(x)) for x in balanced_dataset["review_body"]]

In [22]:
# Calculate the average length of the reviews after cleaning.

length_after_cleaning = balanced_dataset["review_length"].mean()

In [23]:
print(str(length_before_cleaning) + "," + str(length_after_cleaning))

269.7167333333333,258.84695


# Pre-processing

## remove the stop words 

In [24]:
nltk.download('stopwords', quiet = True)
from nltk.corpus import stopwords
stop = stopwords.words("english")

In [25]:
balanced_dataset["review_body_no_stopwords"] = balanced_dataset["review_body"].apply(lambda x: ' '.join([word for word in x.split() if word not in stop]))

## perform lemmatization  

In [26]:
from nltk.stem import WordNetLemmatizer

In [27]:
# Function to convert nltk pos tags into tags that WordNetLemmatizer can understand.

def nltk_pos_converter(tag):
    if tag.startswith("J"):
        return "a"
    elif tag.startswith("V"):
        return "v"
    elif tag.startswith("R"):
        return "r"
    else:
        return "n"

In [28]:
# Create another column for text without stopwords.

lemmatizer = WordNetLemmatizer()
balanced_dataset["review_body_no_stopwords"] = balanced_dataset["review_body_no_stopwords"].apply(lambda x: ' '.join([lemmatizer.lemmatize(word, pos = nltk_pos_converter(tag)) for word, tag in nltk.pos_tag(x.split())]))

In [29]:
# Lemmatize the text with stopwords.

lemmatizer = WordNetLemmatizer()
balanced_dataset["review_body"] = balanced_dataset["review_body"].apply(lambda x: ' '.join([lemmatizer.lemmatize(word, pos = nltk_pos_converter(tag)) for word, tag in nltk.pos_tag(x.split())]))

In [30]:
balanced_dataset

Unnamed: 0,review_body,star_rating,class,review_length,review_body_no_stopwords
4965866,you will need heavy duty shear to open package...,1.0,1,516,need heavy duty shear open package like packag...
2225032,it do not take the brassiness out of my hair a...,1.0,1,445,take brassiness hair infact add brassiness can...
3826255,i have a suspicion that this brush may be the ...,2.0,1,436,suspicion brush may recent breakage experience...
4812345,i be so thrilled to find a cologne that have b...,1.0,1,472,thrill find cologne discountinued time even am...
2084719,while the toothbrush do an excellent job clean...,1.0,1,253,toothbrush excellent job clean teeth manufactu...
...,...,...,...,...,...
3386104,the picture be deceptive in that you only get ...,4.0,3,566,picture deceptive get little guy arrive adorab...
4127620,most of these color be very natural a lot of b...,4.0,3,183,color natural lot brown grey black white vibra...
2404233,it do work,4.0,3,12,work
279257,exactly what i expect,5.0,3,23,exactly expect


In [31]:
balanced_dataset["review_length"] = [len(str(x)) for x in balanced_dataset["review_body_no_stopwords"]]

In [32]:
# Calculate the average length of reviews after preprocessing (Stopwords removed).

length_after_preprocessing = balanced_dataset["review_length"].mean()

In [33]:
print(str(length_after_cleaning) + "," + str(length_after_preprocessing))

258.84695,151.47631666666666


In [34]:
balanced_dataset["review_length"] = [len(str(x)) for x in balanced_dataset["review_body"]]

In [35]:
# Calculate the average length of reviews after preprocessing (Stopwords retained).

length_after_preprocessing = balanced_dataset["review_length"].mean()

In [36]:
print(str(length_after_cleaning) + "," + str(length_after_preprocessing))

258.84695,248.88255


# TF-IDF Feature Extraction

In [37]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [38]:
tfidf_vectorizer = TfidfVectorizer(min_df = 0.0001, max_df = 0.5, ngram_range = (1,3))
tfidf_features = tfidf_vectorizer.fit_transform(balanced_dataset["review_body"])

In [39]:
from sklearn.model_selection import train_test_split

In [40]:
X = tfidf_features
y = balanced_dataset["class"]

In [41]:
# Datasets with stopwords.

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)

In [42]:
tfidf_vectorizer = TfidfVectorizer(min_df = 0.0001, max_df = 0.5, ngram_range = (1,3))
tfidf_features_without_stopwords = tfidf_vectorizer.fit_transform(balanced_dataset["review_body_no_stopwords"])

In [43]:
X = tfidf_features_without_stopwords

In [44]:
# Datasets without stopwords.

X_train_wos, X_test_wos, y_train_wos, y_test_wos = train_test_split(X, y, test_size = 0.2)

# Perceptron

In [45]:
# Function to calculate the metrics and print them.

def metrics_calculator(y_test, y_pred):
    class_1_cm = dict()
    class_2_cm = dict()
    class_3_cm = dict()
    
    confusion_matrix = [[0]*3 for i in range(3)]
    
    for i in range(len(y_test)):
        if y_test[i] == 1:
            if y_pred[i] == 1:
                confusion_matrix[0][0] += 1
            elif y_pred[i] == 2:
                confusion_matrix[0][1] += 1
            else:
                confusion_matrix[0][2] += 1
        elif y_test[i] == 2:
            if y_pred[i] == 1:
                confusion_matrix[1][0] += 1
            elif y_pred[i] == 2:
                confusion_matrix[1][1] += 1
            else:
                confusion_matrix[1][2] += 1
        else:
            if y_pred[i] == 1:
                confusion_matrix[2][0] += 1
            elif y_pred[i] == 2:
                confusion_matrix[2][1] += 1
            else:
                confusion_matrix[2][2] += 1
        
    class_1_tp = confusion_matrix[0][0]
    class_1_tn = confusion_matrix[1][1] + confusion_matrix[1][2] + confusion_matrix[2][1] + confusion_matrix[2][2]
    class_1_fp = confusion_matrix[1][0] + confusion_matrix[2][0]
    class_1_fn = confusion_matrix[0][1] + confusion_matrix[0][2]
    
    class_2_tp = confusion_matrix[1][1]
    class_2_tn = confusion_matrix[0][0] + confusion_matrix[0][2] + confusion_matrix[2][0] + confusion_matrix[2][2]
    class_2_fp = confusion_matrix[0][1] + confusion_matrix[2][1]
    class_2_fn = confusion_matrix[1][0] + confusion_matrix[1][2]
    
    class_3_tp = confusion_matrix[2][2]
    class_3_tn = confusion_matrix[0][0] + confusion_matrix[0][1] + confusion_matrix[1][0] + confusion_matrix[1][1]
    class_3_fp = confusion_matrix[0][2] + confusion_matrix[1][2]
    class_3_fn = confusion_matrix[2][0] + confusion_matrix[2][1]
    
    class_1_precision = (class_1_tp) / (class_1_tp + class_1_fp)
    class_1_recall = (class_1_tp) / (class_1_tp + class_1_fn)
    class_1_f1_score = 2 * class_1_precision * class_1_recall / (class_1_precision + class_1_recall)
    
    class_2_precision = (class_2_tp) / (class_2_tp + class_2_fp)
    class_2_recall = (class_2_tp) / (class_2_tp + class_2_fn)
    class_2_f1_score = 2 * class_2_precision * class_2_recall / (class_2_precision + class_2_recall)
    
    class_3_precision = (class_3_tp) / (class_3_tp + class_3_fp)
    class_3_recall = (class_3_tp) / (class_3_tp + class_3_fn)
    class_3_f1_score = 2 * class_3_precision * class_3_recall / (class_3_precision + class_3_recall)
    
    print(str(class_1_precision) + "," + str(class_1_recall) + "," + str(class_1_f1_score))  
    print(str(class_2_precision) + "," + str(class_2_recall) + "," + str(class_2_f1_score))  
    print(str(class_3_precision) + "," + str(class_3_recall) + "," + str(class_3_f1_score))  
    
    average_precision = (class_1_precision + class_2_precision + class_3_precision)/3
    average_recall = (class_1_recall + class_2_recall + class_3_recall)/3
    average_f1_score = (class_1_f1_score + class_2_f1_score + class_3_f1_score)/3
    
    print(str(average_precision) + "," + str(average_recall) + "," + str(average_f1_score))

In [46]:
from sklearn.linear_model import Perceptron

In [47]:
# Perceptron without stopwords.

perceptron = Perceptron(alpha = 0.0001, tol = 1e-3)
perceptron.fit(X_train_wos, y_train_wos)
y_pred_wos = perceptron.predict(X_test_wos)
metrics_calculator(y_test_wos.values, y_pred_wos)

# 0.5979827089337176,0.6264150943396226,0.6118687799483966
# 0.5076035658101731,0.49137055837563454,0.4993551715243745
# 0.6954251616111388,0.6847001223990208,0.6900209695325028
# 0.6003371454516765,0.6008285917047593,0.6004149736684247

0.5979827089337176,0.6264150943396226,0.6118687799483966
0.5076035658101731,0.49137055837563454,0.4993551715243745
0.6954251616111388,0.6847001223990208,0.6900209695325028
0.6003371454516765,0.6008285917047593,0.6004149736684247


In [48]:
# Perceptron with stopwords.

perceptron = Perceptron(alpha = 0.0001, tol = 1e-3)
perceptron.fit(X_train, y_train)
y_pred = perceptron.predict(X_test)
metrics_calculator(y_test.values, y_pred)

0.6828846628797234,0.688667496886675,0.685763888888889
0.5745800952619704,0.5727136431784108,0.5736453510198974
0.773346794548208,0.7692693949284459,0.7713027061044683
0.6769371842299673,0.6768835116645106,0.6769039820044181


# SVM

In [49]:
from sklearn.svm import LinearSVC
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

In [50]:
# Linear SVC without scaling and without stopwords.

lsvc = LinearSVC(C = 1.0, tol = 1e-3)
lsvc.fit(X_train_wos, y_train_wos)
y_pred_wos = lsvc.predict(X_test_wos)
metrics_calculator(y_test_wos.values, y_pred_wos)

# 0.662357036300348,0.670188679245283,0.6662498436913843
# 0.577438370846731,0.5469543147208121,0.5617831074035454
# 0.7416391898257183,0.7708690330477356,0.7559716720681791
# 0.6604781989909325,0.6626706756712769,0.6613348743877029

0.662357036300348,0.670188679245283,0.6662498436913843
0.577438370846731,0.5469543147208121,0.5617831074035454
0.7416391898257183,0.7708690330477356,0.7559716720681791
0.6604781989909325,0.6626706756712769,0.6613348743877029


In [51]:
# Linear SVC with scaling and without stopwords.

pipeline = make_pipeline(StandardScaler(with_mean = False), LinearSVC(C = 1.0, tol = 1e-3))
pipeline.fit(X_train_wos, y_train_wos)
y_pred = pipeline.predict(X_test_wos)
metrics_calculator(y_test_wos.values, y_pred_wos)

# 0.662357036300348,0.670188679245283,0.6662498436913843
# 0.577438370846731,0.5469543147208121,0.5617831074035454
# 0.7416391898257183,0.7708690330477356,0.7559716720681791
# 0.6604781989909325,0.6626706756712769,0.6613348743877029

0.662357036300348,0.670188679245283,0.6662498436913843
0.577438370846731,0.5469543147208121,0.5617831074035454
0.7416391898257183,0.7708690330477356,0.7559716720681791
0.6604781989909325,0.6626706756712769,0.6613348743877029


In [52]:
# Linear SVC without scaling and with stopwords.

lsvc = LinearSVC(C = 1.0, tol = 1e-5)
lsvc.fit(X_train, y_train)
y_pred = lsvc.predict(X_test)
metrics_calculator(y_test.values, y_pred)

0.7243842364532019,0.7325031133250312,0.7284210526315789
0.6320467242254951,0.6219390304847576,0.6269521410579346
0.811344327836082,0.8152146623148381,0.8132748904195367
0.7225917628382597,0.7232189353748756,0.7228826947030167


In [53]:
# Linear SVC with scaling and with stopwords.

pipeline = make_pipeline(StandardScaler(with_mean = False), LinearSVC(C = 1.0, tol = 1e-3))
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)
metrics_calculator(y_test.values, y_pred)

# 0.6495160468670402,0.635118306351183,0.6422364941443143
# 0.5422086202499362,0.5312343828085957,0.5366654045184905
# 0.713700939080183,0.7441626914386141,0.7286135693215339
# 0.6351418687323865,0.6368384601994643,0.6358384893281129

0.6495160468670402,0.635118306351183,0.6422364941443143
0.5422086202499362,0.5312343828085957,0.5366654045184905
0.713700939080183,0.7441626914386141,0.7286135693215339
0.6351418687323865,0.6368384601994643,0.6358384893281129


# Logistic Regression

In [54]:
from sklearn.linear_model import LogisticRegression

In [55]:
# Logistic Regression without stopwords.

logreg = LogisticRegression(C = 1.0, tol = 1e-3, max_iter = 1000)
logreg.fit(X_train_wos, y_train_wos)
y_pred_wos = logreg.predict(X_test_wos)
metrics_calculator(y_test_wos.values, y_pred_wos)

# 0.6908588648920141,0.6920754716981132,0.6914666331531985
# 0.5923927011051143,0.5850253807106599,0.5886859915719576
# 0.7685970438575236,0.7764993880048959,0.7725280077934729
# 0.6839495366182172,0.6845334134712231,0.6842268775062097

0.6908588648920141,0.6920754716981132,0.6914666331531985
0.5923927011051143,0.5850253807106599,0.5886859915719576
0.7685970438575236,0.7764993880048959,0.7725280077934729
0.6839495366182172,0.6845334134712231,0.6842268775062097


In [56]:
# Logistic Regression with stopwords.

logreg = LogisticRegression(C = 1.0, tol = 1e-3, max_iter = 1000)
logreg.fit(X_train, y_train)
y_pred = logreg.predict(X_test)
metrics_calculator(y_test.values, y_pred)

0.7475,0.7447073474470735,0.746101060511541
0.6444007858546169,0.655672163918041,0.6499876145652712
0.8294297352342159,0.8179763996987196,0.8236632536973835
0.740443507029611,0.7394519703546113,0.7399173095913986


# Naive Bayes

In [57]:
from sklearn.naive_bayes import MultinomialNB

In [58]:
# Naive Bayes without stopwords.

mnb = MultinomialNB(alpha = 1.0)
mnb.fit(X_train_wos, y_train_wos)
y_pred_wos = mnb.predict(X_test_wos)
metrics_calculator(y_test_wos.values, y_pred_wos)

# 0.6936582809224319,0.6659119496855346,0.6795019894750353
# 0.5726536445926632,0.6101522842639594,0.5908085524698944
# 0.7601605619668841,0.7417380660954712,0.7508363275926156
# 0.6754908291606597,0.672600766681655,0.6737156231791818

0.6936582809224319,0.6659119496855346,0.6795019894750353
0.5726536445926632,0.6101522842639594,0.5908085524698944
0.7601605619668841,0.7417380660954712,0.7508363275926156
0.6754908291606597,0.672600766681655,0.6737156231791818


In [59]:
# Naive Bayes with stopwords.

mnb = MultinomialNB(alpha = 1.0)
mnb.fit(X_train, y_train)
y_pred = mnb.predict(X_test)
metrics_calculator(y_test.values, y_pred)

0.7457237681899412,0.7275217932752179,0.7365103378719111
0.627000695894224,0.6754122938530734,0.6503067484662577
0.8374867444326617,0.7931207632437861,0.8147001934235976
0.736737069505609,0.7320182834573591,0.7338390932539222
