# Sentiment Analysis and Quality Metrics in Amazon Product Reviews

## Importing Necessary Libraries:

In [13]:
# Data manipulation
import pandas as pd
import numpy as np


In [16]:
# Data Visualization
import matplotlib.pyplot as plt
import seaborn as sns

In [18]:
# Text processing 
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

## Loading the Dataset

In [19]:
# Read the raw reviews; the relative path means Reviews.csv must sit in the notebook's working directory
df = pd.read_csv("Reviews.csv")

In [20]:
# Preview the first five rows of the raw frame
df.head()

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,5,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,1,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,4,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...
3,4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,2,1307923200,Cough Medicine,If you are looking for the secret ingredient i...
4,5,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,5,1350777600,Great taffy,Great taffy at a great price. There was a wid...


In [27]:
# (number of rows, number of columns) of the raw frame
df.shape

(568454, 10)

In [28]:
# Column labels available in the raw frame
df.columns

Index(['Id', 'ProductId', 'UserId', 'ProfileName', 'HelpfulnessNumerator',
       'HelpfulnessDenominator', 'Score', 'Time', 'Summary', 'Text'],
      dtype='object')

In [22]:
# Per-column dtype and non-null count, plus overall memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 568454 entries, 0 to 568453
Data columns (total 10 columns):
 #   Column                  Non-Null Count   Dtype 
---  ------                  --------------   ----- 
 0   Id                      568454 non-null  int64 
 1   ProductId               568454 non-null  object
 2   UserId                  568454 non-null  object
 3   ProfileName             568428 non-null  object
 4   HelpfulnessNumerator    568454 non-null  int64 
 5   HelpfulnessDenominator  568454 non-null  int64 
 6   Score                   568454 non-null  int64 
 7   Time                    568454 non-null  int64 
 8   Summary                 568427 non-null  object
 9   Text                    568454 non-null  object
dtypes: int64(5), object(5)
memory usage: 43.4+ MB


In [23]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns
df.describe()

Unnamed: 0,Id,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time
count,568454.0,568454.0,568454.0,568454.0,568454.0
mean,284227.5,1.743817,2.22881,4.183199,1296257000.0
std,164098.679298,7.636513,8.28974,1.310436,48043310.0
min,1.0,0.0,0.0,1.0,939340800.0
25%,142114.25,0.0,0.0,4.0,1271290000.0
50%,284227.5,0.0,1.0,5.0,1311120000.0
75%,426340.75,2.0,2.0,5.0,1332720000.0
max,568454.0,866.0,923.0,5.0,1351210000.0


## Data Cleaning

### Checking for missing values

In [32]:
# Count the missing entries in every column of the raw frame
print(df.isna().sum())

Id                         0
ProductId                  0
UserId                     0
ProfileName               26
HelpfulnessNumerator       0
HelpfulnessDenominator     0
Score                      0
Time                       0
Summary                   27
Text                       0
dtype: int64


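Only a few values are missing: 26 in the `ProfileName` column and 27 in the `Summary` column. Because these rows are a tiny fraction of the 568,454 reviews, we can drop them without materially affecting the analysis.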

In [35]:

# Remove every review that lacks a profile name or a summary
df_cleaned = df.dropna(axis=0, how='any', subset=['ProfileName', 'Summary'])


### Checking for Duplicates

In [36]:
# keep=False flags every member of a duplicated (UserId, ProductId, Text) group,
# so this count includes the first occurrence of each group as well as its repeats.
duplicates = df_cleaned.loc[
    df_cleaned.duplicated(subset=['UserId', 'ProductId', 'Text'], keep=False)
]
print(f"Number of duplicate rows: {duplicates.shape[0]}")


Number of duplicate rows: 2122


In [37]:
# Keep the first review of each (UserId, ProductId, Text) group and discard the repeats
df_cleaned = df_cleaned.drop_duplicates(
    subset=['UserId', 'ProductId', 'Text'],
    keep='first',
)


In [38]:
# Confirm the cleaning steps took effect: new shape and remaining nulls per column
print("Shape after cleaning:", df_cleaned.shape)
print("Missing values after cleaning:\n", df_cleaned.isna().sum())

Shape after cleaning: (567092, 10)
Missing values after cleaning:
 Id                        0
ProductId                 0
UserId                    0
ProfileName               0
HelpfulnessNumerator      0
HelpfulnessDenominator    0
Score                     0
Time                      0
Summary                   0
Text                      0
dtype: int64


## Data Exploration

In [40]:
# Shape of the cleaned dataframe as (rows, columns), after the missing-value and duplicate drops
print("Shape of the cleaned dataframe:", df_cleaned.shape)

Shape of the cleaned dataframe: (567092, 10)


In [42]:
# Displaying the first few rows of the cleaned frame (head() defaults to five)
df_cleaned.head()

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,5,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,1,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,4,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...
3,4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,2,1307923200,Cough Medicine,If you are looking for the secret ingredient i...
4,5,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,5,1350777600,Great taffy,Great taffy at a great price. There was a wid...


In [43]:
# Summary of the cleaned dataset: per-column dtype, non-null count and memory usage
df_cleaned.info()

<class 'pandas.core.frame.DataFrame'>
Index: 567092 entries, 0 to 568453
Data columns (total 10 columns):
 #   Column                  Non-Null Count   Dtype 
---  ------                  --------------   ----- 
 0   Id                      567092 non-null  int64 
 1   ProductId               567092 non-null  object
 2   UserId                  567092 non-null  object
 3   ProfileName             567092 non-null  object
 4   HelpfulnessNumerator    567092 non-null  int64 
 5   HelpfulnessDenominator  567092 non-null  int64 
 6   Score                   567092 non-null  int64 
 7   Time                    567092 non-null  int64 
 8   Summary                 567092 non-null  object
 9   Text                    567092 non-null  object
dtypes: int64(5), object(5)
memory usage: 47.6+ MB


In [64]:
# Exploring unique values in the Score column.
# unique() returns values in order of first appearance, so the result is not sorted.
unique_scores = df_cleaned['Score'].unique()
print("Unique Scores:", unique_scores)

Unique Scores: [5 1 4 2 3]


### There are 5 unique scores/ratings: 1, 2, 3, 4, and 5.

In [46]:
# Checking the distribution of scores.
# value_counts() orders the result by frequency (most common first), not by score value.
score_distribution = df_cleaned['Score'].value_counts()
print("Score Distribution:\n", score_distribution)

Score Distribution:
 Score
5    362299
4     80549
1     51961
3     42556
2     29727
Name: count, dtype: int64


### The ratings are heavily imbalanced: 5-star reviews dominate (362,299 of 567,092, about 64%), followed by 4 (80,549), 1 (51,961), 3 (42,556), and 2 (29,727).

In [47]:
# Display a sample of distinct values from the Summary and Text columns
print("Unique Summaries:\n", df_cleaned['Summary'].unique()[:10])  # first 10 distinct summaries, in order of appearance
print("Unique Texts:\n", df_cleaned['Text'].unique()[:10])        # first 10 distinct review texts, in order of appearance


Unique Summaries:
 ['Good Quality Dog Food' 'Not as Advertised' '"Delight" says it all'
 'Cough Medicine' 'Great taffy' 'Nice Taffy'
 'Great!  Just as good as the expensive brands!' 'Wonderful, tasty taffy'
 'Yay Barley' 'Healthy Dog Food']
Unique Texts:
 ['I have bought several of the Vitality canned dog food products and have found them all to be of good quality. The product looks more like a stew than a processed meat and it smells better. My Labrador is finicky and she appreciates this product better than  most.'
 'Product arrived labeled as Jumbo Salted Peanuts...the peanuts were actually small sized unsalted. Not sure if this was an error or if the vendor intended to represent the product as "Jumbo".'
 'This is a confection that has been around a few centuries.  It is a light, pillowy citrus gelatin with nuts - in this case Filberts. And it is cut into tiny squares and then liberally coated with powdered sugar.  And it is a tiny mouthful of heaven.  Not too chewy, and very flavor

### Importing NLTK libraries

In [48]:
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Download necessary NLTK resources.
# 'punkt_tab' is the tokenizer data that nltk.word_tokenize loads on recent NLTK
# releases (it replaced the pickled 'punkt' models); 'punkt' is still fetched so
# older releases keep working. nltk.download() skips packages already up to date.
for resource in ('stopwords', 'punkt', 'punkt_tab', 'wordnet'):
    nltk.download(resource)


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\yeshw\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\yeshw\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\yeshw\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

## Data Preprocessing

In [49]:
# Initializing lemmatizer and stopwords.
# Both are created once here and read as globals by clean_text;
# the stop words are held in a set for the per-token membership test.
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))


In [50]:
# Matches any single punctuation character or digit; clean_text replaces each
# match with a space. Compiled once here instead of rebuilding the pattern
# string on every call.
_PUNCT_DIGIT_RE = re.compile(f"[{re.escape(string.punctuation)}0-9]")


# Defining function to clean the text
def clean_text(text):
    """Normalize one review into a space-joined string of lemmatized tokens.

    Steps, in order: lowercase, replace punctuation and digits with spaces,
    tokenize with ``nltk.word_tokenize``, drop tokens present in the global
    ``stop_words`` set, and lemmatize what remains with the global
    ``lemmatizer``.

    Parameters
    ----------
    text : str
        Raw review text. Any non-string value (for example ``None`` or the
        float NaN that pandas uses for a missing cell) is treated as an
        empty document.

    Returns
    -------
    str
        The cleaned tokens joined by single spaces; ``""`` when no token
        survives or when ``text`` is not a string.
    """
    # Missing cells reach Series.apply() as NaN/None, which have no .lower();
    # return an empty document rather than raising AttributeError.
    if not isinstance(text, str):
        return ""
    # Lowercase
    text = text.lower()
    # Remove punctuation and numbers (each becomes a space so words stay separated).
    # NOTE(review): because punctuation is replaced rather than parsed, markup such
    # as "<br />" would leave its tag name behind as a token -- check whether the
    # reviews contain HTML and strip tags before this step if they do.
    text = _PUNCT_DIGIT_RE.sub(" ", text)
    # Tokenize
    words = nltk.word_tokenize(text)
    # Remove stopwords and lemmatize
    words = [lemmatizer.lemmatize(word) for word in words if word not in stop_words]
    # Join cleaned words back to string
    return ' '.join(words)


In [51]:
# Apply the cleaning function to the 'Text' column.
# clean_text runs once per row, so this cell tokenizes and lemmatizes every review.
df_cleaned['Cleaned_Text'] = df_cleaned['Text'].apply(clean_text)

In [52]:
# Review length feature: number of whitespace-separated tokens left after cleaning
df_cleaned['Review_Length'] = df_cleaned['Cleaned_Text'].str.split().str.len()

In [53]:
# Compare raw text, cleaned text and token count for the first five reviews
print(df_cleaned.head()[['Text', 'Cleaned_Text', 'Review_Length']])

                                                Text  \
0  I have bought several of the Vitality canned d...   
1  Product arrived labeled as Jumbo Salted Peanut...   
2  This is a confection that has been around a fe...   
3  If you are looking for the secret ingredient i...   
4  Great taffy at a great price.  There was a wid...   

                                        Cleaned_Text  Review_Length  
0  bought several vitality canned dog food produc...             23  
1  product arrived labeled jumbo salted peanut pe...             18  
2  confection around century light pillowy citrus...             40  
3  looking secret ingredient robitussin believe f...             18  
4  great taffy great price wide assortment yummy ...             13  
