# Review Sentiment Analysis for Product Analysis

In [1]:
import re
import string

import nltk
import numpy as np
import pandas as pd
import tensorflow as tf

from keras import layers
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

## Dataset

The dataset used is 'Flipkart Product reviews with sentiment Dataset'. It is publicly available to download at URL : https://www.kaggle.com/datasets/niraliivaghani/flipkart-product-customer-reviews-dataset


In our case, the database is already present at the project's github repository inside 'data' folder.

In [3]:
df = pd.read_csv("https://raw.githubusercontent.com/waleashish/online-product-review-classification/main/data/Dataset-SA.csv",
                 low_memory=False)

The online dataset contains many columns such as 'Product Name', 'Product Price', etc. We only need 'Sentiment' and 'Summary' for our modelling.

In [4]:
dataset = df[['Summary', 'Sentiment']]

In [5]:
# This block contains all the helper functions

"""
    This function performs clearing process on the input text.
    Arguments:
      text : sentence to be processed
    Returns:
      The clean sentence
"""
def process_sentence(text):
    stemmer = PorterStemmer()
    stopwords_english = stopwords.words('english')
    # remove unwanted text patterns (hyperlinks, hashtags, etc.)
    text = re.sub(r'\$\w*', '', str(text))
    # remove old style retweet text "RT"
    text = re.sub(r'^RT[\s]+', '', str(text))
    # remove hyperlinks
    text = re.sub(r'https?:\/\/.*[\r\n]*', '', str(text))
    # remove hashtags
    # only removing the hash # sign from the word
    text = re.sub(r'#', '', str(text))
    # tokenize tweets
    text_tokens = word_tokenize(text)
    # make all word tokens lower case
    text_tokens = [token.lower() for token in text_tokens]
    # remove punctuation
    text_clean = ""
    for word in text_tokens:
        if word not in stopwords_english and word not in string.punctuation:
            text_clean += stemmer.stem(word) + " "

    return text_clean[:-1]

> Our 'Sentiment' column has sentiments as strings. We need to encode them into integers. We'll use sklearn's `LabelEncoder` for this purpose.

> Our 'Summary' column, which contains the reviews, has original reviews. We need to preprocess them to remove punctuations, stopwords, etc.

In [6]:
le = LabelEncoder()
# Encode the sentiments as integers
dataset['label'] = le.fit_transform(df['Sentiment'])
# Pre process the reviews
dataset['Summary'] = dataset['Summary'].apply(lambda x : process_sentence(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataset['label'] = le.fit_transform(df['Sentiment'])


Create the training, testing and validation datasets.

In [50]:
# Create training and testing data
train_sentences, temp_test_sentences, train_labels, temp_test_labels = train_test_split(dataset['Summary'],
                                                                                        dataset['label'],
                                                                                        test_size=0.2)

# Create validation and testing data
valid_sentences, test_sentences, valid_labels, test_labels = train_test_split(temp_test_sentences,
                                                                              temp_test_labels,
                                                                              test_size=0.5)

## Model

We are creating following model architecture:


1.   Text Vectorization Layer
2.   Embedding Layer
3.   LSTM Layers
4.   Dense Layers
5    Output Dense Layer


Our compilation parameters:

1.   Optimizer: Adam
2.   Loss: Sparse Categorical Cross Entropy
3.   Metric: Accuracy


Our Fitting parameters:

1.   Epochs: 10



In [40]:
# Declare variables
MAX_VOCAB_SIZE = 10000
MAXLEN = 25

# Create text vectorization layers
text_vectorizer = layers.TextVectorization(max_tokens=MAX_VOCAB_SIZE,
                                           output_mode='int',
                                           output_sequence_length=MAXLEN)
text_vectorizer.adapt(train_sentences)
# Create embedding layer
embedding = layers.Embedding(input_dim=MAX_VOCAB_SIZE,
                             output_dim=128,
                             input_length=MAXLEN)

In [56]:
# Create model
inputs = layers.Input((1, ), dtype='string')
x = text_vectorizer(inputs)
x = embedding(x)
x = layers.LSTM(64)(x)
x =
outputs = layers.Dense(3, activation='softmax')(x)
model = tf.keras.Model(inputs, outputs, name="review_sentiment_analysis")

# Compile the model
model.compile(loss=tf.keras.losses.SparseCategoricalCrossentropy(),
              optimizer=tf.keras.optimizers.Adam(),
              metrics=['accuracy'])

# Fit the model
history = model.fit(x=train_sentences,
                    y=train_labels,
                    epochs=10,
                    validation_data=(valid_sentences, valid_labels))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


## Testing evaluation

In [57]:
model.evaluate(test_sentences, test_labels)



[0.36045315861701965, 0.9110016822814941]

## Prediction on new data

In [77]:
def predict_sentiment(model, sentences):
    # Get prediction probabilities
    pred_probs = model.predict(sentences)

    # Get the index of maximum probability, that is our label
    label = np.argmax(pred_probs, axis=-1)

    # Convert the label back to sentiment and return it
    return list(le.inverse_transform(label))

In [78]:
sent = [
    "Wow!!! Its the very first word that i said as i put on this headset. Amazing Product, and Delivery time was also great.",
    "An amazing experience. Never seen never dreamt experiences. A worthy entertainer. Cost could have been more affordable.",
    "It was a pretty decent set up & fun to play games on as well.",
    "Item is defective. Contacted the customer service number and email provided by the Amazon customer service.",
    "Please take back product because of not working properly."
    ]

predict_sentiment(model, sent)



['positive', 'neutral', 'positive', 'negative', 'negative']