# Sentiment Analysis of Amazon Review using LSTM 

In [1]:
import pandas as pd    # to load dataset
import numpy as np     # for mathematic equation
from nltk.corpus import stopwords   # to get collection of stopwords
from sklearn.model_selection import train_test_split       # for splitting dataset
from tensorflow.keras.preprocessing.text import Tokenizer  # to encode text to int
from tensorflow.keras.preprocessing.sequence import pad_sequences   # to do padding or truncating
from tensorflow.keras.models import Sequential     # the model
from tensorflow.keras.layers import Embedding, LSTM, Dense # layers of the architecture
from tensorflow.keras.callbacks import ModelCheckpoint   # save model
from tensorflow.keras.models import load_model   # load saved model
import re

## Loading the data

In [2]:
df1 = pd.read_csv('C://Users//Ujjawal//Downloads//1429_1.csv')
df2 = pd.read_csv('C://Users//Ujjawal//Downloads//Datafiniti_Amazon_Consumer_Reviews_of_Amazon_Products_May19.csv')
df3 = pd.read_csv('C://Users//Ujjawal//Downloads//Datafiniti_Amazon_Consumer_Reviews_of_Amazon_Products.csv')

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [3]:
df1 = df1[['brand','reviews.rating','reviews.text','reviews.doRecommend']]
df2 = df2[['brand','reviews.rating','reviews.text','reviews.doRecommend']]
df3 = df3[['brand','reviews.rating','reviews.text','reviews.doRecommend']]

### Data pre-preprocessing

In [4]:
df1.head(3)

Unnamed: 0,brand,reviews.rating,reviews.text,reviews.doRecommend
0,Amazon,5.0,This product so far has not disappointed. My c...,True
1,Amazon,5.0,great for beginner or experienced person. Boug...,True
2,Amazon,5.0,Inexpensive tablet for him to use and learn on...,True


In [5]:
df2.head(3)

Unnamed: 0,brand,reviews.rating,reviews.text,reviews.doRecommend
0,Amazonbasics,3,I order 3 of them and one of the item is bad q...,
1,Amazonbasics,4,Bulk is always the less expensive way to go fo...,
2,Amazonbasics,5,Well they are not Duracell but for the price i...,


In [6]:
df3.head(3)

Unnamed: 0,brand,reviews.rating,reviews.text,reviews.doRecommend
0,Amazon,3,I thought it would be as big as small paper bu...,False
1,Amazon,5,This kindle is light and easy to use especiall...,True
2,Amazon,4,Didnt know how much i'd use a kindle so went f...,True


In [7]:
df2.shape #df1.shape = (34660,4) df2.shape = (28332,4) df3.shape = (5000,4)

(28332, 4)

In [8]:
df2.isnull().sum()

brand                      0
reviews.rating             0
reviews.text               0
reviews.doRecommend    12246
dtype: int64

In [9]:
df2 = df2.dropna()

In [10]:
df = pd.concat([df1,df2,df3])

In [11]:
df.shape

(55746, 4)

In [12]:
df.isnull().sum()

brand                    0
reviews.rating          33
reviews.text             1
reviews.doRecommend    594
dtype: int64

In [13]:
dataset = df.copy()
dataset=dataset[['reviews.text','reviews.rating']]
dataset.shape

(55746, 2)

In [14]:
dataset.head(3)

Unnamed: 0,reviews.text,reviews.rating
0,This product so far has not disappointed. My c...,5.0
1,great for beginner or experienced person. Boug...,5.0
2,Inexpensive tablet for him to use and learn on...,5.0


In [15]:
dataset['sentiment'] = dataset["reviews.rating"].apply(lambda score: "positive" if score > 3 else \
                                              ("negative" if score < 3 else "intermediate"))

In [16]:
dataset.head(3)

Unnamed: 0,reviews.text,reviews.rating,sentiment
0,This product so far has not disappointed. My c...,5.0,positive
1,great for beginner or experienced person. Boug...,5.0,positive
2,Inexpensive tablet for him to use and learn on...,5.0,positive


## Cleaning dataset

In [17]:
import nltk
nltk.download('stopwords')
english_stops = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Ujjawal\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [18]:
def load_dataset():
    data = dataset.copy()
    x_data = data['reviews.text']       # Reviews/Input
    y_data = data['sentiment']    # Sentiment/Output

    # PRE-PROCESS REVIEW
    x_data = x_data.replace({'<.*?>': ''}, regex = True)          # remove html tag
    x_data = x_data.replace({'[^A-Za-z]': ' '}, regex = True)     # remove non alphabet
    x_data = x_data.apply(lambda review: [w for w in str(review).split() if w not in english_stops])  # remove stop words
    x_data = x_data.apply(lambda review: [w.lower() for w in review])   # lower case
    
    # ENCODE SENTIMENT -> 0 & 1
    y_data = y_data.replace('positive', 1)
    y_data = y_data.replace('negative', 0)
    y_data = y_data.replace('intermediate', 2)

    return x_data, y_data

x_data, y_data = load_dataset()

print('Reviews')
print(x_data, '\n')
print('Sentiment')
print(y_data)

Reviews
0       [this, product, far, disappointed, my, childre...
1       [great, beginner, experienced, person, bought,...
2       [inexpensive, tablet, use, learn, step, nabi, ...
3       [i, fire, hd, two, weeks, i, love, this, table...
4       [i, bought, grand, daughter, comes, visit, i, ...
                              ...                        
4995      [this, great, tablet, price, amazon, good, job]
4996    [this, tablet, perfect, size, easy, use, read,...
4997    [purchased, son, has, room, upgrade, memory, a...
4998    [i, thoughts, getting, year, old, get, screen,...
4999                [steal, gb, model, well, this, punch]
Name: reviews.text, Length: 55746, dtype: object 

Sentiment
0       1
1       1
2       1
3       1
4       1
       ..
4995    1
4996    1
4997    1
4998    1
4999    1
Name: sentiment, Length: 55746, dtype: int64


## train/test split

In [19]:
x_train, x_test, y_train, y_test = train_test_split(x_data, y_data, test_size = 0.2)

In [20]:
x_train.shape

(44596,)

In [21]:
x_test.shape

(11150,)

In [22]:
x_train.head(3)

12200    [i, purchased, wall, charger, i, issues, samsu...
22482    [i, wish, little, smarter, right, box, most, q...
21545    [i, replaced, older, tablet, new, version, e, ...
Name: reviews.text, dtype: object

### Function for getting maximum length of review words for word embedding

In [23]:
def get_max_length():
    review_length = []
    for review in x_train:
        review_length.append(len(review))

    return int(np.ceil(np.mean(review_length)))

## Tokenizer  to encode reviews.text into integer

In [24]:
token = Tokenizer(lower=False)    
token.fit_on_texts(x_train)
x_train = token.texts_to_sequences(x_train)
x_test = token.texts_to_sequences(x_test)

In [25]:
#x_train --> to check the integer number for specific words in x_train

In [26]:
#x_test   --> to check the integer number for specific words in x_train 

## word Embedding / Padding

In [27]:
max_length = get_max_length()
x_train = pad_sequences(x_train, padding='post', truncating='post',maxlen=max_length)
x_test = pad_sequences(x_test, padding='post', truncating='post', maxlen=max_length)

In [28]:
max_length

18

In [29]:
total_words = len(token.word_index) + 1

## LSTM

In [48]:
EMBED_DIM = 32
LSTM_OUT = 64

model = Sequential()
model.add(Embedding(total_words, EMBED_DIM, input_length = max_length))
model.add(LSTM(LSTM_OUT))
model.add(Dense(1, activation='sigmoid'))
model.compile(optimizer = 'sgd', loss = 'CategoricalCrossentropy', metrics = ['accuracy'])

print(model.summary())

Model: "sequential_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_4 (Embedding)      (None, 18, 32)            416512    
_________________________________________________________________
lstm_4 (LSTM)                (None, 64)                24832     
_________________________________________________________________
dense_4 (Dense)              (None, 1)                 65        
Total params: 441,409
Trainable params: 441,409
Non-trainable params: 0
_________________________________________________________________
None


In [49]:
checkpoint = ModelCheckpoint(
    'models/LSTM.h5',
    monitor='accuracy',
    save_best_only=True,
    verbose=1
)

In [50]:
model.fit(x_train, y_train, batch_size = 128, epochs = 5, callbacks=[checkpoint])

Epoch 1/5

Epoch 00001: accuracy improved from -inf to 0.81747, saving model to models\LSTM.h5
Epoch 2/5

Epoch 00002: accuracy improved from 0.81747 to 0.93349, saving model to models\LSTM.h5
Epoch 3/5

Epoch 00003: accuracy did not improve from 0.93349
Epoch 4/5

Epoch 00004: accuracy did not improve from 0.93349
Epoch 5/5

Epoch 00005: accuracy did not improve from 0.93349


<tensorflow.python.keras.callbacks.History at 0x182d08e3df0>

In [51]:
y_pred = model.predict_classes(x_test, batch_size = 128)

true = 0
for i, y in enumerate(y_test):
    if y == y_pred[i]:
        true += 1

print('Correct Prediction: {}'.format(true))
print('Wrong Prediction: {}'.format(len(y_pred) - true))
print('Accuracy: {}'.format(true/len(y_pred)*100))


Correct Prediction: 10381
Wrong Prediction: 769
Accuracy: 93.10313901345292


In [56]:
EMBED_DIM = 32
LSTM_OUT = 64

model1 = Sequential()
model1.add(Embedding(total_words, EMBED_DIM, input_length = max_length))
model1.add(LSTM(LSTM_OUT))
model1.add(Dense(1, activation='sigmoid'))
model1.compile(optimizer = 'adam', loss = 'CategoricalCrossentropy', metrics = ['accuracy'])

print(model1.summary())

Model: "sequential_6"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_6 (Embedding)      (None, 18, 32)            416512    
_________________________________________________________________
lstm_6 (LSTM)                (None, 64)                24832     
_________________________________________________________________
dense_6 (Dense)              (None, 1)                 65        
Total params: 441,409
Trainable params: 441,409
Non-trainable params: 0
_________________________________________________________________
None


In [64]:
model1.fit(x_train, y_train, batch_size = 128, epochs = 5, callbacks=[checkpoint])

Epoch 1/5

Epoch 00001: accuracy improved from 0.92127 to 0.92428, saving model to models\LSTM.h5
Epoch 2/5

Epoch 00002: accuracy improved from 0.92428 to 0.93351, saving model to models\LSTM.h5
Epoch 3/5

Epoch 00003: accuracy did not improve from 0.93351
Epoch 4/5

Epoch 00004: accuracy did not improve from 0.93351
Epoch 5/5

Epoch 00005: accuracy did not improve from 0.93351


<tensorflow.python.keras.callbacks.History at 0x182d7384cd0>

In [66]:
EMBED_DIM = 32
LSTM_OUT = 64

model2 = Sequential()
model2.add(Embedding(total_words, EMBED_DIM, input_length = max_length))
model2.add(LSTM(LSTM_OUT))
model2.add(Dense(1, activation='sigmoid'))
model2.compile(optimizer = 'adadelta', loss = 'CategoricalCrossentropy', metrics = ['accuracy'])

print(model2.summary())

Model: "sequential_8"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_8 (Embedding)      (None, 18, 32)            416512    
_________________________________________________________________
lstm_8 (LSTM)                (None, 64)                24832     
_________________________________________________________________
dense_8 (Dense)              (None, 1)                 65        
Total params: 441,409
Trainable params: 441,409
Non-trainable params: 0
_________________________________________________________________
None


In [68]:
model2.fit(x_train, y_train, batch_size = 128, epochs = 5, callbacks=[checkpoint])

Epoch 1/5

Epoch 00001: accuracy improved from -inf to 0.79191, saving model to models\LSTM.h5
Epoch 2/5

Epoch 00002: accuracy improved from 0.79191 to 0.85725, saving model to models\LSTM.h5
Epoch 3/5

Epoch 00003: accuracy improved from 0.85725 to 0.88472, saving model to models\LSTM.h5
Epoch 4/5

Epoch 00004: accuracy improved from 0.88472 to 0.90605, saving model to models\LSTM.h5
Epoch 5/5

Epoch 00005: accuracy improved from 0.90605 to 0.91892, saving model to models\LSTM.h5


<tensorflow.python.keras.callbacks.History at 0x182dd4209d0>