# Prediction of Opinion Spam using LSTM

In [61]:
import warnings
warnings.filterwarnings("ignore") 

In [62]:
import tqdm
import tensorflow as tf
import numpy as np
import keras
import tensorflow.keras.metrics # for recall and precision metrics
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.layers import Embedding, LSTM, Dropout, Dense
from tensorflow.keras.models import Sequential
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.callbacks import ModelCheckpoint, TensorBoard
from sklearn.model_selection import train_test_split
import time
import numpy as np
import pickle

In [63]:
SEQUENCE_LENGTH = 1042 # the length of all sequences (number of words per sample)
EMBEDDING_SIZE = 100  # Using 100-Dimensional GloVe embedding vectors
TEST_SIZE = 0.25 # ratio of testing set

BATCH_SIZE = 64
EPOCHS =10 # number of epochs

# to convert labels to integers and vice-versa
label2int = {"Buyable":0,'Not Buyable':1}
int2label = {0:"Buyable",1:"Not Buyable"}

In [64]:
import pandas as pd

In [65]:
data=pd.read_csv("review final.csv")

In [66]:
data

Unnamed: 0,Rating,Review,category
0,2,Cheap product with high cost,toys
1,5,Very gripping and well thought out a taut plot...,books
2,1,not cool,mobile
3,3,This toy is for 1+ year old babies... Overall ...,toys
4,1,I hate this book,books
...,...,...,...
2338,5,I love it,fashion
2339,4,Good,toys
2340,5,I am satistied,fashion
2341,2,Not worth to price,toys


In [67]:
data=data.dropna()

In [68]:
data['Review'].unique()

array(['Cheap product with high cost',
       'Very gripping and well thought out a taut plot-line ! Hats off to the writer . Pls write more such books !',
       'not cool ', ..., 'Timer function doesn t work',
       'Strudy product. Good quality and colors.',
       'it is very difficult to make tea or boil milk. the temperature at the lowest setting ie at 60 or 100 watts is even so high that the fluid over flows even if the utensil is not covered. even in the keep worm  option the heat is so high that the content in the utensil boils and over flows. i have experienced the same problem in the PIC 15.0 model and returned the same to the supplier purchased through Amazon. I had faith on Prestige brand as i had been using its model PIC 1.0 for the last 7 years with out any problem. this PIC 20 model sleeps on the kitchen surface (on tile or granite) because it sits on plastic legs. The manufacturer have designed the back plate to rest on plastic points instead rubber.thus very disappoi

In [69]:
x=data['Review']

In [70]:
x

0                            Cheap product with high cost
1       Very gripping and well thought out a taut plot...
2                                               not cool 
3       This toy is for 1+ year old babies... Overall ...
4                                        I hate this book
                              ...                        
2338                                            I love it
2339                                                 Good
2340                                       I am satistied
2341                                   Not worth to price
2342                                                  bad
Name: Review, Length: 2326, dtype: object

In [71]:
x=list(x)

In [72]:
x

['Cheap product with high cost',
 'Very gripping and well thought out a taut plot-line ! Hats off to the writer . Pls write more such books !',
 'not cool ',
 'This toy is for 1+ year old babies... Overall the product is good ..',
 'I hate this book',
 'careless staff',
 'The toy is not good its too bad again the price is high.the toy has so much sharp edges n the smily cap is so loose.dont waste money on it',
 'Highly disappointed ????',
 "Really good device with great looks and screen. camera is quality is really bad, do not trust online reviews. Can't game on this.Buy for light use and older generation people.",
 'For enhancing knowledge',
 'I just wrote how good the product...but then.. I wanted to use it again  first for 2 minutes it worked and then suddenly it stopped. First it looked like power cut. But there was no power cut..the product was dead.....2 times it worked... Then finished... after one hour I tried again  as I tried all the outlets... Nothing I could do could change

In [73]:
y=data['Rating']

In [74]:
y.head()

0    2
1    5
2    1
3    3
4    1
Name: Rating, dtype: int64

In [75]:
y = y.map({5.0:"Buyable", 4.0:"Buyable", 3.0:"Buyable", 2.0:"Not Buyable", 1.0:"Not Buyable",})

In [76]:
y.head()

0    Not Buyable
1        Buyable
2    Not Buyable
3        Buyable
4    Not Buyable
Name: Rating, dtype: object

In [77]:
y=list(y)

In [78]:
# Text tokenization
# vectorizing text, turning each text into sequence of integers
tokenizer = Tokenizer()
tokenizer.fit_on_texts(x)
# convert to sequence of integers
x = tokenizer.texts_to_sequences(x)

In [79]:
print(x[0])

[126, 7, 17, 89, 420]


In [80]:
# convert to numpy arrays
x = np.array(x)
y = np.array(y)
# pad sequences at the beginning of each sequence with 0's
# for example if SEQUENCE_LENGTH=4:
# [[5, 3, 2], [5, 1, 2, 3], [3, 4]]
# will be transformed to:
# [[0, 5, 3, 2], [5, 1, 2, 3], [0, 0, 3, 4]]
x = pad_sequences(x, maxlen=SEQUENCE_LENGTH)

In [81]:
# One Hot encoding labels
# [spam, ham, spam, ham, ham] will be converted to:
# [1, 0, 1, 0, 1] and then to:
# [[0, 1], [1, 0], [0, 1], [1, 0], [0, 1]]

y = [label2int[label] for label in y]
y = to_categorical(y)

In [82]:
 print(y[0])

[0. 1.]


In [83]:
# split and shuffle
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=TEST_SIZE, random_state=7)

In [84]:
def get_embedding_vectors(tokenizer, dim=100):
    embedding_index = {}
    with open(f"glove.6B.{dim}d.txt", encoding='utf8') as f:
        for line in tqdm.tqdm(f, "Reading GloVe"):
            values = line.split()
            word = values[0]
            vectors = np.asarray(values[1:], dtype='float32')
            embedding_index[word] = vectors

    word_index = tokenizer.word_index
    embedding_matrix = np.zeros((len(word_index)+1, dim))
    for word, i in word_index.items():
        embedding_vector = embedding_index.get(word)
        if embedding_vector is not None:
            # words not found will be 0s
            embedding_matrix[i] = embedding_vector
            
    return embedding_matrix

In [85]:
def get_model(tokenizer, lstm_units):
    """
    Constructs the model,
    Embedding vectors => LSTM => 2 output Fully-Connected neurons with softmax activation
    """
    # get the GloVe embedding vectors
    embedding_matrix = get_embedding_vectors(tokenizer)
    model = Sequential()
    model.add(Embedding(len(tokenizer.word_index)+1,
              EMBEDDING_SIZE,
              input_length=SEQUENCE_LENGTH))

    model.add(LSTM(lstm_units, recurrent_dropout=0.2))
    model.add(Dropout(0.3))
    model.add(Dense(2, activation="softmax"))
    # compile as rmsprop optimizer
    # aswell as with recall metric
    model.compile(optimizer="rmsprop", loss="categorical_crossentropy",
                  metrics=["accuracy", tensorflow.keras.metrics.Precision(), tensorflow.keras.metrics.Recall()])
    model.summary()
    return model

In [86]:
# constructs the model with 128 LSTM units
model = get_model(tokenizer=tokenizer, lstm_units=128)

Reading GloVe: 400000it [00:19, 20598.10it/s]


Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 1042, 100)         345200    
_________________________________________________________________
lstm_1 (LSTM)                (None, 128)               117248    
_________________________________________________________________
dropout_1 (Dropout)          (None, 128)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 2)                 258       
Total params: 462,706
Trainable params: 462,706
Non-trainable params: 0
_________________________________________________________________


In [87]:
X_train

array([[  0,   0,   0, ...,   0, 205,  30],
       [  0,   0,   0, ...,  13,   1,   7],
       [  0,   0,   0, ..., 185, 154, 207],
       ...,
       [  0,   0,   0, ...,   4,   8, 123],
       [  0,   0,   0, ..., 143,   6, 140],
       [  0,   0,   0, ...,   0,   0, 205]])

In [88]:
X_test

array([[   0,    0,    0, ..., 2485, 1189,  143],
       [   0,    0,    0, ...,    0,    0,    9],
       [   0,    0,    0, ...,    0,  152,    7],
       ...,
       [   0,    0,    0, ...,    0,    0,   35],
       [   0,    0,    0, ...,   12,    7, 2072],
       [   0,    0,    0, ...,  714,   59,   54]])

In [89]:
y_train

array([[1., 0.],
       [0., 1.],
       [0., 1.],
       ...,
       [0., 1.],
       [1., 0.],
       [0., 1.]], dtype=float32)

In [90]:
y_test

array([[1., 0.],
       [0., 1.],
       [0., 1.],
       ...,
       [1., 0.],
       [1., 0.],
       [0., 1.]], dtype=float32)

In [91]:
# print our data shapes
print("X_train.shape:", X_train.shape)
print("X_test.shape:", X_test.shape)
print("y_train.shape:", y_train.shape)
print("y_test.shape:", y_test.shape)

X_train.shape: (1744, 1042)
X_test.shape: (582, 1042)
y_train.shape: (1744, 2)
y_test.shape: (582, 2)


In [92]:
# train the model
model.fit(X_train, y_train, validation_data=(X_test, y_test),
          batch_size=BATCH_SIZE, epochs=EPOCHS,verbose=1)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x1ea2af5db50>

In [93]:
import h5py

In [94]:
model.save("LSTM1.h5")

In [95]:
from tensorflow.keras.models import load_model

In [96]:
lstm = load_model("LSTM1.h5")

In [97]:
# get the loss and metrics
result = model.evaluate(X_test, y_test)
# extract those
loss = result[0]
accuracy = result[1]
precision = result[2]
recall = result[3]



In [98]:
print(f"[+] Accuracy: {accuracy*100:.2f}%")
print(f"[+] Precision: {precision*100:.2f}%")
print(f"[+] Recall:   {recall*100:.2f}%")

[+] Accuracy: 91.41%
[+] Precision: 91.41%
[+] Recall:   91.41%


In [105]:
def get_predictions(text):
    sequence = tokenizer.texts_to_sequences([text])
    # pad the sequence
    sequence = pad_sequences(sequence, maxlen=SEQUENCE_LENGTH)
    # get the prediction
    prediction = lstm.predict(sequence)[0]
    # one-hot encoded vector, revert using np.argmax
    return int2label[np.argmax(prediction)]

In [100]:
data["Review"][74]

'Poor Quality'

In [106]:
text = str(input("Enter the review : "))

Enter the review : very bad product


In [107]:
#text = "We stayed for a one night getaway with family on a thursday. Triple AAA "
print(get_predictions(text))

[0.02358653 0.9764134 ]
Not Buyable
