In [22]:
import numpy as np
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

from sklearn.model_selection import train_test_split
import nltk
from nltk.corpus import stopwords

# Load Dataset

In [2]:
df = pd.read_csv("Reviews.csv",index_col=0)

In [3]:
df.head()

Unnamed: 0_level_0,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,5,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...
2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,1,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...
3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,4,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...
4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,2,1307923200,Cough Medicine,If you are looking for the secret ingredient i...
5,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,5,1350777600,Great taffy,Great taffy at a great price. There was a wid...


In [4]:
df["review"] = df["Score"].apply(lambda x: 0 if x<4 else 1)
df = df[["Text","review"]]

In [29]:
stop_words = stopwords.words('english')

def filter_stop_words(text):
    text = text.split()
    text = " ".join([word for word in text if word.lower().strip() not in stop_words])
    return text

In [33]:
df["Text"] = df["Text"].apply(lambda x: filter_stop_words(x))
df

Unnamed: 0_level_0,Text,review
Id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,bought several Vitality canned dog food produc...,1
2,Product arrived labeled Jumbo Salted Peanuts.....,0
3,"confection around centuries. light, pillowy ci...",1
4,looking secret ingredient Robitussin believe f...,0
5,Great taffy great price. wide assortment yummy...,1
...,...,...
568450,Great sesame chicken..this good better restura...,1
568451,I'm disappointed flavor. chocolate notes espec...,0
568452,"stars small, give 10-15 one training session. ...",1
568453,BEST treats training rewarding dog good groomi...,1


In [6]:
df["review"].value_counts() # 1 for positive and 0 for negative

1    443777
0    124677
Name: review, dtype: int64

The dataset has almost 4 times the positive reviews compared to negative reviews. To counter this we can sample only a part of the positive reviews

In [7]:
positive_reviews = df[df.review == 1]
negative_reviews = df[df.review == 0]

positive_reviews = positive_reviews.sample(n=len(negative_reviews))

reviews = positive_reviews.append(negative_reviews).reset_index(drop=True)

In [8]:
reviews
# The data can be shuffled later when calling model.fit()

Unnamed: 0,Text,review
0,This is a good strong coffee - I do use 2 K-Cu...,1
1,These barley teething biscuits are very simila...,1
2,I chewed this & chicklets when I was a kid. I ...,1
3,"This is THE best tea that I have ever had, and...",1
4,Tellicherry Pepper at this price is remarkable...,1
...,...,...
249349,I just bought this soup today at my local groc...,0
249350,This soup is mostly broth. Although it has a k...,0
249351,"It is mostly broth, with the advertised 3/4 cu...",0
249352,I had ordered some of these a few months back ...,0


In [9]:
sentences = reviews["Text"].values
labels = reviews["review"].values

In [10]:
# Define training and testing sets
train_sentences,test_sentences,train_labels,test_labels = train_test_split(sentences,labels,test_size=0.3,shuffle=True)

# Tokenization

In [11]:
embed_dim = 64 # dimension of the embedding layer
trunc_type='post'
padding_type='post'
oov_tok = "<OOV>"

In [12]:
tokenizer = Tokenizer(oov_token=oov_tok) #Define tokenizer
tokenizer.fit_on_texts(train_sentences) # Assign tokens based on words on training set

vocab_size = len(tokenizer.word_index) + 1 # add 1 to account for the '0' used to pad sequences

train_sequences = tokenizer.texts_to_sequences(train_sentences) # Create sequences based on tokens for the training set

max_length = np.max(list(map(lambda x: len(x), train_sequences))) # Extract the length of the longest sentence in the train set

train_padded = pad_sequences(train_sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type) # pad/truncate zeros at the end for a length of 'max_length' 


test_sequences = tokenizer.texts_to_sequences(test_sentences) # similar preprocessing for test set
test_padded = pad_sequences(test_sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)

In [14]:
print(vocab_size)
print(max_length)

79484
3507


# Define Model

In [16]:
tf.keras.backend.clear_session()
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size, embed_dim, input_length=max_length),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64,return_sequences=True)),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(32)),
    tf.keras.layers.Dense(16, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')
])
optimizer = tf.keras.optimizers.Adam()
model.compile(loss='binary_crossentropy',optimizer=optimizer,metrics=['accuracy'])
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 3507, 64)          5059584   
_________________________________________________________________
bidirectional (Bidirectional (None, 3507, 128)         66048     
_________________________________________________________________
bidirectional_1 (Bidirection (None, 64)                41216     
_________________________________________________________________
dense (Dense)                (None, 16)                1040      
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 17        
Total params: 5,167,905
Trainable params: 5,167,905
Non-trainable params: 0
_________________________________________________________________


In [17]:
history = model.fit(train_padded, train_labels, epochs=10, validation_data=(test_padded, test_labels),batch_size=64)

Epoch 1/10
   3/2728 [..............................] - ETA: 13:48 - loss: 0.6952 - accuracy: 0.4271

CancelledError:  [_Derived_]RecvAsync is cancelled.
	 [[{{node Adam/Adam/update/AssignSubVariableOp/_41}}]]
	 [[gradient_tape/sequential/embedding/embedding_lookup/Reshape/_38]] [Op:__inference_train_function_10782]

Function call stack:
train_function
