# Import libraries

In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras import regularizers

from sklearn.model_selection import train_test_split
import nltk
from nltk.corpus import stopwords
import re
from bs4 import BeautifulSoup 


# Load Dataset

In [None]:
# Load the raw reviews table; the first CSV column becomes the row index.
reviews_csv = "Reviews.csv"
df = pd.read_csv(reviews_csv, index_col=0)

In [None]:
# Preview the first five rows of the loaded data
df.head(n=5)

Unnamed: 0_level_0,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,5,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...
2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,1,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...
3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,4,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...
4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,2,1307923200,Cough Medicine,If you are looking for the secret ingredient i...
5,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,5,1350777600,Great taffy,Great taffy at a great price. There was a wid...


In [None]:
# Derive a binary sentiment label from the star rating:
# scores below 4 become 0 (negative), everything else becomes 1 (positive).
df["review"] = df["Score"].apply(lambda score: 0 if score < 4 else 1)

# Retain only the columns to be used for training. The explicit .copy() makes
# the result an independent frame, so the later assignment to df["Text"] does
# not trigger pandas' SettingWithCopyWarning.
df = df[["Text", "review"]].copy()

# Preprocessing

We replace every run of digits with a space using a regular expression. Stripping HTML tags with BeautifulSoup is not currently applied.

In [None]:
def process(text):
    """Return `text` with every run of digits replaced by a single space.

    HTML tags are left untouched: the BeautifulSoup-based stripping step
    below is currently disabled.
    """
    digits_removed = re.sub(r'\d+', ' ', text)
    #digits_removed = BeautifulSoup(digits_removed).text
    return digits_removed

In [None]:
# Run the cleaning function over every review text, then display the frame
df["Text"] = df["Text"].apply(process)
df

Unnamed: 0_level_0,Text,review
Id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,I have bought several of the Vitality canned d...,1
2,Product arrived labeled as Jumbo Salted Peanut...,0
3,This is a confection that has been around a fe...,1
4,If you are looking for the secret ingredient i...,0
5,Great taffy at a great price. There was a wid...,1
...,...,...
568450,Great for sesame chicken..this is a good if no...,1
568451,I'm disappointed with the flavor. The chocolat...,0
568452,"These stars are small, so you can give - of ...",1
568453,These are the BEST treats for training and rew...,1


### Balancing Training data

The dataset has roughly 3.5 times as many positive reviews as negative reviews (see the counts below). To counter this imbalance, we randomly sample only as many positive reviews as there are negative reviews.

In [None]:
# Class balance check: 1 for positive and 0 for negative
df.review.value_counts()

1    443777
0    124677
Name: review, dtype: int64

In [None]:
# Split the data by class label (1 = positive, 0 = negative).
positive_reviews = df[df.review == 1]
negative_reviews = df[df.review == 0]

# Undersample the positive class so that both classes have the same number of rows.
# A fixed random_state makes the sample reproducible across kernel restarts.
positive_reviews = positive_reviews.sample(n=len(negative_reviews), random_state=42)

# DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0;
# pd.concat builds the same stacked frame (positives first, then negatives).
df = pd.concat([positive_reviews, negative_reviews]).reset_index(drop=True)

In [None]:
df
# The rows are still grouped by class here (all positive rows first, then the negative rows);
# they get shuffled when train_test_split is called with shuffle=True.

Unnamed: 0,Text,review
0,My wife is picky about her coffee. I myself ca...,1
1,Cannot tell it from the syrup you get at Crack...,1
2,Quaker Oatmeal Squares with a Hint of Brown Su...,1
3,Coffee was killing my stomach so I went in sea...,1
4,Boy does this seasoning mix bring back memorie...,1
...,...,...
249349,I just bought this soup today at my local groc...,0
249350,This soup is mostly broth. Although it has a k...,0
249351,"It is mostly broth, with the advertised / cu...",0
249352,I had ordered some of these a few months back ...,0


In [None]:
# Pull out the model inputs (review text) and the targets (binary label) as arrays
sentences, labels = df["Text"].values, df["review"].values

In [None]:
# Define training and testing sets (80% train / 20% test).
# random_state pins the shuffle so the split is reproducible, and stratify keeps
# the proportion of positive and negative labels the same in both subsets.
train_sentences, test_sentences, train_labels, test_labels = train_test_split(
    sentences,
    labels,
    test_size=0.2,
    shuffle=True,
    random_state=42,
    stratify=labels,
)

In [None]:
# Drop the references to the intermediate frames and arrays
del positive_reviews
del negative_reviews
del sentences
del labels

# Tokenization

In [None]:
# Tokenizer / padding settings
vocab_size = 10000      # used as the Tokenizer's num_words and as the Embedding layer's input size
oov_tok = "<OOV>"       # token substituted for out-of-vocabulary words
max_length = 400        # every sequence is padded or truncated to this many tokens
padding_type = 'post'   # add padding at the end of a sequence
trunc_type = 'post'     # cut over-long sequences at the end

# Model settings
embed_dim = 64          # dimension of the embedding layer

In [None]:
# Build the vocabulary from the training sentences only
tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(train_sentences)

# Convert both splits to integer token sequences using that vocabulary
train_sequences = tokenizer.texts_to_sequences(train_sentences)
test_sequences = tokenizer.texts_to_sequences(test_sentences)

# Pad or truncate every sequence to max_length tokens, with identical settings for both splits
pad_options = dict(maxlen=max_length, padding=padding_type, truncating=trunc_type)
train_padded = pad_sequences(train_sequences, **pad_options)
test_padded = pad_sequences(test_sequences, **pad_options)

# Define Model

In [None]:
# Reset the global Keras state before building the model
tf.keras.backend.clear_session()

kl = tf.keras.layers
model = tf.keras.Sequential([
    # Token ids -> embed_dim-sized vectors
    kl.Embedding(input_dim=vocab_size, output_dim=embed_dim, input_length=max_length),
    # Read each sequence in both directions with 32 LSTM units per direction
    kl.Bidirectional(kl.LSTM(units=32)),
    kl.Dropout(rate=0.4),
    kl.Dense(units=16, activation='relu'),
    kl.Dropout(rate=0.4),
    kl.Dense(units=8, activation='relu'),
    kl.Dropout(rate=0.4),
    # Single sigmoid unit for the binary output
    kl.Dense(units=1, activation='sigmoid'),
])

optimizer = tf.keras.optimizers.Adam()
model.compile(
    optimizer=optimizer,
    loss='binary_crossentropy',
    metrics=['accuracy'],
)
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 400, 64)           640000    
_________________________________________________________________
bidirectional (Bidirectional (None, 64)                24832     
_________________________________________________________________
dropout (Dropout)            (None, 64)                0         
_________________________________________________________________
dense (Dense)                (None, 16)                1040      
_________________________________________________________________
dropout_1 (Dropout)          (None, 16)                0         
_________________________________________________________________
dense_1 (Dense)              (None, 8)                 136       
_________________________________________________________________
dropout_2 (Dropout)          (None, 8)                 0

In [None]:
# Train for 10 epochs in batches of 128, evaluating on the held-out split after each epoch
history = model.fit(
    x=train_padded,
    y=train_labels,
    batch_size=128,
    epochs=10,
    validation_data=(test_padded, test_labels),
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


# Get embedding vectors

In [None]:
# word -> index mapping learned by the tokenizer, and its inverse (index -> word)
word_index = tokenizer.word_index
reverse_word_index = {index: word for word, index in word_index.items()}

In [None]:
# The first layer of the model is the Embedding layer; take its first weight array
e = model.layers[0]
embedding_layer_weights = e.get_weights()
weights = embedding_layer_weights[0]
print(weights.shape)

(10000, 64)


In [None]:
import io

# Export the learned embeddings as two tab-separated files:
#   vecs.tsv - one embedding vector per line, components separated by tabs
#   meta.tsv - the word belonging to the vector on the same line number
#
# Index 0 is skipped (the loop starts at 1). The upper bound is capped at the
# number of known words so that a vocabulary smaller than vocab_size cannot
# raise a KeyError on reverse_word_index.
last_index = min(vocab_size, len(reverse_word_index) + 1)

# The with-statement closes both files even if the loop raises part-way through.
with io.open('vecs.tsv', 'w', encoding='utf-8') as out_v, \
        io.open('meta.tsv', 'w', encoding='utf-8') as out_m:
    for word_num in range(1, last_index):
        word = reverse_word_index[word_num]
        embeddings = weights[word_num]
        out_m.write(word + "\n")
        out_v.write('\t'.join(str(x) for x in embeddings) + "\n")

# Predict using custom string

In [None]:
# NOTE(review): this replaces the weights trained above with ones read from disk.
# None of the cells shown here write this file, so it presumably has to exist already - confirm.
weights_path = "Weights/LSTM.h5"
model.load_weights(weights_path)

In [None]:
# Example review text that is passed to predict_sentiment below
test_string = "this was the best food I had"

In [None]:
def predict_sentiment(model,custom_text):
    """Print whether `custom_text` is classified as a positive or a negative review.

    The text is tokenized with the notebook-level `tokenizer`, padded or
    truncated to `max_length` with the same settings used for the training
    data, and scored by `model`. A score above 0.5 is reported as positive,
    anything else as negative.

    Args:
        model: object whose `predict` method returns the score for the single
            padded sequence it is given.
        custom_text: the review text to classify.

    Returns:
        None. The verdict is printed.
    """
    # A plain one-element list is enough for a single text
    custom_sequence = tokenizer.texts_to_sequences([custom_text])
    custom_padded = pad_sequences(custom_sequence, maxlen=max_length, padding=padding_type, truncating=trunc_type)

    # Calling float() directly on an array with more than zero dimensions is
    # deprecated since NumPy 1.25, so flatten the prediction and take its
    # single element explicitly.
    score = float(np.asarray(model.predict(custom_padded)).reshape(-1)[0])

    if score > 0.5:
        print("The review has a positive sentiment :)")
    else:
        print("The review has a negative sentiment :(")

In [None]:
# Classify the example review with the model
predict_sentiment(model, custom_text=test_string)

The review has a positive sentiment :)
