In [5]:
import pandas as pd
import gzip
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.models import load_model
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [6]:
# List of datasets to use
data = ['reviews_Digital_Music_5.json.gz', 'reviews_Cell_Phones_and_Accessories_5.json.gz']

# functions to read Amazon data into a pandas data frame
def parse(path):
    g = gzip.open(path, 'rb')
    for l in g:
        yield eval(l)

def getDF(path):
    i = 0
    df = {}
    for d in parse(path):
        df[i] = d
        i += 1
    return pd.DataFrame.from_dict(df, orient='index')

# function to concatenate multiple Amazon datasets
def concatDF(data):
    df = pd.DataFrame()
    for dataset in data:
        dftemp = getDF(dataset)
        df = pd.concat([df, dftemp], axis=0)
    # drop unneeded columns
    df.drop(columns = ['reviewerID', 'asin', 'reviewerName', 'helpful', \
                       'summary', 'unixReviewTime', 'reviewTime'], inplace= True)
    return df

In [7]:
df = concatDF(data)
df['sentiments'] = df.overall.apply(lambda x: 0 if x in [1, 2] else 1)

In [8]:
df

Unnamed: 0,reviewText,overall,sentiments
0,"It's hard to believe ""Memory of Trees"" came ou...",5.0,1
1,"A clasically-styled and introverted album, Mem...",5.0,1
2,I never thought Enya would reach the sublime h...,5.0,1
3,This is the third review of an irish album I w...,5.0,1
4,"Enya, despite being a successful recording art...",4.0,1
...,...,...,...
194434,Works great just like my original one. I reall...,5.0,1
194435,Great product. Great packaging. High quality a...,5.0,1
194436,"This is a great cable, just as good as the mor...",5.0,1
194437,I really like it becasue it works well with my...,5.0,1


In [9]:
tokenizer = Tokenizer(oov_token="<OOV>")

In [10]:
x = df['reviewText']
y = df['sentiments']

In [11]:
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2)
x_train, x_val, Y_train, y_val = train_test_split(X_train, y_train, test_size=0.2)

In [12]:
Y_train

190042    1
18159     0
587       0
9010      1
154790    1
         ..
25262     1
25156     1
165829    1
22267     1
84169     1
Name: sentiments, Length: 165852, dtype: int64

In [13]:
vocab_size = 40000
embedding_dim = 16
max_length = 120
trunc_type = 'post'
oov_tok = '<OOV>'
padding_type = 'post'

In [14]:
tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(x_train)
word_index = tokenizer.word_index

In [15]:
sequences = tokenizer.texts_to_sequences(x_train)
padded = pad_sequences(sequences, maxlen=max_length, truncating=trunc_type)
testing_sentences = tokenizer.texts_to_sequences(X_test)
testing_padded = pad_sequences(testing_sentences, maxlen=max_length)

In [23]:
Y_train.shape

(165852,)

In [16]:
model = Sequential()
model.add(tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length))
model.add(Dense(6, activation='relu'))
model.add(Dense(1, activation='sigmoid'))


In [17]:
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [18]:
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 120, 16)           640000    
_________________________________________________________________
dense (Dense)                (None, 120, 6)            102       
_________________________________________________________________
dense_1 (Dense)              (None, 120, 1)            7         
Total params: 640,109
Trainable params: 640,109
Non-trainable params: 0
_________________________________________________________________


In [19]:
training_labels_final = np.array(Y_train)
testing_labels_final = np.array(y_test)

In [20]:
num_epochs = 1
history = model.fit(padded, training_labels_final, epochs=num_epochs, validation_data=(testing_padded, testing_labels_final))



In [21]:
model

<tensorflow.python.keras.engine.sequential.Sequential at 0x28160cd79d0>