In [1]:
import pandas as pd
import gzip
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.models import load_model
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [2]:
# List of datasets to use
data = ['Resources/data/reviews_Digital_Music_5.json.gz', 'Resources/data/reviews_Digital_Music_5.json.gz']

# functions to read Amazon data into a pandas data frame
def parse(path):
    g = gzip.open(path, 'rb')
    for l in g:
        yield eval(l)

def getDF(path):
    i = 0
    df = {}
    for d in parse(path):
        df[i] = d
        i += 1
    return pd.DataFrame.from_dict(df, orient='index')

# function to concatenate multiple Amazon datasets
def concatDF(data):
    df = pd.DataFrame()
    for dataset in data:
        dftemp = getDF(dataset)
        df = pd.concat([df, dftemp], axis=0)
    # drop unneeded columns
    df.drop(columns = ['reviewerID', 'asin', 'reviewerName', 'helpful', \
                       'summary', 'unixReviewTime', 'reviewTime'], inplace= True)
    return df

In [3]:
df = concatDF(data)

In [4]:
df.overall = df.overall.astype(int)

In [5]:
df.head()

Unnamed: 0,reviewText,overall
0,"It's hard to believe ""Memory of Trees"" came ou...",5
1,"A clasically-styled and introverted album, Mem...",5
2,I never thought Enya would reach the sublime h...,5
3,This is the third review of an irish album I w...,5
4,"Enya, despite being a successful recording art...",4


In [6]:
tokenizer = Tokenizer(oov_token="<OOV>")

In [7]:
x = df['reviewText'].values
y = df['overall'].values

In [8]:
y

array([5, 5, 5, ..., 5, 3, 1])

In [9]:
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2)


In [10]:
label_encoder = LabelEncoder()
label_encoder.fit(y_train)

encoded_y_train = label_encoder.transform(y_train)
encoded_y_test = label_encoder.transform(y_test)

y_train_categorical = to_categorical(encoded_y_train)
y_test_categorical = to_categorical(encoded_y_test)

In [11]:
y_train_categorical[:5]

array([[0., 0., 0., 0., 1.],
       [0., 0., 0., 1., 0.],
       [0., 0., 0., 0., 1.],
       [0., 0., 0., 0., 1.],
       [0., 0., 0., 1., 0.]], dtype=float32)

In [12]:
y_train[:5]

array([5, 4, 5, 5, 4])

In [13]:
vocab_size = 40000
embedding_dim = 16
max_length = 120
trunc_type = 'post'
oov_tok = '<OOV>'
padding_type = 'post'

In [14]:
tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(X_train)
word_index = tokenizer.word_index

In [15]:
word_index

{'<OOV>': 1,
 'the': 2,
 'and': 3,
 'a': 4,
 'of': 5,
 'to': 6,
 'is': 7,
 'this': 8,
 'i': 9,
 'it': 10,
 'in': 11,
 'that': 12,
 'album': 13,
 'on': 14,
 'with': 15,
 'quot': 16,
 'but': 17,
 'you': 18,
 'as': 19,
 'for': 20,
 'are': 21,
 'was': 22,
 'song': 23,
 'one': 24,
 'like': 25,
 'songs': 26,
 'not': 27,
 'all': 28,
 'his': 29,
 'from': 30,
 'be': 31,
 'have': 32,
 'my': 33,
 "it's": 34,
 'has': 35,
 'music': 36,
 'good': 37,
 'he': 38,
 'just': 39,
 'out': 40,
 'more': 41,
 'me': 42,
 'an': 43,
 'great': 44,
 'so': 45,
 'by': 46,
 'they': 47,
 'her': 48,
 'at': 49,
 'some': 50,
 'if': 51,
 'cd': 52,
 'their': 53,
 'up': 54,
 'best': 55,
 'or': 56,
 'love': 57,
 'about': 58,
 'track': 59,
 'which': 60,
 'what': 61,
 'there': 62,
 'time': 63,
 'first': 64,
 'very': 65,
 'really': 66,
 'when': 67,
 'most': 68,
 'can': 69,
 'get': 70,
 'here': 71,
 'no': 72,
 'well': 73,
 'tracks': 74,
 'sound': 75,
 'than': 76,
 'still': 77,
 'she': 78,
 'would': 79,
 'only': 80,
 'rock': 81,
 

In [45]:
sequences = tokenizer.texts_to_sequences(X_train)
padded = pad_sequences(sequences, maxlen=max_length, truncating=trunc_type)
testing_sentences = tokenizer.texts_to_sequences(X_test)
testing_padded = pad_sequences(testing_sentences, maxlen=max_length, truncating=trunc_type)

In [17]:
padded

array([[    8,    13,    22, ...,   328,  2782,  1060],
       [    0,     0,     0, ...,   363,   226, 18361],
       [    0,     0,     0, ...,    19,     9,  2433],
       ...,
       [  101,    11,     2, ...,   283,     5,     4],
       [    0,     0,     0, ...,   891,    21,   104],
       [ 6207,  6809,     7, ...,    25,  6207,  4742]])

In [50]:
model = Sequential()

model.add(Dense(units=16, activation='relu', input_dim=max_length))
model.add(Dense(units=100, activation='relu'))
model.add(Dense(units=5, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['categorical_accuracy'])
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_3 (Dense)              (None, 16)                1936      
_________________________________________________________________
dense_4 (Dense)              (None, 100)               1700      
_________________________________________________________________
dense_5 (Dense)              (None, 5)                 505       
Total params: 4,141
Trainable params: 4,141
Non-trainable params: 0
_________________________________________________________________


In [52]:
model.fit(
    padded,
    y_train_categorical,
    epochs=2,
    shuffle=True,
    verbose=2
)

Epoch 1/2
3236/3236 - 2s - loss: 33.8229 - categorical_accuracy: 0.4602
Epoch 2/2
3236/3236 - 2s - loss: 1.3231 - categorical_accuracy: 0.5434


<tensorflow.python.keras.callbacks.History at 0x21b6b3fc070>

In [79]:
predictions = []
counter = 0;
for testme in testing_padded:
    testme = testme.reshape(1, -1)
    if counter == 10:
        break
    counter += 1
    predictions.append(model.predict_classes(testme))
    



In [80]:
predictions

[array([4], dtype=int64),
 array([4], dtype=int64),
 array([4], dtype=int64),
 array([4], dtype=int64),
 array([4], dtype=int64),
 array([4], dtype=int64),
 array([4], dtype=int64),
 array([4], dtype=int64),
 array([4], dtype=int64),
 array([4], dtype=int64)]

In [36]:
y_test[1]

4