In [1]:
import pandas as pd
import gzip

In [2]:
# Paths of the gzipped Amazon review dumps that concatDF will load.
# NOTE(review): both entries are the same file, so every review is loaded twice — confirm this is intended.
data = [
    'Resources/data/reviews_Digital_Music_5.json.gz',
    'Resources/data/reviews_Digital_Music_5.json.gz',
]

# functions to read Amazon data into a pandas data frame
def parse(path):
    """Yield one evaluated record per line of a gzip-compressed file.

    Parameters
    ----------
    path : str
        Location of the gzip file; it is opened in binary mode.

    Yields
    ------
    object
        The result of evaluating each line as a Python expression.
    """
    # The context manager closes the gzip handle once the generator is
    # exhausted or discarded (previously the handle was never closed).
    with gzip.open(path, 'rb') as handle:
        for line in handle:
            # NOTE(review): eval() executes arbitrary code found in the file.
            # Only use this on trusted data; consider json.loads or
            # ast.literal_eval if the file format allows it.
            yield eval(line)

def getDF(path):
    """Load every record produced by ``parse(path)`` into a DataFrame.

    Records are keyed by their 0-based position in the file, and that
    position becomes the row index of the returned frame.
    """
    records_by_position = dict(enumerate(parse(path)))
    return pd.DataFrame.from_dict(records_by_position, orient='index')

# function to concatenate multiple Amazon datasets
def concatDF(data):
    """Load several review files and stack them into one DataFrame.

    Parameters
    ----------
    data : iterable of str
        Paths accepted by ``getDF``.

    Returns
    -------
    pandas.DataFrame
        All rows of every dataset, in input order, with the columns that
        are not needed downstream removed. Rows are renumbered 0..N-1 so
        index labels are unique across datasets.

    Raises
    ------
    KeyError
        If any of the columns to drop is missing (including when ``data``
        is empty).
    """
    # Load everything first and concatenate once: growing a frame with
    # pd.concat inside a loop copies all previous rows on every iteration.
    frames = [getDF(dataset) for dataset in data]
    if frames:
        df = pd.concat(frames, axis=0, ignore_index=True)
    else:
        df = pd.DataFrame()
    # drop unneeded columns
    unneeded_columns = ['reviewerID', 'asin', 'reviewerName', 'helpful',
                        'summary', 'unixReviewTime', 'reviewTime']
    return df.drop(columns=unneeded_columns)

In [3]:
# Build the working frame from every path listed in `data`.
df = concatDF(data=data)

In [9]:
# Preview the first five rows of the freshly loaded frame.
df.head(5)

Unnamed: 0,reviewText,overall
0,"It's hard to believe ""Memory of Trees"" came ou...",5.0
1,"A clasically-styled and introverted album, Mem...",5.0
2,I never thought Enya would reach the sublime h...,5.0
3,This is the third review of an irish album I w...,5.0
4,"Enya, despite being a successful recording art...",4.0


In [14]:
# Lower-case every review text.
df = df.assign(reviewText=df['reviewText'].str.lower())

In [18]:
# Preview the first five rows after lower-casing.
df.head(5)

Unnamed: 0,reviewText,overall
0,"it's hard to believe ""memory of trees"" came ou...",5.0
1,"a clasically-styled and introverted album, mem...",5.0
2,i never thought enya would reach the sublime h...,5.0
3,this is the third review of an irish album i w...,5.0
4,"enya, despite being a successful recording art...",4.0


In [45]:
import nltk
def provideTokens(row):
    """Split ``row`` into word tokens and keep only the purely alphabetic ones.

    The text is tokenized with ``nltk.word_tokenize``; any token for which
    ``str.isalpha()`` is false (i.e. it contains a digit, punctuation mark,
    hyphen, apostrophe, ...) is discarded entirely.
    """
    return list(filter(str.isalpha, nltk.word_tokenize(row)))

In [46]:
# Tokenize each review into a list of alphabetic tokens.
df['tokened_words'] = df['reviewText'].map(provideTokens)

In [47]:
# Preview the first five rows including the new token column.
df.head(5)

Unnamed: 0,reviewText,overall,tokened_words
0,"it's hard to believe ""memory of trees"" came ou...",5.0,"[it, hard, to, believe, memory, of, trees, cam..."
1,"a clasically-styled and introverted album, mem...",5.0,"[a, and, introverted, album, memory, of, trees..."
2,i never thought enya would reach the sublime h...,5.0,"[i, never, thought, enya, would, reach, the, s..."
3,this is the third review of an irish album i w...,5.0,"[this, is, the, third, review, of, an, irish, ..."
4,"enya, despite being a successful recording art...",4.0,"[enya, despite, being, a, successful, recordin..."


In [50]:
from nltk.corpus import stopwords
# English stop words as a set, for fast membership tests in remove_stops.
stops = {word for word in stopwords.words("english")}

def remove_stops(row):
    """Return the tokens of ``row``, in order, that are not in ``stops``."""
    return list(filter(lambda token: token not in stops, row))

In [51]:
# Strip stop words from every token list; this column feeds the model.
df['train_me'] = df['tokened_words'].map(remove_stops)

In [53]:
# Preview the first five rows including the stop-word-free tokens.
df.head(5)

Unnamed: 0,reviewText,overall,tokened_words,train_me
0,"it's hard to believe ""memory of trees"" came ou...",5.0,"[it, hard, to, believe, memory, of, trees, cam...","[hard, believe, memory, trees, came, years, ag..."
1,"a clasically-styled and introverted album, mem...",5.0,"[a, and, introverted, album, memory, of, trees...","[introverted, album, memory, trees, masterpiec..."
2,i never thought enya would reach the sublime h...,5.0,"[i, never, thought, enya, would, reach, the, s...","[never, thought, enya, would, reach, sublime, ..."
3,this is the third review of an irish album i w...,5.0,"[this, is, the, third, review, of, an, irish, ...","[third, review, irish, album, write, today, ot..."
4,"enya, despite being a successful recording art...",4.0,"[enya, despite, being, a, successful, recordin...","[enya, despite, successful, recording, artist,..."


In [56]:
# Features are the cleaned token lists; targets are the ratings cast to int.
x, y = df['train_me'], df['overall'].astype(int)

In [95]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# max_words caps the tokenizer vocabulary (and is the Embedding input size);
# max_len is the fixed length every sequence is padded/truncated to.
max_words, max_len = 6000, 120
tokenizer = Tokenizer(num_words=max_words)
def padSequences(x, fit=True):
    """Convert texts to fixed-length integer sequences with the shared tokenizer.

    Parameters
    ----------
    x : iterable
        Texts (or token lists) accepted by the Keras ``Tokenizer``.
    fit : bool, default True
        When True (the original behaviour) the module-level ``tokenizer`` is
        fitted on ``x`` before encoding. Pass False to encode data with the
        vocabulary that is already fitted, e.g. for a test split or for new
        sentences at prediction time, so the vocabulary is not altered.

    Returns
    -------
    The result of ``pad_sequences`` with ``maxlen=max_len``.
    """
    if fit:
        tokenizer.fit_on_texts(x)
    sequences = tokenizer.texts_to_sequences(x)
    return pad_sequences(sequences, maxlen=max_len)

In [96]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing; the fixed seed makes the split
# (and everything computed from it) reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

In [97]:
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import to_categorical
# Map the ratings seen in the training split to consecutive class ids.
label_encoder = LabelEncoder()
label_encoder.fit(y_train)

encoded_y_train = label_encoder.transform(y_train)
encoded_y_test = label_encoder.transform(y_test)

# One-hot encode with an explicit width: without num_classes, to_categorical
# sizes each array from its own maximum id, so the train and test matrices
# could end up with different numbers of columns.
num_classes = len(label_encoder.classes_)
y_train_categorical = to_categorical(encoded_y_train, num_classes=num_classes)
y_test_categorical = to_categorical(encoded_y_test, num_classes=num_classes)

In [98]:
# Fit the tokenizer vocabulary on the training split and encode it.
X_train_padded = padSequences(X_train)
# Encode the test split with the vocabulary fitted above. Calling
# padSequences here would fit the tokenizer again on test data, which leaks
# test vocabulary and can re-rank word indices so that train and test
# sequences no longer share the same encoding.
X_test_padded = pad_sequences(tokenizer.texts_to_sequences(X_test), maxlen=max_len)

In [99]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, LSTM

# Embedding -> LSTM -> softmax over 5 output units.
model = Sequential([
    Embedding(max_words, 20),
    LSTM(15, dropout=.5),
    Dense(units=5, activation='softmax'),
])
model.compile(
    optimizer='rmsprop',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)
model.summary()

Model: "sequential_7"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_5 (Embedding)      (None, None, 20)          120000    
_________________________________________________________________
lstm_5 (LSTM)                (None, 15)                2160      
_________________________________________________________________
dense_8 (Dense)              (None, 5)                 80        
Total params: 122,240
Trainable params: 122,240
Non-trainable params: 0
_________________________________________________________________


In [100]:
# Train for 50 epochs on the first 1000 training rows only.
model.fit(x=X_train_padded[:1000], y=y_train_categorical[:1000], epochs=50, verbose=2)

Epoch 1/50
32/32 - 3s - loss: 1.4537 - accuracy: 0.5460
Epoch 2/50
32/32 - 1s - loss: 1.2034 - accuracy: 0.5700
Epoch 3/50
32/32 - 1s - loss: 1.1542 - accuracy: 0.5700
Epoch 4/50
32/32 - 1s - loss: 1.0824 - accuracy: 0.5720
Epoch 5/50
32/32 - 1s - loss: 1.0023 - accuracy: 0.5820
Epoch 6/50
32/32 - 1s - loss: 0.9196 - accuracy: 0.6070
Epoch 7/50
32/32 - 1s - loss: 0.8442 - accuracy: 0.6420
Epoch 8/50
32/32 - 1s - loss: 0.7923 - accuracy: 0.6770
Epoch 9/50
32/32 - 1s - loss: 0.7411 - accuracy: 0.6970
Epoch 10/50
32/32 - 1s - loss: 0.7028 - accuracy: 0.7290
Epoch 11/50
32/32 - 1s - loss: 0.6470 - accuracy: 0.7580
Epoch 12/50
32/32 - 1s - loss: 0.6120 - accuracy: 0.7860
Epoch 13/50
32/32 - 1s - loss: 0.5614 - accuracy: 0.8030
Epoch 14/50
32/32 - 1s - loss: 0.5343 - accuracy: 0.8210
Epoch 15/50
32/32 - 1s - loss: 0.5063 - accuracy: 0.8340
Epoch 16/50
32/32 - 1s - loss: 0.4649 - accuracy: 0.8430
Epoch 17/50
32/32 - 1s - loss: 0.4488 - accuracy: 0.8500
Epoch 18/50
32/32 - 1s - loss: 0.4312 - 

<tensorflow.python.keras.callbacks.History at 0x1e604e53100>

In [207]:
# Train for 50 epochs on training rows 4000-4999.
# NOTE(review): this repeats the previous training cell with a different slice — confirm whether the model is meant to be rebuilt in between.
model.fit(x=X_train_padded[4000:5000], y=y_train_categorical[4000:5000], epochs=50, verbose=2)

Epoch 1/50
32/32 - 1s - loss: 1.9720 - accuracy: 0.5030
Epoch 2/50
32/32 - 1s - loss: 1.4437 - accuracy: 0.5140
Epoch 3/50
32/32 - 1s - loss: 1.2287 - accuracy: 0.5410
Epoch 4/50
32/32 - 1s - loss: 1.0929 - accuracy: 0.5730
Epoch 5/50
32/32 - 1s - loss: 1.0556 - accuracy: 0.5880
Epoch 6/50
32/32 - 1s - loss: 0.9790 - accuracy: 0.6070
Epoch 7/50
32/32 - 1s - loss: 0.9108 - accuracy: 0.6350
Epoch 8/50
32/32 - 1s - loss: 0.8681 - accuracy: 0.6610
Epoch 9/50
32/32 - 1s - loss: 0.7997 - accuracy: 0.6960
Epoch 10/50
32/32 - 1s - loss: 0.7555 - accuracy: 0.7050
Epoch 11/50
32/32 - 1s - loss: 0.7187 - accuracy: 0.7340
Epoch 12/50
32/32 - 1s - loss: 0.6610 - accuracy: 0.7460
Epoch 13/50
32/32 - 1s - loss: 0.6349 - accuracy: 0.7720
Epoch 14/50
32/32 - 1s - loss: 0.5877 - accuracy: 0.7900
Epoch 15/50
32/32 - 1s - loss: 0.5351 - accuracy: 0.8110
Epoch 16/50
32/32 - 1s - loss: 0.4854 - accuracy: 0.8240
Epoch 17/50
32/32 - 1s - loss: 0.4655 - accuracy: 0.8370
Epoch 18/50
32/32 - 1s - loss: 0.4323 - 

<tensorflow.python.keras.callbacks.History at 0x1e61df84af0>

In [200]:
# Count how many of the first 1000 one-hot training labels are exactly the
# vector for the last class (presumably the highest rating — confirm).
target_row = [0., 0., 0., 0., 1.]
label_sample = y_train_categorical[:1000]
# Compare every row against target_row at once; a row counts only when all
# of its elements match.
counter = int((label_sample == target_row).all(axis=1).sum())
print(counter)

570


In [223]:
# Encode a hand-written review with the fitted tokenizer and pad it to max_len.
sentence = ["this is a terrible bad toy"]
sequence = tokenizer.texts_to_sequences(sentence)
test = pad_sequences(sequence, maxlen=max_len)


In [224]:
# Class probabilities for the single encoded sentence.
prediction = model.predict(test)[0].tolist()
print(prediction)
# Position of the largest probability (first one on ties); printed 1-based.
score = max(range(len(prediction)), key=prediction.__getitem__)
print(score + 1)

[0.012861790135502815, 0.2838737666606903, 0.04923542961478233, 0.1138872504234314, 0.540141761302948]
5
