In [1]:
import json
import numpy as np
import tensorflow as tf

In [2]:
from statistics import median
from nltk.tokenize import word_tokenize
from sklearn.metrics import f1_score, confusion_matrix

In [3]:
def convert_json_to_review_and_rating(json_text):
    review_dict = json.loads(json_text)    
    return review_dict['reviewText'], review_dict['overall']

In [4]:
def get_reviews_and_ratings(reviews_filepath):
    review_texts = list()
    ratings = list()
    with open(reviews_filepath) as reviews_file:
        for line in reviews_file:
            review_text, rating = convert_json_to_review_and_rating(line)
            review_texts.append(review_text)
            ratings.append(int(rating))
            
    return review_texts, ratings

In [5]:
def texts_to_indexed_word_sequences(review_texts):
    word_indices = dict()
    indexed_sequences = list()
    word_index = 1
    
    for review_text in review_texts:
        tokens = word_tokenize(review_text)
        indexed_sequence = list()
        for token in tokens:
            if token not in word_indices:
                word_indices[token] = word_index
                indexed_sequence.append(word_index)
                word_index += 1
            else:
                indexed_sequence.append(word_indices[token])
        indexed_sequences.append(np.asarray(indexed_sequence))
        
    return word_indices, indexed_sequences

In [6]:
reviews_filepath = "/home/v2john/datasets/amazon/reviews_electronics.json"

In [7]:
review_texts, ratings = get_reviews_and_ratings(reviews_filepath)
print(len(review_texts), len(ratings))

10000 10000


In [8]:
word_indices, indexed_sequences = texts_to_indexed_word_sequences(review_texts)

In [9]:
VOCAB_SIZE = len(word_indices)
print("VOCAB_SIZE: ", VOCAB_SIZE)

EMBEDDING_SIZE = 300
print("EMBEDDING_SIZE: ", EMBEDDING_SIZE)

MAX_SEQUENCE_LENGTH = int(median([len(sequence) for sequence in indexed_sequences]))
print("MAX_SEQUENCE_LENGTH: ", MAX_SEQUENCE_LENGTH)

NUM_CLASSES = len(set(ratings))
print("NUM_CLASSES: ", NUM_CLASSES)

VOCAB_SIZE:  46265
EMBEDDING_SIZE:  300
MAX_SEQUENCE_LENGTH:  77
NUM_CLASSES:  5


In [10]:
def pad_indexed_sequences(indexed_sequences, max_sequence_length):
    new_indexed_sequences = list()
    for sequence in indexed_sequences:
        if len(sequence) >= max_sequence_length:
            new_indexed_sequences.append(sequence[:max_sequence_length])
        else:
            shortfall = max_sequence_length - len(sequence)
            new_indexed_sequences.append(
                np.pad(sequence, (0, shortfall), 'constant', 
                       constant_values=(0, 0)))
    return np.asarray(new_indexed_sequences)

def convert_labels_to_logits(ratings, num_classes):
    one_hot_ratings = list()
    for rating in ratings:
        one_hot_rating = np.zeros(num_classes)
        one_hot_rating[rating - 1] = 1
        one_hot_ratings.append(one_hot_rating)
        
    return np.asarray(one_hot_ratings)

def tensorize_sequences_and_labels(indexed_sequences, ratings, max_sequence_length, num_classes):
    return pad_indexed_sequences(indexed_sequences, max_sequence_length), \
        convert_labels_to_logits(ratings, num_classes)

In [11]:
indexed_sequences, labels = tensorize_sequences_and_labels(
    indexed_sequences, ratings, MAX_SEQUENCE_LENGTH, NUM_CLASSES)

In [12]:
indexed_sequences.shape, labels.shape

((10000, 77), (10000, 5))

In [15]:
graph_1 = tf.Graph()
with graph_1.as_default():
    
    input_x = tf.placeholder(
        tf.int32, [None, MAX_SEQUENCE_LENGTH], name="input_x")
    input_y = tf.placeholder(
        tf.int32, [None, NUM_CLASSES], name="input_y")

    word_embeddings = tf.get_variable(
        shape=[VOCAB_SIZE, EMBEDDING_SIZE], name="word_embeddings", 
        dtype=tf.float32)
    print("word_embeddings: ", word_embeddings)
    
    embedded_sequence = tf.nn.embedding_lookup(
        word_embeddings, input_x, name="embedded_sequence")
    print("embedded_sequence: ", embedded_sequence)
    
    conv_1 = tf.layers.conv1d(
        inputs=embedded_sequence, filters=64, kernel_size=3, name="conv_1")
    print("conv_1: ", conv_1)
    max_pool_1 = tf.layers.max_pooling1d(
        inputs=conv_1, pool_size=2, strides=2, name="max_pool_1")
    print("max_pool_1: ", max_pool_1)
    
    conv_2 = tf.layers.conv1d(
        inputs=max_pool_1, filters=128, kernel_size=3, name="conv_2")
    print("conv_2: ", conv_2)
    max_pool_2 = tf.layers.max_pooling1d(
        inputs=conv_2, pool_size=2, strides=2, name="max_pool_2")
    print("max_pool_2: ", max_pool_2)
    
    lstm_cell_fw = tf.contrib.rnn.BasicLSTMCell(32)
    lstm_cell_bw = tf.contrib.rnn.BasicLSTMCell(32)
    
    outputs, output_states = tf.nn.bidirectional_dynamic_rnn(
        cell_fw=lstm_cell_fw, cell_bw=lstm_cell_bw, inputs=max_pool_2, 
        dtype=tf.float32)
    print("fw_lstm_output: ", output_states[0].h)
    print("bw_lstm_output: ", output_states[1].h)
    
    lstm_output = tf.concat([output_states[0].h, output_states[1].h], axis=1)
    print("lstm_output: ", lstm_output)
    
    dense_1 = tf.layers.dense(
        inputs=lstm_output, units=NUM_CLASSES, name="dense_1")
    print("dense_1.shape: ", dense_1.shape)
    
    softmax_output = tf.nn.softmax(dense_1, name="softmax")
    print("softmax_output.shape: ", softmax_output.shape)
    
    one_hot_label = tf.one_hot(
        indices=input_y-1, depth=1, on_value=1, off_value=0,
        name="one_hot_label")
    one_hot_label = tf.reshape(one_hot_label, tf.shape(softmax_output))
    print("one_hot_label.shape: ", one_hot_label.shape)
    
    loss = tf.losses.softmax_cross_entropy(
        one_hot_label, softmax_output)
    
    optimizer = tf.train.AdamOptimizer()
    train_op = optimizer.minimize(loss)

word_embeddings:  <tf.Variable 'word_embeddings:0' shape=(46265, 300) dtype=float32_ref>
embedded_sequence:  Tensor("embedded_sequence:0", shape=(?, 77, 300), dtype=float32)
conv_1:  Tensor("conv_1/BiasAdd:0", shape=(?, 75, 64), dtype=float32)
max_pool_1:  Tensor("max_pool_1/Squeeze:0", shape=(?, 37, 64), dtype=float32)
conv_2:  Tensor("conv_2/BiasAdd:0", shape=(?, 35, 128), dtype=float32)
max_pool_2:  Tensor("max_pool_2/Squeeze:0", shape=(?, 17, 128), dtype=float32)
fw_lstm_output:  Tensor("bidirectional_rnn/fw/fw/while/Exit_3:0", shape=(?, 32), dtype=float32)
bw_lstm_output:  Tensor("bidirectional_rnn/bw/bw/while/Exit_3:0", shape=(?, 32), dtype=float32)
lstm_output:  Tensor("concat:0", shape=(?, 64), dtype=float32)
dense_1.shape:  (?, 5)
softmax_output.shape:  (?, 5)
one_hot_label.shape:  (?, 5)


In [22]:
with tf.Session(graph=graph_1) as sess:
    sess.run(tf.global_variables_initializer())
    
    epoch_reporting_interval = 10
    batch_size = 100
    
    for current_epoch in range(1, 1001):
        for batch_number in range(0, 100):
            _, loss_var = sess.run(
                [train_op, loss], 
                feed_dict={
                    input_x: indexed_sequences[batch_number * batch_size : 
                                               (batch_number + 1) * batch_size],
                    input_y: labels[batch_number * batch_size : 
                                    (batch_number + 1) * batch_size]
                })

        if (current_epoch % epoch_reporting_interval == 0):
            print("Training epoch: ", current_epoch, ", Loss: ", loss_var)
            
    final_predictions = sess.run(
        softmax_output, 
        feed_dict={
            input_x: indexed_sequences, 
            input_y: labels
        })

Training epoch:  10 , Loss:  1.25751
Training epoch:  20 , Loss:  1.19797
Training epoch:  30 , Loss:  1.10118
Training epoch:  40 , Loss:  1.04328
Training epoch:  50 , Loss:  0.976438
Training epoch:  60 , Loss:  0.966803
Training epoch:  70 , Loss:  0.965322
Training epoch:  80 , Loss:  0.975027
Training epoch:  90 , Loss:  0.962458
Training epoch:  100 , Loss:  0.953298
Training epoch:  110 , Loss:  0.952695
Training epoch:  120 , Loss:  0.942155
Training epoch:  130 , Loss:  0.942023
Training epoch:  140 , Loss:  0.983687
Training epoch:  150 , Loss:  0.942833
Training epoch:  160 , Loss:  0.942756
Training epoch:  170 , Loss:  0.946515
Training epoch:  180 , Loss:  0.943674
Training epoch:  190 , Loss:  0.952881
Training epoch:  200 , Loss:  0.934023
Training epoch:  210 , Loss:  0.933716
Training epoch:  220 , Loss:  0.933452
Training epoch:  230 , Loss:  0.934012
Training epoch:  240 , Loss:  0.949581
Training epoch:  250 , Loss:  0.934911
Training epoch:  260 , Loss:  0.934851

In [23]:
y_pred = list()
for prediction in final_predictions:
    y_label = np.argmax(prediction) + 1
    y_pred.append(y_label)
print(len(y_pred))

10000


In [24]:
f1_score(y_true=ratings, y_pred=y_pred, average='micro')

0.98489999999999989

In [25]:
confusion_matrix(ratings, y_pred)

array([[ 556,    2,    5,    3,    6],
       [   4,  439,    6,    0,    1],
       [  10,    5,  772,   18,   17],
       [   3,    2,   14, 2049,   27],
       [   4,    2,    7,   15, 6033]])