In [1]:
import json
import time
import numpy as np
import tensorflow as tf

In [2]:
from statistics import median
from nltk.tokenize import word_tokenize
from sklearn.metrics import f1_score, confusion_matrix
from sklearn.utils import shuffle
from datetime import datetime as dt

In [3]:
def convert_json_to_review_and_rating(json_text):
    review_dict = json.loads(json_text)    
    return review_dict['reviewText'], review_dict['overall']

In [4]:
def get_reviews_and_ratings(reviews_filepath):
    review_texts = list()
    ratings = list()
    with open(reviews_filepath) as reviews_file:
        for line in reviews_file:
            review_text, rating = convert_json_to_review_and_rating(line)
            review_texts.append(review_text)
            ratings.append(int(rating))
            
    return review_texts, ratings

In [5]:
def clean_word(word):
    return word

def clean_sentence(sentence):
    sentence = sentence.lower()
    return sentence

In [6]:
def texts_to_indexed_word_sequences(review_texts):
    vocab = list()
    word_indices = dict()
    indexed_sequences = list()
    word_index = 1
    
    for review_text in review_texts:
        review_text = clean_sentence(review_text)
        tokens = word_tokenize(review_text)
        indexed_sequence = list()
        for token in tokens:
            token = clean_word(token)
            if token not in word_indices:
                vocab.append(token)
                word_indices[token] = word_index
                indexed_sequence.append(word_index)
                word_index += 1
            else:
                indexed_sequence.append(word_indices[token])
        indexed_sequences.append(np.asarray(indexed_sequence))
        
    return vocab, word_indices, indexed_sequences

In [7]:
reviews_filepath = "/home/v2john/datasets/amazon/reviews_electronics_tiny.json"

In [8]:
review_texts, ratings = get_reviews_and_ratings(reviews_filepath)
review_texts, ratings = shuffle(review_texts, ratings)
print(len(review_texts), len(ratings))

500 500


In [9]:
vocab, word_indices, indexed_sequences = texts_to_indexed_word_sequences(review_texts)

In [10]:
VOCAB_SIZE = len(vocab)
print("VOCAB_SIZE: ", VOCAB_SIZE)

EMBEDDING_SIZE = 300
print("EMBEDDING_SIZE: ", EMBEDDING_SIZE)

MAX_SEQUENCE_LENGTH = int(median([len(sequence) for sequence in indexed_sequences]))
print("MAX_SEQUENCE_LENGTH: ", MAX_SEQUENCE_LENGTH)

NUM_CLASSES = len(set(ratings))
print("NUM_CLASSES: ", NUM_CLASSES)

VOCAB_SIZE:  7276
EMBEDDING_SIZE:  300
MAX_SEQUENCE_LENGTH:  94
NUM_CLASSES:  5


In [11]:
def pad_indexed_sequences(indexed_sequences, max_sequence_length):
    new_indexed_sequences = list()
    for sequence in indexed_sequences:
        if len(sequence) >= max_sequence_length:
            new_indexed_sequences.append(sequence[:max_sequence_length])
        else:
            shortfall = max_sequence_length - len(sequence)
            new_indexed_sequences.append(
                np.pad(sequence, (0, shortfall), 'constant', 
                       constant_values=(0, 0)))
    return np.asarray(new_indexed_sequences)

def convert_labels_to_logits(ratings, num_classes):
    one_hot_ratings = list()
    for rating in ratings:
        one_hot_rating = np.zeros(num_classes)
        one_hot_rating[rating - 1] = 1
        one_hot_ratings.append(one_hot_rating)
        
    return np.asarray(one_hot_ratings)

def tensorize_sequences_and_labels(indexed_sequences, ratings, max_sequence_length, num_classes):
    return pad_indexed_sequences(indexed_sequences, max_sequence_length), \
        convert_labels_to_logits(ratings, num_classes)

In [12]:
indexed_sequences, labels = tensorize_sequences_and_labels(
    indexed_sequences, ratings, MAX_SEQUENCE_LENGTH, NUM_CLASSES)

In [13]:
indexed_sequences.shape, labels.shape

((500, 94), (500, 5))

In [14]:
graph_1 = tf.Graph()
current_epoch = int(time.time())

with graph_1.as_default():

    input_sequence = tf.placeholder(
        tf.int32, [None, MAX_SEQUENCE_LENGTH], name="input_sequence")
    print("input_sequence: ", input_sequence)
    
    input_label = tf.placeholder(
        tf.int32, [None, NUM_CLASSES], name="input_label")

    word_embeddings = tf.get_variable(
        shape=[VOCAB_SIZE + 1, EMBEDDING_SIZE], name="word_embeddings", 
        dtype=tf.float32)
    print("word_embeddings: ", word_embeddings)

    embedded_sequence = tf.nn.embedding_lookup(
        word_embeddings, input_sequence, name="embedded_sequence")
    print("embedded_sequence: ", embedded_sequence)

        
    # Discriminator Network
    conv_1 = tf.layers.conv1d(
        inputs=embedded_sequence, filters=64, kernel_size=3, 
        activation=tf.nn.relu, name="conv_1")
    print("conv_1: ", conv_1)

    bnorm_1 = tf.layers.batch_normalization(conv_1, name="batch_norm_1")
    print("batch_norm_1: ", bnorm_1)

    max_pool_1 = tf.layers.max_pooling1d(
        inputs=conv_1, pool_size=2, strides=2, name="max_pool_1")
    print("max_pool_1: ", max_pool_1)

    conv_2 = tf.layers.conv1d(
        inputs=max_pool_1, filters=128, kernel_size=3, 
        activation=tf.nn.relu, name="conv_2")
    print("conv_2: ", conv_2)

    bnorm_2 = tf.layers.batch_normalization(conv_2, name="batch_norm_2")
    print("batch_norm_2: ", bnorm_2)

    max_pool_2 = tf.layers.max_pooling1d(
        inputs=conv_2, pool_size=2, strides=2, name="max_pool_2")
    print("max_pool_2: ", max_pool_2)

    with tf.variable_scope('discriminator_lstm'):
        lstm_cell_fw = tf.contrib.rnn.BasicLSTMCell(
            num_units=32)
        lstm_cell_bw = tf.contrib.rnn.BasicLSTMCell(
            num_units=32)

        _, output_states = tf.nn.bidirectional_dynamic_rnn(
            cell_fw=lstm_cell_fw, cell_bw=lstm_cell_bw, inputs=max_pool_2, 
            dtype=tf.float32)
        print("fw_lstm_output: ", output_states[0].h)
        print("bw_lstm_output: ", output_states[1].h)

    lstm_output = tf.concat([output_states[0].h, output_states[1].h], 
                            axis=1)
    print("lstm_output: ", lstm_output)

    bnorm_3 = tf.layers.batch_normalization(lstm_output, name="batch_norm_3")
    print("batch_norm_3: ", bnorm_3)

    dense_1 = tf.layers.dense(
        inputs=lstm_output, units=NUM_CLASSES, 
        activation=tf.nn.relu, name="dense_1")
    print("dense_1.shape: ", dense_1.shape)

    softmax_output = tf.nn.softmax(dense_1, name="softmax")
    print("softmax_output.shape: ", softmax_output.shape)

    one_hot_label = tf.one_hot(
        indices=input_label - 1, depth=1, on_value=1, off_value=0,
        name="one_hot_label")
    one_hot_label = tf.reshape(one_hot_label, tf.shape(softmax_output))
    print("one_hot_label.shape: ", one_hot_label.shape)

    discriminator_loss = tf.losses.softmax_cross_entropy(
        one_hot_label, softmax_output)
    print("discriminator_loss: ", discriminator_loss)

    discriminator_loss_summary = tf.summary.scalar(
        "cross-entropy-loss-" + str(current_epoch), tf.convert_to_tensor(discriminator_loss))

    discriminator_optimizer = tf.train.AdamOptimizer()
    discriminator_train_operation = discriminator_optimizer.minimize(discriminator_loss)
    
    
    # Generator Network
    with tf.variable_scope('generator_lstm'):
        vanilla_lstm_cell_fw = tf.contrib.rnn.BasicLSTMCell(
            num_units=64)
        vanilla_lstm_cell_bw = tf.contrib.rnn.BasicLSTMCell(
            num_units=64)
    
        vanilla_rnn_outputs, _ = tf.nn.bidirectional_dynamic_rnn(
            cell_fw=vanilla_lstm_cell_fw, cell_bw=vanilla_lstm_cell_bw, 
            inputs=embedded_sequence, dtype=tf.float32, time_major=True)
        print("vanilla_lstm_outputs_fw: ", vanilla_rnn_outputs[0])
        print("vanilla_lstm_outputs_bw: ", vanilla_rnn_outputs[1])
        
    def perform_vocab_softmax(word_tensor):
        dense_word_1 = tf.layers.dense(
            inputs=word_tensor, units=VOCAB_SIZE, 
            activation=tf.nn.relu, name="dense_word_1")
        return dense_word_1
        
    mapped_lstm_output = tf.map_fn(
        perform_vocab_softmax,
        vanilla_rnn_outputs[0],
        name='mapped_lstm'
    )
    print("mapped_lstm_output: ", mapped_lstm_output)
    
    generator_loss = tf.contrib.seq2seq.sequence_loss(
        logits=mapped_lstm_output,
        targets=input_sequence, 
        weights=tf.ones_like(
            input_sequence, dtype=tf.float32, name=None, optimize=True
        ),
        name='generator_loss'
    )
    print("generator_loss: ", generator_loss)
    
    generator_loss_summary = tf.summary.scalar(
        "generated-sequence-loss-" + str(current_epoch), tf.convert_to_tensor(generator_loss))
    
    generator_optimizer = tf.train.AdamOptimizer()
    generator_train_operation = generator_optimizer.minimize(generator_loss)

input_sequence:  Tensor("input_sequence:0", shape=(?, 94), dtype=int32)
word_embeddings:  <tf.Variable 'word_embeddings:0' shape=(7277, 300) dtype=float32_ref>
embedded_sequence:  Tensor("embedded_sequence:0", shape=(?, 94, 300), dtype=float32)
conv_1:  Tensor("conv_1/Relu:0", shape=(?, 92, 64), dtype=float32)
batch_norm_1:  Tensor("batch_norm_1/batchnorm/add_1:0", shape=(?, 92, 64), dtype=float32)
max_pool_1:  Tensor("max_pool_1/Squeeze:0", shape=(?, 46, 64), dtype=float32)
conv_2:  Tensor("conv_2/Relu:0", shape=(?, 44, 128), dtype=float32)
batch_norm_2:  Tensor("batch_norm_2/batchnorm/add_1:0", shape=(?, 44, 128), dtype=float32)
max_pool_2:  Tensor("max_pool_2/Squeeze:0", shape=(?, 22, 128), dtype=float32)
fw_lstm_output:  Tensor("discriminator_lstm/bidirectional_rnn/fw/fw/while/Exit_3:0", shape=(?, 32), dtype=float32)
bw_lstm_output:  Tensor("discriminator_lstm/bidirectional_rnn/bw/bw/while/Exit_3:0", shape=(?, 32), dtype=float32)
lstm_output:  Tensor("concat:0", shape=(?, 64), dtyp

In [15]:
epoch_reporting_interval = 1
training_examples_fraction = 0.9
training_examples_size = int(training_examples_fraction * len(labels))
batch_size = 100
training_epochs = 10
num_batches = int(training_examples_size/batch_size)

In [16]:
with tf.Session(graph=graph_1) as sess:
    sess.run(tf.global_variables_initializer())
    
    writer = tf.summary.FileWriter(
        logdir="/home/v2john/tensorlogs/" + dt.now().strftime("%Y%m%d-%H%M%S") + "/", 
        graph=graph_1)
    
    mini_epoch = 1
    for current_epoch in range(1, training_epochs + 1):
        for batch_number in range(num_batches):
            _, _, discriminator_loss_summary_out, generator_loss_summary_out, \
            discriminator_loss_out, generator_loss_out = \
            sess.run(
                [discriminator_train_operation, generator_train_operation, 
                 discriminator_loss_summary, generator_loss_summary,
                 discriminator_loss, generator_loss], 
                feed_dict={
                    input_sequence: indexed_sequences[batch_number * batch_size : 
                                               (batch_number + 1) * batch_size],
                    input_label: labels[batch_number * batch_size : 
                                    (batch_number + 1) * batch_size]})
            
            writer.add_summary(discriminator_loss_summary_out, mini_epoch)
            writer.add_summary(generator_loss_summary_out, mini_epoch)
            writer.flush()
            mini_epoch += 1

        if (current_epoch % epoch_reporting_interval == 0):
            print("Training epoch:", current_epoch, ", Discriminator Loss:", discriminator_loss_out, 
                  ", Generator Loss:", generator_loss_out)
    
    writer.flush()
    writer.close()

Training epoch:  1 , Discriminator Loss:  1.60944 , Generator Loss:  8.88687
Training epoch:  2 , Discriminator Loss:  1.60943 , Generator Loss:  8.87353
Training epoch:  3 , Discriminator Loss:  1.60889 , Generator Loss:  8.84042
Training epoch:  4 , Discriminator Loss:  1.60823 , Generator Loss:  8.71903
Training epoch:  5 , Discriminator Loss:  1.60679 , Generator Loss:  8.46658
Training epoch:  6 , Discriminator Loss:  1.60366 , Generator Loss:  8.22404
Training epoch:  7 , Discriminator Loss:  1.59329 , Generator Loss:  7.97825
Training epoch:  8 , Discriminator Loss:  1.57214 , Generator Loss:  7.71319
Training epoch:  9 , Discriminator Loss:  1.57755 , Generator Loss:  7.43953
Training epoch:  10 , Discriminator Loss:  1.55817 , Generator Loss:  7.17598


In [17]:
with tf.Session(graph=graph_1) as sess:
    sess.run(tf.global_variables_initializer())
    training_label_predictions, training_sequence_predictions = sess.run(
        [softmax_output, mapped_lstm_output], 
        feed_dict={
            input_sequence: indexed_sequences[:training_examples_size], 
            input_label: labels[:training_examples_size]
        })
    
    test_label_predictions, test_sequence_predictions = sess.run(
        [softmax_output, mapped_lstm_output], 
        feed_dict={
            input_sequence: indexed_sequences[training_examples_size:], 
            input_label: labels[training_examples_size:]
        })

In [18]:
test_sequence_predictions.shape

(50, 94, 7276)

In [19]:
with tf.Session() as sess:
    sequence_hardmax = tf.contrib.seq2seq.hardmax(
        tf.nn.softmax(test_sequence_predictions), name="hardmax_word")
    
    sequence_indices = tf.map_fn(
        lambda x: tf.where(tf.equal(x, 1)),
        sequence_hardmax,
        name='mapped_lstm',
        dtype=tf.int64
    )

    predicted_logit_sequences = sess.run(sequence_indices)
    
    for predicted_logit_sequence in predicted_logit_sequences:
        word_sequence = list(map(lambda x: vocab[int(x[1])], predicted_logit_sequence))
        print(" ".join(word_sequence))

low.setting halls bunch refund.wish desirable it.then sick low.setting installed halls them.when photographer inaccurate.-the refund.wish desirable files session 53 highways it.then low.setting strips refund.wish john inaccurate.-the refund.wish kitchen audio-books estate glorified recipes head mystery low.setting selections sorry could resembles tab head budget imagination- impressive.battery low.setting hannspree incrementing it.then played/watched aac +.same inaccurate.-the refund.wish desirable have drill could walked radio pulse resistance ways impressive.battery low.setting iphone/ipad lightly processor photographer sleek refund.wish desirable installed myriad refund.wish router lightly naturally reduced -barnes obscure. registers budget textbook it.then files session 53 highways unreliable contrast contrast contrast contrast contrast contrast
low.setting halls permits protect viewsonic drove me.a beautiful-great deal competitions.easy end rotational electronically hols hte files