In [None]:
import json
import os

import tensorflow as tf

In [None]:
from nltk.tokenize import word_tokenize

In [None]:
def convert_json_to_review_and_rating(json_text):
    """Extract the review text and its rating from one JSON-encoded record.

    Args:
        json_text: A string holding a single JSON object that has the keys
            ``reviewText`` and ``overall``.

    Returns:
        A ``(review_text, rating)`` tuple with the values stored under
        ``reviewText`` and ``overall``, exactly as decoded by ``json.loads``.

    Raises:
        json.JSONDecodeError: if ``json_text`` is not valid JSON.
        KeyError: if either expected key is absent.
    """
    record = json.loads(json_text)
    text = record['reviewText']
    score = record['overall']
    return text, score

In [None]:
def get_reviews_and_ratings(reviews_filepath):
    """Read a JSON-lines review file into parallel lists of texts and ratings.

    Every non-blank line is parsed with ``convert_json_to_review_and_rating``.

    Args:
        reviews_filepath: Path to a text file holding one JSON object per line.

    Returns:
        A ``(review_texts, ratings)`` tuple of two equally long lists. Each
        rating is converted with ``int()`` before being stored.

    Raises:
        OSError: if the file cannot be opened.
        Any exception raised by the per-line parser for a malformed record.
    """
    review_texts = []
    ratings = []
    # Decode explicitly as UTF-8 instead of relying on the locale default,
    # which differs between machines.
    with open(reviews_filepath, encoding="utf-8") as reviews_file:
        for line in reviews_file:
            # A blank line (e.g. a trailing newline at the end of the file)
            # carries no record and would otherwise make the JSON parser fail.
            if not line.strip():
                continue
            review_text, rating = convert_json_to_review_and_rating(line)
            review_texts.append(review_text)
            ratings.append(int(rating))

    return review_texts, ratings

In [None]:
def texts_to_indexed_word_sequences(review_texts):
    """Tokenize each text and encode its tokens as integer ids.

    Ids are handed out in order of first appearance, starting at 1, so id 0 is
    never assigned (presumably left free for padding -- confirm with the code
    that pads the sequences).

    Args:
        review_texts: Iterable of strings; each is split with ``word_tokenize``.

    Returns:
        A ``(word_indices, indexed_sequences)`` tuple: the token -> id dict and
        a list holding one list of ids per input text, in input order.
    """
    vocabulary = {}
    encoded_texts = []

    for text in review_texts:
        # setdefault returns the stored id for a known token; for a new token
        # it stores and returns the next free id, i.e. current size + 1.
        encoded_texts.append([
            vocabulary.setdefault(token, len(vocabulary) + 1)
            for token in word_tokenize(text)
        ])

    return vocabulary, encoded_texts

In [None]:
# Location of the review dump (one JSON object per line). The path can be
# overridden through the REVIEWS_FILEPATH environment variable so the notebook
# is not tied to a single machine's home directory; the original location
# remains the default.
reviews_filepath = os.environ.get(
    "REVIEWS_FILEPATH",
    "/home/v2john/datasets/amazon/reviews_electronics.json")

In [None]:
# Load every review text with its integer rating, then print both list
# lengths as a quick sanity check.
review_texts, ratings = get_reviews_and_ratings(reviews_filepath)
print(*map(len, (review_texts, ratings)))

In [None]:
# Tokenize all reviews, keeping both the token -> id table and the per-review
# id sequences.
word_indices, indexed_sequences = texts_to_indexed_word_sequences(
    review_texts
)

In [None]:
# Model dimensions derived from the loaded data.
# Token ids start at 1 and run up to len(word_indices); the embedding table is
# indexed from 0, so it needs one extra row for the largest id to be valid.
VOCAB_SIZE = len(word_indices) + 1
EMBEDDING_SIZE = 300
# Length of the longest encoded review.
MAX_SEQUENCE_LENGTH = max(len(sequence) for sequence in indexed_sequences)
# One class per distinct rating value present in the data.
NUM_CLASSES = len(set(ratings))

In [None]:
# Build the static graph: embedding lookup -> two (conv1d + max-pool) stages
# -> LSTM -> dense layer -> softmax. Each stage prints its static shape.
graph_1 = tf.Graph()
with graph_1.as_default():

    # Inputs: integer word ids of width MAX_SEQUENCE_LENGTH per example, and a
    # float label row of width NUM_CLASSES (presumably one-hot -- confirm).
    input_x = tf.placeholder(
        tf.int32, [None, MAX_SEQUENCE_LENGTH], name="input_x")
    input_y = tf.placeholder(
        tf.float32, [None, NUM_CLASSES], name="input_y")

    # NOTE(review): init_state, weights and biases are not referenced by any
    # op in the visible part of this cell; they are kept in case later code
    # uses them.
    init_state = tf.zeros(
        shape=[100], dtype=tf.float32, name='init_state')

    weights = tf.Variable(
        dtype=tf.float32, name='weights',
        initial_value=tf.truncated_normal(
            shape=[128, 1], stddev=0.001, dtype=tf.float32))
    # Named 'biases' so it no longer shares the 'weights' name.
    biases = tf.Variable(
        dtype=tf.float32, name='biases',
        initial_value=tf.truncated_normal(
            shape=[1, 1], stddev=0.001, dtype=tf.float32))

    # Embedding table; every id fed through input_x must lie in
    # [0, VOCAB_SIZE) to be a valid row index.
    word_embeddings = tf.get_variable(
        shape=[VOCAB_SIZE, EMBEDDING_SIZE], name="word_embeddings", dtype=tf.float32)
    print("word_embeddings.shape: ", word_embeddings.shape)

    embedded_sequence = tf.nn.embedding_lookup(
        word_embeddings, input_x, name="embedded_sequence")
    print("embedded_sequence.shape: ", embedded_sequence.shape)

    # First convolution + pooling stage.
    conv_1 = tf.layers.conv1d(
        inputs=embedded_sequence, filters=300, kernel_size=3, name="conv_1")
    print("conv_1.shape: ", conv_1.shape)
    max_pool_1 = tf.layers.max_pooling1d(
        inputs=conv_1, pool_size=2, strides=2, name="max_pool_1")
    print("max_pool_1.shape: ", max_pool_1.shape)

    # Second convolution + pooling stage.
    conv_2 = tf.layers.conv1d(
        inputs=max_pool_1, filters=150, kernel_size=3, name="conv_2")
    print("conv_2.shape: ", conv_2.shape)
    max_pool_2 = tf.layers.max_pooling1d(
        inputs=conv_2, pool_size=2, strides=2, name="max_pool_2")
    print("max_pool_2.shape: ", max_pool_2.shape)

    # Run a 128-unit LSTM over the pooled features; only the final hidden
    # state (current_state.h) is passed on to the classifier.
    lstm_cell_1 = tf.contrib.rnn.BasicLSTMCell(128)
    states_series, current_state = tf.nn.dynamic_rnn(
        cell=lstm_cell_1, inputs=max_pool_2, dtype=tf.float32)
    print("current_state.h.shape: ", current_state.h.shape)

    # Project the final hidden state to one logit per class.
    dense_1 = tf.layers.dense(
        inputs=current_state.h, units=NUM_CLASSES, name="dense_1")
    print("dense_1.shape: ", dense_1.shape)

    softmax_output = tf.nn.softmax(
        dense_1, name="softmax")
    # The original printed the undefined name `final_predictions`, which
    # raised NameError; report the softmax tensor that was actually built.
    print("softmax_output.shape: ", softmax_output.shape)