In [None]:
import json
import os
import time

import numpy as np
import tensorflow as tf

In [None]:
from statistics import median
from nltk.tokenize import word_tokenize
from sklearn.metrics import f1_score, confusion_matrix
from sklearn.utils import shuffle

In [None]:
def convert_json_to_review_and_rating(json_text):
    """Parse one JSON-encoded review record.

    Args:
        json_text: a string holding a single JSON object that contains the
            keys 'reviewText' and 'overall'.

    Returns:
        A (review_text, rating) tuple with the values stored under those two
        keys, exactly as decoded by ``json.loads``.

    Raises:
        json.JSONDecodeError: if ``json_text`` is not valid JSON.
        KeyError: if either key is absent from the record.
    """
    record = json.loads(json_text)
    review_text = record['reviewText']
    rating = record['overall']
    return review_text, rating

In [None]:
def get_reviews_and_ratings(reviews_filepath):
    """Load review texts and integer ratings from a JSON-lines file.

    Each non-blank line of the file is parsed as one JSON review record via
    ``convert_json_to_review_and_rating``.

    Args:
        reviews_filepath: path of the file to read, one JSON object per line.

    Returns:
        A (review_texts, ratings) tuple of two parallel lists. Every rating is
        converted with ``int()``, so a value such as 4.0 becomes 4.
    """
    review_texts = []
    ratings = []
    # JSON text is UTF-8; without an explicit encoding open() falls back to the
    # locale's default, which can mis-decode or fail on non-ASCII reviews.
    with open(reviews_filepath, encoding="utf-8") as reviews_file:
        for line in reviews_file:
            # A blank line (e.g. trailing newline at end of file) is not a
            # record; skip it instead of letting json.loads raise.
            if not line.strip():
                continue
            review_text, rating = convert_json_to_review_and_rating(line)
            review_texts.append(review_text)
            ratings.append(int(rating))

    return review_texts, ratings

In [None]:
def texts_to_indexed_word_sequences(review_texts, tokenize=None):
    """Build a vocabulary over the reviews and encode each review as indices.

    Tokens are numbered from 1 in order of first appearance, so index 0 is
    never assigned to a token (the notebook's padding step fills with 0).

    Args:
        review_texts: iterable of review strings.
        tokenize: optional callable mapping a string to a sequence of tokens.
            Defaults to the ``word_tokenize`` function imported by the
            notebook.

    Returns:
        A (word_indices, indexed_sequences) tuple: ``word_indices`` maps each
        token to its integer index, and ``indexed_sequences`` is a list with
        one 1-D integer numpy array per review.
    """
    if tokenize is None:
        tokenize = word_tokenize

    word_indices = dict()
    indexed_sequences = list()

    for review_text in review_texts:
        # setdefault assigns the next free index (current size + 1) the first
        # time a token is seen and returns the existing index afterwards.
        indexed_sequence = [
            word_indices.setdefault(token, len(word_indices) + 1)
            for token in tokenize(review_text)
        ]
        # Force an integer dtype: np.asarray([]) defaults to float64, so a
        # review with no tokens would otherwise yield a float array.
        indexed_sequences.append(np.asarray(indexed_sequence, dtype=int))

    return word_indices, indexed_sequences

In [None]:
# Location of the JSON-lines reviews file. The default is the original
# machine-specific absolute path; set the REVIEWS_FILEPATH environment
# variable to run the notebook against a copy of the data stored elsewhere.
reviews_filepath = (
    os.environ.get("REVIEWS_FILEPATH")
    or "/home/v2john/datasets/amazon/reviews_electronics.json"
)

In [None]:
# Fixed seed for the shuffle. Token indices are assigned in the order tokens
# are first encountered, so an unseeded shuffle would give a different
# vocabulary numbering (and different downstream results) on every run.
SHUFFLE_SEED = 42

review_texts, ratings = get_reviews_and_ratings(reviews_filepath)
# shuffle() permutes both lists consistently, keeping texts and ratings aligned.
review_texts, ratings = shuffle(review_texts, ratings, random_state=SHUFFLE_SEED)
print(len(review_texts), len(ratings))

In [None]:
# Build the token -> index vocabulary and encode every review as an array of
# token indices (indices start at 1, in order of first appearance).
word_indices, indexed_sequences = texts_to_indexed_word_sequences(review_texts)

In [None]:
# Number of distinct tokens seen. Token indices run from 1 to VOCAB_SIZE and 0
# is used as the padding value, so a table indexed directly by token index
# needs VOCAB_SIZE + 1 rows -- NOTE(review): confirm how this is used downstream.
VOCAB_SIZE = len(word_indices)
print("VOCAB_SIZE: ", VOCAB_SIZE)

# presumably the dimensionality of the word embeddings; not used in the cells
# visible here -- TODO confirm.
EMBEDDING_SIZE = 300
print("EMBEDDING_SIZE: ", EMBEDDING_SIZE)

# Despite the name, this is the median review length in tokens (truncated to
# an int). The padding step cuts longer reviews down to this length and
# zero-pads shorter ones.
MAX_SEQUENCE_LENGTH = int(median([len(sequence) for sequence in indexed_sequences]))
print("MAX_SEQUENCE_LENGTH: ", MAX_SEQUENCE_LENGTH)

# Number of distinct rating values present in the data.
# NOTE(review): the one-hot encoding indexes by `rating - 1`, which assumes the
# ratings are exactly 1..NUM_CLASSES with no value missing -- verify for new data.
NUM_CLASSES = len(set(ratings))
print("NUM_CLASSES: ", NUM_CLASSES)

In [None]:
def pad_indexed_sequences(indexed_sequences, max_sequence_length):
    """Truncate or zero-pad every sequence to exactly ``max_sequence_length``.

    Sequences longer than ``max_sequence_length`` keep only their leading
    elements; shorter ones are filled on the right with 0.

    Args:
        indexed_sequences: iterable of 1-D sequences of token indices.
        max_sequence_length: number of columns in the result.

    Returns:
        An integer numpy array of shape
        (number of sequences, max_sequence_length). The shape is 2-D even when
        there are no sequences.
    """
    sequences = list(indexed_sequences)
    # Filling a preallocated integer matrix keeps the result integer-typed even
    # if one input is a float array (np.asarray([]) of an empty review is
    # float64 and would otherwise promote the whole stacked result to float).
    padded = np.zeros((len(sequences), max_sequence_length), dtype=int)
    for row_index, sequence in enumerate(sequences):
        kept = sequence[:max_sequence_length]
        padded[row_index, :len(kept)] = kept
    return padded

def convert_labels_to_logits(ratings, num_classes):
    """One-hot encode 1-based integer ratings.

    Despite the function name, the result holds one-hot label vectors: row i
    has a 1.0 in column ``ratings[i] - 1`` and 0.0 elsewhere.

    Args:
        ratings: sequence of integer ratings in the range 1..num_classes.
        num_classes: number of columns in the result.

    Returns:
        A float numpy array of shape (len(ratings), num_classes). The shape is
        2-D even when ``ratings`` is empty.

    Raises:
        IndexError: if any rating lies outside 1..num_classes.
    """
    labels = np.asarray(ratings)
    if labels.size == 0:
        return np.zeros((0, num_classes))

    class_indices = labels - 1
    # Reject out-of-range ratings explicitly: a rating of 0 or below would
    # otherwise wrap around through negative indexing and silently mark the
    # wrong class.
    invalid = (class_indices < 0) | (class_indices >= num_classes)
    if invalid.any():
        bad_values = sorted(set(labels[invalid].tolist()))
        raise IndexError(
            "ratings outside 1..{}: {}".format(num_classes, bad_values))

    one_hot_ratings = np.zeros((class_indices.shape[0], num_classes))
    one_hot_ratings[np.arange(class_indices.shape[0]), class_indices] = 1
    return one_hot_ratings

def tensorize_sequences_and_labels(indexed_sequences, ratings, max_sequence_length, num_classes):
    """Convert encoded reviews and their ratings into model-ready arrays.

    Args:
        indexed_sequences: per-review sequences of token indices.
        ratings: per-review integer ratings.
        max_sequence_length: length every sequence is padded/truncated to.
        num_classes: width of each one-hot label vector.

    Returns:
        A (padded_sequences, one_hot_labels) tuple: the outputs of
        ``pad_indexed_sequences`` and ``convert_labels_to_logits``.
    """
    padded_sequences = pad_indexed_sequences(indexed_sequences, max_sequence_length)
    one_hot_labels = convert_labels_to_logits(ratings, num_classes)
    return padded_sequences, one_hot_labels

In [None]:
# Turn the list of variable-length index arrays into one padded array and
# build the matching one-hot label array.
# NOTE(review): this rebinds `indexed_sequences`, so re-running an earlier cell
# that reads it (e.g. the median-length computation) after this one will see
# the padded data rather than the original sequences.
indexed_sequences, labels = tensorize_sequences_and_labels(
    indexed_sequences, ratings, MAX_SEQUENCE_LENGTH, NUM_CLASSES)

In [None]:
# Sanity check: display the shapes of the padded sequences and the labels.
indexed_sequences.shape, labels.shape