# Neural Sentiment Transfer

This motivation of this project is to test whether sentiment can be changed as a tunable parameter to auto-generate review text with the same content but different sentiment

In [1]:
import json
import time
import spacy
import numpy as np
import tensorflow as tf

In [2]:
from statistics import median
from sklearn.utils import shuffle
from sklearn.metrics import f1_score, confusion_matrix

In [3]:
spacy_nlp = spacy.load('en')

In [4]:
def convert_json_to_review_and_rating(json_text):
    review_dict = json.loads(json_text)    
    return review_dict['reviewText'], review_dict['overall']

In [5]:
def get_reviews_and_ratings(reviews_filepath):
    review_texts = list()
    ratings = list()
    with open(reviews_filepath) as reviews_file:
        for line in reviews_file:
            review_text, rating = convert_json_to_review_and_rating(line)
            review_texts.append(review_text)
            ratings.append(int(rating))
            
    return review_texts, ratings

In [6]:
def clean_word(word):
    return word

def clean_sentence(sentence):
    sentence = sentence.lower()
    return sentence

In [7]:
def texts_to_indexed_word_sequences(review_texts):
    vocab = list()
    word_indices = dict()
    indexed_sequences = list()
    word_index = 1
    
    for review_text in review_texts:
        review_text = clean_sentence(review_text)
        tokens = spacy_nlp.tokenizer(review_text)
        indexed_sequence = list()
        for token in tokens:
            token = clean_word(token)
            if token not in word_indices:
                vocab.append(token)
                word_indices[token] = word_index
                indexed_sequence.append(word_index)
                word_index += 1
            else:
                indexed_sequence.append(word_indices[token])
        indexed_sequences.append(np.asarray(indexed_sequence))
        
    return vocab, word_indices, indexed_sequences

In [8]:
reviews_filepath = "data/reviews_electronics_tiny.json"

In [9]:
review_texts, ratings = get_reviews_and_ratings(reviews_filepath)
review_texts, ratings = shuffle(review_texts, ratings)
print(len(review_texts), len(ratings))

500 500


In [10]:
vocab, word_indices, indexed_sequences = texts_to_indexed_word_sequences(review_texts)

In [11]:
VOCAB_SIZE = len(vocab)
print("VOCAB_SIZE: ", VOCAB_SIZE)

EMBEDDING_SIZE = 300
print("EMBEDDING_SIZE: ", EMBEDDING_SIZE)

MAX_SEQUENCE_LENGTH = int(median([len(sequence) for sequence in indexed_sequences]))
print("MAX_SEQUENCE_LENGTH: ", MAX_SEQUENCE_LENGTH)

NUM_CLASSES = len(set(ratings))
print("NUM_CLASSES: ", NUM_CLASSES)

VOCAB_SIZE:  101001
EMBEDDING_SIZE:  300
MAX_SEQUENCE_LENGTH:  94
NUM_CLASSES:  5


In [12]:
def pad_indexed_sequences(indexed_sequences, max_sequence_length):
    new_indexed_sequences = list()
    for sequence in indexed_sequences:
        if len(sequence) >= max_sequence_length:
            new_indexed_sequences.append(sequence[:max_sequence_length])
        else:
            shortfall = max_sequence_length - len(sequence)
            new_indexed_sequences.append(
                np.pad(sequence, (0, shortfall), 'constant', 
                       constant_values=(0, 0)))
    return np.asarray(new_indexed_sequences)

# def convert_labels_to_logits(ratings, num_classes):
#     one_hot_ratings = list()
#     for rating in ratings:
#         one_hot_rating = np.zeros(num_classes)
#         one_hot_rating[rating - 1] = 1
#         one_hot_ratings.append(one_hot_rating)
        
#     return np.asarray(one_hot_ratings)

def tensorize_sequences_and_labels(indexed_sequences, ratings, max_sequence_length, num_classes):
    return pad_indexed_sequences(indexed_sequences, max_sequence_length), np.asarray([ratings]).reshape(-1, 1)

In [13]:
indexed_sequences, labels = tensorize_sequences_and_labels(
    indexed_sequences, ratings, MAX_SEQUENCE_LENGTH, NUM_CLASSES)

In [14]:
indexed_sequences.shape, labels.shape

((500, 94), (500, 1))

## Train Generative Network

In [47]:
graph_1 = tf.Graph()
current_epoch = int(time.time())

with graph_1.as_default():

    # One-hot encoded representation of the text
    input_sequence = tf.placeholder(
        tf.int32, [None, MAX_SEQUENCE_LENGTH], name="input_sequence")
    print("input_sequence: ", input_sequence)
    
    # Actual rating
    input_rating = tf.placeholder(
        tf.float32, [None, 1], name="input_rating")
    print("input_rating: ", input_rating)

    # Learned embeddings matrix - can be initialized with pre-trained embeddings
    word_embeddings = tf.get_variable(
        shape=[VOCAB_SIZE + 1, EMBEDDING_SIZE], name="word_embeddings", 
        dtype=tf.float32)
    print("word_embeddings: ", word_embeddings)
    
    # Dense embedded sequence
    embedded_sequence = tf.nn.embedding_lookup(
        word_embeddings, input_sequence, name="embedded_sequence")
    print("embedded_sequence: ", embedded_sequence)
    
    # Convert sequence into fixed size representation for each body of text
    # using a bidirectional LSTM
    with tf.variable_scope('encoder_lstm'):
        vanilla_lstm_cell_fw = tf.contrib.rnn.BasicLSTMCell(
            num_units=64)
        vanilla_lstm_cell_bw = tf.contrib.rnn.BasicLSTMCell(
            num_units=64)
    
        vanilla_rnn_outputs, rnn_output_states = tf.nn.bidirectional_dynamic_rnn(
            cell_fw=vanilla_lstm_cell_fw, cell_bw=vanilla_lstm_cell_bw, 
            inputs=embedded_sequence, dtype=tf.float32, time_major=False)
        print("rnn_output_states_fw: ", rnn_output_states[0].c)
        print("rnn_output_states_bw: ", rnn_output_states[1].c)
        
    final_lstm_output = tf.concat([rnn_output_states[0].c, rnn_output_states[1].c], axis=1)
    print("final_lstm_output: ", final_lstm_output)
    
    # Concatenate the rating and the representation
    conditioned_lstm_output = tf.concat([input_rating, final_lstm_output], 1)
    print("conditioned_lstm_output: ", conditioned_lstm_output)
    
    bootstrapping_word = tf.zeros_like(conditioned_lstm_output)
#     print("bootstrapping_word: ", bootstrapping_word)

    sequence_indices = list()
    for i in range(MAX_SEQUENCE_LENGTH):
        joint_embedding = tf.concat([conditioned_lstm_output, bootstrapping_word], 1)
        
        bootstrapping_word = tf.layers.dense(
            inputs=joint_embedding, units=bootstrapping_word.shape[1], 
            activation=tf.nn.relu)
#         print("bootstrapping_word: ", bootstrapping_word)
        
        softmax_prediction = tf.layers.dense(
            inputs=bootstrapping_word, units=VOCAB_SIZE, 
            activation=tf.nn.sigmoid)
#         print("softmax_prediction: ", softmax_prediction)
        
        word_index_prediction = tf.argmax(softmax_prediction, axis=1)
        sequence_indices.append(word_index_prediction)
        
    sequence_indices = tf.concat(sequence_indices, axis=0)
    print("sequence_indices: ", sequence_indices)


#     def perform_vocab_softmax(word_tensor):
#         dense_word_1 = tf.layers.dense(
#             inputs=word_tensor, units=VOCAB_SIZE, 
#             activation=tf.nn.relu, name="dense_word_1")
#         return dense_word_1
        
#     mapped_lstm_output = tf.map_fn(
#         perform_vocab_softmax,
#         vanilla_rnn_outputs[0],
#         name='mapped_lstm'
#     )
#     print("mapped_lstm_output: ", mapped_lstm_output)
    
    generator_loss = tf.contrib.seq2seq.sequence_loss(
        logits=input_sequence,
        targets=input_sequence, 
        weights=tf.ones_like(
            input_sequence, dtype=tf.float32, name=None, optimize=True
        ),
        name='generator_loss'
    )
    print("generator_loss: ", generator_loss)
    
#     generator_loss_summary = tf.summary.scalar(
#         "generated-sequence-loss-" + str(current_epoch), tf.convert_to_tensor(generator_loss))
    
#     generator_optimizer = tf.train.AdamOptimizer()
#     generator_train_operation = generator_optimizer.minimize(generator_loss)

input_sequence:  Tensor("input_sequence:0", shape=(?, 94), dtype=int32)
input_rating:  Tensor("input_rating:0", shape=(?, 1), dtype=float32)
word_embeddings:  <tf.Variable 'word_embeddings:0' shape=(101002, 300) dtype=float32_ref>
embedded_sequence:  Tensor("embedded_sequence:0", shape=(?, 94, 300), dtype=float32)
rnn_output_states_fw:  Tensor("encoder_lstm/bidirectional_rnn/fw/fw/while/Exit_2:0", shape=(?, 64), dtype=float32)
rnn_output_states_bw:  Tensor("encoder_lstm/bidirectional_rnn/bw/bw/while/Exit_2:0", shape=(?, 64), dtype=float32)
final_lstm_output:  Tensor("concat:0", shape=(?, 128), dtype=float32)
conditioned_lstm_output:  Tensor("concat_1:0", shape=(?, 129), dtype=float32)
sequence_indices:  Tensor("concat_96:0", shape=(?,), dtype=int64)


In [17]:
with tf.Session(graph=graph_1) as sess:
    sess.run(tf.global_variables_initializer())
    writer = tf.summary.FileWriter(
        logdir="/home/v2john/tensorlogs/", graph=graph_1)

    epoch_reporting_interval = 10
    training_examples_fraction = 0.9
    training_examples_size = int(training_examples_fraction * len(labels))
    batch_size = 50
    training_epochs = 100
    num_batches = int(training_examples_size/batch_size)

    mini_epoch = 1
    loss_var = None
    loss_summary_var = None
    for current_epoch in range(1, training_epochs + 1):
        for batch_number in range(num_batches):
            _, loss_var, loss_summary_var = sess.run(
                [train_op, loss, loss_summary], 
                feed_dict={
                    input_x: indexed_sequences[batch_number * batch_size : 
                                               (batch_number + 1) * batch_size],
                    input_y: labels[batch_number * batch_size : 
                                    (batch_number + 1) * batch_size]})
            writer.add_summary(loss_summary_var, mini_epoch)
            writer.flush()
            mini_epoch += 1

        if (current_epoch % epoch_reporting_interval == 0):
            print("Training epoch:", current_epoch, ", Loss:", loss_var)

    training_predictions = sess.run(
        prediction, 
        feed_dict={
            input_x: indexed_sequences[:training_examples_size], 
            input_y: labels[:training_examples_size]
        })

    test_predictions = sess.run(
        prediction, 
        feed_dict={
            input_x: indexed_sequences[training_examples_size:], 
            input_y: labels[training_examples_size:]
        })

    writer.flush()
    writer.close()