# Authorship Style Transfer

In [None]:
import numpy as np
import tensorflow as tf

from datetime import datetime as dt

---

## Data Preprocessing

In [None]:
# Input files, read line by line by the tokenizers below:
# one file with the author labels and one with the article texts.
label_file_path = "data/c50-labels.txt"
text_file_path = "data/c50-articles.txt"

### Conversion of texts into integer sequences

In [None]:
# Width of each row of the learned word-embedding matrix.
EMBEDDING_SIZE = 300
# Every text is padded or truncated to exactly this many token ids.
MAX_SEQUENCE_LENGTH = 100
# Passed to the tokenizer as ``num_words``; also the width of the decoder output.
VOCAB_SIZE = 1000

In [None]:
# The filter set strips punctuation but leaves '<' and '>' alone, so marker
# tokens such as '<sos>' / '<eos>' survive tokenisation as single words.
text_tokenizer = tf.keras.preprocessing.text.Tokenizer(
    filters='!"#$%&()*+,-./:;=?@[\\]^_`{|}~\t\n', num_words=VOCAB_SIZE)

# Read the corpus once (one text per line) and reuse the lines both for
# building the vocabulary and for converting the texts to integer ids.
with open(text_file_path) as text_file:
    article_lines = text_file.readlines()

text_tokenizer.fit_on_texts(article_lines)
integer_text_sequences = text_tokenizer.texts_to_sequences(article_lines)

# Pad / truncate at the end so every row has MAX_SEQUENCE_LENGTH entries.
padded_sequences = tf.keras.preprocessing.sequence.pad_sequences(
    integer_text_sequences,
    truncating='post',
    padding='post',
    maxlen=MAX_SEQUENCE_LENGTH)

print(padded_sequences.shape)

In [None]:
# Integer ids the text tokenizer assigned to the sequence start / end markers.
SOS_INDEX, EOS_INDEX = (
    text_tokenizer.word_index[marker] for marker in ('<sos>', '<eos>'))
print(SOS_INDEX, EOS_INDEX)

### Conversion of labels to one-hot representations

In [None]:
# Labels keep their original casing so that differently-cased names stay distinct.
label_tokenizer = tf.keras.preprocessing.text.Tokenizer(lower=False)

# Read the label file once (one label per line) and reuse the lines for both
# fitting the tokenizer and converting the labels to integer ids.
with open(label_file_path) as label_file:
    label_lines = label_file.readlines()

label_tokenizer.fit_on_texts(label_lines)
label_sequences = label_tokenizer.texts_to_sequences(label_lines)

NUM_LABELS = len(label_tokenizer.word_index)

# Tokenizer ids are 1-based (1..NUM_LABELS), so shift them to 0-based column
# positions. Without the shift, column 0 is never used and the label with id
# NUM_LABELS falls outside the matrix, producing an all-zero "one-hot" row.
# Only the first token of each line is used as the label.
label_ids = np.asarray(
    [sequence[0] for sequence in label_sequences], dtype=np.int64) - 1

# Row i of the identity matrix is the one-hot vector for class i; fancy
# indexing builds the whole (num_examples, NUM_LABELS) array in one step.
one_hot_labels = np.eye(NUM_LABELS)[label_ids]

print(one_hot_labels.shape)

---

## Deep Learning Model

### Setup Instructions

In [None]:
class GenerativeAdversarialNetwork():
    """TensorFlow 1.x graph for the authorship style-transfer experiment.

    The graph encodes an integer token sequence with a bidirectional LSTM,
    projects the encoding into a "content" vector and a "style" vector,
    predicts the author label from the content vector, and decodes a sequence
    of vocabulary distributions from the concatenated style/content vectors.

    The class reads these module-level names: ``NUM_LABELS``, ``VOCAB_SIZE``,
    ``EMBEDDING_SIZE``, ``MAX_SEQUENCE_LENGTH``, ``SOS_INDEX``,
    ``padded_sequences`` and ``one_hot_labels``.

    Attributes created by ``build_model``:
        input_sequence: int32 placeholder, shape [None, MAX_SEQUENCE_LENGTH].
        input_label: float32 placeholder, shape [None, NUM_LABELS].
        label_logits: pre-softmax output of the label classifier.
        label_prediction: softmax over ``label_logits``.
        decoder_logits: pre-softmax decoder outputs.
        generated_logits: softmax over ``decoder_logits``. Despite the name
            this holds probabilities; the name is kept for compatibility.
        adversarial_loss / reconstruction_loss: scalar loss tensors.
    """

    def __init__(self):
        # Widths of the dense style / content projections.
        self.style_embedding_size = 128
        self.content_embedding_size = 128
        self.build_model()

    def get_sentence_representation(self, index_sequence, word_embeddings):
        """Encode a batch of token-id sequences with a bidirectional LSTM.

        Args:
            index_sequence: integer tensor of token ids used to index
                ``word_embeddings``.
            word_embeddings: embedding matrix variable.

        Returns:
            The final hidden states of the forward and backward cells
            concatenated along axis 1 (2 * 128 units per example).
        """
        embedded_sequence = tf.nn.embedding_lookup(
            word_embeddings, index_sequence, name="embedded_sequence")

        lstm_cell_fw = tf.contrib.rnn.BasicLSTMCell(
            num_units=128, name="lstm_cell_fw_content")
        lstm_cell_bw = tf.contrib.rnn.BasicLSTMCell(
            num_units=128, name="lstm_cell_bw_content")

        # Only the final states are used; the per-step outputs are discarded.
        _, rnn_states = tf.nn.bidirectional_dynamic_rnn(
            cell_fw=lstm_cell_fw, cell_bw=lstm_cell_bw,
            inputs=embedded_sequence,
            dtype=tf.float32, time_major=False)

        return tf.concat(
            values=[rnn_states[0].h, rnn_states[1].h], axis=1,
            name="sentence_representation")

    def get_content_representation(self, sentence_representation):
        """Project the sentence encoding to the content vector (ReLU dense)."""
        return tf.layers.dense(
            inputs=sentence_representation, units=self.content_embedding_size,
            activation=tf.nn.relu, name="content_representation")

    def get_style_representation(self, sentence_representation):
        """Project the sentence encoding to the style vector (ReLU dense)."""
        return tf.layers.dense(
            inputs=sentence_representation, units=self.style_embedding_size,
            activation=tf.nn.relu, name="style_representation")

    def get_label_prediction(self, content_representation):
        """Predict the label distribution from the content vector.

        Side effect: stores the pre-softmax activations in
        ``self.label_logits`` so the loss can be computed from unnormalised
        scores instead of from the softmax output.

        Returns:
            Softmax probabilities over the ``NUM_LABELS`` classes.
        """
        # NOTE(review): the ReLU keeps these "logits" non-negative; a linear
        # layer (activation=None) is the usual choice - confirm before changing.
        self.label_logits = tf.layers.dense(
            inputs=content_representation, units=NUM_LABELS,
            activation=tf.nn.relu, name="dense_1")

        return tf.nn.softmax(self.label_logits, name="label_prediction")

    def generate_output_sequence(self, word_embeddings, style_representation,
                                 content_representation):
        """Decode a sequence of vocabulary distributions.

        The concatenated style/content vector is fed to the decoder RNN as the
        input at every one of the ``MAX_SEQUENCE_LENGTH`` time steps; the RNN's
        initial state is the one-hot encoding of ``SOS_INDEX``.

        Side effect: stores the raw decoder outputs in ``self.decoder_logits``
        so the reconstruction loss can consume unnormalised scores.

        Args:
            word_embeddings: unused; kept so existing call sites keep working.
            style_representation: style vectors, one row per example.
            content_representation: content vectors, one row per example.

        Returns:
            Softmax over the decoder outputs, shape
            [batch, MAX_SEQUENCE_LENGTH, VOCAB_SIZE].
        """
        indices = tf.ones(shape=[self.batch_size], dtype=tf.int32) * SOS_INDEX
        print("indices: {}".format(indices))

        actual_initial_state = tf.one_hot(
            indices=indices, depth=VOCAB_SIZE)
        print("actual_initial_state: {}".format(actual_initial_state))

        generative_embedding = tf.concat(
            values=[style_representation, content_representation], axis=1)
        print("generative_embedding: {}".format(generative_embedding))

        # Tile along the feature axis, then fold the copies into a time axis:
        # each time step sees the same style+content vector.
        repeated_vector = tf.tile(
            input=generative_embedding,
            multiples=tf.constant([1, MAX_SEQUENCE_LENGTH]),
            name='repeated_vector')
        print("repeated_vector: {}".format(repeated_vector))

        reshaped_sequence = tf.reshape(
            tensor=repeated_vector,
            shape=[self.batch_size, MAX_SEQUENCE_LENGTH,
                   self.style_embedding_size + self.content_embedding_size],
            name='reshaped_sequence'
        )
        print("reshaped_sequence: {}".format(reshaped_sequence))

        decoder_rnn_cell = tf.nn.rnn_cell.BasicRNNCell(VOCAB_SIZE)
        decoder_outputs, _ = tf.nn.dynamic_rnn(
            cell=decoder_rnn_cell, inputs=reshaped_sequence,
            initial_state=actual_initial_state)
        print("decoder_outputs: {}".format(decoder_outputs))

        # Keep the unnormalised scores for the loss (see build_model).
        self.decoder_logits = decoder_outputs

        softmax_prediction = tf.nn.softmax(decoder_outputs)
        print("softmax_prediction: {}".format(softmax_prediction))

        return softmax_prediction

    def build_model(self):
        """Create placeholders, the encoder/decoder graph, losses and summaries."""
        self.input_sequence = tf.placeholder(
            dtype=tf.int32, shape=[None, MAX_SEQUENCE_LENGTH],
            name="input_sequence")
        print("input_sequence: {}".format(self.input_sequence))

        self.input_label = tf.placeholder(
            dtype=tf.float32, shape=[None, NUM_LABELS],
            name="input_label")
        print("input_label: {}".format(self.input_label))

        # Dynamic batch size, resolved when data is fed.
        self.batch_size = tf.shape(self.input_sequence)[0]

        # learn embeddings matrix
        # can be initialized with pre-trained embeddings
        word_embeddings = tf.get_variable(
            shape=[VOCAB_SIZE + 1, EMBEDDING_SIZE], name="word_embeddings",
            dtype=tf.float32)
        print("word_embeddings: {}".format(word_embeddings))

        # get sentence representation
        sentence_representation = self.get_sentence_representation(
            self.input_sequence, word_embeddings)
        print("sentence_representation: {}".format(sentence_representation))

        # get content representation
        content_representation = self.get_content_representation(
            sentence_representation)
        print("content_representation: {}".format(content_representation))

        # use content representation to predict a label
        self.label_prediction = self.get_label_prediction(
            content_representation)
        print("label_prediction: {}".format(self.label_prediction))

        # softmax_cross_entropy applies softmax itself, so it must be given
        # the raw classifier scores; passing label_prediction would apply
        # softmax twice and flatten the gradients.
        self.adversarial_loss = tf.losses.softmax_cross_entropy(
            onehot_labels=self.input_label, logits=self.label_logits)
        print("adversarial_loss: {}".format(self.adversarial_loss))

        self.adversarial_loss_summary = tf.summary.scalar(
            tensor=self.adversarial_loss, name="adversarial_loss")

        # get style representation
        style_representation = self.get_style_representation(
            sentence_representation)
        print("style_representation: {}".format(style_representation))

        # generate new sentence
        self.generated_logits = self.generate_output_sequence(
            word_embeddings, style_representation, content_representation)
        print("generated_logits: {}".format(self.generated_logits))

        # sequence_loss also normalises internally, so it gets the raw decoder
        # outputs rather than the softmax stored in generated_logits.
        # Every position (padding included) is weighted equally.
        self.reconstruction_loss = tf.contrib.seq2seq.sequence_loss(
            logits=self.decoder_logits, targets=self.input_sequence,
            weights=tf.ones(tf.shape(self.input_sequence)))
        print("reconstruction_loss: {}".format(self.reconstruction_loss))

        self.reconstruction_loss_summary = tf.summary.scalar(
            tensor=self.reconstruction_loss, name="reconstruction_loss")

    def train(self, sess, batch_size=100, training_epochs=100,
              training_examples_fraction=0.9, epoch_reporting_interval=1):
        """Train on the leading fraction of the module-level dataset.

        Sets ``self.training_examples_size`` (the train/test split index),
        which ``infer`` relies on.

        Args:
            sess: ``tf.Session`` whose graph contains this model.
            batch_size: examples per step; a trailing partial batch is skipped.
            training_epochs: number of passes over the training examples.
            training_examples_fraction: share of the dataset used for training.
            epoch_reporting_interval: print the last batch's losses every this
                many epochs.

        Raises:
            ValueError: if ``batch_size`` or ``epoch_reporting_interval`` is
                smaller than 1.
        """
        if batch_size < 1 or epoch_reporting_interval < 1:
            raise ValueError(
                "batch_size and epoch_reporting_interval must be >= 1")

        writer = tf.summary.FileWriter(
            logdir="/tmp/tensorflow_logs/" + dt.now().strftime("%Y%m%d-%H%M%S") + "/",
            graph=sess.graph)

        # Close the event file even if graph construction or a step fails.
        try:
            # NOTE(review): both optimizers update *all* trainable variables,
            # so the classifier step and the "- adversarial_loss" term pull the
            # same weights in opposite directions. Restricting each optimizer
            # with var_list is probably intended - confirm.
            adversarial_training_optimizer = tf.train.AdamOptimizer()
            adversarial_training_operation = adversarial_training_optimizer.minimize(
                self.adversarial_loss)

            reconstruction_training_optimizer = tf.train.AdamOptimizer()
            reconstruction_training_operation = reconstruction_training_optimizer.minimize(
                self.reconstruction_loss - self.adversarial_loss)

            # Must run after minimize() so the Adam slot variables exist.
            sess.run(tf.global_variables_initializer())

            self.training_examples_size = int(
                training_examples_fraction * len(one_hot_labels))
            num_batches = self.training_examples_size // batch_size
            print("Training - texts shape: {}; labels shape {}"
                  .format(padded_sequences[:self.training_examples_size].shape,
                          one_hot_labels[:self.training_examples_size].shape))

            # Defined up front so the report below works with zero batches.
            adv_loss = rec_loss = None
            training_step = 1
            for current_epoch in range(1, training_epochs + 1):
                for batch_number in range(num_batches):
                    batch_slice = slice(batch_number * batch_size,
                                        (batch_number + 1) * batch_size)
                    _, adv_loss, adv_loss_sum, _, rec_loss, rec_loss_sum = sess.run(
                        [adversarial_training_operation, self.adversarial_loss,
                         self.adversarial_loss_summary,
                         reconstruction_training_operation, self.reconstruction_loss,
                         self.reconstruction_loss_summary],
                        feed_dict={
                            self.input_sequence: padded_sequences[batch_slice],
                            self.input_label: one_hot_labels[batch_slice]})
                    writer.add_summary(adv_loss_sum, training_step)
                    writer.add_summary(rec_loss_sum, training_step)
                    writer.flush()
                    training_step += 1

                if (current_epoch % epoch_reporting_interval == 0):
                    print("Training epoch: {}; Adversarial Loss: {}; Reconstruction Loss: {}"
                          .format(current_epoch, adv_loss, rec_loss))
        finally:
            writer.close()

    def infer(self, sess):
        """Run the decoder on a training sample and on the held-out examples.

        ``train`` must have been called first because it sets
        ``self.training_examples_size``.

        Args:
            sess: ``tf.Session`` holding the trained variables.

        Returns:
            Tuple ``(generated_training_sequences, generated_test_sequences)``
            of evaluated ``generated_logits`` arrays. The training sample is
            the first ``test_samples_size`` examples, so both arrays cover the
            same number of examples.
        """
        split_index = self.training_examples_size
        test_samples_size = len(one_hot_labels[split_index:])

        generated_training_sequences = sess.run(
            fetches=self.generated_logits,
            feed_dict={
                self.input_sequence: padded_sequences[:test_samples_size]})

        # Slice from the split index rather than with a negative offset:
        # "[-0:]" would select the whole dataset when nothing is held out.
        generated_test_sequences = sess.run(
            fetches=self.generated_logits,
            feed_dict={
                self.input_sequence: padded_sequences[split_index:]})

        return generated_training_sequences, generated_test_sequences

### Train Network

In [None]:
# Start from an empty default graph, then build the model's graph
# (the constructor calls build_model()).
tf.reset_default_graph()
gan = GenerativeAdversarialNetwork()

In [None]:
# The session is kept open on purpose: the inference cell below reuses it.
sess = tf.Session()
gan.train(sess)

In [None]:
# Decoder outputs for a sample of the training data and for the held-out data.
generated_training_sequences, generated_test_sequences = gan.infer(sess)

In [None]:
# Show the shape of each generated batch, one per line (training first).
print("\n".join(
    str(generated_batch.shape)
    for generated_batch in (generated_training_sequences,
                            generated_test_sequences)))

In [None]:
# Reverse lookup: integer token id -> word, built from the tokenizer vocabulary.
index_word_inverse_map = {
    token_id: word
    for word, token_id in text_tokenizer.word_index.items()
}

def generate_word(word_embedding):
    """Return the index of the largest value in ``word_embedding``.

    When several entries share the maximum, the first one wins
    (``np.argmax`` semantics).
    """
    highest_scoring_index = np.argmax(word_embedding)
    return highest_scoring_index

def generate_sentence(floating_index_sequence, index_to_word=None):
    """Turn per-step vocabulary scores into a space-separated sentence.

    Args:
        floating_index_sequence: 2-D array-like with one row of vocabulary
            scores per time step; the arg-max of each row selects the word.
        index_to_word: optional mapping from token id to word. Defaults to
            the module-level ``index_word_inverse_map``.

    Returns:
        The decoded words joined by single spaces. Ids that have no entry in
        the mapping are skipped instead of raising ``KeyError``; this covers
        id 0, which the tokenizer reserves and which is used for padding.
        An empty input yields an empty string.
    """
    if index_to_word is None:
        index_to_word = index_word_inverse_map

    scores = np.asarray(floating_index_sequence)
    if scores.size == 0:
        return ""

    # One vectorised arg-max over the vocabulary axis replaces a Python-level
    # call per time step; tolist() yields plain ints for the dict lookups.
    word_indices = np.argmax(scores, axis=-1).tolist()

    words = [index_to_word[index] for index in word_indices
             if index in index_to_word]

    return " ".join(words)

In [None]:
# Decode every generated held-out sequence back into text.
test_generated_sentences = [
    generate_sentence(generated_sequence)
    for generated_sequence in generated_test_sequences
]
print(test_generated_sentences)