# Authorship Style Transfer

In [1]:
import numpy as np
import tensorflow as tf
import sklearn.preprocessing as skp

---

## Data Preprocessing

In [2]:
text_file_path = "data/c50-articles.txt"
label_file_path = "data/c50-labels.txt"

### Conversion of texts into integer sequences

In [7]:
MAX_SEQUENCE_LENGTH = 100
EMBEDDING_SIZE = 300

In [4]:
text_tokenizer =  tf.keras.preprocessing.text.Tokenizer(num_words=1000)

with open(text_file_path) as text_file:
    text_tokenizer.fit_on_texts(text_file)
    
with open(text_file_path) as text_file:
    integer_text_sequences = text_tokenizer.texts_to_sequences(text_file)

len(integer_text_sequences)

padded_sequences = tf.keras.preprocessing.sequence.pad_sequences(
     integer_text_sequences, maxlen=MAX_SEQUENCE_LENGTH, padding='post', truncating='post')

padded_sequences.shape

(2500, 100)

### Conversion of labels to one-hot represenations

In [5]:
label_tokenizer =  tf.keras.preprocessing.text.Tokenizer(lower=False)

with open(label_file_path) as label_file:
    label_tokenizer.fit_on_texts(label_file)

with open(label_file_path) as label_file:
    label_sequences = label_tokenizer.texts_to_sequences(label_file)

one_hot_labels = list(map(lambda x: np.eye(len(label_tokenizer.word_index), k=x[0])[0], label_sequences))lll

---

## Deep Learning Model

In [42]:
with tf.device("/gpu:0"):
    
    # needed to clear the existing graph when the cell is re-run
    tf.reset_default_graph()
    
    def get_sentence_representation(index_sequence, word_embeddings):

        # dense embedded sequence
        embedded_sequence = tf.nn.embedding_lookup(
            word_embeddings, input_sequence, name="embedded_sequence")

        lstm_cell_fw = tf.contrib.rnn.BasicLSTMCell(num_units=128, name="lstm_cell_fw_content")
        lstm_cell_bw = tf.contrib.rnn.BasicLSTMCell(num_units=128, name="lstm_cell_bw_content")

        rnn_outputs, rnn_states = tf.nn.bidirectional_dynamic_rnn(
            cell_fw=lstm_cell_fw, cell_bw=lstm_cell_bw, inputs=embedded_sequence, 
            dtype=tf.float32, time_major=False)
        rnn_state = tf.concat([rnn_states[0].c, rnn_states[1].c], axis=1)

        return rnn_state

    def get_content_representation(sentence_representation):
        dense_content = tf.layers.dense(
            inputs=sentence_representation, units=128, 
            activation=tf.nn.relu, name="dense_content")
        
        return dense_content

    def get_style_representation(sentence_representation):
        dense_style = tf.layers.dense(
            inputs=sentence_representation, units=128, 
            activation=tf.nn.relu, name="dense_style")
        
        return dense_style

    def get_label_prediction(content_representation):

        dense_1 = tf.layers.dense(
            inputs=content_representation, units=len(label_tokenizer.word_index), 
            activation=tf.nn.relu, name="dense_1")

        softmax_output = tf.nn.softmax(dense_1, name="softmax")

        return softmax_output


    # input variable - text sequence converted to an index sequence
    input_sequence = tf.placeholder(
        tf.int32, [None, MAX_SEQUENCE_LENGTH], name="input_sequence")
    print("input_sequence: ", input_sequence)

    input_label = tf.placeholder(
        tf.float32, [None, len(label_tokenizer.word_index)], name="input_label")
    print("input_label: ", input_label)

    # learn embeddings matrix - can be initialized with pre-trained embeddings
    word_embeddings = tf.get_variable(
        shape=[len(label_tokenizer.word_index) + 1, EMBEDDING_SIZE], name="word_embeddings", 
        dtype=tf.float32)
    print("word_embeddings: ", word_embeddings)
    
    # get sentence representation
    sentence_representation = get_sentence_representation(input_sequence, word_embeddings)
    print("sentence_representation:", sentence_representation)

    # get content representation
    content_representation = get_content_representation(sentence_representation)
    print("content_representation:", content_representation)

    # get style representation
    style_representation = get_style_representation(sentence_representation)
    print("style_representation:", style_representation)

    # use content representation to predict a label
    label_prediction = get_label_prediction(content_representation)
    print("label_prediction:", label_prediction)
    
    loss = tf.losses.softmax_cross_entropy(onehot_labels=input_label, logits=label_prediction)
    
    adversarial_optimizer = tf.train.AdamOptimizer()
    adversarial_loss = adversarial_optimizer.minimize(loss)

input_sequence:  Tensor("input_sequence:0", shape=(?, 100), dtype=int32)
input_label:  Tensor("input_label:0", shape=(?, 50), dtype=float32)
word_embeddings:  <tf.Variable 'word_embeddings:0' shape=(51, 300) dtype=float32_ref>
sentence_representation: Tensor("concat:0", shape=(?, 256), dtype=float32)
content_representation: Tensor("dense_content/Relu:0", shape=(?, 128), dtype=float32)
style_representation: Tensor("dense_style/Relu:0", shape=(?, 128), dtype=float32)
label_prediction: Tensor("softmax:0", shape=(?, 50), dtype=float32)
Tensor("softmax_cross_entropy_loss/value:0", shape=(), dtype=float32)
