# Neural Affect Style Transfer

In [1]:
from numpy import zeros, concatenate, asarray, ones, amax, argmax, squeeze
from IPython.display import display, HTML

In [2]:
def browser_alert(message):
    display(HTML('<script type="text/javascript">alert("' + message + '");</script>'))
    
def browser_notify(message):
    display(HTML('<script type="text/javascript">var notification=new Notification("' + \
                 'Jupyter Notification",{icon:"http://blog.jupyter.org/content/' + \
                 'images/2015/02/jupyter-sq-text.png",body:"' + message + \
                 '"});</script>'))

In [3]:
browser_notify("test")

## Read Data

In [4]:
dataset_path = "/home/v2john/attr-reviews-dataset/dev.txt"

In [5]:
all_texts = list()
with open(dataset_path) as dataset_file:
    for (line, text) in enumerate(dataset_file):
        text = text.split('\t')[3]
        all_texts.append(text)
        
        if line == 9999:
            break

In [7]:
len(all_texts)

10000

### Tokenize and build embeddings

In [127]:
from keras.preprocessing.text import Tokenizer

In [128]:
keras_tokenizer = Tokenizer(num_words=500)

In [129]:
keras_tokenizer.fit_on_texts(all_texts)

In [130]:
VOCAB_SIZE = len(keras_tokenizer.word_index)

In [131]:
VOCAB_SIZE

9155

In [132]:
text_sequences = keras_tokenizer.texts_to_sequences(all_texts)

In [133]:
text_sequences = asarray(text_sequences)

In [134]:
text_sequences.shape

(10000,)

## Build pre-trained embeddings

In [16]:
from gensim.models.wrappers import fasttext
from gensim.models.keyedvectors import KeyedVectors

In [17]:
embeddings_path = "/home/v2john/pretrained-embeddings/wiki.en.vec"

In [18]:
w2v_model = None
with open(embeddings_path) as embeddings_file:
    w2v_model = KeyedVectors.load_word2vec_format(embeddings_file, binary=False)

In [19]:
browser_notify("Embeddings loaded")

In [20]:
EMBEDDING_DIM = squeeze(w2v_model.word_vec("the").shape)

In [30]:
EMBEDDING_DIM

array(300)

In [22]:
embeddings_matrix = zeros(shape=(len(keras_tokenizer.word_index) + 1, EMBEDDING_DIM))

In [24]:
for word in keras_tokenizer.word_index:
    embeddings_matrix[keras_tokenizer.word_index[word]] = w2v_model.word_vec("the")

In [25]:
embeddings_matrix.shape

(9156, 300)

## Keras Model

In [141]:
from keras import backend as K
from keras.layers import Input, Dense, RepeatVector, LSTM, Conv1D, Masking, Embedding, Dropout
from keras.layers.wrappers import TimeDistributed, Bidirectional
from keras.models import Model
from keras.losses import binary_crossentropy
from keras.preprocessing.sequence import pad_sequences

In [142]:
MAX_SEQUENCE_LENGTH = 20

In [143]:
padded_text_sequences = \
    pad_sequences(text_sequences, maxlen=MAX_SEQUENCE_LENGTH, padding='post', 
                  truncating='post', value=0)

In [144]:
padded_text_sequences.shape

(10000, 20)

In [145]:
word_dict = dict()
for word in keras_tokenizer.word_index:
    rank = keras_tokenizer.word_index[word]
    if rank <= VOCAB_SIZE:
        word_dict[rank] = word

In [146]:
x_train = zeros(shape=(len(padded_text_sequences), MAX_SEQUENCE_LENGTH, VOCAB_SIZE))

In [147]:
for i in range(len(padded_text_sequences)):
    for pos in range(len(padded_text_sequences[i])):
        x_train[i][pos][padded_text_sequences[i][pos]] = 1

In [148]:
x_train.shape

(10000, 20, 9155)

In [149]:
main_input = Input(shape=padded_text_sequences.shape[1:], dtype='float32', name='main_input')
print(main_input)

embed_1 = Embedding(VOCAB_SIZE + 1, EMBEDDING_DIM, weights=[embeddings_matrix],
                    input_length=MAX_SEQUENCE_LENGTH, trainable=True, mask_zero=True) (main_input)
print(embed_1)

lstm_1 = Bidirectional(LSTM(200, return_sequences=True, dropout=0.25, 
                            recurrent_dropout=0.25, name='lstm_1')) (embed_1)
print(lstm_1)

lstm_2 = Bidirectional(LSTM(100, dropout=0.25, recurrent_dropout=0.25, 
                            name='lstm_2'))(lstm_1)
print(lstm_2)

repeat_1 = RepeatVector(MAX_SEQUENCE_LENGTH, name='repeat_1')(lstm_2)
print(repeat_1)

lstm_3 = Bidirectional(LSTM(100, return_sequences=True, dropout=0.25, 
                            recurrent_dropout=0.25, name='lstm_3'))(repeat_1)
print(lstm_3)

lstm_4 = Bidirectional(LSTM(200, return_sequences=True, dropout=0.25, 
                            recurrent_dropout=0.25, name='lstm_4'))(lstm_3)
print(lstm_4)

dense_1 = Dense(VOCAB_SIZE, activation="softmax") (lstm_4)
print(dense_1)

output = TimeDistributed(Dropout(0.5)) (dense_1)
print(output)

Tensor("main_input_12:0", shape=(?, 20), dtype=float32)
Tensor("embedding_13/Gather:0", shape=(?, 20, 300), dtype=float32)
Tensor("bidirectional_35/concat_2:0", shape=(?, ?, 400), dtype=float32)
Tensor("bidirectional_36/concat_2:0", shape=(?, 200), dtype=float32)
Tensor("repeat_1_12/Tile:0", shape=(?, 20, 200), dtype=float32)
Tensor("bidirectional_37/concat_2:0", shape=(?, ?, 200), dtype=float32)
Tensor("bidirectional_38/concat_2:0", shape=(?, ?, 400), dtype=float32)
Tensor("dense_13/truediv:0", shape=(?, 20, 9155), dtype=float32)
Tensor("time_distributed_13/Reshape_1:0", shape=(?, 20, 9155), dtype=float32)


In [150]:
model = Model(main_input, output)
model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])

In [None]:
model.fit(padded_text_sequences, x_train, batch_size=128, epochs=100, verbose=1)

Epoch 1/100
 1024/10000 [==>...........................] - ETA: 809s - loss: 12.0040 - acc: 0.0688

In [None]:
predictions = model.predict(padded_text_sequences)

In [None]:
predictions.shape

In [None]:
browser_notify("Model Trained")

In [None]:
MAX_PROB = 0.2

In [None]:
word_dict = dict()
for word in keras_tokenizer.word_index:
    rank = keras_tokenizer.word_index[word]
    if rank <= VOCAB_SIZE:
        word_dict[rank] = word

In [None]:
def sequence_to_str(sequence):
    word_list = list()
    for element in sequence:
#         if amax(element) < MAX_PROB:
#             continue        
        index = argmax(element) + 1
        word = word_dict[index]
        word_list.append(word)
        
    return word_list

In [None]:
for i in range(len(predictions)):
    predicted_word_list = sequence_to_str(predictions[i])
    actual_len = len(all_texts[i].split())
    print("Actual: " + all_texts[i])
    print("Generated: " + " ".join(predicted_word_list[:actual_len]) + "\n")

In [None]:
browser_notify("Sentences generated")