# Neural Affect Style Transfer

In [1]:
from numpy import zeros, concatenate, asarray, ones, amax, argmax, squeeze
from IPython.display import display, HTML

In [2]:
def browser_alert(message):
    display(HTML('<script type="text/javascript">alert("' + message + '");</script>'))
    
def browser_notify(message):
    display(HTML('<script type="text/javascript">var notification=new Notification("' + \
                 'Jupyter Notification",{icon:"http://blog.jupyter.org/content/' + \
                 'images/2015/02/jupyter-sq-text.png",body:"' + message + \
                 '"});</script>'))

In [3]:
browser_notify("test")

## Read Data

In [4]:
dataset_path = "/home/v2john/attr-reviews-dataset/dev.txt"

In [5]:
all_texts = list()
with open(dataset_path) as dataset_file:
    for line in dataset_file:
        text = line.split('\t')[3]
        all_texts.append(text)

In [6]:
len(all_texts)

93703

### Tokenize and build embeddings

In [7]:
from keras.preprocessing.text import Tokenizer

Using TensorFlow backend.


In [8]:
keras_tokenizer = Tokenizer(num_words=500)

In [9]:
keras_tokenizer.fit_on_texts(all_texts)

In [10]:
len(keras_tokenizer.word_index)

36663

In [11]:
text_sequences = keras_tokenizer.texts_to_sequences(all_texts)

In [12]:
text_sequences = asarray(text_sequences)

In [13]:
text_sequences.shape

(93703,)

## Build pre-trained embeddings

In [14]:
from gensim.models.wrappers import fasttext
from gensim.models.keyedvectors import KeyedVectors

In [15]:
embeddings_path = "/home/v2john/pretrained-embeddings/wiki.en.vec"

In [16]:
w2v_model = None
with open(embeddings_path) as embeddings_file:
    w2v_model = KeyedVectors.load_word2vec_format(embeddings_file, binary=False)

In [17]:
browser_notify("Embeddings loaded")

In [49]:
embedding_dim = squeeze(w2v_model.word_vec("the").shape)

In [50]:
print(embedding_dim)

300


In [21]:
embeddings_matrix = zeros(shape=(len(keras_tokenizer.word_index) + 1, embedding_dim))

In [42]:
keras_tokenizer.word_index['of']

9

In [43]:
for word in keras_tokenizer.word_index:
    embeddings_matrix[keras_tokenizer.word_index[word]] = w2v_model.word_vec("the")

In [44]:
embeddings_matrix.shape

(36664, 300)

## Keras Model

In [65]:
from keras import backend as K
from keras.layers import Input, Dense, RepeatVector, LSTM, Conv1D, Masking, Embedding
from keras.layers.wrappers import TimeDistributed, Bidirectional
from keras.models import Model
from keras.losses import binary_crossentropy
from keras.preprocessing.sequence import pad_sequences

In [28]:
MAX_SEQUENCE_LENGTH = 20

In [29]:
text_sequences = pad_sequences(text_sequences, maxlen=MAX_SEQUENCE_LENGTH, padding='post', 
                               truncating='post', value=0)

In [30]:
text_sequences.shape

(93703, 20)

In [31]:
text_sequences[0]

array([ 2, 39,  6,  8,  7, 10,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0], dtype=int32)

In [32]:
x_train = zeros(shape=(len(text_sequences), MAX_SEQUENCE_LENGTH, embedding_dim))

In [33]:
for i in range(len(text_sequences)):
    vector_list = list()
    for index in text_sequences[i]:
        vector = embeddings_matrix[index]
        vector_list.append(vector)
    x_train[i] = asarray(vector_list)

In [34]:
x_train = x_train[:10000]

In [35]:
x_train.shape

(10000, 20, 300)

In [36]:
x_train[0]

array([[-0.065334, -0.093031, -0.017571, ...,  0.16642 , -0.13079 ,
         0.035397],
       [-0.065334, -0.093031, -0.017571, ...,  0.16642 , -0.13079 ,
         0.035397],
       [-0.065334, -0.093031, -0.017571, ...,  0.16642 , -0.13079 ,
         0.035397],
       ..., 
       [ 0.      ,  0.      ,  0.      , ...,  0.      ,  0.      ,  0.      ],
       [ 0.      ,  0.      ,  0.      , ...,  0.      ,  0.      ,  0.      ],
       [ 0.      ,  0.      ,  0.      , ...,  0.      ,  0.      ,  0.      ]])

In [105]:
main_input = Input(shape=x_train.shape[1:], dtype='float32', name='main_input')
print(main_input)
# embed_1 = Embedding(len(keras_tokenizer.word_index) + 1, embedding_dim, weights=[embeddings_matrix],
#                     input_length=MAX_SEQUENCE_LENGTH, trainable=False, mask_zero=True) (main_input)
# print(embed_1)

# lstm_1 = Bidirectional(LSTM(100, return_sequences=True, name='lstm_1')) (main_input)
# print(lstm_1)
lstm_2 = Bidirectional(LSTM(100, name='lstm_2'))(main_input)
print(lstm_2)
repeat_1 = RepeatVector(MAX_SEQUENCE_LENGTH, name='repeat_1')(lstm_2)
print(repeat_1)
lstm_3 = LSTM(100, return_sequences=True, name='lstm_3')(repeat_1)
print(lstm_3)
output = TimeDistributed(Dense(300, activation="tanh")) (lstm_3)
print(output)
# lstm_4 = LSTM(300, return_sequences=True, name='lstm_4')(lstm_3)
# print(lstm_4)

Tensor("main_input_10:0", shape=(?, 20, 300), dtype=float32)
Tensor("bidirectional_15/concat_2:0", shape=(?, 200), dtype=float32)
Tensor("repeat_1_10/Tile:0", shape=(?, 20, 200), dtype=float32)
Tensor("lstm_3_9/transpose_1:0", shape=(?, ?, 100), dtype=float32)
Tensor("time_distributed_2/Reshape_1:0", shape=(?, 20, 300), dtype=float32)


In [106]:
model = Model(main_input, output)
model.compile(optimizer='adam',
              loss='mean_squared_error',
              metrics=['accuracy'])

In [107]:
model.fit(x_train, x_train, batch_size=128, epochs=10, verbose=1)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f8a516e5a20>

In [108]:
predictions = model.predict(x_train)

In [109]:
predictions[0].shape

(20, 300)

In [114]:
def sequence_to_str(sequence):
    word_list = list()
    for element in sequence:
        word = w2v_model.similar_by_vector(element, topn=1)[0][0]
        word_list.append(word)
    return word_list

In [116]:
for i in range(1900, 2000):
    predicted_word_list = sequence_to_str(predictions[i])
    actual_len = len(all_texts[i].split())
    print(" ".join(predicted_word_list[:actual_len]) + " - " + all_texts[i])

the the the the the the the the the the the the the the the the 霧 你不要悲哀 wsip asss - i nchose this title because i love this authoe and im very keen to get more of her books when available

the the the the the the the the the the the the the the the the 霧 你不要悲哀 wsip asss - a good romance if you like period pieces . keeps yo ureading and it was free so its worth a shot

the the the the the the the the the the the the the the the the the the the 霧 - i love the hero . i live for the few pages of action in a long boring story other wise .

the the the the the the the the the the the the the the the the the the 霧 jxq - as usual with scott pratt the book keeps you wanting to read more . the twists and turns throughout surprises you



KeyboardInterrupt: 

In [None]:
browser_notify("Sentences generated")