# Neural Affect Style Transfer

In [None]:
from numpy import zeros, concatenate, asarray, ones
from IPython.display import display, HTML

In [None]:
def browser_alert(message):
    """Pop up a JavaScript ``alert`` dialog in the browser showing *message*.

    The message is serialised with ``json.dumps`` so that quotes, backslashes
    and newlines become valid JavaScript string escapes instead of breaking
    the generated script.  ``</`` is additionally escaped so the text cannot
    close the surrounding ``<script>`` element early.

    Args:
        message: Text to show in the alert dialog.
    """
    import json  # local import: only the notification helpers need it

    js_literal = json.dumps(message).replace("</", "<\\/")
    display(HTML('<script type="text/javascript">alert(' + js_literal + ');</script>'))
    
def browser_notify(message):
    """Raise a browser desktop notification titled "Jupyter Notification".

    The message becomes the notification body.  It is serialised with
    ``json.dumps`` so that quotes, backslashes and newlines become valid
    JavaScript string escapes instead of breaking the generated script.
    ``</`` is additionally escaped so the text cannot close the surrounding
    ``<script>`` element early.

    Args:
        message: Text to show as the notification body.
    """
    import json  # local import: only the notification helpers need it

    body_literal = json.dumps(message).replace("</", "<\\/")
    display(HTML(
        '<script type="text/javascript">var notification=new Notification("'
        'Jupyter Notification",{icon:"http://blog.jupyter.org/content/'
        'images/2015/02/jupyter-sq-text.png",body:' + body_literal +
        '});</script>'))

In [None]:
# Smoke-test the notification helper by sending a notification with the body "test".
browser_notify("test")

## Read Data

In [None]:
# Location of the tab-separated dataset file that is read line by line below.
# NOTE(review): hard-coded absolute path into one user's home directory --
# adjust before running on another machine.
dataset_path = "/home/v2john/attr-reviews-dataset/dev.txt"

In [None]:
# Collect the fourth tab-separated column of every line in the dataset file.
all_texts = []
with open(dataset_path) as dataset_file:
    all_texts.extend(row.split('\t')[3] for row in dataset_file)

In [None]:
# Display how many texts were loaded from the dataset file.
len(all_texts)

### Tokenize and build embeddings

In [None]:
import html
import re

from nltk.tokenize import TweetTokenizer
from scipy.sparse import csr_matrix

In [None]:
# Single NLTK TweetTokenizer instance (default settings) reused for every text.
tknzr = TweetTokenizer()

In [None]:
def clean_str(string):
    """Normalise a raw text before it is tokenised.

    Steps, in order: decode HTML entities, replace every ``@``-mention with a
    single space, replace every run of digits with a single space, then trim
    surrounding whitespace and lower-case the result.  Only the digits
    themselves are removed; letters adjacent to them are kept.

    Args:
        string: Raw text.

    Returns:
        The cleaned, lower-cased text.
    """
    decoded = html.unescape(string)
    without_mentions = re.sub(r"@[A-Za-z0-9_(),!?\'\`]+", " ", decoded)
    without_digits = re.sub(r"\d+", " ", without_mentions)
    return without_digits.strip().lower()

In [None]:
# Clean and tokenize every text that was loaded into ``all_texts``.
# (The loop previously iterated over ``tweets``, a name that is never defined
# in this notebook, so the cell failed with a NameError on a fresh kernel.)
tokenized_tweets = [tknzr.tokenize(clean_str(text)) for text in all_texts]

## Creating Gensim Vocab model

In [None]:
from gensim.models import Word2Vec

In [None]:
# Train a Word2Vec model on the tokenized texts.  min_count=1 keeps every
# token in the vocabulary, so each token can be looked up when the sequences
# are vectorized later on.
# NOTE(review): the vector size is left at the gensim default, presumably 100
# to match EMBEDDING_DIM used by the Keras model -- confirm.
vocab_model = Word2Vec(tokenized_tweets, min_count=1)

In [None]:
# Keep a handle on the trained word vectors; used later to map predicted
# vectors back to their most similar word.
word_vectors = vocab_model.wv

In [None]:
# Sanity check: display the embedding learnt for the token 'how'.
# Look it up through the KeyedVectors object: indexing the Word2Vec model
# itself is deprecated and no longer supported in gensim 4.
word_vectors['how']

In [None]:
# Turn every tokenized text into an array of word vectors (one row per token).
# Vectors are read from ``word_vectors`` because indexing the Word2Vec model
# directly is deprecated and no longer supported in gensim 4.
# The sequences have different lengths, so the outer container is built with
# dtype=object: current NumPy refuses to build a ragged array implicitly.
vectorized_tweet_sequences = asarray(
    [
        asarray([word_vectors[token] for token in tokens])
        for tokens in tokenized_tweets
    ],
    dtype=object,
)

In [None]:
# Display the shape of the outer container and of the first vectorized sequence.
vectorized_tweet_sequences.shape, vectorized_tweet_sequences[0].shape

## Keras Model

In [None]:
from keras.layers import Input, Dense, RepeatVector, LSTM, Conv1D, Masking
from keras.models import Model
from keras.preprocessing.sequence import pad_sequences

In [None]:
# Number of units in every LSTM layer of the model below.
# NOTE(review): presumably also the dimensionality of the word vectors -- confirm.
EMBEDDING_DIM = 100
# Sequences are padded/truncated to this many timesteps, and the encoded
# vector is repeated this many times before decoding.
MAX_SEQUENCE_LENGTH = 20

In [None]:
# Pad (with zeros, at the end) or truncate (at the end) every sequence to
# MAX_SEQUENCE_LENGTH timesteps.
# dtype must be given explicitly: pad_sequences defaults to 'int32', which
# would truncate the floating-point word vectors to integers.
x_train = pad_sequences(vectorized_tweet_sequences, maxlen=MAX_SEQUENCE_LENGTH,
                        dtype='float32', padding='post', truncating='post',
                        value=0.0)

In [None]:
# Display the shape of the padded training tensor.
x_train.shape

In [None]:
# Sequence autoencoder: two LSTMs encode the padded word-vector sequence into
# a single vector, which is repeated for every timestep and decoded by two
# further LSTMs into a sequence of the same length.
main_input = Input(shape=x_train.shape[1:], dtype='float32', name='main_input')
# Mask the all-zero padding timesteps on the *input*.  Masking the output of
# lstm_1 (as before) had no effect, because an LSTM output is practically
# never exactly 0.0 in every dimension; the mask created here propagates
# through lstm_1 to lstm_2.
mask = Masking(mask_value=0.0, name='mask')(main_input)
print(mask)
lstm_1 = LSTM(EMBEDDING_DIM, return_sequences=True, name='lstm_1')(mask)
print(lstm_1)
# Encoder output: the last hidden state summarises the whole sequence.
lstm_2 = LSTM(EMBEDDING_DIM, name='lstm_2')(lstm_1)
print(lstm_2)
# Feed the summary vector to the decoder once per output timestep.
repeat_1 = RepeatVector(MAX_SEQUENCE_LENGTH, name='repeat_1')(lstm_2)
print(repeat_1)
lstm_3 = LSTM(EMBEDDING_DIM, return_sequences=True, name='lstm_3')(repeat_1)
print(lstm_3)
lstm_4 = LSTM(EMBEDDING_DIM, return_sequences=True, name='lstm_4')(lstm_3)
print(lstm_4)

In [None]:
# Build and compile the autoencoder.
# The targets are real-valued word vectors (they contain negative components),
# so a regression loss is used.  KL divergence is defined for probability
# distributions, and Keras clips its inputs to [epsilon, 1], which discards
# every negative component.  'accuracy' is likewise not meaningful for
# continuous targets, so mean absolute error is reported instead.
model = Model(main_input, lstm_4)
model.compile(optimizer='adam',
              loss='mean_squared_error',
              metrics=['mean_absolute_error'])

In [None]:
# Train as an autoencoder: the padded sequences are both the input and the
# reconstruction target.
model.fit(x_train, x_train, batch_size=32, epochs=50, verbose=1)

In [None]:
# Reconstruct the training sequences with the trained model.
predictions = model.predict(x_train)

In [None]:
# Decode each predicted sequence back into text: every predicted vector is
# replaced by the single most similar vocabulary word, and the words of one
# sequence are joined with spaces.
sentences = [
    " ".join(
        word_vectors.most_similar(positive=[predicted_vector], topn=1)[0][0]
        for predicted_vector in predicted_sequence
    )
    for predicted_sequence in predictions
]

In [None]:
# Print how many sentences were decoded.
print(len(sentences))

In [None]:
# Print every decoded sentence.
# (The loop previously ranged over len(tweets), but ``tweets`` is never
# defined in this notebook; iterating the sentences directly needs no index.)
for sentence in sentences:
    print(sentence)