# Semantic Neural Network

In [6]:
from modules.datasets import STSDataset, SICKDataset, QuoraQuestionsDataset
from modules.clean_text import tokenize_and_clean
import logging

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

from keras.optimizers import Adadelta
from keras.models import Model
from keras.layers import Dense, Input, Embedding, Dropout, Activation, Merge
from keras.layers.recurrent import GRU, LSTM
from keras import backend as K

import numpy as np
from sklearn.model_selection import train_test_split
from gensim.models.keyedvectors import KeyedVectors

LOG = logging.getLogger()
handler = logging.StreamHandler()
formatter = logging.Formatter(
        '%(asctime)s : %(message)s')
handler.setFormatter(formatter)
LOG.addHandler(handler)
LOG.setLevel(logging.INFO)

QUORA_FILE = 'C:\\dev_env\\ml\\datasets\\quora_questions_pair\\train.csv'
STS_FILE = 'C:\\dev_env\\ml\\datasets\\sts\\sts_all.txt'
SICK_FILE = 'C:\\dev_env\ml\\datasets\\sick_2014\\SICK_complete.txt'
WORD2VEC = 'C:\dev_env\ml\datasets\GoogleNews-vectors-negative300.bin\\GoogleNews-vectors-negative300.bin'
GLOVE = 'C:\dev_env\ml\datasets\glove.6B\\glove.6B.300d.gensim.txt'


EMBEDDING_FILE = WORD2VEC

#### Prepare input data

In [9]:
train_df =  STSDataset(STS_FILE).data_frame()

sentences_1 = []
sentences_2 = []
labels = []
for index, row in train_df.iterrows():
    sentences_1.append(tokenize_and_clean(row['s1']))
    sentences_2.append(tokenize_and_clean(row['s2']))
    labels.append(float(row['label']))

tokenizer = Tokenizer()
tokenizer.fit_on_texts(sentences_1)
tokenizer.fit_on_texts(sentences_2)

word_index = tokenizer.word_index
vocabulary_size = len(word_index)
LOG.info("Vocabulary created. Size: %s", vocabulary_size)

# Prepare the neural network inputs
input_sentences_1 = tokenizer.texts_to_sequences(sentences_1)
input_sentences_2 = tokenizer.texts_to_sequences(sentences_2)

max_sentence_length = 0
# The size of the input sequence is the size of the largest sequence of the input dataset
for sentence_vec in [sentences_1, sentences_2]:
    for sentence in sentence_vec:
        sentence_length = len(sentence.split())
        if (sentence_length > max_sentence_length):
            max_sentence_length = sentence_length

x1 = pad_sequences(input_sentences_1, max_sequence_length)
x2 = pad_sequences(input_sentences_2, max_sequence_length)
# WARNING: STS LABEL RESCALING
y = np.array(labels) / 5

x1_train, x1_test, x2_train, x2_test, y_train, y_test = train_test_split(x1, x2, y, test_size=0.2, random_state=42)

# Make sure everything is ok
assert x1_train.shape == x2_train.shape

2017-09-02 12:07:35,155 : Vocabulary created. Size: 15230


2017-09-02 12:07:35,155 : Vocabulary created. Size: 15230


2017-09-02 12:07:35,155 : Vocabulary created. Size: 15230


#### Prepare embedding matrix for word representations

In [11]:
EMBEDDING_DIM = 300

LOG.info('Loading embedding model from %s', EMBEDDING_FILE)
embedding_model = KeyedVectors.load_word2vec_format(EMBEDDING_FILE, binary=True)

embedding_matrix = np.zeros((vocabulary_size, EMBEDDING_DIM))

LOG.info('Creating the embedding matrix')
for word, idx in word_index.items():
    if idx >= vocabulary_size:
        continue
    if word in embedding_model.vocab:
        embedding_vector = embedding_model.word_vec(word)
        if embedding_vector is not None:
            embedding_matrix[idx] = embedding_vector

#np.savetxt("embedding_matrix.txt", embedding_matrix, delimiter=',')
LOG.info('Embedding matrix as been created, removing embedding model from memory')
del embedding_model

2017-09-02 12:11:26,689 : Loading embedding model from C:\dev_env\ml\datasets\GoogleNews-vectors-negative300.bin\GoogleNews-vectors-negative300.bin


2017-09-02 12:11:26,689 : Loading embedding model from C:\dev_env\ml\datasets\GoogleNews-vectors-negative300.bin\GoogleNews-vectors-negative300.bin


2017-09-02 12:11:26,689 : Loading embedding model from C:\dev_env\ml\datasets\GoogleNews-vectors-negative300.bin\GoogleNews-vectors-negative300.bin


2017-09-02 12:11:26,697 : loading projection weights from C:\dev_env\ml\datasets\GoogleNews-vectors-negative300.bin\GoogleNews-vectors-negative300.bin


2017-09-02 12:11:26,697 : loading projection weights from C:\dev_env\ml\datasets\GoogleNews-vectors-negative300.bin\GoogleNews-vectors-negative300.bin


2017-09-02 12:11:26,697 : loading projection weights from C:\dev_env\ml\datasets\GoogleNews-vectors-negative300.bin\GoogleNews-vectors-negative300.bin


2017-09-02 12:15:17,613 : loaded (3000000, 300) matrix from C:\dev_env\ml\datasets\GoogleNews-vectors-negative300.bin\GoogleNews-vectors-negative300.bin


2017-09-02 12:15:17,613 : loaded (3000000, 300) matrix from C:\dev_env\ml\datasets\GoogleNews-vectors-negative300.bin\GoogleNews-vectors-negative300.bin


2017-09-02 12:15:17,613 : loaded (3000000, 300) matrix from C:\dev_env\ml\datasets\GoogleNews-vectors-negative300.bin\GoogleNews-vectors-negative300.bin


2017-09-02 12:15:20,476 : Creating the embedding matrix


2017-09-02 12:15:20,476 : Creating the embedding matrix


2017-09-02 12:15:20,476 : Creating the embedding matrix


2017-09-02 12:15:33,201 : Embedding matrix as been created, removing embedding model from memory


2017-09-02 12:15:33,201 : Embedding matrix as been created, removing embedding model from memory


2017-09-02 12:15:33,201 : Embedding matrix as been created, removing embedding model from memory


#### Neural Network Model	

In [13]:
def exponent_neg_manhattan_distance(left, right):
    ''' Helper function for the similarity estimate of the LSTMs outputs'''
    return K.exp(-K.sum(K.abs(left-right), axis=1, keepdims=True))

LOG.info("Creating model...")

# ============= MODEL =====================
# A entrada recebe os índices das palavras no vocabulário, para fazer o lookup na tabela de embeddings
left_input = Input(shape=(max_sequence_length,), dtype='int32')
right_input = Input(shape=(max_sequence_length,), dtype='int32')

#Camada de embedding
embedding_layer = Embedding(vocabulary_size,
                            EMBEDDING_DIM,
                            weights=[embedding_matrix],
                            input_length=max_sequence_length,
                            trainable=False)

left_encoder = embedding_layer(left_input)
right_encoder = embedding_layer(right_input)

# LSTM
LSTM_HIDDEN_LAYERS = 50
base_lstm = LSTM(LSTM_HIDDEN_LAYERS)

left_output = base_lstm(left_encoder)
right_output = base_lstm(right_encoder)

# Calculates the distance as defined by the MaLSTM model
malstm_distance = Merge(mode=lambda x: exponent_neg_manhattan_distance(x[0], x[1]),
                        output_shape=lambda x: (x[0][0], 1))\
    ([left_output, right_output])

malstm = Model([left_input, right_input], [malstm_distance])
gradient_clipping_norm = 1.25
# Adadelta optimizer, with gradient clipping by norm
optimizer = Adadelta(clipnorm=gradient_clipping_norm)

malstm.compile(loss = 'mean_squared_error',
               optimizer=optimizer,
               metrics=['accuracy'])

training_time = time()
EPOCHS = 200
malstm.fit([x1_train, x2_train], y_train,
           epochs= EPOCHS,
           batch_size=BATCH_SIZE,
           validation_data=([x1_test, x2_test], y_test))

print("\nTraining time finished.\n{} epochs in {}".format(EPOCHS, datetime.timedelta(seconds=time()-training_time)))

score, acc = malstm.evaluate([x1_test, x2_test], y_test, batch_size=BATCH_SIZE)
print("\nTest score: %.3f, accuracy: %.3f" % (score, acc))

2017-09-02 12:16:24,104 : Creating model...


2017-09-02 12:16:24,104 : Creating model...


2017-09-02 12:16:24,104 : Creating model...


