In [1]:
import os
import re
import csv
import sys
import codecs
import numpy as np
import pandas as pd
import keras.layers as KL
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from string import punctuation
from gensim.models import KeyedVectors
from keras import backend as KB
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation
from keras.layers.merge import concatenate
from keras.models import Model
from keras.layers.core import Reshape, Permute, Lambda, RepeatVector
from keras.layers.normalization import BatchNormalization
from keras.callbacks import EarlyStopping, ModelCheckpoint

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
BASE_DIR = 'data/'
EMBEDDING_FILE = BASE_DIR + 'GoogleNews-vectors-negative300.bin'
TRAIN_DATA_FILE = BASE_DIR + 'train.csv'
TEST_DATA_FILE = BASE_DIR + 'test.csv'
MAX_SEQUENCE_LENGTH = 30
MAX_NB_WORDS = 100000
EMBEDDING_DIM = 300
VALIDATION_SPLIT = 0.3
LOAD_DATA = False

In [3]:
num_lstm = np.random.randint(175, 275)
num_dense = np.random.randint(100, 150)
rate_drop_lstm = 0.15 + np.random.rand() * 0.25
rate_drop_dense = 0.15 + np.random.rand() * 0.25
act = 'relu'
re_weight = True # whether to re-weight classes to fit the 17.5% share in test set
STAMP = 'lstm'

In [4]:
print('Indexing word vectors')
word2vec = KeyedVectors.load_word2vec_format(EMBEDDING_FILE, binary=True)
print('Found %s word vectors of word2vec' % len(word2vec.vocab))

Indexing word vectors
Found 3000000 word vectors of word2vec


In [5]:
print('Processing text dataset')

# The function "text_to_wordlist" is from
# https://www.kaggle.com/currie32/quora-question-pairs/the-importance-of-cleaning-text
def text_to_wordlist(text, remove_stopwords=False, stem_words=False):
    # Clean the text, with the option to remove stopwords and to stem words.
    
    # Convert words to lower case and split them
    text = text.lower().split()

    # Optionally, remove stop words
    if remove_stopwords:
        stops = set(stopwords.words("english"))
        text = [w for w in text if not w in stops]
    
    text = " ".join(text)

    # Clean the text
    text = re.sub(r"[^A-Za-z0-9^,!.\/'+-=]", " ", text)
    text = re.sub(r"what's", "what is ", text)
    text = re.sub(r"\'s", " ", text)
    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"can't", "cannot ", text)
    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"i'm", "i am ", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)
    text = re.sub(r",", " ", text)
    text = re.sub(r"\.", " ", text)
    text = re.sub(r"!", " ! ", text)
    text = re.sub(r"\/", " ", text)
    text = re.sub(r"\^", " ^ ", text)
    text = re.sub(r"\+", " + ", text)
    text = re.sub(r"\-", " - ", text)
    text = re.sub(r"\=", " = ", text)
    text = re.sub(r"'", " ", text)
    text = re.sub(r"(\d+)(k)", r"\g<1>000", text)
    text = re.sub(r":", " : ", text)
    text = re.sub(r" e g ", " eg ", text)
    text = re.sub(r" b g ", " bg ", text)
    text = re.sub(r" u s ", " american ", text)
    text = re.sub(r"\0s", "0", text)
    text = re.sub(r" 9 11 ", "911", text)
    text = re.sub(r"e - mail", "email", text)
    text = re.sub(r"j k", "jk", text)
    text = re.sub(r"\s{2,}", " ", text)
    
    # Optionally, shorten words to their stems
    if stem_words:
        text = text.split()
        stemmer = SnowballStemmer('english')
        stemmed_words = [stemmer.stem(word) for word in text]
        text = " ".join(stemmed_words)
    
    # Return a list of words
    return(text)

texts_1 = [] 
texts_2 = []
labels = []
with codecs.open(TRAIN_DATA_FILE, encoding='utf-8') as f:
    reader = csv.reader(f, delimiter=',')
    header = next(reader)
    for values in reader:
        texts_1.append(text_to_wordlist(values[3]))
        texts_2.append(text_to_wordlist(values[4]))
        labels.append(int(values[5]))
print('Found %s texts in train.csv' % len(texts_1))

tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(texts_1 + texts_2)

sequences_1 = tokenizer.texts_to_sequences(texts_1)
sequences_2 = tokenizer.texts_to_sequences(texts_2)

word_index = tokenizer.word_index
print('Found %s unique tokens' % len(word_index))

data_1 = pad_sequences(sequences_1, maxlen=MAX_SEQUENCE_LENGTH)
data_2 = pad_sequences(sequences_2, maxlen=MAX_SEQUENCE_LENGTH)
labels = np.array(labels)
print('Shape of data tensor:', data_1.shape)
print('Shape of label tensor:', labels.shape)

Processing text dataset
Found 404290 texts in train.csv
Found 85518 unique tokens
Shape of data tensor: (404290, 30)
Shape of label tensor: (404290,)


In [6]:
print('Preparing embedding matrix')

nb_words = min(MAX_NB_WORDS, len(word_index))+1

embedding_matrix = np.zeros((nb_words, EMBEDDING_DIM))
for word, i in word_index.items():
    if word in word2vec.vocab:
        embedding_matrix[i] = word2vec.word_vec(word)
print('Null word embeddings: %d' % np.sum(np.sum(embedding_matrix, axis=1) == 0))

Preparing embedding matrix
Null word embeddings: 37391


In [13]:
perm = np.random.permutation(len(data_1))
idx_train = perm[:int(len(data_1)*(1-VALIDATION_SPLIT))]
idx_val = perm[int(len(data_1)*(1-VALIDATION_SPLIT)):]

data_1_train = np.vstack((data_1[idx_train], data_2[idx_train]))
data_2_train = np.vstack((data_2[idx_train], data_1[idx_train]))
labels_train = np.concatenate((labels[idx_train], labels[idx_train]))

data_1_val = np.vstack((data_1[idx_val], data_2[idx_val]))
data_2_val = np.vstack((data_2[idx_val], data_1[idx_val]))
labels_val = np.concatenate((labels[idx_val], labels[idx_val]))

weight_val = np.ones(len(labels_val))
if re_weight:
    weight_val *= 0.472001959
    weight_val[labels_val==0] = 1.309028344

In [100]:
embedding_layer = Embedding(nb_words,
        EMBEDDING_DIM,
        weights=[embedding_matrix],
        input_length=MAX_SEQUENCE_LENGTH,
        trainable=False)
lstm_layer = LSTM(num_lstm, dropout=rate_drop_lstm, recurrent_dropout=rate_drop_lstm)

sequence_1_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
embedded_sequences_1 = embedding_layer(sequence_1_input)
x1 = lstm_layer(embedded_sequences_1)

sequence_2_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
embedded_sequences_2 = embedding_layer(sequence_2_input)
y1 = lstm_layer(embedded_sequences_2)

merged = concatenate([x1, y1])
merged = Dropout(rate_drop_dense)(merged)
merged = BatchNormalization()(merged)

merged = Dense(num_dense, activation=act)(merged)
merged = Dropout(rate_drop_dense)(merged)
merged = BatchNormalization()(merged)

preds = Dense(1, activation='sigmoid')(merged)

In [15]:
if re_weight:
    class_weight = {0: 1.309028344, 1: 0.472001959}
else:
    class_weight = None

In [70]:
model = Model(inputs=[sequence_1_input, sequence_2_input], \
        outputs=preds)
model.compile(loss='binary_crossentropy',
        optimizer='nadam',
        metrics=['acc'])
model.summary()
print(STAMP)
if LOAD_DATA:
    model.load_weights(bst_model_path)
early_stopping =EarlyStopping(monitor='val_loss', patience=5)
bst_model_path = STAMP + '.h5'
model_checkpoint = ModelCheckpoint(bst_model_path, save_best_only=True, save_weights_only=True)

hist = model.fit([data_1_train, data_2_train], labels_train, \
        validation_data=([data_1_val, data_2_val], labels_val, weight_val), \
        epochs=50, batch_size=2048, shuffle=True, \
        class_weight=class_weight, callbacks=[model_checkpoint])

model.load_weights(bst_model_path)
bst_val_score = min(hist.history['val_loss'])

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_33 (InputLayer)           (None, 35)           0                                            
__________________________________________________________________________________________________
input_34 (InputLayer)           (None, 35)           0                                            
__________________________________________________________________________________________________
embedding_21 (Embedding)        (None, 35, 300)      25655700    input_33[0][0]                   
                                                                 input_34[0][0]                   
__________________________________________________________________________________________________
lstm_21 (LSTM)                  (None, 222)          464424      embedding_21[0][0]               
          

In [16]:
def add_attention_after_lstm(inputs, SINGLE_ATTENTION_VECTOR=False):
    a = Lambda(lambda x: KB.expand_dims(x, axis=1))(inputs)
    input_dim = int(a.shape[2])
    a = Permute((2, 1))(a)
    a = Reshape((input_dim, 1))(a) # this line is not useful. It's just to know which dimension is what.
    a = Dense(1, activation='softmax')(a)
    if SINGLE_ATTENTION_VECTOR:
        a = Lambda(lambda x: KB.mean(x, axis=1))(a)
        a = RepeatVector(input_dim)(a)
    a_probs = Permute((2, 1))(a)
    a_probs = Lambda(lambda x: KB.squeeze(x, axis=1))(a_probs)
    outputs = KL.multiply([inputs, a_probs])
    return outputs

In [17]:
def add_attention_before_lstm(inputs, T, SINGLE_ATTENTION_VECTOR=True):
    a = inputs
    input_dim = int(a.shape[2])
    a = Permute((2, 1))(a)
    a = Reshape((input_dim, T))(a) # this line is not useful. It's just to know which dimension is what.
    a = Dense(T, activation='softmax')(a)
    if SINGLE_ATTENTION_VECTOR:
        a = Lambda(lambda x: KB.mean(x, axis=1))(a)
        a = RepeatVector(input_dim)(a)
    a_probs = Permute((2, 1))(a)
    outputs = KL.multiply([inputs, a_probs])
    return outputs

In [18]:
embedding_layer_att_before = Embedding(nb_words, EMBEDDING_DIM, weights=[embedding_matrix], 
                            input_length=MAX_SEQUENCE_LENGTH, trainable=False)
lstm_layer_att_before = LSTM(num_lstm, dropout=rate_drop_lstm, recurrent_dropout=rate_drop_lstm)
sequence_1_input_att_before = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
embedded_sequences_1_att_before = embedding_layer_att_before(sequence_1_input_att_before)
x1_att_before = add_attention_before_lstm(embedded_sequences_1_att_before, MAX_SEQUENCE_LENGTH, SINGLE_ATTENTION_VECTOR=True)
x1_att_before = lstm_layer_att_before(x1_att_before)
sequence_2_input_att_before = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
embedded_sequences_2_att_before = embedding_layer_att_before(sequence_2_input_att_before)
y1_att_before = add_attention_before_lstm(embedded_sequences_2_att_before, MAX_SEQUENCE_LENGTH, SINGLE_ATTENTION_VECTOR=True)
y1_att_before = lstm_layer_att_before(embedded_sequences_2_att_before)
merged_att_before = concatenate([x1_att_before, y1_att_before])
merged_att_before = Dropout(rate_drop_dense)(merged_att_before)
merged_att_before = BatchNormalization()(merged_att_before)
merged_att_before = Dense(num_dense, activation=act)(merged_att_before)
merged_att_before = Dropout(rate_drop_dense)(merged_att_before)
merged_att_before = BatchNormalization()(merged_att_before)
preds_att_before = Dense(1, activation='sigmoid')(merged_att_before)

In [19]:
model_att_before = Model(inputs=[sequence_1_input_att_before, sequence_2_input_att_before], outputs=preds_att_before)
model_att_before.compile(loss='binary_crossentropy', optimizer='nadam', metrics=['acc'])
model_att_before.summary()
bst_model_path_att_before = 'att_before_' + STAMP + '.h5'
model_checkpoint_att_before = ModelCheckpoint(bst_model_path_att_before, save_best_only=True, save_weights_only=True)
if LOAD_DATA:
    model_att_before.load_weights(bst_model_path_att_before)
hist_att_before = model_att_before.fit([data_1_train, data_2_train], labels_train, \
        validation_data=([data_1_val, data_2_val], labels_val, weight_val), \
        epochs=50, batch_size=2048, shuffle=True, \
        class_weight=class_weight, callbacks=[model_checkpoint_att_before])
model_att_before.load_weights(bst_model_path_att_before)
bst_val_score_att_before = min(hist_att_before.history['val_loss'])

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_6 (InputLayer)            (None, 30)           0                                            
__________________________________________________________________________________________________
embedding_4 (Embedding)         (None, 30, 300)      25655700    input_6[0][0]                    
                                                                 input_7[0][0]                    
__________________________________________________________________________________________________
permute_7 (Permute)             (None, 300, 30)      0           embedding_4[0][0]                
__________________________________________________________________________________________________
reshape_4 (Reshape)             (None, 300, 30)      0           permute_7[0][0]                  
__________

KeyboardInterrupt: 

In [None]:
embedding_layer_att_after = Embedding(nb_words, EMBEDDING_DIM, weights=[embedding_matrix], 
                            input_length=MAX_SEQUENCE_LENGTH, trainable=False)
lstm_layer_att_after = LSTM(num_lstm, dropout=rate_drop_lstm, recurrent_dropout=rate_drop_lstm)
sequence_1_input_att_after = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
embedded_sequences_1_att_after = embedding_layer_att_after(sequence_1_input_att_after)
x1_att_after = lstm_layer_att_after(embedded_sequences_1_att_after)
x1_att_after = add_attention_after_lstm(x1_att_after, SINGLE_ATTENTION_VECTOR=False)
sequence_2_input_att_after = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
embedded_sequences_2_att_after = embedding_layer_att_after(sequence_2_input_att_after)
y1_att_after = lstm_layer_att_after(embedded_sequences_2_att_after)
y1_att_after = add_attention_after_lstm(y1_att_after, SINGLE_ATTENTION_VECTOR=False)
merged_att_after = concatenate([x1_att_after, y1_att_after])
merged_att_after = Dropout(rate_drop_dense)(merged_att_after)
merged_att_after = BatchNormalization()(merged_att_after)
merged_att_after = Dense(num_dense, activation=act)(merged_att_after)
merged_att_after = Dropout(rate_drop_dense)(merged_att_after)
merged_att_after = BatchNormalization()(merged_att_after)
preds_att_after = Dense(1, activation='sigmoid')(merged_att_after)

In [None]:
model_att_after = Model(inputs=[sequence_1_input_att_after, sequence_2_input_att_after], outputs=preds_att_after)
model_att_after.compile(loss='binary_crossentropy', optimizer='nadam', metrics=['acc'])
model_att_after.summary()
bst_model_path_att_after = 'att_after_' + STAMP + '.h5'
model_checkpoint_att_after = ModelCheckpoint(bst_model_path_att_after, save_best_only=True, save_weights_only=True)
if LOAD_DATA:
    model_att_after.load_weights(bst_model_path_att_after)
hist_att_after = model_att_after.fit([data_1_train, data_2_train], labels_train, \
        validation_data=([data_1_val, data_2_val], labels_val, weight_val), \
        epochs=50, batch_size=2048, shuffle=True, \
        class_weight=class_weight, callbacks=[model_checkpoint_att_after])
model_att_after.load_weights(bst_model_path_att_after)
bst_val_score_att_after = min(hist_att_after.history['val_loss'])