# Vanilla LSTMs, 05052017

Val loss: 0.49

In [1]:
from datetime import datetime
from IPython.display import SVG

import pandas as pd
import numpy as np

from keras.models import Model
from keras.layers import Dense, Dropout, Input, LSTM, Embedding
from keras.layers.merge import concatenate
from keras.callbacks import EarlyStopping, ModelCheckpoint, ProgbarLogger, TensorBoard
from keras_tqdm import TQDMNotebookCallback

from utils import load_embeddings, extract_questions_from_dataframe, save_submission

%load_ext autoreload
%autoreload 2
pd.set_option('max_colwidth', 250)

Using TensorFlow backend.


In [2]:
class Config(object):
    """Settings shared by the preprocessing helpers and the model cells.

    All values are plain class attributes; an instance is passed to the
    helpers imported from ``utils`` and read when the layers are built.
    """

    # Vocabulary and embedding dimensions.
    VOCABULARY_SIZE = 1193514
    EMBEDDING_DIMENSION = 200

    # Added to VOCABULARY_SIZE when the embedding layer is sized; presumably
    # the number of special tokens listed below -- TODO confirm against utils.
    OFFSET = 3
    OOV_TOKEN = 0  # out of vocabulary
    EOS_TOKEN = 1  # end of sentence
    PAD_TOKEN = 2  # padding to max sentence length

    MAX_SENTENCE_LENGTH = 60

    def stamp(self, comment):
        """Return ``comment`` prefixed with the current local date and time.

        Args:
            comment: Value appended after the timestamp.

        Returns:
            A string shaped like ``YYYYMMDD_HHMM_<comment>``.
        """
        timestamp = '{:%Y%m%d_%H%M}'.format(datetime.now())
        return '{}_{}'.format(timestamp, comment)

In [3]:
%%time
train_dataframe = pd.read_csv('train.csv')
current_config = Config()

embedding_weights, word2idx = load_embeddings(
    'glove.twitter.27B.200d.txt',
    config=current_config
)

questions_A, questions_B, labels = extract_questions_from_dataframe(
    train_dataframe, 
    config=current_config,
    word2idx=word2idx,
    prediction_mode=False
)

96463 preprocessed questions loaded from disk
CPU times: user 1min 53s, sys: 4.62 s, total: 1min 57s
Wall time: 1min 59s


In [4]:
# Show the shape of `questions_A` as a quick check of the loading cell's result.
questions_A.shape

(96463, 60)

In [5]:
# The LSTM input dropout rate is drawn at random on every run. Keep the drawn
# value in a named variable and print it, so the exact configuration of a run
# is recorded and can be reproduced.
lstm_dropout_rate = 0.15 + np.random.rand() * 0.25  # uniform in [0.15, 0.40)
print('LSTM dropout rate: {:.4f}'.format(lstm_dropout_rate))

# One LSTM and one embedding layer are applied to both questions, so the two
# branches share their weights.
shared_lstm_layer = LSTM(
    units=100,
    return_sequences=False,  # only the final state is used as the sentence vector
    go_backwards=True,       # the sequence is processed in reverse order
    dropout=lstm_dropout_rate
)
shared_embedding_layer = Embedding(
    input_dim=current_config.VOCABULARY_SIZE + current_config.OFFSET,
    output_dim=current_config.EMBEDDING_DIMENSION,
    input_length=current_config.MAX_SENTENCE_LENGTH,
    weights=[embedding_weights],
    trainable=False  # the pretrained vectors stay frozen
)

# Branch for the first question of each pair.
input_A = Input(shape=(current_config.MAX_SENTENCE_LENGTH,))
embeddings_A = shared_embedding_layer(input_A)
sentence_representation_A = shared_lstm_layer(embeddings_A)
dropout_A = Dropout(0.5)(sentence_representation_A)

# Branch for the second question of each pair.
input_B = Input(shape=(current_config.MAX_SENTENCE_LENGTH,))
embeddings_B = shared_embedding_layer(input_B)
sentence_representation_B = shared_lstm_layer(embeddings_B)
dropout_B = Dropout(0.5)(sentence_representation_B)

merged_model = concatenate([dropout_A, dropout_B])
# NOTE(review): this Dense layer has no activation, so together with the
# output layer it is a purely linear map before the sigmoid. It is left as is
# because the saved weights were trained with this architecture.
dense_1 = Dense(100)(merged_model)
# Named `duplicate_probability` rather than `predictions` so the output tensor
# does not share a name with the array returned by `model.predict` later on.
duplicate_probability = Dense(1, activation='sigmoid')(dense_1)

model = Model(inputs=[input_A, input_B], outputs=duplicate_probability)
model.compile(optimizer='rmsprop',
              loss='binary_crossentropy',
              metrics=['accuracy'])
model.summary()

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
input_1 (InputLayer)             (None, 60)            0                                            
____________________________________________________________________________________________________
input_2 (InputLayer)             (None, 60)            0                                            
____________________________________________________________________________________________________
embedding_1 (Embedding)          (None, 60, 200)       238703400   input_1[0][0]                    
                                                                   input_2[0][0]                    
____________________________________________________________________________________________________
lstm_1 (LSTM)                    (None, 100)           120400      embedding_1[0][0]       

In [6]:
# Callbacks

# Stop training once the validation loss has gone 3 epochs without improving.
early_stopping = EarlyStopping(patience=3, monitor='val_loss')

# Write the weights with the best validation loss to an .h5 file whose name is
# stamped with the time this cell runs.
model_checkpoint = ModelCheckpoint(
    filepath='{}.h5'.format(current_config.stamp(comment='1')),
    monitor='val_loss',
    save_weights_only=True,
    save_best_only=True,
)

# Logging and progress display.
tensorboard = TensorBoard(histogram_freq=1, log_dir='./logs')
progress_bar = ProgbarLogger(count_mode='samples')
tqdm_bar = TQDMNotebookCallback()

In [7]:
%%time

training_logs = model.fit(
    x=[questions_A, questions_B], 
    y=labels, 
    epochs=20, 
    batch_size=1024,
    validation_split=0.1, 
    verbose=1,
    callbacks=[early_stopping, model_checkpoint, progress_bar, tensorboard, tqdm_bar]
)

Train on 86816 samples, validate on 9647 samples
INFO:tensorflow:Summary name embedding_1/embeddings:0 is illegal; using embedding_1/embeddings_0 instead.
INFO:tensorflow:Summary name lstm_1/kernel:0 is illegal; using lstm_1/kernel_0 instead.
INFO:tensorflow:Summary name lstm_1/recurrent_kernel:0 is illegal; using lstm_1/recurrent_kernel_0 instead.
INFO:tensorflow:Summary name lstm_1/bias:0 is illegal; using lstm_1/bias_0 instead.
INFO:tensorflow:Summary name dense_1/kernel:0 is illegal; using dense_1/kernel_0 instead.
INFO:tensorflow:Summary name dense_1/bias:0 is illegal; using dense_1/bias_0 instead.
INFO:tensorflow:Summary name dense_2/kernel:0 is illegal; using dense_2/kernel_0 instead.
INFO:tensorflow:Summary name dense_2/bias:0 is illegal; using dense_2/bias_0 instead.


Epoch 1/20


Epoch 1/20
Epoch 2/20


Epoch 2/20
Epoch 3/20


Epoch 3/20
Epoch 4/20


Epoch 4/20
Epoch 5/20


Epoch 5/20
Epoch 6/20


Epoch 6/20
Epoch 7/20


Epoch 7/20
Epoch 8/20


Epoch 8/20
Epoch 9/20


Epoch 9/20
Epoch 10/20


Epoch 10/20
Epoch 11/20


Epoch 11/20
Epoch 12/20


Epoch 12/20
Epoch 13/20


Epoch 13/20
Epoch 14/20


Epoch 14/20
Epoch 15/20


Epoch 15/20
Epoch 16/20


Epoch 16/20
Epoch 17/20


Epoch 17/20
Epoch 18/20


Epoch 18/20
Epoch 19/20


Epoch 19/20
Epoch 20/20


Epoch 20/20

CPU times: user 8h 9min 36s, sys: 1h 48min 9s, total: 9h 57min 46s
Wall time: 2h 46min 19s


In [9]:
# Restore the best weights written by the checkpoint callback. The path is
# taken from the callback itself rather than a hard-coded, time-stamped file
# name, which exists only for the single run it was copied from.
model.load_weights(model_checkpoint.filepath)
# Lowest validation loss reached during training.
min(training_logs.history['val_loss'])

0.39166795922287068

In [10]:
# Read the test pairs and pass them through the same helper as the training
# data, this time in prediction mode; the third return value is not needed.
test_dataframe = pd.read_csv('test.csv')
test_questions_A, test_questions_B, _ = extract_questions_from_dataframe(
    test_dataframe,
    prediction_mode=True,
    word2idx=word2idx,
    config=current_config,
)

2345796 preprocessed questions loaded from disk


In [12]:
%%time
predictions = model.predict(
    x=[test_questions_A, test_questions_B], 
    batch_size=8192, 
    verbose=1
)

CPU times: user 4h 22min 49s, sys: 1h 3min 39s, total: 5h 26min 28s
Wall time: 1h 32min 46s


In [13]:
# Hand the predictions to the project helper that writes the submission
# (implemented in utils; its output location is not visible here).
# NOTE(review): the preview printed below shows 0/1 labels while `predictions`
# holds probabilities -- confirm whether the helper rounds them before saving.
save_submission(predictions, current_config)

Unnamed: 0,is_duplicate
0,0
1,0
2,0
3,0
4,0
5,0
6,0
7,1
8,0
9,0


In [None]:
# Count the lines of a submission file.
# NOTE(review): the file name is hard-coded to the timestamp of one specific
# run and this cell has no execution count -- confirm it is still needed.
!wc -l 20170505_0156_1.csv

In [17]:
# Peek at the first 100 predicted values.
predictions[:100]

array([[  4.58662599e-01],
       [  1.74463227e-01],
       [  2.84314752e-01],
       [  2.84989804e-01],
       [  3.90512347e-01],
       [  3.39633793e-01],
       [  4.90730852e-01],
       [  9.65885460e-01],
       [  4.08088237e-01],
       [  2.52836287e-01],
       [  2.96838850e-01],
       [  3.29723209e-01],
       [  1.31350324e-01],
       [  2.86526322e-01],
       [  1.95955455e-01],
       [  1.54928207e-01],
       [  2.68868536e-01],
       [  1.37091994e-01],
       [  2.03864142e-01],
       [  3.80258322e-01],
       [  4.21979725e-02],
       [  3.32688063e-01],
       [  3.78279686e-01],
       [  3.53720516e-01],
       [  2.88094938e-01],
       [  8.61447632e-01],
       [  1.54259309e-01],
       [  3.63460220e-02],
       [  1.62562758e-01],
       [  1.06691457e-01],
       [  4.95753556e-01],
       [  1.72469184e-01],
       [  2.27784842e-01],
       [  2.10778549e-01],
       [  2.56637186e-01],
       [  5.64140320e-01],
       [  1.39910445e-01],
 

In [18]:
# Preview the first rows only; a bare `test_dataframe` asks the notebook to
# render the whole multi-million-row frame.
test_dataframe.head(10)

Unnamed: 0,test_id,question1,question2
0,0,How does the Surface Pro himself 4 compare wit...,Why did Microsoft choose core m3 and not core ...
1,1,Should I have a hair transplant at age 24? How...,How much cost does hair transplant require?
2,2,What but is the best way to send money from Ch...,What you send money to China?
3,3,Which food not emulsifiers?,What foods fibre?
4,4,"How ""aberystwyth"" start reading?",How their can I start reading?
5,5,How are the two wheeler insurance from Bharti ...,I admire I am considering of buying insurance ...
6,6,How can I reduce my belly fat through a diet?,How can I reduce my lower belly fat in one month?
7,7,"By scrapping the 500 and 1000 rupee notes, how...",How will the recent move to declare 500 and 10...
8,8,What are the how best books of all time?,What are some of the military history books of...
9,9,After 12th years old boy and I had sex with a ...,Can a 14 old guy date a 12 year old girl?
