# Vanilla LSTMs, 2017-05-05

Best validation loss: 0.49 (binary cross-entropy).

In [1]:
from datetime import datetime
from IPython.display import SVG

import pandas as pd
import numpy as np

from keras.models import Model
from keras.layers import Dense, Dropout, Input, LSTM, Embedding
from keras.layers.merge import concatenate
from keras.callbacks import EarlyStopping, ModelCheckpoint, ProgbarLogger, TensorBoard
from keras.layers.normalization import BatchNormalization
from keras_tqdm import TQDMNotebookCallback

from utils import load_embeddings, extract_questions_from_dataframe, save_submission

# Re-import modified modules automatically before each cell executes, so edits
# to local helper code are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2
# Let pandas show up to 250 characters per column before truncating.
pd.set_option('max_colwidth', 250)

Using TensorFlow backend.


In [2]:
class Config(object):
    """Constants shared by preprocessing and model construction in this notebook."""

    # --- Embedding table ---
    VOCABULARY_SIZE = 1193514
    EMBEDDING_DIMENSION = 200

    # --- Reserved token ids ---
    OOV_TOKEN = 0  # out of vocabulary
    EOS_TOKEN = 1  # end of sentence
    PAD_TOKEN = 2  # padding to max sentence length
    # Added to VOCABULARY_SIZE when sizing the embedding layer; presumably the
    # number of reserved ids above -- confirm against utils.load_embeddings.
    OFFSET = 3

    # --- Model hyperparameters ---
    MAX_SENTENCE_LENGTH = 60
    DENSE_LAYER_SIZE = 150
    DROPOUT = 0.4

    def stamp(self, comment):
        """Return ``comment`` prefixed with the current local time.

        The result has the form ``YYYYMMDD_HHMM_<comment>`` and is used to
        build unique file names.
        """
        now = datetime.now()
        template = '{date:%Y%m%d_%H%M}_{comment}'
        return template.format(comment=comment, date=now)

In [3]:
%%time
train_dataframe = pd.read_csv('train.csv')
current_config = Config()

embedding_weights, word2idx = load_embeddings(
    'glove.twitter.27B.200d.txt',
    config=current_config
)

questions_A, questions_B, labels = extract_questions_from_dataframe(
    train_dataframe, 
    config=current_config,
    word2idx=word2idx,
    prediction_mode=False
)

404290 preprocessed questions loaded from disk
CPU times: user 1min 49s, sys: 4.38 s, total: 1min 53s
Wall time: 1min 54s


In [4]:
# Inspect the dimensions of questions_A (recorded output: (404290, 60)).
questions_A.shape

(404290, 60)

In [10]:
# Siamese-style model: one LSTM and one embedding layer are instantiated once
# and applied to both question inputs, so the two branches share weights.
shared_lstm_layer = LSTM(
    units=100, 
    return_sequences=False,  # emit only the final output, not one per timestep
    go_backwards=True,  # process each input sequence in reverse order
    dropout=0.3
)
# Embedding table initialised from the loaded weights and kept frozen.
shared_embedding_layer = Embedding(
    input_dim=current_config.VOCABULARY_SIZE + current_config.OFFSET, 
    output_dim=current_config.EMBEDDING_DIMENSION, 
    input_length=current_config.MAX_SENTENCE_LENGTH,
    weights=[embedding_weights],
    trainable=False
)

# Branch A: token ids -> embeddings -> sentence vector.
input_A = Input(shape=(current_config.MAX_SENTENCE_LENGTH,))
embeddings_A = shared_embedding_layer(input_A)
sentence_representation_A = shared_lstm_layer(embeddings_A)

# Branch B: same layers, second question.
input_B = Input(shape=(current_config.MAX_SENTENCE_LENGTH,))
embeddings_B = shared_embedding_layer(input_B)
sentence_representation_B = shared_lstm_layer(embeddings_B)

# Classifier head on the concatenated sentence vectors.
merged_model = concatenate([sentence_representation_A, sentence_representation_B])
dropout_1 = Dropout(current_config.DROPOUT)(merged_model)
# NOTE(review): no activation is passed here, so this Dense layer is linear
# (Keras default) -- confirm that this is intended.
dense_1 = Dense(current_config.DENSE_LAYER_SIZE)(dropout_1)
dropout_2 = Dropout(current_config.DROPOUT)(dense_1)
merged = BatchNormalization()(dropout_2)

# Single sigmoid unit: one output value in (0, 1) per question pair.
predictions = Dense(1, activation='sigmoid')(merged)

model = Model(inputs=[input_A, input_B], outputs=predictions)
model.compile(optimizer='rmsprop',
              loss='binary_crossentropy',
              metrics=['accuracy'])
model.summary()

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
input_7 (InputLayer)             (None, 60)            0                                            
____________________________________________________________________________________________________
input_8 (InputLayer)             (None, 60)            0                                            
____________________________________________________________________________________________________
embedding_4 (Embedding)          (None, 60, 200)       238703400   input_7[0][0]                    
                                                                   input_8[0][0]                    
____________________________________________________________________________________________________
lstm_4 (LSTM)                    (None, 100)           120400      embedding_4[0][0]       

In [34]:
# Callbacks
# Weights-only checkpoint, rewritten only when val_loss improves, stored under
# a timestamped file name.
checkpoint_path = current_config.stamp(comment='1') + '.h5'
model_checkpoint = ModelCheckpoint(
    filepath=checkpoint_path,
    monitor='val_loss',
    save_weights_only=True,
    save_best_only=True
)
# Stop training after 3 epochs without a val_loss improvement.
early_stopping = EarlyStopping(patience=3, monitor='val_loss')
# Progress reporting: notebook progress bars and TensorBoard logs.
tqdm_bar = TQDMNotebookCallback()
tensorboard = TensorBoard()

In [37]:
%%time

training_logs = model.fit(
    x=[questions_A, questions_B], 
    y=labels, 
    epochs=50, 
    batch_size=1024,
    validation_split=0.2, 
    callbacks=[early_stopping, model_checkpoint, tensorboard, tqdm_bar]
)

Train on 323432 samples, validate on 80858 samples


Epoch 1/50


Epoch 2/50


Epoch 3/50


Epoch 4/50


Epoch 5/50


Epoch 6/50


Epoch 7/50


Epoch 8/50


Epoch 9/50


Epoch 10/50


Epoch 11/50


Epoch 12/50


Epoch 13/50


Epoch 14/50


Epoch 15/50


Epoch 16/50


Epoch 17/50


Epoch 18/50


Epoch 19/50


Epoch 20/50


Epoch 21/50


Epoch 22/50


Epoch 23/50


Epoch 24/50


Epoch 25/50
CPU times: user 1d 13h 38min 37s, sys: 8h 16min 22s, total: 1d 21h 54min 59s
Wall time: 12h 55min 53s


In [38]:
# List the working directory (shell escape) to see which files exist on disk.
!ls

20170430_2248_1.h5            extracted_questions_train.npz
20170501_0126_1.csv           [31mglove.twitter.27B.100d.txt[m[m
20170501_0326_1.h5            [31mglove.twitter.27B.200d.txt[m[m
20170501_1742_1.csv           [31mglove.twitter.27B.25d.txt[m[m
20170504_2334_1.h5            [31mglove.twitter.27B.50d.txt[m[m
20170505_0156_1.csv           [1m[36mlogs[m[m
20170505_0211_1.h5            [1m[36mlogs_2[m[m
20170505_0304_1.h5            [1m[36mlogs_3[m[m
20170505_1840_1.csv           sample_submission.csv
20170507_2315_1.csv           script.py
20170509_0055_1.h5            [1m[36mtensorflow[m[m
20170509_2249_1.h5            [31mtest.csv[m[m
20170509_2252_1.h5            [31mtrain.csv[m[m
20170509_2253_1.h5            utils.py
29042017_1.csv                vanilla_lstms_1.ipynb
README.md                     vanilla_lstms_2.ipynb
[1m[36m__pycache__[m[m                   wiki.en.vec


In [20]:
# Inspect the dimensions of questions_B (recorded output: (404290, 60)).
questions_B.shape

(404290, 60)

In [39]:
# Restore the weights written by the checkpoint callback. The path is read from
# the callback itself because the file name embeds a creation timestamp; a
# hard-coded name goes stale as soon as the callbacks cell is re-run.
model.load_weights(model_checkpoint.filepath)
# Lowest validation loss recorded over the training run.
min(training_logs.history['val_loss'])

0.49221511489207603

In [40]:
# Load the test set and preprocess it with the same config and word index used
# for the training data.
test_dataframe = pd.read_csv('test.csv')
# The third return value is not needed here. It is bound to a descriptive
# throwaway name rather than `_`, because assigning to `_` shadows IPython's
# "last output" variable for the rest of the session.
test_questions_A, test_questions_B, _unused_test_labels = extract_questions_from_dataframe(
    test_dataframe,
    config=current_config,
    word2idx=word2idx,
    prediction_mode=True
)

No saved file, preprocessing from scratch
2345796 questions preprocessed


In [41]:
%%time
predictions = model.predict(
    x=[test_questions_A, test_questions_B], 
    batch_size=8192, 
    verbose=1
)

CPU times: user 4h 24min 57s, sys: 1h 7min 12s, total: 5h 32min 9s
Wall time: 1h 41min 58s


In [42]:
# Hand the test-set predictions to utils.save_submission; the recorded output
# shows rows of `is_duplicate` scores (presumably also written to a CSV file --
# confirm in utils.py).
save_submission(predictions, current_config)

Unnamed: 0,is_duplicate
0,0.236641
1,0.299976
2,0.194677
3,0.176530
4,0.413835
5,0.319847
6,0.656540
7,0.741822
8,0.487091
9,0.129493


In [None]:
# Count the lines of an earlier CSV in the working directory (presumably a
# previous submission file -- confirm this is the file you mean to check).
!wc -l 20170505_0156_1.csv

In [None]:
# Peek at the first 100 predicted values.
predictions[:100]

In [None]:
# Show only the first rows; displaying the whole frame adds bulk to the saved
# notebook without adding information.
test_dataframe.head()