In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pylab as plt
import seaborn as sns
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import sklearn.model_selection
from keras import backend as K
from keras.models import Sequential
from keras.layers import Embedding, Dense, Dropout, Reshape, Merge, BatchNormalization, TimeDistributed, Lambda, LSTM
from keras.regularizers import l2
from keras.callbacks import Callback, ModelCheckpoint
import sklearn.metrics
from keras_tqdm import TQDMNotebookCallback

_EPSILON = K.epsilon()

df = pd.read_csv("../d/train.csv")
df_dev = pd.read_csv("../d/dev.csv")
df.head()

Using TensorFlow backend.


Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,265140,382056,120181,What are the benefits of using honey on skin?,Is it true that honey is capable of changing y...,0
1,393919,526758,526759,What cars were sold in America with a 2JZ-GTE?,Is it a good choice to join M.tech engineering...,0
2,44808,80383,80384,Is there a place between Bangalore and Pune fo...,Is there a place between Bangalore and Pune fo...,1
3,90064,151197,151198,How can I stop myself from thinking too much a...,How do I stop myself from watching porn and th...,1
4,6653,13025,13026,How can I stop being boastful?,"In you opinion, what is the best Explosions in...",0


In [28]:
def fit_tokenizer(df, num_words):
    questions = np.concatenate([df.question1.values, df.question1.values])
    tk = Tokenizer(num_words=num_words, lower=True, split=" ")
    tk.fit_on_texts(questions)
    return tk

def transform_tokenizer(df, tk):
    #maxlen = 256
    maxlen = 32
    df.question1.fillna("", inplace=True)
    df.question2.fillna("", inplace=True)
    sq1 = tk.texts_to_sequences(df.question1.values)
    sq2 = tk.texts_to_sequences(df.question2.values)
    sq1p = pad_sequences(sq1, maxlen=maxlen, dtype='int32', padding='post', truncating='post', value=0.)
    sq2p = pad_sequences(sq2, maxlen=maxlen, dtype='int32', padding='post', truncating='post', value=0.)    
    return np.array(sq1p), np.array(sq2p)

tk = fit_tokenizer(df, 5000)
q1_train, q2_train = transform_tokenizer(df, tk)
y_train = df.is_duplicate.values
q1_dev, q2_dev = transform_tokenizer(df_dev, tk)
y_dev = df_dev.is_duplicate.values

In [32]:
EMBEDDING_DIM=64
nb_words = 5000

def add_layers(model):

    model.add(Embedding(nb_words, EMBEDDING_DIM))
    model.add(LSTM(EMBEDDING_DIM, dropout=0.3, recurrent_dropout=0.3))

def get_model():
    EMBEDDING_DIM=512
    nb_words = 5000
    
    model1 = Sequential()
    add_layers(model1)
    
    model2 = Sequential()
    add_layers(model2)
    
    model = Sequential()
    model.add(Merge([model1, model2], mode='concat'))
    #model.add(BatchNormalization())
    model.add(Dense(64, activation='relu'))
    #model.add(BatchNormalization())
    model.add(Dense(32, activation='relu'))
    #model.add(BatchNormalization())
    #model.add(Dense(200, activation='relu'))
    #model.add(BatchNormalization())
    """
    model1 = Sequential()
    model1.add(Embedding(nb_words + 1, EMBEDDING_DIM, input_length=256, trainable=False))
    model1.add(TimeDistributed(Dense(EMBEDDING_DIM, activation='relu')))
    model1.add(Lambda(lambda x: K.max(x, axis=1), output_shape=(EMBEDDING_DIM, )))
    
    model2 = Sequential()
    model2.add(Embedding(nb_words + 1, EMBEDDING_DIM, input_length=256, trainable=False))
    model2.add(TimeDistributed(Dense(EMBEDDING_DIM, activation='relu')))
    model2.add(Lambda(lambda x: K.max(x, axis=1), output_shape=(EMBEDDING_DIM, )))
    
    model = Sequential()
    model.add(Merge([model1, model2], mode='concat'))
    #model.add(BatchNormalization())
    model.add(Dense(200, activation='relu'))
    #model.add(BatchNormalization())
    #model.add(Dense(200, activation='relu'))
    #model.add(BatchNormalization())
    #model.add(Dense(200, activation='relu'))
    #model.add(BatchNormalization())
    #model.add(Dense(200, activation='relu'))
    #model.add(BatchNormalization())
    """
    model.add(Dense(1, activation='sigmoid'))
    return model

def _logloss(y_true, y_pred):
    y_pred = K.clip(y_pred, _EPSILON, 1.0-_EPSILON)
    out = -(y_true * K.log(y_pred) + (1.0 - y_true) * K.log(1.0 - y_pred))
    return K.mean(out, axis=-1)

# https://github.com/bradleypallen/keras-quora-question-pairs
model = get_model()
model.compile(loss=_logloss, optimizer='adam', metrics=['accuracy', _logloss])
model.summary()




_________________________________________________________________
Layer (type)                 Output Shape              Param #   
merge_12 (Merge)             (None, 128)               0         
_________________________________________________________________
dense_46 (Dense)             (None, 64)                8256      
_________________________________________________________________
dense_47 (Dense)             (None, 32)                2080      
_________________________________________________________________
dense_48 (Dense)             (None, 1)                 33        
Total params: 716,417.0
Trainable params: 716,417
Non-trainable params: 0.0
_________________________________________________________________


In [33]:
# print('Train...')
N=20000 * 5
model.fit([q1_train[:N,:], q2_train[:N,:]], y_train[:N], epochs=10, batch_size=128, validation_data=([q1_dev, q2_dev], y_dev), verbose=2)
# score, acc = model.evaluate(x_test, y_test, batch_size=batch_size)


Train on 100000 samples, validate on 20000 samples
Epoch 1/10
130s - loss: 0.5877 - acc: 0.6949 - _logloss: 0.5877 - val_loss: 0.5374 - val_acc: 0.7358 - val__logloss: 0.5374
Epoch 2/10
126s - loss: 0.5263 - acc: 0.7436 - _logloss: 0.5263 - val_loss: 0.5365 - val_acc: 0.7379 - val__logloss: 0.5365
Epoch 3/10
127s - loss: 0.5046 - acc: 0.7556 - _logloss: 0.5046 - val_loss: 0.5251 - val_acc: 0.7440 - val__logloss: 0.5251
Epoch 4/10
126s - loss: 0.4891 - acc: 0.7636 - _logloss: 0.4891 - val_loss: 0.5245 - val_acc: 0.7398 - val__logloss: 0.5245
Epoch 5/10
127s - loss: 0.4768 - acc: 0.7703 - _logloss: 0.4768 - val_loss: 0.5332 - val_acc: 0.7413 - val__logloss: 0.5332
Epoch 6/10
126s - loss: 0.4641 - acc: 0.7761 - _logloss: 0.4641 - val_loss: 0.5300 - val_acc: 0.7402 - val__logloss: 0.5300
Epoch 7/10
126s - loss: 0.4513 - acc: 0.7817 - _logloss: 0.4513 - val_loss: 0.5373 - val_acc: 0.7444 - val__logloss: 0.5373
Epoch 8/10
126s - loss: 0.4407 - acc: 0.7883 - _logloss: 0.4407 - val_loss: 0.534

<keras.callbacks.History at 0x21d121df940>