In [139]:
import pickle as pc

import numpy as np
import pandas as pd

from nltk import word_tokenize
#from nltk.corpus import stopwords

from sklearn.model_selection import StratifiedKFold, StratifiedShuffleSplit
from gensim.models.wrappers.fasttext import FastTextKeyedVectors

from keras.models import Model, Sequential
from keras.layers import *
from keras import backend as K
from keras.callbacks import EarlyStopping, ModelCheckpoint

In [50]:
# Hyper-parameters shared by the cells below.
MAX_SEQUENCE_LENGTH = 100  # token ids kept per question (pad length and Input shape)
EMBEDDING_DIM = 300        # NOTE(review): not referenced in the visible cells
MAX_EPOCHS = 200           # upper bound on epochs handed to model.fit

# Seed NumPy's global RNG so NumPy-driven randomness is repeatable between runs.
RANDOM_SEED = 42
np.random.seed(seed=RANDOM_SEED)

### Data read

In [9]:
# Read the question-pair training table and preview its first five rows.
df = pd.read_csv(filepath_or_buffer='data/quora-train.csv')
df.head(n=5)

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate,wmd
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0,0.64958
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0,2.160659
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0,2.322309
3,3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0,5.895938
4,4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0,3.598486


### CV (single stratified 70/30 train/test split)

In [16]:
# Hold out 30% of the question pairs. The split is stratified on the label so
# both partitions keep the same duplicate / non-duplicate ratio.
x = df[['question1', 'question2']]
y = df['is_duplicate']

# Seed the splitter from the notebook-wide constant instead of repeating the
# literal, so changing RANDOM_SEED changes the split as well.
sss = StratifiedShuffleSplit(n_splits=1, test_size=0.3, random_state=RANDOM_SEED)
train_indexes, test_indexes = next(sss.split(x, y))

# split() yields positional indices, hence iloc rather than loc.
train = df.iloc[train_indexes]
test = df.iloc[test_indexes]

# Every row must land in exactly one partition.
assert len(train) + len(test) == len(df)

### Sequences

In [37]:
# Load the pre-trained fastText vectors stored in word2vec text format.
model = FastTextKeyedVectors.load_word2vec_format('model/fasttext/quora.vec')

# Set of known words, used for fast membership tests while tokenising.
all_words = set(model.vocab.keys())

# Map every word to the row it occupies in the loaded vector matrix.
# The ids fed to the network must be those row numbers, because the embedding
# layer built from these vectors is indexed by them. Enumerating the unordered
# `all_words` set hands out arbitrary ids, so each token would be looked up as
# some other word's vector.
int_vocab = {word: entry.index for word, entry in model.vocab.items()}

In [111]:
# Padding token: an obscure entry assumed to exist in the fastText vocabulary
# (it must be a key of `int_vocab`), reused to fill short sequences.
pad_word = "与己方便" # strange word from vocab

def extract_sequence(question):
    """Turn one question into exactly MAX_SEQUENCE_LENGTH vocabulary ids.

    The text is lower-cased, tokenised with ``word_tokenize`` and stripped of
    tokens missing from ``all_words``. The remaining tokens are cut to
    MAX_SEQUENCE_LENGTH, or right-padded with ``pad_word`` up to that length,
    and finally mapped through ``int_vocab``.

    Parameters
    ----------
    question : str
        Raw question text. Any non-string value (e.g. the float NaN pandas
        uses for an empty CSV cell) is treated as an empty question.

    Returns
    -------
    list of int
        Always MAX_SEQUENCE_LENGTH ids, so stacked results form a rectangular
        array matching the network's fixed input shape.
    """
    # Missing values are not strings and have no .lower(); encode them as
    # "no tokens" rather than crashing in the middle of a column.
    text = question if isinstance(question, str) else ""
    tokens = [word for word in word_tokenize(text.lower()) if word in all_words]
    # Long questions used to pass through untruncated, producing ragged rows
    # that cannot be stacked or fed to a fixed-length Input.
    tokens = tokens[:MAX_SEQUENCE_LENGTH]
    tokens += [pad_word] * (MAX_SEQUENCE_LENGTH - len(tokens))
    return [int_vocab[word] for word in tokens]

def extract_sequence_column(questions):
    """Encode every question of an iterable with ``extract_sequence``.

    Returns a list holding one id sequence per input question, in input order.
    """
    return [extract_sequence(question) for question in questions]

In [135]:
# Id sequences for the first question of every training pair.
train_q1_sequences = np.asarray(extract_sequence_column(questions=train['question1']))

In [137]:
# Id sequences for the second question of every training pair.
train_q2_sequences = np.asarray(extract_sequence_column(questions=train['question2']))

In [143]:
# Same encoding for both question columns of the held-out pairs.
test_q1_sequences = np.asarray(extract_sequence_column(questions=test['question1']))
test_q2_sequences = np.asarray(extract_sequence_column(questions=test['question2']))

In [144]:
def _dump_pickle(obj, path):
    """Pickle ``obj`` to ``path``.

    The file is opened in a ``with`` block so it is flushed and closed even
    when pickling raises; a bare ``open(...)`` argument is never closed
    explicitly.
    """
    with open(path, 'wb') as handle:
        pc.dump(obj, handle)

# Cache the encoded sequences on disk so the tokenisation above does not have
# to be repeated.
_dump_pickle(train_q1_sequences, 'data/train_q1_sequences.pickle')
_dump_pickle(train_q2_sequences, 'data/train_q2_sequences.pickle')
_dump_pickle(test_q1_sequences, 'data/test_q1_sequences.pickle')
_dump_pickle(test_q2_sequences, 'data/test_q2_sequences.pickle')

### Embedding and model

In [46]:
# Embedding layer obtained from the loaded word vectors
# (presumably initialised with the fastText weights; confirm against gensim).
embedding_layer = model.get_embedding_layer()

# One bidirectional LSTM instance, created once so it can be applied to both
# question inputs.
lstm_layer = Bidirectional(
    LSTM(units=300, dropout=0.332, recurrent_dropout=0.2)
)

In [51]:
# Branch 1: integer token ids of the first question -> embedding -> BiLSTM.
sequence_1_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
embedded_sequences_1 = embedding_layer(sequence_1_input)
x1 = lstm_layer(embedded_sequences_1)

# Branch 2: the second question goes through the very same layer objects,
# so both branches share their weights.
sequence_2_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
embedded_sequences_2 = embedding_layer(sequence_2_input)
x2 = lstm_layer(embedded_sequences_2)

# Extra scalar feature per pair (fed from the `wmd` column at fit time).
wmd_input = Input(shape=(1, ))

# Join both encodings with the scalar feature, then regularise and normalise.
merged = concatenate([x1, x2, wmd_input])
merged = Dropout(0.4)(merged)
merged = BatchNormalization()(merged)

# Single hidden dense layer, again followed by dropout and batch norm.
merged = Dense(130, activation='relu')(merged)
merged = Dropout(0.075)(merged)
merged = BatchNormalization()(merged)

# One sigmoid unit: the binary target is `is_duplicate`.
output = Dense(1, activation='sigmoid')(merged)

# NOTE(review): this rebinds `model`. Until this cell it referred to the
# object returned by FastTextKeyedVectors.load_word2vec_format, so re-running
# the earlier cells that use model.vocab / model.get_embedding_layer() after
# this one will fail unless the vectors are reloaded first.
model = Model(inputs=[sequence_1_input, sequence_2_input, wmd_input],outputs=output)
model.compile(loss='binary_crossentropy',optimizer='nadam',metrics=['accuracy'])

In [None]:
model_checkpoint_path = 'model/fold-checkpoint.h5'

# Every pair is presented in both orders, (q1, q2) and (q2, q1), so the
# per-pair feature and the labels are repeated twice to stay aligned.
X_train_q1 = np.vstack((train_q1_sequences, train_q2_sequences))
X_train_q2 = np.vstack((train_q2_sequences, train_q1_sequences))
X_train_wmd = np.concatenate([np.array(train['wmd'])] * 2)
y_train = np.concatenate([train['is_duplicate']] * 2)

# Held-out pairs get the same mirrored layout.
X_val_q1 = np.vstack((test_q1_sequences, test_q2_sequences))
X_val_q2 = np.vstack((test_q2_sequences, test_q1_sequences))
X_test_wmd = np.concatenate([np.array(test['wmd'])] * 2)
y_val = np.concatenate([test['is_duplicate']] * 2)

# Train.
model.fit(
    [X_train_q1, X_train_q2, X_train_wmd],
    y_train,
    validation_data=([X_val_q1, X_val_q2, X_test_wmd], y_val),
    batch_size=128,
    epochs=MAX_EPOCHS,
    verbose=1,
    callbacks=[
        # Give up once val_loss has failed to improve by at least 0.001
        # for three epochs.
        EarlyStopping(monitor='val_loss', min_delta=0.001, patience=3,
                      verbose=1, mode='auto'),
        # Write a checkpoint only when val_loss improves.
        ModelCheckpoint(model_checkpoint_path, monitor='val_loss',
                        save_best_only=True, verbose=2),
    ],
)

# Restore the best epoch.
model.load_weights(model_checkpoint_path)