In [67]:
import pandas as pd
import numpy as np
from collections import defaultdict
from prepare_data import preparing, get_word_list, get_embedding, extract_features, final_prepare
from keras.preprocessing.text import Tokenizer
from NLP import extract_nlp_features
import Non_NLP

from tensorflow.keras.layers import Dense, Input, LSTM, Embedding, Dropout,BatchNormalization
from tensorflow.keras.layers import concatenate, add, Lambda,multiply, GaussianNoise
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from sklearn.metrics import accuracy_score

In [24]:
import importlib
importlib.reload(Non_NLP)

<module 'Non_NLP' from '/Users/sizhenhan/Documents/quora-question-pairs/Non_NLP.py'>

In [4]:
data = pd.read_csv("train.csv")

In [5]:
X_train, X_test, y_train, y_test = preparing(data)

In [6]:
words = get_word_list(X_train)
embeddings = get_embedding(words)
words = embeddings.keys()

In [7]:
train1,train2, features_train = extract_features(X_train,words)
test1,test2, features_test = extract_features(X_test,words)

In [8]:
tokenizer = Tokenizer(filters="")
tokenizer.fit_on_texts(np.append(train1, train2))

In [9]:
data_train1, data_train2 = final_prepare(train1,train2,tokenizer)
data_test1, data_test2 = final_prepare(test1,test2,tokenizer)

In [10]:
y_train = np.array(y_train)
embedding_matrix = np.zeros((len(words)+1, 300))

for i, word in enumerate(words):
    embedding_matrix[i] = embeddings.get(word)

In [31]:
features_train

array([[0., 1., 0., 0.],
       [0., 2., 0., 0.],
       [0., 2., 0., 0.],
       ...,
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 5., 0., 0.]])

## NLP Features 

In [11]:
X_train2 = X_train.copy()
X_test2 = X_test.copy()

In [12]:
X_nlp_features_train = extract_nlp_features(X_train2)
X_nlp_features_test = extract_nlp_features(X_test2)

In [29]:
X_nlp_features_train.head()

Unnamed: 0,common_word_min,common_word_max,common_stop_min,common_stop_max,common_token_min,common_token_max,last_word_equal,first_word_equal,abs_len_diff,mean_len,token_set_ratio,token_sort_ratio,fuzz_ratio,fuzz_partial_ratio,longest_substr_ratio
70052,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8.0,11.0,43,43,31,52,0.184211
321015,0.499988,0.28571,0.599988,0.374995,0.555549,0.333331,0.0,1.0,6.0,12.0,60,45,46,49,0.18
268021,0.428565,0.374995,0.0,0.0,0.230767,0.199999,0.0,0.0,2.0,14.0,60,51,28,29,0.219178
312808,0.999967,0.999967,0.666644,0.499988,0.833319,0.714276,1.0,1.0,1.0,6.5,95,85,88,81,0.515152
265818,0.9999,0.9999,0.99995,0.666644,0.999967,0.749981,1.0,1.0,1.0,3.5,100,92,92,83,0.615385


## Non NLP features

In [13]:
X_train3 = X_train.copy()
X_test3 = X_test.copy()

In [26]:
X_nonnlp_features_train, X_nonnlp_features_test = Non_NLP.extract_nonnlp_features(X_train3,X_test3)

In [28]:
X_nonnlp_features_train.head()

Unnamed: 0,min_kcore,max_kcore,common_neighbor_ratio,common_neighbor_count,min_freq,max_freq
70052,0,0,0.0,0.0,1,5
321015,0,2,0.0,0.0,1,4
268021,0,0,0.0,0.0,1,1
312808,0,2,0.5,1.0,2,2
265818,0,3,0.0,0.0,1,2


## Merge Features

In [32]:
features_train = np.hstack((features_train, X_nlp_features_train, X_nonnlp_features_train))
features_test = np.hstack((features_test, X_nlp_features_test, X_nonnlp_features_test))

## Training 

In [60]:
input1 = Input(shape=(data_train1.shape[1],))
embedding1 = Embedding(len(words)+1,300,weights=[embedding_matrix],
                       input_length=data_train1.shape[1],trainable=False)(input1)
x1 = LSTM(75, recurrent_dropout=0.2)(embedding1)

input2 = Input(shape=(data_train1.shape[1],))
embedding2 = Embedding(len(words)+1,300,weights=[embedding_matrix],
                       input_length=data_train1.shape[1],trainable=False)(input2)
x2 = LSTM(75, recurrent_dropout=0.2)(embedding2)

input3 = Input(shape=(features_train.shape[1],))
dense_feature = BatchNormalization()(input3)
dense_feature = Dense(200, activation="relu")(dense_feature)
dense_feature = Dropout(0.2)(dense_feature)

addition = add([x1, x2])
x2_negative = Lambda(lambda x: -x)(x2)
subtraction = add([x1, x2_negative])
subtraction = multiply([subtraction, subtraction])
merged = concatenate([subtraction, addition])
merged = Dropout(0.4)(merged)

merged = concatenate([merged, dense_feature])
merged = BatchNormalization()(merged)
merged = GaussianNoise(0.1)(merged)

merged = Dense(150, activation="relu")(merged)
merged = Dropout(0.2)(merged)
merged = BatchNormalization()(merged)

out = Dense(1, activation="sigmoid")(merged)

model = Model(inputs=[input1, input2, input3], outputs=out)
model.compile(loss="binary_crossentropy",
                  optimizer="nadam")
model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_14 (InputLayer)           [(None, 30)]         0                                            
__________________________________________________________________________________________________
input_13 (InputLayer)           [(None, 30)]         0                                            
__________________________________________________________________________________________________
embedding_9 (Embedding)         (None, 30, 300)      1029600     input_14[0][0]                   
__________________________________________________________________________________________________
embedding_8 (Embedding)         (None, 30, 300)      1029600     input_13[0][0]                   
______________________________________________________________________________________________

In [63]:
model.fit([data_train1, data_train2, features_train], y_train,
                     epochs=15, batch_size= 512, verbose=1)

W1111 14:46:27.581120 4562367936 deprecation.py:323] From /Users/sizhenhan/anaconda3/lib/python3.7/site-packages/tensorflow/python/ops/math_grad.py:1250: add_dispatch_support.<locals>.wrapper (from tensorflow.python.ops.array_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


Train on 323432 samples
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


<tensorflow.python.keras.callbacks.History at 0x1a67ad89b0>

In [64]:
pred = model.predict([data_test1, data_test2, features_test])

In [66]:
preds = []
for p in pred:
    if p > 0.5:
        preds.append(1)
    else:
        preds.append(0)

In [68]:
accuracy_score(preds,y_test)

0.8884464122288457