In [6]:
import itertools
import pickle
import re

import keras.layers as lyr
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import xgboost as xgb
from keras.models import Model, load_model
from keras.preprocessing.sequence import pad_sequences
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score
from sklearn.metrics import precision_score, recall_score
from sklearn.model_selection import train_test_split

In [2]:
# Folder that holds train.csv and test.csv.
BASE_DIR = './dataset/'

# Build both file locations first, then read each CSV into its own frame.
train_csv_path = f'{BASE_DIR}train.csv'
test_csv_path = f'{BASE_DIR}test.csv'
train = pd.read_csv(train_csv_path)
test = pd.read_csv(test_csv_path)
# Alternative training input, left disabled:
# train = pd.read_csv(f'{BASE_DIR}train_preprocessed.csv')

In [3]:
# Convert the identifier column of each frame to strings, value by value.
for frame, id_column in ((train, 'id'), (test, 'test_id')):
    frame[id_column] = frame[id_column].apply(str)

In [4]:
# Stack the train and test rows into a single frame so both question columns
# can be cleaned in one place.
# NOTE(review): the original row labels are kept, so index values presumably
# repeat between the train and test parts — confirm before using df.loc.
df = pd.concat((train, test))
# df = train

# Replace missing questions with empty strings. The result is assigned back
# to the column instead of calling fillna(inplace=True) on df[...]: the
# in-place call acts on a column selection, which pandas warns about and
# which stops updating df once Copy-on-Write is enabled.
df['question1'] = df['question1'].fillna('')
df['question2'] = df['question2'].fillna('')

### Vocabulary

In [7]:
# counts_vectorizer = CountVectorizer(max_features=10000-1).fit(
#     itertools.chain(
#         df['question1'], 
#         df['question2']
#         )
#     )
# Load the pickled vectorizer from disk. The context manager closes the file
# handle, which a bare open() passed straight to pickle.load never does.
# NOTE(review): unpickling can execute arbitrary code — only load this file
# if it comes from a trusted source.
with open("./models/analysis2/bow.pkl", "rb") as bow_file:
    counts_vectorizer = pickle.load(bow_file)

# Index assigned to words that are not in the vocabulary.
# Assumes the vocabulary indices run from 0 to len - 1, so len is the first
# unused index — TODO confirm for this pickled vectorizer.
other_index = len(counts_vectorizer.vocabulary_)

In [8]:
# import pickle
# pickle.dump(counts_vectorizer, open("./models/analysis2/bow.pkl", "wb"))

In [9]:
# Compile the vectorizer's own token regex so that text is split into words
# with the same pattern the vectorizer is configured with.
token_pattern = counts_vectorizer.token_pattern
words_tokenizer = re.compile(token_pattern)

In [10]:
def create_padded_seqs(texts, max_len=10):
    """Encode each text as a fixed-length sequence of vocabulary indices.

    Every text is lower-cased and split with ``words_tokenizer``; each word
    is replaced by its index in ``counts_vectorizer.vocabulary_``, or by
    ``other_index`` when the word is not in the vocabulary. The resulting
    lists are handed to ``pad_sequences`` so they all have ``max_len`` items.

    Parameters
    ----------
    texts : pandas.Series
        Series of question strings. Missing values (NaN/None) are treated as
        empty text instead of raising on ``.lower()``.
    max_len : int, default 10
        Length passed to ``pad_sequences`` as ``maxlen``.

    Returns
    -------
    The value returned by ``pad_sequences`` for the encoded sequences.

    Notes
    -----
    NOTE(review): ``pad_sequences`` is called with its default padding
    value; per the Keras documentation that is 0, which looks like it can
    also be a real vocabulary index — confirm this overlap is acceptable.
    """
    # Look the mapping up once instead of on every word.
    vocabulary = counts_vectorizer.vocabulary_

    def encode(text):
        # Guard against non-string cells (e.g. NaN left in the column).
        if not isinstance(text, str):
            text = '' if pd.isna(text) else str(text)
        # dict.get does a single lookup and falls back to the "other" index.
        return [
            vocabulary.get(word, other_index)
            for word in words_tokenizer.findall(text.lower())
        ]

    return pad_sequences(texts.apply(encode), maxlen=max_len)

In [11]:
# Only rows with a non-null 'id' are used for the supervised splits.
labelled_rows = df[df['id'].notnull()]
duplicate_labels = labelled_rows['is_duplicate'].values

# First cut: 70% training, 30% held out, stratified on the label.
X1_train, X1_val, X2_train, X2_val, y_train, y_val = train_test_split(
    create_padded_seqs(labelled_rows['question1']),
    create_padded_seqs(labelled_rows['question2']),
    duplicate_labels,
    stratify=duplicate_labels,
    test_size=0.3,
    random_state=1989,
)

# Second cut: split the held-out part evenly into validation and test,
# again stratified on the label.
X1_val, X1_test, X2_val, X2_test, y_val, y_test = train_test_split(
    X1_val,
    X2_val,
    y_val,
    stratify=y_val,
    test_size=0.5,
    random_state=1989,
)

In [21]:
# One input per question; the shape is the per-sample shape of the padded
# index arrays.
input1_tensor = lyr.Input(X1_train.shape[1:])
input2_tensor = lyr.Input(X2_train.shape[1:])

# Size the embedding table so that every index create_padded_seqs can emit is
# valid. Deriving it only from X1_train.max() would leave an index that shows
# up solely in question2 or in the validation/test splits out of range; taking
# the max with other_index keeps the table at least as large as before.
vocab_size = max(int(other_index), int(X1_train.max())) + 1
words_embedding_layer = lyr.Embedding(vocab_size, 100)
seq_embedding_layer = lyr.LSTM(256, activation='tanh')


def seq_embedding(tensor):
    """Run a question tensor through the shared embedding and LSTM layers."""
    return seq_embedding_layer(words_embedding_layer(tensor))


# Both questions go through the same layers (shared weights); their encodings
# are combined by element-wise multiplication.
merge_layer = lyr.multiply([seq_embedding(input1_tensor), seq_embedding(input2_tensor)])

dense1_layer = lyr.Dense(16, activation='sigmoid')(merge_layer)
# The misspelled name is kept on purpose: other cells may refer to it.
ouput_layer = lyr.Dense(1, activation='sigmoid')(dense1_layer)

model = Model([input1_tensor, input2_tensor], ouput_layer)

model.compile(loss='binary_crossentropy', optimizer='adam')
model.summary()

Model: "model_1"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_3 (InputLayer)           [(None, 10)]         0           []                               
                                                                                                  
 input_4 (InputLayer)           [(None, 10)]         0           []                               
                                                                                                  
 embedding_1 (Embedding)        (None, 10, 100)      1000000     ['input_3[0][0]',                
                                                                  'input_4[0][0]']                
                                                                                                  
 lstm_1 (LSTM)                  (None, 256)          365568      ['embedding_1[0][0]',      

In [28]:
# Restore a previously saved network from its HDF5 file. This rebinds
# `model`, so whatever the name pointed to before is no longer used.
saved_model_path = "./models/analysis2/model.h5"
model = load_model(saved_model_path)

In [22]:
# model.fit([X1_train, X2_train], y_train, 
#           validation_data=([X1_val, X2_val], y_val), 
#           batch_size=128, epochs=6, verbose=2)

Epoch 1/6
2211/2211 - 354s - loss: 0.5196 - val_loss: 0.4758 - 354s/epoch - 160ms/step
Epoch 2/6
2211/2211 - 318s - loss: 0.4430 - val_loss: 0.4468 - 318s/epoch - 144ms/step
Epoch 3/6
2211/2211 - 378s - loss: 0.3918 - val_loss: 0.4338 - 378s/epoch - 171ms/step
Epoch 4/6
2211/2211 - 319s - loss: 0.3466 - val_loss: 0.4277 - 319s/epoch - 144ms/step
Epoch 5/6
2211/2211 - 326s - loss: 0.3058 - val_loss: 0.4304 - 326s/epoch - 147ms/step
Epoch 6/6
2211/2211 - 309s - loss: 0.2663 - val_loss: 0.4469 - 309s/epoch - 140ms/step


<keras.callbacks.History at 0x1b2be909580>

In [24]:
# model.save("./models/analysis2/model.h5")

In [29]:
# Score the test pairs; 128 is the batch size, spelled out by keyword here.
test_inputs = [X1_test, X2_test]
y_pred = model.predict(test_inputs, batch_size=128)

In [30]:
# Turn the model outputs into hard labels: strictly above 0.5 becomes 1,
# everything else becomes 0.
above_threshold = y_pred > 0.5
y_pred = np.where(above_threshold, 1, 0)

In [31]:
# Report the test-set metrics for the thresholded predictions. The metric
# functions are imported in the notebook's import cell rather than here, so
# all dependencies are declared in one place.
print("Accuracy Score : ", accuracy_score(y_test, y_pred))
print("F1 Score : ", f1_score(y_test, y_pred))
print("Precision : ", precision_score(y_test, y_pred))
print("Recall : ", recall_score(y_test, y_pred))

Accuracy Score :  0.8106325440274388
F1 Score :  0.7459741638648026
Precision :  0.7389779998246998
Recall :  0.7531040643144261


In [14]:
# features_model = Model([input1_tensor, input2_tensor], merge_layer)
# features_model.compile(loss='mse', optimizer='adam')
# F_train = features_model.predict([X1_train, X2_train], batch_size=128)
# F_val = features_model.predict([X1_val, X2_val], batch_size=128)

In [15]:
# import pickle
# pickle.dump(merge_layer, open("./models/analysis2/merge_layer_unprocessed.pkl", "wb"))