In [51]:
import sys
sys.path.append('../scripts')

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import helpers as hp
import pickle as pkl
import os

from keras.layers import Input, Dense, LSTM, Embedding, Merge
from keras.models import Model
from keras.preprocessing.sequence import pad_sequences
from keras.optimizers import Adadelta
from keras.callbacks import Callback
from keras import backend as K

from sklearn.metrics import f1_score, precision_score, recall_score,\
                            average_precision_score, roc_auc_score,\
                            roc_curve, precision_recall_curve, confusion_matrix,\
                            accuracy_score

from IPython.core.interactiveshell import InteractiveShell
from matplotlib import rcParams
from importlib import reload

rcParams['font.family'] = 'serif'
rcParams['font.serif'] = 'times new roman'

%config InlineBackend.figure_format = 'retina'
%matplotlib inline

# Data Preprocessing

In [2]:
# Load the pickled GloVe map.
# NOTE(review): the name `map` shadows the Python builtin; it is kept because
# other cells may refer to it. `pickle` can execute arbitrary code on load, so
# this file should only ever come from a trusted source.
glove_map_path = '../data/embeddings/glove-300.map'
with open(glove_map_path, 'rb') as f:
    map = pkl.load(f)

# Locations of the data folder and of the amazon-google split inside it.
data_dir = os.path.join('..', 'data')
source_dir = os.path.join(data_dir, 'split', 'amazon-google')

# `data` is indexed by split name below (e.g. data['train_1']).
data = hp.load_data(source_dir)

# Keys of the record tables, grouped by side of the pair.
datasets = [
    'train_1', 'val_1', 'test_1',
    'train_2', 'val_2', 'test_2',
]

# Drop the columns that do not contain embeddings: the side-specific id
# column ('id1' for the *_1 tables, 'id2' for the *_2 tables) and 'price'.
for id_column, names in (('id1', ('train_1', 'val_1', 'test_1')),
                         ('id2', ('train_2', 'val_2', 'test_2'))):
    for name in names:
        data[name] = data[name].drop([id_column, 'price'], axis='columns')


# concatenate all embedding columns so each record becomes one giant list
def concat_columns(x):
    """Join one record's 'title', 'description' and 'manufacturer' values.

    Parameters
    ----------
    x : mapping
        A record supporting ``x['title']``, ``x['description']`` and
        ``x['manufacturer']`` (e.g. a DataFrame row).

    Returns
    -------
    The three values combined with ``+`` in that order.
    """
    title, description, manufacturer = x['title'], x['description'], x['manufacturer']
    return title + description + manufacturer

# Every record ends up as a fixed-length sequence of `maxlen` indices:
# shorter sequences are zero-padded and longer ones are cut down to `maxlen`
# (see the Keras `pad_sequences` documentation for which end is affected).
maxlen = 500

for df_name in datasets:
    # Collapse the remaining columns of each row into one sequence ...
    sequences = data[df_name].apply(concat_columns, axis='columns')
    # ... then pad / truncate it to the common length.
    data[df_name] = pad_sequences(sequences, maxlen=maxlen)

# Build Model

In [5]:
# Embedding matrix stored as a .npy file; it is used below as the initial
# weights of the embedding layer.
glove_matrix_path = os.path.join(data_dir, 'embeddings', 'glove-300.matrix.npy')
glove_matrix = np.load(glove_matrix_path)

In [6]:
# Hyper-parameters: number of LSTM units, and the gradient-norm clipping
# value that is passed to the optimizer when the model is compiled.
n_hidden = 50
gradient_clipping_norm = 1.25

# One input per side of a record pair; each is a sequence of `maxlen` indices.
input_left = Input(shape=(maxlen,))
input_right = Input(shape=(maxlen,))

# A single embedding layer, initialised from `glove_matrix` and frozen
# (trainable=False), is applied to both inputs.
embedding_layer = Embedding(glove_matrix.shape[0],
                            glove_matrix.shape[1],
                            weights=[glove_matrix],
                            input_length=maxlen,
                            trainable=False)
embedded_left = embedding_layer(input_left)
embedded_right = embedding_layer(input_right)

# A single LSTM encodes both sides, so the two branches share weights.
# Each output is named after the input it encodes: left -> left, right -> right.
lstm_layer = LSTM(n_hidden)
output_left = lstm_layer(embedded_left)
output_right = lstm_layer(embedded_right)

def distance_metric(x1, x2):
    """Return exp(-sum(|x1 - x2|)) computed along axis 1.

    The sum keeps its reduced dimension (``keepdims=True``). Because the
    exponent is never positive, the result lies in (0, 1] and equals 1 when
    the two inputs are identical.
    """
    abs_difference = K.abs(x1 - x2)
    return K.exp(-K.sum(abs_difference, axis=1, keepdims=True))

# Combine the two encodings into a single score per pair via `distance_metric`.
# NOTE(review): `Merge` is a legacy Keras layer that is not available in newer
# releases (a `Lambda` layer is the usual replacement) - confirm against the
# Keras version this notebook is pinned to.
similarity_fn = lambda encodings: distance_metric(encodings[0], encodings[1])
similarity_shape = lambda shapes: (shapes[0][0], 1)

merge_layer = Merge(mode=similarity_fn, output_shape=similarity_shape)
distance = merge_layer([output_left, output_right])

# Full model: two index sequences in, one score out.
malstm = Model([input_left, input_right], [distance])



In [7]:
# Adadelta with gradient-norm clipping; the model is trained with a
# mean-squared-error loss and reports accuracy while fitting.
optimizer = Adadelta(clipnorm=gradient_clipping_norm)

malstm.compile(optimizer=optimizer,
               loss='mean_squared_error',
               metrics=['accuracy'])

In [43]:
batch_size = 2048
epochs = 50

# `history` collects the metrics of every training run. Create it on first
# use so this cell works on a fresh kernel, while re-running the cell (to
# continue training) keeps the histories of earlier runs.
try:
    history
except NameError:
    history = []

# class_weight makes errors on class 1 count five times as much as errors on
# class 0; batches are reshuffled every epoch.
malstm_trained = malstm.fit([data['train_1'], data['train_2']],
                            data['train_y'],
                            batch_size=batch_size,
                            epochs=epochs,
                            class_weight={0:1, 1:5},
                            shuffle=True)
history.append(malstm_trained.history)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [44]:
# Score the training pairs, then display the confusion matrix obtained by
# labelling every score >= 0.5 as positive.
train_inputs = [data['train_1'], data['train_2']]
y_train_pred = malstm.predict(train_inputs)
confusion_matrix(data['train_y'], y_train_pred >= 0.5)

array([[9093,  259],
       [ 287,  761]])

In [45]:
# Score the validation pairs, then display the confusion matrix obtained by
# labelling every score >= 0.5 as positive (same check as for the training
# set; this is the expression that produces the matrix shown as cell output).
y_val_pred = malstm.predict([data['val_1'], data['val_2']])
confusion_matrix(data['val_y'], y_val_pred >= 0.5)

array([[1138,   49],
       [  75,   38]])

In [78]:
# Training-set metrics at the chosen decision threshold, printed in the
# order: confusion matrix, accuracy, F1.
threshold = 0.5
y_train_label = y_train_pred >= threshold
for metric in (confusion_matrix, accuracy_score, f1_score):
    print(metric(data['train_y'], y_train_label))

[[9093  259]
 [ 287  761]]
0.9475
0.7359767891682786


In [79]:
# Validation-set metrics at the chosen decision threshold, printed in the
# order: accuracy, confusion matrix, F1.
threshold = 0.5
y_val_label = y_val_pred >= threshold
for metric in (accuracy_score, confusion_matrix, f1_score):
    print(metric(data['val_y'], y_val_label))

0.9046153846153846
[[1138   49]
 [  75   38]]
0.38000000000000006
