In [1]:
import sys
sys.path.append('../scripts')

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import helpers as hp
import pickle as pkl
import helpers as hp
import os

from keras.layers import Input, Dense, LSTM, Embedding, Merge
from keras.models import Model
from keras.preprocessing.sequence import pad_sequences
from keras.optimizers import Adadelta
from keras.callbacks import Callback
from keras import backend as K

from sklearn.metrics import f1_score, precision_score, recall_score,\
                            average_precision_score, roc_auc_score,\
                            roc_curve, precision_recall_curve, confusion_matrix,\
                            accuracy_score

from IPython.core.interactiveshell import InteractiveShell
from matplotlib import rcParams
from importlib import reload
from harness import TrainingHarness

# Draw all figure text in a serif face, with 'times new roman' as the serif font.
rcParams.update({
    'font.family': 'serif',
    'font.serif': 'times new roman',
})

%config InlineBackend.figure_format = 'retina'
%matplotlib inline

Using TensorFlow backend.


# Data Preprocessing

In [2]:
# All inputs live under ../data; the Amazon-Google split is a sub-folder.
data_dir = os.path.join('..', 'data')
source_dir = os.path.join(data_dir, 'split', 'amazon-google')

# Unpickle the GloVe map (presumably token -> embedding index; verify against
# the script that produced it).
# NOTE(review): the name `map` hides Python's builtin `map` for the rest of the
# session; it is kept here because later cells may rely on it.
# NOTE(review): unpickling can execute arbitrary code, so only load trusted files.
with open('../data/embeddings/glove-300.map', 'rb') as f:
    map = pkl.load(f)

# Load the dataset splits through the project helper.
data = hp.load_data(source_dir)

# Keys of the six record tables, in the order
# train_1, val_1, test_1, train_2, val_2, test_2.
datasets = [split + '_' + side
            for side in ('1', '2')
            for split in ('train', 'val', 'test')]

# Drop the columns that do not contain embeddings: every table loses 'price'
# plus its own id column ('id1' for the *_1 tables, 'id2' for the *_2 tables).
for df_name in datasets:
    side = df_name[-1]  # trailing '1' or '2' of the table name
    data[df_name] = data[df_name].drop(['id' + side, 'price'], axis = 'columns')


# concatenate all embedding columns so each record becomes one giant list
def concat_columns(x):
    """Combine one record's title, description and manufacturer into one value.

    Intended for row-wise use, e.g. ``df.apply(concat_columns, axis='columns')``.

    Parameters
    ----------
    x : mapping-like record
        Must be indexable by the keys 'title', 'description' and
        'manufacturer'. The values are assumed to be lists of embedding
        indices, so that ``+`` concatenates them (TODO confirm against the
        data loader).

    Returns
    -------
    The three values joined with ``+`` in the order title, description,
    manufacturer. The inputs themselves are not modified.
    """
    fields = ('title', 'description', 'manufacturer')
    combined = x[fields[0]]
    for field in fields[1:]:
        # Plain `+` (not `+=`) so the record's own values are never mutated.
        combined = combined + x[field]
    return combined

# Every record is turned into a fixed-length sequence of `maxlen` indices:
# shorter sequences are padded with zeros, longer ones are cut down to size.
# (With the Keras defaults, both padding and truncation act on the front of
# the sequence.)
maxlen = 500

for df_name in datasets:
    # First flatten each record into one index list, then pad / truncate it.
    data[df_name] = pad_sequences(
        data[df_name].apply(concat_columns, axis='columns'),
        maxlen=maxlen,
    )

# Build Model

In [3]:
# Read the GloVe embedding matrix stored as a .npy file under data/embeddings.
glove_matrix_path = os.path.join(data_dir, 'embeddings', 'glove-300.matrix.npy')
glove_matrix = np.load(glove_matrix_path)

In [24]:
n_hidden = 50  # LSTM width; currently only used by the commented-out draft below

# Frozen embedding lookup initialised from the GloVe matrix *without its first
# row*, so lookup index i returns glove_matrix[i + 1].
# NOTE(review): confirm this offset matches the indices stored in the data.
# With it, the zero used for padding resolves to glove_matrix[1] rather than
# row 0 (see the all-zero probe further down in this notebook).
vocab_rows = glove_matrix.shape[0]
embedding_dim = glove_matrix.shape[1]

input_left = Input(shape=(maxlen,))
embedding_layer = Embedding(input_dim=vocab_rows - 1,
                            output_dim=embedding_dim,
                            weights=[glove_matrix[1:]],
                            input_length=maxlen,
                            trainable=False)
embedded_left = embedding_layer(input_left)

# Draft of the siamese LSTM + distance head, currently disabled.
# NOTE(review): the draft refers to `embedded_right`, which is not defined in
# this cell, and pairs the left output with the right embedding (and vice versa).
#
# lstm_layer = LSTM(n_hidden)
# output_left = lstm_layer(embedded_right)
# output_right = lstm_layer(embedded_left)
#
# def distance_metric(x1, x2):
#     l1 = -K.sum(K.abs(x1 - x2), axis=1, keepdims=True)
#     return K.exp(l1)
#
# merge_layer = Merge(mode=lambda x: distance_metric(x[0], x[1]),
#                     output_shape=lambda x: (x[0][0], 1))
# distance = merge_layer([output_left, output_right])

# For now the model only maps the left input to its embedded sequence.
malstm = Model([input_left], [embedded_left])

# Pre-training

In [5]:
# Settings handed to the harness as `compile_args` (presumably forwarded to
# model.compile; verify in harness.py).
compile_args = {'optimizer': 'adam', 'loss': 'mean_squared_error'}

# Settings handed to the harness as `fit_args` (presumably forwarded to
# model.fit; verify in harness.py).
# NOTE(review): two input arrays are supplied per sample here, while `malstm`
# is built from a single Input in this notebook. Confirm they agree before
# training.
train_inputs = [data['train_1'], data['train_2']]
val_inputs = [data['val_1'], data['val_2']]
fit_args = {
    'x': train_inputs,
    'y': data['train_y'],
    'epochs': 10,
    'batch_size': 2048,
    'validation_data': (val_inputs, data['val_y']),
}

th = TrainingHarness(
    malstm,
    n_checkpoints=10,
    compile_args=compile_args,
    fit_args=fit_args,
    npy_embedding_matrix='../data/embeddings/glove-300.matrix.npy',
)

In [29]:
# Probe input for the model: a batch holding one sequence of 500 zeros,
# i.e. an array of shape (1, 500).
test_instance = np.zeros((1, 500))
# Index stored at the first position of that sequence (0, like every other position).
test_instance[0, 0] = 0

In [30]:
# Run the all-zero probe through the model and display the result. In the
# saved output every position shows the same vector, which begins with the
# same values as glove_matrix[1, :] printed further down.
malstm.predict([test_instance])

array([[[-0.082752  ,  0.67203999, -0.14986999, ..., -0.1918    ,
         -0.37845999, -0.06589   ],
        [-0.082752  ,  0.67203999, -0.14986999, ..., -0.1918    ,
         -0.37845999, -0.06589   ],
        [-0.082752  ,  0.67203999, -0.14986999, ..., -0.1918    ,
         -0.37845999, -0.06589   ],
        ..., 
        [-0.082752  ,  0.67203999, -0.14986999, ..., -0.1918    ,
         -0.37845999, -0.06589   ],
        [-0.082752  ,  0.67203999, -0.14986999, ..., -0.1918    ,
         -0.37845999, -0.06589   ],
        [-0.082752  ,  0.67203999, -0.14986999, ..., -0.1918    ,
         -0.37845999, -0.06589   ]]], dtype=float32)

In [22]:
# Display the row at index 1 of the GloVe matrix, for comparison with the
# model output printed above.
glove_matrix[1,:]

array([-0.082752  ,  0.67203999, -0.14986999, -0.064983  ,  0.056491  ,
        0.40228   ,  0.0027747 , -0.33109999, -0.30691001,  2.08170009,
        0.031819  ,  0.013643  ,  0.30265   ,  0.0071297 , -0.5819    ,
       -0.27739999, -0.062254  ,  1.1451    , -0.24232   ,  0.1235    ,
       -0.12243   ,  0.33151999, -0.006162  , -0.30541   , -0.13056999,
       -0.054601  ,  0.037083  , -0.070552  ,  0.58929998, -0.30385   ,
        0.28979999, -0.14653   , -0.27052   ,  0.37160999,  0.32031   ,
       -0.29124999,  0.0052483 , -0.13212   , -0.052736  ,  0.087349  ,
       -0.26668   , -0.16897   ,  0.015162  , -0.0083746 , -0.14871   ,
        0.23413   , -0.20719001, -0.091386  ,  0.40075001, -0.17223001,
        0.18144999,  0.37586001, -0.28681999,  0.37289   , -0.16185001,
        0.18008   ,  0.30320001, -0.13215999,  0.18352   ,  0.095759  ,
        0.094916  ,  0.008289  ,  0.11761   ,  0.34046   ,  0.03677   ,
       -0.29076999,  0.058303  , -0.027814  ,  0.082941  ,  0.18