In [1]:
import feats
import constants
import transactions


import os
import pickle
import numpy as np
import pandas as pd

import keras
from keras.models import Sequential
from keras.layers.recurrent import LSTM
from keras.layers.core import Dense, Activation, Dropout
from keras.layers.embeddings import Embedding
from keras.callbacks import ModelCheckpoint
from keras.preprocessing.sequence import pad_sequences

from sklearn.utils import shuffle
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

import logging

Using TensorFlow backend.


In [2]:
import pdb

In [12]:
class AucComputer(keras.callbacks.Callback):
    """Keras callback that reports ROC AUC on the validation data after each epoch.

    The score is stored in the epoch's ``logs`` dict under ``'val_auc'`` and
    printed.

    NOTE(review): relies on ``self.validation_data`` being populated by Keras
    as ``[inputs, targets, ...]`` -- confirm for the Keras version in use.
    """

    def on_epoch_end(self, epoch, logs=None):
        """Compute validation AUC, record it in ``logs`` and print it.

        Args:
            epoch: Zero-based index of the epoch that just finished.
            logs: Metrics dict supplied by Keras; a fresh dict is used when
                it is omitted.
        """
        # A mutable default (`logs={}`) would be shared across calls, so values
        # written below would persist in the default object between epochs.
        if logs is None:
            logs = {}
        y_pred = self.model.predict(self.validation_data[0], batch_size=2048)
        logs['val_auc'] = roc_auc_score(self.validation_data[1], y_pred)
        print('epoch {}, val auc {}'.format(epoch, logs['val_auc']))

- Leak Problem

In [4]:
# Load the pre-computed symbol sequences.
# NOTE(security): pickle.load can execute arbitrary code; only load feature
# files that were produced by this project.
with open(constants.FEAT_DATA_DIR + 'up_airr_sym.pkl', 'rb') as f:
    up_airr_sym = pickle.load(f)
tle = transactions.TransLogExtractor(constants.RAW_DATA_DIR, constants.FEAT_DATA_DIR)
train_orders = tle.get_orders()

# Unique user ids belonging to the 'train' and 'test' evaluation sets.
uid_train = train_orders[train_orders.eval_set == 'train'][['user_id']].drop_duplicates()
uid_test = train_orders[train_orders.eval_set == 'test'][['user_id']].drop_duplicates()
# The orders frame is only needed to derive the user id sets above.
del train_orders

# Split the sequences by which evaluation set their user belongs to.
up_airr_sym_train = up_airr_sym[up_airr_sym.user_id.isin(uid_train.user_id)]
up_airr_sym_test = up_airr_sym[up_airr_sym.user_id.isin(uid_test.user_id)]

# Attach labels; (user_id, product_id) pairs with no match in the label table
# come back as NaN from the left join and are filled with 0.
up_airr_sym_train = pd.merge(up_airr_sym_train, tle.craft_label(),
                             on=['user_id', 'product_id'], how='left')
# Assign the filled column back explicitly: a chained
# `df.label.fillna(..., inplace=True)` may operate on a temporary copy and
# leave the frame unchanged on recent pandas versions.
up_airr_sym_train['label'] = up_airr_sym_train['label'].fillna(0)

In [5]:
# Shuffle the training rows with a fixed seed so the row order (and everything
# derived from it downstream) is reproducible across kernel restarts.
up_airr_sym_train = shuffle(up_airr_sym_train, random_state=42)

In [6]:
# Store the length of every symbol sequence in a 'len' column.
seq_lengths = up_airr_sym_train['up_airr_sym'].map(len)
up_airr_sym_train['len'] = seq_lengths

In [7]:
%%time
# Every sequence is padded or truncated to this fixed length.
max_seq_len = 99

# Model inputs: the symbol sequences brought to a common length.
symbol_sequences = up_airr_sym_train['up_airr_sym'].values
X = pad_sequences(symbol_sequences, maxlen=max_seq_len)

# Targets: the label column as an array.
y = up_airr_sym_train['label'].values

CPU times: user 49 s, sys: 3.02 s, total: 52 s
Wall time: 51.8 s


In [8]:
# Model hyper-parameters passed to embed_lstm below.
sym_set_size = 480
embed_vec_len = 32
hidden_units = 256


def embed_lstm(sym_set_size, embed_vec_len, max_seq_len, hidden_units):
    """Build and compile the embedding + stacked-LSTM binary classifier.

    Args:
        sym_set_size: Input dimension of the embedding layer.
        embed_vec_len: Output dimension of the embedding layer.
        max_seq_len: Length of each input sequence.
        hidden_units: Number of units in each of the two LSTM layers.

    Returns:
        A compiled Sequential model ending in a single sigmoid unit, trained
        with binary cross-entropy and the Adam optimizer.
    """
    layer_stack = [
        Embedding(sym_set_size, embed_vec_len, input_length=max_seq_len),
        # The first LSTM emits the full sequence so the second LSTM can consume it.
        LSTM(hidden_units, return_sequences=True),
        LSTM(hidden_units),
        Dense(256, activation='relu'),
        Dropout(0.5),
        Dense(128, activation='relu'),
        Dropout(0.5),
        Dense(1, activation='sigmoid'),
    ]
    model = Sequential(layer_stack)
    model.compile(
        loss='binary_crossentropy',
        optimizer='adam',
        metrics=['accuracy'],
    )
    return model
   

In [9]:
# Hold out 20% of the samples for the final evaluation. The seed is fixed so
# the split -- and therefore the reported test AUC -- is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)

In [10]:
# Instantiate the network from the hyper-parameters defined above.
model = embed_lstm(
    sym_set_size=sym_set_size,
    embed_vec_len=embed_vec_len,
    max_seq_len=max_seq_len,
    hidden_units=hidden_units,
)

In [11]:
# Print each layer's type, output shape and parameter count.
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 99, 32)            15360     
_________________________________________________________________
lstm_1 (LSTM)                (None, 99, 256)           295936    
_________________________________________________________________
lstm_2 (LSTM)                (None, 256)               525312    
_________________________________________________________________
dense_1 (Dense)              (None, 256)               65792     
_________________________________________________________________
dropout_1 (Dropout)          (None, 256)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 128)               32896     
_________________________________________________________________
dropout_2 (Dropout)          (None, 128)               0         
__________

In [None]:
# Checkpoints are written below this directory. Create it up front: the
# checkpoint callback is not guaranteed to create missing directories, and a
# missing directory would otherwise only surface as a failed save after the
# first epoch.
cache_dir = "./__lstm_cache__/"
os.makedirs(cache_dir, exist_ok=True)

# The file name is templated with the epoch number and validation loss.
filepath = cache_dir + "lstm-symbol-{epoch:02d}-{val_loss:.4f}.hdf5"
# Only save when the validation loss improves on the best value so far.
checkpoint = ModelCheckpoint(filepath, monitor='val_loss', verbose=1, save_best_only=True, mode='min')
auc_computer = AucComputer()
callbacks_list = [checkpoint, auc_computer]

In [None]:
# Train the model; 2% of the training data is held out for validation and the
# callbacks handle checkpointing and per-epoch AUC reporting.
model.fit(
    x=X_train,
    y=y_train,
    batch_size=2048,
    epochs=100,
    validation_split=0.02,
    callbacks=callbacks_list,
    # Both classes currently carry the same weight.
    class_weight={0: 1, 1: 1},
)

Train on 6644133 samples, validate on 135595 samples
Epoch 1/100
epoch 0, val auc 0.7697543970380116
Epoch 2/100
epoch 1, val auc 0.7705183616409019
Epoch 3/100
epoch 2, val auc 0.7708393249016537
Epoch 4/100
epoch 3, val auc 0.7707677718528326
Epoch 5/100
epoch 4, val auc 0.7710791899243711
Epoch 6/100
epoch 5, val auc 0.771316827602849
Epoch 7/100
epoch 6, val auc 0.7716736217322547
Epoch 8/100
epoch 7, val auc 0.7714203818676213
Epoch 9/100

In [23]:
%%time
# Score the held-out test split with the trained model.
# NOTE(review): batch_size=4028 looks like a typo for 4096 (or the 2048 used
# for training) -- confirm; left unchanged here.
y_pred = model.predict(X_test, batch_size=4028)

CPU times: user 1min 4s, sys: 10 s, total: 1min 14s
Wall time: 1min 12s


In [24]:
# ROC AUC of the predictions on the held-out test split.
test_auc = roc_auc_score(y_test, y_pred)
print('test auc {}'.format(test_auc))

test auc 0.7710464231968976
