In [1]:
%matplotlib inline
import os
import re
import collections
import pickle
import numpy as np

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.layers import Dense, Input, Dropout, Lambda, LSTM, Embedding, Conv1D, TimeDistributed, Add
from tensorflow.keras import backend as K
from tensorflow.keras import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import load_model

# Selects the loss at compile time and the branch taken inside the training /
# evaluation loop; the code below checks for "regression" and "classification".
task = "classification"
# Event ids that get past the length filter and the label assertion in the
# training loop are added here (a set, despite the "_list" suffix).
noerr_eid_list = set()

In [2]:
def load(path):
    """Read the pickle file at ``path`` and return the object it contains.

    NOTE: unpickling can execute arbitrary code, so only point this at files
    from a trusted source.
    """
    with open(path, 'rb') as handle:
        return pickle.load(handle)

def dump(value, path):
    """Pickle ``value`` to ``path``, creating missing parent directories.

    Args:
        value: Any picklable object.
        path: Destination file. May be a bare file name (written to the
            current working directory) or include directory components.
    """
    parent = os.path.dirname(path)
    # os.makedirs('') raises FileNotFoundError, so only create the parent
    # directory when the path actually has one.
    if parent:
        os.makedirs(parent, exist_ok=True)
    with open(path, 'wb') as f:
        pickle.dump(value, f)

In [3]:
dataset_path = '../data/dataset/'


def _load_artifact(filename):
    """Load one pickled artifact that lives directly under ``dataset_path``."""
    return load(dataset_path + filename)


# Index mappings; only their lengths are used below (nb_users / nb_events).
user2ind = _load_artifact('user2ind.pkl')
eid2ind = _load_artifact('eid2ind.pkl')
# Last-axis size of the sub-model input.
nb_feature_sub = _load_artifact('nb_feature_sub.pkl')
# Event ids iterated by the training and evaluation loops.
eid_train = _load_artifact('eid_train.pkl')
eid_test = _load_artifact('eid_test.pkl')
# NOTE: the name X is rebound per event inside the training/evaluation loops.
X = _load_artifact('X.pkl')
# Used in the sequence-length filter and for slicing regression predictions.
burnin = _load_artifact('burnin.pkl')
# Per-event lookups keyed by event id.
X_dict = _load_artifact('X_dict.pkl')
y_dict = _load_artifact('y_dict.pkl')
dict_ = _load_artifact('dict_.pkl')
scaler_dict = _load_artifact('scaler_dict.pkl')
subX_dict = _load_artifact('subX_dict.pkl')

In [4]:
# matrix_main is used for LSTM input.
# matrix_sub is used for the scoring module.

# Best test accuracy observed so far; the training loop compares against it
# and raises it whenever an epoch does better.
acc = 0

nb_users = len(user2ind)
nb_events = len(eid2ind)
# Width of one main-input timestep, as the sum of the three feature groups.
nb_features = sum((2, 20, 100))    # (#temporal, #user, #doc)
dim_hidden = 50

In [5]:
##### Main part #####
# Sequence branch: per-timestep projection -> LSTM (last state only) -> dense
# projection, with dropout after each projection.
inputs = Input(shape=(None, nb_features))

step_embedding = TimeDistributed(Dense(100, activation='tanh'))    # W_e
emb_out = Dropout(0.2)(step_embedding(inputs))

sequence_encoder = LSTM(dim_hidden, activation='tanh', return_sequences=False)
rnn_out = sequence_encoder(emb_out)    # (None, dim_hidden)

state_projection = Dense(100, activation='tanh')    # W_r
rnn_out = Dropout(0.2)(state_projection(rnn_out))    # (None, 100)

In [6]:
##### Sub part #####
# Scoring branch: every row of the sub input is expanded, scored with a
# sigmoid unit, and the scores are averaged over axis 1.
nb_score = 1
nb_expand = 100
sub_input = Input(shape=(None, nb_feature_sub))

expand_layer = Dense(nb_expand, activation='tanh',
                     kernel_regularizer=keras.regularizers.l2(0.01))
user_vec = TimeDistributed(expand_layer)(sub_input)   # (None, None, nb_expand)

score_layer = Dense(nb_score, activation='sigmoid')
sub_h = TimeDistributed(score_layer)(user_vec)    # (None, None, nb_score)

z = Lambda(lambda scores: K.mean(scores, axis=1),
           output_shape=lambda shape: (shape[0], shape[2]))(sub_h)    # (None, nb_score)

In [7]:
##### Concatenate #####
# Despite the heading, the two branches are combined by element-wise addition:
# a sigmoid score from the sequence branch plus the averaged sub-branch score.
# NOTE(review): each term lies in (0, 1), so the sum can exceed 1 while the
# classification task compiles this model with binary cross-entropy — confirm
# that this is intended.
out1 = keras.layers.Dense(1, activation='sigmoid')(rnn_out)
concat_out = Add()([out1, z])
# Earlier variants (kept here as a record) concatenated rnn_out and z along
# axis 1 and fed the result to a Dense(1, sigmoid) classifier.

##### Classifier #####
# The summed score is used directly as the model output.
outputs = concat_out

##### Model #####
# hvector and model share the same inputs and the same output tensor.
hvector = Model(inputs=[inputs, sub_input], outputs=concat_out)
# zscore exposes the per-row sigmoid scores of the sub branch (before averaging).
zscore = Model(inputs=sub_input, outputs=sub_h)
model = Model(inputs=[inputs, sub_input], outputs=outputs)
# uvector exposes the expanded per-row vectors of the sub branch.
uvector = Model(inputs=sub_input, outputs=user_vec)

In [8]:
##### Compile #####
# `learning_rate` is the supported keyword (`lr` is a deprecated alias), and
# the explicit `decay=0.0` is dropped: zero decay is already the default and
# newer Keras optimizers reject the argument. epsilon stays explicit because
# it differs from the tf.keras default.
adam = Adam(learning_rate=0.0001, beta_1=0.9, beta_2=0.999, epsilon=1e-08)

if task=="regression":
    loss_name = 'mean_squared_error'
elif task=="classification":
    loss_name = 'binary_crossentropy'
else:
    # Fail here rather than reporting success for a model that was never compiled.
    raise ValueError("Unknown task: {!r}".format(task))

model.compile(optimizer=adam, loss=loss_name)
print("Model is compiled.")

Model is compiled.


In [12]:
# Ad-hoc look at the shape of whatever array the name X is bound to right now.
# NOTE(review): this cell's execution count (12) is out of sequence with its
# neighbours, and X is rebound inside the training/evaluation loops, so the
# value shown depends on the order in which cells were run.
X.shape

(3, 122)

In [9]:
from sklearn.metrics import roc_curve, auc, accuracy_score, precision_score, confusion_matrix

def sigmoid_array(x):
    """Return the logistic function 1 / (1 + e^-x), applied element-wise."""
    denominator = 1 + np.exp(-x)
    return 1 / denominator

### Training... ###
# Each epoch fits the model on one event sequence at a time (batch_size=1)
# over eid_train, then evaluates on eid_test and reports metrics.
nb_epoch = 30
if task=="regression":
    # NOTE(review): `eid_list` is not defined in any cell shown before this
    # one; confirm where it comes from before enabling the regression task.
    eid_train = eid_list
    eid_test = []

for ep in range(nb_epoch+1):
    print("{} epoch!!!!!!!!".format(ep))
    ##### Looping for eid_train #####
    losses = []
    for ii, eid in enumerate(eid_train):
        # Fetch this event's sequence BEFORE the length filter, so the filter
        # inspects the current event rather than whatever X held previously.
        X = X_dict[eid]
        if X.shape[0]<=2*burnin:  # skip sequences with at most 2*burnin steps
            continue
        X = X.astype(np.float32)
        y = y_dict[eid]

        label = int(dict_[eid]['label'])
        if task=="classification":
            assert(label==y)

        noerr_eid_list.add(eid)

        ##### Main input #####
        trainX = X
        ##### Sub input #####
        sub_trainX = subX_dict[eid]

        if task=="regression":
            # The per-event scalers are only needed to scale regression targets.
            sh = scaler_dict[eid][0]
            si = scaler_dict[eid][1]
            ### TODO : if we want to predict more features, add here.
            if y.shape[1]>1:
                trainY = np.hstack([sh.transform(y[:,0].reshape(-1,1)),
                                    si.transform(y[:,1].reshape(-1,1))])
            else:
                trainY = si.transform(y)
            dim_output = trainY.shape

        elif task=="classification":
            trainY = y
            dim_output = 1

        # Only the occasional step (every 1000th event of every 50th epoch)
        # prints Keras' per-fit progress line.
        fit_verbosity = 2 if (ep%50==0 and ii%1000==0) else 0
        h = model.fit([trainX[np.newaxis,:,:], sub_trainX[np.newaxis,:,:]], np.array([trainY]),
                      batch_size=1, epochs=1, verbose=fit_verbosity)
        losses.append(h.history['loss'][0])
    print("%% mean loss : {}".format(np.mean(losses)))

    ### Evaluation ###
    preds = []
    rmses = []
    y_test = []
    for ii, eid in enumerate(eid_test):
        # Same ordering fix as in the training loop: load first, then filter.
        X = X_dict[eid]
        if X.shape[0]<=2*burnin:  # skip sequences with at most 2*burnin steps
            continue
        X = X.astype(np.float32)
        y = y_dict[eid]

        testX = X
        sub_testX = subX_dict[eid]

        if task=="classification":
            y_test.append(int(dict_[eid]['label']))

            pred = model.predict([np.array([testX]), np.array([sub_testX])], verbose=0)
            preds.append(pred[0,0])

        elif task=="regression":
            # NOTE(review): this branch cannot run as written. eid_test is set
            # to [] for regression above, predict() is given one input although
            # the model is built with two, a third output axis is indexed, and
            # the RMSE compares against trainX left over from the training
            # loop. It needs rework before the regression task is used.
            predict_y = model.predict(np.array([testX]), verbose=0)

            sh = scaler_dict[eid][0]
            si = scaler_dict[eid][1]

            if predict_y.shape[2]==1:
                predict_y = np.hstack([sh.inverse_transform(predict_y[0,burnin:,0].reshape(-1,1))])
            elif predict_y.shape[2]==2:
                predict_y = np.hstack([sh.inverse_transform(predict_y[0,burnin:,0].reshape(-1,1)),
                                       si.inverse_transform(predict_y[0,burnin:,1].reshape(-1,1))])
            elif predict_y.shape[2]>2:
                predict_y = np.hstack([sh.inverse_transform(predict_y[0,burnin:,0].reshape(-1,1)),
                                       si.inverse_transform(predict_y[0,burnin:,1].reshape(-1,1)),
                                       predict_y[0,burnin:,2:]])
            # Local name, so the global nb_features used to build the model
            # is not overwritten.
            nb_pred_features = predict_y.shape[1]
            rmse = np.sqrt(np.mean((predict_y[:-1,:] - trainX[burnin+1:,:nb_pred_features])**2))
            rmses.append(rmse)

    if task=="classification":
        preds = np.array(preds)
        preds = preds>0.5
        # labels=[0, 1] pins the matrix to 2x2 even when only one class shows
        # up among the labels/predictions, so the 4-way unpacking cannot fail.
        tn, fp, fn, tp = confusion_matrix(y_test, preds.astype(int), labels=[0, 1]).ravel()
        test_acc = (tp+tn)/(tp+tn+fp+fn)
        print("%%% Test results {} samples %%%".format(len(y_test)))
        print("accuracy: {}".format(test_acc))
        print("precision : {:.4f} / {:.4f}".format(tp/(tp+fp), tn/(fn+tn)))
        print("recall : {:.4f} / {:.4f}".format(tp/(tp+fn), tn/(fp+tn)))
        print("F1 score : {:.4f} / {:.4f}".format(2*tp/(2*tp+fp+fn), 2*tn/(2*tn+fp+fn)))

        # Best-accuracy tracking lives inside the classification branch because
        # tp/tn/fp/fn only exist here.
        # NOTE(review): the message says "Save model" but nothing is written to
        # disk in this cell.
        if acc < test_acc:
            acc = test_acc
            print("%%%%%%%%%% Save model\t acc:{} %%%%%%%%%%%%".format(acc))

    elif task=="regression":
        print("%%% Test results {} samples".format(len(rmses)))
        print("mean rmse : {}".format(np.mean(rmses)))

0 epoch!!!!!!!!
Train on 1 samples
1/1 - 1005s - loss: 15.9239
Train on 1 samples
1/1 - 0s - loss: 0.9395
%% mean loss : 3.091247005885496
%%% Test results 382 samples %%%
accuracy: 0.9109947643979057
precision : 0.2500 / 0.9251
recall : 0.0667 / 0.9830
F1 score : 0.1053 / 0.9532
%%%%%%%%%% Save model	 acc:0.9109947643979057 %%%%%%%%%%%%
1 epoch!!!!!!!!
%% mean loss : 0.5730162076457008
%%% Test results 382 samples %%%
accuracy: 0.9476439790575916
precision : 0.9167 / 0.9486
recall : 0.3667 / 0.9972
F1 score : 0.5238 / 0.9723
%%%%%%%%%% Save model	 acc:0.9476439790575916 %%%%%%%%%%%%
2 epoch!!!!!!!!
%% mean loss : 0.3852266912099891
%%% Test results 382 samples %%%
accuracy: 0.9554973821989529
precision : 0.9333 / 0.9564
recall : 0.4667 / 0.9972
F1 score : 0.6222 / 0.9764
%%%%%%%%%% Save model	 acc:0.9554973821989529 %%%%%%%%%%%%
3 epoch!!!!!!!!
%% mean loss : 0.26081418672196216
%%% Test results 382 samples %%%
accuracy: 0.9607329842931938
precision : 0.9412 / 0.9616
recall : 0.5333 /