In [1]:
# -*- coding: utf-8 -*-
import os
# These two variables are set before any Keras/TensorFlow import below,
# so they are already in the environment when the backend is imported.
# Enumerate CUDA devices by PCI bus id.
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
# "-1" matches no device index, so no GPU is exposed to this process
# (presumably a deliberate CPU-only run — confirm before re-enabling a GPU).
os.environ["CUDA_VISIBLE_DEVICES"] = "-1"

"""LSTM_Keystroke_Similarity
"""
import numpy as np
import pandas as pd
import random
import keras.backend as K
#import keras.utils as ku
from keras.models import *
from keras.layers import *
from keras.layers.embeddings import *
from keras.optimizers import Adadelta
from keras.optimizers import Adam
#from keras.utils.vis_utils import plot_model
import matplotlib.pyplot as plt
from keras.preprocessing.sequence import pad_sequences
#from keras.models import load_model
from tensorflow.keras.callbacks import ModelCheckpoint
#from tensorflow.keras.utils import multi_gpu_model

# Seed NumPy's global RNG.
# NOTE(review): only NumPy is seeded here; Python's `random` module and the
# Keras/TensorFlow backend are not — confirm that is sufficient.
np.random.seed(12345)

# Global Variables
# Number of units handed to each LSTM layer via create_base_network(HID_DIM).
HID_DIM = 128
# Time steps per sample (first dimension of the network's Input shape).
# The spelling "SQUENCE" is kept because the name is referenced throughout the file.
SQUENCE_NUM = 70
# Features per time step (second dimension of the network's Input shape).
VECTOR_UNIT = 5
# Length of one flattened sample; used as `maxlen` when padding in process_data.
VECTOR_LEN = SQUENCE_NUM * VECTOR_UNIT

# Create pairs
def create_pairs(df,user):
    """Build one ``[left, right, label]`` triple per row belonging to ``user``.

    The value in the third column (position 2) of each selected row is
    expected to be a list of lists; it is flattened into one flat list.
    Both sides of a pair are built from the SAME row, so every pair is a
    sample paired with itself and is labelled ``1.0`` (same user).

    Args:
        df: DataFrame with a ``'user'`` column and the nested per-sample
            vectors stored in the column at position 2.
        user: value compared against ``df['user']`` to select rows.

    Returns:
        A list with one entry per selected row, each entry being
        ``[flat_left, flat_right, 1.0]``. ``flat_left`` and ``flat_right``
        are equal but distinct list objects. Empty if no row matches.
    """
    from itertools import chain

    df_user = df[df['user'] == user]
    all_pairs_list = []
    for row_idx in range(len(df_user)):
        nested = df_user.iloc[row_idx, 2]
        # chain.from_iterable flattens in linear time; the earlier
        # sum(nested, []) re-copied the accumulator for every sub-list.
        z1 = list(chain.from_iterable(nested))
        z2 = list(chain.from_iterable(nested))
        all_pairs_list.append([z1, z2, 1.0])  # 1.0 marks a same-user pair
    return all_pairs_list


def process_data(pairs):
    """Pad every pair to ``VECTOR_LEN`` and drop pairs that contain NaN.

    Each element of ``pairs`` is indexed as ``[left_seq, right_seq, label]``.
    Both sequences are post-padded / post-truncated to ``VECTOR_LEN`` float32
    values with ``pad_sequences``. A pair is skipped entirely when either
    padded side contains a NaN.

    Args:
        pairs: sequence of ``[left_seq, right_seq, label]`` items, e.g. the
            output of ``create_pairs``.

    Returns:
        Tuple ``(left, right, labels)`` of NumPy arrays built from the kept
        pairs, in input order. All three are empty arrays when nothing is kept.
    """
    keystroke1_data_list = []
    keystroke2_data_list = []
    label_list = []
    for pair in pairs:
        # pad_sequences works on a batch, so wrap the single sequence in a
        # list and take row 0 of the result.
        left_row = pad_sequences([pair[0]], maxlen=VECTOR_LEN, dtype='float32', padding='post',
                                 truncating='post')[0]
        right_row = pad_sequences([pair[1]], maxlen=VECTOR_LEN, dtype='float32', padding='post',
                                  truncating='post')[0]
        label = pair[2]
        if np.isnan(left_row).any() or np.isnan(right_row).any():
            continue
        # Past this point neither side holds a NaN; the former second NaN
        # check ('zc_nan' print) could never fire and has been removed.
        keystroke1_data_list.append(left_row)
        keystroke2_data_list.append(right_row)
        label_list.append(label)
    return np.array(keystroke1_data_list), np.array(keystroke2_data_list), np.array(label_list)


def Euclidean_distance(vectors):
    """Euclidean distance between the two tensors in ``vectors``.

    ``vectors`` unpacks into ``(left, right)``. The squared differences are
    summed along axis 1 (``keepdims=True``), floored at ``K.epsilon()`` so the
    square root is never taken of a value below epsilon, and then rooted.
    """
    first, second = vectors
    squared_gap = K.square(first - second)
    total = K.sum(squared_gap, axis=1, keepdims=True)
    floored = K.maximum(total, K.epsilon())
    return K.sqrt(floored)


def eucl_dist_output_shape(shapes):
    """Output shape for the Euclidean-distance Lambda layer.

    ``shapes`` is a pair of input shapes; the result keeps the first entry
    of the first shape and has a single trailing dimension of size 1.
    """
    first_shape, _second_shape = shapes
    leading_dim = first_shape[0]
    return (leading_dim, 1)

def manhatan_distance(vector):
    """Manhattan (L1) distance between the two tensors in ``vector``.

    ``vector`` unpacks into ``(left, right)``; absolute differences are summed
    along axis 1 with ``keepdims=True``.
    """
    first, second = vector
    abs_gap = K.abs(first - second)
    return K.sum(abs_gap, axis=1, keepdims=True)

def manhatan_dist_output_shape(shapes):
    """Output shape for the Manhattan-distance Lambda layer.

    ``shapes`` is a pair of input shapes; the result keeps the first entry
    of the first shape and has a single trailing dimension of size 1.
    """
    first_shape, _second_shape = shapes
    leading_dim = first_shape[0]
    return (leading_dim, 1)


def contrastive_loss(y_true, y_pred):
    """Contrastive loss over predicted distances with a fixed margin of 1.5.

    Where ``y_true`` is 1 the term is ``y_pred ** 2``; where it is 0 the term
    is ``max(margin - y_pred, 0) ** 2``. A constant 1e-10 is added to every
    element before the mean is taken.
    """
    margin = 1.5
    similar_term = y_true * K.square(y_pred)
    hinge = K.maximum(margin - y_pred, 0.0)
    dissimilar_term = (1.0 - y_true) * K.square(hinge)
    return K.mean(similar_term + dissimilar_term + 1e-10)

def accuracy(y_true, y_pred):  # operates on tensors
    '''Classification accuracy using a fixed distance threshold of 1.0.

    A predicted distance below 1.0 is cast to ``y_true``'s dtype (i.e. 1 for
    "below threshold", 0 otherwise) and compared element-wise with ``y_true``;
    the mean of the matches is returned.
    '''
    predicted_same = K.cast(y_pred < 1.0, y_true.dtype)
    matches = K.equal(y_true, predicted_same)
    return K.mean(matches)

def create_base_network(dim):
    '''Base network to be shared (eq. to feature extraction).

    Layer stack, in order:
      Input(SQUENCE_NUM, VECTOR_UNIT) -> Masking(0) -> BatchNormalization
      -> LSTM(dim, return_sequences=True) -> Dropout(0.5)
      -> BatchNormalization -> LSTM(dim, return_sequences=False)

    Args:
        dim: number of units in each of the two LSTM layers.

    Returns:
        A Keras ``Model`` mapping one input sequence to the output of the
        second LSTM layer.

    Note:
        The original code wrote ``Dropout(lstm_layer1, 0.5)``, which passes
        the tensor as the ``rate`` argument and never applies the layer, so
        no dropout was present in the graph. It is now applied as intended.
        Dropout holds no weights, so existing weight files still load, and it
        is inactive during ``predict``; it only changes training behaviour.
    '''
    # `inputs` rather than `input`, to avoid shadowing the builtin.
    inputs = Input(shape=(SQUENCE_NUM, VECTOR_UNIT))
    masked = Masking(0)(inputs)
    bn_layer1 = BatchNormalization(name='bn_layer1')(masked)
    lstm_1 = LSTM(dim, activation='relu', kernel_initializer='random_uniform',bias_initializer='zeros',
                  return_sequences=True, name='lstm_layer1', recurrent_dropout=0.2)
    lstm_layer1 = lstm_1(bn_layer1)
    # Construct the layer with its rate, then call it on the tensor.
    dropped = Dropout(0.5)(lstm_layer1)
    bn_layer2 = BatchNormalization(name='bn_layer2_input1')(dropped)
    lstm_2 = LSTM(dim, activation='relu', kernel_initializer='random_uniform',bias_initializer='zeros',
                  return_sequences=False, name='lstm_layer2', recurrent_dropout=0.2)
    lstm_layer2 = lstm_2(bn_layer2)

    return Model(inputs, lstm_layer2)

In [50]:
import pickle

# Input sessions, trained weights, and output location for the embeddings.
file_name = './data/5_subject_for_test.json'
file_save = './data/5_subject_for_test_features_Euclidean.pickle'
weights_path = './model/last_680000/Euclidean_distance_epochs_50batch_size_512_lr00005_correct68000.h5'

df = pd.read_json(file_name)

user_list = [167184,290728,338404,382301,416965]

columns = ['user', 'sess', 'vector']

# Build the siamese model and load the weights ONCE: neither depends on the
# user being processed, so rebuilding them inside the loop only repeated the
# same work for every user.
base_network = create_base_network(HID_DIM)

input_left = Input(shape=(SQUENCE_NUM,VECTOR_UNIT))
input_right = Input(shape=(SQUENCE_NUM,VECTOR_UNIT))

processed_left = base_network(input_left)
processed_right = base_network(input_right)

distance = Lambda(Euclidean_distance,output_shape=eucl_dist_output_shape)([processed_left, processed_right])

# `model` is the full two-input distance model that the weights are loaded
# into; `left_predict` shares its layers and exposes the left branch's output.
model = Model([input_left, input_right], distance)
left_predict = Model([input_left, input_right],processed_left)
model.load_weights(weights_path)

# Collect plain records and build the DataFrame once at the end;
# DataFrame.append in a loop copies the whole frame on every call and was
# removed in pandas 2.0.
records = []
for u in user_list:
    data_pairs = create_pairs(df,u)
    left,right,label = process_data(data_pairs)

    num = left.shape[0]
    if num == 0:
        # No valid (NaN-free) sample for this user: nothing to embed.
        continue

    left = left.reshape(num,SQUENCE_NUM,VECTOR_UNIT)
    right = right.reshape(num,SQUENCE_NUM,VECTOR_UNIT)

    left_vector = left_predict.predict([left,right])

    # 'sess' is the sample's position among this user's kept samples.
    records.extend(
        {'user': str(u), 'sess': j, 'vector': vec}
        for j, vec in enumerate(left_vector)
    )

data_frame = pd.DataFrame(records, columns=columns)
data_dict = data_frame.to_dict()

with open(file_save,'wb') as fp:
    pickle.dump(data_dict,fp,protocol=pickle.HIGHEST_PROTOCOL)
    

In [51]:
# Show the session frame read from `file_name` (pandas elides the middle rows).
df

Unnamed: 0,user,sess,vector_list
0,416965,0,"[[0.062745098, 0.281, -0.111, 0.17, 0.170281],..."
1,416965,1,"[[0.062745098, 0.192, -0.095, 0.097, 0.097192]..."
2,416965,2,"[[0.062745098, 0.14400000000000002, -0.081, 0...."
3,416965,3,"[[0.062745098, 0.17500000000000002, -0.064, 0...."
4,416965,4,"[[0.062745098, 0.20700000000000002, -0.08, 0.1..."
...,...,...,...
70,338404,10,"[[0.062745098, 0.313, -0.076, 0.23700000000000..."
71,338404,11,"[[0.062745098, 0.28, -0.096, 0.184, 0.18428000..."
72,338404,12,"[[0.062745098, 0.35000000000000003, -0.115, 0...."
73,338404,13,"[[0.062745098, 0.281, 0.6980000000000001, 0.97..."


In [32]:
# Sanity check: read back the pickle written to `file_save` and list its
# top-level keys. `file_save` comes from an earlier cell, so that cell must
# have been run first.
# NOTE: pickle.load can execute arbitrary code — only load files this
# notebook produced itself, never untrusted ones.
with open(file_save,'rb') as fp:
    test_dict = pickle.load(fp)
print(test_dict.keys())

dict_keys(['user', 'sess', 'vector'])


In [None]:
import pickle

# Input sessions, trained weights, and output location for the embeddings.
file_name = './data/test_five_tuple_vector_data_del_null.json'
file_save = './data/test_five_tuple_vector_data_del_null_features_Euclidean68000.pickle'
weights_path = './model/last_680000/Euclidean_distance_epochs_50batch_size_512_lr00005_correct68000.h5'

df = pd.read_json(file_name)
# create_pairs reads the nested vector by column POSITION 2, so the stray
# 'Unnamed: 0' column has to be removed before calling it.
df = df.drop(['Unnamed: 0'],axis=1)

user_list = df['user'].unique().tolist()

columns = ['user', 'sess', 'vector']

# Build the siamese model and load the weights ONCE: neither depends on the
# user being processed, so rebuilding them inside the loop only repeated the
# same work for every unique user.
base_network = create_base_network(HID_DIM)

input_left = Input(shape=(SQUENCE_NUM,VECTOR_UNIT))
input_right = Input(shape=(SQUENCE_NUM,VECTOR_UNIT))

processed_left = base_network(input_left)
processed_right = base_network(input_right)

distance = Lambda(Euclidean_distance,output_shape=eucl_dist_output_shape)([processed_left, processed_right])

# `model` is the full two-input distance model that the weights are loaded
# into; `left_predict` shares its layers and exposes the left branch's output.
model = Model([input_left, input_right], distance)
left_predict = Model([input_left, input_right],processed_left)
model.load_weights(weights_path)

# Collect plain records and build the DataFrame once at the end;
# DataFrame.append in a loop copies the whole frame on every call and was
# removed in pandas 2.0.
records = []
for u in user_list:
    data_pairs = create_pairs(df,u)
    left,right,label = process_data(data_pairs)

    num = left.shape[0]
    if num == 0:
        # No valid (NaN-free) sample for this user: nothing to embed.
        continue

    left = left.reshape(num,SQUENCE_NUM,VECTOR_UNIT)
    right = right.reshape(num,SQUENCE_NUM,VECTOR_UNIT)

    left_vector = left_predict.predict([left,right])

    # 'sess' is the sample's position among this user's kept samples.
    records.extend(
        {'user': str(u), 'sess': j, 'vector': vec}
        for j, vec in enumerate(left_vector)
    )

data_frame = pd.DataFrame(records, columns=columns)
data_dict = data_frame.to_dict()

with open(file_save,'wb') as fp:
    pickle.dump(data_dict,fp,protocol=pickle.HIGHEST_PROTOCOL)
    

In [None]:
# Show the user ids that the feature-extraction loop iterates over.
user_list

In [54]:
# Show the session frame currently bound to `df`.
# NOTE(review): the stored output below still lists an 'Unnamed: 0' column,
# which the extraction cell above drops — it looks like this output was
# captured from a different kernel state; re-run top to bottom to confirm.
df

Unnamed: 0.1,Unnamed: 0,user,sess,vector_list
0,,175374,0,"[[0.3490196078, 0.132, 0.887, 1.019, 1.019132]..."
1,,175374,1,"[[0.062745098, 0.532, -0.17, 0.362, 0.36253199..."
2,,175374,2,"[[0.062745098, 0.723, -0.128, 0.595, 0.595723]..."
3,,175374,3,"[[0.062745098, 0.578, -0.184, 0.394, 0.394578]..."
4,,175374,4,"[[0.062745098, 0.442, -0.114, 0.328, 0.328442]..."
...,...,...,...,...
1379995,,59969,10,"[[0.0784313725, 0.08, 0.008, 0.088, 0.08808], ..."
1379996,,59969,11,"[[0.0784313725, 0.08700000000000001, 0.113, 0...."
1379997,,59969,12,"[[0.0784313725, 0.088, 0.016, 0.10400000000000..."
1379998,,59969,13,"[[0.0784313725, 0.088, 0.016, 0.10400000000000..."
