In [1]:
import os
# GPU selection is configured through environment variables before the
# keras/tensorflow imports further down in this cell.
# NOTE(review): "-1" is not a valid device id, so this presumably hides every
# GPU and forces CPU execution — confirm that is intended for this run.
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "-1"

"""LSTM_Keystroke_Similarity
"""
import numpy as np
import pandas as pd
import random
import keras.backend as K
#import keras.utils as ku
from keras.models import *
from keras.layers import *
from keras.layers.embeddings import *
from keras.optimizers import Adadelta
from keras.optimizers import Adam
#from keras.utils.vis_utils import plot_model
import matplotlib.pyplot as plt
from keras.preprocessing.sequence import pad_sequences
#from keras.models import load_model
from tensorflow.keras.callbacks import ModelCheckpoint
#from tensorflow.keras.utils import multi_gpu_model

# Seed NumPy's global random number generator.
np.random.seed(12345)

# Global Variables
HID_DIM = 128  # passed to create_base_network, where it is the LSTM unit count
SQUENCE_NUM = 70  # time steps per sample (name is a typo of "sequence", kept because other code uses it)
VECTOR_UNIT = 5  # feature values per time step
VECTOR_LEN = SQUENCE_NUM * VECTOR_UNIT  # flattened sample length; process_data pads/truncates to this

# Create pairs
def create_pairs(df, max_user_index=10000):
    """Build same-user / different-user sample pairs plus their bookkeeping table.

    For every sample ``i`` of every user, up to two pairs are emitted, in this order:

    * a "same" pair, label ``0.0``: sample ``i`` with sample ``(i + 1) % n`` of the
      same user (a user with a single sample is therefore paired with itself);
    * a "different" pair, label ``1.0``: sample ``i`` with sample
      ``i % m`` of one other user (``m`` = that user's sample count).

    If either side of the same pair contains ``None`` the sample is skipped
    entirely (no different pair either). If only the other user's sample
    contains ``None``, just the different pair is skipped.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs a ``'user'`` column. The column at position 2 must hold, per row,
        a list of lists, which is flattened into one list.
    max_user_index : int, optional
        Position in ``df['user'].unique()`` after which processing stops
        (that user is still processed). Defaults to 10000.

    Returns
    -------
    (list, pandas.DataFrame)
        ``all_pairs_list`` with one ``[left, right, label]`` entry per pair, and
        a frame with columns ``user_left, index_left, user_right, index_right``
        holding one row per entry of ``all_pairs_list``, in the same order.
    """
    colum_list = ['user_left', 'index_left', 'user_right', 'index_right']
    all_pairs_list = []
    # Rows are collected as dicts and turned into a DataFrame once at the end;
    # appending to a DataFrame inside the loop is quadratic and
    # DataFrame.append no longer exists in pandas >= 2.0.
    pair_records = []

    user_list = df['user'].unique().tolist()
    user_num = len(user_list)

    for j, u in enumerate(user_list):  # iterate over all users
        df_user = df[df['user'] == u]
        df_user_len = len(df_user)

        # The "different" user depends only on j, so resolve it once per user.
        other_index = _other_user_index(j, user_num)
        if other_index is None:
            diff_user, df_diff_user, diff_user_len = None, None, 0
        else:
            diff_user = user_list[other_index]
            df_diff_user = df[df['user'] == diff_user]
            diff_user_len = len(df_diff_user)

        for i in range(df_user_len):
            next_i = (i + 1) % df_user_len
            z1 = _flatten_steps(df_user.iloc[i, 2])
            z2 = _flatten_steps(df_user.iloc[next_i, 2])  # same-user sample
            if any(x is None for x in z1) or any(x is None for x in z2):
                continue

            all_pairs_list.append([z1, z2, 0.0])  # label 0.0: same user
            pair_records.append({'user_left': u, 'index_left': i,
                                 'user_right': u, 'index_right': next_i})

            # No usable other user (single-user data, or a user value that
            # matches no rows): only the same-user pair can be produced.
            if diff_user_len == 0:
                continue

            diff_i = i % diff_user_len
            z4 = _flatten_steps(df_diff_user.iloc[diff_i, 2])
            if any(x is None for x in z4):
                continue

            # z1 is already known to be None-free; copy it so the two pairs
            # do not share one list object.
            all_pairs_list.append([list(z1), z4, 1.0])  # label 1.0: different users
            pair_records.append({'user_left': u, 'index_left': i,
                                 'user_right': diff_user, 'index_right': diff_i})

        if j >= max_user_index:
            break

    return all_pairs_list, pd.DataFrame(pair_records, columns=colum_list)


def _other_user_index(j, user_num):
    """Return the position of the user paired against user ``j``, or None if there is none."""
    if user_num < 2:
        return None  # nobody else to compare with (also avoids modulo by zero)
    if user_num == 2:
        return 1 - j  # the wrap-around formula below would map user 0 to itself
    # Unchanged historical selection for three or more users; it never yields j.
    return (j + 1) % (user_num - 1)


def _flatten_steps(steps):
    """Concatenate a list of lists into one flat list."""
    return [value for step in steps for value in step]

def process_data(pairs):
    """Pad every pair to ``VECTOR_LEN`` values and split it into model inputs.

    Each entry of ``pairs`` is ``[left_values, right_values, label]``. Both value
    lists are post-padded / post-truncated to ``VECTOR_LEN`` float32 values.
    A pair is dropped when either padded side contains NaN.

    Because pairs can be dropped, the returned arrays may have fewer rows than
    ``len(pairs)``; callers that keep per-pair metadata must not assume that
    row ``k`` of the result corresponds to ``pairs[k]``.

    Returns
    -------
    (numpy.ndarray, numpy.ndarray, numpy.ndarray)
        Left inputs, right inputs and labels of the kept pairs.
    """
    keystroke1_data_list = []
    keystroke2_data_list = []
    label_list = []
    for pair in pairs:
        left_values, right_values, label = pair[0], pair[1], pair[2]
        # pad_sequences works on a batch; wrap the single sample and take row 0.
        keystroke1_data = pad_sequences([left_values], maxlen=VECTOR_LEN, dtype='float32',
                                        padding='post', truncating='post')[0]
        keystroke2_data = pad_sequences([right_values], maxlen=VECTOR_LEN, dtype='float32',
                                        padding='post', truncating='post')[0]
        if np.any(np.isnan(keystroke1_data)) or np.any(np.isnan(keystroke2_data)):
            continue
        keystroke1_data_list.append(keystroke1_data)
        keystroke2_data_list.append(keystroke2_data)
        label_list.append(label)
    return np.array(keystroke1_data_list), np.array(keystroke2_data_list), np.array(label_list)

def accuracy(y_true, y_pred):  # operates on tensors
    '''Fraction of predictions that match the labels after thresholding at 0.5.

    ``y_pred`` is turned into 0/1 by ``y_pred > 0.5``, cast to the dtype of
    ``y_true``, compared element-wise with ``y_true`` and averaged.
    '''
    predicted_label = K.cast(y_pred > 0.5, y_true.dtype)
    is_correct = K.equal(y_true, predicted_label)
    return K.mean(is_correct)


def plot_train_history(history, train_metrics, val_metrics):
    '''Draw the training and validation curves of one metric on the current figure.

    ``history.history`` is looked up with the keys ``train_metrics`` and
    ``val_metrics``; the training key also labels the y axis.
    '''
    recorded = history.history
    for metric_key in (train_metrics, val_metrics):
        plt.plot(recorded.get(metric_key), '-o')
    plt.xlabel('Epochs')
    plt.ylabel(train_metrics)
    plt.legend(['train', 'validation'])

def create_base_network(dim):
    '''Build the branch network that both sides of the pair model share.

    Pipeline: input of shape (SQUENCE_NUM, VECTOR_UNIT) -> masking of value 0
    -> batch normalisation -> one LSTM with ``dim`` units returning only its
    last output -> Dense layer with 64 ReLU units.
    '''
    seq_input = Input(shape=(SQUENCE_NUM, VECTOR_UNIT))
    masked = Masking(0)(seq_input)
    normalized = BatchNormalization(name='bn_layer1')(masked)
    recurrent_out = LSTM(
        dim,
        activation='relu',
        kernel_initializer='random_uniform',
        bias_initializer='zeros',
        return_sequences=False,
        name='lstm_layer1',
        recurrent_dropout=0.2,
    )(normalized)
    embedding = Dense(64, activation='relu', input_dim=128, use_bias=True)(recurrent_out)
    return Model(seq_input, embedding)

# ---- Input data -----------------------------------------------------------
# Alternative input files used previously:
#file_name = './train_five_tuple_vector_data.json'
#file_name = './five_tuple_vector_data.json'
file_name = './test_five_tuple_vector_data_del_null.json'
# NOTE(review): epochs and batch_size are not used by any code visible in this
# notebook (there is no fit call here) — presumably left over from training.
epochs = 100
batch_size = 512

df = pd.read_json(file_name)
# Drop the 'Unnamed: 0' column; create_pairs reads the sample vectors from
# column position 2 of the remaining frame.
df = df.drop(['Unnamed: 0'],axis=1)

# NOTE(review): this empty frame is overwritten by the create_pairs call just
# below, so the next two statements have no lasting effect.
colum_list = ['user_left','index_left','user_right','index_right']
new_dataFrame = pd.DataFrame(columns = colum_list)

# data_pairs: list of [left, right, label]; new_dataFrame: one metadata row per pair.
data_pairs, new_dataFrame = create_pairs(df)
# NOTE(review): process_data skips pairs that contain NaN, so left/right/label
# can have fewer rows than new_dataFrame. Later cells concatenate them by
# position — verify the lengths match before trusting the exported CSVs.
left,right,label = process_data(data_pairs)
# Show the first padded (still flat) left sample as a sanity check.
print(left[0])

num = left.shape[0]

# Reshape each flat sample of VECTOR_LEN values into (SQUENCE_NUM, VECTOR_UNIT).
left = left.reshape(num,SQUENCE_NUM,VECTOR_UNIT)
right = right.reshape(num,SQUENCE_NUM,VECTOR_UNIT)

# ---- Model assembly -------------------------------------------------------
base_network = create_base_network(HID_DIM)

input_left = Input(shape=(SQUENCE_NUM,VECTOR_UNIT))
input_right = Input(shape=(SQUENCE_NUM,VECTOR_UNIT))

# because we re-use the same instance `base_network`,
# the weights of the network
# will be shared across the two branches
processed_left = base_network(input_left)
processed_right = base_network(input_right)
#print(processed_left.shape)
#print(processed_left[0].shape)

# Element-wise absolute difference of the two branch outputs.
# NOTE(review): the output_shape callable returns (x[0], 1), which does not
# look like the shape of this difference — presumably the backend infers the
# real shape and ignores it; confirm before changing Keras versions.
abs_of_sub = Lambda (function = lambda x: abs(x[0] - x[1]), output_shape=lambda x: (x[0], 1))([processed_left,processed_right])

###distance = Lambda(Euclidean_distance,output_shape=eucl_dist_output_shape)([processed_left, processed_right])
#distance = Lambda(function=lambda x: Euclidean_distance(x[0], x[1]),output_shape=lambda x: (x[0], 1))([processed_left, processed_right])
#Dropout(abs_of_sub, 0.5)
# Decision head on top of the difference vector: Dense(64, relu) -> Dense(1, sigmoid).
Dens_layer1 = Dense(64, activation= 'relu', input_dim= 64, use_bias= True)(abs_of_sub)
Dens_layer = Dense(1, activation= 'sigmoid', input_dim= 64, use_bias= True)(Dens_layer1)  


# Three models over the same inputs and the same layer instances:
#   model         -> final sigmoid output
#   abs_predict   -> the absolute-difference vector
#   left_predict  -> the left branch output
#model = Model([input_left, input_right], distance)
model = Model([input_left, input_right], Dens_layer)
abs_predict = Model([input_left, input_right],abs_of_sub)
left_predict = Model([input_left, input_right],processed_left)
#model2 = Model([input_left, input_right], abs_of_sub)
# Load the trained weights into `model`; abs_predict and left_predict are built
# from the same layer objects, so no separate load is done for them.
model.load_weights('./64D_Free_text_explaint_step70_epochs100_100batch_size_512_lr000001_same<0.5.h5')
#model2.load_weights('./Free_text_Dense_2layers_distance_step70_epochs100_3rd100batch_size_512_lr000001_same_0.5.h5')
#model = multi_gpu_model(model,gpus =3)   # train on GPUs

model.summary()
#model2.summary()

[ 3.4901962e-01  1.3200000e-01  8.8700002e-01  1.0190001e+00
  1.0191320e+00  7.3333335e-01  1.5899999e-01  3.7500000e-01
  5.3399998e-01  5.3415900e-01  6.2745094e-02  4.6700001e-01
 -2.0800000e-01  2.5900000e-01  2.5946701e-01  3.4901962e-01
  1.4600000e-01  1.0900000e-01  2.5500000e-01  2.5514600e-01
  3.0980393e-01  2.3199999e-01 -4.3000001e-02  1.8900000e-01
  1.8923201e-01  3.3333334e-01  1.2000000e-01  5.6300002e-01
  6.8300003e-01  6.8312001e-01  7.3333335e-01  5.6000002e-02
  1.4500000e-01  2.0100001e-01  2.0105600e-01  3.1372547e-02
  3.2000002e-02  1.3200000e-01  1.6400000e-01  1.6403200e-01
  7.3333335e-01  5.9000000e-02  1.1800000e-01  1.7700000e-01
  1.7705899e-01  3.1372547e-02  9.4999999e-02  7.8000002e-02
  1.7299999e-01  1.7309500e-01  3.1372547e-02  8.7800002e-01
  1.0000000e-03  8.7900001e-01  8.7987798e-01  6.2745094e-02
  6.6200000e-01 -2.1200000e-01  4.4999999e-01  4.5066199e-01
  3.4901962e-01  1.4300001e-01  1.6200000e-01  3.0500001e-01
  3.0514300e-01  3.09803

In [2]:
# Final sigmoid score and the |left - right| difference vector for every pair.
result = model.predict([left, right])
decision = abs_predict.predict([left, right])

df_result = pd.DataFrame(result)
df_result.columns = ['output']  # a list, not a set: column labels are ordered
df_decision = pd.DataFrame(decision)
# Column names v_0 .. v_{k-1} are derived from the actual output width instead
# of being typed out by hand.
df_decision.columns = ['v_' + str(k) for k in range(df_decision.shape[1])]

# The frames below are glued together by row position. process_data drops
# pairs containing NaN, in which case the metadata rows would no longer line
# up with the predictions and the CSV would be silently wrong.
if len(new_dataFrame) != len(df_decision):
    raise ValueError(
        'pair metadata has {} rows but the model produced {} predictions; '
        'some pairs were dropped before prediction, so rows cannot be aligned '
        'by position'.format(len(new_dataFrame), len(df_decision))
    )

result = pd.concat([new_dataFrame, df_decision, df_result], ignore_index=False, axis=1)

result.to_csv('Free_64D_decision_layer_output_10000user_test.csv')

In [6]:
# Scratch cell: build the text "'v_0','v_1',...,'v_63'," for pasting into a
# column-name list. (An earlier variant produced 128 "'i':'v_i'," entries.)
string = ''
for i in range(64):
    string = string + "'v_{}',".format(i)
string


"'v_0','v_1','v_2','v_3','v_4','v_5','v_6','v_7','v_8','v_9','v_10','v_11','v_12','v_13','v_14','v_15','v_16','v_17','v_18','v_19','v_20','v_21','v_22','v_23','v_24','v_25','v_26','v_27','v_28','v_29','v_30','v_31','v_32','v_33','v_34','v_35','v_36','v_37','v_38','v_39','v_40','v_41','v_42','v_43','v_44','v_45','v_46','v_47','v_48','v_49','v_50','v_51','v_52','v_53','v_54','v_55','v_56','v_57','v_58','v_59','v_60','v_61','v_62','v_63',"

In [3]:
# Output of the shared branch network for the left sample of every pair.
left_pred = left_predict.predict([left, right])

df_left = pd.DataFrame(left_pred)
# Column names v_0 .. v_{k-1} follow the actual output width.
df_left.columns = ['v_' + str(k) for k in range(df_left.shape[1])]

# Rows are joined by position; refuse to write a CSV whose metadata and
# predictions have different lengths (process_data may have dropped pairs).
if len(new_dataFrame) != len(df_left):
    raise ValueError(
        'pair metadata has {} rows but the model produced {} predictions; '
        'some pairs were dropped before prediction, so rows cannot be aligned '
        'by position'.format(len(new_dataFrame), len(df_left))
    )

result = pd.concat([new_dataFrame, df_left], ignore_index=False, axis=1)

result.to_csv('Free_64D_left_output_10000user_test.csv')