In [3]:
import os
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "-1"

"""LSTM_Keystroke_Similarity
"""
import numpy as np
import pandas as pd
import random
import keras.backend as K
#import keras.utils as ku
from keras.models import *
from keras.layers import *
from keras.layers.embeddings import *
from keras.optimizers import Adadelta
from keras.optimizers import Adam
#from keras.utils.vis_utils import plot_model
import matplotlib.pyplot as plt
from keras.preprocessing.sequence import pad_sequences
#from keras.models import load_model
from tensorflow.keras.callbacks import ModelCheckpoint
#from tensorflow.keras.utils import multi_gpu_model

np.random.seed(12345)

# Global Variables
HID_DIM = 128
SQUENCE_NUM = 70
VECTOR_UNIT = 5
VECTOR_LEN = SQUENCE_NUM * VECTOR_UNIT

# Create pairs
def create_pairs(df):
    all_pairs_list = []
    user_list = df['user'].unique().tolist()
    user_num = len(user_list)
    #print('user_num:',user_num)
    
    colum_list = ['user_left','index_left','user_right','index_right']
    df_makeData_all = pd.DataFrame(columns = colum_list)

    for j, u in enumerate(user_list):  # 遍历所有用户
        df_user = df[df['user'] == u]
        df_user_len = len(df_user)
        # print('user',u)
        for i in range(df_user_len):
            same_pair_list = []
            diff_pair_list = []

            #df_user.sample(frac=1).reset_index(drop=True)  # 打乱用户样本顺序
            z1, z2 = sum(df_user.iloc[i, 2], []), sum(df_user.iloc[(i + 1) % df_user_len, 2], [])  # 同一用户样本
            same_pair_list.append(z1)
            same_pair_list.append(z2)
            same_pair_list.append(0.0)  # 同一用户样本标识0
            all_pairs_list.append(same_pair_list)            
            df_makeData_all=df_makeData_all.append(pd.DataFrame({'user_left':[u],'index_left':[i],'user_right':[u],'index_right':[(i + 1) % df_user_len]}),ignore_index=True)
            
            inc = 1
            dn = (j + inc) % (user_num-1)  # 不同用户样本
            if dn != j:
                diff_user = user_list[dn]
            else:
                diff_user = user_list[(dn + 1) % (user_num-1)]
            # print('diff',diff_user)
            df_diff_user = df[df['user'] == diff_user]
            diff_user_len = len(df_diff_user)
            if diff_user_len == 0:
                
                '''
                print('dn:',dn)
                print('diff_user:',diff_user)
                print('user_list:',user_list[dn])
                print('user_list_all:',user_list)
                print('user_list_len:', len(user_list))
                print('df_diff_user:',df_diff_user)
                '''
                
                continue
            z3, z4 = sum(df_user.iloc[i, 2], []), sum(df_diff_user.iloc[i % diff_user_len, 2], [])
            diff_pair_list.append(z3)
            diff_pair_list.append(z4)
            diff_pair_list.append(1.0)  # 不同用户样本标识1.0
            all_pairs_list.append(diff_pair_list)
            
            df_makeData_all=df_makeData_all.append(pd.DataFrame({'user_left':[u],'index_left':[i],'user_right':[diff_user],'index_right':[i % diff_user_len]}),ignore_index=True)

        if j >= 6:
            break            
            
    return all_pairs_list,df_makeData_all


def process_data(pairs):
    keystroke1_data_list = []
    keystroke2_data_list = []
    label_list = []
    for i in range(len(pairs)):
        keystroke1_data = pad_sequences([pairs[i][0]], maxlen=VECTOR_LEN, dtype='float32', padding='post',
                                        truncating='post')
        keystroke2_data = pad_sequences([pairs[i][1]], maxlen=VECTOR_LEN, dtype='float32', padding='post',
                                        truncating='post')
        label = pairs[i][2]
        judge_nan_left = np.array(keystroke1_data[0])
        judge_nan_right = np.array(keystroke2_data[0])
        if np.any(np.isnan(judge_nan_left)) or np.any(np.isnan(judge_nan_right)):
            #print('judge_nan_left',judge_nan_left)
            #print('judge_nan_right',judge_nan_right)
            continue
        keystroke1_data_list.append(keystroke1_data[0])
        keystroke2_data_list.append(keystroke2_data[0])
        label_list.append(label)
        if np.any(np.isnan(judge_nan_left)):
            print('zc_nan')
    return np.array(keystroke1_data_list), np.array(keystroke2_data_list), np.array(label_list)

def accuracy(y_true, y_pred):  # Tensor上的操作
    '''Compute classification accuracy with a fixed threshold on distances.
    '''
    return K.mean(K.equal(y_true, K.cast(y_pred > 0.5, y_true.dtype)))


def plot_train_history(history, train_metrics, val_metrics):
    plt.plot(history.history.get(train_metrics), '-o')
    plt.plot(history.history.get(val_metrics), '-o')
    plt.ylabel(train_metrics)
    plt.xlabel('Epochs')
    plt.legend(['train', 'validation'])


def create_base_network(dim):
    '''Base network to be shared (eq. to feature extraction).
    '''
    input = Input(shape=(SQUENCE_NUM, VECTOR_UNIT))
    mask = Masking(0)(input)
    bn_layer1 = BatchNormalization(name='bn_layer1')(mask)
    lstm_1 = LSTM(dim, activation='relu', kernel_initializer='random_uniform',bias_initializer='zeros',
                  return_sequences=False, name='lstm_layer1', recurrent_dropout=0.2)
    lstm_layer1 = lstm_1(bn_layer1)
    #Dropout(lstm_layer1, 0.5)
    dense_out = Dense(128, activation= 'relu', input_dim= 128, use_bias= True)(lstm_layer1)
    #Dropout(lstm_layer1, 0.5)
    #bn_layer2 = BatchNormalization(name='bn_layer2_input1')(lstm_layer1)
    #lstm_2 = LSTM(dim, activation='relu', kernel_initializer='random_uniform',bias_initializer='zeros',
    #              return_sequences=False, name='lstm_layer2', recurrent_dropout=0.2)
    #lstm_layer2 = lstm_2(bn_layer2)


    return Model(input, dense_out)

file_name = './5_subject_for_test.json'
#file_name = './five_tuple_vector_data.json'
epochs = 100
batch_size = 512

df = pd.read_json(file_name)
#df = df.drop(['Unnamed: 0'],axis=1)

colum_list = ['user_left','index_left','user_right','index_right']
new_dataFrame = pd.DataFrame(columns = colum_list)

data_pairs, new_dataFrame = create_pairs(df)
left,right,label = process_data(data_pairs)
print(left[0])

num = left.shape[0]

left = left.reshape(num,SQUENCE_NUM,VECTOR_UNIT)
right = right.reshape(num,SQUENCE_NUM,VECTOR_UNIT)

base_network = create_base_network(HID_DIM)

input_left = Input(shape=(SQUENCE_NUM,VECTOR_UNIT))
input_right = Input(shape=(SQUENCE_NUM,VECTOR_UNIT))

# because we re-use the same instance `base_network`,
# the weights of the network
# will be shared across the two branches
processed_left = base_network(input_left)
processed_right = base_network(input_right)
#print(processed_left.shape)
#print(processed_left[0].shape)

abs_of_sub = Lambda (function = lambda x: abs(x[0] - x[1]), output_shape=lambda x: (x[0], 1))([processed_left,processed_right])

###distance = Lambda(Euclidean_distance,output_shape=eucl_dist_output_shape)([processed_left, processed_right])
#distance = Lambda(function=lambda x: Euclidean_distance(x[0], x[1]),output_shape=lambda x: (x[0], 1))([processed_left, processed_right])
Dropout(abs_of_sub, 0.5)
Dens_layer1 = Dense(128, activation= 'relu', input_dim= 128, use_bias= True)(abs_of_sub)
Dens_layer = Dense(1, activation= 'sigmoid', input_dim= 128, use_bias= True)(Dens_layer1)  


#model = Model([input_left, input_right], distance)
model = Model([input_left, input_right], Dens_layer)
abs_predict = Model([input_left, input_right], abs_of_sub)
model.load_weights('./128D_Free_text_explaint_step70_epochs100_2nd100batch_size_512_lr000001_same_0.5.h5')

#model = multi_gpu_model(model,gpus =3)   #用gpu训练
model.summary()

[ 0.06274509  0.281      -0.111       0.17        0.170281    0.32941177
  0.048       0.148       0.196       0.196048    0.28235295  0.043
  0.144       0.187       0.187043    0.27058825  0.064       0.049
  0.113       0.113064    0.1254902   0.079       0.08        0.159
  0.159079    0.2627451   0.08        0.016       0.096       0.09608
  0.28627452  0.08        0.048       0.128       0.12808     0.32941177
  0.064       0.112       0.176       0.176064    0.34901962  0.064
  0.016       0.08        0.080064    0.1254902   0.096       0.129
  0.225       0.225096    0.3137255   0.08        0.095       0.175
  0.17508     0.29803923  0.08        0.017       0.097       0.09708
  0.25490198  0.08        0.096       0.176       0.17608     0.34901962
  0.112       0.063       0.175       0.175112    0.3254902   0.096
  0.          0.096       0.096096    0.1254902   0.064       0.385
  0.449       0.449064    0.28235295  0.079       0.064       0.143
  0.143079    0.30980393  0.0

In [51]:
string = ''
for i in range(128):
    string += '\'v_'+str(i)+'\','
string

"'v_0','v_1','v_2','v_3','v_4','v_5','v_6','v_7','v_8','v_9','v_10','v_11','v_12','v_13','v_14','v_15','v_16','v_17','v_18','v_19','v_20','v_21','v_22','v_23','v_24','v_25','v_26','v_27','v_28','v_29','v_30','v_31','v_32','v_33','v_34','v_35','v_36','v_37','v_38','v_39','v_40','v_41','v_42','v_43','v_44','v_45','v_46','v_47','v_48','v_49','v_50','v_51','v_52','v_53','v_54','v_55','v_56','v_57','v_58','v_59','v_60','v_61','v_62','v_63','v_64','v_65','v_66','v_67','v_68','v_69','v_70','v_71','v_72','v_73','v_74','v_75','v_76','v_77','v_78','v_79','v_80','v_81','v_82','v_83','v_84','v_85','v_86','v_87','v_88','v_89','v_90','v_91','v_92','v_93','v_94','v_95','v_96','v_97','v_98','v_99','v_100','v_101','v_102','v_103','v_104','v_105','v_106','v_107','v_108','v_109','v_110','v_111','v_112','v_113','v_114','v_115','v_116','v_117','v_118','v_119','v_120','v_121','v_122','v_123','v_124','v_125','v_126','v_127',"

In [53]:
result = model.predict([left,right])
decision = abs_predict.predict([left,right])
df_result = pd.DataFrame(result)
df_result.columns = ['result']

df_decision = pd.DataFrame(decision)
df_decision.columns = ['v_0','v_1','v_2','v_3','v_4','v_5','v_6','v_7','v_8','v_9','v_10','v_11','v_12','v_13','v_14','v_15','v_16','v_17','v_18','v_19','v_20','v_21','v_22','v_23','v_24','v_25','v_26','v_27','v_28','v_29','v_30','v_31','v_32','v_33','v_34','v_35','v_36','v_37','v_38','v_39','v_40','v_41','v_42','v_43','v_44','v_45','v_46','v_47','v_48','v_49','v_50','v_51','v_52','v_53','v_54','v_55','v_56','v_57','v_58','v_59','v_60','v_61','v_62','v_63','v_64','v_65','v_66','v_67','v_68','v_69','v_70','v_71','v_72','v_73','v_74','v_75','v_76','v_77','v_78','v_79','v_80','v_81','v_82','v_83','v_84','v_85','v_86','v_87','v_88','v_89','v_90','v_91','v_92','v_93','v_94','v_95','v_96','v_97','v_98','v_99','v_100','v_101','v_102','v_103','v_104','v_105','v_106','v_107','v_108','v_109','v_110','v_111','v_112','v_113','v_114','v_115','v_116','v_117','v_118','v_119','v_120','v_121','v_122','v_123','v_124','v_125','v_126','v_127']

output_data = pd.concat([pd.DataFrame(df_decision),df_result],axis =1)
output_data


Unnamed: 0,v_0,v_1,v_2,v_3,v_4,v_5,v_6,v_7,v_8,v_9,...,v_119,v_120,v_121,v_122,v_123,v_124,v_125,v_126,v_127,result
0,0.000000,0.000000,0.048756,0.036071,0.129744,0.062844,0.001060,0.0,0.118177,0.040427,...,0.000000,0.000000,0.011264,0.013780,0.024251,0.032965,0.096675,0.0,0.043144,0.023633
1,0.148149,0.000000,0.002147,0.085029,0.336176,0.451633,0.140752,0.0,0.171776,0.037734,...,0.117366,0.405371,0.057061,0.028243,0.334648,0.771072,0.023946,0.0,0.299974,1.000000
2,0.000000,0.000000,0.000000,0.061188,0.005968,0.068114,0.007467,0.0,0.033016,0.214344,...,0.000000,0.000000,0.079251,0.023368,0.025355,0.059652,0.075034,0.0,0.120870,0.012507
3,0.000000,0.047998,0.000000,0.042490,0.076248,0.487693,0.203320,0.0,0.006392,0.017684,...,0.050938,0.288381,0.138245,0.046799,0.189173,0.677029,0.022472,0.0,0.343118,0.999899
4,0.000000,0.000000,0.000000,0.022985,0.130249,0.004798,0.101974,0.0,0.075732,0.015634,...,0.000000,0.000000,0.129858,0.017974,0.119763,0.020708,0.014232,0.0,0.094324,0.104083
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
145,0.000000,0.050155,0.069497,0.084630,0.079397,0.108425,0.069887,0.0,0.083827,0.163758,...,0.096983,0.174661,0.005534,0.033304,0.043700,0.049480,0.188339,0.0,0.040687,0.927260
146,0.000000,0.000000,0.073470,0.011230,0.024647,0.093203,0.035046,0.0,0.103308,0.065169,...,0.001498,0.021780,0.012184,0.035019,0.038415,0.114997,0.055998,0.0,0.011817,0.070547
147,0.000000,0.000000,0.017166,0.086873,0.263601,0.131972,0.133331,0.0,0.121904,0.029628,...,0.038762,0.230578,0.025702,0.005217,0.021572,0.092784,0.051419,0.0,0.035979,0.307270
148,0.000000,0.000000,0.105926,0.011230,0.098991,0.027523,0.102172,0.0,0.153743,0.071682,...,0.070655,0.002089,0.031492,0.018294,0.015444,0.208745,0.106894,0.0,0.060385,0.532511


In [26]:
result.shape,decision.shape
decision[:100,:].shape

(100, 128)

In [58]:
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
X_train = decision[:100,:]
y_train = result[:100]
X_test = decision[100:,:]
y_test = result[100:]
# 建立线性回归，并用训练的模型绘图
regressor = LinearRegression()
regressor.fit(X_train, y_train)

quadratic_featurizer = PolynomialFeatures(degree=2)
X_train_quadratic = quadratic_featurizer.fit_transform(X_train)
X_test_quadratic = quadratic_featurizer.transform(X_test)
regressor_quadratic = LinearRegression()
regressor_quadratic.fit(X_train_quadratic, y_train)

cubic_featurizer = PolynomialFeatures(degree=3)
X_train_cubic = cubic_featurizer.fit_transform(X_train)
X_test_cubic = cubic_featurizer.transform(X_test)
regressor_cubic = LinearRegression()
regressor_cubic.fit(X_train_cubic, y_train)

power4_featurizer = PolynomialFeatures(degree=3)
X_train_power4 = power4_featurizer.fit_transform(X_train)
X_test_power4 = power4_featurizer.transform(X_test)
regressor_power4 = LinearRegression()
regressor_power4.fit(X_train_power4, y_train)

print(X_train)
print(X_train_quadratic)
print(X_test)
print(X_test_quadratic)
print('1 r-squared', regressor.score(X_test, y_test))
print('2 r-squared', regressor_quadratic.score(X_test_quadratic, y_test))
print('3 r-squared', regressor_cubic.score(X_test_cubic, y_test))
print('4 r-squared', regressor_power4.score(X_test_power4, y_test))

[[0.         0.         0.04875588 ... 0.09667468 0.         0.04314399]
 [0.14814892 0.         0.00214668 ... 0.02394623 0.         0.29997438]
 [0.         0.         0.         ... 0.07503384 0.         0.12087035]
 ...
 [0.         0.         0.09972847 ... 0.10276932 0.         0.5466726 ]
 [0.         0.         0.04137722 ... 0.15352815 0.         0.05821124]
 [0.         0.         0.00951308 ... 0.03340119 0.         0.36554167]]
[[1.         0.         0.         ... 0.         0.         0.0018614 ]
 [1.         0.14814892 0.         ... 0.         0.         0.08998463]
 [1.         0.         0.         ... 0.         0.         0.01460964]
 ...
 [1.         0.         0.         ... 0.         0.         0.29885092]
 [1.         0.         0.         ... 0.         0.         0.00338855]
 [1.         0.         0.         ... 0.         0.         0.13362071]]
[[0.         0.         0.04341801 ... 0.06631041 0.         0.19238067]
 [0.         0.17552195 0.0761313  ... 

In [62]:
regressor_quadratic.coef_,regressor_quadratic.coef_.shape

(array([[ 0.0052892 , -0.09391707, -0.04211064, ...,  0.        ,
          0.        ,  0.08612192]], dtype=float32),
 (1, 8385))

In [63]:
regressor_quadratic.intercept_,regressor_quadratic.intercept_.shape

(array([-0.33145303], dtype=float32), (1,))

In [64]:
p = PolynomialFeatures(degree=2).fit(df_decision)
coeff = p.get_feature_names(df_decision.columns)
print(coeff),len(coeff)

['1', 'v_0', 'v_1', 'v_2', 'v_3', 'v_4', 'v_5', 'v_6', 'v_7', 'v_8', 'v_9', 'v_10', 'v_11', 'v_12', 'v_13', 'v_14', 'v_15', 'v_16', 'v_17', 'v_18', 'v_19', 'v_20', 'v_21', 'v_22', 'v_23', 'v_24', 'v_25', 'v_26', 'v_27', 'v_28', 'v_29', 'v_30', 'v_31', 'v_32', 'v_33', 'v_34', 'v_35', 'v_36', 'v_37', 'v_38', 'v_39', 'v_40', 'v_41', 'v_42', 'v_43', 'v_44', 'v_45', 'v_46', 'v_47', 'v_48', 'v_49', 'v_50', 'v_51', 'v_52', 'v_53', 'v_54', 'v_55', 'v_56', 'v_57', 'v_58', 'v_59', 'v_60', 'v_61', 'v_62', 'v_63', 'v_64', 'v_65', 'v_66', 'v_67', 'v_68', 'v_69', 'v_70', 'v_71', 'v_72', 'v_73', 'v_74', 'v_75', 'v_76', 'v_77', 'v_78', 'v_79', 'v_80', 'v_81', 'v_82', 'v_83', 'v_84', 'v_85', 'v_86', 'v_87', 'v_88', 'v_89', 'v_90', 'v_91', 'v_92', 'v_93', 'v_94', 'v_95', 'v_96', 'v_97', 'v_98', 'v_99', 'v_100', 'v_101', 'v_102', 'v_103', 'v_104', 'v_105', 'v_106', 'v_107', 'v_108', 'v_109', 'v_110', 'v_111', 'v_112', 'v_113', 'v_114', 'v_115', 'v_116', 'v_117', 'v_118', 'v_119', 'v_120', 'v_121', 'v_122

(None, 8385)

In [74]:
output_dataFrame = pd.DataFrame(columns = coeff)
output_dataFrame.loc[0] = regressor_quadratic.coef_.tolist()[0]
output_dataFrame

Unnamed: 0,1,v_0,v_1,v_2,v_3,v_4,v_5,v_6,v_7,v_8,...,v_124^2,v_124 v_125,v_124 v_126,v_124 v_127,v_125^2,v_125 v_126,v_125 v_127,v_126^2,v_126 v_127,v_127^2
0,0.005289,-0.093917,-0.042111,0.121468,-0.140342,0.003901,0.112716,0.191563,0.001112,0.228598,...,-0.030161,-0.039253,0.0,-0.013796,-0.011511,0.0,0.036138,0.0,0.0,0.086122
