In [1]:
import sys
sys.path.append('../../ml_utils')

import config as cfg
import data_utils as du
import autoencoder as aenc
import timeit

import keras
from keras import regularizers

import sklearn
from sklearn.model_selection import train_test_split

import pandas as pd
import numpy as np

import os
os.environ['CUDA_VISIBLE_DEVICES']='-1'

Using TensorFlow backend.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [2]:
print("Python: ", sys.version)
print("pandas: ", pd.__version__)
print("numpy: ", np.__version__)
print("Keras: ", keras.__version__)

Python:  3.7.5 (default, Oct 25 2019, 15:51:11) 
[GCC 7.3.0]
pandas:  0.25.3
numpy:  1.17.4
Keras:  2.2.4


### Load and clean the data

In [3]:
log_data = './../../shared/data/swissid_authorize_logs_april_to_sept_2019.csv'

start = timeit.default_timer()
df = pd.read_csv(filepath_or_buffer=log_data, header=0, sep='\t', names=cfg.complete_feature_list, index_col=None)
stop = timeit.default_timer()

print("Original data frame size: {}\n".format(df.shape))
print("Time: {} seconds\n".format(stop - start))

Original data frame size: (12417597, 31)

Time: 40.427318611000004 seconds



In [4]:
df = du.clean_data(df)

columns with NaN: loc_country_code
loc_country_code is fixed


In [5]:
features = ['label_nr', 'src_software_sub_type', 'src_operating_system_name', 'src_hardware_type', 'response_status_code', 
            'oidc_client_id', 'oidc_scopes', 'oidc_ui_locales', 'loc_city', 'loc_country_code', 'date_weekday']

excludes = list(set(df.columns.tolist()) - set(features))

In [6]:
reduced_df = du.reduce_features(df, excludes)

print("reduced data frame size: {}\n".format(reduced_df.shape))
#display(reduced_df)

reduced data frame size: (12204748, 11)



In [7]:
print(reduced_df['label_nr'].value_counts())

2    7412813
0    4769169
1      22766
Name: label_nr, dtype: int64


In [8]:
def create_model(optimizer='adam', activation='relu'):
    
    model = aenc.create_deep_auto_encoder(10, 8, 4, 2, activation)
    model.compile(optimizer=optimizer, loss='mean_squared_error', metrics=['mae', 'accuracy'])

    return model

In [9]:
np.random.seed(7)

sample_size= 200000
exclude = ['label_nr']

# load dataset
X_df, y_df, anomaly_rate, encoder = du.build_normal_anomaly_sample_df(reduced_df, sample_size, exclude, True, True, False, True)
x_train, x_test, y_train, y_test = train_test_split(X_df, y_df, test_size=0.2, random_state=42)

In [10]:
from sklearn.model_selection import GridSearchCV
from keras.wrappers.scikit_learn import KerasClassifier

model = KerasClassifier(build_fn=create_model, 
                        epochs=20, 
                        batch_size=32, 
                        verbose=2, 
                        shuffle=True)

# define the grid search parameters
optimizer = ['RMSprop', 'Adadelta', 'Adam', 'Adamax']
activation = ['softmax', 'relu', 'tanh', 'sigmoid']
batch_size = [32, 64, 128]
epochs = [50, 100, 150]


param_grid = dict(optimizer=optimizer, batch_size=batch_size, epochs=epochs)
grid = GridSearchCV(estimator=model, param_grid=param_grid, n_jobs=-1, cv=3)

start = timeit.default_timer()
grid_result = grid.fit(x_train, x_train)
stop = timeit.default_timer()

# summarize results
print("Time: {} minutes\n".format((stop - start)/60))
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']

for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Use tf.cast instead.
Epoch 1/100
 - 4s - loss: 0.9188 - mean_absolute_error: 0.5131 - acc: 0.3748
Epoch 2/100
 - 4s - loss: 0.8696 - mean_absolute_error: 0.4887 - acc: 0.6283
Epoch 3/100
 - 4s - loss: 0.8635 - mean_absolute_error: 0.4862 - acc: 0.6635
Epoch 4/100
 - 4s - loss: 0.8589 - mean_absolute_error: 0.4842 - acc: 0.6919
Epoch 5/100
 - 4s - loss: 0.8566 - mean_absolute_error: 0.4830 - acc: 0.7067
Epoch 6/100
 - 4s - loss: 0.8555 - mean_absolute_error: 0.4825 - acc: 0.7199
Epoch 7/100
 - 4s - loss: 0.8542 - mean_absolute_error: 0.4817 - acc: 0.7238
Epoch 8/100
 - 4s - loss: 0.8538 - mean_absolute_error: 0.4814 - acc: 0.7229
Epoch 9/100
 - 4s - loss: 0.8537 - mean_absolute_error: 0.4813 - acc: 0.7170
Epoch 10/100
 - 4s - loss: 0.8536 - mean_absolute_error: 0.4813 - acc: 0.7165
Epoch 11/100
 - 4s - loss: 0.8532 - mean_absolute_error: 0.4813 - acc: 0.7173
Epoch 12/100
 - 4s - loss: 0.85

Epoch 99/100
 - 4s - loss: 0.8330 - mean_absolute_error: 0.4721 - acc: 0.7878
Epoch 100/100
 - 4s - loss: 0.8330 - mean_absolute_error: 0.4721 - acc: 0.7852
Time: 53.174567831900006 minutes

Best: 0.568006 using {'batch_size': 64, 'epochs': 100, 'optimizer': 'Adadelta'}
0.310893 (0.171404) with: {'batch_size': 32, 'epochs': 50, 'optimizer': 'RMSprop'}
0.421069 (0.086142) with: {'batch_size': 32, 'epochs': 50, 'optimizer': 'Adadelta'}
0.499713 (0.058051) with: {'batch_size': 32, 'epochs': 50, 'optimizer': 'Adam'}
0.362712 (0.068582) with: {'batch_size': 32, 'epochs': 50, 'optimizer': 'Adamax'}
0.485175 (0.122338) with: {'batch_size': 32, 'epochs': 100, 'optimizer': 'RMSprop'}
0.469850 (0.059720) with: {'batch_size': 32, 'epochs': 100, 'optimizer': 'Adadelta'}
0.257080 (0.188709) with: {'batch_size': 32, 'epochs': 100, 'optimizer': 'Adam'}
0.445487 (0.045455) with: {'batch_size': 32, 'epochs': 100, 'optimizer': 'Adamax'}
0.566519 (0.029511) with: {'batch_size': 32, 'epochs': 150, 'optimi