In [1]:
import sys
sys.path.append('../../ml_utils')

import config as cfg
import data_utils as du
import numpy as np
import pandas as pd
import timeit
import matplotlib.pyplot as plt
import joblib
import sklearn

from sklearn.svm import OneClassSVM
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, f1_score

%matplotlib inline

In [2]:
# Print the installed version of each key dependency; the version in
# parentheses is the reference value written into each label.
version_report = (
    ("joblib (0.14.1): {}\n", joblib),
    ("numpy (1.17.4): {}\n", np),
    ("pandas (0.25.3): {}\n", pd),
    ("sklearn (0.22): {}\n", sklearn),
)
for template, module in version_report:
    print(template.format(module.__version__))

joblib (0.14.1): 0.14.1

numpy (1.17.4): 1.17.4

pandas (0.25.3): 0.25.3

sklearn (0.22): 0.22



### Load and clean the data

In [3]:
# Tab-separated authorize-log export, located relative to this notebook.
log_data = './../../shared/data/swissid_authorize_logs_april_to_sept_2019.csv'

# header=0 combined with names= makes pandas discard the file's own header row
# and label the columns with the project's configured feature list instead.
start = timeit.default_timer()
df = pd.read_csv(
    log_data,
    sep='\t',
    header=0,
    names=cfg.complete_feature_list,
    index_col=None,
)
stop = timeit.default_timer()
load_seconds = stop - start

print("Original data frame size: {}\n".format(df.shape))
print("Time: {} seconds\n".format(load_seconds))

Original data frame size: (12417597, 31)

Time: 39.41910548499982 seconds



In [4]:
# Run the project's cleaning helper over the raw frame. Its recorded output
# reports `loc_country_code` as the only column with NaN and then as fixed.
# NOTE(review): `df` is rebound here, so the raw frame is no longer reachable and
# re-running this cell feeds already-cleaned data back in — confirm that
# du.clean_data is safe to apply twice.
df = du.clean_data(df)

columns with NaN: loc_country_code
loc_country_code is fixed


In [5]:
# Columns kept for modelling; 'label_nr' is the label column and is split off
# from the inputs later on.
features = [
    'label_nr',
    'src_software_sub_type',
    'src_operating_system_name',
    'src_hardware_type',
    'response_status_code',
    'oidc_client_id',
    'oidc_scopes',
    'oidc_ui_locales',
    'loc_city',
    'loc_country_code',
    'date_weekday',
]

# Every remaining column of the frame goes on the exclusion list.
all_columns = set(df.columns.tolist())
excludes = list(all_columns - set(features))

In [6]:
# Hand the exclusion list to the project helper; the recorded output shows the
# result has 11 columns, matching the length of `features`.
reduced_df = du.reduce_features(df, excludes)

reduced_shape = reduced_df.shape
print("reduced data frame size: {}\n".format(reduced_shape))

reduced data frame size: (12204748, 11)



In [7]:
# Class balance of the label column (number of rows per label value).
label_counts = reduced_df['label_nr'].value_counts()
print(label_counts)

2    7412813
0    4769169
1      22766
Name: label_nr, dtype: int64


### Build a sampled dataframe having the same anomaly rate as the original dataset and collect the metrics data

In [8]:
# Number of independent sample / fit / evaluate rounds.
iterations = 100

# metrics: one flat row of values per round.
# iter_values: scratch buffer holding the value lists of the round in progress.
metrics, iter_values = [], []

In [9]:
# Repeated One-Class SVM experiment: each round draws a fresh sample, fits the
# model on the training split, scores the novelty set, and records one flat
# row of metrics plus the fit time.
sample_size = 200000
novelties_size = 10000

# Column(s) passed to the sampling helper as `exclude`.
exclude = ['label_nr']

# Start from an empty result list inside this cell so that re-running it (for
# example after an interrupted run) never stacks rows on top of an earlier run.
metrics = []

start_overall = timeit.default_timer()

for i in range(iterations):

    # Fresh scratch buffer per round: an interrupted previous round cannot leak
    # stale values into this round's row.
    iter_values = []

    # NOTE(review): the literal 1000 is presumably the size of the validation
    # sample (X_val_df / y_val_df) — confirm against du.build_normal_and_novelty_sample_df.
    X_df, y_df, X_val_df, y_val_df, X_nov_df, y_nov_df, encoder = du.build_normal_and_novelty_sample_df(reduced_df, sample_size, 1000, novelties_size, exclude)

    # Only x_train is used below; the fixed random_state makes the split itself
    # repeatable for a given sample.
    x_train, x_test, y_train, y_test = train_test_split(X_df, y_df, test_size=0.2, random_state=42)

    ocsvm_clf = OneClassSVM(kernel='rbf', gamma=0.00005, nu=0.001, cache_size=5000)

    start_fitting = timeit.default_timer()
    ocsvm_clf.fit(x_train)
    stop_fitting = timeit.default_timer()

    fit_time = stop_fitting - start_fitting
    print("{0} Time to fit: {1:.2f} seconds".format(i, fit_time))

    # Raw scores for the novelty set; samples scoring below a threshold are
    # flagged as anomalies further down.
    y_scores = ocsvm_clf.score_samples(X_nov_df)

    # Boolean ground truth: True where the label equals the configured anomaly index.
    y_truth = y_nov_df.values == cfg.label_anomaly_idx

    best_res, fpr_values, tpr_values, best_values = du.calc_best_threshold(y_truth, y_scores, du.step(y_scores), True)

    iter_values.append(best_values)

    # Metrics at the threshold stored under 'best_auc_threshold'.
    y_predictions = y_scores < best_res['best_auc_threshold']
    res, values = du.calc_metrics(y_truth, y_predictions)

    iter_values.append(values)

    # Metrics at the threshold stored under 'best_f1_score_threshold'.
    y_predictions = y_scores < best_res['best_f1_score_threshold']
    res, values = du.calc_metrics(y_truth, y_predictions)

    iter_values.append(values)

    # Row layout: best-threshold values, metrics at the AUC threshold, metrics
    # at the F1 threshold, then the fit time in seconds.
    flat_list = [item for sublist in iter_values for item in sublist]

    flat_list.append(np.float64(fit_time))

    metrics.append(flat_list)

stop_overall = timeit.default_timer()

overall_runtime = stop_overall - start_overall
print("Time: {0:.2f} seconds\n".format(overall_runtime))
print("Time: {0:.2f} minutes\n".format(overall_runtime/60))

0 Time to fit: 101.54 seconds
1 Time to fit: 51.72 seconds
2 Time to fit: 44.70 seconds
3 Time to fit: 63.16 seconds
4 Time to fit: 53.76 seconds
5 Time to fit: 52.50 seconds
6 Time to fit: 70.62 seconds
7 Time to fit: 52.87 seconds
8 Time to fit: 50.87 seconds
9 Time to fit: 77.44 seconds
10 Time to fit: 41.37 seconds
11 Time to fit: 67.06 seconds
12 Time to fit: 120.38 seconds
13 Time to fit: 3.27 seconds
14 Time to fit: 50.07 seconds
15 Time to fit: 78.30 seconds
16 Time to fit: 48.70 seconds
17 Time to fit: 52.89 seconds
18 Time to fit: 58.26 seconds
19 Time to fit: 47.60 seconds
20 Time to fit: 3.09 seconds
21 Time to fit: 42.43 seconds
22 Time to fit: 60.48 seconds
23 Time to fit: 61.66 seconds
24 Time to fit: 107.90 seconds
25 Time to fit: 56.35 seconds
26 Time to fit: 3.37 seconds
27 Time to fit: 51.66 seconds
28 Time to fit: 63.90 seconds
29 Time to fit: 3.43 seconds
30 Time to fit: 52.37 seconds
31 Time to fit: 54.69 seconds
32 Time to fit: 3.23 seconds
33 Time to fit: 57.31 

### Calculate Metrics

In [12]:
# Turn the collected per-round rows into a DataFrame via the project helper,
# which also receives the CSV path below, then show summary statistics.
metrics_csv_path = './one_class_svm_metrics.csv'
metrics_df = du.create_metric_df(metrics, metrics_csv_path)
metrics_df.describe()

Unnamed: 0,best_f1_score_threshold,total,tot_anomaly,tot_normal,tpr_recall,precision,accuracy,f1_score,fnr,tnr,fpr,t_pos,f_pos,f_neg,t_neg,auc,time_to_fit
count,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0
mean,1.391836,10000.0,5000.0,5000.0,0.946538,0.976275,0.961205,0.960802,0.053462,0.975872,0.024128,4732.69,120.64,267.31,4879.36,0.961205,45.942068
std,0.005763,0.0,0.0,0.0,0.016055,0.029468,0.015741,0.014824,0.016055,0.0326,0.0326,80.273736,163.002046,80.273736,163.002046,0.015741,30.193105
min,1.3792,10000.0,5000.0,5000.0,0.9092,0.840112,0.888,0.895366,0.0356,0.8176,0.001,4546.0,5.0,178.0,4088.0,0.888,3.093963
25%,1.387964,10000.0,5000.0,5000.0,0.93735,0.968994,0.95495,0.954032,0.04175,0.96945,0.00645,4686.75,32.25,208.75,4847.25,0.95495,30.556394
50%,1.391668,10000.0,5000.0,5000.0,0.9544,0.987937,0.9643,0.963238,0.0456,0.9886,0.0114,4772.0,57.0,228.0,4943.0,0.9643,50.078188
75%,1.394988,10000.0,5000.0,5000.0,0.95825,0.993291,0.9726,0.972138,0.06265,0.99355,0.03055,4791.25,152.75,313.25,4967.75,0.9726,58.464476
max,1.408705,10000.0,5000.0,5000.0,0.9644,0.998907,0.978,0.977569,0.0908,0.999,0.1824,4822.0,912.0,454.0,4995.0,0.978,120.378291
