In [1]:
import sys
sys.path.append('../../ml_utils')

import config as cfg
import data_utils as du
import numpy as np
import pandas as pd
import timeit
import matplotlib.pyplot as plt
import joblib
import sklearn

from sklearn.ensemble import IsolationForest
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, f1_score

%matplotlib inline

In [2]:
# Report the installed version of each key dependency. The version shown in
# parentheses is presumably the one this notebook was developed against.
version_report = (
    "joblib (0.14.1): {}\n".format(joblib.__version__),
    "numpy (1.17.4): {}\n".format(np.__version__),
    "pandas (0.25.3): {}\n".format(pd.__version__),
    "sklearn (0.22): {}\n".format(sklearn.__version__),
)
for report_line in version_report:
    print(report_line)

joblib (0.14.1): 0.14.1

numpy (1.17.4): 1.17.4

pandas (0.25.3): 0.25.3

sklearn (0.22): 0.22



### Load and clean the data

In [3]:
# Tab-separated authorize-log export, addressed by a relative path.
log_data = './../../shared/data/swissid_authorize_logs_april_to_sept_2019.csv'

# Wall-clock timing around the read so the cost of this cell is visible.
start = timeit.default_timer()
df = pd.read_csv(
    filepath_or_buffer=log_data,
    sep='\t',
    header=0,                         # row 0 is the file's own header line ...
    names=cfg.complete_feature_list,  # ... whose names are replaced by this list
    index_col=None,
)
stop = timeit.default_timer()

load_seconds = stop - start
print("Original data frame size: {}\n".format(df.shape))
print("Time: {} seconds\n".format(load_seconds))

Original data frame size: (12417597, 31)

Time: 40.093010555999854 seconds



In [4]:
# Run the project's cleaning helper on the loaded frame and rebind `df` to its result.
# NOTE(review): `df` is overwritten, so re-running this cell on its own feeds the
# already-cleaned frame back into clean_data — presumably harmless, but confirm
# against du.clean_data before relying on it.
df = du.clean_data(df)

columns with NaN: loc_country_code
loc_country_code is fixed


In [5]:
# Columns to keep: `label_nr` plus the ten columns used as model inputs.
features = [
    'label_nr',
    'src_software_sub_type',
    'src_operating_system_name',
    'src_hardware_type',
    'response_status_code',
    'oidc_client_id',
    'oidc_scopes',
    'oidc_ui_locales',
    'loc_city',
    'loc_country_code',
    'date_weekday',
]

# Every other column of df is listed for exclusion. This is a set difference,
# so the order of the resulting list is arbitrary.
all_columns = set(df.columns.tolist())
excludes = list(all_columns - set(features))

In [6]:
# Hand the frame and the exclusion list to the project helper
# (presumably it drops those columns — confirm in du.reduce_features).
reduced_df = du.reduce_features(df, excludes)

# Report the resulting shape as a quick sanity check.
reduced_shape = reduced_df.shape
print("reduced data frame size: {}\n".format(reduced_shape))
# display(reduced_df)  # uncomment to inspect the frame itself

reduced data frame size: (12204748, 11)



In [7]:
# Number of rows per distinct value of the label column.
label_counts = reduced_df['label_nr'].value_counts()
print(label_counts)

2    7412813
0    4769169
1      22766
Name: label_nr, dtype: int64


### Build a sampled dataframe having the same anomaly rate as the original dataset and collect the metrics data


In [8]:
# Number of sample / fit / evaluate repetitions performed by the experiment loop.
iterations = 100

# metrics: one flat row of values per iteration.
# iter_values: scratch buffer holding the value groups of the current iteration.
metrics, iter_values = [], []

In [9]:
# Repeated Isolation Forest experiment.
#
# Each iteration builds a sample via the project helper, splits it 80/20, fits an
# IsolationForest on the training part and scores the test part. One flat row is
# appended to `metrics` per iteration, laid out as:
#   [values for the best thresholds] + [metrics at the best-AUC threshold]
#   + [metrics at the best-F1 threshold] + [fit time in seconds]
#
# NOTE(review): IsolationForest is created without `random_state`, so the fitted
# forests (and therefore the metrics) differ between runs of this cell.
sample_size = 200000
exclude = ['label_nr']

# Start from an empty accumulator on every run of this cell. Relying on the
# initialisation in an earlier cell meant that re-running this cell appended a
# second batch of rows to the rows of the previous run.
metrics = []

start_overall = timeit.default_timer()

for i in range(iterations):

    # Fresh per-iteration buffer, created at the top of the body so that an
    # interrupted or failed iteration cannot leak partial values into the next run.
    iter_values = []

    # NOTE(review): presumably draws a new sample on every call — confirm in
    # du.build_normal_anomaly_sample_df. `encoder` is not used in this cell.
    X_df, y_df, anomaly_rate, encoder = du.build_normal_anomaly_sample_df(reduced_df, sample_size, exclude)

    x_train, x_test, y_train, y_test = train_test_split(X_df, y_df, test_size=0.2, random_state=42)

    iforest_clf = IsolationForest(n_estimators=120, max_samples='auto', contamination=anomaly_rate)

    # Only the fit itself is timed; the labels are not passed to the model.
    start_fitting = timeit.default_timer()
    iforest_clf.fit(x_train)
    stop_fitting = timeit.default_timer()

    fit_time = stop_fitting - start_fitting
    print("{0} Time to fit: {1:.2f} seconds".format(i, fit_time))

    # Lower decision_function scores mean "more abnormal", hence the `<`
    # comparisons against the thresholds below.
    y_scores = iforest_clf.decision_function(x_test)
    y_truth = y_test.values == cfg.label_anomaly_idx

    best_res, fpr_values, tpr_values, best_values = du.calc_best_threshold(y_truth, y_scores)
    iter_values.append(best_values)

    # Metrics when predicting with the threshold stored under 'best_auc_threshold'.
    y_predictions = y_scores < best_res['best_auc_threshold']
    res, values = du.calc_metrics(y_truth, y_predictions)
    iter_values.append(values)

    # Metrics when predicting with the threshold stored under 'best_f1_score_threshold'.
    y_predictions = y_scores < best_res['best_f1_score_threshold']
    res, values = du.calc_metrics(y_truth, y_predictions)
    iter_values.append(values)

    # Flatten the three value groups into one row and append the fit time.
    flat_list = [item for sublist in iter_values for item in sublist]
    flat_list.append(np.float64(fit_time))

    metrics.append(flat_list)

stop_overall = timeit.default_timer()

overall_runtime = stop_overall - start_overall
print("Time: {0:.2f} seconds\n".format(overall_runtime))
print("Time: {0:.2f} minutes\n".format(overall_runtime/60))

0 Time to fit: 3.59 seconds
1 Time to fit: 3.59 seconds
2 Time to fit: 3.60 seconds
3 Time to fit: 3.57 seconds
4 Time to fit: 3.59 seconds
5 Time to fit: 3.59 seconds
6 Time to fit: 3.57 seconds
7 Time to fit: 3.56 seconds
8 Time to fit: 3.58 seconds
9 Time to fit: 3.57 seconds
10 Time to fit: 3.55 seconds
11 Time to fit: 3.58 seconds
12 Time to fit: 3.55 seconds
13 Time to fit: 3.56 seconds
14 Time to fit: 3.56 seconds
15 Time to fit: 3.56 seconds
16 Time to fit: 3.60 seconds
17 Time to fit: 3.57 seconds
18 Time to fit: 3.58 seconds
19 Time to fit: 3.56 seconds
20 Time to fit: 3.58 seconds
21 Time to fit: 3.55 seconds
22 Time to fit: 3.55 seconds
23 Time to fit: 3.54 seconds
24 Time to fit: 3.54 seconds
25 Time to fit: 3.56 seconds
26 Time to fit: 3.59 seconds
27 Time to fit: 3.58 seconds
28 Time to fit: 3.56 seconds
29 Time to fit: 3.55 seconds
30 Time to fit: 3.55 seconds
31 Time to fit: 3.55 seconds
32 Time to fit: 3.57 seconds
33 Time to fit: 3.54 seconds
34 Time to fit: 3.57 sec

### Calculate Metrics

In [11]:
# Turn the collected per-iteration rows into a DataFrame via the project helper.
# The CSV path is passed through to the helper (presumably where the frame is
# written — confirm in du.create_metric_df).
metrics_csv_path = './isolation_forest_metrics.csv'
metrics_df = du.create_metric_df(metrics, metrics_csv_path)

# Summary statistics over all iterations; last expression so it renders as a table.
metrics_df.describe()

Unnamed: 0,best_f1_score_threshold,total,tot_anomaly,tot_normal,tpr_recall,precision,accuracy,f1_score,fnr,tnr,fpr,t_pos,f_pos,f_neg,t_neg,auc,time_to_fit
count,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0
mean,-0.023786,40000.0,191.11,39808.89,0.800934,0.955242,0.998863,0.870631,0.199066,0.999812,0.000188,153.1,7.48,38.01,39801.41,0.900373,3.578659
std,0.010791,0.0,11.979692,11.979692,0.027137,0.044119,0.000245,0.026137,0.027137,0.0002,0.0002,11.310583,7.972782,5.547763,14.073469,0.013573,0.028633
min,-0.05138,40000.0,166.0,39774.0,0.720588,0.722222,0.997675,0.736544,0.127551,0.998745,0.0,129.0,0.0,25.0,39761.0,0.860219,3.512977
25%,-0.030075,40000.0,183.0,39802.0,0.787009,0.942855,0.998725,0.85833,0.182636,0.999768,7.5e-05,146.0,3.0,34.0,39792.75,0.893414,3.557313
50%,-0.022382,40000.0,192.0,39808.0,0.804613,0.967837,0.9989,0.875708,0.195387,0.999874,0.000126,153.5,5.0,37.0,39802.0,0.902175,3.575334
75%,-0.017317,40000.0,198.0,39817.0,0.817364,0.982832,0.999031,0.88654,0.212991,0.999925,0.000232,160.25,9.25,41.0,39810.25,0.90858,3.596072
max,0.002986,40000.0,226.0,39834.0,0.872449,1.0,0.9993,0.924324,0.279412,1.0,0.001255,180.0,50.0,57.0,39834.0,0.936187,3.656407
