## Extended Isolation Forest Benchmark

The H2O cluster is initialized before every run and shut down after every run. The algorithm is benchmarked first on training performance and then on evaluation performance.

* N = number of rows
* P = number of columns
* sample_size = number of rows used to build each tree
* max_depth = maximum depth of a tree; a parameter of IF only. EIF always uses math.ceil(math.log(sample_size, 2)), so in this benchmark max_depth for IF is always derived from sample_size in the same way

Computer parameters:
 * Lenovo ThinkPad P53,
 * MS Windows 10 Pro x64,
 * Intel Core i7-9850H CPU @ 2.60GHz,
 * 6 cores and 12 threads,
 * 96.0 GB RAM.

In [None]:
import sys
sys.path.append("D:/skola/dip/h2o-3/h2o-py/build/main") # path to h2o build

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sb
import time
import math
import h2o
from h2o.estimators import H2OIsolationForestEstimator
from h2o.estimators import H2OExtendedIsolationForestEstimator
from sklearn.ensemble import IsolationForest
from sklearn.datasets import make_blobs

In [None]:
# Notebook parameters shared by every benchmark cell below.

seed = 1234  # random seed passed to make_blobs and to both estimators
ntrees = 100  # number of trees built by each model
attempt_per_thread = 10 # number of repeated runs per thread count; the timings are averaged
threds = [12, 10, 8, 6, 4, 2, 1]  # values of nthreads to benchmark, in execution order


In [None]:
def train_models(nthreads, data):
    """Train EIF and IF once on a fresh H2O cluster and time both trainings.

    The model hyper-parameters are read from the notebook globals ``ntrees``,
    ``seed``, ``sample_size``, ``P`` and ``max_depth``.

    Args:
        nthreads: Value passed to ``h2o.init(nthreads=...)``.
        data: Training data; anything ``h2o.H2OFrame`` accepts.

    Returns:
        Tuple ``(eif_time, if_time)`` with the elapsed seconds for building
        and training the Extended Isolation Forest and the Isolation Forest.
    """
    h2o.init(nthreads=nthreads)
    try:
        hf = h2o.H2OFrame(data)

        # perf_counter is a monotonic, high-resolution clock intended for
        # measuring intervals; time.time() can jump and may be coarse.
        start = time.perf_counter()
        eif_model = H2OExtendedIsolationForestEstimator(
            model_id="extended_isolation_forest.hex",
            ntrees=ntrees,
            seed=seed,
            sample_size=sample_size,
            extension_level=P - 1,
        )
        eif_model.train(training_frame=hf)
        eif_time = time.perf_counter() - start
        print(f"EIF Time: {eif_time}s")

        start = time.perf_counter()
        if_model = H2OIsolationForestEstimator(
            model_id="isolation_forest.hex",
            ntrees=ntrees,
            seed=seed,
            sample_size=sample_size,
            max_depth=max_depth,
        )
        if_model.train(training_frame=hf)
        if_time = time.perf_counter() - start
        print(f"IF Time: {if_time}s")
    finally:
        # Always stop the cluster, even when upload or training fails.
        # NOTE(review): a cluster left running would presumably be reused by
        # the next h2o.init() call, so later runs would not get the requested
        # nthreads - confirm against the h2o.init documentation.
        h2o.cluster().shutdown()
    return eif_time, if_time


def run_benchmark(data):
    """Benchmark training time of EIF and IF for every configured thread count.

    For each value in the global ``threds`` list, ``train_models`` is called
    ``attempt_per_thread`` times and the timings are averaged.

    Args:
        data: Training data forwarded unchanged to ``train_models``.

    Returns:
        Tuple ``(eif_means, if_means)``; each is a dict mapping the thread
        count to the mean training time in seconds.
    """
    eif_means = {}
    if_means = {}
    for nthreads in threds:
        times_eif = []
        times_if = []
        for _ in range(attempt_per_thread):
            eif_time, if_time = train_models(nthreads, data)
            times_eif.append(eif_time)
            times_if.append(if_time)
        # Compute each mean once and reuse it for printing and the result.
        eif_means[nthreads] = np.mean(times_eif)
        if_means[nthreads] = np.mean(times_if)
        print(f"EIF {eif_means[nthreads]}s")
        print(f"IF {if_means[nthreads]}s")

    # Final summary: one line per thread count.
    for nthreads in threds:
        print(f"{nthreads} - EIF = {eif_means[nthreads]}, IF = {if_means[nthreads]}")

    return eif_means, if_means
        
    
def plot_result(eif_means, if_means):
    """Plot mean training time against thread count for EIF and IF.

    The figure is saved as a PNG in the current working directory and then
    shown. The title and file name are built from the notebook globals ``N``,
    ``P``, ``ntrees``, ``sample_size`` and ``max_depth``.

    Args:
        eif_means: Dict mapping thread count to mean EIF training time (s).
        if_means: Dict mapping thread count to mean IF training time (s).
    """
    fig = plt.figure(figsize=(8, 10))
    fig.add_subplot(111)
    plt.plot(list(eif_means.keys()), list(eif_means.values()), '-', label="EIF", linewidth=3)
    plt.plot(list(if_means.keys()), list(if_means.values()), '-', label="IF", linewidth=3)
    plt.xlabel("Number of threads")
    plt.ylabel("Computing time (s)")
    plt.legend()
    plt.tick_params(direction='out', length=6, width=2)
    plt.title(f"Extended Isolation Forest - training benchmark\nModel: N = {N}; P = {P}; ntrees = {ntrees}; sample_size = {sample_size};  max_depth = {max_depth}")
    # The file name is keyed by the data set parameters (sample_size, N, P).
    # It previously embedded a leaked loop variable holding the last mean
    # time, which changed on every run and failed on empty input.
    plt.savefig(f"h2o-scale-perf_{sample_size}_{N}_{P}.png", bbox_inches='tight', pad_inches=.05)
    plt.show()



## Training stage
### Toy data

### Small data and small dimension

In [None]:
# Toy data parameters: small data set, low dimension.

N = 500  # number of rows
P = 2  # number of columns
sample_size = 256  # rows used to build each tree
# math.log2 is exact for powers of two; math.log(x, 2) divides two rounded
# logarithms and may land just above an integer, which ceil would bump by one.
max_depth = math.ceil(math.log2(sample_size))

# P cluster centers, all placed at the origin, with unit standard deviation.
blobs_params = dict(random_state=seed, n_samples=N, n_features=P)
X = make_blobs(centers=[[0] * P for _ in range(P)], cluster_std=1, **blobs_params)[0]

# Each result is a dict mapping thread count -> mean training time in seconds.
eif_time, if_time = run_benchmark(X)

In [None]:
# Plot and save the mean training times per thread count from the run above.
plot_result(eif_time, if_time)

### Small data and high dimension

In [None]:
# Toy data parameters: small data set, high dimension.

N = 5000  # number of rows
P = 30  # number of columns
sample_size = 256  # rows used to build each tree
# math.log2 is exact for powers of two; math.log(x, 2) divides two rounded
# logarithms and may land just above an integer, which ceil would bump by one.
max_depth = math.ceil(math.log2(sample_size))

# P cluster centers, all placed at the origin, with unit standard deviation.
blobs_params = dict(random_state=seed, n_samples=N, n_features=P)
X = make_blobs(centers=[[0] * P for _ in range(P)], cluster_std=1, **blobs_params)[0]

# Each result is a dict mapping thread count -> mean training time in seconds.
eif_time, if_time = run_benchmark(X)

In [None]:
# Plot and save the mean training times per thread count from the run above.
plot_result(eif_time, if_time)

### Big data - small dimension, small sample_size

In [None]:
# Toy data parameters: big data set, low dimension, small sample_size.

N = 1_500_000  # number of rows
P = 2  # number of columns
sample_size = 256  # rows used to build each tree
# math.log2 is exact for powers of two; math.log(x, 2) divides two rounded
# logarithms and may land just above an integer, which ceil would bump by one.
max_depth = math.ceil(math.log2(sample_size))

# P cluster centers, all placed at the origin, with unit standard deviation.
blobs_params = dict(random_state=seed, n_samples=N, n_features=P)
X = make_blobs(centers=[[0] * P for _ in range(P)], cluster_std=1, **blobs_params)[0]

# Each result is a dict mapping thread count -> mean training time in seconds.
eif_time, if_time = run_benchmark(X)

In [None]:
# Plot and save the mean training times per thread count from the run above.
plot_result(eif_time, if_time)

### Big data - high dimension, small sample_size

In [None]:
# Toy data parameters: big data set, high dimension, small sample_size.

N = 100_000  # number of rows
P = 30  # number of columns
sample_size = 256  # rows used to build each tree
# math.log2 is exact for powers of two; math.log(x, 2) divides two rounded
# logarithms and may land just above an integer, which ceil would bump by one.
max_depth = math.ceil(math.log2(sample_size))

# P cluster centers, all placed at the origin, with unit standard deviation.
blobs_params = dict(random_state=seed, n_samples=N, n_features=P)
X = make_blobs(centers=[[0] * P for _ in range(P)], cluster_std=1, **blobs_params)[0]

# Each result is a dict mapping thread count -> mean training time in seconds.
eif_time, if_time = run_benchmark(X)

In [None]:
# Plot and save the mean training times per thread count from the run above.
plot_result(eif_time, if_time)

### Big data - small dimension, big sample_size

In [None]:
# Toy data parameters: big data set, low dimension, big sample_size.

N = 1_500_000  # number of rows
P = 2  # number of columns
sample_size = 15_000  # rows used to build each tree
# math.log2 is the more accurate base-2 logarithm (exact for powers of two),
# so ceil cannot be pushed one level too high by rounding error.
max_depth = math.ceil(math.log2(sample_size))

# P cluster centers, all placed at the origin, with unit standard deviation.
blobs_params = dict(random_state=seed, n_samples=N, n_features=P)
X = make_blobs(centers=[[0] * P for _ in range(P)], cluster_std=1, **blobs_params)[0]

# Each result is a dict mapping thread count -> mean training time in seconds.
eif_time, if_time = run_benchmark(X)

In [None]:
# Plot and save the mean training times per thread count from the run above.
plot_result(eif_time, if_time)

### Big data - high dimension, big sample_size

In [None]:
# Toy data parameters: big data set, high dimension, big sample_size.

N = 100_000  # number of rows
P = 30  # number of columns
sample_size = 10_000  # rows used to build each tree
# math.log2 is the more accurate base-2 logarithm (exact for powers of two),
# so ceil cannot be pushed one level too high by rounding error.
max_depth = math.ceil(math.log2(sample_size))

# P cluster centers, all placed at the origin, with unit standard deviation.
blobs_params = dict(random_state=seed, n_samples=N, n_features=P)
X = make_blobs(centers=[[0] * P for _ in range(P)], cluster_std=1, **blobs_params)[0]

# Each result is a dict mapping thread count -> mean training time in seconds.
eif_time, if_time = run_benchmark(X)

In [None]:
# Plot and save the mean training times per thread count from the run above.
plot_result(eif_time, if_time)

### Real Credit Card Fraud Detection Data

https://www.kaggle.com/mlg-ulb/creditcardfraud

In [None]:
# Real data parameters: sample_size is 1 % of the rows.

# Read a single row first just to learn the column names, then load the file
# again without its last column.
# NOTE(review): the last column is presumably the fraud label - verify
# against the CSV.
df = pd.read_csv("creditcard.csv", nrows=1)
columns = df.columns.tolist()
cols_to_use = columns[:-1]
df = pd.read_csv("creditcard.csv", usecols=cols_to_use)


N, P = df.shape  # rows, columns
sample_size = int(N * 0.01)
# math.log2 is the more accurate base-2 logarithm (exact for powers of two),
# so ceil cannot be pushed one level too high by rounding error.
max_depth = math.ceil(math.log2(sample_size))

print(N, P, sample_size, max_depth)

In [None]:
# Benchmark training on the loaded data frame for every configured thread
# count; each result is a dict mapping thread count -> mean time in seconds.
eif_time, if_time = run_benchmark(df)

In [None]:
# Plot and save the mean training times per thread count from the run above.
plot_result(eif_time, if_time)

### Bigger sample_size

In [None]:
# Real data parameters: sample_size is 5 % of the rows.

# Read a single row first just to learn the column names, then load the file
# again without its last column.
# NOTE(review): the last column is presumably the fraud label - verify
# against the CSV.
df = pd.read_csv("creditcard.csv", nrows=1)
columns = df.columns.tolist()
cols_to_use = columns[:-1]
df = pd.read_csv("creditcard.csv", usecols=cols_to_use)


N, P = df.shape  # rows, columns
sample_size = int(N * 0.05)
# math.log2 is the more accurate base-2 logarithm (exact for powers of two),
# so ceil cannot be pushed one level too high by rounding error.
max_depth = math.ceil(math.log2(sample_size))

print(N, P, sample_size, max_depth)

In [None]:
# Benchmark training on the loaded data frame for every configured thread
# count; each result is a dict mapping thread count -> mean time in seconds.
eif_time, if_time = run_benchmark(df)

In [None]:
# Plot and save the mean training times per thread count from the run above.
plot_result(eif_time, if_time)

## Evaluation Stage

In [None]:
def run_predict(EIF_model, IF_model, hf_test):
    """Time one ``predict`` call of each model on the same test frame.

    Args:
        EIF_model: Trained Extended Isolation Forest model.
        IF_model: Trained Isolation Forest model.
        hf_test: Frame passed to both ``predict`` calls.

    Returns:
        Tuple ``(eif_time, if_time)`` with the elapsed seconds of each call.
    """
    # perf_counter is a monotonic, high-resolution clock intended for
    # measuring intervals; time.time() can jump and may be too coarse for
    # short prediction calls.
    start = time.perf_counter()
    EIF_model.predict(hf_test)
    eif_time = time.perf_counter() - start
    print(f"EIF Time: {eif_time}s")

    start = time.perf_counter()
    IF_model.predict(hf_test)
    if_time = time.perf_counter() - start
    print(f"IF Time: {if_time}s")
    return eif_time, if_time


def run_predict_benchmark(train_data, test_data):
    """Benchmark prediction time of EIF and IF for every configured thread count.

    For each value in the global ``threds`` list an H2O cluster is started,
    both models are trained once on ``train_data``, and ``run_predict`` is
    called ``attempt_per_thread`` times on ``test_data``. The model
    hyper-parameters are read from the notebook globals ``ntrees``, ``seed``,
    ``sample_size``, ``P`` and ``max_depth``.

    Args:
        train_data: Training data; anything ``h2o.H2OFrame`` accepts.
        test_data: Evaluation data; anything ``h2o.H2OFrame`` accepts.

    Returns:
        Tuple ``(eif_means, if_means)``; each is a dict mapping the thread
        count to the mean prediction time in seconds.
    """
    eif_means = {}
    if_means = {}
    for nthreads in threds:
        times_eif = []
        times_if = []
        h2o.init(nthreads=nthreads)
        try:
            hf_train = h2o.H2OFrame(train_data)
            hf_test = h2o.H2OFrame(test_data)
            EIF_model = H2OExtendedIsolationForestEstimator(
                model_id="extended_isolation_forest.hex",
                ntrees=ntrees,
                seed=seed,
                sample_size=sample_size,
                extension_level=P - 1,
            )
            EIF_model.train(training_frame=hf_train)
            IF_model = H2OIsolationForestEstimator(
                model_id="isolation_forest.hex",
                ntrees=ntrees,
                seed=seed,
                sample_size=sample_size,
                max_depth=max_depth,
            )
            IF_model.train(training_frame=hf_train)
            for _ in range(attempt_per_thread):
                eif_time, if_time = run_predict(EIF_model, IF_model, hf_test)
                times_eif.append(eif_time)
                times_if.append(if_time)
        finally:
            # Always stop the cluster, even when training or prediction fails,
            # so the next thread count starts from a fresh cluster.
            h2o.cluster().shutdown()
        eif_means[nthreads] = np.mean(times_eif)
        if_means[nthreads] = np.mean(times_if)
        print(f"EIF {eif_means[nthreads]}s")
        print(f"IF {if_means[nthreads]}s")

    # The summary and the return run only after ALL thread counts are
    # measured. They used to sit inside the loop above, where the summary
    # indexed results that did not exist yet and the return would have ended
    # the benchmark after the first thread count.
    for nthreads in threds:
        print(f"{nthreads} - EIF = {eif_means[nthreads]}, IF = {if_means[nthreads]}")

    return eif_means, if_means


def plot_predict(EIF_means, IF_means):
    """Plot mean prediction time against thread count for EIF and IF.

    The figure is saved as a PNG in the current working directory and then
    shown. The title and file name are built from the notebook globals
    ``N_train``, ``N``, ``P``, ``ntrees``, ``sample_size`` and ``max_depth``.

    Args:
        EIF_means: Dict mapping thread count to mean EIF prediction time (s).
        IF_means: Dict mapping thread count to mean IF prediction time (s).
    """
    fig = plt.figure(figsize=(8, 10))
    fig.add_subplot(111)
    plt.plot(list(EIF_means.keys()), list(EIF_means.values()), '-', label="EIF", linewidth=3)
    plt.plot(list(IF_means.keys()), list(IF_means.values()), '-', label="IF", linewidth=3)
    plt.xlabel("Number of threads")
    plt.ylabel("Computing time (s)")
    plt.legend()
    plt.tick_params(direction='out', length=6, width=2)
    plt.title(f"Extended Isolation Forest - evaluation benchmark\nModel: N = {N_train}; P = {P}; ntrees = {ntrees}; sample_size = {sample_size};  max_depth = {max_depth}\nEvaluation Frame: N = {N}; P = {P}")
    # The file name is keyed by the data set parameters (sample_size, N, P).
    # It previously embedded a leaked loop variable holding the last mean
    # time, which changed on every run and failed on empty input.
    plt.savefig(f"h2o-eval-perf_{sample_size}_{N}_{P}.png", bbox_inches='tight', pad_inches=.05)
    plt.show()

In [None]:
# Evaluation stage parameters

N = 100_000  # number of rows in the evaluation frame
P = 30  # number of columns
N_train = 500_000  # number of rows in the training frame
sample_size = 10_000  # rows used to build each tree
# math.log2 is the more accurate base-2 logarithm (exact for powers of two),
# so ceil cannot be pushed one level too high by rounding error.
max_depth = math.ceil(math.log2(sample_size))

# Training data: P cluster centers, all at the origin, unit standard deviation.
blobs_params = dict(random_state=seed, n_samples=N_train, n_features=P)
X_train = make_blobs(centers=[[0] * P for _ in range(P)], cluster_std=1, **blobs_params)[0]

# Evaluation data: same generator settings and seed, but N rows.
blobs_params = dict(random_state=seed, n_samples=N, n_features=P)
X_test = make_blobs(centers=[[0] * P for _ in range(P)], cluster_std=1, **blobs_params)[0]

# Each result is a dict mapping thread count -> mean prediction time in seconds.
eif_time, if_time = run_predict_benchmark(X_train, X_test)

In [None]:
# Plot and save the mean prediction times per thread count from the run above.
plot_predict(eif_time, if_time)