In [1]:
import numpy as np
import pandas as pd
from sklearn.ensemble import IsolationForest
from sklearn.metrics import f1_score, precision_score, recall_score
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from pyod.models.auto_encoder import AutoEncoder

from os import listdir
from os.path import isfile, join
import warnings

warnings.filterwarnings(action='ignore')

Using TensorFlow backend.


### Undersampling, Autoencoder, Isolation Forest

In [2]:
DATA_PATH = '../source_data/'
datasets = [
    'letter', 
    'cardio', 
    #'mnist',
    #'speech'
]


methods = [
    IsolationForest, 
    AutoEncoder
]

skf = StratifiedKFold(n_splits=5, random_state=179)

params = {
    'ABOD':
    {
        'contamination': [0.05, 0.1, 0.15]
    },
    'LocalOutlierFactor':
    {
        'n_neighbors': [2, 5, 10, 20],
        'contamination': ['auto'],
        'novelty': [True]
    },
    'SOD':
    {
        'contamination': [0.05, 0.1, 0.15]
    },
    'LOCI':
    {
        'contamination': [0.05, 0.1, 0.15]
    },
    'KNN':
    {
        'contamination': [0.05, 0.1, 0.15]
    },
    'SOD':
    {
        'contamination': [0.05, 0.1, 0.15]
    },
    'IsolationForest':
    {
        
    },
    'AutoEncoder':
    {
        'hidden_neurons': [32, 16, 16, 32],
        'verbose': 0
    }
}


def get_y(method_name, y):
    y_sym = y * 2 - 1
    if method_name in ['LocalOutlierFactor', 'IsolationForest']:
        return y_sym
    
    return y


for dataset in datasets:
    X = pd.read_csv(DATA_PATH + dataset + '_x.csv', header=None).values
    y = pd.read_csv(DATA_PATH + dataset + '_y.csv', header=None).values.ravel()
    
    print("dataset:", dataset)
    print("X shape:", X.shape)
    print(f"y shape: {y.size} ({round(y[y == 0].size / y.size, 2)}% outliers)")
    
    preds_df = pd.DataFrame()
    
    for method in methods:
        preds = np.zeros(y.size)
        
        method_name = method.__name__
        print("method:", method_name)
        
        for train_val_index, test_index in skf.split(X, y):
            y_cur = get_y(method_name, y)
            X_train_val, X_test = X[train_val_index], X[test_index]
            y_train_val, y_test = y_cur[train_val_index], y_cur[test_index]
            
            clf = GridSearchCV(
                method(),
                params[method_name],
                scoring='f1',
                cv=4,
                refit=True
            )
            clf.fit(X_train_val, y_train_val)
            preds[test_index] = clf.decision_function(X_test)
        
        preds_df[method_name] = preds

    preds_df.to_csv(DATA_PATH + dataset + '_preds.csv')
    print()

dataset: letter
X shape: (1600, 32)
y shape: 1600 (0.94% outliers)
method: IsolationForest
method: AutoEncoder


ValueError: Parameter values for parameter (verbose) need to be a sequence(but not a string) or np.ndarray.

In [13]:
np.unique(y_train_val)

array([0, 1])