In [1]:
import numpy as np
import pandas as pd

# Read and standarize the dataset
df = pd.read_csv('data/selected_data1.csv')
#df = pd.read_csv('Z:\\Descargas\\resampled_ts_group_scalled.csv')
df = df.loc[:, ~df.columns.str.contains('^Unnamed')]
X = df.iloc[:,:-1]
y = df.iloc[:,-1]
df = pd.DataFrame(X)
df['target'] = y

# Divide dataframe depending on target
df_correct = df.loc[df['target'] == 1]
df_incorrect = df.loc[df['target'] == 0]

# Print each target shape
print('Target 1 shape: ' + str(df_correct.shape))
print('Target 0 shape: ' + str(df_incorrect.shape))

Target 1 shape: (5049, 25)
Target 0 shape: (4951, 25)


In [2]:
from sklearn.model_selection import train_test_split

df_train, df_test_correct = train_test_split(df_correct, test_size=0.2, random_state=0)
df_test_incorrect = df_incorrect

X_train = df_train.iloc[:,:-1]
X_test_correct = df_test_correct.iloc[:,:-1]
X_test_incorrect = df_test_incorrect.iloc[:,:-1]

In [3]:
from sklearn.preprocessing import StandardScaler

cols = X.columns.values

scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test_correct = scaler.transform(X_test_correct)
X_test_incorrect = scaler.transform(X_test_incorrect)

In [4]:
from sklearn.decomposition import PCA

pca = PCA(n_components=0.97).fit(X_train)
X_train = pca.transform(X_train)
X_test_correct = pca.transform(X_test_correct)
X_test_incorrect = pca.transform(X_test_incorrect)


### isolation forest

In [5]:
import collections
from sklearn.ensemble import IsolationForest

clf = IsolationForest(contamination=0.2, random_state=0, n_jobs=-1).fit(X_train)

In [6]:
collections.Counter(clf.predict(X_train))

Counter({1: 3231, -1: 808})

In [7]:
collections.Counter(clf.predict(X_test_correct))

Counter({1: 805, -1: 205})

In [8]:
collections.Counter(clf.predict(X_test_incorrect))

Counter({-1: 3181, 1: 1770})

### skop

In [9]:
X_skopt = X_test_incorrect
Y_skopt = df_test_incorrect.iloc[:,-1].replace(0, -1)

In [10]:
import skopt
from sklearn.metrics import accuracy_score
from skopt.utils import use_named_args


# The list of hyper-parameters we want to optimize. For each one we define the
# bounds, the corresponding scikit-learn parameter name, as well as how to
# sample values from that dimension (`'log-uniform'` for the learning rate)
space  = [
    skopt.space.Real(0.1, 0.9, name='contamination')
]

# this decorator allows your objective function to receive a the parameters as
# keyword arguments. This is particularly convenient when you want to set
# scikit-learn estimator parameters
@use_named_args(space)
def objective(**params):
    clf.set_params(**params)
    return -accuracy_score(Y_skopt, clf.predict(X_skopt))

In [11]:
from skopt import gp_minimize

res_gp = gp_minimize(objective, space, n_calls=50, random_state=0)



In [12]:
"Best score=%.4f" % res_gp.fun

'Best score=-0.6425'