In [1]:
import os.path

from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import LinearSVC
from sklearn.calibration import CalibratedClassifierCV
from smote_variants import MLPClassifierWrapper

import smote_variants as sv

import pandas as pd
import numpy as np

import imbalanced_databases as imbd
import os, pickle, itertools

In [2]:
# the evaluation procedure uses a directory for caching

cache_path= os.path.join(os.path.expanduser('~'), 'smote_test')

if not os.path.exists(cache_path):
    os.makedirs(cache_path)

In [3]:
# instantiate classifiers
sv_classifiers= [CalibratedClassifierCV(LinearSVC(C=1.0, penalty='l1', loss= 'squared_hinge', dual= False)),
                CalibratedClassifierCV(LinearSVC(C=1.0, penalty='l2', loss= 'hinge', dual= True)),
                CalibratedClassifierCV(LinearSVC(C=1.0, penalty='l2', loss= 'squared_hinge', dual= False)),
                CalibratedClassifierCV(LinearSVC(C=10.0, penalty='l1', loss= 'squared_hinge', dual= False)),
                CalibratedClassifierCV(LinearSVC(C=10.0, penalty='l2', loss= 'hinge', dual= True)),
                CalibratedClassifierCV(LinearSVC(C=10.0, penalty='l2', loss= 'squared_hinge', dual= False))]

mlp_classifiers= []
for x in itertools.product(['relu', 'logistic'], [1.0, 0.5, 0.1]):
    mlp_classifiers.append(MLPClassifierWrapper(activation= x[0], hidden_layer_fraction= x[1]))

nn_classifiers= []
for x in itertools.product([3, 5, 7], ['uniform', 'distance'], [1, 2, 3]):
    nn_classifiers.append(KNeighborsClassifier(n_neighbors= x[0], weights= x[1], p= x[2]))

dt_classifiers= []
for x in itertools.product(['gini', 'entropy'], [None, 3, 5]):
    dt_classifiers.append(DecisionTreeClassifier(criterion= x[0], max_depth= x[1]))

classifiers= []
classifiers.extend(sv_classifiers)
classifiers.extend(mlp_classifiers)
classifiers.extend(nn_classifiers)
classifiers.extend(dt_classifiers)

In [4]:
datas = {}
datas['name'] = 'paw02a-800-7-30-BI-full'
dat = pd.read_csv('paw02a-800-7-30-BI-full.csv')
dat.head()

Unnamed: 0,dim1,dim2,output_class
0,358.0,400.0,0
1,404.0,387.0,0
2,415.0,294.0,0
3,224.0,622.0,0
4,527.0,363.0,0


In [5]:
datas['target'] = dat['output_class'].to_numpy()
dat = dat.drop('output_class', axis=1)
datas['data'] = np.array(dat.values.tolist())

In [6]:
# specifying the classifiers

knn_classifier= KNeighborsClassifier()
dt_classifier= DecisionTreeClassifier()
sv_classifier = CalibratedClassifierCV(LinearSVC())

In [7]:
samplers = [sv.SMOTE, sv.Borderline_SMOTE1, sv.Borderline_SMOTE2, sv.SMOTE_IPF]
results= sv.evaluate_oversamplers(datasets= [datas],
                                    samplers= samplers,
                                    classifiers= classifiers,
                                    cache_path= cache_path,
                                    n_jobs= 10,
                                    max_samp_par_comb=25)

2021-12-07 03:52:58,306:INFO:creating cache directory
2021-12-07 03:52:58,307:INFO:dataset: paw02a-800-7-30-BI-full, samplings_available: False, evaluations_available: False
2021-12-07 03:52:58,307:INFO:doing the folding
2021-12-07 03:52:58,308:INFO:Folding doing folding folding_paw02a-800-7-30-BI-full.pickle
2021-12-07 03:52:58,310:INFO:Folding dumping to file folding_paw02a-800-7-30-BI-full.pickle
2021-12-07 03:52:58,312:INFO:do the samplings
2021-12-07 03:52:58,312:INFO:create sampling objects, random_state: 
2021-12-07 03:52:58,313:INFO:samplers: [<class 'smote_variants._smote_variants.SMOTE'>, <class 'smote_variants._smote_variants.Borderline_SMOTE1'>, <class 'smote_variants._smote_variants.Borderline_SMOTE2'>, <class 'smote_variants._smote_variants.SMOTE_IPF'>]
2021-12-07 03:52:58,313:INFO:[{'n_neighbors': 3, 'proportion': 0.1}, {'n_neighbors': 3, 'proportion': 0.25}, {'n_neighbors': 3, 'proportion': 0.5}, {'n_neighbors': 3, 'proportion': 0.75}, {'n_neighbors': 3, 'proportion': 1

["('paw02a-800-7-30-BI-full', OrderedDict([('classifier', DecisionTreeClassifier(random_state=2)), ('k', 3), ('n_folds', 9), ('n_neighbors', 7), ('p', 0.01), ('proportion', 2.0), ('random_state', None), ('voting', 'consensus')]), 'CalibratedClassifierCV', OrderedDict([('base_estimator', LinearSVC(dual=False, penalty='l1')), ('base_estimator__C', 1.0), ('base_estimator__class_weight', None), ('base_estimator__dual', False), ('base_estimator__fit_intercept', True), ('base_estimator__intercept_scaling', 1), ('base_estimator__loss', 'squared_hinge'), ('base_estimator__max_iter', 1000), ('base_estimator__multi_class', 'ovr'), ('base_estimator__penalty', 'l1'), ('base_estimator__random_state', None), ('base_estimator__tol', 0.0001), ('base_estimator__verbose', 0), ('cv', None), ('ensemble', True), ('method', 'sigmoid'), ('n_jobs', None)]))", "('paw02a-800-7-30-BI-full', OrderedDict([('classifier', DecisionTreeClassifier(random_state=2)), ('k', 3), ('n_folds', 9), ('n_neighbors', 7), ('p', 0.

2021-12-07 03:53:02,946:INFO:Sampling sampler parameter string OrderedDict([('classifier', DecisionTreeClassifier(random_state=2)), ('k', 3), ('n_folds', 9), ('n_neighbors', 3), ('p', 0.01), ('proportion', 1.0), ('random_state', None), ('voting', 'consensus')])
2021-12-07 03:53:02,959:INFO:{'k_neighbors': 5, 'n_neighbors': 3, 'proportion': 1.0, 'random_state': None}
2021-12-07 03:53:02,959:INFO:Sampling sampler parameter string OrderedDict([('k_neighbors', 5), ('n_neighbors', 3), ('proportion', 1.0), ('random_state', None)])
2021-12-07 03:53:02,968:INFO:{'k_neighbors': 7, 'n_neighbors': 3, 'proportion': 1.0, 'random_state': None}
2021-12-07 03:53:02,972:INFO:Sampling sampler parameter string OrderedDict([('k_neighbors', 7), ('n_neighbors', 3), ('proportion', 1.0), ('random_state', None)])
2021-12-07 03:53:02,980:INFO:{'k_neighbors': 3, 'n_neighbors': 3, 'proportion': 1.0, 'random_state': None}
2021-12-07 03:53:02,989:INFO:Sampling sampler parameter string OrderedDict([('k_neighbors', 3

["('paw02a-800-7-30-BI-full', OrderedDict([('classifier', DecisionTreeClassifier(random_state=2)), ('k', 3), ('n_folds', 9), ('n_neighbors', 3), ('p', 0.01), ('proportion', 1.0), ('random_state', None), ('voting', 'consensus')]), 'CalibratedClassifierCV', OrderedDict([('base_estimator', LinearSVC(dual=False, penalty='l1')), ('base_estimator__C', 1.0), ('base_estimator__class_weight', None), ('base_estimator__dual', False), ('base_estimator__fit_intercept', True), ('base_estimator__intercept_scaling', 1), ('base_estimator__loss', 'squared_hinge'), ('base_estimator__max_iter', 1000), ('base_estimator__multi_class', 'ovr'), ('base_estimator__penalty', 'l1'), ('base_estimator__random_state', None), ('base_estimator__tol', 0.0001), ('base_estimator__verbose', 0), ('cv', None), ('ensemble', True), ('method', 'sigmoid'), ('n_jobs', None)]))", "('paw02a-800-7-30-BI-full', OrderedDict([('classifier', DecisionTreeClassifier(random_state=2)), ('k', 3), ('n_folds', 9), ('n_neighbors', 3), ('p', 0.

2021-12-07 03:53:03,181:INFO:Sampling sampler parameter string OrderedDict([('n_neighbors', 3), ('proportion', 0.75), ('random_state', None)])
2021-12-07 03:53:03,193:INFO:{'classifier': DecisionTreeClassifier(random_state=2), 'k': 3, 'n_folds': 9, 'n_neighbors': 3, 'p': 0.01, 'proportion': 0.5, 'voting': 'consensus', 'random_state': None}
2021-12-07 03:53:03,194:INFO:Sampling sampler parameter string OrderedDict([('classifier', DecisionTreeClassifier(random_state=2)), ('k', 3), ('n_folds', 9), ('n_neighbors', 3), ('p', 0.01), ('proportion', 0.5), ('random_state', None), ('voting', 'consensus')])
2021-12-07 03:53:03,205:INFO:{'classifier': DecisionTreeClassifier(random_state=2), 'k': 3, 'n_folds': 9, 'n_neighbors': 5, 'p': 0.01, 'proportion': 0.5, 'voting': 'majority', 'random_state': None}
2021-12-07 03:53:03,207:INFO:Sampling sampler parameter string OrderedDict([('classifier', DecisionTreeClassifier(random_state=2)), ('k', 3), ('n_folds', 9), ('n_neighbors', 5), ('p', 0.01), ('propo

["('paw02a-800-7-30-BI-full', OrderedDict([('n_neighbors', 3), ('proportion', 0.75), ('random_state', None)]), 'CalibratedClassifierCV', OrderedDict([('base_estimator', LinearSVC(dual=False, penalty='l1')), ('base_estimator__C', 1.0), ('base_estimator__class_weight', None), ('base_estimator__dual', False), ('base_estimator__fit_intercept', True), ('base_estimator__intercept_scaling', 1), ('base_estimator__loss', 'squared_hinge'), ('base_estimator__max_iter', 1000), ('base_estimator__multi_class', 'ovr'), ('base_estimator__penalty', 'l1'), ('base_estimator__random_state', None), ('base_estimator__tol', 0.0001), ('base_estimator__verbose', 0), ('cv', None), ('ensemble', True), ('method', 'sigmoid'), ('n_jobs', None)]))", "('paw02a-800-7-30-BI-full', OrderedDict([('n_neighbors', 3), ('proportion', 0.75), ('random_state', None)]), 'CalibratedClassifierCV', OrderedDict([('base_estimator', LinearSVC(loss='hinge')), ('base_estimator__C', 1.0), ('base_estimator__class_weight', None), ('base_es

2021-12-07 03:53:03,398:INFO:Sampling sampler parameter string OrderedDict([('k_neighbors', 3), ('n_neighbors', 7), ('proportion', 0.25), ('random_state', None)])
2021-12-07 03:53:03,405:INFO:{'k_neighbors': 7, 'n_neighbors': 7, 'proportion': 0.25, 'random_state': None}
2021-12-07 03:53:03,407:INFO:Sampling sampler parameter string OrderedDict([('k_neighbors', 7), ('n_neighbors', 7), ('proportion', 0.25), ('random_state', None)])
2021-12-07 03:53:03,413:INFO:{'k_neighbors': 7, 'n_neighbors': 3, 'proportion': 0.25, 'random_state': None}
2021-12-07 03:53:03,414:INFO:Sampling sampler parameter string OrderedDict([('k_neighbors', 7), ('n_neighbors', 3), ('proportion', 0.25), ('random_state', None)])
2021-12-07 03:53:03,422:INFO:{'k_neighbors': 7, 'n_neighbors': 3, 'proportion': 0.25, 'random_state': None}
2021-12-07 03:53:03,423:INFO:Sampling sampler parameter string OrderedDict([('k_neighbors', 7), ('n_neighbors', 3), ('proportion', 0.25), ('random_state', None)])
2021-12-07 03:53:03,430:

["('paw02a-800-7-30-BI-full', OrderedDict([('k_neighbors', 3), ('n_neighbors', 7), ('proportion', 0.25), ('random_state', None)]), 'CalibratedClassifierCV', OrderedDict([('base_estimator', LinearSVC(dual=False, penalty='l1')), ('base_estimator__C', 1.0), ('base_estimator__class_weight', None), ('base_estimator__dual', False), ('base_estimator__fit_intercept', True), ('base_estimator__intercept_scaling', 1), ('base_estimator__loss', 'squared_hinge'), ('base_estimator__max_iter', 1000), ('base_estimator__multi_class', 'ovr'), ('base_estimator__penalty', 'l1'), ('base_estimator__random_state', None), ('base_estimator__tol', 0.0001), ('base_estimator__verbose', 0), ('cv', None), ('ensemble', True), ('method', 'sigmoid'), ('n_jobs', None)]))", "('paw02a-800-7-30-BI-full', OrderedDict([('k_neighbors', 3), ('n_neighbors', 7), ('proportion', 0.25), ('random_state', None)]), 'CalibratedClassifierCV', OrderedDict([('base_estimator', LinearSVC(loss='hinge')), ('base_estimator__C', 1.0), ('base_es

2021-12-07 03:53:03,601:INFO:Evaluation mkl thread number set to 1 successfully
2021-12-07 03:53:03,602:INFO:Sampling mkl thread number set to 1 successfully
2021-12-07 03:53:03,606:INFO:Evaluation mkl thread number set to 1 successfully
2021-12-07 03:53:03,606:INFO:Sampling mkl thread number set to 1 successfully
2021-12-07 03:53:03,606:INFO:Evaluation do the evaluation paw02a-800-7-30-BI-full Borderline_SMOTE2 CalibratedClassifierCV
2021-12-07 03:53:03,609:INFO:Evaluation mkl thread number set to 1 successfully
2021-12-07 03:53:03,610:INFO:Sampling mkl thread number set to 1 successfully
2021-12-07 03:53:03,610:INFO:Evaluation do the evaluation paw02a-800-7-30-BI-full Borderline_SMOTE1 CalibratedClassifierCV
2021-12-07 03:53:03,614:INFO:Evaluation do the evaluation paw02a-800-7-30-BI-full Borderline_SMOTE1 CalibratedClassifierCV
2021-12-07 03:53:03,614:INFO:Evaluation mkl thread number set to 1 successfully
2021-12-07 03:53:03,615:INFO:Sampling mkl thread number set to 1 successfully

In [8]:
highest_auc_score= results['auc'].idxmax()
print(highest_auc_score)

9


In [9]:
print(results.columns)
print("------------------------")
print(results)

Index(['db_name', 'classifier', 'sampler', 'auc', 'auc_mean', 'auc_std',
       'brier', 'acc', 'f1', 'p_top20', 'gacc', 'runtime', 'db_size',
       'db_n_attr', 'imbalanced_ratio', 'sampler_categories',
       'classifier_parameters_auc', 'classifier_parameters_acc',
       'classifier_parameters_gacc', 'classifier_parameters_f1',
       'classifier_parameters_p_top20', 'classifier_parameters_brier',
       'sampler_parameters_auc', 'sampler_parameters_acc',
       'sampler_parameters_gacc', 'sampler_parameters_f1',
       'sampler_parameters_p_top20', 'sampler_parameters_brier'],
      dtype='object')
------------------------
                    db_name              classifier            sampler  \
0   paw02a-800-7-30-BI-full  CalibratedClassifierCV  Borderline_SMOTE1   
1   paw02a-800-7-30-BI-full  CalibratedClassifierCV  Borderline_SMOTE2   
2   paw02a-800-7-30-BI-full  CalibratedClassifierCV              SMOTE   
3   paw02a-800-7-30-BI-full  CalibratedClassifierCV          SMOTE_

In [10]:
print(results[['db_name', 'classifier', 'sampler', 'auc']].to_html('paw.html'))

None
