In [1]:
import os.path

from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import LinearSVC
from sklearn.calibration import CalibratedClassifierCV
from smote_variants import MLPClassifierWrapper

import smote_variants as sv

import pandas as pd
import numpy as np

import imbalanced_databases as imbd
import os, pickle, itertools

In [2]:
# The evaluation procedure uses a directory for caching; it lives in the
# current user's home directory.
cache_path = os.path.join(os.path.expanduser('~'), 'smote_test')

# exist_ok=True keeps the cell safe to re-run and removes the window between
# a separate os.path.exists() check and the directory creation.
os.makedirs(cache_path, exist_ok=True)

In [3]:
# Build the pool of candidate classifiers, one list per model family.

# Calibrated linear SVMs: each C value paired with each
# (penalty, loss, dual) configuration, C varying slowest.
sv_classifiers = [
    CalibratedClassifierCV(LinearSVC(C=c_value, penalty=penalty, loss=loss, dual=dual))
    for c_value, (penalty, loss, dual) in itertools.product(
        [1.0, 10.0],
        [('l1', 'squared_hinge', False),
         ('l2', 'hinge', True),
         ('l2', 'squared_hinge', False)],
    )
]

# Multilayer perceptrons: activation function x hidden layer fraction.
mlp_classifiers = [
    MLPClassifierWrapper(activation=activation, hidden_layer_fraction=fraction)
    for activation, fraction in itertools.product(['relu', 'logistic'], [1.0, 0.5, 0.1])
]

# k-nearest neighbours: neighbour count x weighting scheme x Minkowski power p.
nn_classifiers = [
    KNeighborsClassifier(n_neighbors=k, weights=weighting, p=power)
    for k, weighting, power in itertools.product([3, 5, 7], ['uniform', 'distance'], [1, 2, 3])
]

# Decision trees: split criterion x maximum depth (None leaves depth unset).
dt_classifiers = [
    DecisionTreeClassifier(criterion=criterion, max_depth=depth)
    for criterion, depth in itertools.product(['gini', 'entropy'], [None, 3, 5])
]

# Single flat list handed to the evaluation, in family order: SVM, MLP, kNN, tree.
classifiers = sv_classifiers + mlp_classifiers + nn_classifiers + dt_classifiers

In [4]:
# Load the dataset; the same stem names both the dataset entry and its CSV file.
dataset_name = '03subcl5-800-7-30-BI-full'
datas = {'name': dataset_name}
dat = pd.read_csv(dataset_name + '.csv')

# Preview the first rows.
dat.head()

Unnamed: 0,dim1,dim2,output_class
0,223.0,763.0,0
1,187.0,34.0,0
2,204.0,89.0,0
3,196.0,-81.0,0
4,201.0,733.0,0


In [5]:
# Split the loaded frame into the label vector ('target') and the feature
# matrix ('data') stored in the dataset dict.
datas['target'] = dat['output_class'].to_numpy()

# Exclude the label and the 'Unnamed: 0' column from the features: the latter
# is the row index that read_csv picked up from the file, not an attribute of
# the samples, and keeping it would feed row order to the classifiers.
# `dat` itself is left untouched so this cell can be re-run without a KeyError.
# NOTE(review): results already cached under cache_path for this dataset name
# were presumably computed with the old feature matrix -- clear them before
# re-running the evaluation.
feature_frame = dat.drop(columns=['output_class', 'Unnamed: 0'], errors='ignore')
datas['data'] = feature_frame.to_numpy(dtype=float)

In [6]:
# Default-parameter instances of three classifier families.
# NOTE(review): the evaluate_oversamplers call in this notebook is given the
# `classifiers` list rather than these three objects -- confirm whether they
# are still needed.

sv_classifier = CalibratedClassifierCV(LinearSVC())
dt_classifier = DecisionTreeClassifier()
knn_classifier = KNeighborsClassifier()

In [7]:
# Oversampling techniques to compare.
samplers = [
    sv.SMOTE,
    sv.Borderline_SMOTE1,
    sv.Borderline_SMOTE2,
    sv.SMOTE_IPF,
]

# Evaluate every sampler with the classifier pool on the single dataset,
# using cache_path as the cache directory.
# NOTE(review): max_samp_par_comb presumably caps the sampler parameter
# combinations tried per sampler, and n_jobs the number of parallel workers --
# confirm against the smote_variants documentation.
results = sv.evaluate_oversamplers(
    datasets=[datas],
    samplers=samplers,
    classifiers=classifiers,
    cache_path=cache_path,
    max_samp_par_comb=25,
    n_jobs=10,
)

2021-12-07 03:34:51,307:INFO:dataset: 03subcl5-800-7-30-BI-full, samplings_available: True, evaluations_available: True
2021-12-07 03:34:51,308:INFO:doing the folding
2021-12-07 03:34:51,308:INFO:Folding reading from file folding_03subcl5-800-7-30-BI-full.pickle
2021-12-07 03:34:51,310:INFO:do the samplings
2021-12-07 03:34:51,310:INFO:create sampling objects, random_state: 
2021-12-07 03:34:51,311:INFO:samplers: [<class 'smote_variants._smote_variants.SMOTE'>, <class 'smote_variants._smote_variants.Borderline_SMOTE1'>, <class 'smote_variants._smote_variants.Borderline_SMOTE2'>, <class 'smote_variants._smote_variants.SMOTE_IPF'>]
2021-12-07 03:34:51,311:INFO:[{'n_neighbors': 3, 'proportion': 0.1}, {'n_neighbors': 3, 'proportion': 0.25}, {'n_neighbors': 3, 'proportion': 0.5}, {'n_neighbors': 3, 'proportion': 0.75}, {'n_neighbors': 3, 'proportion': 1.0}, {'n_neighbors': 3, 'proportion': 1.5}, {'n_neighbors': 3, 'proportion': 2.0}, {'n_neighbors': 5, 'proportion': 0.1}, {'n_neighbors': 5,

["('03subcl5-800-7-30-BI-full', OrderedDict([('classifier', DecisionTreeClassifier(random_state=2)), ('k', 3), ('n_folds', 9), ('n_neighbors', 7), ('p', 0.01), ('proportion', 2.0), ('random_state', None), ('voting', 'consensus')]), 'CalibratedClassifierCV', OrderedDict([('base_estimator', LinearSVC(dual=False, penalty='l1')), ('base_estimator__C', 1.0), ('base_estimator__class_weight', None), ('base_estimator__dual', False), ('base_estimator__fit_intercept', True), ('base_estimator__intercept_scaling', 1), ('base_estimator__loss', 'squared_hinge'), ('base_estimator__max_iter', 1000), ('base_estimator__multi_class', 'ovr'), ('base_estimator__penalty', 'l1'), ('base_estimator__random_state', None), ('base_estimator__tol', 0.0001), ('base_estimator__verbose', 0), ('cv', None), ('ensemble', True), ('method', 'sigmoid'), ('n_jobs', None)]))", "('03subcl5-800-7-30-BI-full', OrderedDict([('classifier', DecisionTreeClassifier(random_state=2)), ('k', 3), ('n_folds', 9), ('n_neighbors', 7), ('p'

2021-12-07 03:34:53,216:INFO:{'k_neighbors': 5, 'n_neighbors': 7, 'proportion': 1.5, 'random_state': None}
2021-12-07 03:34:53,221:INFO:Sampling sampler parameter string OrderedDict([('k_neighbors', 5), ('n_neighbors', 7), ('proportion', 1.5), ('random_state', None)])
2021-12-07 03:34:53,240:INFO:{'k_neighbors': 3, 'n_neighbors': 3, 'proportion': 1.5, 'random_state': None}
2021-12-07 03:34:53,240:INFO:Sampling sampler parameter string OrderedDict([('k_neighbors', 3), ('n_neighbors', 3), ('proportion', 1.5), ('random_state', None)])
2021-12-07 03:34:53,246:INFO:{'k_neighbors': 5, 'n_neighbors': 5, 'proportion': 1.5, 'random_state': None}
2021-12-07 03:34:53,247:INFO:Sampling sampler parameter string OrderedDict([('k_neighbors', 5), ('n_neighbors', 5), ('proportion', 1.5), ('random_state', None)])
2021-12-07 03:34:53,254:INFO:{'n_neighbors': 7, 'proportion': 1.5, 'random_state': None}
2021-12-07 03:34:53,255:INFO:Sampling sampler parameter string OrderedDict([('n_neighbors', 7), ('propor

["('03subcl5-800-7-30-BI-full', OrderedDict([('k_neighbors', 5), ('n_neighbors', 7), ('proportion', 1.5), ('random_state', None)]), 'CalibratedClassifierCV', OrderedDict([('base_estimator', LinearSVC(dual=False, penalty='l1')), ('base_estimator__C', 1.0), ('base_estimator__class_weight', None), ('base_estimator__dual', False), ('base_estimator__fit_intercept', True), ('base_estimator__intercept_scaling', 1), ('base_estimator__loss', 'squared_hinge'), ('base_estimator__max_iter', 1000), ('base_estimator__multi_class', 'ovr'), ('base_estimator__penalty', 'l1'), ('base_estimator__random_state', None), ('base_estimator__tol', 0.0001), ('base_estimator__verbose', 0), ('cv', None), ('ensemble', True), ('method', 'sigmoid'), ('n_jobs', None)]))", "('03subcl5-800-7-30-BI-full', OrderedDict([('k_neighbors', 5), ('n_neighbors', 7), ('proportion', 1.5), ('random_state', None)]), 'CalibratedClassifierCV', OrderedDict([('base_estimator', LinearSVC(loss='hinge')), ('base_estimator__C', 1.0), ('base_

2021-12-07 03:34:53,448:INFO:Sampling sampler parameter string OrderedDict([('classifier', DecisionTreeClassifier(random_state=2)), ('k', 3), ('n_folds', 9), ('n_neighbors', 3), ('p', 0.01), ('proportion', 0.75), ('random_state', None), ('voting', 'consensus')])
2021-12-07 03:34:53,462:INFO:{'classifier': DecisionTreeClassifier(random_state=2), 'k': 3, 'n_folds': 9, 'n_neighbors': 5, 'p': 0.01, 'proportion': 0.75, 'voting': 'majority', 'random_state': None}
2021-12-07 03:34:53,463:INFO:Sampling sampler parameter string OrderedDict([('classifier', DecisionTreeClassifier(random_state=2)), ('k', 3), ('n_folds', 9), ('n_neighbors', 5), ('p', 0.01), ('proportion', 0.75), ('random_state', None), ('voting', 'majority')])
2021-12-07 03:34:53,474:INFO:{'classifier': DecisionTreeClassifier(random_state=2), 'k': 3, 'n_folds': 9, 'n_neighbors': 5, 'p': 0.01, 'proportion': 0.75, 'voting': 'consensus', 'random_state': None}
2021-12-07 03:34:53,475:INFO:Sampling sampler parameter string OrderedDict([

["('03subcl5-800-7-30-BI-full', OrderedDict([('classifier', DecisionTreeClassifier(random_state=2)), ('k', 3), ('n_folds', 9), ('n_neighbors', 3), ('p', 0.01), ('proportion', 0.75), ('random_state', None), ('voting', 'consensus')]), 'CalibratedClassifierCV', OrderedDict([('base_estimator', LinearSVC(dual=False, penalty='l1')), ('base_estimator__C', 1.0), ('base_estimator__class_weight', None), ('base_estimator__dual', False), ('base_estimator__fit_intercept', True), ('base_estimator__intercept_scaling', 1), ('base_estimator__loss', 'squared_hinge'), ('base_estimator__max_iter', 1000), ('base_estimator__multi_class', 'ovr'), ('base_estimator__penalty', 'l1'), ('base_estimator__random_state', None), ('base_estimator__tol', 0.0001), ('base_estimator__verbose', 0), ('cv', None), ('ensemble', True), ('method', 'sigmoid'), ('n_jobs', None)]))", "('03subcl5-800-7-30-BI-full', OrderedDict([('classifier', DecisionTreeClassifier(random_state=2)), ('k', 3), ('n_folds', 9), ('n_neighbors', 3), ('p

2021-12-07 03:34:53,662:INFO:Sampling sampler parameter string OrderedDict([('k_neighbors', 5), ('n_neighbors', 3), ('proportion', 0.5), ('random_state', None)])
2021-12-07 03:34:53,671:INFO:{'k_neighbors': 5, 'n_neighbors': 7, 'proportion': 0.5, 'random_state': None}
2021-12-07 03:34:53,673:INFO:Sampling sampler parameter string OrderedDict([('k_neighbors', 5), ('n_neighbors', 7), ('proportion', 0.5), ('random_state', None)])
2021-12-07 03:34:53,687:INFO:{'n_neighbors': 3, 'proportion': 0.5, 'random_state': None}
2021-12-07 03:34:53,688:INFO:Sampling sampler parameter string OrderedDict([('n_neighbors', 3), ('proportion', 0.5), ('random_state', None)])
2021-12-07 03:34:53,699:INFO:{'n_neighbors': 7, 'proportion': 0.5, 'random_state': None}
2021-12-07 03:34:53,700:INFO:Sampling sampler parameter string OrderedDict([('n_neighbors', 7), ('proportion', 0.5), ('random_state', None)])
2021-12-07 03:34:53,710:INFO:{'n_neighbors': 5, 'proportion': 0.5, 'random_state': None}
2021-12-07 03:34:5

["('03subcl5-800-7-30-BI-full', OrderedDict([('k_neighbors', 5), ('n_neighbors', 3), ('proportion', 0.5), ('random_state', None)]), 'CalibratedClassifierCV', OrderedDict([('base_estimator', LinearSVC(dual=False, penalty='l1')), ('base_estimator__C', 1.0), ('base_estimator__class_weight', None), ('base_estimator__dual', False), ('base_estimator__fit_intercept', True), ('base_estimator__intercept_scaling', 1), ('base_estimator__loss', 'squared_hinge'), ('base_estimator__max_iter', 1000), ('base_estimator__multi_class', 'ovr'), ('base_estimator__penalty', 'l1'), ('base_estimator__random_state', None), ('base_estimator__tol', 0.0001), ('base_estimator__verbose', 0), ('cv', None), ('ensemble', True), ('method', 'sigmoid'), ('n_jobs', None)]))", "('03subcl5-800-7-30-BI-full', OrderedDict([('k_neighbors', 5), ('n_neighbors', 3), ('proportion', 0.5), ('random_state', None)]), 'CalibratedClassifierCV', OrderedDict([('base_estimator', LinearSVC(loss='hinge')), ('base_estimator__C', 1.0), ('base_

2021-12-07 03:34:53,875:INFO:Sampling sampler parameter string OrderedDict([('k_neighbors', 3), ('n_neighbors', 3), ('proportion', 0.1), ('random_state', None)])
2021-12-07 03:34:53,882:INFO:{'k_neighbors': 7, 'n_neighbors': 7, 'proportion': 0.1, 'random_state': None}
2021-12-07 03:34:53,883:INFO:Sampling sampler parameter string OrderedDict([('k_neighbors', 7), ('n_neighbors', 7), ('proportion', 0.1), ('random_state', None)])
2021-12-07 03:34:53,890:INFO:{'k_neighbors': 3, 'n_neighbors': 5, 'proportion': 0.1, 'random_state': None}
2021-12-07 03:34:53,891:INFO:Sampling sampler parameter string OrderedDict([('k_neighbors', 3), ('n_neighbors', 5), ('proportion', 0.1), ('random_state', None)])
2021-12-07 03:34:53,897:INFO:{'k_neighbors': 7, 'n_neighbors': 3, 'proportion': 0.1, 'random_state': None}
2021-12-07 03:34:53,898:INFO:Sampling sampler parameter string OrderedDict([('k_neighbors', 7), ('n_neighbors', 3), ('proportion', 0.1), ('random_state', None)])
2021-12-07 03:34:53,904:INFO:{'

["('03subcl5-800-7-30-BI-full', OrderedDict([('k_neighbors', 3), ('n_neighbors', 3), ('proportion', 0.1), ('random_state', None)]), 'CalibratedClassifierCV', OrderedDict([('base_estimator', LinearSVC(dual=False, penalty='l1')), ('base_estimator__C', 1.0), ('base_estimator__class_weight', None), ('base_estimator__dual', False), ('base_estimator__fit_intercept', True), ('base_estimator__intercept_scaling', 1), ('base_estimator__loss', 'squared_hinge'), ('base_estimator__max_iter', 1000), ('base_estimator__multi_class', 'ovr'), ('base_estimator__penalty', 'l1'), ('base_estimator__random_state', None), ('base_estimator__tol', 0.0001), ('base_estimator__verbose', 0), ('cv', None), ('ensemble', True), ('method', 'sigmoid'), ('n_jobs', None)]))", "('03subcl5-800-7-30-BI-full', OrderedDict([('k_neighbors', 3), ('n_neighbors', 3), ('proportion', 0.1), ('random_state', None)]), 'CalibratedClassifierCV', OrderedDict([('base_estimator', LinearSVC(loss='hinge')), ('base_estimator__C', 1.0), ('base_

2021-12-07 03:34:54,076:INFO:Evaluation mkl thread number set to 1 successfully
2021-12-07 03:34:54,081:INFO:Evaluation mkl thread number set to 1 successfully
2021-12-07 03:34:54,084:INFO:Evaluation mkl thread number set to 1 successfully
2021-12-07 03:34:54,089:INFO:Evaluation mkl thread number set to 1 successfully
2021-12-07 03:34:54,094:INFO:Evaluation mkl thread number set to 1 successfully
2021-12-07 03:34:54,099:INFO:Evaluation mkl thread number set to 1 successfully
2021-12-07 03:34:54,104:INFO:Evaluation mkl thread number set to 1 successfully
2021-12-07 03:34:54,109:INFO:Evaluation mkl thread number set to 1 successfully
2021-12-07 03:34:54,113:INFO:Evaluation mkl thread number set to 1 successfully
2021-12-07 03:34:54,119:INFO:Evaluation mkl thread number set to 1 successfully
2021-12-07 03:34:54,122:INFO:Evaluation mkl thread number set to 1 successfully
2021-12-07 03:34:54,126:INFO:Evaluation mkl thread number set to 1 successfully
2021-12-07 03:34:54,130:INFO:Evaluation 

In [8]:
# Locate the entry of `results` with the largest 'auc' value.
# NOTE: despite its name, this variable holds the row's index label as
# returned by idxmax(), not the AUC value itself.
auc_column = results['auc']
highest_auc_score = auc_column.idxmax()
print(highest_auc_score)

7


In [9]:
# Print the available result columns, a divider line, then the results table,
# each on its own line.
print(results.columns, "------------------------", results, sep="\n")

Index(['db_name', 'classifier', 'sampler', 'auc', 'auc_mean', 'auc_std',
       'brier', 'acc', 'f1', 'p_top20', 'gacc', 'runtime', 'db_size',
       'db_n_attr', 'imbalanced_ratio', 'sampler_categories',
       'classifier_parameters_auc', 'classifier_parameters_acc',
       'classifier_parameters_gacc', 'classifier_parameters_f1',
       'classifier_parameters_p_top20', 'classifier_parameters_brier',
       'sampler_parameters_auc', 'sampler_parameters_acc',
       'sampler_parameters_gacc', 'sampler_parameters_f1',
       'sampler_parameters_p_top20', 'sampler_parameters_brier'],
      dtype='object')
------------------------
                      db_name              classifier            sampler  \
0   03subcl5-800-7-30-BI-full  CalibratedClassifierCV  Borderline_SMOTE1   
1   03subcl5-800-7-30-BI-full  CalibratedClassifierCV  Borderline_SMOTE2   
2   03subcl5-800-7-30-BI-full  CalibratedClassifierCV              SMOTE   
3   03subcl5-800-7-30-BI-full  CalibratedClassifierCV      

In [10]:
# Write the dataset / classifier / sampler / AUC columns of the results to an
# HTML table file. DataFrame.to_html returns None when it is given a target
# path, so the former print() wrapper only emitted the text "None".
summary_columns = ['db_name', 'classifier', 'sampler', 'auc']
results[summary_columns].to_html('sub-cluster.html')

None
