In [1]:
import os.path

from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import LinearSVC
from sklearn.calibration import CalibratedClassifierCV
from smote_variants import MLPClassifierWrapper

import smote_variants as sv

import pandas as pd
import numpy as np

import imbalanced_databases as imbd
import os, pickle, itertools

In [2]:
# The evaluation procedure uses a directory for caching its intermediate
# results; it lives in a `smote_test` folder under the user's home directory.
cache_path = os.path.join(os.path.expanduser('~'), 'smote_test')

# exist_ok=True makes the creation atomic with respect to the existence check:
# no failure if the directory already exists or is created concurrently.
os.makedirs(cache_path, exist_ok=True)

In [3]:
# Instantiate the pool of candidate classifiers. Every hyper-parameter
# combination listed below becomes one estimator in `classifiers`
# (6 linear SVMs + 6 MLPs + 18 k-NN + 6 decision trees = 36 estimators).

# Linear SVMs wrapped in CalibratedClassifierCV. For each C value the three
# (penalty, loss, dual) settings are instantiated in the order given.
sv_classifiers = [
    CalibratedClassifierCV(LinearSVC(C=c, penalty=penalty, loss=loss, dual=dual))
    for c in (1.0, 10.0)
    for penalty, loss, dual in (('l1', 'squared_hinge', False),
                                ('l2', 'hinge', True),
                                ('l2', 'squared_hinge', False))
]

# MLPs: activation function x hidden layer fraction.
mlp_classifiers = [
    MLPClassifierWrapper(activation=activation, hidden_layer_fraction=fraction)
    for activation, fraction in itertools.product(['relu', 'logistic'], [1.0, 0.5, 0.1])
]

# k-NN: number of neighbors x weighting scheme x Minkowski power parameter p.
nn_classifiers = [
    KNeighborsClassifier(n_neighbors=k, weights=weights, p=p)
    for k, weights, p in itertools.product([3, 5, 7], ['uniform', 'distance'], [1, 2, 3])
]

# Decision trees: split criterion x maximum depth (None = no explicit limit passed).
dt_classifiers = [
    DecisionTreeClassifier(criterion=criterion, max_depth=depth)
    for criterion, depth in itertools.product(['gini', 'entropy'], [None, 3, 5])
]

# Flat list used by the evaluation; family order: SVM, MLP, k-NN, decision tree.
classifiers = sv_classifiers + mlp_classifiers + nn_classifiers + dt_classifiers

In [4]:
# Dataset description dict handed to the evaluation later on; it starts with
# the dataset name only, the 'target' and 'data' entries are filled in afterwards.
datas = dict(name='04clover5z-800-7-30-BI-full')

# Raw table read from the CSV file located in the current working directory.
dat = pd.read_csv('04clover5z-800-7-30-BI-full.csv')

# Preview the first rows.
dat.head()

Unnamed: 0,dim1,dim2,output_class
0,71.0,76.0,0
1,133.0,114.0,0
2,175.0,87.0,0
3,168.0,114.0,0
4,-87.0,-39.0,0


In [5]:
# Split the raw table into class labels and feature matrix.
# `dat` is deliberately NOT rebound to the label-free frame: that way the cell
# stays idempotent and can be re-run without a KeyError on 'output_class'.
datas['target'] = dat['output_class'].to_numpy()

# All remaining columns are used as features, converted straight to a NumPy array.
datas['data'] = dat.drop(columns='output_class').to_numpy()

In [6]:
# Default-configured classifiers, one per family: k-NN, decision tree and a
# calibrated linear SVM.
# NOTE(review): the evaluation call visible in this notebook passes the
# `classifiers` list rather than these three objects — confirm whether this
# cell is still needed.
knn_classifier, dt_classifier, sv_classifier = (
    KNeighborsClassifier(),
    DecisionTreeClassifier(),
    CalibratedClassifierCV(LinearSVC()),
)

In [7]:
# Oversampling techniques to compare (classes from smote_variants, not instances).
samplers = [
    sv.SMOTE,
    sv.Borderline_SMOTE1,
    sv.Borderline_SMOTE2,
    sv.SMOTE_IPF,
]

# Settings of the evaluation run, collected in one place.
# NOTE(review): n_jobs is presumably the number of parallel workers and
# max_samp_par_comb presumably caps the sampler parameter combinations tried
# per sampler — confirm against the smote_variants documentation.
evaluation_settings = dict(
    datasets=[datas],
    samplers=samplers,
    classifiers=classifiers,
    cache_path=cache_path,
    n_jobs=10,
    max_samp_par_comb=25,
)

# Evaluate every sampler/classifier pairing on the dataset; intermediate
# artifacts go to `cache_path`.
results = sv.evaluate_oversamplers(**evaluation_settings)

2021-12-07 03:39:43,610:INFO:creating cache directory
2021-12-07 03:39:43,611:INFO:dataset: 04clover5z-800-7-30-BI-full, samplings_available: False, evaluations_available: False
2021-12-07 03:39:43,612:INFO:doing the folding
2021-12-07 03:39:43,612:INFO:Folding doing folding folding_04clover5z-800-7-30-BI-full.pickle
2021-12-07 03:39:43,615:INFO:Folding dumping to file folding_04clover5z-800-7-30-BI-full.pickle
2021-12-07 03:39:43,616:INFO:do the samplings
2021-12-07 03:39:43,616:INFO:create sampling objects, random_state: 
2021-12-07 03:39:43,617:INFO:samplers: [<class 'smote_variants._smote_variants.SMOTE'>, <class 'smote_variants._smote_variants.Borderline_SMOTE1'>, <class 'smote_variants._smote_variants.Borderline_SMOTE2'>, <class 'smote_variants._smote_variants.SMOTE_IPF'>]
2021-12-07 03:39:43,617:INFO:[{'n_neighbors': 3, 'proportion': 0.1}, {'n_neighbors': 3, 'proportion': 0.25}, {'n_neighbors': 3, 'proportion': 0.5}, {'n_neighbors': 3, 'proportion': 0.75}, {'n_neighbors': 3, 'pr

["('04clover5z-800-7-30-BI-full', OrderedDict([('classifier', DecisionTreeClassifier(random_state=2)), ('k', 3), ('n_folds', 9), ('n_neighbors', 5), ('p', 0.01), ('proportion', 2.0), ('random_state', None), ('voting', 'consensus')]), 'CalibratedClassifierCV', OrderedDict([('base_estimator', LinearSVC(dual=False, penalty='l1')), ('base_estimator__C', 1.0), ('base_estimator__class_weight', None), ('base_estimator__dual', False), ('base_estimator__fit_intercept', True), ('base_estimator__intercept_scaling', 1), ('base_estimator__loss', 'squared_hinge'), ('base_estimator__max_iter', 1000), ('base_estimator__multi_class', 'ovr'), ('base_estimator__penalty', 'l1'), ('base_estimator__random_state', None), ('base_estimator__tol', 0.0001), ('base_estimator__verbose', 0), ('cv', None), ('ensemble', True), ('method', 'sigmoid'), ('n_jobs', None)]))", "('04clover5z-800-7-30-BI-full', OrderedDict([('classifier', DecisionTreeClassifier(random_state=2)), ('k', 3), ('n_folds', 9), ('n_neighbors', 5), 

2021-12-07 03:39:48,521:INFO:Sampling sampler parameter string OrderedDict([('n_neighbors', 7), ('proportion', 1.5), ('random_state', None)])
2021-12-07 03:39:48,528:INFO:{'n_neighbors': 5, 'proportion': 1.5, 'random_state': None}
2021-12-07 03:39:48,528:INFO:Sampling sampler parameter string OrderedDict([('n_neighbors', 5), ('proportion', 1.5), ('random_state', None)])
2021-12-07 03:39:48,534:INFO:{'classifier': DecisionTreeClassifier(random_state=2), 'k': 3, 'n_folds': 9, 'n_neighbors': 5, 'p': 0.01, 'proportion': 1.0, 'voting': 'majority', 'random_state': None}
2021-12-07 03:39:48,534:INFO:Sampling sampler parameter string OrderedDict([('classifier', DecisionTreeClassifier(random_state=2)), ('k', 3), ('n_folds', 9), ('n_neighbors', 5), ('p', 0.01), ('proportion', 1.0), ('random_state', None), ('voting', 'majority')])
2021-12-07 03:39:48,544:INFO:{'classifier': DecisionTreeClassifier(random_state=2), 'k': 3, 'n_folds': 9, 'n_neighbors': 3, 'p': 0.01, 'proportion': 1.0, 'voting': 'maj

["('04clover5z-800-7-30-BI-full', OrderedDict([('n_neighbors', 7), ('proportion', 1.5), ('random_state', None)]), 'CalibratedClassifierCV', OrderedDict([('base_estimator', LinearSVC(dual=False, penalty='l1')), ('base_estimator__C', 1.0), ('base_estimator__class_weight', None), ('base_estimator__dual', False), ('base_estimator__fit_intercept', True), ('base_estimator__intercept_scaling', 1), ('base_estimator__loss', 'squared_hinge'), ('base_estimator__max_iter', 1000), ('base_estimator__multi_class', 'ovr'), ('base_estimator__penalty', 'l1'), ('base_estimator__random_state', None), ('base_estimator__tol', 0.0001), ('base_estimator__verbose', 0), ('cv', None), ('ensemble', True), ('method', 'sigmoid'), ('n_jobs', None)]))", "('04clover5z-800-7-30-BI-full', OrderedDict([('n_neighbors', 7), ('proportion', 1.5), ('random_state', None)]), 'CalibratedClassifierCV', OrderedDict([('base_estimator', LinearSVC(loss='hinge')), ('base_estimator__C', 1.0), ('base_estimator__class_weight', None), ('b

2021-12-07 03:39:48,775:INFO:{'k_neighbors': 7, 'n_neighbors': 7, 'proportion': 0.5, 'random_state': None}
2021-12-07 03:39:48,776:INFO:Sampling sampler parameter string OrderedDict([('k_neighbors', 7), ('n_neighbors', 7), ('proportion', 0.5), ('random_state', None)])
2021-12-07 03:39:48,782:INFO:{'k_neighbors': 3, 'n_neighbors': 3, 'proportion': 0.5, 'random_state': None}
2021-12-07 03:39:48,782:INFO:Sampling sampler parameter string OrderedDict([('k_neighbors', 3), ('n_neighbors', 3), ('proportion', 0.5), ('random_state', None)])
2021-12-07 03:39:48,790:INFO:{'k_neighbors': 5, 'n_neighbors': 5, 'proportion': 0.5, 'random_state': None}
2021-12-07 03:39:48,791:INFO:Sampling sampler parameter string OrderedDict([('k_neighbors', 5), ('n_neighbors', 5), ('proportion', 0.5), ('random_state', None)])
2021-12-07 03:39:48,800:INFO:{'k_neighbors': 3, 'n_neighbors': 5, 'proportion': 0.5, 'random_state': None}
2021-12-07 03:39:48,801:INFO:Sampling sampler parameter string OrderedDict([('k_neighb

["('04clover5z-800-7-30-BI-full', OrderedDict([('classifier', DecisionTreeClassifier(random_state=2)), ('k', 3), ('n_folds', 9), ('n_neighbors', 7), ('p', 0.01), ('proportion', 0.5), ('random_state', None), ('voting', 'majority')]), 'CalibratedClassifierCV', OrderedDict([('base_estimator', LinearSVC(dual=False, penalty='l1')), ('base_estimator__C', 1.0), ('base_estimator__class_weight', None), ('base_estimator__dual', False), ('base_estimator__fit_intercept', True), ('base_estimator__intercept_scaling', 1), ('base_estimator__loss', 'squared_hinge'), ('base_estimator__max_iter', 1000), ('base_estimator__multi_class', 'ovr'), ('base_estimator__penalty', 'l1'), ('base_estimator__random_state', None), ('base_estimator__tol', 0.0001), ('base_estimator__verbose', 0), ('cv', None), ('ensemble', True), ('method', 'sigmoid'), ('n_jobs', None)]))", "('04clover5z-800-7-30-BI-full', OrderedDict([('classifier', DecisionTreeClassifier(random_state=2)), ('k', 3), ('n_folds', 9), ('n_neighbors', 7), (

2021-12-07 03:39:48,988:INFO:Sampling sampler parameter string OrderedDict([('n_neighbors', 7), ('proportion', 0.25), ('random_state', None)])
2021-12-07 03:39:48,998:INFO:{'n_neighbors': 5, 'proportion': 0.25, 'random_state': None}
2021-12-07 03:39:48,999:INFO:Sampling sampler parameter string OrderedDict([('n_neighbors', 5), ('proportion', 0.25), ('random_state', None)])
2021-12-07 03:39:49,010:INFO:{'classifier': DecisionTreeClassifier(random_state=2), 'k': 3, 'n_folds': 9, 'n_neighbors': 3, 'p': 0.01, 'proportion': 0.1, 'voting': 'consensus', 'random_state': None}
2021-12-07 03:39:49,011:INFO:Sampling sampler parameter string OrderedDict([('classifier', DecisionTreeClassifier(random_state=2)), ('k', 3), ('n_folds', 9), ('n_neighbors', 3), ('p', 0.01), ('proportion', 0.1), ('random_state', None), ('voting', 'consensus')])
2021-12-07 03:39:49,024:INFO:{'classifier': DecisionTreeClassifier(random_state=2), 'k': 3, 'n_folds': 9, 'n_neighbors': 7, 'p': 0.01, 'proportion': 0.1, 'voting':

["('04clover5z-800-7-30-BI-full', OrderedDict([('n_neighbors', 7), ('proportion', 0.25), ('random_state', None)]), 'CalibratedClassifierCV', OrderedDict([('base_estimator', LinearSVC(dual=False, penalty='l1')), ('base_estimator__C', 1.0), ('base_estimator__class_weight', None), ('base_estimator__dual', False), ('base_estimator__fit_intercept', True), ('base_estimator__intercept_scaling', 1), ('base_estimator__loss', 'squared_hinge'), ('base_estimator__max_iter', 1000), ('base_estimator__multi_class', 'ovr'), ('base_estimator__penalty', 'l1'), ('base_estimator__random_state', None), ('base_estimator__tol', 0.0001), ('base_estimator__verbose', 0), ('cv', None), ('ensemble', True), ('method', 'sigmoid'), ('n_jobs', None)]))", "('04clover5z-800-7-30-BI-full', OrderedDict([('n_neighbors', 7), ('proportion', 0.25), ('random_state', None)]), 'CalibratedClassifierCV', OrderedDict([('base_estimator', LinearSVC(loss='hinge')), ('base_estimator__C', 1.0), ('base_estimator__class_weight', None), (

2021-12-07 03:39:49,187:INFO:Evaluation do the evaluation 04clover5z-800-7-30-BI-full Borderline_SMOTE1 CalibratedClassifierCV
2021-12-07 03:39:49,188:INFO:Evaluation mkl thread number set to 1 successfully
2021-12-07 03:39:49,189:INFO:Sampling mkl thread number set to 1 successfully
2021-12-07 03:39:49,192:INFO:Evaluation mkl thread number set to 1 successfully
2021-12-07 03:39:49,193:INFO:Sampling mkl thread number set to 1 successfully
2021-12-07 03:39:49,193:INFO:Evaluation do the evaluation 04clover5z-800-7-30-BI-full SMOTE CalibratedClassifierCV
2021-12-07 03:39:49,197:INFO:Evaluation do the evaluation 04clover5z-800-7-30-BI-full SMOTE CalibratedClassifierCV
2021-12-07 03:39:49,197:INFO:Evaluation mkl thread number set to 1 successfully
2021-12-07 03:39:49,197:INFO:Sampling mkl thread number set to 1 successfully
2021-12-07 03:39:49,201:INFO:Evaluation do the evaluation 04clover5z-800-7-30-BI-full SMOTE CalibratedClassifierCV
  results['npv'] = results['tn']/(results['tn'] + resu

In [8]:
# Locate the best-performing row by AUC.
# NOTE: despite its name, `highest_auc_score` holds the index label of the row
# with the maximum 'auc' (the result of idxmax), not the AUC value itself.
auc_values = results['auc']
highest_auc_score = auc_values.idxmax()
print(highest_auc_score)

11


In [9]:
# Show the available result columns, a separator line, and then the full
# results table, each starting on its own line.
print(results.columns, "------------------------", results, sep="\n")

Index(['db_name', 'classifier', 'sampler', 'auc', 'auc_mean', 'auc_std',
       'brier', 'acc', 'f1', 'p_top20', 'gacc', 'runtime', 'db_size',
       'db_n_attr', 'imbalanced_ratio', 'sampler_categories',
       'classifier_parameters_auc', 'classifier_parameters_acc',
       'classifier_parameters_gacc', 'classifier_parameters_f1',
       'classifier_parameters_p_top20', 'classifier_parameters_brier',
       'sampler_parameters_auc', 'sampler_parameters_acc',
       'sampler_parameters_gacc', 'sampler_parameters_f1',
       'sampler_parameters_p_top20', 'sampler_parameters_brier'],
      dtype='object')
------------------------
                        db_name              classifier            sampler  \
0   04clover5z-800-7-30-BI-full  CalibratedClassifierCV  Borderline_SMOTE1   
1   04clover5z-800-7-30-BI-full  CalibratedClassifierCV  Borderline_SMOTE2   
2   04clover5z-800-7-30-BI-full  CalibratedClassifierCV              SMOTE   
3   04clover5z-800-7-30-BI-full  CalibratedClassifi

In [18]:
# Export the key result columns as an HTML table to 'clover.html' in the
# current working directory. When given a target path, to_html() writes the
# file and returns None, so the return value is not printed.
results[['db_name', 'classifier', 'sampler', 'auc']].to_html('clover.html')

None
