In [1]:
import os.path

from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import LinearSVC
from sklearn.calibration import CalibratedClassifierCV
from smote_variants import MLPClassifierWrapper

import smote_variants as sv

import pandas as pd
import numpy as np

import imbalanced_databases as imbd
import os, pickle, itertools

In [2]:
# The evaluation procedure uses a directory for caching; it is placed under
# the current user's home directory.

cache_path= os.path.join(os.path.expanduser('~'), 'smote_test')

# exist_ok=True keeps the cell safe to re-run and removes the window between
# an os.path.exists() check and the directory creation.
os.makedirs(cache_path, exist_ok=True)

In [3]:
# Build the pool of candidate classifiers. Each hyper-parameter combination
# below contributes one estimator to the final `classifiers` list
# (6 linear SVMs + 6 MLPs + 18 kNNs + 6 decision trees).

# Calibrated linear SVMs: the same three (penalty, loss, dual) settings are
# instantiated for C=1.0 first and then for C=10.0.
svc_settings= [('l1', 'squared_hinge', False),
               ('l2', 'hinge', True),
               ('l2', 'squared_hinge', False)]
sv_classifiers= [CalibratedClassifierCV(LinearSVC(C= c_value, penalty= penalty, loss= loss, dual= dual))
                 for c_value in [1.0, 10.0]
                 for penalty, loss, dual in svc_settings]

# MLP wrappers: activation function x hidden layer fraction.
mlp_classifiers= [MLPClassifierWrapper(activation= activation, hidden_layer_fraction= fraction)
                  for activation, fraction in itertools.product(['relu', 'logistic'], [1.0, 0.5, 0.1])]

# Nearest-neighbour classifiers: neighbour count x weighting scheme x `p`.
nn_classifiers= [KNeighborsClassifier(n_neighbors= n_neighbors, weights= weights, p= power)
                 for n_neighbors, weights, power in itertools.product([3, 5, 7], ['uniform', 'distance'], [1, 2, 3])]

# Decision trees: split criterion x maximum depth.
dt_classifiers= [DecisionTreeClassifier(criterion= criterion, max_depth= depth)
                 for criterion, depth in itertools.product(['gini', 'entropy'], [None, 3, 5])]

# Concatenate the families in a fixed order: SVM, MLP, kNN, decision tree.
classifiers= sv_classifiers + mlp_classifiers + nn_classifiers + dt_classifiers

In [4]:
# Start the dataset description dict and load the raw bupa-full table.
datas = {'name': 'bupa-full'}

# The first CSV column is a saved row index. Read without index_col, pandas
# exposes it as a spurious 'Unnamed: 0' column that would leak into the
# feature matrix, so it is used as the DataFrame index instead.
dat = pd.read_csv('bupa-full.csv', index_col=0)
dat.head()

Unnamed: 0,Mcv,Alkphos,Sgpt,Sgot,Gammagt,Drink,output_class
0,85.0,92.0,45.0,27.0,31.0,0.0,0
1,85.0,64.0,59.0,32.0,23.0,0.0,1
2,91.0,78.0,34.0,24.0,36.0,0.0,1
3,98.0,55.0,13.0,17.0,17.0,0.0,1
4,88.0,67.0,21.0,11.0,11.0,0.5,0


In [5]:
# Split the loaded table into the label vector and the feature matrix.
# `dat` is rebound without the label column, so the guard keeps this cell
# re-runnable: a second run would otherwise fail with a KeyError on
# dat['output_class'].
if 'output_class' in dat.columns:
    datas['target'] = dat['output_class'].to_numpy()
    dat = dat.drop(columns='output_class')

# to_numpy() mirrors how the target is extracted and avoids a round trip
# through nested Python lists.
datas['data'] = dat.to_numpy()

In [6]:
# Stand-alone classifiers created with their constructors' default arguments.
# NOTE(review): none of the visible cells reference these three names; the
# evaluation below is driven by the `classifiers` list instead.

sv_classifier = CalibratedClassifierCV(LinearSVC())
dt_classifier= DecisionTreeClassifier()
knn_classifier= KNeighborsClassifier()

In [7]:
# Oversampling techniques to compare.
samplers = [
    sv.SMOTE,
    sv.Borderline_SMOTE1,
    sv.Borderline_SMOTE2,
    sv.SMOTE_IPF,
]

# Settings for the evaluation run on the haberman dataset. Results are cached
# under `cache_path`.
# NOTE(review): max_samp_par_comb presumably caps the number of parameter
# combinations tried per sampler -- confirm against the smote_variants docs.
evaluation_settings = {
    'datasets': [imbd.load_haberman],
    'samplers': samplers,
    'classifiers': classifiers,
    'cache_path': cache_path,
    'n_jobs': 10,
    'max_samp_par_comb': 25,
}

results= sv.evaluate_oversamplers(**evaluation_settings)

2021-12-07 04:30:56,587:INFO:dataset: haberman, samplings_available: True, evaluations_available: True
2021-12-07 04:30:56,588:INFO:doing the folding
2021-12-07 04:30:56,588:INFO:Folding reading from file folding_haberman.pickle
2021-12-07 04:30:56,590:INFO:do the samplings
2021-12-07 04:30:56,590:INFO:create sampling objects, random_state: 
2021-12-07 04:30:56,591:INFO:samplers: [<class 'smote_variants._smote_variants.SMOTE'>, <class 'smote_variants._smote_variants.Borderline_SMOTE1'>, <class 'smote_variants._smote_variants.Borderline_SMOTE2'>, <class 'smote_variants._smote_variants.SMOTE_IPF'>]
2021-12-07 04:30:56,591:INFO:[{'n_neighbors': 3, 'proportion': 0.1}, {'n_neighbors': 3, 'proportion': 0.25}, {'n_neighbors': 3, 'proportion': 0.5}, {'n_neighbors': 3, 'proportion': 0.75}, {'n_neighbors': 3, 'proportion': 1.0}, {'n_neighbors': 3, 'proportion': 1.5}, {'n_neighbors': 3, 'proportion': 2.0}, {'n_neighbors': 5, 'proportion': 0.1}, {'n_neighbors': 5, 'proportion': 0.25}, {'n_neighbor

["('haberman', OrderedDict([('classifier', DecisionTreeClassifier(random_state=2)), ('k', 3), ('n_folds', 9), ('n_neighbors', 3), ('p', 0.01), ('proportion', 2.0), ('random_state', None), ('voting', 'consensus')]), 'CalibratedClassifierCV', OrderedDict([('base_estimator', LinearSVC(dual=False, penalty='l1')), ('base_estimator__C', 1.0), ('base_estimator__class_weight', None), ('base_estimator__dual', False), ('base_estimator__fit_intercept', True), ('base_estimator__intercept_scaling', 1), ('base_estimator__loss', 'squared_hinge'), ('base_estimator__max_iter', 1000), ('base_estimator__multi_class', 'ovr'), ('base_estimator__penalty', 'l1'), ('base_estimator__random_state', None), ('base_estimator__tol', 0.0001), ('base_estimator__verbose', 0), ('cv', None), ('ensemble', True), ('method', 'sigmoid'), ('n_jobs', None)]))", "('haberman', OrderedDict([('classifier', DecisionTreeClassifier(random_state=2)), ('k', 3), ('n_folds', 9), ('n_neighbors', 3), ('p', 0.01), ('proportion', 2.0), ('ra

2021-12-07 04:31:00,732:INFO:{'classifier': DecisionTreeClassifier(random_state=2), 'k': 3, 'n_folds': 9, 'n_neighbors': 7, 'p': 0.01, 'proportion': 1.0, 'voting': 'consensus', 'random_state': None}
2021-12-07 04:31:00,735:INFO:Sampling sampler parameter string OrderedDict([('classifier', DecisionTreeClassifier(random_state=2)), ('k', 3), ('n_folds', 9), ('n_neighbors', 7), ('p', 0.01), ('proportion', 1.0), ('random_state', None), ('voting', 'consensus')])
2021-12-07 04:31:00,747:INFO:{'k_neighbors': 5, 'n_neighbors': 5, 'proportion': 1.0, 'random_state': None}
2021-12-07 04:31:00,749:INFO:Sampling sampler parameter string OrderedDict([('k_neighbors', 5), ('n_neighbors', 5), ('proportion', 1.0), ('random_state', None)])
2021-12-07 04:31:00,756:INFO:{'k_neighbors': 5, 'n_neighbors': 7, 'proportion': 1.0, 'random_state': None}
2021-12-07 04:31:00,757:INFO:Sampling sampler parameter string OrderedDict([('k_neighbors', 5), ('n_neighbors', 7), ('proportion', 1.0), ('random_state', None)])
2

["('haberman', OrderedDict([('classifier', DecisionTreeClassifier(random_state=2)), ('k', 3), ('n_folds', 9), ('n_neighbors', 7), ('p', 0.01), ('proportion', 1.0), ('random_state', None), ('voting', 'consensus')]), 'CalibratedClassifierCV', OrderedDict([('base_estimator', LinearSVC(dual=False, penalty='l1')), ('base_estimator__C', 1.0), ('base_estimator__class_weight', None), ('base_estimator__dual', False), ('base_estimator__fit_intercept', True), ('base_estimator__intercept_scaling', 1), ('base_estimator__loss', 'squared_hinge'), ('base_estimator__max_iter', 1000), ('base_estimator__multi_class', 'ovr'), ('base_estimator__penalty', 'l1'), ('base_estimator__random_state', None), ('base_estimator__tol', 0.0001), ('base_estimator__verbose', 0), ('cv', None), ('ensemble', True), ('method', 'sigmoid'), ('n_jobs', None)]))", "('haberman', OrderedDict([('classifier', DecisionTreeClassifier(random_state=2)), ('k', 3), ('n_folds', 9), ('n_neighbors', 7), ('p', 0.01), ('proportion', 1.0), ('ra

2021-12-07 04:31:00,955:INFO:Sampling sampler parameter string OrderedDict([('k_neighbors', 3), ('n_neighbors', 5), ('proportion', 0.75), ('random_state', None)])
2021-12-07 04:31:00,962:INFO:{'k_neighbors': 5, 'n_neighbors': 3, 'proportion': 0.75, 'random_state': None}
2021-12-07 04:31:00,962:INFO:Sampling sampler parameter string OrderedDict([('k_neighbors', 5), ('n_neighbors', 3), ('proportion', 0.75), ('random_state', None)])
2021-12-07 04:31:00,971:INFO:{'k_neighbors': 7, 'n_neighbors': 7, 'proportion': 0.75, 'random_state': None}
2021-12-07 04:31:00,971:INFO:Sampling sampler parameter string OrderedDict([('k_neighbors', 7), ('n_neighbors', 7), ('proportion', 0.75), ('random_state', None)])
2021-12-07 04:31:00,981:INFO:{'k_neighbors': 3, 'n_neighbors': 7, 'proportion': 0.75, 'random_state': None}
2021-12-07 04:31:00,982:INFO:Sampling sampler parameter string OrderedDict([('k_neighbors', 3), ('n_neighbors', 7), ('proportion', 0.75), ('random_state', None)])
2021-12-07 04:31:00,992:

["('haberman', OrderedDict([('k_neighbors', 3), ('n_neighbors', 5), ('proportion', 0.75), ('random_state', None)]), 'CalibratedClassifierCV', OrderedDict([('base_estimator', LinearSVC(dual=False, penalty='l1')), ('base_estimator__C', 1.0), ('base_estimator__class_weight', None), ('base_estimator__dual', False), ('base_estimator__fit_intercept', True), ('base_estimator__intercept_scaling', 1), ('base_estimator__loss', 'squared_hinge'), ('base_estimator__max_iter', 1000), ('base_estimator__multi_class', 'ovr'), ('base_estimator__penalty', 'l1'), ('base_estimator__random_state', None), ('base_estimator__tol', 0.0001), ('base_estimator__verbose', 0), ('cv', None), ('ensemble', True), ('method', 'sigmoid'), ('n_jobs', None)]))", "('haberman', OrderedDict([('k_neighbors', 3), ('n_neighbors', 5), ('proportion', 0.75), ('random_state', None)]), 'CalibratedClassifierCV', OrderedDict([('base_estimator', LinearSVC(loss='hinge')), ('base_estimator__C', 1.0), ('base_estimator__class_weight', None),

2021-12-07 04:31:01,173:INFO:Sampling sampler parameter string OrderedDict([('classifier', DecisionTreeClassifier(random_state=2)), ('k', 3), ('n_folds', 9), ('n_neighbors', 3), ('p', 0.01), ('proportion', 0.25), ('random_state', None), ('voting', 'majority')])
2021-12-07 04:31:01,183:INFO:{'k_neighbors': 7, 'n_neighbors': 7, 'proportion': 0.25, 'random_state': None}
2021-12-07 04:31:01,184:INFO:Sampling sampler parameter string OrderedDict([('k_neighbors', 7), ('n_neighbors', 7), ('proportion', 0.25), ('random_state', None)])
2021-12-07 04:31:01,191:INFO:{'k_neighbors': 5, 'n_neighbors': 7, 'proportion': 0.25, 'random_state': None}
2021-12-07 04:31:01,192:INFO:Sampling sampler parameter string OrderedDict([('k_neighbors', 5), ('n_neighbors', 7), ('proportion', 0.25), ('random_state', None)])
2021-12-07 04:31:01,197:INFO:{'k_neighbors': 5, 'n_neighbors': 5, 'proportion': 0.25, 'random_state': None}
2021-12-07 04:31:01,198:INFO:Sampling sampler parameter string OrderedDict([('k_neighbor

["('haberman', OrderedDict([('classifier', DecisionTreeClassifier(random_state=2)), ('k', 3), ('n_folds', 9), ('n_neighbors', 3), ('p', 0.01), ('proportion', 0.25), ('random_state', None), ('voting', 'majority')]), 'CalibratedClassifierCV', OrderedDict([('base_estimator', LinearSVC(dual=False, penalty='l1')), ('base_estimator__C', 1.0), ('base_estimator__class_weight', None), ('base_estimator__dual', False), ('base_estimator__fit_intercept', True), ('base_estimator__intercept_scaling', 1), ('base_estimator__loss', 'squared_hinge'), ('base_estimator__max_iter', 1000), ('base_estimator__multi_class', 'ovr'), ('base_estimator__penalty', 'l1'), ('base_estimator__random_state', None), ('base_estimator__tol', 0.0001), ('base_estimator__verbose', 0), ('cv', None), ('ensemble', True), ('method', 'sigmoid'), ('n_jobs', None)]))", "('haberman', OrderedDict([('classifier', DecisionTreeClassifier(random_state=2)), ('k', 3), ('n_folds', 9), ('n_neighbors', 3), ('p', 0.01), ('proportion', 0.25), ('r

2021-12-07 04:31:01,377:INFO:Evaluation mkl thread number set to 1 successfully
2021-12-07 04:31:01,377:INFO:Sampling mkl thread number set to 1 successfully
2021-12-07 04:31:01,380:INFO:Evaluation do the evaluation haberman Borderline_SMOTE1 CalibratedClassifierCV
2021-12-07 04:31:01,381:INFO:Evaluation mkl thread number set to 1 successfully
2021-12-07 04:31:01,382:INFO:Sampling mkl thread number set to 1 successfully
2021-12-07 04:31:01,385:INFO:Evaluation mkl thread number set to 1 successfully
2021-12-07 04:31:01,385:INFO:Evaluation do the evaluation haberman Borderline_SMOTE1 CalibratedClassifierCV
2021-12-07 04:31:01,403:INFO:Evaluation mkl thread number set to 1 successfully
2021-12-07 04:31:01,404:INFO:Sampling mkl thread number set to 1 successfully
2021-12-07 04:31:01,407:INFO:Evaluation do the evaluation haberman SMOTE CalibratedClassifierCV
2021-12-07 04:31:01,607:INFO:Evaluation do the evaluation haberman SMOTE_IPF CalibratedClassifierCV
2021-12-07 04:31:01,611:INFO:Evalu

In [8]:
# Locate the best-performing row by AUC.
# NOTE: despite its name, this variable holds the row label returned by
# idxmax(), not the AUC value itself.
auc_column = results.loc[:, 'auc']
highest_auc_score= auc_column.idxmax()
print(highest_auc_score)

1


In [9]:
# Show the available result columns, a separator line, and then the full
# results table, one item per line.
print(results.columns, "------------------------", results, sep="\n")

Index(['db_name', 'classifier', 'sampler', 'auc', 'auc_mean', 'auc_std',
       'brier', 'acc', 'f1', 'p_top20', 'gacc', 'runtime', 'db_size',
       'db_n_attr', 'imbalanced_ratio', 'sampler_categories',
       'classifier_parameters_auc', 'classifier_parameters_acc',
       'classifier_parameters_gacc', 'classifier_parameters_f1',
       'classifier_parameters_p_top20', 'classifier_parameters_brier',
       'sampler_parameters_auc', 'sampler_parameters_acc',
       'sampler_parameters_gacc', 'sampler_parameters_f1',
       'sampler_parameters_p_top20', 'sampler_parameters_brier'],
      dtype='object')
------------------------
     db_name              classifier            sampler       auc  auc_mean  \
0   haberman  CalibratedClassifierCV  Borderline_SMOTE1  0.698863  0.707805   
1   haberman  CalibratedClassifierCV  Borderline_SMOTE2  0.702728  0.708017   
2   haberman  CalibratedClassifierCV              SMOTE  0.696577  0.706275   
3   haberman  CalibratedClassifierCV          S

In [10]:
# Write the per-sampler AUC summary to an HTML file. DataFrame.to_html()
# returns None when given a target path, so wrapping it in print() only
# emitted a stray "None"; the call is made directly instead.
results[['db_name', 'classifier', 'sampler', 'auc']].to_html('haberman.html')

None
