# Evaluation of oversamplers with a set of classifiers on one database

In this notebook we give an example of optimizing oversamplers and classifiers for given dataset.

In [1]:
import os.path

from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

import smote_variants as sv

import imbalanced_databases as imbd

In [2]:
import pandas as pd
import numpy as np

In [3]:
# the evaluation procedure uses a directory for caching

cache_path= os.path.join(os.path.expanduser('~'), 'smote_test')

if not os.path.exists(cache_path):
    os.makedirs(cache_path)

In [4]:
# specifying the dataset to be used

dataset= imbd.load_glass0()

In [5]:
datas = {}
dat = pd.read_csv('bupa-full.csv')
dat.head()

Unnamed: 0,Mcv,Alkphos,Sgpt,Sgot,Gammagt,Drink,output_class
0,85.0,92.0,45.0,27.0,31.0,0.0,0
1,85.0,64.0,59.0,32.0,23.0,0.0,1
2,91.0,78.0,34.0,24.0,36.0,0.0,1
3,98.0,55.0,13.0,17.0,17.0,0.0,1
4,88.0,67.0,21.0,11.0,11.0,0.5,0


In [6]:
datas['target'] = dat['output_class'].to_numpy()

In [7]:
datas['target']

array([0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1,
       1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0,
       0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1,
       0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1,
       1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1,
       1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1,

In [8]:
type(datas['target'])

numpy.ndarray

In [9]:
type(dataset['target'])

numpy.ndarray

In [10]:
dat = dat.drop('output_class', axis=1)
print(dat.head())

    Mcv  Alkphos  Sgpt  Sgot  Gammagt  Drink
0  85.0     92.0  45.0  27.0     31.0    0.0
1  85.0     64.0  59.0  32.0     23.0    0.0
2  91.0     78.0  34.0  24.0     36.0    0.0
3  98.0     55.0  13.0  17.0     17.0    0.0
4  88.0     67.0  21.0  11.0     11.0    0.5


In [11]:
datas['data'] = np.array(dat.values.tolist())

In [12]:
datas['data']

array([[85., 92., 45., 27., 31.,  0.],
       [85., 64., 59., 32., 23.,  0.],
       [91., 78., 34., 24., 36.,  0.],
       ...,
       [82., 72., 31., 20., 84.,  3.],
       [91., 54., 25., 22., 35.,  4.],
       [95., 93., 21., 27., 47.,  6.]])

In [13]:
print(type(datas['data']))
print(type(dataset['data']))

<class 'numpy.ndarray'>
<class 'numpy.ndarray'>


In [14]:
datas['name'] = 'bupa-full'

In [15]:
datas

{'target': array([0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1,
        1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0,
        0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1,
        0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1,
        1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1,
        1, 0, 0, 0, 1, 0, 1,

In [16]:
# dataset[]

In [17]:
# specifying the classifiers

knn_classifier= KNeighborsClassifier()
dt_classifier= DecisionTreeClassifier()

In [24]:
# executing the evaluation using 5 parallel jobs and at most 35 random but meaningful parameter combinations
# with the 5 quickest oversamplers

results= sv.evaluate_oversamplers(datasets= [datas],
                                    samplers= [sv.SMOTE, sv.SMOTE_IPF],
                                    classifiers= [knn_classifier],
                                    cache_path= cache_path,
                                    n_jobs= 5,
                                    max_samp_par_comb= 35)

2021-12-06 11:03:49,101:INFO:dataset: bupa-full, samplings_available: True, evaluations_available: True
2021-12-06 11:03:49,101:INFO:doing the folding
2021-12-06 11:03:49,102:INFO:Folding reading from file folding_bupa-full.pickle
2021-12-06 11:03:49,103:INFO:do the samplings
2021-12-06 11:03:49,103:INFO:create sampling objects, random_state: 
2021-12-06 11:03:49,103:INFO:samplers: [<class 'smote_variants._smote_variants.SMOTE'>, <class 'smote_variants._smote_variants.SMOTE_IPF'>]
2021-12-06 11:03:49,104:INFO:[{'n_neighbors': 3, 'proportion': 0.1}, {'n_neighbors': 3, 'proportion': 0.25}, {'n_neighbors': 3, 'proportion': 0.5}, {'n_neighbors': 3, 'proportion': 0.75}, {'n_neighbors': 3, 'proportion': 1.0}, {'n_neighbors': 3, 'proportion': 1.5}, {'n_neighbors': 3, 'proportion': 2.0}, {'n_neighbors': 5, 'proportion': 0.1}, {'n_neighbors': 5, 'proportion': 0.25}, {'n_neighbors': 5, 'proportion': 0.5}, {'n_neighbors': 5, 'proportion': 0.75}, {'n_neighbors': 5, 'proportion': 1.0}, {'n_neighbor

["('bupa-full', OrderedDict([('classifier', DecisionTreeClassifier(random_state=2)), ('k', 3), ('n_folds', 9), ('n_neighbors', 3), ('p', 0.01), ('proportion', 2.0), ('random_state', None), ('voting', 'majority')]), 'KNeighborsClassifier', OrderedDict([('algorithm', 'auto'), ('leaf_size', 30), ('metric', 'minkowski'), ('metric_params', None), ('n_jobs', None), ('n_neighbors', 5), ('p', 2), ('weights', 'uniform')]))"]
["('bupa-full', OrderedDict([('classifier', DecisionTreeClassifier(random_state=2)), ('k', 3), ('n_folds', 9), ('n_neighbors', 7), ('p', 0.01), ('proportion', 2.0), ('random_state', None), ('voting', 'consensus')]), 'KNeighborsClassifier', OrderedDict([('algorithm', 'auto'), ('leaf_size', 30), ('metric', 'minkowski'), ('metric_params', None), ('n_jobs', None), ('n_neighbors', 5), ('p', 2), ('weights', 'uniform')]))"]
["('bupa-full', OrderedDict([('classifier', DecisionTreeClassifier(random_state=2)), ('k', 3), ('n_folds', 9), ('n_neighbors', 5), ('p', 0.01), ('proportion', 

2021-12-06 11:03:49,486:INFO:Evaluation mkl thread number set to 1 successfully
2021-12-06 11:03:49,488:INFO:Evaluation mkl thread number set to 1 successfully
2021-12-06 11:03:49,490:INFO:Evaluation mkl thread number set to 1 successfully
2021-12-06 11:03:49,493:INFO:Evaluation mkl thread number set to 1 successfully
2021-12-06 11:03:49,496:INFO:Evaluation mkl thread number set to 1 successfully
2021-12-06 11:03:49,499:INFO:Evaluation mkl thread number set to 1 successfully
2021-12-06 11:03:49,502:INFO:Evaluation mkl thread number set to 1 successfully
2021-12-06 11:03:49,504:INFO:Evaluation mkl thread number set to 1 successfully
2021-12-06 11:03:49,506:INFO:Evaluation mkl thread number set to 1 successfully
2021-12-06 11:03:49,509:INFO:Evaluation mkl thread number set to 1 successfully
2021-12-06 11:03:49,512:INFO:Evaluation mkl thread number set to 1 successfully
2021-12-06 11:03:49,515:INFO:Evaluation mkl thread number set to 1 successfully
2021-12-06 11:03:49,517:INFO:Evaluation 

In [25]:
# determining oversampler and classifier combination with highest AUC score

highest_auc_score= results['auc'].idxmax()
# print(highest_auc_score)

In [26]:
# The results are arranged in a pandas DataFrame with the following columns:
# db_name - name of the database
# classifier - name of the classifier
# sampler - name of the oversampling technique
# auc - highest auc score with the classifier and oversampler (aggregated over all classifier and oversampler
# parameter combinations)
# brier - highest brier score with the classifier and oversampler (aggregated similarly)
# acc - the highest accuracy score with the classifier and oversampler (aggregated similarly)
# f1 - the highest f1 score with the classifier and oversampler (aggregated similarly)
# p_top20 - the highest p_top20 score with the classifier and oversampler (aggregated similarly)
# gacc - the highest GACC score with the classifier and oversampler (aggregated similarly)
# runtime - average runtime in seconds
# db_size - size of the dataset
# db_n_attr - number of attributes in the dataset
# imbalanced_ratio - the ratio of majority/minority class sizes
# sampler_categories - the categories assigned to the oversampler
# classifier_parameters_auc - the classifier parameters reaching the highest auc score
# classifier_parameters_acc - the classifier parameters reaching the highest acc score
# classifier_parameters_gacc - the classifier parameters reaching the highest gacc score
# classifier_parameters_f1 - the classifier parameters reaching the highest f1 score
# classifier_parameters_p_top20 - the classifier parameters reaching the highest p_top20 score
# classifier_parameters_brier - the classifier parameters reaching the highest brier score
# sampler_parameters_auc - the oversampler parameters reaching the highest auc score
# sampler_parameters_acc - the oversampler parameters reaching the highest acc score
# sampler_parameters_gacc - the oversampler parameters reaching the highest gacc score
# sampler_parameters_f1 - the oversampler parameters reaching the highest f1 score
# sampler_parameters_p_top20 - the oversampler parameters reaching the highest p_top20 score
# sampler_parameters_brier - the oversampler parameters reaching the highest brier score

print(results.columns)

Index(['db_name', 'classifier', 'sampler', 'auc', 'auc_mean', 'auc_std',
       'brier', 'acc', 'f1', 'p_top20', 'gacc', 'runtime', 'db_size',
       'db_n_attr', 'imbalanced_ratio', 'sampler_categories',
       'classifier_parameters_auc', 'classifier_parameters_acc',
       'classifier_parameters_gacc', 'classifier_parameters_f1',
       'classifier_parameters_p_top20', 'classifier_parameters_brier',
       'sampler_parameters_auc', 'sampler_parameters_acc',
       'sampler_parameters_gacc', 'sampler_parameters_f1',
       'sampler_parameters_p_top20', 'sampler_parameters_brier'],
      dtype='object')


In [27]:
# The results can be processed according to the requirements of the analysis

print(results)

     db_name              classifier    sampler       auc  auc_mean   auc_std  \
0  bupa-full  CalibratedClassifierCV  SMOTE_IPF  0.715716  0.722184  0.072728   
1  bupa-full  DecisionTreeClassifier      SMOTE  0.658563  0.658563  0.049274   
2  bupa-full  DecisionTreeClassifier  SMOTE_IPF  0.657931  0.657931  0.045314   
3  bupa-full    KNeighborsClassifier      SMOTE  0.661523  0.661264  0.044360   
4  bupa-full    KNeighborsClassifier  SMOTE_IPF  0.667623  0.667443  0.053032   

      brier       acc        f1   p_top20  ...  \
0  0.211469  0.692754  0.631683  0.768116  ...   
1  0.334300  0.665700  0.606818  0.618357  ...   
2  0.334300  0.665700  0.605023  0.599034  ...   
3  0.246647  0.597101  0.598848  0.661836  ...   
4  0.245797  0.604831  0.600768  0.666667  ...   

                          classifier_parameters_gacc  \
0  {'base_estimator__C': 1.0, 'base_estimator__cl...   
1  {'ccp_alpha': 0.0, 'class_weight': None, 'crit...   
2  {'ccp_alpha': 0.0, 'class_weight': None, 

In [22]:
# querying classifier and oversampler parameters with highest AUC score

cl, cl_par, samp, samp_par= results.loc[highest_auc_score][['classifier',
                                                           'classifier_parameters_auc',
                                                           'sampler',
                                                           'sampler_parameters_auc']]

In [23]:
# instantiating oversampler and classifier objects providing the highest AUC score

samp_obj= getattr(sv, samp)(**eval(samp_par))
cl_obj= eval(cl)(**eval(cl_par))

NameError: name 'CalibratedClassifierCV' is not defined

In [None]:
# oversampling the entire dataset and fitting a classifier

X_samp, y_samp= samp_obj.sample(dataset['data'], dataset['target'])
cl_obj.fit(X_samp, y_samp)