In [None]:
# import the imbalanced-learn library
# install it with ! pip install imbalanced-learn
import imblearn

In [2]:
# import the scikit-learn library
# install it with ! pip install scikit-learn
import sklearn
from sklearn.datasets import make_classification
from sklearn.model_selection import StratifiedKFold, cross_validate

In [None]:
# import other necessary libraries
import numpy as np
import pandas as pd
from collections import Counter
import seed_utils

In [4]:
# function to append the averages and
# standard deviations of scores,
# row 11 are the averages and
# row 12 are the stds
def append_avgs(dict):
    avgs = []
    stds = []
    for i in dict:
        avg = np.average(dict[i])
        std = np.std(dict[i])
        avgs.append(avg)
        stds.append(std)
    df = pd.DataFrame(dict)
    df.loc[11] = avgs
    df.loc[12] = stds
    return df

In [5]:
# create an imbalaced toy dataset with 10000 samples and 10 features
# class 0: 7500 samples, class 1: 2500 samples 
X, y = make_classification(n_samples=10000, weights=[0.75], n_features = 10,
                           flip_y=0, random_state=42)

# define the Decision Tree Classifier model
model = sklearn.tree.DecisionTreeClassifier()

# define crossvalidation strategy
# this will perform a stratified 10-fold crossvalidation
# parameter n_splits defines the number of folds
cv = StratifiedKFold(n_splits=10)

# define the scoring dictionary to evaluate the crossvalidation
scoring = {'f1': 'f1', 'precision': 'precision', 'accuracy': 'accuracy',
           'recall': 'recall', 'roc_auc': 'roc_auc'}

# Cross-validation without sampling

In [7]:
# crossvalidate on original data set
scores = cross_validate(model, X, y, scoring=scoring, cv=cv, n_jobs=-1)

**Row 11 contains average scores and 12 contains the standard deviation**

In [8]:
# use previously defined function "append_avgs" to append averages and stds
scores = append_avgs(scores)
scores

Unnamed: 0,fit_time,score_time,test_f1,test_precision,test_accuracy,test_recall,test_roc_auc
0,0.108466,0.007195,0.820408,0.8375,0.912,0.804,0.876
1,0.103974,0.005416,0.827586,0.839506,0.915,0.816,0.882
2,0.102305,0.005039,0.831068,0.807547,0.913,0.856,0.894
3,0.095506,0.005006,0.843327,0.816479,0.919,0.872,0.903333
4,0.088152,0.005148,0.823062,0.818182,0.911,0.828,0.883333
5,0.094373,0.004866,0.827586,0.839506,0.915,0.816,0.882
6,0.089898,0.005041,0.84739,0.850806,0.924,0.844,0.897333
7,0.085832,0.00491,0.837573,0.819923,0.917,0.856,0.896667
8,0.07482,0.004948,0.808679,0.797665,0.903,0.82,0.875333
9,0.081238,0.004822,0.835759,0.87013,0.921,0.804,0.882


# Cross-validation with sampling

**We will use _SMOTE_ and _Random Undersampling_ as sampling techniques**

In [9]:
# define SMOTE to oversample minority class to contain
# half the amount of samples in the majority class
# by setting sampling_strategy to 0.5
smote = imblearn.over_sampling.SMOTE(sampling_strategy = 0.5)

# define RandomUnderSampler to undersample majority class
# to contain twice the amount of samples in the minority class
# by setting sampling_strategy to 0.5
under = imblearn.under_sampling.RandomUnderSampler(sampling_strategy = 0.5)

## Cross validation with sampling -- DONE WRONG

### Cross-validation with oversampling -- DONE WRONG

In [10]:
# oversample minority class
X_smote, y_smote = smote.fit_resample(X, y)

In [11]:
print("Distribution in original data set", Counter(y))
print("Distribution in data set after SMOTE", Counter(y_smote))

Distribution in original data set Counter({0: 7500, 1: 2500})
Distribution in data set after SMOTE Counter({0: 7500, 1: 3750})


In [12]:
# crossvalidate on oversampled data set using the crossvalidation technique defined in "cv"
wrong_scores_o = cross_validate(model, X_smote, y_smote, scoring=scoring, cv=cv, n_jobs=-1)

In [13]:
# use previously defined function "append_avgs" to append averages and stds
wrong_scores_o = append_avgs(wrong_scores_o)
wrong_scores_o

Unnamed: 0,fit_time,score_time,test_f1,test_precision,test_accuracy,test_recall,test_roc_auc
0,0.089884,0.004451,0.86443,0.87027,0.910222,0.858667,0.897333
1,0.124148,0.007586,0.873656,0.880759,0.916444,0.866667,0.904
2,0.132073,0.006117,0.889182,0.879896,0.925333,0.898667,0.918667
3,0.138672,0.0078,0.861789,0.876033,0.909333,0.848,0.894
4,0.080333,0.004428,0.880319,0.877984,0.92,0.882667,0.910667
5,0.081005,0.005078,0.884097,0.893733,0.923556,0.874667,0.911333
6,0.087266,0.005098,0.885333,0.885333,0.923556,0.885333,0.914
7,0.135069,0.006123,0.90815,0.88191,0.936889,0.936,0.936667
8,0.128185,0.005452,0.905318,0.881313,0.935111,0.930667,0.934
9,0.128115,0.005909,0.923684,0.911688,0.948444,0.936,0.945333


### Cross-validation with undersampling -- DONE WRONG

In [14]:
# undersample majority class
X_under, y_under = under.fit_resample(X, y)

In [15]:
print("Distribution in original data set", Counter(y))
print("Distribution in data set after undersampling", Counter(y_under))

Distribution in original data set Counter({0: 7500, 1: 2500})
Distribution in data set after undersampling Counter({0: 5000, 1: 2500})


In [16]:
# crossvalidate on undersampled data set using the crossvalidation technique defined in "cv"
wrong_scores_u = cross_validate(model, X_under, y_under, scoring=scoring, cv=cv, n_jobs=-1)

In [17]:
# use previously defined function "append_avgs" to append averages and stds
wrong_scores_u = append_avgs(wrong_scores_u)
wrong_scores_u

Unnamed: 0,fit_time,score_time,test_f1,test_precision,test_accuracy,test_recall,test_roc_auc
0,0.069667,0.00661,0.884848,0.893878,0.924,0.876,0.912
1,0.072461,0.007538,0.869048,0.862205,0.912,0.876,0.903
2,0.071862,0.007442,0.880952,0.874016,0.92,0.888,0.912
3,0.073257,0.007837,0.884462,0.880952,0.922667,0.888,0.914
4,0.072965,0.006358,0.86747,0.870968,0.912,0.864,0.9
5,0.074656,0.005146,0.851927,0.864198,0.902667,0.84,0.887
6,0.074196,0.004138,0.902584,0.897233,0.934667,0.908,0.928
7,0.073489,0.004388,0.863544,0.879668,0.910667,0.848,0.895
8,0.059434,0.004732,0.874751,0.869565,0.916,0.88,0.907
9,0.07663,0.004319,0.843373,0.846774,0.896,0.84,0.882


## Cross validation with sampling -- DONE RIGHT

### Cross-validation with oversampling -- DONE RIGHT

In [18]:
# define the SMOTE oversampling pipeline
smote_steps = [('over', smote), ('model', model)]
smote_pipeline = imblearn.pipeline.Pipeline(steps=smote_steps)

This Pipeline first oversamples the training dataset with SMOTE then fits the model.

In [19]:
# show the SMOTE pipeline
print(smote_pipeline)

Pipeline(steps=[('over', SMOTE(sampling_strategy=0.5)),
                ('model', DecisionTreeClassifier())])


In [20]:
# evaluate the SMOTE pipeline using the crossvalidation technique defined in cv
smote_scores = cross_validate(smote_pipeline, X, y, scoring=scoring, cv=cv, n_jobs=-1)

In [21]:
# use previously defined function "append_avgs" to append averages and stds
smote_scores = append_avgs(smote_scores)
smote_scores

Unnamed: 0,fit_time,score_time,test_f1,test_precision,test_accuracy,test_recall,test_roc_auc
0,0.194384,0.005174,0.847059,0.830769,0.922,0.864,0.902667
1,0.187998,0.004536,0.85214,0.829545,0.924,0.876,0.908
2,0.152889,0.00418,0.842912,0.808824,0.918,0.88,0.905333
3,0.206151,0.004577,0.849802,0.839844,0.924,0.86,0.902667
4,0.195976,0.005112,0.815686,0.8,0.906,0.832,0.881333
5,0.204993,0.005073,0.839357,0.842742,0.92,0.836,0.892
6,0.191301,0.004694,0.828125,0.80916,0.912,0.848,0.890667
7,0.189942,0.004753,0.82,0.82,0.91,0.82,0.88
8,0.1926,0.004889,0.827184,0.803774,0.911,0.852,0.891333
9,0.185301,0.004841,0.862823,0.857708,0.931,0.868,0.91


### Cross-validation with undersampling -- DONE RIGHT

In [22]:
# define RandomUnderSampler undersampling pipeline
under_steps = [('under', under), ('model', model)]
under_pipeline = imblearn.pipeline.Pipeline(steps=under_steps)

This Pipeline first undersamples the training dataset with RandomUnderSampler then fits the model.

In [23]:
# show outher RandomUnderSampler pipeline
print(under_pipeline)

Pipeline(steps=[('under', RandomUnderSampler(sampling_strategy=0.5)),
                ('model', DecisionTreeClassifier())])


In [24]:
# evaluate the RandomUnderSampler pipeline using the crossvalidation technique defined in cv
under_scores = cross_validate(under_pipeline, X, y, scoring=scoring, cv=cv, n_jobs=-1)

In [25]:
# use previously defined function "append_avgs" to append averages and stds
under_scores = append_avgs(under_scores)
under_scores

Unnamed: 0,fit_time,score_time,test_f1,test_precision,test_accuracy,test_recall,test_roc_auc
0,0.080154,0.005063,0.861538,0.82963,0.928,0.896,0.917333
1,0.084976,0.008919,0.844961,0.819549,0.92,0.872,0.904
2,0.080933,0.008015,0.828794,0.806818,0.912,0.852,0.892
3,0.08434,0.007915,0.833006,0.818533,0.915,0.848,0.892667
4,0.079821,0.007884,0.816,0.816,0.908,0.816,0.877333
5,0.071391,0.004895,0.823529,0.807692,0.91,0.84,0.886667
6,0.083737,0.007761,0.857692,0.825926,0.926,0.892,0.914667
7,0.06867,0.004894,0.833652,0.798535,0.913,0.872,0.899333
8,0.065845,0.004958,0.852336,0.8,0.921,0.912,0.918
9,0.083408,0.005175,0.836292,0.824903,0.917,0.848,0.894


# Compare scores

In [26]:
# create an empty dataframe to compare the scores
df = pd.DataFrame()

# add the average scores of each sampling strategy
# to the empty dataframe
df = df.append([scores.loc[11], wrong_scores_o.loc[11], wrong_scores_u.loc[11],
           smote_scores.loc[11], under_scores.loc[11]]).reset_index(drop=True)

# rename the indices
df = df.rename(index={0:'NO SAMPLING', 1:'WRONG SMOTE', 2:'WRONG UNDERSAMPLING',
                 3: 'RIGHT SMOTE', 4:'RIGHT UNDERSAMPLING'})

df

Unnamed: 0,fit_time,score_time,test_f1,test_precision,test_accuracy,test_recall,test_roc_auc
NO SAMPLING,0.092456,0.005239,0.830244,0.829725,0.915,0.8316,0.8872
WRONG SMOTE,0.112475,0.005804,0.887596,0.883892,0.924889,0.891733,0.9166
WRONG UNDERSAMPLING,0.071862,0.005851,0.872296,0.873946,0.915067,0.8708,0.904
RIGHT SMOTE,0.190154,0.004783,0.838509,0.824237,0.9178,0.8536,0.8964
RIGHT UNDERSAMPLING,0.078327,0.006548,0.83878,0.814759,0.917,0.8648,0.8996
