In [None]:
import pandas as pd
import seaborn as sns

# SMOTE for Imbalanced Classification
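SMOTE (Synthetic Minority Over-sampling Technique) rebalances a dataset by creating new, synthetic minority-class samples, interpolated between existing minority samples and their nearest neighbours. <br>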
https://imbalanced-learn.org/stable/introduction.html

In [None]:
from collections import Counter
from sklearn.datasets import make_classification

from numpy import where

# Synthetic two-feature binary problem with a heavy class imbalance:
# weights=[0.99] assigns about 99% of the samples to the first class.
# flip_y=0 turns off random label flipping; random_state pins the draw.
X, y = make_classification(
    n_samples=10000,
    n_features=2,
    n_redundant=0,
    n_clusters_per_class=1,
    weights=[0.99],
    flip_y=0,
    random_state=1,
)

# Tally samples per class; the bare expression displays the counts.
counter = Counter(y)
counter

In [None]:
from matplotlib import pyplot

# One scatter series per class so each label gets its own legend entry.
for class_label in counter:
    members = X[y == class_label]
    pyplot.scatter(members[:, 0], members[:, 1], label=str(class_label))
pyplot.legend()
pyplot.show()

In [None]:
from imblearn.over_sampling import SMOTE

# Oversample the minority class with SMOTE until it reaches 30% of the
# majority-class count (a float sampling_strategy is the desired
# minority/majority ratio after resampling).
# random_state is pinned so the synthetic samples -- and therefore the counts
# and plot below -- are reproducible on Restart & Run All.
oversample = SMOTE(sampling_strategy=0.3, random_state=1)
X_resampled, y_resampled = oversample.fit_resample(X, y)

# Class counts after oversampling; the bare expression displays them.
counter = Counter(y_resampled)
counter


In [None]:
# One scatter series per class of the SMOTE-resampled data.
for class_label in counter:
    members = X_resampled[y_resampled == class_label]
    pyplot.scatter(members[:, 0], members[:, 1], label=str(class_label))
pyplot.legend()
pyplot.show()

In [None]:
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline


# Two-step resampling:
#   1. SMOTE raises the minority class to 30% of the majority-class count.
#   2. Random undersampling then trims the majority class until the
#      minority/majority ratio is 0.5.
# Both samplers are stochastic, so each gets a fixed random_state; without it
# the resampled data (and any scores computed with these samplers) change on
# every run.
over = SMOTE(sampling_strategy=0.3, random_state=1)
under = RandomUnderSampler(sampling_strategy=0.5, random_state=1)

pipeline = Pipeline(steps=[('over', over), ('under', under)])

X_ou, y_ou = pipeline.fit_resample(X, y)

# Class counts after over- then undersampling; the bare expression displays them.
counter = Counter(y_ou)

counter

In [None]:
# One scatter series per class of the over- then undersampled data.
for class_label in counter:
    members = X_ou[y_ou == class_label]
    pyplot.scatter(members[:, 0], members[:, 1], label=str(class_label))
pyplot.legend()
pyplot.show()

In [None]:
from numpy import mean
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier 
from sklearn.model_selection import StratifiedKFold, cross_val_score

# Classifier under test; swap in DecisionTreeClassifier() to compare a tree model.
model = LogisticRegression()

# 10-fold stratified cross-validation, shared by all three evaluations.
cv = StratifiedKFold(n_splits=10)

# The samplers are pipeline steps rather than a preprocessing pass over the
# whole dataset: an imblearn Pipeline applies samplers during fit only, so the
# held-out fold of each split is scored on its original, unresampled data.
over_pipeline = Pipeline(steps=[('over', over), ('model', model)])
over_under_pipeline = Pipeline(steps=[('over', over), ('under', under), ('model', model)])

# `pipeline` stays bound to the full pipeline for any cell that references it.
pipeline = over_under_pipeline

original_scores = cross_val_score(model, X, y, scoring='roc_auc', cv=cv, n_jobs=-1)
print('Mean ROC AUC for original data: %.3f' % mean(original_scores))

over_scores = cross_val_score(over_pipeline, X, y, scoring='roc_auc', cv=cv, n_jobs=-1)
print('Mean ROC AUC for oversampled data: %.3f' % mean(over_scores))

# Score the over+under pipeline here, not over_pipeline again; otherwise this
# number merely repeats the oversample-only evaluation.
over_under_scores = cross_val_score(over_under_pipeline, X, y, scoring='roc_auc', cv=cv, n_jobs=-1)
print('Mean ROC AUC for over- and undersampled data: %.3f' % mean(over_under_scores))
