# Random Oversampling Imbalanced Datasets

In [3]:
# Check that imbalanced-learn is importable and print its installed version.

import imblearn

print(imblearn.__version__)

0.9.1


In [6]:
from imblearn.over_sampling import RandomOverSampler

In [None]:
# Oversampling strategy 1: resample only the minority class.
oversample = RandomOverSampler(sampling_strategy='minority')

In [8]:
# Oversampling strategy 2: a float sets the desired minority/majority ratio
# after resampling (0.5 -> the minority ends up half the size of the majority).
oversample = RandomOverSampler(sampling_strategy=0.5)

In [7]:
# Show the class documentation, including the accepted sampling_strategy values.
help(RandomOverSampler)

Help on class RandomOverSampler in module imblearn.over_sampling._random_over_sampler:

class RandomOverSampler(imblearn.over_sampling.base.BaseOverSampler)
 |  RandomOverSampler(*, sampling_strategy='auto', random_state=None, shrinkage=None)
 |  
 |  Class to perform random over-sampling.
 |  
 |  Object to over-sample the minority class(es) by picking samples at random
 |  with replacement. The bootstrap can be generated in a smoothed manner.
 |  
 |  Read more in the :ref:`User Guide <random_over_sampler>`.
 |  
 |  Parameters
 |  ----------
 |  sampling_strategy : float, str, dict or callable, default='auto'
 |      Sampling information to resample the data set.
 |  
 |      - When ``float``, it corresponds to the desired ratio of the number of
 |        samples in the minority class over the number of samples in the
 |        majority class after resampling. Therefore, the ratio is expressed as
 |        :math:`\alpha_{os} = N_{rm} / N_{M}` where :math:`N_{rm}` is the
 |        nu

In [10]:
from collections import Counter
from sklearn.datasets import make_classification
from imblearn.over_sampling import RandomOverSampler

In [11]:
# Define a synthetic, heavily imbalanced binary dataset (features X, labels y).
X, y = make_classification(
    n_samples=10000,
    weights=[0.99],
    flip_y=0,
)

In [12]:
# Summarize the class distribution: count how many samples carry each label.
print(Counter(y))

Counter({0: 9900, 1: 100})


In [13]:
# Define the oversampling strategy: resample only the minority class.
oversample = RandomOverSampler(sampling_strategy='minority')

In [14]:
# Fit the sampler on (X, y) and return the resampled features and labels.
X_over, y_over = oversample.fit_resample(X, y)

In [15]:
# Summarize the class distribution after oversampling.
print(Counter(y_over))

Counter({0: 9900, 1: 9900})


## Pipeline with oversampling

Use this when evaluating with k-fold cross-validation: the oversampling must be applied only to the training folds, never to the validation fold. Putting the sampler inside the pipeline ensures the validation set is not oversampled, so duplicated samples from the validation data never end up in the training set.

In [None]:
# Evaluate a decision tree combined with random oversampling, using
# repeated stratified k-fold cross-validation.
from numpy import mean
from sklearn.datasets import make_classification
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier
from imblearn.over_sampling import RandomOverSampler
from imblearn.pipeline import Pipeline

# Synthetic, heavily imbalanced binary dataset.
X, y = make_classification(n_samples=10000, weights=[0.99], flip_y=0)

# The sampler is a pipeline step so that resampling happens as part of
# fitting the pipeline on each cross-validation split.
steps = [
    ('over', RandomOverSampler()),
    ('model', DecisionTreeClassifier()),
]
pipeline = Pipeline(steps=steps)

# 10 folds repeated 3 times -> 30 scores, averaged into one number.
# NOTE(review): 'f1_micro' on a binary problem is presumably identical to
# accuracy, which looks optimistic on imbalanced data -- confirm that this
# is the intended metric.
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
scores = cross_val_score(pipeline, X, y, scoring='f1_micro', cv=cv, n_jobs=-1)
score = mean(scores)
print('F1 Score: %.3f' % score)

# Random Undersampling Imbalanced Datasets

In [None]:
# Undersampling strategy 1: resample only the majority class.
# The import lives in this cell because the notebook's first import of
# RandomUnderSampler comes in a later cell; without it a top-to-bottom run
# fails here with NameError.
from imblearn.under_sampling import RandomUnderSampler

undersample = RandomUnderSampler(sampling_strategy='majority')

In [None]:
# Undersampling strategy 2: a float sets the desired minority/majority ratio
# after resampling (0.5 -> the majority is cut to twice the minority size).
# The import lives in this cell because the notebook's first import of
# RandomUnderSampler comes in a later cell; without it a top-to-bottom run
# fails here with NameError.
from imblearn.under_sampling import RandomUnderSampler

undersample = RandomUnderSampler(sampling_strategy=0.5)

In [18]:
# Example of random undersampling to balance the class distribution
from collections import Counter
from sklearn.datasets import make_classification
from imblearn.under_sampling import RandomUnderSampler

In [19]:
# Define a synthetic, heavily imbalanced binary dataset (features X, labels y).
X, y = make_classification(
    n_samples=10000,
    weights=[0.99],
    flip_y=0,
)

In [20]:
# Summarize the class distribution: count how many samples carry each label.
print(Counter(y))

Counter({0: 9900, 1: 100})


In [21]:
# Define the undersampling strategy: resample only the majority class.
undersample = RandomUnderSampler(sampling_strategy='majority')
# Fit and apply the transform. The results are named *_under (not *_over) so
# they describe the data correctly and do not overwrite the arrays produced
# in the oversampling section.
X_under, y_under = undersample.fit_resample(X, y)
# Summarize the class distribution after undersampling.
print(Counter(y_under))

Counter({0: 100, 1: 100})


## Pipeline with undersampling

In [None]:
# Evaluate a decision tree combined with random undersampling, using
# repeated stratified k-fold cross-validation.
from numpy import mean
from sklearn.datasets import make_classification
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier
from imblearn.pipeline import Pipeline
from imblearn.under_sampling import RandomUnderSampler

# Synthetic, heavily imbalanced binary dataset.
X, y = make_classification(n_samples=10000, weights=[0.99], flip_y=0)

# The sampler is a pipeline step so that resampling happens as part of
# fitting the pipeline on each cross-validation split.
steps = [
    ('under', RandomUnderSampler()),
    ('model', DecisionTreeClassifier()),
]
pipeline = Pipeline(steps=steps)

# 10 folds repeated 3 times -> 30 scores, averaged into one number.
# NOTE(review): 'f1_micro' on a binary problem is presumably identical to
# accuracy, which looks optimistic on imbalanced data -- confirm that this
# is the intended metric.
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
scores = cross_val_score(pipeline, X, y, scoring='f1_micro', cv=cv, n_jobs=-1)
score = mean(scores)
print('F1 Score: %.3f' % score)

# Combining Random Oversampling and Undersampling

In [22]:
# Combine random oversampling and undersampling on an imbalanced dataset,
# printing the class distribution before and after each stage.
from collections import Counter
from sklearn.datasets import make_classification
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler

# Synthetic, heavily imbalanced binary dataset.
X, y = make_classification(n_samples=10000, weights=[0.99], flip_y=0)
print(Counter(y))

# Stage 1 raises the minority class to 10% of the majority size;
# stage 2 then trims the majority class until the ratio reaches 0.5.
over = RandomOverSampler(sampling_strategy=0.1)
under = RandomUnderSampler(sampling_strategy=0.5)

# Apply the stages in order, feeding each stage the previous stage's output
# and summarizing the class distribution after every transform.
for sampler in (over, under):
    X, y = sampler.fit_resample(X, y)
    print(Counter(y))

Counter({0: 9900, 1: 100})
Counter({0: 9900, 1: 990})
Counter({0: 1980, 1: 990})


## Pipeline with combination of undersampling and oversampling

In [None]:
# Evaluate a decision tree combined with random oversampling followed by
# random undersampling, using repeated stratified k-fold cross-validation.
from numpy import mean
from sklearn.datasets import make_classification
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier
from imblearn.over_sampling import RandomOverSampler
from imblearn.pipeline import Pipeline
from imblearn.under_sampling import RandomUnderSampler

# Synthetic, heavily imbalanced binary dataset.
X, y = make_classification(n_samples=10000, weights=[0.99], flip_y=0)

# Oversample the minority to a 0.1 ratio, then undersample the majority to a
# 0.5 ratio, then fit the model -- all as steps of one pipeline.
over = RandomOverSampler(sampling_strategy=0.1)
under = RandomUnderSampler(sampling_strategy=0.5)
steps = [
    ('o', over),
    ('u', under),
    ('m', DecisionTreeClassifier()),
]
pipeline = Pipeline(steps=steps)

# 10 folds repeated 3 times -> 30 scores, averaged into one number.
# NOTE(review): 'f1_micro' on a binary problem is presumably identical to
# accuracy, which looks optimistic on imbalanced data -- confirm that this
# is the intended metric.
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
scores = cross_val_score(pipeline, X, y, scoring='f1_micro', cv=cv, n_jobs=-1)
score = mean(scores)
print('F1 Score: %.3f' % score)