In [4]:
import numpy as np
import pandas as pd

## Up Sampling and Down Sampling

When working with imbalanced datasets, where one class is significantly more frequent than another, machine learning models can become biased towards the majority class. To address this, two common techniques are used: **up sampling** and **down sampling**.

### Up Sampling

Up sampling involves increasing the number of samples in the minority class by replicating existing samples or generating new synthetic samples. This helps balance the class distribution and allows the model to learn equally from both classes. In this notebook, up sampling is performed by randomly resampling the minority class with replacement until it matches the size of the majority class.

### Down Sampling

Down sampling reduces the number of samples in the majority class by randomly selecting a subset of its data. This also balances the class distribution but at the cost of potentially discarding useful information from the majority class. In this notebook, down sampling is performed by randomly resampling the majority class to match the size of the minority class.

Both techniques aim to improve the performance of models trained on imbalanced datasets by providing a more balanced representation of each class.


In [3]:
np.random.seed(123)

n_samples = 1000
class_0_ratio = 0.9
n_class_0 = int(n_samples * class_0_ratio)
n_class_1 = n_samples - n_class_0

n_class_0, n_class_1

(900, 100)

In [8]:
class_0 = pd.DataFrame(
    {
        "feature_1": np.random.normal(loc=0, scale=1, size=n_class_0),
        "feature_2": np.random.normal(loc=0, scale=1, size=n_class_0),
        "target": [0] * n_class_0,
    }
)

class_1 = pd.DataFrame(
    {
        "feature_1": np.random.normal(loc=2, scale=1, size=n_class_1),
        "feature_2": np.random.normal(loc=2, scale=1, size=n_class_1),
        "target": [1] * n_class_1,
    }
)

In [15]:
df = pd.concat([class_0, class_1], ignore_index=True)

In [17]:
df["target"].value_counts()

target
0    900
1    100
Name: count, dtype: int64

## Up sampling


In [None]:
df_minority = df[df["target"] == 1]
df_majority = df[df["target"] == 0]

In [None]:
from sklearn.utils import resample

df_minority_upsampled = resample(
    df_minority,
    replace=True,  # sample with replacement)
    n_samples=len(df_majority),
    random_state=123,
)  # reproducible results

In [24]:
df_minority_upsampled

Unnamed: 0,feature_1,feature_2,target
966,0.671933,1.370988,1
992,2.196570,1.397425,1
998,2.902006,0.390305,1
917,2.197269,4.216788,1
983,2.844335,2.015572,1
...,...,...,...
912,2.834499,2.773754,1
983,2.844335,2.015572,1
917,2.197269,4.216788,1
950,2.399896,-0.840847,1


In [26]:
df_upsampled = pd.concat([df_majority, df_minority_upsampled], ignore_index=True)

In [27]:
df_upsampled

Unnamed: 0,feature_1,feature_2,target
0,-1.085631,0.551302,0
1,0.997345,0.419589,0
2,0.282978,1.815652,0
3,-1.506295,-0.252750,0
4,-0.578600,-0.292004,0
...,...,...,...
1795,2.834499,2.773754,1
1796,2.844335,2.015572,1
1797,2.197269,4.216788,1
1798,2.399896,-0.840847,1


## Down sampling


In [None]:
class_0 = pd.DataFrame(
    {
        "feature_1": np.random.normal(loc=0, scale=1, size=n_class_0),
        "feature_2": np.random.normal(loc=0, scale=1, size=n_class_0),
        "target": [0] * n_class_0,
    }
)

class_1 = pd.DataFrame(
    {
        "feature_1": np.random.normal(loc=2, scale=1, size=n_class_1),
        "feature_2": np.random.normal(loc=2, scale=1, size=n_class_1),
        "target": [1] * n_class_1,
    }
)

In [29]:
df_minority = df[df["target"] == 1]
df_majority = df[df["target"] == 0]

In [30]:
df_majority_downsampled = resample(
    df_majority,
    replace=True,  # sample with replacement)
    n_samples=len(df_minority),
    random_state=123,
)  # reproducible results

In [32]:
df_majority_downsampled

Unnamed: 0,feature_1,feature_2,target
510,0.416124,0.726136,0
365,0.059291,0.494234,0
382,-1.305786,-0.366811,0
322,-0.375750,0.220715,0
98,0.379401,0.470264,0
...,...,...,...
51,-1.038788,-0.778272,0
885,-0.893196,0.651913,0
745,1.308518,-1.287595,0
304,-0.144013,-0.616971,0


In [37]:
df_downsampled = pd.concat([df_minority, df_majority_downsampled], ignore_index=True)

df_downsampled

Unnamed: 0,feature_1,feature_2,target
0,1.699768,2.139033,1
1,1.367739,2.025577,1
2,1.795683,1.803557,1
3,2.213696,3.312255,1
4,3.033878,3.187417,1
...,...,...,...
195,-1.038788,-0.778272,0
196,-0.893196,0.651913,0
197,1.308518,-1.287595,0
198,-0.144013,-0.616971,0


In [38]:
df_downsampled.target.value_counts()

target
1    100
0    100
Name: count, dtype: int64