In [53]:
import numpy as np
import pandas as pd 


# Seed NumPy's global pseudo-random number generator (PRNG) so that every
# np.random draw made after this point is reproducible from a fresh kernel.
np.random.seed(123)

sample_data = 1000    # total number of rows to generate
class_0_ratio = 0.9   # fraction of those rows that belong to class 0
# Round before converting: a bare int() truncates, so a product that lands
# just below a whole number because of floating-point error
# (e.g. 100 * 0.57 -> 56.99999999999999) would come out one sample short.
class_0_data = int(round(sample_data * class_0_ratio))  # class-0 row count



In [55]:
# Every sample that is not assigned to class 0 belongs to class 1.
class_1_data = sample_data - class_0_data # number of class-1 samples
class_1_data 

100

In [56]:
# --- Class zero (target 0): both features drawn from N(0, 1) ---
# The draw order is part of the result because all four calls share the
# seeded global generator: feature1 before feature2, class zero before one.
feature1_class_zero = np.random.normal(0, 1, class_0_data)
feature2_class_zero = np.random.normal(0, 1, class_0_data)

# --- Class one (target 1): both features drawn from N(1, 1) ---
feature1_class_one = np.random.normal(1, 1, class_1_data)
feature2_class_one = np.random.normal(1, 1, class_1_data)

# One frame per class; the scalar target fills the whole column.
class_zero_df = pd.DataFrame(
    dict(feature1=feature1_class_zero, feature2=feature2_class_zero, target=0)
)
class_one_df = pd.DataFrame(
    dict(feature1=feature1_class_one, feature2=feature2_class_one, target=1)
)

# Stack class zero on top of class one and renumber the rows from 0.
df = pd.concat((class_zero_df, class_one_df), ignore_index=True)

# Show the first and the last rows of the combined frame.
print(df.head(), df.tail(), sep="\n")

   feature1  feature2  target
0 -1.085631  0.551302       0
1  0.997345  0.419589       0
2  0.282978  1.815652       0
3 -1.506295 -0.252750       0
4 -0.578600 -0.292004       0
     feature1  feature2  target
995  0.376371  1.845701       1
996  1.239810 -0.119923       1
997  0.131760  0.640703       1
998  1.902006 -0.609695       1
999  1.697490  1.013570       1


In [57]:
# Count the rows in each class to see how imbalanced the data is
df['target'].value_counts()

target
0    900
1    100
Name: count, dtype: int64

In [58]:
# Upsampling is one of the main ways to balance a dataset. First separate
# the two classes so the smaller one can be resampled on its own.
df_major = df.loc[df['target'].eq(0)]  # rows of the larger class (target 0)
df_minor = df.loc[df['target'].eq(1)]  # rows of the smaller class (target 1)


In [59]:
from sklearn.utils import resample

# Upsample: draw rows from the minority class *with replacement* until it
# has as many rows as the majority class.
df_minor_upsampled = resample(
    df_minor,
    n_samples=df_major.shape[0],  # target size = number of majority rows
    replace=True,                 # a row may be drawn more than once
    random_state=42,              # fixed seed so the draw is repeatable
)

# Sanity check: the minority class should now have as many rows as the
# majority class, all of them labelled 1.
df_minor_upsampled['target'].value_counts()

target
1    900
Name: count, dtype: int64

In [None]:
# Stack the untouched majority rows on top of the upsampled minority rows.
# NOTE(review): ignore_index is not set here, so the incoming row labels are
# kept; rows drawn with replacement presumably repeat labels — confirm that
# is acceptable before doing any label-based lookups on this frame.
df_upsampled = pd.concat([df_major,df_minor_upsampled])
# Count the rows per class in the combined frame
df_upsampled['target'].value_counts()


target
0    900
1    900
Name: count, dtype: int64

In [62]:
# Downsampling, step 1: split the dataset into its two classes.
#
# This reuses the very same `df` that the upsampling section worked on, so
# both balancing strategies are demonstrated on identical data. The feature
# arrays and per-class frames are deliberately NOT regenerated here: drawing
# again from np.random would advance the seeded generator and yield different
# values, and the split below only ever reads `df` anyway.
df_majorup = df[df['target'] == 0]  # rows of the larger class (target 0)
df_minorup = df[df['target'] == 1]  # rows of the smaller class (target 1)

# Size of the minority class, i.e. how many majority rows downsampling keeps
df_minorup['target'].value_counts()

target
1    100
Name: count, dtype: int64

In [None]:
# Downsample on the same data: shrink the majority class to the size of the
# minority class by keeping only a random subset of its rows.
from sklearn.utils import resample

df_major_downsampled = resample(
    df_major,
    n_samples=df_minor.shape[0],  # keep as many rows as the minority class has
    replace=False,                # without replacement: no row is picked twice
    random_state=42,              # fixed seed so the draw is repeatable
)

# Sanity check: the majority class is now reduced to the minority-class size
df_major_downsampled['target'].value_counts()

target
0    100
Name: count, dtype: int64