### Upsampling: 
#### Upsampling is used to handle imbalanced datasets: we increase the number of minority-class data points (by resampling them with replacement) until the classes are balanced.

In [1]:
import numpy as np
import pandas as pd

# Seed NumPy's global RNG so the np.random.normal draws in the cells below
# produce the same synthetic data on every Restart & Run All.
np.random.seed(121)

In [2]:
# Dataset configuration: total number of rows and the fraction that belongs
# to class 0 (the majority class).
n_sample = 1000
class_0_ratio = 0.9

# round() rather than int(): int() truncates, so a product that lands just
# below a whole number because of float error (e.g. 100 * 0.57 evaluates to
# 56.99999999999999) would silently lose one majority sample.
n_class_0 = round(n_sample * class_0_ratio)
# Whatever is not class 0 is class 1, so the two counts always sum to n_sample.
n_class_1 = n_sample - n_class_0

In [None]:
# Majority class (target = 0): n_class_0 rows with two features.
# np.random.normal(loc, scale, size) draws from a normal (Gaussian)
# distribution where loc is the mean and scale is the standard deviation.
# feature_1 is drawn before feature_2, so the seeded random stream is
# consumed in a fixed order.
class_0 = pd.DataFrame(
    {
        'feature_1': np.random.normal(loc=0, scale=1, size=n_class_0),
        'feature_2': np.random.normal(loc=0, scale=1, size=n_class_0),
        'target': [0] * n_class_0,
    }
)

class_0

Unnamed: 0,feature_1,feature_2,target
0,-0.212033,-0.206849,0
1,-0.284929,-1.390810,0
2,-0.573898,-0.311531,0
3,-0.440310,0.815084,0
4,-0.330111,-1.984009,0
...,...,...,...
895,-0.930128,0.195092,0
896,-1.141839,-1.215697,0
897,-0.471065,-1.198864,0
898,0.281270,-0.827062,0


In [None]:
# Minority class (target = 1): n_class_1 rows with two features.
# Both features are drawn with mean 1 (loc=1) and standard deviation 1,
# i.e. shifted by one unit relative to the class-0 features (loc=0).
class_1 = pd.DataFrame(
    {
        'feature_1': np.random.normal(loc=1, scale=1, size=n_class_1),
        'feature_2': np.random.normal(loc=1, scale=1, size=n_class_1),
        'target': [1] * n_class_1,
    }
)

class_1

Unnamed: 0,feature_1,feature_2,target
0,1.467098,0.425451,1
1,1.516927,-0.064626,1
2,0.994206,0.115141,1
3,-0.485257,1.529806,1
4,1.757646,2.341931,1
...,...,...,...
95,-0.318932,0.870906,1
96,2.130943,0.882006,1
97,1.337912,2.556846,1
98,0.996449,0.174392,1


In [12]:
# Stack the minority rows on top of the majority rows; ignore_index=True
# discards the per-class indices and numbers the rows 0..len-1.
df = pd.concat([class_1, class_0], ignore_index=True)
df

Unnamed: 0,feature_1,feature_2,target
0,1.467098,0.425451,1
1,1.516927,-0.064626,1
2,0.994206,0.115141,1
3,-0.485257,1.529806,1
4,1.757646,2.341931,1
...,...,...,...
995,-0.930128,0.195092,0
996,-1.141839,-1.215697,0
997,-0.471065,-1.198864,0
998,0.281270,-0.827062,0


### Upsampling

In [13]:
# Split the combined frame by label: target == 1 is the minority class,
# target == 0 the majority class.
df_minority = df.loc[df['target'].eq(1)]
df_majority = df.loc[df['target'].eq(0)]

In [15]:
from sklearn.utils import resample

In [None]:
# Upsample the minority class until it has as many rows as the majority class.
df_minority_upsampled = resample(
    df_minority,
    replace=True,                     # draw with replacement, so rows can repeat
    n_samples=df_majority.shape[0],   # target size = number of majority rows
    random_state=42,                  # fixed seed for a reproducible draw
)

df_minority_upsampled

Unnamed: 0,feature_1,feature_2,target
51,1.953578,0.252614,1
92,0.327699,1.892039,1
14,-0.199089,1.363207,1
71,1.724234,0.924910,1
60,1.083575,0.839822,1
...,...,...,...
52,0.698703,0.818081,1
65,-0.035464,1.124976,1
76,0.784731,0.847603,1
42,3.511360,1.371022,1


In [23]:
# Balanced dataset: the upsampled minority rows followed by the untouched
# majority rows; ignore_index=True renumbers the rows 0..len-1.
df_upsampled = pd.concat([df_minority_upsampled, df_majority], ignore_index=True)
df_upsampled

Unnamed: 0,feature_1,feature_2,target
0,1.953578,0.252614,1
1,0.327699,1.892039,1
2,-0.199089,1.363207,1
3,1.724234,0.924910,1
4,1.083575,0.839822,1
...,...,...,...
1795,-0.930128,0.195092,0
1796,-1.141839,-1.215697,0
1797,-0.471065,-1.198864,0
1798,0.281270,-0.827062,0
