### RANDOM OVERSAMPLING

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [None]:
!pip install imbalanced-learn

In [None]:
from imblearn.over_sampling import RandomOverSampler

In [None]:
# Toy 2-D dataset: ten points on the diagonal, i.e. both features carry the same value.
diagonal_values = np.array([1.0, 2.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0])
X = np.column_stack((diagonal_values, diagonal_values))

# The first two points are the minority class (1); the other eight are the majority class (0).
y = np.array([1] * 2 + [0] * 8)

# Features and labels side by side in one DataFrame.
df = pd.DataFrame(X, columns=['Feature 1', 'Feature 2']).assign(Class=y)

In [None]:
# Preview the first rows of the toy dataset.
df.head()

In [None]:
# Count rows per class to make the imbalance visible (2 minority vs 8 majority labels above).
df["Class"].value_counts()

In [None]:
# Random oversampler with a fixed seed (random_state=42).
ros = RandomOverSampler(random_state=42)

In [None]:
# Resample using the two feature columns as inputs and the Class column as the target.
X_resampled, y_resampled = ros.fit_resample(df[['Feature 1', 'Feature 2']], df['Class'])

In [None]:
# Display the resampled feature matrix.
X_resampled

In [None]:
# Put the resampled features and labels back into one labelled DataFrame.
df_resampled = (
    pd.DataFrame(X_resampled, columns=['Feature 1', 'Feature 2'])
    .assign(Class=y_resampled)
)

In [None]:
# Class counts after random oversampling.
df_resampled["Class"].value_counts()

In [None]:
# Number of rows that are exact repeats of an earlier row in the resampled frame.
df_resampled.duplicated().sum()

### SMOTE

In [None]:
from imblearn.over_sampling import SMOTE

In [None]:
# Seed NumPy's global RNG so the synthetic data is the same on every run.
np.random.seed(0)

# Dataset dimensions. The class sizes are derived from num_samples so the label
# vector always has exactly one entry per feature row.
num_samples = 1000
num_features = 5
num_minority = 100                         # rows labelled 1
num_majority = num_samples - num_minority  # rows labelled 0 (900 here)

# Features are standard-normal noise; labels are 0 for the first block of rows, then 1.
X = np.random.randn(num_samples, num_features)
y = np.concatenate([np.zeros(num_majority), np.ones(num_minority)])

# Create a DataFrame
feature_names = [f'feature_{i}' for i in range(num_features)]
df = pd.DataFrame(X, columns=feature_names)
df['Class'] = y.astype(int)  # np.zeros/np.ones yield floats; store the labels as integers

In [None]:
# Preview the first rows of the synthetic dataset.
df.head()

In [None]:
# Rows per class before resampling.
df["Class"].value_counts()

In [None]:
# SMOTE with a fixed seed, fitted on the features and the Class target.
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(
    df.drop(columns='Class'),  # every column except the label
    df['Class'],
)

In [None]:
# Display the SMOTE-resampled feature matrix.
X_resampled

In [None]:
# Put the resampled features and labels back into one labelled DataFrame.
df_resampled = (
    pd.DataFrame(X_resampled, columns=feature_names)
    .assign(Class=y_resampled)
)

In [None]:
# Class counts after SMOTE.
df_resampled["Class"].value_counts()

In [None]:
# Number of rows that are exact repeats of an earlier row in the SMOTE-resampled frame.
df_resampled.duplicated().sum()

### UNDERSAMPLING

In [None]:
# Seed the global RNG in this cell so the generated data does not depend on how much
# random state earlier cells happened to consume.
np.random.seed(0)

# Column labels shared by both per-class frames.
feature_columns = [f'feature_{i}' for i in range(5)]

# Class 0 (majority class): 900 samples
class0_data = np.random.randn(900, 5)
class0_df = pd.DataFrame(class0_data, columns=feature_columns)
class0_df['Class'] = 0

# Class 1 (minority class): 400 samples
class1_data = np.random.randn(400, 5)
class1_df = pd.DataFrame(class1_data, columns=feature_columns)
class1_df['Class'] = 1

# Combine into a single DataFrame (class 0 rows first) and preview it.
df = pd.concat([class0_df, class1_df], ignore_index=True)
df.head()

In [None]:
# Rows per class before undersampling.
df['Class'].value_counts()

In [None]:
# Split the frame by label so each class can be handled separately.
class_0 = df.loc[df["Class"].eq(0)]
class_1 = df.loc[df["Class"].eq(1)]

In [None]:
# (rows, columns) of the class-0 subset.
class_0.shape

In [None]:
# (rows, columns) of the class-1 subset.
class_1.shape

In [None]:
# Draw as many class-0 rows as there are class-1 rows, so the two classes end up the
# same size even if the dataset above changes. The fixed random_state keeps the
# selection repeatable.
undersampled_class0 = class_0.sample(n=len(class_1), random_state=42)

In [None]:
# (rows, columns) of the sampled class-0 subset.
undersampled_class0.shape

In [None]:
# Stack the sampled class-0 rows on top of all class-1 rows; ignore_index=True renumbers the rows.
df_resampled = pd.concat([undersampled_class0, class_1], ignore_index=True)

In [None]:
# Look at the last rows; class_1 was concatenated last, so these come from class 1.
df_resampled.tail()

In [None]:
# Class counts after undersampling.
df_resampled["Class"].value_counts()

In [None]:
## SHUFFLING
# df_resampled currently holds all class-0 rows followed by all class-1 rows.
# sample(frac=1) returns every row in a random order (fixed by random_state), and
# reset_index(drop=True) replaces the old row labels with a fresh 0..n-1 index.
df_resampled = df_resampled.sample(frac=1, random_state=0).reset_index(drop=True)

In [None]:
# Preview the first rows after shuffling.
df_resampled.head()

### HYBRID SAMPLING

In [2]:
# Fix the global RNG so the synthetic data (and the outputs recorded below) are repeatable.
np.random.seed(0)

# Column labels shared by both per-class frames.
feature_columns = [f'feature_{i}' for i in range(5)]

# Majority class (label 0): 900 rows of standard-normal features.
class0_data = np.random.randn(900, 5)
class0_df = pd.DataFrame(class0_data, columns=feature_columns).assign(Class=0)

# Minority class (label 1): 100 rows of standard-normal features.
class1_data = np.random.randn(100, 5)
class1_df = pd.DataFrame(class1_data, columns=feature_columns).assign(Class=1)

# One frame with the majority rows first, renumbered from 0.
df = pd.concat([class0_df, class1_df], ignore_index=True)

In [3]:
# Preview the first rows of the combined dataset.
df.head()

Unnamed: 0,feature_0,feature_1,feature_2,feature_3,feature_4,Class
0,1.764052,0.400157,0.978738,2.240893,1.867558,0
1,-0.977278,0.950088,-0.151357,-0.103219,0.410599,0
2,0.144044,1.454274,0.761038,0.121675,0.443863,0
3,0.333674,1.494079,-0.205158,0.313068,-0.854096,0
4,-2.55299,0.653619,0.864436,-0.742165,2.269755,0


In [4]:
# Rows per class before any resampling.
df["Class"].value_counts()

Class
0    900
1    100
Name: count, dtype: int64

In [5]:
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler

In [6]:
### UNDERSAMPLING
undersample_strategy = {0 : 300}  # Undersample the majority class (0) down to 300 rows; this is NOT the minority size (100)


# Fixed random_state so the selection of majority rows is repeatable.
undersampler = RandomUnderSampler(sampling_strategy=undersample_strategy, random_state=0)

# Features are every column except the Class label; the target is the Class column.
X_resampled, y_resampled = undersampler.fit_resample(df.drop('Class', axis=1), df['Class'])

In [7]:
# Shapes of the features and labels after undersampling.
X_resampled.shape , y_resampled.shape

((400, 5), (400,))

In [8]:
### SMOTE
smote = SMOTE(random_state=0)
X_resampled, y_resampled = smote.fit_resample(X_resampled, y_resampled)

In [9]:
# Shapes of the features and labels after SMOTE.
X_resampled.shape , y_resampled.shape

((600, 5), (600,))

In [10]:
# Rebuild one labelled DataFrame from the resampled features and target.
df_resampled = (
    pd.DataFrame(X_resampled, columns=[f'feature_{i}' for i in range(5)])
    .assign(Class=y_resampled)
)

In [11]:
# Class counts after undersampling followed by SMOTE.
df_resampled["Class"].value_counts()

Class
0    300
1    300
Name: count, dtype: int64

In [12]:
# Preview the first rows of the resampled frame (before shuffling).
df_resampled.head()

Unnamed: 0,feature_0,feature_1,feature_2,feature_3,feature_4,Class
0,-0.935127,0.381824,0.398296,-1.255775,1.222877,0
1,2.132153,0.936446,-0.035095,1.265078,0.211497,0
2,0.100159,-0.475175,1.272954,-1.696131,0.730184,0
3,-0.663478,1.126636,-1.079932,-1.147469,-0.43782,0
4,0.677246,-0.031911,-0.173608,0.898241,-0.197787,0


In [13]:
### SHUFFLING
# sample(frac=1) returns every row in a random order (fixed by random_state), and
# reset_index(drop=True) replaces the old row labels with a fresh 0..n-1 index.
df_resampled = df_resampled.sample(frac=1, random_state=0).reset_index(drop=True)

In [14]:
# Preview the first rows after shuffling.
df_resampled.head()

Unnamed: 0,feature_0,feature_1,feature_2,feature_3,feature_4,Class
0,-0.946076,1.153331,0.077131,-0.213818,-0.555937,1
1,-0.779923,0.089076,-0.129153,0.264739,-1.661848,0
2,-0.062053,1.259167,0.704111,-1.49568,2.526368,0
3,1.077169,0.825145,-0.189224,1.232787,0.331181,1
4,-0.184951,-0.176783,-2.199254,0.729289,1.100558,0
