In [2]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE

In [6]:
df = pd.read_csv("../../Datasets/sample_dataset.csv").iloc[:, -10:].dropna()

In [11]:
X = df.iloc[:, 0:-1]

In [12]:
y = df["target"]

In [13]:
X_scaled = StandardScaler().fit_transform(X)

# With Numerical Features

In [14]:
y.value_counts()

target
1    61
0    35
Name: count, dtype: int64

In [15]:
resampler = SMOTE(random_state = 0)

In [16]:
X_res, y_res = resampler.fit_resample(X_scaled, y)

In [17]:
y_res.value_counts()

target
0    61
1    61
Name: count, dtype: int64

# With Categorical Features

In [22]:
from imblearn.over_sampling import SMOTENC

In [23]:
df = pd.read_csv("../../Datasets/sample_dataset.csv").loc[:, ["mean radius", "mean texture", "mean perimeter", "area error", "target"]].dropna()
df

Unnamed: 0,mean radius,mean texture,mean perimeter,area error,target
1,20.57,17.77,132.90,A,0
2,19.69,21.25,130.00,A,0
3,11.42,20.38,77.58,A,0
7,13.71,20.83,90.20,A,0
8,13.00,21.82,87.50,A,0
...,...,...,...,...,...
561,11.20,29.37,70.67,A,1
563,20.92,25.09,143.00,A,0
564,21.56,22.39,142.00,A,0
566,16.60,28.08,108.30,A,0


In [24]:
X = df.iloc[:, 0:-1]
y = df["target"]

In [30]:
resampler = SMOTENC(random_state=0, categorical_features = [3], k_neighbors = 10)

In [31]:
X_res, y_res = resampler.fit_resample(X, y)

In [32]:
y_res.value_counts()

target
0    211
1    211
Name: count, dtype: int64

In [33]:
X_res.shape

(422, 4)

In [34]:
X.shape

(332, 4)

# Exercise

- Load wine Dataset
- Apply Standardization to the features
- Apply SMOTE to balance target classes
- Invert Standardization to go back to the Original feature space

In [37]:
import pandas as pd
import numpy as np
from sklearn.datasets import load_wine
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE, SMOTENC

In [42]:
X, y = load_wine(return_X_y=True)

In [43]:
np.unique(y, return_counts=True)

(array([0, 1, 2]), array([59, 71, 48], dtype=int64))

In [44]:
scaler = StandardScaler()

In [45]:
X_scaled = scaler.fit_transform(X)

In [47]:
res = SMOTE(random_state=0)
X_res, y_res = res.fit_resample(X, y)

In [48]:
np.unique(y_res, return_counts=True)

(array([0, 1, 2]), array([71, 71, 71], dtype=int64))

In [49]:
scaler.inverse_transform(X_res)

array([[2.45204137e+01, 4.24129452e+00, 3.03129753e+00, ...,
        1.19449519e+00, 5.38701899e+00, 3.35179958e+05],
       [2.36865844e+01, 4.31927477e+00, 2.95196156e+00, ...,
        1.19677448e+00, 5.01886249e+00, 3.30469633e+05],
       [2.36542027e+01, 4.96539687e+00, 3.09695488e+00, ...,
        1.19221590e+00, 4.85602404e+00, 3.72862557e+05],
       ...,
       [2.33685963e+01, 6.67099527e+00, 3.02246412e+00, ...,
        1.08250521e+00, 3.88102485e+00, 1.98030451e+05],
       [2.41145227e+01, 5.68816403e+00, 3.01644862e+00, ...,
        1.11607486e+00, 3.82409013e+00, 1.94507134e+05],
       [2.33249691e+01, 6.25501738e+00, 3.02458066e+00, ...,
        1.09604590e+00, 3.84823149e+00, 1.99555111e+05]])