This assignment focuses on methods of classification when data has a class imbalance. You will compare results obtained with Synthetic Minority Oversampling, ADASYN with FastKDE, and Normalizing Flows to oversample the minority class. To develop the data application, you will use



https://www.kaggle.com/competitions/porto-seguro-safe-driver-prediction.



This dataset is very large, and in this project, you may subset 10% of observations from each class. Your results should include confusion matrices and stratified K-fold validated estimates of accuracy and recalls.



For the overall presentation format of the project, the following is a good example:



https://www.kaggle.com/code/rafjaa/resampling-strategies-for-imbalanced-datasets

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, recall_score, accuracy_score
from imblearn.over_sampling import SMOTE, ADASYN
from imblearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
data = pd.read_csv('drive/MyDrive/train.csv')

In [5]:
data

Unnamed: 0,id,target,ps_ind_01,ps_ind_02_cat,ps_ind_03,ps_ind_04_cat,ps_ind_05_cat,ps_ind_06_bin,ps_ind_07_bin,ps_ind_08_bin,...,ps_calc_11,ps_calc_12,ps_calc_13,ps_calc_14,ps_calc_15_bin,ps_calc_16_bin,ps_calc_17_bin,ps_calc_18_bin,ps_calc_19_bin,ps_calc_20_bin
0,7,0,2,2,5,1,0,0,1,0,...,9,1,5,8,0,1,1,0,0,1
1,9,0,1,1,7,0,0,0,0,1,...,3,1,1,9,0,1,1,0,1,0
2,13,0,5,4,9,1,0,0,0,1,...,4,2,7,7,0,1,1,0,1,0
3,16,0,0,1,2,0,0,1,0,0,...,2,2,4,9,0,0,0,0,0,0
4,17,0,0,2,0,1,0,1,0,0,...,3,1,1,3,0,0,0,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
595207,1488013,0,3,1,10,0,0,0,0,0,...,4,1,9,6,0,1,1,0,1,1
595208,1488016,0,5,1,3,0,0,0,0,0,...,4,1,3,8,1,0,1,0,1,1
595209,1488017,0,1,1,10,0,0,1,0,0,...,3,2,2,6,0,0,1,0,0,0
595210,1488021,0,5,2,3,1,0,0,0,1,...,4,1,4,2,0,1,1,1,0,0


In [6]:
total_rows = data.shape[0]
rows_with_missing = (data == -1).any(axis=1).sum()
percentage_rows_with_missing = (rows_with_missing / total_rows) * 100
print(f"Rows with -1 values: {rows_with_missing} ({rows_with_missing}/{total_rows}, {percentage_rows_with_missing:.2f}%)")

# if we remove all rows with a -1 value, we will still be able to sample 10% of the original data (100% - 79.1% > 10%)
# this is all we need - so we decide to remove all such rows

Rows with -1 values: 470281 (470281/595212, 79.01%)


In [7]:
data = data[~(data == -1).any(axis=1)]

In [8]:
data

Unnamed: 0,id,target,ps_ind_01,ps_ind_02_cat,ps_ind_03,ps_ind_04_cat,ps_ind_05_cat,ps_ind_06_bin,ps_ind_07_bin,ps_ind_08_bin,...,ps_calc_11,ps_calc_12,ps_calc_13,ps_calc_14,ps_calc_15_bin,ps_calc_16_bin,ps_calc_17_bin,ps_calc_18_bin,ps_calc_19_bin,ps_calc_20_bin
3,16,0,0,1,2,0,0,1,0,0,...,2,2,4,9,0,0,0,0,0,0
7,22,0,5,1,4,0,0,1,0,0,...,7,1,3,6,1,0,1,0,1,0
9,28,1,1,1,2,0,0,0,1,0,...,3,5,0,6,0,1,0,0,1,0
13,43,0,1,1,3,1,0,0,1,0,...,2,0,4,3,0,0,1,0,1,0
17,58,0,5,1,6,0,1,1,0,0,...,9,1,3,9,0,1,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
595198,1487990,0,5,1,7,1,0,0,0,0,...,6,2,2,14,0,1,1,0,0,0
595201,1487996,0,0,2,2,1,0,0,0,0,...,4,1,2,6,1,1,0,1,1,0
595202,1488001,0,4,1,3,0,0,0,1,0,...,11,6,2,6,0,1,1,0,1,0
595203,1488005,0,3,2,3,1,0,0,1,0,...,5,2,1,6,0,0,0,0,0,0


In [9]:
minority = data[data['target'] == 1]
majority = data[data['target'] == 0]
minority_sample = minority.sample(frac=0.5, random_state=42)
majority_sample = majority.sample(frac=0.5, random_state=42)
subset_data = pd.concat([minority_sample, majority_sample]).reset_index(drop=True)

# Should be 0.5 since we deleted some rows already

In [10]:
subset_data

Unnamed: 0,id,target,ps_ind_01,ps_ind_02_cat,ps_ind_03,ps_ind_04_cat,ps_ind_05_cat,ps_ind_06_bin,ps_ind_07_bin,ps_ind_08_bin,...,ps_calc_11,ps_calc_12,ps_calc_13,ps_calc_14,ps_calc_15_bin,ps_calc_16_bin,ps_calc_17_bin,ps_calc_18_bin,ps_calc_19_bin,ps_calc_20_bin
0,939074,1,0,2,2,1,0,0,1,0,...,8,2,4,9,0,1,0,0,1,0
1,1047958,1,0,1,10,0,0,1,0,0,...,6,1,4,9,0,1,1,0,1,1
2,488377,1,5,1,8,0,0,1,0,0,...,4,0,2,7,0,0,0,0,0,0
3,687047,1,3,1,3,1,0,0,1,0,...,6,0,5,3,0,1,0,0,0,0
4,430075,1,5,4,10,1,0,0,0,0,...,6,2,1,10,1,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
62460,507807,0,4,1,3,1,0,0,1,0,...,5,0,4,5,0,1,1,1,0,1
62461,1290532,0,4,2,1,0,0,1,0,0,...,3,0,1,7,0,0,1,0,0,0
62462,568317,0,4,1,2,0,0,0,1,0,...,5,2,2,7,0,1,1,0,0,0
62463,478718,0,4,1,4,1,0,0,0,1,...,4,3,3,16,0,1,1,0,1,0


In [11]:
data = subset_data

In [12]:
data

Unnamed: 0,id,target,ps_ind_01,ps_ind_02_cat,ps_ind_03,ps_ind_04_cat,ps_ind_05_cat,ps_ind_06_bin,ps_ind_07_bin,ps_ind_08_bin,...,ps_calc_11,ps_calc_12,ps_calc_13,ps_calc_14,ps_calc_15_bin,ps_calc_16_bin,ps_calc_17_bin,ps_calc_18_bin,ps_calc_19_bin,ps_calc_20_bin
0,939074,1,0,2,2,1,0,0,1,0,...,8,2,4,9,0,1,0,0,1,0
1,1047958,1,0,1,10,0,0,1,0,0,...,6,1,4,9,0,1,1,0,1,1
2,488377,1,5,1,8,0,0,1,0,0,...,4,0,2,7,0,0,0,0,0,0
3,687047,1,3,1,3,1,0,0,1,0,...,6,0,5,3,0,1,0,0,0,0
4,430075,1,5,4,10,1,0,0,0,0,...,6,2,1,10,1,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
62460,507807,0,4,1,3,1,0,0,1,0,...,5,0,4,5,0,1,1,1,0,1
62461,1290532,0,4,2,1,0,0,1,0,0,...,3,0,1,7,0,0,1,0,0,0
62462,568317,0,4,1,2,0,0,0,1,0,...,5,2,2,7,0,1,1,0,0,0
62463,478718,0,4,1,4,1,0,0,0,1,...,4,3,3,16,0,1,1,0,1,0


In [13]:
X = data.loc[:,'ps_ind_01':'ps_calc_20_bin'].values
y = data.target.values

In [14]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=123)

In [15]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [16]:
# you only need the Intel patching if the data is too big
!pip install -q scikit-learn-intelex

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.2/4.2 MB[0m [31m58.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m91.4/91.4 MB[0m [31m8.8 MB/s[0m eta [36m0:00:00[0m
[?25h

In [17]:
from imblearn.over_sampling import SMOTE
from imblearn.over_sampling import ADASYN
from sklearnex import patch_sklearn
patch_sklearn()
import numpy as np
import pandas as pd
from collections import Counter
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.utils import resample
from sklearn.datasets import make_classification
from sklearn.metrics import classification_report as CR, confusion_matrix as CM, accuracy_score as AS, recall_score as RS
import warnings
warnings.simplefilter(action='ignore')

Intel(R) Extension for Scikit-learn* enabled (https://github.com/intel/scikit-learn-intelex)


In [18]:
counter = Counter(y_train)
print(counter)

Counter({0: 47704, 1: 2268})


SMOTE:

In [19]:
smote = SMOTE()
X_train_smote, y_train_smote = smote.fit_resample(X_train_scaled, y_train)
counter = Counter(y_train_smote)
print(counter)

Counter({0: 47704, 1: 47704})


In [20]:
# Confusion matrices and stratified K-fold validated estimates of accuracy and recalls
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=123)
model = LogisticRegression(class_weight='balanced', random_state=123)

for train_idx, test_idx in skf.split(X_train_smote, y_train_smote):
    X_train_fold, X_test_fold = X_train_smote[train_idx], X_train_smote[test_idx]
    y_train_fold, y_test_fold = y_train_smote[train_idx], y_train_smote[test_idx]

    model.fit(X_train_fold, y_train_fold)
    y_pred = model.predict(X_test_fold)

    print("Confusion Matrix:")
    print(CM(y_test_fold, y_pred))
    print(f"Accuracy: {AS(y_test_fold, y_pred)}")
    print(f"Recall: {RS(y_test_fold, y_pred)}")

Confusion Matrix:
[[5674 3867]
 [3868 5673]]
Accuracy: 0.5946441672780631
Recall: 0.594591761869825
Confusion Matrix:
[[5599 3942]
 [3886 5655]]
Accuracy: 0.589770464311917
Recall: 0.5927051671732523
Confusion Matrix:
[[5652 3889]
 [3851 5690]]
Accuracy: 0.5943821402368724
Recall: 0.5963735457499214
Confusion Matrix:
[[5624 3917]
 [3829 5711]]
Accuracy: 0.594046433625072
Recall: 0.5986373165618448
Confusion Matrix:
[[5603 3937]
 [3865 5676]]
Accuracy: 0.5911115769613752
Recall: 0.5949061943192537


In [21]:
model.fit(X_train_smote, y_train_smote)
y_pred_smote = model.predict(X_test_scaled)

# Final confusion matrix and metrics
print("Confusion Matrix (SMOTE):")
print(CM(y_test, y_pred_smote))
print(f"Accuracy (SMOTE): {AS(y_test, y_pred_smote)}")
print(f"Recall (SMOTE): {RS(y_test, y_pred_smote)}")

Confusion Matrix (SMOTE):
[[7044 4882]
 [ 269  298]]
Accuracy (SMOTE): 0.5876891058993036
Recall (SMOTE): 0.5255731922398589


In [22]:
print(CR(y_test, y_pred_smote))

              precision    recall  f1-score   support

           0       0.96      0.59      0.73     11926
           1       0.06      0.53      0.10       567

    accuracy                           0.59     12493
   macro avg       0.51      0.56      0.42     12493
weighted avg       0.92      0.59      0.70     12493



ADASYN with FastKDE

In [23]:
adasyn = ADASYN()
X_train_adasyn, y_train_adasyn = adasyn.fit_resample(X_train_scaled, y_train)
counter = Counter(y_train_adasyn)
print(counter)

Counter({1: 47716, 0: 47704})


In [24]:
# Confusion matrices and stratified K-fold validated estimates of accuracy and recalls
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=123)
model = LogisticRegression(class_weight='balanced', random_state=123)

for train_idx, test_idx in skf.split(X_train_adasyn, y_train_adasyn):
    X_train_fold, X_test_fold = X_train_adasyn[train_idx], X_train_adasyn[test_idx]
    y_train_fold, y_test_fold = y_train_adasyn[train_idx], y_train_adasyn[test_idx]

    model.fit(X_train_fold, y_train_fold)
    y_pred = model.predict(X_test_fold)

    print("Confusion Matrix:")
    print(CM(y_test_fold, y_pred))
    print(f"Accuracy: {AS(y_test_fold, y_pred)}")
    print(f"Recall: {RS(y_test_fold, y_pred)}")

Confusion Matrix:
[[5633 3908]
 [3825 5718]]
Accuracy: 0.5947914483336827
Recall: 0.5991826469663628
Confusion Matrix:
[[5586 3955]
 [3894 5649]]
Accuracy: 0.5887130580591071
Recall: 0.5919522162841874
Confusion Matrix:
[[5635 3906]
 [3845 5698]]
Accuracy: 0.5938482498428003
Recall: 0.5970868699570365
Confusion Matrix:
[[5600 3941]
 [3830 5713]]
Accuracy: 0.5928002515195976
Recall: 0.5986587027140312
Confusion Matrix:
[[5551 3989]
 [3810 5734]]
Accuracy: 0.5913330538671138
Recall: 0.6007963118189439


In [25]:
model.fit(X_train_adasyn, y_train_adasyn)
y_pred_adasyn = model.predict(X_test_scaled)

# Final confusion matrix and metrics
print("Confusion Matrix (ADASYN):")
print(CM(y_test, y_pred_adasyn))
print(f"Accuracy (ADASYN): {AS(y_test, y_pred_adasyn)}")
print(f"Recall (ADASYN): {RS(y_test, y_pred_adasyn)}")

Confusion Matrix (ADASYN):
[[7018 4908]
 [ 264  303]]
Accuracy (ADASYN): 0.5860081645721604
Recall (ADASYN): 0.5343915343915344


In [26]:
print(CR(y_test, y_pred_adasyn))

              precision    recall  f1-score   support

           0       0.96      0.59      0.73     11926
           1       0.06      0.53      0.10       567

    accuracy                           0.59     12493
   macro avg       0.51      0.56      0.42     12493
weighted avg       0.92      0.59      0.70     12493



Normalizing Flows

In [27]:
pip install nflows torch

Collecting nflows
  Downloading nflows-0.14.tar.gz (45 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/45.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m45.8/45.8 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: nflows
  Building wheel for nflows (setup.py) ... [?25l[?25hdone
  Created wheel for nflows: filename=nflows-0.14-py3-none-any.whl size=53654 sha256=7410de4b449d0f4c99a04ccceac5d361219d90e1b2d7390bdb8d1f1301112836
  Stored in directory: /root/.cache/pip/wheels/ca/8f/ac/c324eb57b461632081812c33b13161878290d0e6fbb8f5a7e2
Successfully built nflows
Installing collected packages: nflows
Successfully installed nflows-0.14


In [28]:
pip install normflows

Collecting normflows
  Downloading normflows-1.7.3.tar.gz (65 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/65.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m65.3/65.3 kB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: normflows
  Building wheel for normflows (setup.py) ... [?25l[?25hdone
  Created wheel for normflows: filename=normflows-1.7.3-py2.py3-none-any.whl size=87244 sha256=7dc43f07d86d6acdc0fb56a6ae6392660438201d747767104aad252964948eee
  Stored in directory: /root/.cache/pip/wheels/08/b1/a6/f018e29f12dc6251793263911d14764ddad0a6844f7b024007
Successfully built normflows
Installing collected packages: normflows
Successfully installed normflows-1.7.3


In [29]:
import torch
import torch.nn as nn
import torch.optim as optim
from nflows import transforms, flows, distributions

In [35]:
X_majority = X_train_scaled[y_train == 0]
X_minority = X_train_scaled[y_train == 1]

print(f"Majority class samples: {len(X_majority)}")
print(f"Minority class samples: {len(X_minority)}")

Majority class samples: 47704
Minority class samples: 2268


In [69]:
class MLP(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim):
        super(MLP, self).__init__()
        self.net = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, output_dim)
        )

    def forward(self, x, context=None):
        return self.net(x)

def create_flow_model(input_dim, hidden_dim=64):
    base_distribution = distributions.StandardNormal([input_dim])
    transform = transforms.CompositeTransform([
        transforms.ActNorm(input_dim),
        transforms.AffineCouplingTransform(
            mask=np.arange(input_dim) % 2,
            transform_net_create_fn=lambda in_channels, out_channels: MLP(in_channels, hidden_dim, out_channels)
        ),
        transforms.BatchNorm(input_dim),
        transforms.Permutation(torch.randperm(input_dim)),
    ])
    flow = flows.Flow(transform, base_distribution)
    return flow

def train_flow(data, model, optimizer, epochs=10, batch_size=128):
    dataset = torch.tensor(data, dtype=torch.float64)
    model.train()
    for epoch in range(epochs):
        perm = torch.randperm(dataset.size(0))
        epoch_loss = 0.0
        valid_batches = 0
        for i in range(0, dataset.size(0), batch_size):
            batch = dataset[perm[i:i + batch_size]]
        if valid_batches > 0:
            epoch_loss /= valid_batches

In [70]:
input_dim = X_minority.shape[1] # Use dataset's dimension
hidden_dim = 64

flow_model = create_flow_model(input_dim, hidden_dim)

for m in flow_model.modules():
    if isinstance(m, nn.Linear):
        nn.init.kaiming_normal_(m.weight, nonlinearity='relu')
        nn.init.zeros_(m.bias)

optimizer = torch.optim.Adam(flow_model.parameters(), lr=1e-4)
train_flow(X_minority, flow_model, optimizer, epochs=10, batch_size=128) # train model

flow_model.eval()
num_synthetic_samples = len(X_majority) - len(X_minority)
synthetic_samples = flow_model.sample(num_synthetic_samples)
synthetic_samples = synthetic_samples.detach().numpy()

synthetic_samples = synthetic_samples[~np.isnan(synthetic_samples).any(axis=1)] # remove invalid samples
synthetic_samples = synthetic_samples[np.isfinite(synthetic_samples).all(axis=1)]

X_balanced = np.vstack([X_train_scaled, synthetic_samples]) # combine datasets
y_balanced = np.hstack([y_train, np.ones(len(synthetic_samples))])

counter = Counter(y_balanced) # check that the len of the classes are the same
print(f"Balanced dataset class distribution: {counter}")

Balanced dataset class distribution: Counter({0.0: 47704, 1.0: 47704})


In [71]:
# Confusion matrices and stratified K-fold validated estimates of accuracy and recalls

model = LogisticRegression(class_weight='balanced', random_state=123)
model.fit(X_balanced, y_balanced)
y_pred = model.predict(X_test_scaled)

print("Confusion Matrix (NormalizingFlow):")
print(CM(y_test, y_pred))
print(f"Accuracy (NormalizingFlow): {AS(y_test, y_pred):.4f}")
print(f"Recall (NormalizingFlow): {RS(y_test, y_pred):.4f}")

Confusion Matrix (NormalizingFlow):
[[7433 4493]
 [ 267  300]]
Accuracy (NormalizingFlow): 0.6190
Recall (NormalizingFlow): 0.5291


In [73]:
print(CR(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.97      0.62      0.76     11926
           1       0.06      0.53      0.11       567

    accuracy                           0.62     12493
   macro avg       0.51      0.58      0.43     12493
weighted avg       0.92      0.62      0.73     12493

