In [None]:
# === 0. Libs & seed ==========================================================
import math, random, os
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.cuda.amp import autocast, GradScaler
from torch.utils.data import DataLoader
import torchvision
import torchvision.transforms as T

def set_seed(seed=1337):
    """Seed Python's `random` module and PyTorch (CPU plus all CUDA devices) with `seed`."""
    seeders = (random.seed, torch.manual_seed, torch.cuda.manual_seed_all)
    for seed_fn in seeders:
        seed_fn(seed)
# Seed the RNGs once, before any dataset or model object is created.
set_seed(1337)

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# === 1. Data: CIFAR-100 -> resize to 227, strong augmentation ================
BATCH_SIZE = 512
IMG_SIZE = 227

# CIFAR-100 stats (per-channel mean / std passed to T.Normalize below)
MEAN = (0.5071, 0.4867, 0.4408)
STD  = (0.2675, 0.2565, 0.2761)

# Training-time pipeline: upscale, random crop to IMG_SIZE, flip, RandAugment, normalize.
transform_train = T.Compose([
    T.Resize(256),
    T.RandomResizedCrop(IMG_SIZE, scale=(0.6, 1.0)),
    T.RandomHorizontalFlip(),
    T.RandAugment(num_ops=2, magnitude=9),   # N=2 ops, M=9 magnitude
    T.ToTensor(),
    T.Normalize(MEAN, STD),
])

# Test-time pipeline: deterministic resize + center crop, same normalization.
transform_test = T.Compose([
    T.Resize(IMG_SIZE),
    T.CenterCrop(IMG_SIZE),
    T.ToTensor(),
    T.Normalize(MEAN, STD),
])

# download=True fetches CIFAR-100 into ./data when it is not already there.
trainset = torchvision.datasets.CIFAR100(root="./data", train=True, download=True, transform=transform_train)
testset  = torchvision.datasets.CIFAR100(root="./data", train=False, download=True, transform=transform_test)

# Only the training loader shuffles; the test loader keeps a fixed order.
trainloader = DataLoader(trainset, batch_size=BATCH_SIZE, shuffle=True, num_workers=2, pin_memory=True)
testloader  = DataLoader(testset,  batch_size=BATCH_SIZE, shuffle=False, num_workers=2, pin_memory=True)

In [None]:
# === 2. Model, following the description in the paper =========================
class ProposedCNN(nn.Module):
    """
    7 conv layers: 4 Conv2d + 1 GroupedConv + 2 TransposedConv
    Followed by: Pool(3x3,s2) -> FC(4096)->Drop(0.5)->FC(4096)->Drop(0.5)->FC(100)
    Uses LazyLinear so the flattened dimension does not have to be hard-coded.

    Spatial sizes for a 227x227 input, from the standard Conv/Pool output-size
    formula: 227 -> 37 -> 19 -> 10 -> 5 -> 3 -> 2 -> 2 -> 2 -> 2 -> 1, so the
    flattened vector fed to the first FC layer has 128 * 1 * 1 = 128 elements.

    NOTE: the order of the layers inside each nn.Sequential determines the
    state_dict keys (e.g. "features.0.weight") and the integer indices used to
    address sub-modules (e.g. classifier[5]); reordering breaks saved checkpoints.
    """
    def __init__(self, num_classes=100):
        super().__init__()
        self.features = nn.Sequential(
            # Conv1: 13x13, s=6, pad≈"same"
            nn.Conv2d(3, 128, kernel_size=13, stride=6, padding=3),
            nn.LeakyReLU(0.01, inplace=True),
            nn.BatchNorm2d(128),
            nn.MaxPool2d(kernel_size=5, stride=2, padding=2),   # "same"-style padding for the pool

            # Conv2: 7x7, s=2, pad="same"
            nn.Conv2d(128, 64, kernel_size=7, stride=2, padding=3),
            nn.LeakyReLU(0.01, inplace=True),
            nn.BatchNorm2d(64),
            nn.MaxPool2d(kernel_size=5, stride=2, padding=2),

            # Conv3: 5x5, s=2, pad="same"
            nn.Conv2d(64, 48, kernel_size=5, stride=2, padding=2),
            nn.LeakyReLU(0.01, inplace=True),

            # Conv4: 3x3, s=2, pad="same"
            nn.Conv2d(48, 32, kernel_size=3, stride=2, padding=1),
            nn.LeakyReLU(0.01, inplace=True),

            # Grouped Conv5: 3x3, s=1, pad="same", groups=2
            nn.Conv2d(32, 32, kernel_size=3, stride=1, padding=1, groups=2),
            nn.LeakyReLU(0.01, inplace=True)
        )

        # Conv6 (Transposed): 3x3, s=1, pad=1, out=256
        self.deconv1 = nn.Sequential(
            nn.ConvTranspose2d(32, 256, kernel_size=3, stride=1, padding=1),
            nn.LeakyReLU(0.01, inplace=True)
        )

        # Conv7 (Transposed): 3x3, s=1, pad=1, out=128 + BN + Pool(3,s=2)
        self.deconv2 = nn.Sequential(
            nn.ConvTranspose2d(256, 128, kernel_size=3, stride=1, padding=1),
            nn.LeakyReLU(0.01, inplace=True),
            nn.BatchNorm2d(128),
            nn.MaxPool2d(kernel_size=3, stride=2, padding=1)   # "same"-style padding for the pool
        )

        # Classifier per the paper: 4096 -> 4096 -> 100, dropout 0.5 between the FC layers
        self.classifier = nn.Sequential(
            nn.Flatten(),
            nn.LazyLinear(4096),   # in_features is inferred on the first forward pass
            nn.LeakyReLU(0.01, inplace=True),
            nn.Dropout(p=0.5),

            nn.Linear(4096, 4096),
            nn.LeakyReLU(0.01, inplace=True),
            nn.Dropout(p=0.5),

            nn.Linear(4096, num_classes)
        )

    def forward(self, x):
        """Run features -> deconv1 -> deconv2 -> classifier and return the raw logits."""
        x = self.features(x)
        x = self.deconv1(x)
        x = self.deconv2(x)
        x = self.classifier(x)   # logits
        return x

# Instantiate the 100-class model and move it to the selected device.
net = ProposedCNN(num_classes=100).to(device)

In [None]:
from tqdm import tqdm
import copy
import random
import numpy as np

# === EarlyStopping helper =====================================================
class EarlyStopping:
    """
    Tracks the best score seen so far and signals when training should stop.

    Call the instance once per epoch with a "higher is better" score and the
    model. A score strictly below `best_score + delta` counts as "no
    improvement"; after `patience` such epochs in a row `early_stop` becomes
    True. Any other score replaces the best score, snapshots the model's
    state_dict into `best_model_wts` and resets the counter.
    """

    def __init__(self, patience=10, delta=0.0):
        self.patience, self.delta = patience, delta
        self.best_score = None
        self.best_model_wts = None
        self.counter = 0
        self.early_stop = False

    def _remember(self, score, model):
        # Deep copy so later optimizer steps cannot alter the stored snapshot.
        self.best_score = score
        self.best_model_wts = copy.deepcopy(model.state_dict())

    def __call__(self, score, model):
        # First observation: just record it as the baseline.
        if self.best_score is None:
            self._remember(score, model)
            return

        # Not good enough: count it and possibly raise the stop flag.
        if score < self.best_score + self.delta:
            self.counter += 1
            print(f"  No improvement for {self.counter}/{self.patience} epochs")
            if self.counter >= self.patience:
                self.early_stop = True
            return

        # New best: store it and restart the patience window.
        self._remember(score, model)
        self.counter = 0

# === 3. Loss/Optim/Scheduler ==========================================
MIX_ALPHA = 1.0  # alpha of the Beta(alpha, alpha) distribution used by MixUp/CutMix
criterion = nn.CrossEntropyLoss(label_smoothing=0.1)
# NOTE(review): `net` contains an nn.LazyLinear that has not seen a forward pass yet
# at this point. PyTorch's lazy-module documentation recommends a dry-run forward
# before building the optimizer - confirm this ordering is intended.
optimizer = optim.SGD(net.parameters(), lr=0.01, momentum=0.9, weight_decay=5e-4)
num_epochs = 500
# One cosine cycle spread over the full epoch budget (scheduler.step() is called once per epoch).
scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=num_epochs)
# Gradient scaler for the mixed-precision (autocast) training loop.
scaler = GradScaler(enabled=True)

# Stop after 12 consecutive epochs without an improvement in the monitored score.
early_stopping = EarlyStopping(patience=12, delta=0.0)

# === MixUp & CutMix ===================================================
def mixup_data(x, y, alpha=MIX_ALPHA):
    """
    MixUp: blend every sample of the batch with a randomly chosen partner.

    Returns (mixed_x, y_a, y_b, lam) where mixed_x = lam * x + (1 - lam) * x[perm],
    y_a are the original labels, y_b the partners' labels and lam is drawn from
    Beta(alpha, alpha) (lam = 1.0 when alpha <= 0, i.e. no mixing).
    """
    if alpha > 0:
        lam = torch.distributions.Beta(alpha, alpha).sample().item()
    else:
        lam = 1.0
    perm = torch.randperm(x.size(0), device=x.device)
    partner_x = x[perm, :]
    blended = lam * x + (1 - lam) * partner_x
    return blended, y, y[perm], lam

def cutmix_data(x, y, alpha=MIX_ALPHA):
    """
    CutMix: paste a random rectangle taken from a shuffled copy of the batch.

    Returns (mixed_x, y_a, y_b, lam). The lam sampled from Beta(alpha, alpha)
    only sizes the rectangle; the returned lam is recomputed from the area of
    the (clipped) rectangle that was actually pasted.
    """
    lam = 1.0
    if alpha > 0:
        lam = torch.distributions.Beta(alpha, alpha).sample().item()
    perm = torch.randperm(x.size(0), device=x.device)

    left, top, right, bottom = rand_bbox(x.size(), lam)
    patched = x.clone()
    patched[:, :, top:bottom, left:right] = x[perm, :, top:bottom, left:right]

    # Re-derive lambda from the area of the pasted patch
    patch_area = (right - left) * (bottom - top)
    image_area = x.size(-1) * x.size(-2)
    lam = 1 - (patch_area / image_area)
    return patched, y, y[perm], lam

def rand_bbox(size, lam):
    """
    Draw a random box for CutMix inside an image of shape size = (N, C, H, W).

    The box sides are sqrt(1 - lam) times the image sides, its centre is drawn
    uniformly with `random.randint` (both ends inclusive) and the corners are
    clipped to the image. Returns (bbx1, bby1, bbx2, bby2).
    """
    height, width = size[2], size[3]
    side_ratio = (1. - lam) ** 0.5
    half_w = int(width * side_ratio) // 2
    half_h = int(height * side_ratio) // 2

    # uniform centre (x is drawn before y)
    centre_x = random.randint(0, width)
    centre_y = random.randint(0, height)

    x_lo = np.clip(centre_x - half_w, 0, width)
    y_lo = np.clip(centre_y - half_h, 0, height)
    x_hi = np.clip(centre_x + half_w, 0, width)
    y_hi = np.clip(centre_y + half_h, 0, height)
    return x_lo, y_lo, x_hi, y_hi

def mix_criterion(pred, y_a, y_b, lam):
    """Loss for mixed samples: lam-weighted sum of `criterion` against both label sets."""
    loss_a = criterion(pred, y_a)
    loss_b = criterion(pred, y_b)
    return lam * loss_a + (1 - lam) * loss_b

def accuracy(pred, target):
    """Top-1 accuracy of logits `pred` against `target`, as a percentage (Python float)."""
    with torch.no_grad():
        top1 = pred.max(1)[1]
        hits = (top1 == target).float()
        return hits.mean().item() * 100.0

# === 4. Train/Eval loop ======================================================
# Each epoch: one pass over trainloader with MixUp/CutMix + AMP, then a full
# evaluation on testloader, one scheduler step and an early-stopping check.
for epoch in range(num_epochs):
    net.train()
    running_loss, run_correct, run_total = 0.0, 0, 0
    pbar = tqdm(enumerate(trainloader), total=len(trainloader), desc=f"Epoch {epoch+1}/{num_epochs}")

    for i, (inputs, labels) in pbar:
        inputs, labels = inputs.to(device, non_blocking=True), labels.to(device, non_blocking=True)
        optimizer.zero_grad(set_to_none=True)

        # randomly pick MixUp or CutMix (50/50) for this batch
        if random.random() < 0.5:
            inputs_mixed, y_a, y_b, lam = mixup_data(inputs, labels)
        else:
            inputs_mixed, y_a, y_b, lam = cutmix_data(inputs, labels)

        # Forward pass and loss under autocast (mixed precision).
        with autocast(enabled=True):
            outputs = net(inputs_mixed)
            loss = mix_criterion(outputs, y_a, y_b, lam)

        # Scaled backward + optimizer step, then update the loss scale.
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()

        # accuracy estimate: lam-weighted blend of the accuracies against both label sets
        acc_a = accuracy(outputs, y_a)
        acc_b = accuracy(outputs, y_b)
        batch_acc = lam * acc_a + (1 - lam) * acc_b

        running_loss += loss.item()
        # batch_acc is already a percentage, so run_correct / run_total is a
        # sample-weighted mean percentage (not a count of correct samples).
        run_correct += batch_acc * inputs.size(0)
        run_total   += inputs.size(0)

        # Refresh the progress-bar statistics every 50 batches and on the last batch.
        if (i % 50 == 0) or (i == len(trainloader)-1):
            avg_loss = running_loss / (i+1)
            avg_acc  = run_correct / run_total
            pbar.set_postfix({"Loss": f"{avg_loss:.3f}", "Train Acc": f"{avg_acc:.2f}%"})

    # ---- Evaluation ----
    net.eval()
    correct, total = 0, 0
    with torch.no_grad(), autocast(enabled=True):
        for inputs, labels in testloader:
            inputs, labels = inputs.to(device, non_blocking=True), labels.to(device, non_blocking=True)
            outputs = net(inputs)
            _, pred = outputs.max(1)
            total   += labels.size(0)
            correct += (pred == labels).sum().item()
    test_acc = 100.0 * correct / total
    print(f"Epoch {epoch+1}/{num_epochs} finished - Test Accuracy: {test_acc:.2f}%")

    # Advance the cosine learning-rate schedule once per epoch.
    scheduler.step()

    # ---- Early stopping ----
    # NOTE(review): the monitored score is the accuracy on testloader, so the
    # checkpoint is selected on the same data it is later reported on - confirm
    # that no separate validation split is intended.
    early_stopping(test_acc, net)
    if early_stopping.early_stop:
        print("Early stopping triggered! Loading best model weights...")
        net.load_state_dict(early_stopping.best_model_wts)
        break

print("Finished Training.")

In [None]:
# Restore the best weights kept by early stopping and measure their accuracy on
# the test loader. This matters when training ran through all epochs without
# triggering early stopping, because `net` then still holds the last-epoch weights.
net.load_state_dict(early_stopping.best_model_wts)
net.eval()
correct, total = 0, 0
with torch.no_grad(), autocast(enabled=True):
    for inputs, labels in testloader:
        inputs, labels = inputs.to(device, non_blocking=True), labels.to(device, non_blocking=True)
        outputs = net(inputs)
        _, pred = outputs.max(1)
        total   += labels.size(0)
        correct += (pred == labels).sum().item()
test_acc = 100.0 * correct / total
# The old message reported "Epoch N finished" using the loop variable leaked
# from the training cell, which mislabels this as a per-epoch result.
print(f"Best checkpoint - Test Accuracy: {test_acc:.2f}%")

In [None]:
# Save the trained weights, then reload them into a fresh model and re-evaluate
# (in full precision, without autocast) to check the checkpoint round-trips.
CKPT_PATH = "proposed_cnn_cifar100_state.pth"
torch.save(net.state_dict(), CKPT_PATH)

model = ProposedCNN(num_classes=100).to(device)
# map_location keeps the reload working on a machine without the GPU the
# checkpoint was written from.
model.load_state_dict(torch.load(CKPT_PATH, map_location=device))
model.eval()
correct, total = 0, 0
with torch.no_grad():
    for inputs, labels in testloader:
        inputs, labels = inputs.to(device, non_blocking=True), labels.to(device, non_blocking=True)
        outputs = model(inputs)
        _, pred = outputs.max(1)
        total   += labels.size(0)
        correct += (pred == labels).sum().item()
test_acc = 100.0 * correct / total
# The old message reported "Epoch N finished" using the leaked training-loop variable.
print(f"Reloaded checkpoint - Test Accuracy: {test_acc:.2f}%")

In [1]:
# === 0. Libs & seed ==========================================================
import math, random, os
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.cuda.amp import autocast, GradScaler
from torch.utils.data import DataLoader
import torchvision
import torchvision.transforms as T

def set_seed(seed=1337):
    """Seed Python's `random` module and PyTorch (CPU plus all CUDA devices) with `seed`."""
    seeders = (random.seed, torch.manual_seed, torch.cuda.manual_seed_all)
    for seed_fn in seeders:
        seed_fn(seed)
# Seed the RNGs before the model is constructed.
set_seed(1337)

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# === 2. Model, following the description in the paper =========================
class ProposedCNN(nn.Module):
    """
    7 conv layers: 4 Conv2d + 1 GroupedConv + 2 TransposedConv
    Followed by: Pool(3x3,s2) -> FC(4096)->Drop(0.5)->FC(4096)->Drop(0.5)->FC(100)
    Uses LazyLinear so the flattened dimension does not have to be hard-coded.

    Spatial sizes for a 227x227 input, from the standard Conv/Pool output-size
    formula: 227 -> 37 -> 19 -> 10 -> 5 -> 3 -> 2 -> 2 -> 2 -> 2 -> 1, so the
    flattened vector fed to the first FC layer has 128 * 1 * 1 = 128 elements.

    NOTE: the order of the layers inside each nn.Sequential determines the
    state_dict keys (e.g. "features.0.weight") and the integer indices used to
    address sub-modules (e.g. classifier[5]); reordering breaks saved checkpoints.
    """
    def __init__(self, num_classes=100):
        super().__init__()
        self.features = nn.Sequential(
            # Conv1: 13x13, s=6, pad≈"same"
            nn.Conv2d(3, 128, kernel_size=13, stride=6, padding=3),
            nn.LeakyReLU(0.01, inplace=True),
            nn.BatchNorm2d(128),
            nn.MaxPool2d(kernel_size=5, stride=2, padding=2),   # "same"-style padding for the pool

            # Conv2: 7x7, s=2, pad="same"
            nn.Conv2d(128, 64, kernel_size=7, stride=2, padding=3),
            nn.LeakyReLU(0.01, inplace=True),
            nn.BatchNorm2d(64),
            nn.MaxPool2d(kernel_size=5, stride=2, padding=2),

            # Conv3: 5x5, s=2, pad="same"
            nn.Conv2d(64, 48, kernel_size=5, stride=2, padding=2),
            nn.LeakyReLU(0.01, inplace=True),

            # Conv4: 3x3, s=2, pad="same"
            nn.Conv2d(48, 32, kernel_size=3, stride=2, padding=1),
            nn.LeakyReLU(0.01, inplace=True),

            # Grouped Conv5: 3x3, s=1, pad="same", groups=2
            nn.Conv2d(32, 32, kernel_size=3, stride=1, padding=1, groups=2),
            nn.LeakyReLU(0.01, inplace=True)
        )

        # Conv6 (Transposed): 3x3, s=1, pad=1, out=256
        self.deconv1 = nn.Sequential(
            nn.ConvTranspose2d(32, 256, kernel_size=3, stride=1, padding=1),
            nn.LeakyReLU(0.01, inplace=True)
        )

        # Conv7 (Transposed): 3x3, s=1, pad=1, out=128 + BN + Pool(3,s=2)
        self.deconv2 = nn.Sequential(
            nn.ConvTranspose2d(256, 128, kernel_size=3, stride=1, padding=1),
            nn.LeakyReLU(0.01, inplace=True),
            nn.BatchNorm2d(128),
            nn.MaxPool2d(kernel_size=3, stride=2, padding=1)   # "same"-style padding for the pool
        )

        # Classifier per the paper: 4096 -> 4096 -> 100, dropout 0.5 between the FC layers
        self.classifier = nn.Sequential(
            nn.Flatten(),
            nn.LazyLinear(4096),   # in_features is inferred on the first forward pass
            nn.LeakyReLU(0.01, inplace=True),
            nn.Dropout(p=0.5),

            nn.Linear(4096, 4096),
            nn.LeakyReLU(0.01, inplace=True),
            nn.Dropout(p=0.5),

            nn.Linear(4096, num_classes)
        )

    def forward(self, x):
        """Run features -> deconv1 -> deconv2 -> classifier and return the raw logits."""
        x = self.features(x)
        x = self.deconv1(x)
        x = self.deconv2(x)
        x = self.classifier(x)   # logits
        return x

# Instantiate the 100-class model and move it to the selected device.
net = ProposedCNN(num_classes=100).to(device)

In [2]:
import os, numpy as np, torch, torch.nn as nn
import torchvision.transforms as T
from torch.utils.data import DataLoader
from torchvision.datasets import ImageFolder

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# ==== Load the model pretrained on CIFAR-100 ====
# Option 1 (recommended): state_dict
net = ProposedCNN(num_classes=100).to(device)
# NOTE(review): torch.load unpickles the file, so only load checkpoints from a
# trusted source (consider weights_only=True if the installed torch supports it).
state = torch.load("/kaggle/input/proposed_cnn_cifar100/pytorch/default/1/proposed_cnn_cifar100_state.pth", map_location=device)
net.load_state_dict(state)

# Inference mode: dropout is disabled and BatchNorm uses its running statistics.
net.eval()

# ==== Hook that captures the FC2 features described in the paper ====
# Classifier layout: [Flatten, FC1(4096), LeakyReLU, Dropout,
#                     FC2(4096), LeakyReLU, Dropout, FC3(num_classes)]
# We want the output after the LeakyReLU that follows FC2 -> classifier[5]
fc2_feats = []   # filled by the hook: one CPU tensor per forward pass
def hook_fc2(module, inp, out):
    """Forward hook: append a detached float32 CPU copy of `out` to `fc2_feats`."""
    # out shape: (B, 4096)
    fc2_feats.append(out.detach().float().cpu())

# Register the hook on classifier[5] (the LeakyReLU after FC2); keep the handle
# so the hook can be removed once the features have been collected.
hook_handle = net.classifier[5].register_forward_hook(hook_fc2)

In [3]:
import os, random

def load_all_images_ratio(base_path, total=20000, seed=1337, pos_ratio=0.284):
    """
    Collect a class-ratio-controlled, reproducibly shuffled subset of image paths.

    Expects the layout ``base_path/<patient_id>/<0|1>/*.png`` where folder
    ``0`` holds Non-IDC patches and ``1`` holds IDC patches.

    Parameters
    ----------
    base_path : str
        Root directory containing one sub-directory per patient.
    total : int
        Number of images requested. Fewer are returned when a class does not
        have enough images on disk.
    seed : int
        Seed of the private RNG used for all shuffling.
    pos_ratio : float
        Fraction of `total` taken from the positive (IDC) class. The default
        0.284 gives 28.4% IDC : 71.6% Non-IDC.

    Returns
    -------
    (image_paths, labels) : tuple[list[str], list[int]]
        Parallel lists in shuffled order; label 0 = Non-IDC, 1 = IDC.
        Both are empty when no image is found.

    Raises
    ------
    ValueError
        If `total` is negative or `pos_ratio` is outside [0, 1].
    """
    if total < 0:
        raise ValueError("total must be non-negative")
    if not 0.0 <= pos_ratio <= 1.0:
        raise ValueError("pos_ratio must be within [0, 1]")

    paths_by_label = {'0': [], '1': []}  # 0 = Non-IDC, 1 = IDC

    # os.listdir returns entries in arbitrary, filesystem-dependent order, so
    # sort at both levels; otherwise the seeded shuffle below is not reproducible.
    for patient_id in sorted(os.listdir(base_path)):
        patient_path = os.path.join(base_path, patient_id)
        if not os.path.isdir(patient_path):
            continue

        for label_folder, bucket in paths_by_label.items():
            label_path = os.path.join(patient_path, label_folder)
            if not os.path.isdir(label_path):
                continue
            bucket.extend(
                os.path.join(label_path, file)
                for file in sorted(os.listdir(label_path))
                if file.endswith('.png')
            )

    # A private generator produces the same sequence as reseeding the global
    # one, without disturbing other users of the `random` module.
    rng = random.Random(seed)
    image_paths_0, image_paths_1 = paths_by_label['0'], paths_by_label['1']

    # Shuffle each class separately before truncating
    rng.shuffle(image_paths_0)
    rng.shuffle(image_paths_1)

    # Per-class quotas from the requested ratio
    n_pos = int(total * pos_ratio)   # IDC (+)
    n_neg = total - n_pos            # Non-IDC (-)

    combined = [(path, 0) for path in image_paths_0[:n_neg]]
    combined += [(path, 1) for path in image_paths_1[:n_pos]]

    # Shuffle the merged dataset so the classes are interleaved
    rng.shuffle(combined)

    # zip(*[]) yields nothing to unpack, so handle the empty case explicitly.
    if not combined:
        return [], []

    image_paths, labels = zip(*combined)
    return list(image_paths), list(labels)

In [4]:
# Load a 1,000-image subset: int(1000 * 0.284) = 284 IDC and 716 non-IDC
# (a full-size run would use total=20000, i.e. about 5.7k IDC / 14.3k non-IDC).
base_path = '/kaggle/input/breast-histopathology-images'
image_paths, labels = load_all_images_ratio(base_path, total=1000)
print("Total images loaded:", len(image_paths))
print("Label:", len(labels))
# Class 0 is the Non-IDC folder and class 1 the IDC folder (see
# load_all_images_ratio); the messages below call them Benign / Malignant.
num_class0 = sum(1 for l in labels if l == 0)
num_class1 = sum(1 for l in labels if l == 1)

print(f"Class 0 (Benign): {num_class0}")
print(f"Class 1 (Malignant): {num_class1}")

Total images loaded: 1000
Label: 1000
Class 0 (Benign): 716
Class 1 (Malignant): 284


In [5]:
import torch
from torch.utils.data import Dataset, DataLoader
from PIL import Image
import torchvision.transforms as T

# Transform following the paper (resize to 227 + normalize with the CIFAR-100 statistics)
IMG_SIZE = 227
MEAN = (0.5071, 0.4867, 0.4408)
STD  = (0.2675, 0.2565, 0.2761)

# Deterministic pipeline (no augmentation): resize, center crop, tensor, normalize.
transform_eval = T.Compose([
    T.Resize(IMG_SIZE),
    T.CenterCrop(IMG_SIZE),
    T.ToTensor(),
    T.Normalize(MEAN, STD),
])

# Custom dataset backed by parallel lists of image paths and labels
class IDCDataset(Dataset):
    """
    Map-style dataset that loads an image from disk on every access.

    Parameters
    ----------
    image_paths : sequence of str
        Paths of the image files.
    labels : sequence
        One label per path, in the same order.
    transform : callable or None
        Applied to the RGB PIL image when given.

    Raises
    ------
    ValueError
        If `image_paths` and `labels` differ in length.
    """

    def __init__(self, image_paths, labels, transform=None):
        # Fail early: a length mismatch would otherwise surface as an
        # IndexError (or silently ignored labels) in the middle of iteration.
        if len(image_paths) != len(labels):
            raise ValueError(
                f"image_paths ({len(image_paths)}) and labels ({len(labels)}) must have the same length"
            )
        self.image_paths = image_paths
        self.labels = labels
        self.transform = transform

    def __len__(self):
        return len(self.image_paths)

    def __getitem__(self, idx):
        """Return (image, label) for position `idx`; the image is converted to RGB."""
        # The context manager closes the file handle as soon as the pixels
        # have been decoded by convert().
        with Image.open(self.image_paths[idx]) as raw:
            img = raw.convert("RGB")
        # Compare with None explicitly: a valid transform object may be falsy.
        if self.transform is not None:
            img = self.transform(img)
        return img, self.labels[idx]

# Build the dataset and dataloader
idc_ds = IDCDataset(image_paths, labels, transform=transform_eval)
# shuffle=False keeps batches in the same order as image_paths / labels.
idc_loader = DataLoader(idc_ds, batch_size=128, shuffle=False, num_workers=2, pin_memory=True)

# Sanity check: dataset size and the shape/label of the first sample.
print("Dataset size:", len(idc_ds))
img, lbl = idc_ds[0]
print("Sample image tensor shape:", img.shape, "Label:", lbl)

Dataset size: 1000
Sample image tensor shape: torch.Size([3, 227, 227]) Label: 0


In [6]:
# Run the whole IDC loader through the network; the forward hook registered on
# classifier[5] stores the FC2 activations of every batch in fc2_feats.
fc2_feats.clear()
labels_all = []

with torch.no_grad():
    # `batch_labels` (not `labels`) so the module-level `labels` list that
    # feeds IDCDataset is not overwritten by the last batch tensor.
    for inputs, batch_labels in idc_loader:
        inputs = inputs.to(device, non_blocking=True)
        _ = net(inputs)                  # forward pass pushes the FC2 output into fc2_feats
        labels_all.append(batch_labels.cpu())

# If the hook was already removed (for example this cell is being re-run),
# nothing was captured and torch.cat would fail with an unhelpful message.
if not fc2_feats:
    raise RuntimeError(
        "No FC2 features were captured - the forward hook is not registered. "
        "Re-run the cell that calls register_forward_hook before this one."
    )

# Stack into an N x 4096 matrix
X = torch.cat(fc2_feats, dim=0).numpy()     # (N, 4096)
y = torch.cat(labels_all, dim=0).numpy()    # (N,)
assert X.shape[0] == y.shape[0], "feature / label row counts differ"
print("Feature shape:", X.shape, "Labels:", y.shape)

# Remove the hook (no longer needed)
hook_handle.remove()

Feature shape: (1000, 4096) Labels: (1000,)


In [42]:
# !pip install pynndescent
import numpy as np
from tqdm import tqdm
import pynndescent  # cài bằng: pip install pynndescent
from sklearn.preprocessing import MinMaxScaler

def reliefF_fast(X, y, n_neighbors=30, sample_frac=0.2, random_state=42):
    """
    Fast Relief-F-style feature scoring with sub-sampling and approximate
    nearest neighbours (ANN, via pynndescent).

    A random subset of the rows is drawn. Every sampled row is compared with
    its nearest neighbours inside that subset: a feature's score is decreased
    by |xi - xj| for a neighbour with the same label (hit) and increased by it
    for a neighbour with a different label (miss). Each contribution is
    divided by the subset size.

    Parameters
    ----------
    X : ndarray (n_samples, n_features)
    y : ndarray (n_samples,)
    n_neighbors : number of neighbours (k) used per sampled row; clamped to
        the subset size minus one
    sample_frac : fraction of the rows to sample (0.0 - 1.0)
    random_state : seed for the sampling and for the ANN index

    Returns
    -------
    scores : ndarray (n_features,)
        Importance score for every feature (higher = more discriminative).

    Raises
    ------
    ValueError
        If X and y disagree on the number of rows, or the sampled subset has
        fewer than two rows.
    """
    rng = np.random.default_rng(random_state)
    X = np.asarray(X)
    y = np.asarray(y)
    n_samples, n_features = X.shape
    if y.shape[0] != n_samples:
        raise ValueError("X and y must have the same number of rows")

    # --- Sampling ---
    n_sub = int(sample_frac * n_samples)
    if n_sub < 2:
        raise ValueError(
            f"sample_frac={sample_frac} leaves {n_sub} row(s); at least 2 are needed"
        )
    idx_sub = rng.choice(n_samples, n_sub, replace=False)
    X_sub, y_sub = X[idx_sub], y[idx_sub]

    # A row has at most n_sub - 1 neighbours other than itself.
    k = min(n_neighbors, n_sub - 1)

    # --- ANN (Approximate Nearest Neighbor search) ---
    index = pynndescent.NNDescent(X_sub, n_neighbors=k + 1, random_state=random_state)
    neighbors, _ = index.query(X_sub, k=k + 1)  # +1 because the query point itself is usually returned

    # --- Relief-F scoring ---
    scores = np.zeros(n_features)
    for i in tqdm(range(n_sub), desc="ReliefF (fast ANN)"):
        # The search is approximate and duplicates can tie at distance 0, so
        # the row itself is not guaranteed to sit in column 0: drop it wherever
        # it appears (and any negative placeholder index), then keep k.
        row = neighbors[i]
        row = row[(row != i) & (row >= 0)][:k]
        if row.size == 0:
            continue

        diffs = np.abs(X_sub[row] - X_sub[i])                 # (k, n_features)
        signs = np.where(y_sub[row] == y_sub[i], -1.0, 1.0)   # hit -> -1, miss -> +1
        scores += (signs @ diffs) / n_sub

    return scores

In [None]:
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import IncrementalPCA
from sklearn.feature_selection import mutual_info_classif
from tqdm import tqdm
from skrebate import ReliefF

# --- PCC score với tqdm ---
from scipy.stats import pearsonr

def pcc_scores_progress(X, y):
    """
    Absolute Pearson correlation between every column of X and the target y.

    Computed with vectorised NumPy in column chunks instead of one
    scipy.stats.pearsonr call per feature, so no progress bar is shown any
    more (the name is kept for existing callers).

    A constant feature, or a constant y, has an undefined correlation. It is
    scored 0.0 rather than NaN, because NaN scores sort to the front of
    ``np.argsort(scores)[::-1]`` and would be picked as "top" features.

    Parameters
    ----------
    X : array-like (n_samples, n_features)
    y : array-like (n_samples,)

    Returns
    -------
    scores : ndarray (n_features,), float64, values in [0, 1]

    Raises
    ------
    ValueError
        If X is not 2-D or X and y disagree on the number of rows.
    """
    X = np.asarray(X)
    y = np.asarray(y, dtype=np.float64).ravel()
    if X.ndim != 2 or X.shape[0] != y.shape[0]:
        raise ValueError("X must be 2-D with one row per element of y")

    n_features = X.shape[1]
    y_centered = y - y.mean() if y.size else y
    y_ss = float(np.dot(y_centered, y_centered))

    scores = np.zeros(n_features, dtype=np.float64)
    chunk = 512  # bounds the size of the float64 working copy of X
    for start in range(0, n_features, chunk):
        block = np.asarray(X[:, start:start + chunk], dtype=np.float64)
        block = block - block.mean(axis=0)
        cov = block.T @ y_centered
        denom = np.sqrt(np.einsum("ij,ij->j", block, block) * y_ss)
        r = np.divide(cov, denom, out=np.zeros_like(cov), where=denom > 0)
        # Clip guards against |r| marginally above 1 from rounding.
        scores[start:start + chunk] = np.abs(np.clip(r, -1.0, 1.0))
    return scores

# NOTE(review): feature scoring and PCA below are fitted on all samples and
# labels, and cross-validation is run afterwards on X_pca, so the held-out
# folds have already influenced the features. Confirm this matches the
# intended protocol; fitting inside each fold avoids the optimistic bias.
print("Step 1: PCC ...")
pcc = pcc_scores_progress(X, y)  # (4096,)
# A constant feature can yield a NaN score; NaN sorts to the front of
# argsort()[::-1] and would be selected as a "top" feature, so score it 0.
pcc = np.nan_to_num(pcc, nan=0.0)

print("Step 2: Relief-F (fast, sampling + ANN) ...")
relief_scores = reliefF_fast(X, y, n_neighbors=30, sample_frac=0.2)

# --- Normalise both score vectors to [0,1] ---
scaler = MinMaxScaler()
pcc_n = scaler.fit_transform(pcc.reshape(-1,1)).ravel()
rel_n = scaler.fit_transform(relief_scores.reshape(-1,1)).ravel()

# --- Pick the top-NF features of each ranking ---
NF = 1000
top_pcc = np.argsort(pcc_n)[::-1][:NF]
top_rel = np.argsort(rel_n)[::-1][:NF]

# --- Slice the selected features ---
X_pcc = X[:, top_pcc]      # N x 1000
X_rel = X[:, top_rel]      # N x 1000

# --- Concatenate both branches (a feature chosen by both appears twice) ---
X_concat = np.concatenate([X_pcc, X_rel], axis=1)  # N x 2000
print("After PCC branch:", X_pcc.shape)
print("After ReliefF branch:", X_rel.shape)
print("Concatenated features:", X_concat.shape)

# --- Incremental PCA with a progress bar ---
print("Step 3: PCA ...")
batch_size = 1024
n_samples, n_concat_features = X_concat.shape
# The number of components cannot exceed the number of rows or columns.
n_comp = min(1000, n_samples, n_concat_features)
pca = IncrementalPCA(n_components=n_comp, batch_size=batch_size)

# Every partial_fit batch needs at least n_comp rows. If the last batch is too
# small, merge it into the previous one. Re-fitting the last n_comp rows would
# feed rows already seen in the previous batch a second time.
bounds = [(start, min(start + batch_size, n_samples))
          for start in range(0, n_samples, batch_size)]
if len(bounds) > 1 and bounds[-1][1] - bounds[-1][0] < n_comp:
    _, tail_end = bounds.pop()
    bounds[-1] = (bounds[-1][0], tail_end)

for start, end in tqdm(bounds, desc="PCA fitting"):
    pca.partial_fit(X_concat[start:end])

X_pca = pca.transform(X_concat)
print("After PCA:", X_pca.shape)

Step 1: PCC ...


Computing PCC: 100%|██████████| 4096/4096 [00:01<00:00, 3187.71it/s]


Step 2: Relief-F (fast, sampling + ANN) ...


ReliefF (fast ANN): 100%|██████████| 200/200 [00:00<00:00, 2552.23it/s]


After PCC branch: (1000, 1000)
After ReliefF branch: (1000, 1000)
Concatenated features: (1000, 2000)
Step 3: PCA ...


PCA fitting: 100%|██████████| 1/1 [00:00<00:00,  2.01it/s]

After PCA: (1000, 1000)





In [44]:
import numpy as np
from sklearn.neighbors import NearestNeighbors

def simple_smote(
    X, y, *, minority_class=None,
    target_ratio=1.0,
    n_samples=None,
    k=5,
    shrink=0.7,
    jitter_frac=1e-3,
    random_state=None
):
    """
    Oversample the minority class of a binary problem.

    torch.Tensor inputs are handed to TorchSMOTE. Anything else is treated as
    NumPy data: each synthetic row interpolates between a random minority row
    and one of its k nearest minority neighbours, is pulled towards the
    minority centroid by `shrink`, and receives Gaussian jitter scaled by
    `jitter_frac` times the per-feature minority standard deviation.

    The number of generated rows is `n_samples` when given; otherwise enough
    to bring the minority up to ceil(target_ratio * majority count). Returns
    (X_new, y_new) with the synthetic rows appended, or the inputs unchanged
    when nothing has to be generated.
    """
    # --- torch.Tensor inputs: delegate to TorchSMOTE ---
    both_tensors = isinstance(X, torch.Tensor) and isinstance(y, torch.Tensor)
    if both_tensors:
        return TorchSMOTE(dims=X.shape[1], k=k).fit_generate(X, y)

    # --- NumPy path ---
    rng = np.random.default_rng(random_state)
    X, y = np.asarray(X), np.asarray(y)

    classes, counts = np.unique(y, return_counts=True)
    if len(classes) != 2:
        raise ValueError("Chỉ hỗ trợ nhị phân.")
    if minority_class is None:
        minority_class = classes[np.argmin(counts)]
    majority_class = classes[1] if classes[1] != minority_class else classes[0]

    minority_rows = X[y == minority_class]
    majority_rows = X[y == majority_class]
    n_min, n_maj = len(minority_rows), len(majority_rows)
    if n_min == 0:
        raise ValueError("Không có mẫu thiểu số.")

    # How many synthetic rows are needed?
    if n_samples is not None:
        n_to_gen = int(n_samples)
    else:
        wanted_minority = int(np.ceil(target_ratio * n_maj))
        n_to_gen = max(0, wanted_minority - n_min)
    if n_to_gen <= 0:
        return X, y

    n_feat = X.shape[1]
    synth = np.empty((n_to_gen, n_feat), dtype=X.dtype)

    # Per-feature jitter scale; zero-variance features fall back to a unit scale
    feat_std = minority_rows.std(axis=0, ddof=0)
    feat_std[feat_std == 0] = 1.0
    jitter_scale = jitter_frac * feat_std

    if n_min == 1:
        # A single minority row has no neighbour: jitter copies of it.
        noise = rng.normal(0, 1.0, size=(n_to_gen, n_feat)).astype(X.dtype) * jitter_scale
        synth[:] = minority_rows[0] + noise
    else:
        k_eff = max(1, min(k, n_min - 1))
        knn = NearestNeighbors(n_neighbors=k_eff, algorithm="auto").fit(minority_rows)
        neigh_idx = knn.kneighbors(return_distance=False)

        centroid = minority_rows.mean(axis=0, dtype=float)

        for row in range(n_to_gen):
            # Draw order (seed row, neighbour slot, lambda, jitter) fixes the
            # output for a given random_state.
            src = rng.integers(0, n_min)
            dst = int(neigh_idx[src][rng.integers(0, k_eff)])
            lam = rng.random()
            point = minority_rows[src] + lam * (minority_rows[dst] - minority_rows[src])
            point = centroid + shrink * (point - centroid)
            point = point + rng.normal(0, 1.0, size=n_feat) * jitter_scale
            synth[row] = point.astype(X.dtype, copy=False)

    new_labels = np.full(n_to_gen, minority_class, dtype=y.dtype)
    return np.vstack([X, synth]), np.hstack([y, new_labels])

In [45]:
from sklearn.model_selection import StratifiedKFold
from sklearn.svm import SVC
from sklearn.ensemble import BaggingClassifier, AdaBoostClassifier, RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix
import numpy as np
import pandas as pd
from tqdm import tqdm


def eval_clf(clf, X, y, n_splits=5, name="", augment=False):
    """
    Stratified k-fold evaluation of a binary classifier (positive label = 1).

    Parameters
    ----------
    clf : estimator with fit / predict; it is re-fitted on every fold
    X : ndarray (n_samples, n_features)
    y : ndarray (n_samples,) with labels 0 / 1
    n_splits : number of folds
    name : label shown in the per-fold progress bar
    augment : when True, the training part of each fold is balanced with
        simple_smote before fitting (the validation part is never touched)

    Returns
    -------
    (accuracy, sensitivity, FNR, precision) : fold means, all in percent.
    """
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=1337)
    accs, sens, precs, fnrs = [], [], [], []

    for tr, va in tqdm(skf.split(X, y), total=n_splits, desc=f"CV {name}", leave=False):
        X_train, y_train = X[tr], y[tr]
        X_val, y_val = X[va], y[va]

        # SMOTE augmentation of the training fold only, when requested
        if augment:
            X_train, y_train = simple_smote(
                                    X_train, y_train,
                                    target_ratio=1.0,   # fully balance the classes
                                    k=5,                # local neighbours
                                    shrink=0.7,         # keeps the distribution tighter (helps LDA)
                                    jitter_frac=1e-3,   # tiny, only to avoid duplicate points
                                    random_state=42)

        clf.fit(X_train, y_train)
        pred = clf.predict(X_val)

        accs.append(accuracy_score(y_val, pred) * 100.0)
        # zero_division=0 mirrors the precision call and avoids a warning when
        # a fold has no positive samples.
        sens.append(recall_score(y_val, pred, pos_label=1, zero_division=0) * 100.0)  # sensitivity
        precs.append(precision_score(y_val, pred, pos_label=1, zero_division=0) * 100.0)

        # labels=[0, 1] forces a 2x2 matrix; without it a fold in which only
        # one class occurs gives a 1x1 matrix and the 4-way unpack fails.
        tn, fp, fn, tp = confusion_matrix(y_val, pred, labels=[0, 1]).ravel()
        positives = fn + tp
        fnrs.append(100.0 * fn / positives if positives else 0.0)

    return np.mean(accs), np.mean(sens), np.mean(fnrs), np.mean(precs)

# --- 8 classifiers, as in the paper's table ---
# Each entry is (display name, unfitted estimator); eval_clf re-fits the estimator per fold.
classifiers = [
    ("Quadratic SVM", SVC(kernel='poly', degree=2, C=1.0, gamma='scale', class_weight='balanced')),
    ("Cubic SVM",     SVC(kernel='poly', degree=3, C=1.0, gamma='scale', class_weight='balanced')),
    ("Linear SVM",    SVC(kernel='linear', C=1.0, class_weight='balanced')),
    ("Medium Gaussian SVM", SVC(kernel='rbf', C=1.0, gamma=0.1, class_weight='balanced')),  # gamma=0.1 ~ "medium"
    ("Coarse Gaussian SVM", SVC(kernel='rbf', C=1.0, gamma=0.01, class_weight='balanced')), # small gamma = coarse
    # NOTE(review): max_samples=0.5 sub-samples rows; a random-subspace ensemble
    # would normally sub-sample features (max_features) - confirm against the paper.
    ("Ensemble Subspace Discriminant", 
        BaggingClassifier(estimator=LinearDiscriminantAnalysis(solver="lsqr", shrinkage="auto"),
                          n_estimators=50, max_samples=0.5, random_state=1337, n_jobs=-1)),
    ("Ensemble Boosted Tree", 
        AdaBoostClassifier(estimator=DecisionTreeClassifier(max_depth=3), n_estimators=100, random_state=1337)),
    ("Fine Tree", DecisionTreeClassifier(max_depth=None, min_samples_split=2, random_state=1337))
]

In [46]:
from tqdm import tqdm

# --- Run the 5-fold evaluation with SMOTE augmentation and collect the metrics in a table ---
results = []
for name, clf in tqdm(classifiers, desc="Evaluating classifiers"):
    # Pass name through so the inner progress bar reads "CV <classifier>"
    # instead of the blank "CV :".
    mean_acc, mean_sens, mean_fnr, mean_prec = eval_clf(
        clf, X_pca, y, n_splits=5, name=name, augment=True
    )
    results.append([name, mean_acc, mean_sens, mean_fnr, mean_prec])

df = pd.DataFrame(results, columns=["Classifier", "Accuracy (%)", "Sensitivity (%)", "FNR (%)", "Precision (%)"])
print(df.to_string(index=False))

Evaluating classifiers:   0%|          | 0/8 [00:00<?, ?it/s]
CV :   0%|          | 0/5 [00:00<?, ?it/s][A
CV :  20%|██        | 1/5 [00:00<00:01,  2.70it/s][A
CV :  40%|████      | 2/5 [00:00<00:01,  2.67it/s][A
CV :  60%|██████    | 3/5 [00:01<00:00,  2.68it/s][A
CV :  80%|████████  | 4/5 [00:01<00:00,  2.78it/s][A
CV : 100%|██████████| 5/5 [00:01<00:00,  2.73it/s][A
Evaluating classifiers:  12%|█▎        | 1/8 [00:01<00:12,  1.84s/it]
CV :   0%|          | 0/5 [00:00<?, ?it/s][A
CV :  20%|██        | 1/5 [00:00<00:01,  2.84it/s][A
CV :  40%|████      | 2/5 [00:00<00:00,  3.01it/s][A
CV :  60%|██████    | 3/5 [00:01<00:00,  2.97it/s][A
CV :  80%|████████  | 4/5 [00:01<00:00,  2.93it/s][A
CV : 100%|██████████| 5/5 [00:01<00:00,  2.97it/s][A
Evaluating classifiers:  25%|██▌       | 2/8 [00:03<00:10,  1.76s/it]
CV :   0%|          | 0/5 [00:00<?, ?it/s][A
CV :  20%|██        | 1/5 [00:00<00:01,  3.14it/s][A
CV :  40%|████      | 2/5 [00:00<00:00,  3.20it/s][A
CV :  60%|██

                    Classifier  Accuracy (%)  Sensitivity (%)   FNR (%)  Precision (%)
                 Quadratic SVM          81.7        64.097744 35.902256      69.655665
                     Cubic SVM          82.6        67.274436 32.725564      70.316521
                    Linear SVM          83.1        71.842105 28.157895      69.851885
           Medium Gaussian SVM          83.7        65.513784 34.486216      74.291506
           Coarse Gaussian SVM          80.9        71.510025 28.489975      64.811018
Ensemble Subspace Discriminant          77.1        26.409774 73.590226      79.984876
         Ensemble Boosted Tree          76.6        51.434837 48.565163      60.334987
                     Fine Tree          72.8        55.977444 44.022556      52.369274





In [47]:
from tqdm import tqdm

# --- Run the 10-fold evaluation without augmentation and collect the metrics in a table ---
results = []
for name, clf in tqdm(classifiers, desc="Evaluating classifiers"):
    # Pass name through so the inner progress bar reads "CV <classifier>"
    # instead of the blank "CV :".
    mean_acc, mean_sens, mean_fnr, mean_prec = eval_clf(
        clf, X_pca, y, n_splits=10, name=name
    )
    results.append([name, mean_acc, mean_sens, mean_fnr, mean_prec])

df = pd.DataFrame(results, columns=["Classifier", "Accuracy (%)", "Sensitivity (%)", "FNR (%)", "Precision (%)"])
print(df.to_string(index=False))

Evaluating classifiers:   0%|          | 0/8 [00:00<?, ?it/s]
CV :   0%|          | 0/10 [00:00<?, ?it/s][A
CV :  10%|█         | 1/10 [00:00<00:02,  4.43it/s][A
CV :  20%|██        | 2/10 [00:00<00:01,  4.31it/s][A
CV :  30%|███       | 3/10 [00:00<00:01,  4.22it/s][A
CV :  40%|████      | 4/10 [00:00<00:01,  4.32it/s][A
CV :  50%|█████     | 5/10 [00:01<00:01,  4.29it/s][A
CV :  60%|██████    | 6/10 [00:01<00:00,  4.30it/s][A
CV :  70%|███████   | 7/10 [00:01<00:00,  4.28it/s][A
CV :  80%|████████  | 8/10 [00:01<00:00,  4.32it/s][A
CV :  90%|█████████ | 9/10 [00:02<00:00,  4.39it/s][A
CV : 100%|██████████| 10/10 [00:02<00:00,  4.34it/s][A
Evaluating classifiers:  12%|█▎        | 1/8 [00:02<00:16,  2.32s/it]
CV :   0%|          | 0/10 [00:00<?, ?it/s][A
CV :  10%|█         | 1/10 [00:00<00:01,  4.79it/s][A
CV :  20%|██        | 2/10 [00:00<00:01,  4.73it/s][A
CV :  30%|███       | 3/10 [00:00<00:01,  4.75it/s][A
CV :  40%|████      | 4/10 [00:00<00:01,  4.86it/s][A
CV 

                    Classifier  Accuracy (%)  Sensitivity (%)    FNR (%)  Precision (%)
                 Quadratic SVM          83.4        64.088670  35.911330      74.334965
                     Cubic SVM          82.8        63.706897  36.293103      72.349804
                    Linear SVM          81.5        74.322660  25.677340      65.923699
           Medium Gaussian SVM          81.2        71.810345  28.189655      65.907447
           Coarse Gaussian SVM          79.7        73.596059  26.403941      62.296245
Ensemble Subspace Discriminant          71.6         0.000000 100.000000       0.000000
         Ensemble Boosted Tree          77.6        45.751232  54.248768      65.974426
                     Fine Tree          75.5        55.603448  44.396552      57.290158



