In [45]:
import numpy as np
import pandas as pd
import torch
from torch.utils.data import DataLoader
import tqdm

from sklearn.datasets import make_classification
from sklearn.neighbors import NearestNeighbors, LocalOutlierFactor
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.ensemble import IsolationForest
from sklearn.metrics import precision_score, recall_score, f1_score
from scipy.stats import entropy
from numpy.random import RandomState
from typing import Dict

# Hàm tiện ích để kiểm tra random_state
def check_random_state(seed):
    if seed is None or isinstance(seed, int):
        return RandomState(seed)
    if isinstance(seed, RandomState):
        return seed
    raise ValueError("random_state must be None, int or np.random.RandomState")


In [46]:
class KNN_Shapley():
    def __init__(
        self,
        x_train: torch.Tensor,
        y_train: torch.Tensor,
        x_valid: torch.Tensor,
        y_valid: torch.Tensor,
        k_neighbors: int = 10,
        batch_size: int = 32,
        metric: str = 'cosine',
        embedding_model = None,
        random_state: int = 42
    ):
        """
        - x_train, y_train: tập train (có thể chứa nhiễu)
        - x_valid, y_valid: tập validation để đo "match" (hiệu quả phân loại).
        - k_neighbors: số láng giềng (để tính shapley theo logic KNN).
        - metric: 'cosine' hoặc 'euclidean'.
        - embedding_model: nếu muốn biến đổi dữ liệu trước khi đo khoảng cách,
          hiện chưa dùng trong code này.
        - random_state: để tái lặp.
        """
        self.x_train = torch.tensor(x_train, dtype=torch.float32) if not isinstance(x_train, torch.Tensor) else x_train
        self.y_train = torch.tensor(y_train, dtype=torch.long) if not isinstance(y_train, torch.Tensor) else y_train
        self.x_valid = torch.tensor(x_valid, dtype=torch.float32) if not isinstance(x_valid, torch.Tensor) else x_valid
        self.y_valid = torch.tensor(y_valid, dtype=torch.long) if not isinstance(y_valid, torch.Tensor) else y_valid

        self.k_neighbors = k_neighbors
        self.batch_size = batch_size
        self.embedding_model = embedding_model
        self.random_state = check_random_state(random_state)
        self.metric = metric
        self.data_values = None

    def match(self, y: torch.Tensor) -> torch.Tensor:
        """
        So sánh nhãn dự đoán (y) với y_valid để đánh giá "đúng" (1) hay "sai" (0).
        """
        return (y == self.y_valid).float()

    def train_data_values(self):
        """
        Tính giá trị KNN-Shapley cho từng điểm train,
        dựa trên logic "sort theo khoảng cách đến x_valid,
        lần lượt thêm điểm train vào, cập nhật tỉ lệ match".
        """
        n = len(self.x_train)
        m = len(self.x_valid)

        # Giả sử chưa có embedding_model => ta dùng x_train, x_valid trực tiếp
        x_train_view = self.x_train.view(n, -1)
        x_valid_view = self.x_valid.view(m, -1)

        dist_list = []
        # Tính ma trận khoảng cách dist (size [n, m]) theo batch
        for x_train_batch in DataLoader(x_train_view, batch_size=self.batch_size, shuffle=False):
            dist_row = []
            for x_val_batch in DataLoader(x_valid_view, batch_size=self.batch_size, shuffle=False):
                if self.metric == 'cosine':
                    # Dot product
                    dot_product = torch.mm(x_train_batch, x_val_batch.T)
                    # Norm
                    norm_train = torch.norm(x_train_batch, dim=1, keepdim=True)
                    norm_val   = torch.norm(x_val_batch,   dim=1, keepdim=True)
                    # Cosine similarity
                    cos_sim = dot_product / (norm_train * norm_val.T + 1e-8)
                    # Đảo thành distance
                    dist_batch = 1 - cos_sim
                else:
                    # Euclidean
                    dist_batch = torch.cdist(x_train_batch, x_val_batch)
                dist_row.append(dist_batch)
            dist_list.append(torch.cat(dist_row, dim=1))

        dist = torch.cat(dist_list, dim=0)
        # dist.shape = [n, m]

        # Sắp xếp train theo khoảng cách tăng dần cho mỗi cột (mỗi x_valid)
        sort_indices = torch.argsort(dist, dim=0, stable=True)
        # sort_indices.shape = [n, m]
        y_train_sort = self.y_train[sort_indices]

        # Mảng score (size [n, m]) sẽ cập nhật dần
        score = torch.zeros_like(dist, dtype=torch.float32)

        # Khởi tạo hàng cuối (i = n-1)
        score[sort_indices[n-1], range(m)] = self.match(y_train_sort[n-1]) / n

        # Thuật toán "KNN-Shapley" ngược
        for i in tqdm.tqdm(range(n-2, -1, -1)):
            score[sort_indices[i], range(m)] = (
                score[sort_indices[i+1], range(m)]
                + (self.match(y_train_sort[i]) - self.match(y_train_sort[i+1])) / max(self.k_neighbors, i+1)
            )

        # Cuối cùng, lấy mean theo cột => Shapley Value cho mỗi điểm train
        # (điểm train_i = trung bình score[i, :])
        self.data_values = score.mean(axis=1).detach().numpy()
        return self

    def evaluate_data_values(self) -> np.ndarray:
        """
        Lấy ra mảng Shapley Value cho mỗi điểm train (càng THẤP => càng nghi nhiễu).
        """
        return self.data_values


In [17]:
!pip install geomloss

Collecting geomloss
  Downloading geomloss-0.2.6.tar.gz (26 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch->geomloss)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch->geomloss)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch->geomloss)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch->geomloss)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch->geomloss)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch->geomloss)
  Download

In [47]:
import numpy as np
import pandas as pd
import torch
import itertools
import tqdm
from torch.utils.data import DataLoader, TensorDataset
from typing import Union, Callable, Optional, Literal, Dict
from functools import partial

# ===========================================================
# 0. CÁC IMPORT LIÊN QUAN ĐẾN SKLEARN, GEOMLOSS
# ===========================================================
import geomloss  # Đảm bảo bạn đã cài "pip install geomloss"
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.neighbors import NearestNeighbors, LocalOutlierFactor
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.ensemble import IsolationForest
from scipy.stats import entropy
from numpy.random import RandomState

# ===========================================================
# 1. HÀM / CLASS TIỆN ÍCH
# ===========================================================
def check_random_state(seed):
    """Tạo / kiểm tra random_state."""
    if seed is None or isinstance(seed, int):
        return RandomState(seed)
    if isinstance(seed, RandomState):
        return seed
    raise ValueError("random_state must be None, int or np.random.RandomState")

# cost_routines cho GeomLoss
cost_routines = {
    1: geomloss.utils.distances,
    2: lambda x, y: geomloss.utils.squared_distances(x, y) / 2,
}

def batch_augmented_cost(
    Z1: torch.Tensor,
    Z2: torch.Tensor,
    W: Optional[torch.Tensor] = None,
    feature_cost: Optional[str] = None,
    p: int = 2,
    lam_x: float = 1.0,
    lam_y: float = 1.0,
) -> torch.Tensor:
    """
    Hàm cost tuỳ biến cho GeomLoss, kết hợp chi phí feature + label (W).
    """
    Y1 = Z1[:, :, -1].long()
    Y2 = Z2[:, :, -1].long()

    # Tính cost feature (euclidean hoặc tuỳ biến)
    if feature_cost is None or feature_cost == "euclidean":
        C1 = cost_routines[p](Z1[:, :, :-1], Z2[:, :, :-1])  # p=1 => dist, p=2 => squared_dist/2
    else:
        C1 = feature_cost(Z1[:, :, :-1], Z2[:, :, :-1])

    # Tính cost label
    if W is not None:
        # M = offset cột, do W có shape [n1, n2]
        # Y1, Y2 là batch [B, N], ta reshape => index
        # Trick: W.shape[1] * Y1 + Y2
        M = W.shape[1] * Y1[:, :, None] + Y2[:, None, :]
        C2 = W.flatten()[M.flatten(start_dim=1)].reshape(-1, Y1.shape[1], Y2.shape[1])
    else:
        raise ValueError("Must provide label distances W for OT cost")

    return lam_x * C1 + lam_y * (C2 / p)

def pwdist_exact(
    X1: torch.Tensor,
    Y1: torch.Tensor,
    X2: Optional[torch.Tensor] = None,
    Y2: Optional[torch.Tensor] = None,
    symmetric: bool = False,
    loss: str = "sinkhorn",
    cost_function: Union[Literal["euclidean"], Callable[..., torch.Tensor]] = "euclidean",
    p: int = 2,
    debias: bool = True,
    entreg: float = 1e-1,
    device: torch.device = torch.device("cpu"),
) -> torch.Tensor:
    """
    Tính pairwise distances giữa các class, dựa trên OT.
    Mặc định: "euclidean" cho feature, "sinkhorn" cho OT-loss.
    """
    if X2 is None:
        symmetric = True
        X2, Y2 = X1, Y1

    c1 = torch.unique(Y1)
    c2 = torch.unique(Y2)
    n1, n2 = len(c1), len(c2)

    if symmetric:
        pairs = list(itertools.combinations(range(n1), 2))
    else:
        pairs = list(itertools.product(range(n1), range(n2)))

    if cost_function == "euclidean":
        cost_function = cost_routines[p]

    distance = geomloss.SamplesLoss(
        loss=loss,
        p=p,
        cost=cost_function,
        debias=debias,
        blur=entreg ** (1 / p),
    )

    D = torch.zeros((n1, n2), device=device, dtype=X1.dtype)
    for i, j in tqdm.tqdm(pairs, leave=False, desc="Computing label-to-label distance"):
        m1 = X1[Y1 == c1[i]].to(device)
        m2 = X2[Y2 == c2[j]].to(device)

        a = torch.ones(len(m1), device=device) / len(m1)
        b = torch.ones(len(m2), device=device) / len(m2)
        val = distance(a, m1, b, m2).item()
        D[i, j] = val

        # Tuỳ chỉnh phạt khi c1[i] == c2[j] hay khác
        # Ở đây tạm để D[i,j] = val * 1
        # (bạn có thể tuỳ biến if c1[i] == c2[j] => scale)
        if symmetric:
            D[j, i] = D[i, j]
    return D

# ===========================================================
# 2. CÁC CLASS "MỚI": CKNN_Shapley, SAVA_OT, SAVA_OT_savel2l
#    (bạn đã cung cấp)
# ===========================================================

# -----------------------------------------------------------
# 2.1 CKNN-Shapley
# -----------------------------------------------------------
class CKNN_Shapley():
    def __init__(
        self,
        x_train: torch.Tensor,
        y_train: torch.Tensor,
        x_valid: torch.Tensor,
        y_valid: torch.Tensor,
        k_neighbors: int = 10,
        T: int = 0,
        default: bool = True,
        batch_size: int = 32,
        embedding_model = None,
        random_state: int = 42
    ):
        self.x_train = torch.tensor(x_train, dtype=torch.float32) if not isinstance(x_train, torch.Tensor) else x_train
        self.y_train = torch.tensor(y_train, dtype=torch.long) if not isinstance(y_train, torch.Tensor) else y_train
        self.x_valid = torch.tensor(x_valid, dtype=torch.float32) if not isinstance(x_valid, torch.Tensor) else x_valid
        self.y_valid = torch.tensor(y_valid, dtype=torch.long) if not isinstance(y_valid, torch.Tensor) else y_valid
        self.k_neighbors = k_neighbors
        self.batch_size = batch_size
        self.embedding_model = embedding_model
        self.random_state = check_random_state(random_state)
        self.T = T  # T = N - 2k (tuỳ ý), default => N//2
        self.default = default
        self.data_values = None

    def match(self, y: torch.Tensor) -> torch.Tensor:
        # So sánh nhãn dự đoán y với y_valid => match = 1, mismatch = 0
        return (y == self.y_valid).float()

    def train_data_values(self):
        n = len(self.x_train)
        m = len(self.x_valid)
        if self.default:
            self.T = n // 2

        # Có thể chèn embedding_model nếu muốn
        if self.embedding_model is not None:
            self.x_train, self.x_valid = self.embedding_model(self.x_train, self.x_valid)

        x_train_view = self.x_train.view(n, -1)
        x_valid_view = self.x_valid.view(m, -1)

        # Tính ma trận khoảng cách (euclidean)
        dist_list = []
        train_loader = DataLoader(x_train_view, batch_size=self.batch_size, shuffle=False)
        valid_loader = DataLoader(x_valid_view, batch_size=self.batch_size, shuffle=False)
        for x_train_batch in train_loader:
            dist_row = []
            for x_val_batch in valid_loader:
                dist_batch = torch.cdist(x_train_batch, x_val_batch)  # [batch_size, batch_valid]
                dist_row.append(dist_batch)
            dist_list.append(torch.cat(dist_row, dim=1))
        dist = torch.cat(dist_list, dim=0)  # => shape [n, m]

        # Sắp xếp theo cột (m) => indices
        sort_indices = torch.argsort(dist, dim=0, stable=True)
        y_train_sort = self.y_train[sort_indices]

        score = torch.zeros_like(dist, dtype=torch.float32)

        # Bước khởi tạo: set 0 cho khoảng (n-1) đến (n-T)
        for i in tqdm.tqdm(range(n-1, n-self.T-1, -1)):
            score[sort_indices[i], range(m)] = 0

        # Điểm (n-T-1)
        score[sort_indices[n-self.T-1], range(m)] = self.match(y_train_sort[n-self.T-1]) / (n - self.T)

        # Vòng lặp từ (n-T-1) -> 0
        for i in tqdm.tqdm(range(n-self.T-2, -1, -1)):
            score[sort_indices[i], range(m)] = (
                score[sort_indices[i+1], range(m)]
                + (self.match(y_train_sort[i]) - self.match(y_train_sort[i+1])) / max(self.k_neighbors, i+1)
            )

        self.data_values = score.mean(dim=1).detach().numpy()
        return self

    def evaluate_data_values(self)->np.ndarray:
        return self.data_values

# -----------------------------------------------------------
# 2.2 KNN-Shapley (bạn có thể đưa thêm nếu muốn so sánh)
#    (Phiên bản cũ, để đầy đủ)
# -----------------------------------------------------------
class KNN_Shapley():
    def __init__(
        self,
        x_train: torch.Tensor,
        y_train: torch.Tensor,
        x_valid: torch.Tensor,
        y_valid: torch.Tensor,
        k_neighbors: int = 10,
        batch_size: int = 32,
        metric: str = 'cosine',
        embedding_model = None,
        random_state: int = 42
    ):
        self.x_train = torch.tensor(x_train, dtype=torch.float32) if not isinstance(x_train, torch.Tensor) else x_train
        self.y_train = torch.tensor(y_train, dtype=torch.long) if not isinstance(y_train, torch.Tensor) else y_train
        self.x_valid = torch.tensor(x_valid, dtype=torch.float32) if not isinstance(x_valid, torch.Tensor) else x_valid
        self.y_valid = torch.tensor(y_valid, dtype=torch.long) if not isinstance(y_valid, torch.Tensor) else y_valid

        self.k_neighbors = k_neighbors
        self.batch_size = batch_size
        self.embedding_model = embedding_model
        self.random_state = check_random_state(random_state)
        self.metric = metric
        self.data_values = None

    def match(self, y: torch.Tensor) -> torch.Tensor:
        return (y == self.y_valid).float()

    def train_data_values(self):
        n = len(self.x_train)
        m = len(self.x_valid)

        x_train_view = self.x_train.view(n, -1)
        x_valid_view = self.x_valid.view(m, -1)

        dist_list = []
        for x_train_batch in DataLoader(x_train_view, batch_size=self.batch_size, shuffle=False):
            dist_row = []
            for x_val_batch in DataLoader(x_valid_view, batch_size=self.batch_size, shuffle=False):
                if self.metric == 'cosine':
                    dot_product = torch.mm(x_train_batch, x_val_batch.T)
                    norm_train = torch.norm(x_train_batch, dim=1, keepdim=True)
                    norm_val   = torch.norm(x_val_batch, dim=1, keepdim=True)
                    cos_sim = dot_product / (norm_train * norm_val.T + 1e-8)
                    dist_batch = 1 - cos_sim
                else:
                    dist_batch = torch.cdist(x_train_batch, x_val_batch)
                dist_row.append(dist_batch)
            dist_list.append(torch.cat(dist_row, dim=1))
        dist = torch.cat(dist_list, dim=0)  # [n, m]

        sort_indices = torch.argsort(dist, dim=0, stable=True)
        y_train_sort = self.y_train[sort_indices]

        score = torch.zeros_like(dist, dtype=torch.float32)

        # Khởi tạo hàng cuối
        score[sort_indices[n-1], range(m)] = self.match(y_train_sort[n-1]) / n

        # Vòng lặp
        for i in tqdm.tqdm(range(n-2, -1, -1)):
            score[sort_indices[i], range(m)] = (
                score[sort_indices[i+1], range(m)]
                + (self.match(y_train_sort[i]) - self.match(y_train_sort[i+1])) / max(self.k_neighbors, i+1)
            )

        self.data_values = score.mean(axis=1).detach().numpy()
        return self

    def evaluate_data_values(self)->np.ndarray:
        return self.data_values

# -----------------------------------------------------------
# 2.3 SAVA_OT
# -----------------------------------------------------------
class DatasetDistance_geoloss:
    """
    Tính toán khoảng cách dataset sử dụng Optimal Transport + GeomLoss.
    """
    def __init__(
        self,
        x_train: torch.Tensor,
        y_train: torch.Tensor,
        x_valid: torch.Tensor,
        y_valid: torch.Tensor,
        feature_cost: Union[Literal["euclidean"], Callable[..., torch.Tensor]] = "euclidean",
        p: int = 2,
        entreg: float = 0.1,
        lam_x: float = 1.0,
        lam_y: float = 1.0,
        inner_ot_loss: str = "sinkhorn",
        inner_ot_debiased: bool = False,
        inner_ot_p: int = 2,
        inner_ot_entreg: float = 0.1,
        device: torch.device = torch.device("cpu"),
        ot_method: str = 'balance_ot_sinkhorn',
        label_distances: torch.Tensor = None,
    ):
        self.feature_cost = feature_cost
        self.inner_ot_loss = inner_ot_loss
        self.p = p
        self.entreg = entreg
        self.lam_x = lam_x
        self.lam_y = lam_y
        self.inner_ot_p = inner_ot_p
        self.inner_ot_entreg = inner_ot_entreg
        self.inner_ot_debiased = inner_ot_debiased
        self.device = device
        self.label_distances = label_distances

        self.x_train = torch.tensor(x_train, dtype=torch.float32) if not isinstance(x_train, torch.Tensor) else x_train
        self.y_train = torch.tensor(y_train, dtype=torch.long) if not isinstance(y_train, torch.Tensor) else y_train
        self.x_valid = torch.tensor(x_valid, dtype=torch.float32) if not isinstance(x_valid, torch.Tensor) else x_valid
        self.y_valid = torch.tensor(y_valid, dtype=torch.long) if not isinstance(y_valid, torch.Tensor) else y_valid
        self.num_train, self.num_valid = len(self.y_train), len(self.y_valid)

    def _get_label_distances(self) -> torch.Tensor:
        if self.label_distances is not None:
            return self.label_distances

        pwdist_fn = partial(
            pwdist_exact,
            symmetric=False,
            p=self.inner_ot_p,
            loss=self.inner_ot_loss,
            debias=self.inner_ot_debiased,
            entreg=self.inner_ot_entreg,
            cost_function=self.feature_cost,
            device=self.device,
        )

        # Tính khoảng cách pairwise giữa train_labels và valid_labels
        DYY1 = pwdist_fn(self.x_train, self.y_train)  # so sánh class train vs train?
        DYY2 = pwdist_fn(self.x_valid, self.y_valid)  # so sánh class valid vs valid?
        DYY12 = pwdist_fn(self.x_train, self.y_train, self.x_valid, self.y_valid)

        # Tạo ma trận label_distances
        # Tùy bạn ghép matrix ra sao. Ở đây tạm ghép [DYY12, DYY1], ...
        D = torch.cat([torch.cat([DYY12, DYY1], 1), torch.cat([DYY12.t(), DYY2], 1)])
        self.label_distances = D
        return self.label_distances

    def dual_sol(self) -> tuple[float, torch.Tensor]:
        wasserstein = self._get_label_distances().to(self.device)
        cost_geomloss = partial(
            batch_augmented_cost,
            W=wasserstein,
            lam_x=self.lam_x,
            lam_y=self.lam_y,
            feature_cost=self.feature_cost,
        )
        loss = geomloss.SamplesLoss(
            loss="sinkhorn",
            p=self.p,
            cost=cost_geomloss,
            debias=False,  # or True tuỳ
            blur=self.entreg ** (1 / self.p),
            backend="tensorized",
        )
        Z1 = torch.cat((self.x_train, self.y_train.float().unsqueeze(dim=1)), -1)
        Z2 = torch.cat((self.x_valid, self.y_valid.float().unsqueeze(dim=1)), -1)

        N, M = len(self.x_train), len(self.x_valid)
        a = torch.ones(N, device=self.device) / N
        b = torch.ones(M, device=self.device) / M

        with torch.no_grad():
            loss.debias = False
            loss.potentials = True
            F_i, G_j = loss(a, Z1.to(self.device), b, Z2.to(self.device))
            pi = [F_i, G_j]  # dual potential
        return pi

    def compute_distance(self, pi) -> np.ndarray:
        # Ở đây ta trích "train_gradients" = -F_i => score
        f1k = pi.squeeze()
        train_gradients = -1 * f1k
        return train_gradients.cpu().numpy()

class SAVA_OT:
    def __init__(self, random_state=0, batch_size=100, device=torch.device("cpu")):
        self.random_state = random_state
        self.device = device
        torch.manual_seed(random_state)
        self.batch_size = batch_size
        self.train_loader = None
        self.valid_loader = None

    def create_dataloaders(self, x_train, y_train, x_valid, y_valid):
        x_train = torch.tensor(x_train, dtype=torch.float32) if not isinstance(x_train, torch.Tensor) else x_train
        y_train = torch.tensor(y_train, dtype=torch.long) if not isinstance(y_train, torch.Tensor) else y_train
        x_valid = torch.tensor(x_valid, dtype=torch.float32) if not isinstance(x_valid, torch.Tensor) else x_valid
        y_valid = torch.tensor(y_valid, dtype=torch.long) if not isinstance(y_valid, torch.Tensor) else y_valid

        train_dataset = TensorDataset(x_train, y_train)
        valid_dataset = TensorDataset(x_valid, y_valid)

        self.train_loader = DataLoader(train_dataset, batch_size=self.batch_size, shuffle=False)
        self.valid_loader = DataLoader(valid_dataset, batch_size=self.batch_size, shuffle=False)

    def evaluate_data_values(self, x_train, y_train, x_valid, y_valid, lam_x=1.0, lam_y=1.0):
        self.create_dataloaders(x_train, y_train, x_valid, y_valid)
        values_all = []

        for x_train_batch, y_train_batch in tqdm.tqdm(self.train_loader, desc='SAVA_OT train_batch'):
            x_train_batch = x_train_batch.to(self.device)
            y_train_batch = y_train_batch.to(self.device)
            values_tmp = np.zeros(x_train_batch.shape[0])

            for x_valid_batch, y_valid_batch in tqdm.tqdm(self.valid_loader, desc='SAVA_OT valid_batch'):
                x_valid_batch = x_valid_batch.to(self.device)
                y_valid_batch = y_valid_batch.to(self.device)

                dist_calculator = DatasetDistance_geoloss(
                    x_train_batch, y_train_batch, x_valid_batch, y_valid_batch,
                    device=self.device, ot_method='balance_ot_sinkhorn'
                )
                pi = dist_calculator.dual_sol()
                dist = dist_calculator.compute_distance(pi[0])
                values_tmp += dist

            # Trung bình theo số batch valid
            values_all.append(values_tmp / len(self.valid_loader))

        # Ghép thành 1 array
        values_all = np.concatenate(values_all)
        return values_all

# -----------------------------------------------------------
# 2.4 SAVA_OT_savel2l
# -----------------------------------------------------------
class SAVA_OT_savel2l:
    """
    Giống SAVA_OT nhưng tái sử dụng label_distances precomputed => tối ưu hơn.
    """
    def __init__(self, random_state=0, batch_size=100, device=torch.device("cpu")):
        self.random_state = random_state
        self.device = device
        torch.manual_seed(random_state)
        self.batch_size = batch_size
        self.train_loader = None
        self.valid_loader = None

    def create_dataloaders(self, x_train, y_train, x_valid, y_valid):
        x_train = torch.tensor(x_train, dtype=torch.float32) if not isinstance(x_train, torch.Tensor) else x_train
        y_train = torch.tensor(y_train, dtype=torch.long) if not isinstance(y_train, torch.Tensor) else y_train
        x_valid = torch.tensor(x_valid, dtype=torch.float32) if not isinstance(x_valid, torch.Tensor) else x_valid
        y_valid = torch.tensor(y_valid, dtype=torch.long) if not isinstance(y_valid, torch.Tensor) else y_valid

        train_dataset = TensorDataset(x_train, y_train)
        valid_dataset = TensorDataset(x_valid, y_valid)

        self.train_loader = DataLoader(train_dataset, batch_size=self.batch_size, shuffle=False)
        self.valid_loader = DataLoader(valid_dataset, batch_size=self.batch_size, shuffle=False)

    def evaluate_data_values(self, x_train, y_train, x_valid, y_valid, lam_x=1.0, lam_y=1.0):
        self.create_dataloaders(x_train, y_train, x_valid, y_valid)

        # Tính label_distances 1 lần
        dist_calculator_global = DatasetDistance_geoloss(
            x_train, y_train, x_valid, y_valid,
            device=self.device, ot_method='balance_ot_sinkhorn'
        )
        label_distances = dist_calculator_global._get_label_distances()

        values_all = []
        for x_train_batch, y_train_batch in tqdm.tqdm(self.train_loader, desc='SAVA_l2l train_batch'):
            x_train_batch = x_train_batch.to(self.device)
            y_train_batch = y_train_batch.to(self.device)
            values_tmp = np.zeros(x_train_batch.shape[0])

            for x_valid_batch, y_valid_batch in tqdm.tqdm(self.valid_loader, desc='SAVA_l2l valid_batch'):
                x_valid_batch = x_valid_batch.to(self.device)
                y_valid_batch = y_valid_batch.to(self.device)

                dist_calculator = DatasetDistance_geoloss(
                    x_train_batch, y_train_batch, x_valid_batch, y_valid_batch,
                    label_distances=label_distances,
                    device=self.device,
                    ot_method='balance_ot_sinkhorn'
                )
                pi = dist_calculator.dual_sol()
                dist = dist_calculator.compute_distance(pi[0])
                values_tmp += dist

            values_all.append(values_tmp / len(self.valid_loader))

        values_all = np.concatenate(values_all)
        return values_all

In [49]:
from sklearn.neighbors import NearestNeighbors

def knn_consistency_scores(X, y, k=5):
    """
    Tỉ lệ hàng xóm có nhãn KHÁC => CÀNG CAO => càng nghi nhiễu
    """
    knn = NearestNeighbors(n_neighbors=k)
    knn.fit(X)
    neighbors_idx = knn.kneighbors(X, return_distance=False)

    scores = []
    for i in range(len(X)):
        neighbor_labels = y[neighbors_idx[i]]
        diff_count = np.sum(neighbor_labels != y[i])
        score = diff_count / k
        scores.append(score)
    return np.array(scores)

def entropy_consistency_scores(X, y, k=5, num_cls=5):
    """
    Entropy láng giềng CÀNG CAO => càng nghi nhiễu
    """
    knn = NearestNeighbors(n_neighbors=k)
    knn.fit(X)
    neighbors_idx = knn.kneighbors(X, return_distance=False)

    scores = []
    for i in range(len(X)):
        neighbor_labels = y[neighbors_idx[i]]
        counts = np.bincount(neighbor_labels, minlength=num_cls)
        e = entropy(counts)
        scores.append(e)
    return np.array(scores)

def kmeans_inconsistency_labels(X, y, n_clusters=5):
    """
    KMeans => majority label => 1(nhiễu) / 0(không)
    """
    kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
    clusters = kmeans.fit_predict(X)

    cluster_maj_labels = []
    for c in range(n_clusters):
        idx_c = np.where(clusters == c)[0]
        maj_label = np.argmax(np.bincount(y[idx_c]))
        cluster_maj_labels.append(maj_label)

    result = []
    for i in range(len(X)):
        c = clusters[i]
        if y[i] != cluster_maj_labels[c]:
            result.append(1)
        else:
            result.append(0)
    return np.array(result)  # 1 => nhiễu, 0 => sạch

def pca_outlier_detection(X, n_components=2):
    """
    Khoảng cách đến tâm trong PCA => CÀNG LỚN => outlier
    """
    pca = PCA(n_components=n_components, random_state=42)
    X_pca = pca.fit_transform(X)
    center = np.mean(X_pca, axis=0)
    dist = np.linalg.norm(X_pca - center, axis=1)
    return dist

def isolation_forest_outlier(X):
    """
    -decision_function => CÀNG CAO => outlier
    """
    iso = IsolationForest(contamination=0.1, random_state=42)
    iso.fit(X)
    return -iso.decision_function(X)

def local_outlier_factor_scores(X):
    """
    LOF => negative_outlier_factor_ (CÀNG nhỏ => outlier)
    => ta trả về -negative_outlier_factor_ => CÀNG CAO => outlier
    """
    lof = LocalOutlierFactor(n_neighbors=20, contamination=0.1)
    lof.fit(X)
    return -lof.negative_outlier_factor_


In [50]:
class LavaEvaluator_geomloss():
    """
    Lava evaluator using Optimal Transport.
    """
    def __init__(self, lam_x: float = 1.0, lam_y: float = 1.0, **kwargs):
        super().__init__(**kwargs)
        self.lam_x = lam_x
        self.lam_y = lam_y

    def evaluate_data_values(self, x_train: torch.Tensor, y_train: torch.Tensor, x_valid: torch.Tensor, y_valid: torch.Tensor) -> np.ndarray:
        dist_calculator = DatasetDistance_geoloss(x_train, y_train, x_valid, y_valid, lam_x=self.lam_x, lam_y=self.lam_y)
        u, _ = dist_calculator.dual_sol()
        return dist_calculator.compute_distance(u)

In [25]:
def evaluate_noise_detection(data_values: np.ndarray, noise_indices: np.ndarray, per: float=0.2) -> Dict[str, float]:
    """
    1) data_values: "càng THẤP => càng nghi nhiễu"
    2) Lấy 20% thấp nhất => dự đoán nhiễu
    3) So với noise_indices => tính Prec, Recall, F1
    """
    N = len(data_values)
    sorted_idx = np.argsort(data_values)  # tăng dần
    cutoff = int(per * N)
    pred_noise = sorted_idx[:cutoff]  # 20% đầu => nghi nhiễu

    # Tính TP, FP, FN
    found_in_noisy = np.intersect1d(pred_noise, noise_indices)  # TP
    not_in_noisy   = np.setdiff1d(pred_noise, noise_indices)    # FP
    not_found_in_small = np.setdiff1d(noise_indices, pred_noise) # FN

    TP = len(found_in_noisy)
    FP = len(not_in_noisy)
    FN = len(not_found_in_small)

    precision = TP/(TP+FP) if (TP+FP)>0 else 0
    recall    = TP/(TP+FN) if (TP+FN)>0 else 0
    f1        = 2*precision*recall/(precision+recall) if (precision+recall)>0 else 0

    return {"Precision": precision, "Recall": recall, "F1-Score": f1}

def invert_score_if_needed(score_array: np.ndarray, method_name: str) -> np.ndarray:
    """
    Trả về "càng THẤP => càng nghi nhiễu"
    """
    if method_name == "KNN-Shapley":
        # Đã "thấp => nghi"
        return score_array
    elif method_name in ["KNN Consistency", "Entropy-based", "PCA Outlier", "Isolation Forest", "Local Outlier Factor"]:
        # Đều "cao => nghi" => đảo
        return -score_array
    elif method_name == "K-means Inconsistency":
        # 1 => nhiễu => ta muốn 0 => nhiễu => => 1 - label
        return 1 - score_array
    else:
        return score_array


# sythetic dataset

In [48]:
from sklearn.model_selection import train_test_split

# 3.1 Tạo dataset multi-class
X_all, y_all = make_classification(
    n_samples=12000,
    n_features=10,
    n_informative=8,
    n_redundant=2,
    n_classes=5,
    random_state=42
)

# 3.2 Chia train (10000) - valid (2000)
X_train_full, X_valid, y_train_full, y_valid = train_test_split(
    X_all, y_all, test_size=2000, random_state=42, stratify=y_all
)

# 3.3 Thêm nhiễu nhãn vào train
num_samples_train = len(X_train_full)
noise_ratio = 0.2
noise_count = int(noise_ratio * num_samples_train)
noise_indices = np.random.choice(num_samples_train, size=noise_count, replace=False)

y_noisy = y_train_full.copy()
for idx in noise_indices:
    # chọn ngẫu nhiên 1 nhãn khác
    possible_labels = list(set(range(5)) - {y_train_full[idx]})
    y_noisy[idx] = np.random.choice(possible_labels)

# Chuyển sang torch.Tensor (phục vụ KNN-Shapley)
X_train_tensor = torch.tensor(X_train_full, dtype=torch.float32)
y_train_tensor = torch.tensor(y_noisy,      dtype=torch.long)
X_valid_tensor = torch.tensor(X_valid,      dtype=torch.float32)
y_valid_tensor = torch.tensor(y_valid,      dtype=torch.long)


In [51]:
# 5.3 Tính score cho TẤT CẢ CÁC PHƯƠNG PHÁP
# --- (A) KNN-Shapley
knn_shapley_model = KNN_Shapley(
    x_train=X_train_tensor, y_train=y_train_tensor,
    x_valid=X_valid_tensor, y_valid=y_valid_tensor,
    k_neighbors=10, batch_size=64, metric='euclidean'
)
knn_shapley_model.train_data_values()
score_knn_shapley = knn_shapley_model.evaluate_data_values()

# --- (B) CKNN-Shapley
cknn_shapley_model = CKNN_Shapley(
    x_train=X_train_tensor, y_train=y_train_tensor,
    x_valid=X_valid_tensor, y_valid=y_valid_tensor,
    k_neighbors=10, default=True, batch_size=64
)
cknn_shapley_model.train_data_values()
score_cknn_shapley = cknn_shapley_model.evaluate_data_values()
# ----------- LAVA
lava = LavaEvaluator_geomloss()
score_lava = lava.evaluate_data_values(
    X_train_full, y_noisy, X_valid, y_valid
)
# --- (C) SAVA_OT
sava_ot = SAVA_OT(random_state=42, batch_size=1000)
score_sava_ot = sava_ot.evaluate_data_values(
    X_train_full, y_noisy, X_valid, y_valid
)

# --- (D) SAVA_OT_savel2l
sava_ot_l2l = SAVA_OT_savel2l(random_state=42, batch_size=1000)
score_sava_ot_l2l = sava_ot_l2l.evaluate_data_values(
    X_train_full, y_noisy, X_valid, y_valid
)

# --- (E) Phương pháp Label Noise "truyền thống"
knn_consistency = knn_consistency_scores(X_train_full, y_noisy, k=10)
entropy_scores  = entropy_consistency_scores(X_train_full, y_noisy, k=5, num_cls=5)
kmeans_labels   = kmeans_inconsistency_labels(X_train_full, y_noisy, n_clusters=5)

# --- (F) Phương pháp Outlier "feature-based"
pca_outlier = pca_outlier_detection(X_train_full, n_components=2)
iso_outlier = isolation_forest_outlier(X_train_full)
lof_outlier = local_outlier_factor_scores(X_train_full)

100%|██████████| 9999/9999 [00:13<00:00, 766.10it/s]
100%|██████████| 5000/5000 [00:03<00:00, 1658.73it/s]
100%|██████████| 4999/4999 [00:06<00:00, 760.36it/s]
SAVA_OT train_batch:   0%|          | 0/10 [00:00<?, ?it/s]
SAVA_OT valid_batch:   0%|          | 0/2 [00:00<?, ?it/s][A

Computing label-to-label distance:   0%|          | 0/10 [00:00<?, ?it/s][A[A

                                                                         [A[A

Computing label-to-label distance:   0%|          | 0/10 [00:00<?, ?it/s][A[A

                                                                         [A[A

Computing label-to-label distance:   0%|          | 0/25 [00:00<?, ?it/s][A[A

Computing label-to-label distance:  52%|█████▏    | 13/25 [00:00<00:00, 128.42it/s][A[A

                                                                                   [A[A
SAVA_OT valid_batch:  50%|█████     | 1/2 [00:00<00:00,  2.11it/s][A

Computing label-to-label distance:   0%|          | 0/10 [00

In [52]:
# 5.4 Gom vào dict => đánh giá
methods_scores = {
    "KNN-Shapley":           score_knn_shapley,
    "CKNN-Shapley":          score_cknn_shapley,
    "Lava":                  score_lava,
    "SAVA_OT":               score_sava_ot,
    "SAVA_OT_savel2l":       score_sava_ot_l2l,
    "KNN Consistency":       knn_consistency,
    "Entropy-based":         entropy_scores,
    "K-means Inconsistency": kmeans_labels,
    "PCA Outlier":           pca_outlier,
    "Isolation Forest":      iso_outlier,
    "Local Outlier Factor":  lof_outlier
}

results = []
for method_name, original_score in methods_scores.items():
    # Chuyển về quy ước "càng THẤP => càng nhiễu"
    data_vals = invert_score_if_needed(original_score, method_name)
    eval_res = evaluate_noise_detection(data_vals, noise_indices, per=0.2)
    eval_res["Method"] = method_name
    results.append(eval_res)

df_eval = pd.DataFrame(results)
print(df_eval)

    Precision  Recall  F1-Score                 Method
0      0.7675  0.7675    0.7675            KNN-Shapley
1      0.7765  0.7765    0.7765           CKNN-Shapley
2      0.4825  0.4825    0.4825                   Lava
3      0.4030  0.4030    0.4030                SAVA_OT
4      0.4085  0.4085    0.4085        SAVA_OT_savel2l
5      0.8145  0.8145    0.8145        KNN Consistency
6      0.3865  0.3865    0.3865          Entropy-based
7      0.2415  0.2415    0.2415  K-means Inconsistency
8      0.1960  0.1960    0.1960            PCA Outlier
9      0.1990  0.1990    0.1990       Isolation Forest
10     0.1990  0.1990    0.1990   Local Outlier Factor


In [53]:
import numpy as np
import torch
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# 1) Tạo dataset multi-class
X_all, y_all = make_classification(
    n_samples=12000,
    n_features=10,
    n_informative=8,
    n_redundant=2,
    n_classes=5,
    random_state=42
)

# 2) Tách train (10000) - valid (2000)
X_train_full, X_valid, y_train_full, y_valid = train_test_split(
    X_all, y_all, test_size=2000, random_state=42, stratify=y_all
)

# 3) Chuẩn hoá X (chỉ fit trên train, sau đó transform valid)
scaler = StandardScaler()
X_train_full_scaled = scaler.fit_transform(X_train_full)
X_valid_scaled      = scaler.transform(X_valid)

# 4) Thêm nhiễu Gaussian vào 20% mẫu train (sau khi đã scale)
num_samples_train = len(X_train_full_scaled)
noise_ratio = 0.2
noise_count = int(noise_ratio * num_samples_train)

feature_noise_indices = np.random.choice(
    num_samples_train,
    size=noise_count,
    replace=False
)

X_train_noisy = X_train_full_scaled.copy()
for idx in feature_noise_indices:
    # Thêm noise ~ N(0, 1) vào vector feature của mẫu idx
    X_train_noisy[idx] += np.random.normal(
        loc=0,
        scale=1.0,
        size=X_train_noisy[idx].shape
    )

# 5) Chuyển sang torch.Tensor (phục vụ cho KNN-Shapley hoặc các phương pháp khác)
X_train_tensor = torch.tensor(X_train_noisy,    dtype=torch.float32)
y_train_tensor = torch.tensor(y_train_full,     dtype=torch.long)

X_valid_tensor = torch.tensor(X_valid_scaled,   dtype=torch.float32)
y_valid_tensor = torch.tensor(y_valid,          dtype=torch.long)

# Lúc này:
#  - feature_noise_indices là mảng index những mẫu thật sự bị thêm Gaussian noise trong train (đã được scale).
#  - X_train_noisy là dữ liệu train sau chuẩn hoá, với 20% mẫu bị làm nhiễu.
#  - X_valid_scaled là dữ liệu valid chỉ được transform bởi scaler (không thêm nhiễu).


In [54]:
# 5.3 Tính score cho TẤT CẢ CÁC PHƯƠNG PHÁP
# --- (A) KNN-Shapley
knn_shapley_model = KNN_Shapley(
    x_train=X_train_tensor, y_train=y_train_tensor,
    x_valid=X_valid_tensor, y_valid=y_valid_tensor,
    k_neighbors=10, batch_size=64, metric='euclidean'
)
knn_shapley_model.train_data_values()
score_knn_shapley = knn_shapley_model.evaluate_data_values()

# --- (B) CKNN-Shapley
cknn_shapley_model = CKNN_Shapley(
    x_train=X_train_tensor, y_train=y_train_tensor,
    x_valid=X_valid_tensor, y_valid=y_valid_tensor,
    k_neighbors=10, default=True, batch_size=64
)
cknn_shapley_model.train_data_values()
score_cknn_shapley = cknn_shapley_model.evaluate_data_values()
# ----------- LAVA
lava = LavaEvaluator_geomloss()
score_lava = lava.evaluate_data_values(
    X_train_full, y_noisy, X_valid, y_valid
)
# --- (C) SAVA_OT
sava_ot = SAVA_OT(random_state=42, batch_size=1000)
score_sava_ot = sava_ot.evaluate_data_values(
    X_train_full, y_noisy, X_valid, y_valid
)

# --- (D) SAVA_OT_savel2l
sava_ot_l2l = SAVA_OT_savel2l(random_state=42, batch_size=1000)
score_sava_ot_l2l = sava_ot_l2l.evaluate_data_values(
    X_train_full, y_noisy, X_valid, y_valid
)

# --- (E) Phương pháp Label Noise "truyền thống"
knn_consistency = knn_consistency_scores(X_train_full, y_noisy, k=10)
entropy_scores  = entropy_consistency_scores(X_train_full, y_noisy, k=5, num_cls=5)
kmeans_labels   = kmeans_inconsistency_labels(X_train_full, y_noisy, n_clusters=5)

# --- (F) Phương pháp Outlier "feature-based"
pca_outlier = pca_outlier_detection(X_train_full, n_components=2)
iso_outlier = isolation_forest_outlier(X_train_full)
lof_outlier = local_outlier_factor_scores(X_train_full)

100%|██████████| 9999/9999 [00:12<00:00, 771.49it/s]
100%|██████████| 5000/5000 [00:02<00:00, 1687.57it/s]
100%|██████████| 4999/4999 [00:06<00:00, 772.25it/s]
SAVA_OT train_batch:   0%|          | 0/10 [00:00<?, ?it/s]
SAVA_OT valid_batch:   0%|          | 0/2 [00:00<?, ?it/s][A

Computing label-to-label distance:   0%|          | 0/10 [00:00<?, ?it/s][A[A

                                                                         [A[A

Computing label-to-label distance:   0%|          | 0/10 [00:00<?, ?it/s][A[A

                                                                         [A[A

Computing label-to-label distance:   0%|          | 0/25 [00:00<?, ?it/s][A[A

Computing label-to-label distance:  44%|████▍     | 11/25 [00:00<00:00, 106.22it/s][A[A

Computing label-to-label distance: 100%|██████████| 25/25 [00:00<00:00, 123.05it/s][A[A

                                                                                   [A[A
SAVA_OT valid_batch:  50%|█████     | 1/

In [55]:
# 5.4 Gom vào dict => đánh giá
methods_scores = {
    "KNN-Shapley":           score_knn_shapley,
    "CKNN-Shapley":          score_cknn_shapley,
    "Lava":                  score_lava,
    "SAVA_OT":               score_sava_ot,
    "SAVA_OT_savel2l":       score_sava_ot_l2l,
    "KNN Consistency":       knn_consistency,
    "Entropy-based":         entropy_scores,
    "K-means Inconsistency": kmeans_labels,
    "PCA Outlier":           pca_outlier,
    "Isolation Forest":      iso_outlier,
    "Local Outlier Factor":  lof_outlier
}

results = []
for method_name, original_score in methods_scores.items():
    # Chuyển về quy ước "càng THẤP => càng nhiễu"
    data_vals = invert_score_if_needed(original_score, method_name)
    eval_res = evaluate_noise_detection(data_vals, feature_noise_indices, per=0.2)
    eval_res["Method"] = method_name
    results.append(eval_res)

df_eval = pd.DataFrame(results)
print(df_eval)

    Precision  Recall  F1-Score                 Method
0      0.2695  0.2695    0.2695            KNN-Shapley
1      0.3845  0.3845    0.3845           CKNN-Shapley
2      0.1905  0.1905    0.1905                   Lava
3      0.1830  0.1830    0.1830                SAVA_OT
4      0.1855  0.1855    0.1855        SAVA_OT_savel2l
5      0.2010  0.2010    0.2010        KNN Consistency
6      0.1985  0.1985    0.1985          Entropy-based
7      0.2065  0.2065    0.2065  K-means Inconsistency
8      0.1830  0.1830    0.1830            PCA Outlier
9      0.1880  0.1880    0.1880       Isolation Forest
10     0.1975  0.1975    0.1975   Local Outlier Factor


# cifar

In [3]:
import torch
import torchvision.transforms as transforms
import torchvision.datasets as datasets
from torchvision.models import resnet50, ResNet50_Weights
from torch.utils.data import DataLoader, Subset
import numpy as np

# Cấu hình thiết bị
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Transform áp dụng cho CIFAR-10
img2vec_transforms = transforms.Compose([
    transforms.Resize((224, 224)),  # Resize ảnh cho phù hợp với ResNet
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

# Tải CIFAR-10 dataset
cifar10_train = datasets.CIFAR10(root='.', train=True, transform=img2vec_transforms, download=True)
cifar10_test = datasets.CIFAR10(root='.', train=False, transform=img2vec_transforms, download=True)

sample_size = 10000  # Số lượng mẫu để so sánh
subset_indices_test = list(range(min(sample_size, len(cifar10_test))))
subset_test = Subset(cifar10_test, subset_indices_test)
dataloader_test = DataLoader(subset_test, batch_size=128, shuffle=False)

subset_indices_train = list(range(min(sample_size, len(cifar10_train))))
subset_train = Subset(cifar10_train, subset_indices_train)
dataloader_train = DataLoader(subset_train, batch_size=128, shuffle=False)

# Mô hình ResNet-50 mặc định
resnet_default = resnet50(weights=ResNet50_Weights.DEFAULT).to(device)
resnet_default.fc = torch.nn.Identity()  # Loại bỏ lớp phân loại
resnet_default.eval()

# Hàm tính embedding
def get_embeddings(model, dataloader):
    embeddings = []
    with torch.no_grad():
        for images, _ in dataloader:
            images = images.to(device)
            emb = model(images).cpu()
            embeddings.append(emb)
    return torch.cat(embeddings, dim=0)

# Lấy embedding từ cả hai mô hình trên tập test và train
embedding_default_test = get_embeddings(resnet_default, dataloader_test)
embedding_default_train = get_embeddings(resnet_default, dataloader_train)
print("Embedding test (default):", embedding_default_test.shape)
print("Embedding train (default):", embedding_default_train.shape)

Using device: cuda
Files already downloaded and verified
Files already downloaded and verified
Embedding test (default): torch.Size([10000, 2048])
Embedding train (default): torch.Size([10000, 2048])


In [4]:
# resnet_default_test -> valid/ test
# resnet_default_train -> train
embedding_default_valid = embedding_default_test[2000:]
embedding_default_test = embedding_default_test[:2000]
embedding_default_train = embedding_default_train.numpy()
print(embedding_default_valid.shape)
print(embedding_default_test.shape)
print(embedding_default_train.shape)

torch.Size([8000, 2048])
torch.Size([2000, 2048])
(10000, 2048)


In [5]:
embedding_default_valid = embedding_default_valid.numpy()
embedding_default_test = embedding_default_test.numpy()

In [6]:
def get_labels(dataloader):
    labels = []
    with torch.no_grad():
        for _, label in dataloader:
            labels.append(label)
    return torch.cat(labels, dim=0)

In [7]:
finetuned_label_train = get_labels(dataloader_train)
finetuned_label_test = get_labels(dataloader_test)
print(finetuned_label_train.shape)
print(finetuned_label_test.shape)

torch.Size([10000])
torch.Size([10000])


In [8]:
finetuned_label_train = finetuned_label_train.numpy()
finetuned_label_valid = finetuned_label_test[2000:]
finetuned_label_test = finetuned_label_test[:2000]
finetuned_label_valid = finetuned_label_valid.numpy()
finetuned_label_test = finetuned_label_test.numpy()

In [9]:
# save
import pickle

with open('x_y_embedding_mnist.pkl', 'wb') as f:
    #pickle.dump((x_embeddings, y_labels, xt_embeddings, yt_labels), f)
    pickle.dump((embedding_default_train, finetuned_label_train, embedding_default_valid, finetuned_label_valid, embedding_default_test, finetuned_label_test), f)

In [10]:
### Load x_y_embedding_data
import pickle
with open('x_y_embedding_mnist.pkl', 'rb') as f:
    #x_embeddings, y_labels, xt_embeddings, yt_labels = pickle.load(f)
    embedding_default_train, finetuned_label_train, embedding_default_valid, finetuned_label_valid, embedding_default_test, finetuned_label_test = pickle.load(f)
x_embeddings = embedding_default_train
y_labels = finetuned_label_train
xt_embeddings = embedding_default_test
yt_labels = finetuned_label_test
print("Training embeddings shape:", x_embeddings.shape)
print("Training labels shape:", y_labels.shape)
print("Validation embeddings shape:", xt_embeddings.shape)
print("Validation labels shape:", yt_labels.shape)

Training embeddings shape: (10000, 2048)
Training labels shape: (10000,)
Validation embeddings shape: (2000, 2048)
Validation labels shape: (2000,)


In [11]:
import torch
import numpy as np

# Chuyển dữ liệu sang torch.Tensor nếu cần
x_embeddings = torch.tensor(x_embeddings, dtype=torch.float32) if not isinstance(x_embeddings, torch.Tensor) else x_embeddings
xt_embeddings = torch.tensor(xt_embeddings, dtype=torch.float32) if not isinstance(xt_embeddings, torch.Tensor) else xt_embeddings

# Tính giá trị trung bình và độ lệch chuẩn từ X_train
mean = x_embeddings.mean(dim=0)  # Trung bình theo từng cột (feature)
std = x_embeddings.std(dim=0)    # Độ lệch chuẩn theo từng cột (feature)

# Đảm bảo std không có giá trị 0 để tránh chia cho 0
std[std == 0] = 1e-8

# Chuẩn hóa dữ liệu
X_train_normalized = (x_embeddings - mean) / std
X_valid_normalized = (xt_embeddings - mean) / std

# In kết quả
print("After normalization:")
print(X_train_normalized.shape, X_valid_normalized.shape)



After normalization:
torch.Size([10000, 2048]) torch.Size([2000, 2048])


In [12]:
# models.py
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.metrics import f1_score
from collections import OrderedDict
from sklearn.svm import SVC
import torch
import numpy as np
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
# Logistic Regression
class LogisticRegression(nn.Module):
    def __init__(self, input_dim: int, num_classes: int):
        super().__init__()
        self.linear = nn.Linear(input_dim, num_classes)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        if self.linear.out_features == 1:
            return torch.sigmoid(self.linear(x))
        return torch.softmax(self.linear(x), dim=-1)

    def fit(self, X_train: torch.Tensor, Y_train: torch.Tensor, lr=0.01, epochs=100, batch_size=32):
        if isinstance(X_train, np.ndarray):
            X_train = np.ascontiguousarray(X_train)
        if isinstance(Y_train, np.ndarray):
            Y_train = np.ascontiguousarray(Y_train)
        X_train = torch.tensor(X_train, dtype=torch.float32) if not isinstance(X_train, torch.Tensor) else X_train
        Y_train = torch.tensor(Y_train, dtype=torch.long) if not isinstance(Y_train, torch.Tensor) else Y_train
        criterion = nn.BCELoss() if self.linear.out_features == 1 else nn.CrossEntropyLoss()
        optimizer = optim.SGD(self.parameters(), lr=lr)

        dataset = TensorDataset(X_train, Y_train)
        loader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

        for epoch in range(epochs):
            epoch_loss = 0.0
            for batch_X, batch_y in loader:
                optimizer.zero_grad()
                outputs = self(batch_X)
                batch_y = batch_y.float().view(-1, 1) if self.linear.out_features == 1 else batch_y
                loss = criterion(outputs, batch_y)
                loss.backward()
                optimizer.step()
                epoch_loss += loss.item()
            #if (epoch + 1) % 10 == 0:
            #    print(f"Epoch [{epoch + 1}/{epochs}], Loss: {epoch_loss / len(loader):.4f}")

    def predict(self, X: torch.Tensor) -> torch.Tensor:
        X = torch.tensor(X, dtype=torch.float32) if not isinstance(X, torch.Tensor) else X
        self.eval()
        with torch.no_grad():
            outputs = self(X)
            return (outputs >= 0.5).int() if self.linear.out_features == 1 else outputs.argmax(dim=1)
    def clone(self):
        # Tạo bản sao của model bằng deepcopy
        new_model = LogisticRegression(self.linear.in_features, self.linear.out_features)
        new_model.load_state_dict(self.state_dict())  # Copy trọng số
        return new_model

In [13]:
# nosify.py
import numpy as np
from numpy.random import RandomState
from typing import Dict, Optional  # Bổ sung để định kiểu dữ liệu trả về

def mix_label(y_train: np.ndarray, y_valid: np.ndarray = None, noise_rate: float = 0.2, random_state: int = 0) -> Dict[str, np.ndarray]:
    """
    Hàm trộn nhãn (label) bằng cách thêm nhiễu ngẫu nhiên vào dữ liệu huấn luyện và kiểm tra.

    Parameters:
    - y_train: Dữ liệu nhãn huấn luyện (numpy array).
    - y_valid: Dữ liệu nhãn kiểm tra (numpy array).
    - noise_rate: Tỷ lệ nhiễu được thêm vào.
    - random_state: Giá trị random seed để tái tạo kết quả.

    Returns:
    - dict chứa:
        - y_train: Nhãn huấn luyện sau khi thêm nhiễu.
        - y_valid: Nhãn kiểm tra sau khi thêm nhiễu.
        - noisy_train_indices: Các chỉ số của nhãn huấn luyện bị nhiễu.
    """
    rs = RandomState(random_state)
    num_train = len(y_train)
    if y_valid is None:
        num_valid = 0
    else:
        num_valid = len(y_valid)
    print(num_valid)
    num_noisy_t = int(noise_rate * num_train)
    num_noisy_v = int(noise_rate * num_valid)

    train_replace = rs.choice(num_train, num_noisy_t, replace=False)
    valid_replace = rs.choice(num_valid, num_noisy_v, replace=False)

    train_classes, train_mapping = np.unique(y_train, return_inverse=True)
    valid_classes, valid_mapping = np.unique(y_valid, return_inverse=True)

    train_shift = rs.choice(len(train_classes) - 1, len(train_replace)) + 1
    valid_shift = rs.choice(len(valid_classes) - 1, len(valid_replace)) + 1

    y_train[train_replace] = train_classes[(train_mapping[train_replace] + train_shift) % len(train_classes)]
    y_valid[valid_replace] = valid_classes[(valid_mapping[valid_replace] + valid_shift) % len(valid_classes)]

    return {
        'y_train': y_train,
        'y_valid': y_valid,
        'noisy_train_indices': train_replace,
    }

def add_gauss_noise(x_train: np.ndarray, x_valid: np.ndarray, x_test: Optional[np.ndarray] = None, noise_rate: float = 0.2, mu: float = 0.0, sigma: float = 1, random_state: int = 0) -> Dict[str, np.ndarray]:
    """
    Hàm thêm nhiễu Gaussian vào dữ liệu đầu vào.

    Parameters:
    - x_train: Dữ liệu huấn luyện (numpy array).
    - x_valid: Dữ liệu kiểm tra (numpy array).
    - x_test: Dữ liệu kiểm thử (nếu có, numpy array).
    - noise_rate: Tỷ lệ nhiễu được thêm vào.
    - mu: Trung bình của phân phối Gaussian.
    - sigma: Độ lệch chuẩn của phân phối Gaussian.
    - random_state: Giá trị random seed để tái tạo kết quả.

    Returns:
    - dict chứa:
        - x_train: Dữ liệu huấn luyện sau khi thêm nhiễu.
        - x_valid: Dữ liệu kiểm tra sau khi thêm nhiễu.
        - x_test: Dữ liệu kiểm thử sau khi thêm nhiễu (nếu có).
        - noisy_train_indices: Các chỉ số của dữ liệu huấn luyện bị nhiễu.
    """
    rs = np.random.RandomState(random_state)
    num_train = len(x_train)
    num_valid = len(x_valid)
    num_noisy_t = int(noise_rate * num_train)
    num_noisy_v = int(noise_rate * num_valid)

    noise_train_indices = rs.choice(num_train, num_noisy_t, replace=False)
    noise_valid_indices = rs.choice(num_valid, num_noisy_v, replace=False)

    noise_train = rs.normal(mu, sigma, size=(num_noisy_t, x_train.shape[1])).astype(np.float32)
    noise_valid = rs.normal(mu, sigma, size=(num_noisy_v, x_valid.shape[1])).astype(np.float32)

    x_train[noise_train_indices] += noise_train
    x_valid[noise_valid_indices] += noise_valid

    result = {
        'x_train': x_train,
        'x_valid': x_valid,
        'noisy_train_indices': noise_train_indices,
    }

    if x_test is not None:
        num_test = len(x_test)
        num_noisy_test = int(noise_rate * num_test)
        noise_test_indices = rs.choice(num_test, num_noisy_test, replace=False)
        noise_test = rs.normal(mu, sigma, size=(num_noisy_test, x_test.shape[1]))
        x_test[noise_test_indices] += noise_test
        result['x_test'] = x_test

    return result

# if __name__ == "__main__":
#     # Dữ liệu giả lập
#     y_train = np.array([0, 1, 1, 0, 1, 0])
#     y_valid = np.array([1, 0, 0, 1])
#     x_train = np.random.rand(6, 2)
#     x_valid = np.random.rand(4, 2)
#     x_test = np.random.rand(3, 2)

#     # Gọi hàm mix_label
#     mixed_labels = mix_label(y_train, y_valid, noise_rate=0.3, random_state=42)
#     print("Mixed Labels:", mixed_labels)

#     # Gọi hàm add_gauss_noise
#     noisy_data = add_gauss_noise(x_train, x_valid, x_test, noise_rate=0.3, mu=0.0, sigma=0.1, random_state=42)
#     print("Noisy Data:", noisy_data)


In [14]:
X_copy = X_train_normalized.clone()
param = add_gauss_noise(X_copy, X_valid_normalized, noise_rate=0.2)
X_train_normalized_noisy = param['x_train']
noisy_train_indices = param['noisy_train_indices']
print("Noisy training labels shape:", X_train_normalized_noisy.shape)
# train model logistic regression
input_dim = X_train_normalized_noisy.shape[1]
num_classes = len(np.unique(y_labels))
model = LogisticRegression(input_dim, num_classes)
model.fit(X_train_normalized_noisy, y_labels, epochs= 1000, lr = 0.1)
# predict
y_pred = model.predict(X_valid_normalized)
# evaluate
from sklearn.metrics import f1_score, accuracy_score
accuracy = f1_score(yt_labels, y_pred, average='weighted')
print("Accuracy:", accuracy)

Noisy training labels shape: torch.Size([10000, 2048])
Accuracy: 0.8899173004764849


In [21]:
X_train_tensor = X_train_normalized_noisy
y_train_tensor = torch.tensor(y_labels, dtype=torch.long)
X_valid_tensor = X_valid_normalized
y_valid_tensor = torch.tensor(yt_labels, dtype=torch.long)
X_train_full = X_train_normalized
y_noisy = y_labels

In [23]:
# 5.3 Tính score cho TẤT CẢ CÁC PHƯƠNG PHÁP
# --- (A) KNN-Shapley
knn_shapley_model = KNN_Shapley(
    x_train=X_train_tensor, y_train=y_train_tensor,
    x_valid=X_valid_tensor, y_valid=y_valid_tensor,
    k_neighbors=10, batch_size=64, metric='euclidean'
)
knn_shapley_model.train_data_values()
score_knn_shapley = knn_shapley_model.evaluate_data_values()

# --- (B) CKNN-Shapley
cknn_shapley_model = CKNN_Shapley(
    x_train=X_train_tensor, y_train=y_train_tensor,
    x_valid=X_valid_tensor, y_valid=y_valid_tensor,
    k_neighbors=10, default=True, batch_size=64
)
cknn_shapley_model.train_data_values()
score_cknn_shapley = cknn_shapley_model.evaluate_data_values()
# ----------- LAVA
lava = LavaEvaluator_geomloss()
score_lava = lava.evaluate_data_values(
    X_train_tensor, y_train_tensor, X_valid_tensor, y_valid_tensor
)
# --- (C) SAVA_OT
sava_ot = SAVA_OT(random_state=42, batch_size=1000)
score_sava_ot = sava_ot.evaluate_data_values(
    X_train_tensor, y_train_tensor, X_valid_tensor, y_valid_tensor
)

# --- (D) SAVA_OT_savel2l
sava_ot_l2l = SAVA_OT_savel2l(random_state=42, batch_size=1000)
score_sava_ot_l2l = sava_ot_l2l.evaluate_data_values(
    X_train_tensor, y_train_tensor, X_valid_tensor, y_valid_tensor
)

# --- (E) Phương pháp Label Noise "truyền thống"
knn_consistency = knn_consistency_scores(X_train_full, y_noisy, k=10)
entropy_scores  = entropy_consistency_scores(X_train_full, y_noisy, k=5, num_cls=5)
kmeans_labels   = kmeans_inconsistency_labels(X_train_full, y_noisy, n_clusters=5)

# --- (F) Phương pháp Outlier "feature-based"
pca_outlier = pca_outlier_detection(X_train_full, n_components=2)
iso_outlier = isolation_forest_outlier(X_train_full)
lof_outlier = local_outlier_factor_scores(X_train_full)

100%|██████████| 9999/9999 [00:12<00:00, 778.06it/s]
100%|██████████| 5000/5000 [00:02<00:00, 1739.20it/s]
100%|██████████| 4999/4999 [00:06<00:00, 761.27it/s]
SAVA_OT train_batch:   0%|          | 0/10 [00:00<?, ?it/s]
SAVA_OT valid_batch:   0%|          | 0/2 [00:00<?, ?it/s][A

Computing label-to-label distance:   0%|          | 0/45 [00:00<?, ?it/s][A[A

Computing label-to-label distance:  33%|███▎      | 15/45 [00:00<00:00, 143.12it/s][A[A

Computing label-to-label distance:  67%|██████▋   | 30/45 [00:00<00:00, 139.96it/s][A[A

Computing label-to-label distance: 100%|██████████| 45/45 [00:00<00:00, 139.64it/s][A[A

                                                                                   [A[A

Computing label-to-label distance:   0%|          | 0/45 [00:00<?, ?it/s][A[A

Computing label-to-label distance:  29%|██▉       | 13/45 [00:00<00:00, 123.20it/s][A[A

Computing label-to-label distance:  58%|█████▊    | 26/45 [00:00<00:00, 124.16it/s][A[A

Computing

In [26]:
# 5.4 Gom vào dict => đánh giá
methods_scores = {
    "KNN-Shapley":           score_knn_shapley,
    "CKNN-Shapley":          score_cknn_shapley,
    "Lava":                  score_lava,
    "SAVA_OT":               score_sava_ot,
    "SAVA_OT_savel2l":       score_sava_ot_l2l,
    "KNN Consistency":       knn_consistency,
    "Entropy-based":         entropy_scores,
    "K-means Inconsistency": kmeans_labels,
    "PCA Outlier":           pca_outlier,
    "Isolation Forest":      iso_outlier,
    "Local Outlier Factor":  lof_outlier
}

results = []
for method_name, original_score in methods_scores.items():
    # Chuyển về quy ước "càng THẤP => càng nhiễu"
    data_vals = invert_score_if_needed(original_score, method_name)
    eval_res = evaluate_noise_detection(data_vals, noisy_train_indices, per=0.2)
    eval_res["Method"] = method_name
    results.append(eval_res)

df_eval = pd.DataFrame(results)
print(df_eval)

    Precision  Recall  F1-Score                 Method
0      0.4065  0.4065    0.4065            KNN-Shapley
1      0.5950  0.5950    0.5950           CKNN-Shapley
2      0.7055  0.7055    0.7055                   Lava
3      0.7095  0.7095    0.7095                SAVA_OT
4      0.7120  0.7120    0.7120        SAVA_OT_savel2l
5      0.2020  0.2020    0.2020        KNN Consistency
6      0.1990  0.1990    0.1990          Entropy-based
7      0.1845  0.1845    0.1845  K-means Inconsistency
8      0.2180  0.2180    0.2180            PCA Outlier
9      0.2125  0.2125    0.2125       Isolation Forest
10     0.1990  0.1990    0.1990   Local Outlier Factor


In [27]:
y_copy = y_labels.copy()
param = mix_label(y_copy, yt_labels, noise_rate=0.2)
y_labels_noisy = param['y_train']
noisy_train_indices = param['noisy_train_indices']
print("Noisy training labels shape:", y_labels_noisy.shape)
# train model logistic regression
input_dim = x_embeddings.shape[1]
num_classes = len(np.unique(y_labels))
model = LogisticRegression(input_dim, num_classes)
model.fit(x_embeddings, y_labels_noisy, epochs= 1000, lr = 0.1)
# predict
y_pred = model.predict(xt_embeddings)
# evaluate
from sklearn.metrics import f1_score, accuracy_score
accuracy = f1_score(yt_labels, y_pred, average='weighted')
print("Accuracy:", accuracy)

2000
Noisy training labels shape: (10000,)
Accuracy: 0.714764227233503


In [28]:
X_train_tensor = torch.tensor(x_embeddings, dtype=torch.float32)
y_train_tensor = torch.tensor(y_labels_noisy, dtype=torch.long)
X_valid_tensor = torch.tensor(xt_embeddings, dtype=torch.float32)
y_valid_tensor = torch.tensor(yt_labels, dtype=torch.long)
X_train_full = x_embeddings
y_noisy = y_labels_noisy

  X_train_tensor = torch.tensor(x_embeddings, dtype=torch.float32)
  X_valid_tensor = torch.tensor(xt_embeddings, dtype=torch.float32)


In [29]:
# 5.3 Tính score cho TẤT CẢ CÁC PHƯƠNG PHÁP
# --- (A) KNN-Shapley
knn_shapley_model = KNN_Shapley(
    x_train=X_train_tensor, y_train=y_train_tensor,
    x_valid=X_valid_tensor, y_valid=y_valid_tensor,
    k_neighbors=10, batch_size=64, metric='euclidean'
)
knn_shapley_model.train_data_values()
score_knn_shapley = knn_shapley_model.evaluate_data_values()

# --- (B) CKNN-Shapley
cknn_shapley_model = CKNN_Shapley(
    x_train=X_train_tensor, y_train=y_train_tensor,
    x_valid=X_valid_tensor, y_valid=y_valid_tensor,
    k_neighbors=10, default=True, batch_size=64
)
cknn_shapley_model.train_data_values()
score_cknn_shapley = cknn_shapley_model.evaluate_data_values()
# ----------- LAVA
lava = LavaEvaluator_geomloss()
score_lava = lava.evaluate_data_values(
    X_train_tensor, y_train_tensor, X_valid_tensor, y_valid_tensor
)
# --- (C) SAVA_OT
sava_ot = SAVA_OT(random_state=42, batch_size=1000)
score_sava_ot = sava_ot.evaluate_data_values(
    X_train_tensor, y_train_tensor, X_valid_tensor, y_valid_tensor
)

# --- (D) SAVA_OT_savel2l
sava_ot_l2l = SAVA_OT_savel2l(random_state=42, batch_size=1000)
score_sava_ot_l2l = sava_ot_l2l.evaluate_data_values(
    X_train_tensor, y_train_tensor, X_valid_tensor, y_valid_tensor
)

# --- (E) Phương pháp Label Noise "truyền thống"
knn_consistency = knn_consistency_scores(X_train_full, y_noisy, k=10)
entropy_scores  = entropy_consistency_scores(X_train_full, y_noisy, k=5, num_cls=5)
kmeans_labels   = kmeans_inconsistency_labels(X_train_full, y_noisy, n_clusters=5)

# --- (F) Phương pháp Outlier "feature-based"
pca_outlier = pca_outlier_detection(X_train_full, n_components=2)
iso_outlier = isolation_forest_outlier(X_train_full)
lof_outlier = local_outlier_factor_scores(X_train_full)

100%|██████████| 9999/9999 [00:12<00:00, 769.98it/s]
100%|██████████| 5000/5000 [00:02<00:00, 1668.98it/s]
100%|██████████| 4999/4999 [00:06<00:00, 767.15it/s]
SAVA_OT train_batch:   0%|          | 0/10 [00:00<?, ?it/s]
SAVA_OT valid_batch:   0%|          | 0/2 [00:00<?, ?it/s][A

Computing label-to-label distance:   0%|          | 0/45 [00:00<?, ?it/s][A[A

Computing label-to-label distance:  36%|███▌      | 16/45 [00:00<00:00, 157.21it/s][A[A

Computing label-to-label distance:  71%|███████   | 32/45 [00:00<00:00, 156.67it/s][A[A

                                                                                   [A[A

Computing label-to-label distance:   0%|          | 0/45 [00:00<?, ?it/s][A[A

Computing label-to-label distance:  38%|███▊      | 17/45 [00:00<00:00, 166.63it/s][A[A

Computing label-to-label distance:  76%|███████▌  | 34/45 [00:00<00:00, 166.26it/s][A[A

                                                                                   [A[A

Computing

In [30]:
# 5.4 Gom vào dict => đánh giá
methods_scores = {
    "KNN-Shapley":           score_knn_shapley,
    "CKNN-Shapley":          score_cknn_shapley,
    "Lava":                  score_lava,
    "SAVA_OT":               score_sava_ot,
    "SAVA_OT_savel2l":       score_sava_ot_l2l,
    "KNN Consistency":       knn_consistency,
    "Entropy-based":         entropy_scores,
    "K-means Inconsistency": kmeans_labels,
    "PCA Outlier":           pca_outlier,
    "Isolation Forest":      iso_outlier,
    "Local Outlier Factor":  lof_outlier
}

results = []
for method_name, original_score in methods_scores.items():
    # Chuyển về quy ước "càng THẤP => càng nhiễu"
    data_vals = invert_score_if_needed(original_score, method_name)
    eval_res = evaluate_noise_detection(data_vals, noisy_train_indices, per=0.2)
    eval_res["Method"] = method_name
    results.append(eval_res)

df_eval = pd.DataFrame(results)
print(df_eval)

    Precision  Recall  F1-Score                 Method
0      0.8145  0.8145    0.8145            KNN-Shapley
1      0.8260  0.8260    0.8260           CKNN-Shapley
2      0.2905  0.2905    0.2905                   Lava
3      0.2845  0.2845    0.2845                SAVA_OT
4      0.2900  0.2900    0.2900        SAVA_OT_savel2l
5      0.8100  0.8100    0.8100        KNN Consistency
6      0.3755  0.3755    0.3755          Entropy-based
7      0.2880  0.2880    0.2880  K-means Inconsistency
8      0.2065  0.2065    0.2065            PCA Outlier
9      0.2125  0.2125    0.2125       Isolation Forest
10     0.1995  0.1995    0.1995   Local Outlier Factor


In [31]:
!pip install hdbscan

Collecting hdbscan
  Downloading hdbscan-0.8.40-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (15 kB)
Downloading hdbscan-0.8.40-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (4.6 MB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/4.6 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.3/4.6 MB[0m [31m7.7 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━[0m [32m2.6/4.6 MB[0m [31m38.0 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.6/4.6 MB[0m [31m45.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: hdbscan
Successfully installed hdbscan-0.8.40


In [32]:
import numpy as np
from sklearn.cluster import KMeans, DBSCAN
import hdbscan  # Cần cài: pip install hdbscan

def kmeans_inconsistency_labels(X, y, n_clusters=5):
    """
    KMeans => chia dữ liệu thành n_clusters.
    Gán cụm cho 'majority label'.
    Điểm nào khác với majority label => 1 (nhiễu), 0 (không).
    """
    kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
    clusters = kmeans.fit_predict(X)

    cluster_maj_labels = []
    for c in range(n_clusters):
        idx_c = np.where(clusters == c)[0]
        # Tìm nhãn majority
        maj_label = np.argmax(np.bincount(y[idx_c]))
        cluster_maj_labels.append(maj_label)

    result = []
    for i in range(len(X)):
        c = clusters[i]
        # Nếu nhãn thực tế != nhãn majority của cụm => nhiễu
        if y[i] != cluster_maj_labels[c]:
            result.append(1)
        else:
            result.append(0)
    return np.array(result)  # 1 => nhiễu, 0 => sạch

def dbscan_inconsistency_labels(X, y, eps=0.5, min_samples=5):
    """
    DBSCAN => phân cụm dựa trên mật độ.
    - Điểm bị đánh dấu '-1' => coi là 'outlier' => 1 (nhiễu).
    - Với cụm >= 0, ta tìm nhãn majority rồi xem điểm nào lệch => 1 (nhiễu).
    """
    dbscan = DBSCAN(eps=eps, min_samples=min_samples)
    clusters = dbscan.fit_predict(X)  # -1 => outlier/noise, >=0 => cụm ID

    unique_clusters = set(clusters) - {-1}  # Loại bỏ outlier (-1)

    # Tính majority label cho mỗi cụm (không tính outlier)
    cluster_maj_labels = {}
    for c in unique_clusters:
        idx_c = np.where(clusters == c)[0]
        maj_label = np.argmax(np.bincount(y[idx_c]))
        cluster_maj_labels[c] = maj_label

    result = []
    for i in range(len(X)):
        c = clusters[i]
        if c == -1:
            # DBSCAN đánh dấu điểm này là nhiễu => 1
            result.append(1)
        else:
            # Nếu khác nhãn majority => 1
            if y[i] != cluster_maj_labels[c]:
                result.append(1)
            else:
                result.append(0)
    return np.array(result)  # 1 => nhiễu, 0 => sạch

def hdbscan_inconsistency_labels(X, y, min_cluster_size=5, min_samples=None):
    """
    HDBSCAN => phiên bản hierarchical DBSCAN.
    - clusterer.labels_ = -1 => outlier/noise
    - Mỗi cụm >=0 => majority label => kiểm tra lệch => 1 (nhiễu)
    """
    if min_samples is None:
        # Nếu không chỉ định, ta để mặc định = min_cluster_size
        min_samples = min_cluster_size

    clusterer = hdbscan.HDBSCAN(min_cluster_size=min_cluster_size,
                                min_samples=min_samples)
    clusters = clusterer.fit_predict(X)  # -1 => outlier

    unique_clusters = set(clusters) - {-1}  # Loại bỏ outlier (-1)

    cluster_maj_labels = {}
    for c in unique_clusters:
        idx_c = np.where(clusters == c)[0]
        maj_label = np.argmax(np.bincount(y[idx_c]))
        cluster_maj_labels[c] = maj_label

    result = []
    for i in range(len(X)):
        c = clusters[i]
        if c == -1:
            # HDBSCAN đánh dấu là outlier/noise
            result.append(1)
        else:
            if y[i] != cluster_maj_labels[c]:
                result.append(1)
            else:
                result.append(0)
    return np.array(result)  # 1 => nhiễu, 0 => sạch


In [33]:
# --- (E) Phương pháp Label Noise "truyền thống"
kmeans_labels   = kmeans_inconsistency_labels(X_train_full, y_noisy, n_clusters=5)
dbscan_labels   = dbscan_inconsistency_labels(X_train_full, y_noisy)
hdbscan_labels  = hdbscan_inconsistency_labels(X_train_full, y_noisy)



In [34]:
def invert_score_if_needed(score_array: np.ndarray, method_name: str) -> np.ndarray:
    """
    Chuẩn hoá về quy ước: "CÀNG THẤP => CÀNG nghi nhiễu".
    ------------------------------------------------
    score_array: mảng score hoặc {0,1} (1=nhiễu, 0=sạch).
    method_name: tên phương pháp.
    """
    if method_name in ["KNN-Shapley", "CKNN-Shapley", "SAVA_OT", "SAVA_OT_savel2l"]:
        # Đã "càng THẤP => nghi"
        return score_array

    elif method_name in [
        "KNN Consistency",
        "Entropy-based",
        "PCA Outlier",
        "Isolation Forest",
        "Local Outlier Factor"
    ]:
        # score CÀNG CAO => nghi => đảo dấu
        return -score_array

    elif method_name in [
        "K-means Inconsistency",
        "DBSCAN Inconsistency",
        "HDBSCAN Inconsistency"
    ]:
        # 1 => nhiễu => 1 - x => 0 => nhiễu => "càng thấp => nghi"
        return 1 - score_array

    else:
        return score_array


In [35]:
# 5.4 Gom vào dict => đánh giá
methods_scores = {
    "kmeans_inconsistency_labels":           kmeans_labels,
    "dbscan_inconsistency_labels":           dbscan_labels,
    "hdbscan_inconsistency_labels":          hdbscan_labels,
}

results = []
for method_name, original_score in methods_scores.items():
    # Chuyển về quy ước "càng THẤP => càng nhiễu"
    data_vals = invert_score_if_needed(original_score, method_name)
    eval_res = evaluate_noise_detection(data_vals, noisy_train_indices, per=0.2)
    eval_res["Method"] = method_name
    results.append(eval_res)

df_eval = pd.DataFrame(results)
print(df_eval)

   Precision  Recall  F1-Score                        Method
0     0.0315  0.0315    0.0315   kmeans_inconsistency_labels
1     0.1860  0.1860    0.1860   dbscan_inconsistency_labels
2     0.1160  0.1160    0.1160  hdbscan_inconsistency_labels
