# Weakly Supervised Learning на датасете Adult

В этом ноутбуке исследуются методы обучения со слабой разметкой: шумные метки, частичные метки и агрегированные (bag-level) метки. Сравниваются устойчивые лоссы и специализированные подходы с upper bound на чистых метках.


In [1]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, TensorDataset
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, average_precision_score
from sklearn.metrics import brier_score_loss
from sklearn.calibration import calibration_curve
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm.auto import tqdm
import warnings
warnings.filterwarnings('ignore')

np.random.seed(42)
torch.manual_seed(42)
if torch.cuda.is_available():
    torch.cuda.manual_seed(42)


## 1. Подготовка данных

Загружаем датасет Adult и применяем ColumnTransformer для обработки числовых и категориальных признаков без утечки информации.


In [2]:
adult = fetch_openml('adult', version=2, as_frame=True, parser='pandas')
X = adult.data
y = adult.target

le = LabelEncoder()
y_encoded = le.fit_transform(y)

X_train, X_test, y_train, y_test = train_test_split(
    X, y_encoded, test_size=0.2, random_state=42, stratify=y_encoded
)

X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=0.15, random_state=42, stratify=y_train
)

print(f"Train: {X_train.shape}, Val: {X_val.shape}, Test: {X_test.shape}")
print(f"Class distribution (train): {np.bincount(y_train)}")
print(f"Classes: {le.classes_}")


Train: (33212, 14), Val: (5861, 14), Test: (9769, 14)
Class distribution (train): [25265  7947]
Classes: ['<=50K' '>50K']


In [3]:
numeric_features = X_train.select_dtypes(include=['int64', 'float64']).columns.tolist()
categorical_features = X_train.select_dtypes(include=['object', 'category']).columns.tolist()

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_features)
    ])

X_train_processed = preprocessor.fit_transform(X_train)
X_val_processed = preprocessor.transform(X_val)
X_test_processed = preprocessor.transform(X_test)

input_dim = X_train_processed.shape[1]
print(f"Input dimension after preprocessing: {input_dim}")


Input dimension after preprocessing: 108


## 2. Базовая модель и метрики

Определяем MLP-классификатор и функции для вычисления метрик, включая калибровку (Brier score и ECE).


In [4]:
class MLPClassifier(nn.Module):
    def __init__(self, input_dim, hidden_dim=256):
        super().__init__()
        self.network = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(hidden_dim, 1)
        )
    
    def forward(self, x):
        return self.network(x).squeeze(-1)

def compute_ece(y_true, y_prob, n_bins=10):
    bin_boundaries = np.linspace(0, 1, n_bins + 1)
    bin_lowers = bin_boundaries[:-1]
    bin_uppers = bin_boundaries[1:]
    
    ece = 0.0
    for bin_lower, bin_upper in zip(bin_lowers, bin_uppers):
        in_bin = (y_prob > bin_lower) & (y_prob <= bin_upper)
        prop_in_bin = in_bin.mean()
        
        if prop_in_bin > 0:
            accuracy_in_bin = y_true[in_bin].mean()
            avg_confidence_in_bin = y_prob[in_bin].mean()
            ece += np.abs(avg_confidence_in_bin - accuracy_in_bin) * prop_in_bin
    
    return ece

def evaluate_model(model, X, y, device='cpu'):
    model.eval()
    with torch.no_grad():
        X_tensor = torch.FloatTensor(X).to(device)
        logits = model(X_tensor)
        probs = torch.sigmoid(logits).cpu().numpy()
        preds = (probs >= 0.5).astype(int)
    
    acc = accuracy_score(y, preds)
    f1 = f1_score(y, preds, average='macro')
    roc_auc = roc_auc_score(y, probs)
    pr_auc = average_precision_score(y, probs)
    brier = brier_score_loss(y, probs)
    ece = compute_ece(y, probs)
    
    return {
        'accuracy': acc,
        'macro_f1': f1,
        'roc_auc': roc_auc,
        'pr_auc': pr_auc,
        'brier': brier,
        'ece': ece
    }


## 3. Upper Bound: обучение на чистых метках

Обучаем модель на чистых метках для получения upper bound — эталона для сравнения.


In [5]:
def train_model(model, X_train, y_train, X_val, y_val, epochs=50, lr=1e-3, 
                criterion_fn=None, device='cpu', verbose=True):
    model = model.to(device)
    optimizer = torch.optim.AdamW(model.parameters(), lr=lr, weight_decay=1e-4)
    
    if criterion_fn is None:
        criterion = nn.BCEWithLogitsLoss()
    else:
        criterion = criterion_fn
    
    train_dataset = TensorDataset(
        torch.FloatTensor(X_train), 
        torch.FloatTensor(y_train)
    )
    train_loader = DataLoader(train_dataset, batch_size=256, shuffle=True)
    
    best_val_auc = 0
    
    for epoch in range(epochs):
        model.train()
        train_loss = 0
        
        for x_batch, y_batch in train_loader:
            x_batch, y_batch = x_batch.to(device), y_batch.to(device)
            
            optimizer.zero_grad()
            logits = model(x_batch)
            
            if hasattr(criterion, '__call__'):
                loss = criterion(logits, y_batch)
            else:
                loss = F.binary_cross_entropy_with_logits(logits, y_batch)
            
            loss.backward()
            optimizer.step()
            
            train_loss += loss.item()
        
        train_loss /= len(train_loader)
        
        val_metrics = evaluate_model(model, X_val, y_val, device)
        
        if val_metrics['pr_auc'] > best_val_auc:
            best_val_auc = val_metrics['pr_auc']
            best_model_state = model.state_dict().copy()
        
        if verbose and (epoch + 1) % 10 == 0:
            print(f"Epoch {epoch+1}/{epochs}, Loss: {train_loss:.4f}, "
                  f"Val PR-AUC: {val_metrics['pr_auc']:.4f}")
    
    model.load_state_dict(best_model_state)
    return model


In [6]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f"Using device: {device}")

print("\n" + "="*60)
print("Training Upper Bound (Clean Labels)")
print("="*60)

model_clean = MLPClassifier(input_dim, hidden_dim=256)
model_clean = train_model(model_clean, X_train_processed, y_train.astype(np.float32), 
                          X_val_processed, y_val, epochs=80, device=device)

clean_metrics = evaluate_model(model_clean, X_test_processed, y_test, device)
print(f"\nUpper Bound Test Metrics:")
for k, v in clean_metrics.items():
    print(f"  {k}: {v:.4f}")


Using device: cuda

Training Upper Bound (Clean Labels)
Epoch 10/80, Loss: 0.2943, Val PR-AUC: 0.7893
Epoch 20/80, Loss: 0.2767, Val PR-AUC: 0.7829
Epoch 30/80, Loss: 0.2612, Val PR-AUC: 0.7771
Epoch 40/80, Loss: 0.2495, Val PR-AUC: 0.7738
Epoch 50/80, Loss: 0.2394, Val PR-AUC: 0.7685
Epoch 60/80, Loss: 0.2307, Val PR-AUC: 0.7671
Epoch 70/80, Loss: 0.2219, Val PR-AUC: 0.7630
Epoch 80/80, Loss: 0.2173, Val PR-AUC: 0.7620

Upper Bound Test Metrics:
  accuracy: 0.8514
  macro_f1: 0.7772
  roc_auc: 0.8925
  pr_auc: 0.7498
  brier: 0.1083
  ece: 0.0472


## 4. Сценарий A: Шумные метки

### 4.1 Генерация шумных меток

Создаем симметричный и асимметричный шум в метках:


In [7]:
def add_label_noise(y, noise_type='symmetric', p_flip=0.2, seed=42):
    np.random.seed(seed)
    y_noisy = y.copy()
    n = len(y)
    
    if noise_type == 'symmetric':
        flip_mask = np.random.rand(n) < p_flip
        y_noisy[flip_mask] = 1 - y_noisy[flip_mask]
        
        print(f"Symmetric noise: {flip_mask.sum()} labels flipped ({flip_mask.sum()/n*100:.1f}%)")
        
    elif noise_type == 'asymmetric':
        p_0_to_1 = p_flip
        p_1_to_0 = p_flip / 2
        
        mask_0 = (y == 0)
        mask_1 = (y == 1)
        
        flip_0 = mask_0 & (np.random.rand(n) < p_0_to_1)
        flip_1 = mask_1 & (np.random.rand(n) < p_1_to_0)
        
        y_noisy[flip_0] = 1
        y_noisy[flip_1] = 0
        
        print(f"Asymmetric noise: {flip_0.sum()} (0→1), {flip_1.sum()} (1→0)")
    
    return y_noisy

p_flip = 0.2
y_train_noisy = add_label_noise(y_train, noise_type='symmetric', p_flip=p_flip)


Symmetric noise: 6595 labels flipped (19.9%)


### 4.2 Устойчивые лоссы

Реализуем различные устойчивые к шуму функции потерь:
- **Label Smoothing**: $y_{\text{smooth}} = (1-\epsilon) \cdot y + \epsilon \cdot 0.5$
- **Bootstrapping**: $\mathcal{L} = \alpha \cdot \text{CE}(y, p) + (1-\alpha) \cdot \text{CE}(p_{\text{detach}}, p)$
- **Confidence Penalty**: $\mathcal{L} = \text{CE}(y, p) - \lambda \cdot H(p)$


In [8]:
class LabelSmoothingLoss(nn.Module):
    def __init__(self, smoothing=0.1):
        super().__init__()
        self.smoothing = smoothing
    
    def forward(self, logits, targets):
        targets_smooth = targets * (1 - self.smoothing) + 0.5 * self.smoothing
        return F.binary_cross_entropy_with_logits(logits, targets_smooth)

class BootstrappingLoss(nn.Module):
    def __init__(self, alpha=0.8):
        super().__init__()
        self.alpha = alpha
    
    def forward(self, logits, targets):
        probs = torch.sigmoid(logits)
        loss_hard = F.binary_cross_entropy_with_logits(logits, targets)
        loss_soft = F.binary_cross_entropy(probs, probs.detach())
        return self.alpha * loss_hard + (1 - self.alpha) * loss_soft

class ConfidencePenaltyLoss(nn.Module):
    def __init__(self, lambda_penalty=0.1):
        super().__init__()
        self.lambda_penalty = lambda_penalty
    
    def forward(self, logits, targets):
        probs = torch.sigmoid(logits)
        ce_loss = F.binary_cross_entropy_with_logits(logits, targets)
        entropy = -(probs * torch.log(probs + 1e-8) + 
                   (1 - probs) * torch.log(1 - probs + 1e-8))
        return ce_loss - self.lambda_penalty * entropy.mean()


In [9]:
noisy_results = {}

loss_configs = [
    ('BCE Baseline', None),
    ('Label Smoothing', LabelSmoothingLoss(smoothing=0.1)),
    ('Bootstrapping', BootstrappingLoss(alpha=0.8)),
    ('Confidence Penalty', ConfidencePenaltyLoss(lambda_penalty=0.05))
]

for name, loss_fn in loss_configs:
    print(f"\n{'='*60}")
    print(f"Training with {name} on Noisy Labels")
    print(f"{'='*60}")
    
    model = MLPClassifier(input_dim, hidden_dim=256)
    model = train_model(model, X_train_processed, y_train_noisy.astype(np.float32),
                       X_val_processed, y_val, epochs=80, 
                       criterion_fn=loss_fn, device=device, verbose=False)
    
    metrics = evaluate_model(model, X_test_processed, y_test, device)
    noisy_results[name] = metrics
    
    print(f"Test Metrics:")
    print(f"  Accuracy: {metrics['accuracy']:.4f}, PR-AUC: {metrics['pr_auc']:.4f}")



Training with BCE Baseline on Noisy Labels
Test Metrics:
  Accuracy: 0.8278, PR-AUC: 0.6345

Training with Label Smoothing on Noisy Labels
Test Metrics:
  Accuracy: 0.8221, PR-AUC: 0.6300

Training with Bootstrapping on Noisy Labels
Test Metrics:
  Accuracy: 0.8253, PR-AUC: 0.6258

Training with Confidence Penalty on Noisy Labels
Test Metrics:
  Accuracy: 0.8248, PR-AUC: 0.6239


### 4.3 Small-Loss Selection

Отбираем для обучения только примеры с наименьшим лоссом, фильтруя потенциально зашумленные метки.


In [10]:
def train_with_small_loss_selection(model, X_train, y_train, X_val, y_val, 
                                    epochs=80, device='cpu'):
    model = model.to(device)
    optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3, weight_decay=1e-4)
    criterion = LabelSmoothingLoss(smoothing=0.1)
    
    train_dataset = TensorDataset(
        torch.FloatTensor(X_train), 
        torch.FloatTensor(y_train)
    )
    train_loader = DataLoader(train_dataset, batch_size=256, shuffle=False)
    
    best_val_auc = 0
    
    for epoch in range(epochs):
        model.train()
        
        f_ratio = 0.7 + 0.2 * min(1.0, epoch / (epochs // 2))
        
        all_losses = []
        all_indices = []
        
        for batch_idx, (x_batch, y_batch) in enumerate(train_loader):
            x_batch, y_batch = x_batch.to(device), y_batch.to(device)
            
            with torch.no_grad():
                logits = model(x_batch)
                losses = F.binary_cross_entropy_with_logits(logits, y_batch, reduction='none')
                
                batch_start = batch_idx * 256
                for i, loss_val in enumerate(losses):
                    all_losses.append(loss_val.item())
                    all_indices.append(batch_start + i)
        
        sorted_indices = np.argsort(all_losses)
        n_select = int(len(sorted_indices) * f_ratio)
        selected_indices = sorted_indices[:n_select]
        
        X_selected = X_train[selected_indices]
        y_selected = y_train[selected_indices]
        
        selected_dataset = TensorDataset(
            torch.FloatTensor(X_selected),
            torch.FloatTensor(y_selected)
        )
        selected_loader = DataLoader(selected_dataset, batch_size=256, shuffle=True)
        
        train_loss = 0
        for x_batch, y_batch in selected_loader:
            x_batch, y_batch = x_batch.to(device), y_batch.to(device)
            
            optimizer.zero_grad()
            logits = model(x_batch)
            loss = criterion(logits, y_batch)
            loss.backward()
            optimizer.step()
            
            train_loss += loss.item()
        
        val_metrics = evaluate_model(model, X_val, y_val, device)
        
        if val_metrics['pr_auc'] > best_val_auc:
            best_val_auc = val_metrics['pr_auc']
            best_model_state = model.state_dict().copy()
        
        if (epoch + 1) % 20 == 0:
            print(f"Epoch {epoch+1}, f_ratio: {f_ratio:.2f}, selected: {n_select}/{len(X_train)}, "
                  f"Val PR-AUC: {val_metrics['pr_auc']:.4f}")
    
    model.load_state_dict(best_model_state)
    return model

print(f"\n{'='*60}")
print(f"Training with Small-Loss Selection")
print(f"{'='*60}")

model_small_loss = MLPClassifier(input_dim, hidden_dim=256)
model_small_loss = train_with_small_loss_selection(
    model_small_loss, X_train_processed, y_train_noisy.astype(np.float32),
    X_val_processed, y_val, epochs=80, device=device
)

metrics_small_loss = evaluate_model(model_small_loss, X_test_processed, y_test, device)
noisy_results['Small-Loss Selection'] = metrics_small_loss
print(f"\nTest Metrics:")
print(f"  Accuracy: {metrics_small_loss['accuracy']:.4f}, PR-AUC: {metrics_small_loss['pr_auc']:.4f}")



Training with Small-Loss Selection
Epoch 20, f_ratio: 0.79, selected: 26403/33212, Val PR-AUC: 0.7381
Epoch 40, f_ratio: 0.90, selected: 29724/33212, Val PR-AUC: 0.7154
Epoch 60, f_ratio: 0.90, selected: 29890/33212, Val PR-AUC: 0.6727
Epoch 80, f_ratio: 0.90, selected: 29890/33212, Val PR-AUC: 0.6593

Test Metrics:
  Accuracy: 0.8327, PR-AUC: 0.6532


### 4.4 Consistency Regularization на шумных метках

Добавляем consistency regularization (Π-model-lite) для улучшения устойчивости к шуму.


In [11]:
class WeakAugmentation:
    def __init__(self, noise_std=0.05, dropout_prob=0.05):
        self.noise_std = noise_std
        self.dropout_prob = dropout_prob
    
    def __call__(self, x):
        noise = torch.randn_like(x) * self.noise_std
        x_aug = x + noise
        mask = torch.rand_like(x) > self.dropout_prob
        x_aug = x_aug * mask
        return x_aug

class StrongAugmentation:
    def __init__(self, noise_std=0.15, dropout_prob=0.2, scale_range=0.1):
        self.noise_std = noise_std
        self.dropout_prob = dropout_prob
        self.scale_range = scale_range
    
    def __call__(self, x):
        noise = torch.randn_like(x) * self.noise_std
        x_aug = x + noise
        mask = torch.rand_like(x) > self.dropout_prob
        x_aug = x_aug * mask
        scale = 1 + (torch.rand_like(x) * 2 - 1) * self.scale_range
        x_aug = x_aug * scale
        return x_aug

def train_with_consistency(model, X_train, y_train, X_val, y_val, 
                          epochs=80, lambda_u=1.0, rampup_epochs=20, device='cpu'):
    model = model.to(device)
    optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3, weight_decay=1e-4)
    criterion_sup = LabelSmoothingLoss(smoothing=0.1)
    
    weak_aug = WeakAugmentation()
    strong_aug = StrongAugmentation()
    
    train_dataset = TensorDataset(
        torch.FloatTensor(X_train),
        torch.FloatTensor(y_train)
    )
    train_loader = DataLoader(train_dataset, batch_size=256, shuffle=True)
    
    best_val_auc = 0
    
    for epoch in range(epochs):
        model.train()
        train_loss_sup = 0
        train_loss_unsup = 0
        
        current_lambda = lambda_u * min(1.0, epoch / rampup_epochs)
        
        for x_batch, y_batch in train_loader:
            x_batch, y_batch = x_batch.to(device), y_batch.to(device)
            
            logits = model(x_batch)
            loss_sup = criterion_sup(logits, y_batch)
            
            x_weak = weak_aug(x_batch)
            x_strong = strong_aug(x_batch)
            
            logits_weak = model(x_weak)
            logits_strong = model(x_strong)
            
            probs_weak = torch.sigmoid(logits_weak)
            probs_strong = torch.sigmoid(logits_strong)
            
            loss_unsup = F.mse_loss(probs_weak, probs_strong)
            
            loss = loss_sup + current_lambda * loss_unsup
            
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            
            train_loss_sup += loss_sup.item()
            train_loss_unsup += loss_unsup.item()
        
        val_metrics = evaluate_model(model, X_val, y_val, device)
        
        if val_metrics['pr_auc'] > best_val_auc:
            best_val_auc = val_metrics['pr_auc']
            best_model_state = model.state_dict().copy()
        
        if (epoch + 1) % 20 == 0:
            print(f"Epoch {epoch+1}, λ_u: {current_lambda:.2f}, "
                  f"Val PR-AUC: {val_metrics['pr_auc']:.4f}")
    
    model.load_state_dict(best_model_state)
    return model

print(f"\n{'='*60}")
print(f"Training with Consistency Regularization")
print(f"{'='*60}")

model_consistency = MLPClassifier(input_dim, hidden_dim=256)
model_consistency = train_with_consistency(
    model_consistency, X_train_processed, y_train_noisy.astype(np.float32),
    X_val_processed, y_val, epochs=80, device=device
)

metrics_consistency = evaluate_model(model_consistency, X_test_processed, y_test, device)
noisy_results['Consistency Regularization'] = metrics_consistency
print(f"\nTest Metrics:")
print(f"  Accuracy: {metrics_consistency['accuracy']:.4f}, PR-AUC: {metrics_consistency['pr_auc']:.4f}")



Training with Consistency Regularization
Epoch 20, λ_u: 0.95, Val PR-AUC: 0.7613
Epoch 40, λ_u: 1.00, Val PR-AUC: 0.7400
Epoch 60, λ_u: 1.00, Val PR-AUC: 0.6783
Epoch 80, λ_u: 1.00, Val PR-AUC: 0.6515

Test Metrics:
  Accuracy: 0.8302, PR-AUC: 0.6523
