In [3]:
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from sklearn.base import BaseEstimator, ClassifierMixin

class SyntheticDataGenerator(BaseEstimator):
    """
    Synthetic data generator for benchmarking probabilistic classifiers.

    A feed-forward network (Linear/ReLU hidden layers, linear output layer,
    softmax) defines the ground-truth class distribution for any input row.
    `generate_data` draws features with `np.random.randn` and samples labels
    from that distribution, so a classifier's predicted probabilities can be
    compared against the true ones.

    Exposes `fit` / `predict` / `predict_proba` to follow sklearn's API.
    """
    def __init__(self, input_dim=10, n_classes=3, hidden_layers=(32, 32), random_seed=None):
        """
        Parameters:
        - input_dim: Number of input features
        - n_classes: Number of output classes
        - hidden_layers: Tuple specifying hidden layer sizes
        - random_seed: Random seed for reproducibility (None leaves the
          global RNGs untouched)
        """
        self.input_dim = input_dim
        self.n_classes = n_classes
        self.hidden_layers = hidden_layers
        self.random_seed = random_seed
        self._initialize_model()

    def _initialize_model(self):
        """Build `self.network` from the constructor parameters."""
        if self.random_seed is not None:
            # NOTE: these reseed the process-wide torch and numpy generators,
            # not private ones. Sampling below also reads numpy's global RNG,
            # so any other code drawing from it in between shifts the stream.
            torch.manual_seed(self.random_seed)
            np.random.seed(self.random_seed)

        layers = []
        last_dim = self.input_dim
        for hidden_dim in self.hidden_layers:
            layers.append(nn.Linear(last_dim, hidden_dim))
            layers.append(nn.ReLU())
            last_dim = hidden_dim
        layers.append(nn.Linear(last_dim, self.n_classes))  # Output layer (logits)
        self.network = nn.Sequential(*layers)

    def _sample_labels(self, P):
        """
        Draws one class label per row of P using numpy's global RNG.

        Parameters:
        - P: Class probabilities (numpy array of shape [n_samples, n_classes])

        Returns:
        - y: Integer class labels (numpy array of shape [n_samples])
        """
        # One np.random.choice call per row keeps the random stream identical
        # to sampling the rows one by one. dtype=int keeps the result an
        # integer array even when P has no rows (np.array([]) would otherwise
        # default to float64).
        return np.array(
            [np.random.choice(self.n_classes, p=p) for p in P],
            dtype=int,
        )

    def generate_data(self, n_samples=1000):
        """
        Generates synthetic data and ground truth probabilities.

        Parameters:
        - n_samples: Number of samples to generate

        Returns:
        - X: Feature matrix (n_samples, input_dim), float32
        - y: Class labels (n_samples,), sampled from P
        - P: Ground truth class probabilities (n_samples, n_classes)
        """
        X = np.random.randn(n_samples, self.input_dim).astype(np.float32)  # Random features
        P = self.predict_proba(X)
        y = self._sample_labels(P)
        return X, y, P

    def fit(self, X, y):
        """
        No-op kept for sklearn API compatibility; the network is built at
        construction time and is not trained here.

        Returns:
        - self, so calls can be chained (e.g. `gen.fit(X, y).predict_proba(X)`)
        """
        return self

    def predict_proba(self, X):
        """
        Returns the ground truth probability distribution for the given inputs.

        Parameters:
        - X: Input features (numpy array of shape [n_samples, input_dim])

        Returns:
        - P: Ground truth class probabilities (numpy array of shape [n_samples, n_classes])
        """
        X_tensor = torch.tensor(X, dtype=torch.float32)
        # Inference only: no_grad skips autograd tracking so the result can be
        # converted straight to numpy.
        with torch.no_grad():
            logits = self.network(X_tensor)
            P = F.softmax(logits, dim=1).numpy()
        return P

    def predict(self, X):
        """
        Predicts class labels by sampling from the ground truth probabilities.

        The labels are random draws, not the argmax, so repeated calls on the
        same X can return different labels.

        Parameters:
        - X: Input features (numpy array of shape [n_samples, input_dim])

        Returns:
        - y: Predicted class labels (numpy array of shape [n_samples])
        """
        return self._sample_labels(self.predict_proba(X))


In [5]:
# Build a small seeded generator and draw ten feature rows
# (the sampled labels and probabilities from this call are discarded)
generator = SyntheticDataGenerator(
    input_dim=5,
    n_classes=3,
    hidden_layers=(16, 16),
    random_seed=42,
)
X, _, _ = generator.generate_data(n_samples=10)

# Ground-truth class probabilities for those rows
P = generator.predict_proba(X)
print("Predicted probabilities:", P)

# Labels drawn at random from the ground-truth probabilities
y = generator.predict(X)
print("Predicted labels:", y)


Predicted probabilities: [[0.31108475 0.40565932 0.28325593]
 [0.31330678 0.3822424  0.30445075]
 [0.30056858 0.38960877 0.30982262]
 [0.30445853 0.39097705 0.30456442]
 [0.30840302 0.37961456 0.3119824 ]
 [0.3073267  0.37836528 0.31430796]
 [0.30829653 0.38295934 0.30874407]
 [0.30718902 0.38344714 0.3093638 ]
 [0.30440938 0.3903119  0.3052787 ]
 [0.2991909  0.39747033 0.3033387 ]]
Predicted labels: [2 0 1 0 2 1 1 0 1 1]


In [6]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss

# Draw a larger labelled sample from the same generator
X, y, P_true = generator.generate_data(n_samples=1000)

# Fit a logistic-regression classifier on the sampled labels
clf = LogisticRegression(max_iter=1000).fit(X, y)

# Score its predicted probabilities on the same data it was fitted on
P_pred = clf.predict_proba(X)
print("Log-loss:", log_loss(y, P_pred, labels=range(generator.n_classes)))


Log-loss: 1.0855126152309305
