### Model implementation

In [33]:
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
import scipy.stats as st

Import a test dataset

In [34]:
from sklearn.datasets import load_iris
X, y = load_iris(return_X_y=True)

Split Dataset

In [35]:
# 80/20 split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

## Logistic Regression

Choose Solver

In [36]:
# newton-cg, sag where used
clf = LogisticRegression(random_state=0, solver='newton-cg').fit(X_train, y_train)
clf.predict(X_test)
clf.score(X_test, y_test)

1.0

## Choquistic Regression

In [37]:
class ChoquisticRegression(LogisticRegression):
    def __init__(self, additive='full', gamma=1.0, solver='lbfgs', max_iter=100, tol=1e-4):
        """
        additive: 'full' for unrestricted game-based aggregation, or '2-additive'
                  for the simplified variant.
        gamma: scaling factor (see Equations (20) and (24) in the paper).
        """
        self.additive = additive
        self.gamma = gamma
        super().__init__(solver=solver, max_iter=max_iter, tol=tol)
    
    def fit(self, X, y):
        # Apply the aggregation function before fitting
        X_aggregated = self.aggregate(X)
        return super().fit(X_aggregated, y)
    
    def aggregate(self, X):
        # Placeholder for the actual aggregation logic
        # For now, it just returns X without modification
        return X


In [38]:
import numpy as np
from itertools import chain, combinations
from scipy.optimize import minimize
from sklearn.base import BaseEstimator, ClassifierMixin
from scipy.special import expit  # numerically stable sigmoid
from sklearn.linear_model import LogisticRegression

def all_nonempty_subsets(iterable):
    """Return all non-empty subsets of the input iterable."""
    s = list(iterable)
    return chain.from_iterable(combinations(s, r) for r in range(1, len(s)+1))

class ChoquisticRegression(BaseEstimator, ClassifierMixin):
    def __init__(self, gamma=1.0, choquet_type='2-additive', solver='L-BFGS-B',
                 max_iter=100, tol=1e-5, random_state=None):
        """
        Parameters:
          gamma: Scaling parameter for the logistic function.
          choquet_type: 'general' (full game-based Choquet integral) or '2-additive'.
          solver: Optimizer to use (passed to scipy.optimize.minimize).
          max_iter: Maximum iterations for the optimizer.
          tol: Tolerance for convergence.
          random_state: Seed for reproducibility.
        """
        self.gamma = gamma
        self.choquet_type = choquet_type
        self.solver = solver
        self.max_iter = max_iter
        self.tol = tol
        self.random_state = random_state

    def fit(self, X, y):
        X = np.asarray(X)
        y = np.asarray(y).flatten()
        n_samples, m = X.shape

        # Use a baseline logistic regression for a good initialization.
        base_lr = LogisticRegression(solver='lbfgs', max_iter=1000, random_state=self.random_state)
        base_lr.fit(X, y)
        init_beta0 = base_lr.intercept_[0]
        init_coef = base_lr.coef_.flatten()  # shape: (m,)

        if self.choquet_type == '2-additive':
            # In the 2-additive model we optimize over:
            #   beta0 (scalar), phi (m parameters), and I (unique pairwise interactions)
            n_I = m * (m - 1) // 2
            # Initialize phi with the linear coefficients and I to zeros.
            init_params = np.concatenate(([init_beta0], init_coef, np.zeros(n_I)))
            # Precompute upper triangular indices (for pairwise interactions)
            iu = np.triu_indices(m, k=1)

            def objective(params):
                beta0 = params[0]
                phi = params[1:1+m]
                I_params = params[1+m:]
                # Reconstruct symmetric interaction matrix I
                I = np.zeros((m, m))
                idx = 0
                for j in range(m):
                    for k in range(j+1, m):
                        I[j, k] = I_params[idx]
                        I[k, j] = I_params[idx]
                        idx += 1
                # Singleton term: vectorized over samples
                singleton_term = X.dot(phi - 0.5 * np.sum(I, axis=1))
                # Pairwise term: vectorized over samples using broadcasting
                pairwise_min = np.minimum(X[:, :, None], X[:, None, :])  # shape: (n_samples, m, m)
                pairwise_term = np.sum(2 * pairwise_min[:, iu[0], iu[1]] * I[iu], axis=1)
                f_vals = singleton_term + pairwise_term
                z = self.gamma * (f_vals - beta0)
                p = expit(z)
                eps = 1e-10
                loss = -np.sum(y * np.log(p + eps) + (1 - y) * np.log(1 - p + eps))
                return loss

            res = minimize(objective, init_params, method=self.solver,
                           options={'maxiter': self.max_iter, 'disp': False}, tol=self.tol)
            self.beta0_ = res.x[0]
            self.phi_ = res.x[1:1+m]
            I_params = res.x[1+m:]
            # Rebuild the symmetric I matrix from I_params.
            I = np.zeros((m, m))
            idx = 0
            for j in range(m):
                for k in range(j+1, m):
                    I[j, k] = I_params[idx]
                    I[k, j] = I_params[idx]
                    idx += 1
            self.I_ = I
            self.m_ = m

        elif self.choquet_type == 'general':
            # The general formulation (full game-based Choquet integral) is feasible only for small m.
            if m > 10:
                raise ValueError("The 'general' version is computationally feasible only for m <= 10 features.")
            # Enumerate all non-empty subsets of {0, ..., m-1}
            subsets = list(all_nonempty_subsets(range(m)))
            self._subsets = [frozenset(s) for s in subsets]
            self._coalition_index = {coalition: idx for idx, coalition in enumerate(self._subsets)}
            n_v_params = len(self._subsets)
            # Initialize v for each coalition as the sum of the corresponding linear coefficients.
            init_v = np.array([np.sum([init_coef[j] for j in coalition]) for coalition in self._subsets])
            init_params = np.concatenate(([init_beta0], init_v))

            def objective(params):
                beta0 = params[0]
                v = params[1:]
                loss = 0.0
                # Loop over samples (n_samples is usually moderate when m is small)
                for i in range(n_samples):
                    x = X[i]
                    # Sort x (ascending order) and get the corresponding feature indices.
                    order = np.argsort(x)
                    x_sorted = x[order]
                    f_val = 0.0
                    prev = 0.0
                    # Compute the Choquet integral for sample i
                    for j in range(m):
                        coalition = frozenset(order[j:])
                        idx = self._coalition_index[coalition]
                        diff = x_sorted[j] - prev
                        prev = x_sorted[j]
                        f_val += diff * v[idx]
                    z = self.gamma * (f_val - beta0)
                    p = expit(z)
                    eps = 1e-10
                    loss -= y[i] * np.log(p + eps) + (1 - y[i]) * np.log(1 - p + eps)
                return loss

            res = minimize(objective, init_params, method=self.solver,
                           options={'maxiter': self.max_iter, 'disp': False}, tol=self.tol)
            self.beta0_ = res.x[0]
            self.v_ = res.x[1:]
        else:
            raise ValueError("choquet_type must be either 'general' or '2-additive'")

        return self

    def decision_function(self, X):
        X = np.asarray(X)
        n_samples, m = X.shape
        if self.choquet_type == '2-additive':
            # Compute singleton term for all samples.
            singleton_term = X.dot(self.phi_ - 0.5 * np.sum(self.I_, axis=1))
            # Compute pairwise term vectorized.
            iu = np.triu_indices(m, k=1)
            pairwise_min = np.minimum(X[:, :, None], X[:, None, :])
            pairwise_term = np.sum(2 * pairwise_min[:, iu[0], iu[1]] * self.I_[iu], axis=1)
            f_vals = singleton_term + pairwise_term
            scores = self.gamma * (f_vals - self.beta0_)
            return scores
        elif self.choquet_type == 'general':
            scores = np.zeros(n_samples)
            for i in range(n_samples):
                x = X[i]
                order = np.argsort(x)
                x_sorted = x[order]
                f_val = 0.0
                prev = 0.0
                for j in range(m):
                    coalition = frozenset(order[j:])
                    idx = self._coalition_index[coalition]
                    diff = x_sorted[j] - prev
                    prev = x_sorted[j]
                    f_val += diff * self.v_[idx]
                scores[i] = self.gamma * (f_val - self.beta0_)
            return scores
        else:
            raise ValueError("Invalid choquet_type.")

    def predict_proba(self, X):
        scores = self.decision_function(X)
        prob = expit(scores)
        return np.vstack([1 - prob, prob]).T

    def predict(self, X):
        prob = self.predict_proba(X)[:, 1]
        return (prob >= 0.5).astype(int)


## ML Regression

In [39]:
class MultilinearLogisticRegression(LogisticRegression):
    def __init__(self, additive='full', gamma=1.0, solver='lbfgs', max_iter=100, tol=1e-4):
        """
        additive: 'full' for unrestricted game-based aggregation, or '2-additive'
                  for the simplified variant.
        gamma: scaling factor (see Equations (20) and (24) in the paper).
        """
        self.additive = additive
        self.gamma = gamma
        super().__init__(solver=solver, max_iter=max_iter, tol=tol)
    
    def decision_function(self, X):
        f_val = super().decision_function(X)
        return self.gamma * f_val  # Placeholder for game-theoretic aggregation
    
    def predict_proba(self, X):
        scores = self.decision_function(X)
        proba = 1 / (1 + np.exp(-scores))
        return np.vstack([1 - proba, proba]).T

## Helping functions

In [40]:
def compute_confidence_interval(accuracies, confidence=0.95):
    """
    Compute the mean accuracy and its 95% confidence interval.
    """
    mean_acc = np.mean(accuracies)
    sem = st.sem(accuracies)
    margin = sem * st.t.ppf((1 + confidence) / 2, len(accuracies) - 1)
    return mean_acc, (mean_acc - margin, mean_acc + margin)

def simulate_training_accuracy(model, X, y, n_simulations=50, test_size=0.2, random_state=42):
    """
    Run multiple training simulations (splitting data into train/test)
    and return the mean training accuracy (in %) with its 95% confidence interval.
    """
    accuracies = []
    for i in range(n_simulations):
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=test_size, stratify=y, random_state=random_state + i
        )
        model.fit(X_train, y_train)
        acc = model.score(X_train, y_train)
        accuracies.append(acc * 100)  # Convert to percentage
    return compute_confidence_interval(accuracies)


## Results

In [41]:
from sklearn.datasets import load_breast_cancer
data = load_breast_cancer()
X, y = data.data, data.target

# Define models:
models = {
    "LR": LogisticRegression(solver='newton-cg', max_iter=100, tol=1e-4),
    "CR2add": ChoquisticRegression(choquet_type='2-additive', max_iter=100, tol=1e-4),
    "CR": ChoquisticRegression(choquet_type='general', max_iter=100, tol=1e-4),
    
    #"MLR2add": MultilinearLogisticRegression(additive='2-additive', solver='newton-cg', max_iter=100, tol=1e-4)
}

results = {}
for name, model in models.items():
    mean_acc, conf_interval = simulate_training_accuracy(model, X, y, n_simulations=2)
    results[name] = (mean_acc, conf_interval)
    print(f"{name}: {mean_acc:.2f}% {conf_interval}")

LR: 96.26% (np.float64(87.88601885509975), np.float64(104.64145367237278))


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


CR2add: 62.64% (np.float64(62.637362637362635), np.float64(62.637362637362635))


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


ValueError: The 'general' version is computationally feasible only for m <= 10 features.

Banknote

In [None]:
# Install dependencies as needed:
# pip install ucimlrepo
from ucimlrepo import fetch_ucirepo 
  
# fetch dataset 
banknote_authentication = fetch_ucirepo(id=267) 
  
# data (as pandas dataframes) 
X = banknote_authentication.data.features 
y = banknote_authentication.data.targets.values.ravel() # Ravel to onvert y to a 1D array

# Convert y to a 1D array
print(banknote_authentication.metadata) 
  
# variable information 
print(banknote_authentication.variables) 


results = {}
for name, model in models.items():
    mean_acc, conf_interval = simulate_training_accuracy(model, X, y, n_simulations=50)
    results[name] = (mean_acc, conf_interval)
    print(f"{name}: {mean_acc:.2f}% {conf_interval}") 

DatasetNotFoundError: Error reading data csv file for "Banknote Authentication" dataset (id=267).

Blood transfusion

Mammographic Mass

In [None]:
mammographic_mass = fetch_ucirepo(id=161) 
  
# data (as pandas dataframes) 
X = mammographic_mass.data.features 
y = mammographic_mass.data.targets 
  
results = {}
for name, model in models.items():
    mean_acc, conf_interval = simulate_training_accuracy(model, X, y, n_simulations=50)
    results[name] = (mean_acc, conf_interval)
    print(f"{name}: {mean_acc:.2f}% {conf_interval}")

ValueError: Input X contains NaN.
LogisticRegression does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values

Raisin

In [None]:
raisin = fetch_ucirepo(id=850) 
  
# data (as pandas dataframes) 
X = raisin.data.features 
y = raisin.data.targets 

results = {}
for name, model in models.items():
    mean_acc, conf_interval = simulate_training_accuracy(model, X, y, n_simulations=50)
    results[name] = (mean_acc, conf_interval)
    print(f"{name}: {mean_acc:.2f}% {conf_interval}")