In [1]:
import numpy as np
from sklearn.model_selection import train_test_split

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

import pandas as pd
from matplotlib import pyplot as plt
from tqdm import tqdm
import pdb

import sys
sys.path.append("..")
sys.path.append("../third_party")


from cln import data
from cln import contamination
from cln.utils import evaluate_predictions, estimate_rho

from cln.classification import MarginalLabelNoiseConformal
from cln.classification_label_conditional import LabelNoiseConformal

from third_party import arc

In [2]:
# Default experiment parameters, grouped by the part of the pipeline that reads them

# Bookkeeping: experiment number (part of the output path) and base random seed
exp_num = 1
seed = 1

# Synthetic data model: name, number of variables, number of classes, signal
data_name = "synthetic1"
num_var = 20
K = 4
signal = 1

# Sample sizes for the training and calibration sets
n_train = 5000
n_cal = 1000

# Label contamination: model name plus the parameters handed to its constructor
# (nu is only passed to the "RRB" construction)
contamination_model = "uniform"
epsilon = 0.1
nu = 0

# Black-box classifier ('RFC', 'SVC' or 'NN') and estimation option ("none" or "rho")
model_name = "SVC"
estimate = "none"

In [3]:
# Define other constant parameters
n_test = 2000                    # size of the held-out test set
batch_size = 10                  # number of repetitions run by the final loop
allow_empty = True               # forwarded to every conformal method
asymptotic_h_start = 1/400       # forwarded to the method="asymptotic" variants
asymptotic_MC_samples = 10000    # forwarded to the method="asymptotic" variants

# Initialize the data distribution
if data_name == "synthetic1":
    data_distribution = data.DataModel_1(K, num_var, signal=signal, random_state=seed)
elif data_name == "synthetic2":
    data_distribution = data.DataModel_2(K, num_var, signal=signal, random_state=seed)
elif data_name == "synthetic3":
    data_distribution = data.DataModel_3(K, num_var, signal=signal, random_state=seed)
else:
    # Raise instead of calling the interactive `exit` helper: that helper is not
    # meant for programs and does not reliably stop a notebook cell, which would
    # leave `data_distribution` undefined for the cells below.
    raise ValueError(
        "Unknown data distribution: {!r} (expected 'synthetic1', 'synthetic2' or 'synthetic3')".format(data_name)
    )

In [4]:
# Estimate the label proportions from the population model
rho = data_distribution.estimate_rho()

# Build the contamination matrix T for the requested contamination model
if contamination_model == "uniform":
    T = contamination.construct_T_matrix_simple(K, epsilon)
elif contamination_model == "block":
    T = contamination.construct_T_matrix_block(K, epsilon)
elif contamination_model == "RRB":
    T = contamination.construct_T_matrix_block_RR(K, epsilon, nu)
elif contamination_model == "random":
    T = contamination.construct_T_matrix_random(K, epsilon, random_state=seed)
else:
    # Raise instead of calling the interactive `exit` helper: that helper is not
    # meant for programs and does not reliably stop a notebook cell, which would
    # leave `T` undefined for the statements below.
    raise ValueError(
        "Unknown contamination model: {!r} (expected 'uniform', 'block', 'RRB' or 'random')".format(contamination_model)
    )

# M is derived from T and rho in the same way for every contamination model
M = contamination.convert_T_to_M(T, rho)

# Compute the contaminated label proportions
rho_tilde = np.dot(T, rho)

In [5]:
# Initialize black-box model
if model_name == 'RFC':
    black_box = arc.black_boxes.RFC(n_estimators=100, max_features="sqrt")
elif model_name == 'SVC':
    black_box = arc.black_boxes.SVC(clip_proba_factor = 1e-5)
elif model_name == 'NN':
    black_box = arc.black_boxes.NNet(max_iter=100)
else:
    # Raise instead of calling the interactive `exit` helper: that helper is not
    # meant for programs and does not reliably stop a notebook cell, which would
    # leave `black_box` undefined for the cells below.
    raise ValueError(
        "Unknown model: {!r} (expected 'RFC', 'SVC' or 'NN')".format(model_name)
    )

In [6]:
# Disabled scratch code: the whole cell body is wrapped in a string literal, so
# none of it executes; the cell only evaluates (and echoes) the string itself.
# The text sketches a single run of the pipeline with fixed seeds and two
# anchor-point estimates of the contamination matrix:
#   * T_hat       - for each class l, the predicted probability vector at the
#                   calibration point with the largest predicted probability of l;
#   * T_hat_gamma - the same idea, averaged over the top ceil(gamma * n_cal) points.
# It then prints T, T_hat and T_hat_gamma for comparison.
# NOTE(review): the quantile comment inside the string says "85th and 90th" while
# the commented-out code below it uses 0.9 and 0.95.
"""
# Generate a large data set
print("\nGenerating data...", end=' ')
sys.stdout.flush()
data_distribution.set_seed(1)
X_all, Y_all = data_distribution.sample(n_train+n_cal+n_test)
print("Done.")
sys.stdout.flush()
# Separate the test set
X, X_test, Y, Y_test = train_test_split(X_all, Y_all, test_size=n_test, random_state=2)

# Generate the contaminated labels
print("Generating contaminated labels...", end=' ')
sys.stdout.flush()
contamination_process = contamination.LinearContaminationModel(T, random_state=3)
Yt = contamination_process.sample_labels(Y)
print("Done.")
sys.stdout.flush()

# Apply standard method to corrupted labels (for training)
print("Training the predictive model...", end=' ')
sys.stdout.flush()
method_train = arc.methods.SplitConformal(X, Yt, black_box, K, 0.1, n_cal=n_cal, random_state=4)
print("Done.")
sys.stdout.flush()

# Extract the pre-trained model
black_box_pt = method_train.black_box

# Separate the training e calibration set
X_train, X_cal, Y_train, Y_cal = train_test_split(X, Y, test_size=n_cal, random_state=5)
# Estimate the label contamination process
p_hat_cal = black_box_pt.predict_proba(X_cal)
if not isinstance(p_hat_cal, np.ndarray):
    p_hat_cal = np.asarray(p_hat_cal)
_, K_out = p_hat_cal.shape
assert K_out == K, f"black_box_pt returned {K_out} classes, expected {K}"
# Estimate the contamination process using anchor points
T_hat = np.zeros((K, K), dtype=float)
x_bar_list = []

# Loop over classes l in {0, ..., K-1}
for l in range(K):
    # 1) Find x_bar^l = argmax_x p_hat(tilde{y}=l | x)
    #    This is the index in X_cal where class l has maximum probability
    idx_star = np.argmax(p_hat_cal[:, l])
    x_bar_l = X_cal[idx_star]

    # 2) T_hat_{k,l} = p_hat(tilde{y}=k | x_bar^l)
    #    This is simply the probability vector at that index
    T_hat[:, l] = p_hat_cal[idx_star, :]

    x_bar_list.append(x_bar_l)
# Alternative method
gamma = 0.005
if not (0 < gamma <= 1):
    raise ValueError("gamma must be in (0, 1].")

# Number of points to keep per class
m = max(1, int(np.ceil(gamma * n_cal)))

T_hat_gamma = np.zeros((K, K), dtype=float)
A_indices_list = []

for l in range(K):
    # Scores for class l
    scores_l = p_hat_cal[:, l]

    # Indices of top-m scores for class l
    top_idx = np.argsort(scores_l)[::-1][:m]

    # Indexes with scores between the 85th and the 90th quantile
    # low_q  = np.quantile(scores_l, 0.9)
    # high_q = np.quantile(scores_l, 0.95)
    # idx_between = np.where((scores_l >= low_q) & (scores_l <= high_q))[0]

    # Average the predicted probability vectors over these top examples
    T_hat_gamma[:, l] = p_hat_cal[top_idx, :].mean(axis=0)

    A_indices_list.append(top_idx)

print(T)

print(T_hat)

print(T_hat_gamma)
"""

'\n# Generate a large data set\nprint("\nGenerating data...", end=\' \')\nsys.stdout.flush()\ndata_distribution.set_seed(1)\nX_all, Y_all = data_distribution.sample(n_train+n_cal+n_test)\nprint("Done.")\nsys.stdout.flush()\n# Separate the test set\nX, X_test, Y, Y_test = train_test_split(X_all, Y_all, test_size=n_test, random_state=2)\n\n# Generate the contaminated labels\nprint("Generating contaminated labels...", end=\' \')\nsys.stdout.flush()\ncontamination_process = contamination.LinearContaminationModel(T, random_state=3)\nYt = contamination_process.sample_labels(Y)\nprint("Done.")\nsys.stdout.flush()\n\n# Apply standard method to corrupted labels (for training)\nprint("Training the predictive model...", end=\' \')\nsys.stdout.flush()\nmethod_train = arc.methods.SplitConformal(X, Yt, black_box, K, 0.1, n_cal=n_cal, random_state=4)\nprint("Done.")\nsys.stdout.flush()\n\n# Extract the pre-trained model\nblack_box_pt = method_train.black_box\n\n# Separate the training e calibration s

In [7]:
# One-row table of the experiment settings, later joined to the results
header = pd.DataFrame({
    'data': [data_name],
    'num_var': [num_var],
    'K': [K],
    'signal': [signal],
    'n_train': [n_train],
    'n_cal': [n_cal],
    'epsilon': [epsilon],
    'nu': [nu],
    'contamination': [contamination_model],
    'model_name': [model_name],
    'estimate': [estimate],
    'seed': [seed],
})

# Output file prefix: every setting is encoded in the name, one tagged piece each
outfile_prefix = "".join([
    "exp", str(exp_num), "/", data_name, "_p", str(num_var),
    "_K", str(K), "_signal", str(signal), "_", model_name,
    "_eps", str(epsilon), "_nu", str(nu), "_", contamination_model,
    "_nt", str(n_train), "_nc", str(n_cal), "_est", estimate, "_seed", str(seed),
])
print("Output file: {:s}.".format("results/" + outfile_prefix), end="\n")
sys.stdout.flush()

# Describe the experiment
def run_experiment(random_state):
    """Run one repetition of the experiment and return its evaluation table.

    Samples a fresh data set, corrupts the non-test labels with the
    contamination matrix ``T``, trains the black-box model on the corrupted
    labels, builds an anchor-point estimate ``T_hat`` of the contamination
    matrix, and evaluates the standard and noise-aware conformal methods on
    the test set (whose labels are not contaminated).

    Reads the notebook-level settings defined in earlier cells:
    ``data_distribution``, ``T``, ``rho_tilde``, ``black_box``, ``K``,
    ``epsilon``, ``estimate``, ``n_train``, ``n_cal``, ``n_test``,
    ``allow_empty``, ``asymptotic_h_start`` and ``asymptotic_MC_samples``.

    Parameters
    ----------
    random_state : int
        Seed for this repetition. It is used directly for the conformal
        methods and, with offsets +1/+2/+3, for the data sampling, the test
        split and the label contamination.

    Returns
    -------
    pandas.DataFrame
        The concatenated ``evaluate_predictions`` outputs for every
        (alpha, guarantee, method) combination, with the added columns
        ``Method``, ``Guarantee``, ``Alpha`` and ``random_state``.

    Raises
    ------
    ValueError
        If ``estimate`` is neither ``"none"`` nor ``"rho"``.
    """
    print("\nRunning experiment in batch {:d}...".format(random_state))
    sys.stdout.flush()

    # Generate a large data set
    print("\nGenerating data...", end=' ')
    sys.stdout.flush()
    data_distribution.set_seed(random_state+1)
    X_all, Y_all = data_distribution.sample(n_train+n_cal+n_test)
    print("Done.")
    sys.stdout.flush()

    # Separate the test set
    X, X_test, Y, Y_test = train_test_split(X_all, Y_all, test_size=n_test, random_state=random_state+2)

    # Generate the contaminated labels
    print("Generating contaminated labels...", end=' ')
    sys.stdout.flush()
    contamination_process = contamination.LinearContaminationModel(T, random_state=random_state+3)
    Yt = contamination_process.sample_labels(Y)
    print("Done.")
    sys.stdout.flush()

    # Estimate (if applicable) the label contamination model
    if estimate == "none":
        rho_tilde_hat = rho_tilde
    elif estimate == "rho":
        rho_tilde_hat = estimate_rho(Yt, K)
    else:
        # Raise rather than call the interactive `exit` helper, which is not
        # meant for programs and does not reliably abort a notebook cell.
        raise ValueError(
            "Unknown estimation option: {!r} (expected 'none' or 'rho')".format(estimate)
        )

    # Apply standard method to corrupted labels (for training)
    print("Training the predictive model...", end=' ')
    sys.stdout.flush()
    method_train = arc.methods.SplitConformal(X, Yt, black_box, K, 0.1, n_cal=n_cal, random_state=random_state)
    print("Done.")
    sys.stdout.flush()

    # Extract the pre-trained model
    black_box_pt = method_train.black_box

    # Estimate the contamination process using anchor points.
    # The split below reuses the test_size/random_state given to SplitConformal.
    # NOTE(review): this only recovers the model's own calibration points if
    # SplitConformal splits its inputs with train_test_split in the same way -
    # confirm against arc.methods.SplitConformal.
    _, X_cal, _, _ = train_test_split(X, Y, test_size=n_cal, random_state=random_state)
    p_hat_cal = black_box_pt.predict_proba(X_cal)
    if not isinstance(p_hat_cal, np.ndarray):
        p_hat_cal = np.asarray(p_hat_cal)
    _, K_out = p_hat_cal.shape
    assert K_out == K, f"black_box_pt returned {K_out} classes, expected {K}"

    # Column l of T_hat is the predicted probability vector at the calibration
    # point where class l receives its largest predicted probability.
    T_hat = np.zeros((K, K), dtype=float)
    for l in range(K):
        idx_star = np.argmax(p_hat_cal[:, l])
        T_hat[:, l] = p_hat_cal[idx_star, :]

    # Keyword arguments shared by every noise-aware method
    common_kwargs = dict(n_cal=n_cal, epsilon=epsilon, rho_tilde=rho_tilde_hat,
                         allow_empty=allow_empty, verbose=False,
                         pre_trained=True, random_state=random_state)
    # Keyword arguments specific to each family of noise-aware methods
    improved_kwargs = dict(method="improved", optimized=True)
    asymptotic_kwargs = dict(method="asymptotic",
                             asymptotic_h_start=asymptotic_h_start,
                             asymptotic_MC_samples=asymptotic_MC_samples)

    # (method name, contamination matrix, family kwargs, optimistic flag).
    # The "AP" rows use the anchor-point estimate T_hat instead of the true T.
    noise_aware_specs = [
        ("Adaptive optimized",     T,     improved_kwargs,   False),
        ("Adaptive optimized+",    T,     improved_kwargs,   True),
        ("Asymptotic",             T,     asymptotic_kwargs, False),
        ("Asymptotic+",            T,     asymptotic_kwargs, True),
        ("Adaptive optimized AP",  T_hat, improved_kwargs,   False),
        ("Adaptive optimized+ AP", T_hat, improved_kwargs,   True),
        ("Asymptotic AP",          T_hat, asymptotic_kwargs, False),
        ("Asymptotic+ AP",         T_hat, asymptotic_kwargs, True),
    ]

    def noise_aware_factory(alpha, T_matrix, family_kwargs, optimistic):
        """Return a zero-argument callable that builds one noise-aware method."""
        def build():
            return MarginalLabelNoiseConformal(X, Yt, black_box_pt, K, alpha,
                                               T=T_matrix, optimistic=optimistic,
                                               **family_kwargs, **common_kwargs)
        return build

    # Evaluation results are accumulated across ALL (alpha, guarantee)
    # combinations, so adding a level or guarantee does not drop earlier rows.
    res_list = []
    for alpha in [0.1]:
        #for guarantee in ['lab-cond', 'marginal']:
        for guarantee in ['marginal']:
            print("\nSeeking {:s} coverage at level {:.2f}.".format(guarantee, 1-alpha))

            # Only the marginal guarantee is enabled, so the standard method
            # is never run in label-conditional mode.
            label_conditional = False

            # Methods are stored as factories and built lazily, in this order
            methods = {
                "Standard": lambda: arc.methods.SplitConformal(X, Yt, black_box_pt, K, alpha, n_cal=n_cal,
                                                               label_conditional=label_conditional, allow_empty=allow_empty,
                                                               pre_trained=True, random_state=random_state),
            }
            for spec_name, T_matrix, family_kwargs, optimistic in noise_aware_specs:
                methods[spec_name] = noise_aware_factory(alpha, T_matrix, family_kwargs, optimistic)

            # Loop through the methods, apply them, and evaluate the results
            for method_name, method_func in methods.items():
                print(f"Applying {method_name} method...", end=' ')
                sys.stdout.flush()

                # Initialize and apply the method
                method = method_func()
                predictions = method.predict(X_test)

                print("Done.")
                sys.stdout.flush()

                # Evaluate the method
                res_new = evaluate_predictions(predictions, X_test, Y_test, K, verbose=False)
                res_new['Method'] = method_name
                res_new['Guarantee'] = guarantee
                res_new['Alpha'] = alpha
                res_new['random_state'] = random_state

                # Append the result to the results list
                res_list.append(res_new)

    # Combine all results into a single DataFrame
    res = pd.concat(res_list) if res_list else pd.DataFrame({})

    print(res)

    return res

Output file: results/exp1/synthetic1_p20_K4_signal1_SVC_eps0.1_nu0_uniform_nt5000_nc1000_estnone_seed1.


In [8]:
# Run all experiments

# The output path does not depend on the batch, so it is defined once up front;
# this also keeps it available for the final message if the loop runs zero times.
outfile = "results/" + outfile_prefix + ".txt"

batch_results = []           # one evaluation table per completed batch
results = pd.DataFrame({})
for batch in np.arange(1,batch_size+1):
    res = run_experiment(1000*seed+batch-1000)
    batch_results.append(res)
    results = pd.concat(batch_results)

    # Save results after every batch, so a partial run still leaves a file.
    # The one-row header is repeated explicitly for every result row instead of
    # relying on index alignment between a single-row frame and a frame whose
    # index labels are duplicated.
    header_rows = header.loc[header.index.repeat(len(results))].reset_index(drop=True)
    results_out = pd.concat([header_rows, results.reset_index(drop=True)], axis=1)
    results_out.to_csv(outfile, index=False, float_format="%.5f")

print("\nPreview of results:")
print(results)
sys.stdout.flush()

print("\nSummary of results:")
summary = results.groupby(['Alpha', 'Guarantee', 'Method', 'Label']).agg(['mean','std']).reset_index()
print(summary)
sys.stdout.flush()


print("\nFinished.\nResults written to {:s}\n".format(outfile))
sys.stdout.flush()


Running experiment in batch 1...

Generating data... Done.
Generating contaminated labels... Done.
Training the predictive model... Done.

Seeking marginal coverage at level 0.90.
Applying Standard method... Done.
Applying Adaptive optimized method... Done.
Applying Adaptive optimized+ method... Done.
Applying Asymptotic method... Done.
Applying Asymptotic+ method... Done.
Applying Adaptive optimized AP method... Done.
Applying Adaptive optimized+ AP method... Done.
Applying Asymptotic AP method... Done.
Applying Asymptotic+ AP method... Done.
   Coverage    Size     Label                  Method Guarantee  Alpha   
0    0.9235  2.2655  marginal                Standard  marginal    0.1  \
0    0.9085  2.1420  marginal      Adaptive optimized  marginal    0.1   
0    0.9085  2.1420  marginal     Adaptive optimized+  marginal    0.1   
0    0.9065  2.1320  marginal              Asymptotic  marginal    0.1   
0    0.9065  2.1320  marginal             Asymptotic+  marginal    0.1   
0    