# Experiment 1

## Randomized response model

In [1]:
import os
import pdb
import sys
import warnings

warnings.simplefilter(action='ignore', category=FutureWarning)

import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split
from tqdm import tqdm

# Make the project packages importable from the notebook folder
sys.path.append("..")
sys.path.append("../third_party")

from cln import data
from cln import contamination
from cln.utils import evaluate_predictions, estimate_rho

from cln.classification import MarginalLabelNoiseConformal

from third_party import arc

In [18]:
# ---- Default parameters of the experiment ----
exp_num = 1                      # used to build the output sub-folder name ("exp<exp_num>")
seed = 1                         # base seed for the data model and the repetitions

# ---- Simulated data ----
data_name = 'synthetic1'         # selects the DataModel_* class
K = 4                            # passed to the data model and to every method (presumably the number of classes)
num_var = 10
signal = 1
n_train = 1000

# Calibration sample sizes to sweep over.
# An extended grid (adding 50000 and 100000) is currently disabled.
n_cal_vals = [100, 500, 1000, 2000, 5000, 10000, 20000]

# ---- Label contamination ----
contamination_model = "uniform"  # selects how the transition matrix T is built
epsilon = 0.1
nu = "none"                      # only forwarded to the "RRB" construction of T
estimate = "none"

# ---- Predictive model ----
model_name = 'RFC'

In [19]:
# Constants shared by every run in this section
allow_empty = True   # forwarded to the conformal methods
batch_size = 5       # number of repetitions executed for each calibration size
n_test = 2000        # number of samples held out as the test set

In [20]:
# Initialize the data distribution selected by `data_name`
if data_name == "synthetic1":
    data_distribution = data.DataModel_1(K, num_var, signal=signal, random_state=seed)
elif data_name == "synthetic2":
    data_distribution = data.DataModel_2(K, num_var, signal=signal, random_state=seed)
elif data_name == "synthetic3":
    data_distribution = data.DataModel_3(K, num_var, signal=signal, random_state=seed)
else:
    # Raise instead of print + exit(): in a notebook exit() targets the kernel
    # rather than reporting an error in this cell.
    raise ValueError("Unknown data distribution! Got data_name={!r}".format(data_name))

In [21]:
# Estimate the label proportions from the population model.
# `rho` is reused below to build M and the contaminated proportions rho_tilde.
rho = data_distribution.estimate_rho()

In [22]:
# Initialize noise contamination process: build the transition matrix T
# for the selected contamination model
if contamination_model == "uniform":
    T = contamination.construct_T_matrix_simple(K, epsilon)
elif contamination_model == "block":
    T = contamination.construct_T_matrix_block(K, epsilon)
elif contamination_model == "RRB":
    T = contamination.construct_T_matrix_block_RR(K, epsilon, nu)
elif contamination_model == "random":
    T = contamination.construct_T_matrix_random(K, epsilon, random_state=seed)
else:
    # Raise instead of print + exit(): in a notebook exit() targets the kernel
    # rather than reporting an error in this cell.
    raise ValueError("Unknown contamination (M) model! Got contamination_model={!r}".format(contamination_model))

# M is derived from T and rho in the same way for every contamination model
M = contamination.convert_T_to_M(T,rho)

In [23]:
# Compute the contaminated label proportions by applying T to the clean proportions.
# NOTE(review): presumably T[k, l] = P(noisy label = k | true label = l) — confirm against cln.contamination.
rho_tilde = np.dot(T, rho)

In [24]:
# Initialize the black-box classifier selected by `model_name`
if model_name == 'RFC':
    black_box = arc.black_boxes.RFC(n_estimators=100)
elif model_name == 'SVC':
    black_box = arc.black_boxes.SVC(clip_proba_factor = 1e-5)
elif model_name == 'NN':
    black_box = arc.black_boxes.NNet(max_iter=100)
else:
    # Raise instead of print + exit(): in a notebook exit() targets the kernel
    # rather than reporting an error in this cell.
    raise ValueError("Unknown model! Got model_name={!r}".format(model_name))

In [25]:
for n_cal in n_cal_vals:
    # Add important parameters to table of results
    header = pd.DataFrame({'data':[data_name], 'num_var':[num_var], 'K':[K],
                            'signal':[signal], 'n_train':[n_train], 'n_cal':[n_cal],
                            'epsilon':[epsilon], 'nu':[nu],'contamination':[contamination_model],
                            'model_name':[model_name], 'estimate':[estimate], 'seed':[seed]})
    # Output file
    outfile_prefix = "exp"+str(exp_num) + "/" + data_name + "_p" + str(num_var)
    outfile_prefix += "_K" + str(K) + "_signal" + str(signal) + "_" + model_name
    outfile_prefix += "_eps" + str(epsilon) + "_nu" + str(nu) + "_" + contamination_model
    outfile_prefix += "_nt" + str(n_train) + "_nc" + str(n_cal) + "_est" + estimate + "_seed" + str(seed)
    outfile = "results/" + outfile_prefix + ".txt"
    # Report the location that is actually written below (under "results/")
    print("Output file: {:s}.".format("results/"+outfile_prefix), end="\n")
    sys.stdout.flush()
    # Make sure the destination folder exists before the first save
    os.makedirs(os.path.dirname(outfile), exist_ok=True)

    # Describe the experiment
    def run_experiment(random_state):
        """Run one repetition of the experiment and return its evaluation table.

        Samples n_train+n_cal+n_test points, holds out n_test of them as the
        test set, contaminates the labels of the remaining points using T,
        trains the predictive model through arc.methods.SplitConformal on the
        contaminated data, and evaluates four conformal methods on the test
        set (whose labels are not contaminated).

        Reads n_cal and the other experiment settings from the enclosing
        notebook namespace.

        Parameters
        ----------
        random_state : int
            Base seed of this repetition; offsets of it seed the data sampling,
            the train/test split and the label contamination.

        Returns
        -------
        pandas.DataFrame
            Output of evaluate_predictions for every method, with the added
            columns 'Method', 'Guarantee', 'Alpha' and 'random_state'.

        Raises
        ------
        ValueError
            If `estimate` is anything other than "none".
        """
        print("\nRunning experiment in batch {:d}...".format(random_state), flush=True)

        # Generate a large data set
        print("\nGenerating data...", end=' ', flush=True)
        data_distribution.set_seed(random_state+1)
        X_all, Y_all = data_distribution.sample(n_train+n_cal+n_test)
        print("Done.", flush=True)

        # Separate the test set
        X, X_test, Y, Y_test = train_test_split(X_all, Y_all, test_size=n_test, random_state=random_state+2)

        # Generate the contaminated labels
        print("Generating contaminated labels...", end=' ', flush=True)
        contamination_process = contamination.LinearContaminationModel(T, random_state=random_state+3)
        Yt = contamination_process.sample_labels(Y)
        print("Done.", flush=True)

        # Estimate (if applicable) the label contamination model
        if estimate=="none":
            rho_tilde_hat = rho_tilde
        elif estimate=="rho":
            # Raise instead of print + exit(): in a notebook exit() targets the
            # kernel rather than reporting an error in this cell.
            raise ValueError("The model must be known at this stage")
        else:
            raise ValueError("Unknown estimation option!")

        # Apply standard method to corrupted labels (for training)
        print("Training the predictive model...", end=' ', flush=True)
        method_train = arc.methods.SplitConformal(X, Yt, black_box, K, 0.1, n_cal=n_cal, random_state=random_state)
        print("Done.", flush=True)

        # Extract the pre-trained model
        black_box_pt = method_train.black_box

        res_list = []
        for alpha in [0.1]:
            for guarantee in ['marginal']:

                print("\nSeeking {:s} coverage at level {:.2f}.".format(guarantee, 1-alpha))

                # Keyword arguments shared by the three label-noise-aware variants
                ln_common = dict(n_cal=n_cal, epsilon=epsilon, T=T, rho_tilde=rho_tilde_hat,
                                 allow_empty=allow_empty, asymptotic=False, verbose=False,
                                 pre_trained=True, random_state=random_state)

                # Apply standard method to corrupted labels
                print("Applying standard method (with model training)...", end=' ', flush=True)
                method_sc = arc.methods.SplitConformal(X, Yt, black_box_pt, K, alpha, n_cal=n_cal, label_conditional=False,
                                                       allow_empty=allow_empty, pre_trained=True, random_state=random_state)
                S_sc = method_sc.predict(X_test)
                print("Done.", flush=True)

                # Apply old adaptive method to corrupted labels (the only variant that receives M)
                print("Applying adaptive method...", end=' ', flush=True)
                method_ln = MarginalLabelNoiseConformal(X, Yt, black_box_pt, K, alpha, M=M,
                                                        improved=False, optimized=False, **ln_common)
                S_ln = method_ln.predict(X_test)
                print("Done.", flush=True)

                # Apply optimized adaptive method to corrupted labels
                print("Applying optimized adaptive method...", end=' ', flush=True)
                method_ln_imp = MarginalLabelNoiseConformal(X, Yt, black_box_pt, K, alpha,
                                                            improved=True, optimized=True, **ln_common)
                S_ln_imp = method_ln_imp.predict(X_test)
                print("Done.", flush=True)

                # Apply simplified adaptive method to corrupted labels
                print("Applying simplified adaptive method...", end=' ', flush=True)
                method_ln_imp_simpl = MarginalLabelNoiseConformal(X, Yt, black_box_pt, K, alpha,
                                                                  improved=True, optimized=False, **ln_common)
                S_ln_imp_simpl = method_ln_imp_simpl.predict(X_test)
                print("Done.", flush=True)

                # Evaluate methods
                res_sc = evaluate_predictions(S_sc, X_test, Y_test, K, verbose=False)
                res_ln = evaluate_predictions(S_ln, X_test, Y_test, K, verbose=False)
                res_ln_imp = evaluate_predictions(S_ln_imp, X_test, Y_test, K, verbose=False)
                res_ln_imp_simpl = evaluate_predictions(S_ln_imp_simpl, X_test, Y_test, K, verbose=False)

                # Combine results
                res_sc['Method'] = "Standard"
                res_ln['Method'] = "Adaptive"
                res_ln_imp['Method'] = "Adaptive optimized"
                res_ln_imp_simpl['Method'] = "Adaptive simplified"
                res_new = pd.concat([res_sc, res_ln, res_ln_imp, res_ln_imp_simpl])

                res_new['Guarantee'] = guarantee
                res_new['Alpha'] = alpha
                res_new['random_state'] = random_state
                res_list.append(res_new)

        # Concatenate once instead of growing a frame from an empty seed
        res = pd.concat(res_list)

        print(res)

        return res

    # Run all experiments
    results = pd.DataFrame({})
    batch_results = []
    for batch in np.arange(1,batch_size+1):
        res = run_experiment(1000*seed+batch-1000)
        batch_results.append(res)
        results = pd.concat(batch_results)

        # Save results after every batch, so the file always holds the batches finished so far
        # NOTE(review): header has a single row while results has many; this column-wise
        # concat relies on index alignment to attach the header values — confirm the
        # written file has the header repeated on every row.
        results_out = pd.concat([header,results], axis=1)
        results_out.to_csv(outfile, index=False, float_format="%.5f")

    print(n_cal)

Output file: results_improved/exp1/synthetic1_p10_K4_signal1_RFC_eps0.1_nunone_uniform_nt1000_nc100_estnone_seed1.

Running experiment in batch 1...

Generating data... Done.
Generating contaminated labels... Done.
Training the predictive model... Done.

Seeking marginal coverage at level 0.90.
Applying standard method (with model training)... Done.
Applying adaptive method... Done.
Applying optimized adaptive method... Done.
Applying simplified adaptive method... Done.
   Coverage      Size     Label               Method Guarantee  Alpha   
0  0.982000  2.740000  marginal             Standard  marginal    0.1  \
0  0.991853  2.753564         0             Standard  marginal    0.1   
0  0.984064  2.798805         1             Standard  marginal    0.1   
0  0.969754  2.758034         2             Standard  marginal    0.1   
0  0.983264  2.644351         3             Standard  marginal    0.1   
0  1.000000  4.000000  marginal             Adaptive  marginal    0.1   
0  1.000000  4


## Two-level randomized response model

In [None]:
# ---- Default parameters of the experiment ----
exp_num = 1                      # used to build the output sub-folder name ("exp<exp_num>")
seed = 1                         # base seed for the data model and the repetitions

# ---- Simulated data ----
data_name = 'synthetic1'         # selects the DataModel_* class
K = 16                           # passed to the data model and to every method (presumably the number of classes)
num_var = 10
signal = 1
n_train = 1000

# Calibration sample sizes to sweep over.
# An extended grid (adding 50000 and 100000) is currently disabled.
n_cal_vals = [100, 500, 1000, 2000, 5000, 10000, 20000]

# ---- Label contamination ----
contamination_model = "RRB"      # selects how the transition matrix T is built
epsilon = 0.1
nu = 0.2                         # forwarded to the "RRB" construction of T
estimate = "none"

# ---- Predictive model ----
model_name = 'RFC'

In [27]:
# Constants shared by every run in this section
allow_empty = True   # forwarded to the conformal methods
batch_size = 5       # number of repetitions executed for each calibration size
n_test = 2000        # number of samples held out as the test set

In [28]:
# Initialize the data distribution selected by `data_name`
if data_name == "synthetic1":
    data_distribution = data.DataModel_1(K, num_var, signal=signal, random_state=seed)
elif data_name == "synthetic2":
    data_distribution = data.DataModel_2(K, num_var, signal=signal, random_state=seed)
elif data_name == "synthetic3":
    data_distribution = data.DataModel_3(K, num_var, signal=signal, random_state=seed)
else:
    # Raise instead of print + exit(): in a notebook exit() targets the kernel
    # rather than reporting an error in this cell.
    raise ValueError("Unknown data distribution! Got data_name={!r}".format(data_name))

In [29]:
# Estimate the label proportions from the population model.
# `rho` is reused below to build M and the contaminated proportions rho_tilde.
rho = data_distribution.estimate_rho()

In [30]:
# Initialize noise contamination process: build the transition matrix T
# for the selected contamination model
if contamination_model == "uniform":
    T = contamination.construct_T_matrix_simple(K, epsilon)
elif contamination_model == "block":
    T = contamination.construct_T_matrix_block(K, epsilon)
elif contamination_model == "RRB":
    T = contamination.construct_T_matrix_block_RR(K, epsilon, nu)
elif contamination_model == "random":
    T = contamination.construct_T_matrix_random(K, epsilon, random_state=seed)
else:
    # Raise instead of print + exit(): in a notebook exit() targets the kernel
    # rather than reporting an error in this cell.
    raise ValueError("Unknown contamination (M) model! Got contamination_model={!r}".format(contamination_model))

# M is derived from T and rho in the same way for every contamination model
M = contamination.convert_T_to_M(T,rho)

In [31]:
# Compute the contaminated label proportions by applying T to the clean proportions.
# NOTE(review): presumably T[k, l] = P(noisy label = k | true label = l) — confirm against cln.contamination.
rho_tilde = np.dot(T, rho)

In [32]:
# Initialize the black-box classifier selected by `model_name`
if model_name == 'RFC':
    black_box = arc.black_boxes.RFC(n_estimators=100)
elif model_name == 'SVC':
    black_box = arc.black_boxes.SVC(clip_proba_factor = 1e-5)
elif model_name == 'NN':
    black_box = arc.black_boxes.NNet(max_iter=100)
else:
    # Raise instead of print + exit(): in a notebook exit() targets the kernel
    # rather than reporting an error in this cell.
    raise ValueError("Unknown model! Got model_name={!r}".format(model_name))

In [33]:
for n_cal in n_cal_vals:
    # Add important parameters to table of results
    header = pd.DataFrame({'data':[data_name], 'num_var':[num_var], 'K':[K],
                            'signal':[signal], 'n_train':[n_train], 'n_cal':[n_cal],
                            'epsilon':[epsilon], 'nu':[nu],'contamination':[contamination_model],
                            'model_name':[model_name], 'estimate':[estimate], 'seed':[seed]})
    # Output file
    outfile_prefix = "exp"+str(exp_num) + "/" + data_name + "_p" + str(num_var)
    outfile_prefix += "_K" + str(K) + "_signal" + str(signal) + "_" + model_name
    outfile_prefix += "_eps" + str(epsilon) + "_nu" + str(nu) + "_" + contamination_model
    outfile_prefix += "_nt" + str(n_train) + "_nc" + str(n_cal) + "_est" + estimate + "_seed" + str(seed)
    outfile = "results/" + outfile_prefix + ".txt"
    print("Output file: {:s}.".format("results/"+outfile_prefix), end="\n")
    sys.stdout.flush()
    # Make sure the destination folder exists before the first save
    os.makedirs(os.path.dirname(outfile), exist_ok=True)

    # Describe the experiment
    def run_experiment(random_state):
        """Run one repetition of the experiment and return its evaluation table.

        Samples n_train+n_cal+n_test points, holds out n_test of them as the
        test set, contaminates the labels of the remaining points using T,
        trains the predictive model through arc.methods.SplitConformal on the
        contaminated data, and evaluates four conformal methods on the test
        set (whose labels are not contaminated).

        Reads n_cal and the other experiment settings from the enclosing
        notebook namespace.

        Parameters
        ----------
        random_state : int
            Base seed of this repetition; offsets of it seed the data sampling,
            the train/test split and the label contamination.

        Returns
        -------
        pandas.DataFrame
            Output of evaluate_predictions for every method, with the added
            columns 'Method', 'Guarantee', 'Alpha' and 'random_state'.

        Raises
        ------
        ValueError
            If `estimate` is anything other than "none".
        """
        print("\nRunning experiment in batch {:d}...".format(random_state), flush=True)

        # Generate a large data set
        print("\nGenerating data...", end=' ', flush=True)
        data_distribution.set_seed(random_state+1)
        X_all, Y_all = data_distribution.sample(n_train+n_cal+n_test)
        print("Done.", flush=True)

        # Separate the test set
        X, X_test, Y, Y_test = train_test_split(X_all, Y_all, test_size=n_test, random_state=random_state+2)

        # Generate the contaminated labels
        print("Generating contaminated labels...", end=' ', flush=True)
        contamination_process = contamination.LinearContaminationModel(T, random_state=random_state+3)
        Yt = contamination_process.sample_labels(Y)
        print("Done.", flush=True)

        # Estimate (if applicable) the label contamination model
        if estimate=="none":
            rho_tilde_hat = rho_tilde
        elif estimate=="rho":
            # Raise instead of print + exit(): in a notebook exit() targets the
            # kernel rather than reporting an error in this cell.
            raise ValueError("The model must be known at this stage")
        else:
            raise ValueError("Unknown estimation option!")

        # Apply standard method to corrupted labels (for training)
        print("Training the predictive model...", end=' ', flush=True)
        method_train = arc.methods.SplitConformal(X, Yt, black_box, K, 0.1, n_cal=n_cal, random_state=random_state)
        print("Done.", flush=True)

        # Extract the pre-trained model
        black_box_pt = method_train.black_box

        res_list = []
        for alpha in [0.1]:
            for guarantee in ['marginal']:

                print("\nSeeking {:s} coverage at level {:.2f}.".format(guarantee, 1-alpha))

                # Keyword arguments shared by the three label-noise-aware variants
                ln_common = dict(n_cal=n_cal, epsilon=epsilon, T=T, rho_tilde=rho_tilde_hat,
                                 allow_empty=allow_empty, asymptotic=False, verbose=False,
                                 pre_trained=True, random_state=random_state)

                # Apply standard method to corrupted labels
                print("Applying standard method (with model training)...", end=' ', flush=True)
                method_sc = arc.methods.SplitConformal(X, Yt, black_box_pt, K, alpha, n_cal=n_cal, label_conditional=False,
                                                       allow_empty=allow_empty, pre_trained=True, random_state=random_state)
                S_sc = method_sc.predict(X_test)
                print("Done.", flush=True)

                # Apply old adaptive method to corrupted labels (the only variant that receives M)
                print("Applying adaptive method...", end=' ', flush=True)
                method_ln = MarginalLabelNoiseConformal(X, Yt, black_box_pt, K, alpha, M=M,
                                                        improved=False, optimized=False, **ln_common)
                S_ln = method_ln.predict(X_test)
                print("Done.", flush=True)

                # Apply optimized adaptive method to corrupted labels
                print("Applying optimized adaptive method...", end=' ', flush=True)
                method_ln_imp = MarginalLabelNoiseConformal(X, Yt, black_box_pt, K, alpha,
                                                            improved=True, optimized=True, **ln_common)
                S_ln_imp = method_ln_imp.predict(X_test)
                print("Done.", flush=True)

                # Apply simplified adaptive method to corrupted labels
                print("Applying simplified adaptive method...", end=' ', flush=True)
                method_ln_imp_simpl = MarginalLabelNoiseConformal(X, Yt, black_box_pt, K, alpha,
                                                                  improved=True, optimized=False, **ln_common)
                S_ln_imp_simpl = method_ln_imp_simpl.predict(X_test)
                print("Done.", flush=True)

                # Evaluate methods
                res_sc = evaluate_predictions(S_sc, X_test, Y_test, K, verbose=False)
                res_ln = evaluate_predictions(S_ln, X_test, Y_test, K, verbose=False)
                res_ln_imp = evaluate_predictions(S_ln_imp, X_test, Y_test, K, verbose=False)
                res_ln_imp_simpl = evaluate_predictions(S_ln_imp_simpl, X_test, Y_test, K, verbose=False)

                # Combine results
                res_sc['Method'] = "Standard"
                res_ln['Method'] = "Adaptive"
                res_ln_imp['Method'] = "Adaptive optimized"
                res_ln_imp_simpl['Method'] = "Adaptive simplified"
                res_new = pd.concat([res_sc, res_ln, res_ln_imp, res_ln_imp_simpl])

                res_new['Guarantee'] = guarantee
                res_new['Alpha'] = alpha
                res_new['random_state'] = random_state
                res_list.append(res_new)

        # Concatenate once instead of growing a frame from an empty seed
        res = pd.concat(res_list)

        print(res)

        return res

    # Run all experiments
    results = pd.DataFrame({})
    batch_results = []
    for batch in np.arange(1,batch_size+1):
        res = run_experiment(1000*seed+batch-1000)
        batch_results.append(res)
        results = pd.concat(batch_results)

        # Save results after every batch, so the file always holds the batches finished so far
        # NOTE(review): header has a single row while results has many; this column-wise
        # concat relies on index alignment to attach the header values — confirm the
        # written file has the header repeated on every row.
        results_out = pd.concat([header,results], axis=1)
        results_out.to_csv(outfile, index=False, float_format="%.5f")

    print(n_cal)

Output file: results/exp1/synthetic1_p10_K16_signal1_RFC_eps0.1_nu0.2_RRB_nt1000_nc100_estnone_seed1.

Running experiment in batch 1...

Generating data... Done.
Generating contaminated labels... Done.
Training the predictive model... Done.

Seeking marginal coverage at level 0.90.
Applying standard method (with model training)... Done.
Applying adaptive method... Done.
Applying optimized adaptive method... Done.
Applying simplified adaptive method... Done.
    Coverage       Size     Label               Method Guarantee  Alpha   
0   0.969500  11.279000  marginal             Standard  marginal    0.1  \
0   0.992481  11.526316         0             Standard  marginal    0.1   
0   1.000000  11.185484         1             Standard  marginal    0.1   
0   0.992063  11.293651         2             Standard  marginal    0.1   
0   0.970588  11.198529         3             Standard  marginal    0.1   
..       ...        ...       ...                  ...       ...    ...   
0   0.976923 