# Experiment 1

Let us make a first attempt at running the experiments.

In [75]:
import numpy as np
from sklearn.model_selection import train_test_split

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

import pandas as pd
from matplotlib import pyplot as plt
from tqdm import tqdm
import pdb

import os
import sys
sys.path.append("..")
sys.path.append("../third_party")


from cln import data
from cln import contamination
from cln.utils import evaluate_predictions, estimate_rho

from cln.classification import LabelNoiseConformal

from third_party import arc

In [96]:
# Default experiment parameters.

# --- Identification of the run -------------------------------------------
exp_num = 1    # used to build the output sub-directory name ("exp" + exp_num)
seed = 1       # base seed; also part of the output file name

# --- Data-generating model -----------------------------------------------
data_name = "synthetic1"   # handled values: "synthetic1", "synthetic2", "synthetic3"
num_var = 20               # passed to the data model together with K and signal
K = 10                     # presumably the number of classes -- TODO confirm in cln.data
signal = 1

# --- Label contamination -------------------------------------------------
epsilon = 0.2                    # passed to the contamination.construct_T_matrix_* helpers
contamination_model = "random"   # handled values: "uniform", "random", "block"

# --- Predictive model and estimation option ------------------------------
model_name = "RFC"   # handled values: "RFC", "SVC", "NN"
estimate = "none"    # "none" or "rho" (see the estimation step in run_experiment)

# --- Sample sizes --------------------------------------------------------
n_train = 1000
# Calibration-set sizes swept by the experiment loop (a single n_cal = 500 was used before).
n_cal_vals = [500, 1000, 2000, 5000, 10000, 20000, 50000, 100000]

In [97]:
# Settings that stay fixed across all runs:
#   n_test      -- number of samples held out by train_test_split for evaluation
#   batch_size  -- number of replications run (and saved) per calibration size
#   allow_empty -- forwarded unchanged to the conformal predictors
n_test, batch_size, allow_empty = 2000, 5, True

In [98]:
# Initialize the data distribution selected by `data_name`.
# Every model receives the same arguments (K, num_var, signal, seed).
if data_name == "synthetic1":
    data_distribution = data.DataModel_1(K, num_var, signal=signal, random_state=seed)
elif data_name == "synthetic2":
    data_distribution = data.DataModel_2(K, num_var, signal=signal, random_state=seed)
elif data_name == "synthetic3":
    data_distribution = data.DataModel_3(K, num_var, signal=signal, random_state=seed)
else:
    # Raise instead of calling exit() so the cell stops with a traceback and the
    # notebook session is left intact.
    raise ValueError("Unknown data distribution! Got: {!r}".format(data_name))

In [99]:
# Estimate the label proportions from the population model.
# `rho` is later combined with the transition matrix T (np.dot(T, rho)) and passed to
# contamination.convert_T_to_M; presumably a length-K vector -- TODO confirm in cln.data.
rho = data_distribution.estimate_rho()

In [100]:
# Initialize the noise contamination process: build the matrix T for the chosen
# contamination model, then derive M from T and the label proportions rho.
if contamination_model == "uniform":
    T = contamination.construct_T_matrix_simple(K, epsilon)
elif contamination_model == "block":
    T = contamination.construct_T_matrix_block(K, epsilon)
elif contamination_model == "random":
    T = contamination.construct_T_matrix_random(K, epsilon, random_state=seed)
else:
    # Raise instead of calling exit() so the cell stops with a traceback and the
    # notebook session is left intact.
    raise ValueError("Unknown contamination (M) model! Got: {!r}".format(contamination_model))

# The conversion is the same for every contamination model, so it is done once here.
M = contamination.convert_T_to_M(T, rho)

In [101]:
# Compute the contaminated label proportions as the product of the matrix T
# (built by contamination.construct_T_matrix_*) and the clean proportions rho.
rho_tilde = np.dot(T, rho)

In [102]:
# Initialize the black-box classifier selected by `model_name`.
if model_name == 'RFC':
    black_box = arc.black_boxes.RFC(n_estimators=100)
elif model_name == 'SVC':
    black_box = arc.black_boxes.SVC(clip_proba_factor = 1e-5)
elif model_name == 'NN':
    black_box = arc.black_boxes.NNet(max_iter=100)
else:
    # Raise instead of calling exit() so the cell stops with a traceback and the
    # notebook session is left intact.
    raise ValueError("Unknown model! Got: {!r}".format(model_name))

In [103]:
# When nothing is estimated, the "hat" quantities are simply the population values.
# No other option is handled at this level; the names stay undefined in that case.
if estimate == "none":
    rho_hat, rho_tilde_hat, M_hat = rho, rho_tilde, M

In [104]:
# Disabled earlier version of the experiment: it compares the standard method with the
# pessimistic and optimistic adaptive methods under both the 'lab-cond' and 'marginal'
# guarantees and writes to "results/". The code is wrapped in a string literal, so none of it
# runs; being the last expression of the cell, the string itself is echoed as the cell output.
"""
for n_cal in n_cal_vals:
     # Add important parameters to table of results
     header = pd.DataFrame({'data':[data_name], 'num_var':[num_var], 'K':[K],
                             'signal':[signal], 'n_train':[n_train], 'n_cal':[n_cal],
                             'epsilon':[epsilon], 'contamination':[contamination_model],
                             'model_name':[model_name], 'estimate':[estimate], 'seed':[seed]})
     # Output file
     outfile_prefix = "exp"+str(exp_num) + "/" + data_name + "_p" + str(num_var)
     outfile_prefix += "_K" + str(K) + "_signal" + str(signal) + "_" + model_name
     outfile_prefix += "_eps" + str(epsilon) + "_" + contamination_model
     outfile_prefix += "_nt" + str(n_train) + "_nc" + str(n_cal) + "_est" + estimate + "_seed" + str(seed)
     print("Output file: {:s}.".format("results/"+outfile_prefix), end="\n")
     sys.stdout.flush()
    
     # Describe the experiment
     def run_experiment(random_state):
         print("\nRunning experiment in batch {:d}...".format(random_state))
         sys.stdout.flush()

         # Generate a large data set
         print("\nGenerating data...", end=' ')
         sys.stdout.flush()
         data_distribution.set_seed(random_state+1)
         X_all, Y_all = data_distribution.sample(n_train+n_cal+n_test)
         print("Done.")
         sys.stdout.flush()

         # Separate the test set
         X, X_test, Y, Y_test = train_test_split(X_all, Y_all, test_size=n_test, random_state=random_state+2)

         # Generate the contaminated labels
         print("Generating contaminated labels...", end=' ')
         sys.stdout.flush()
         contamination_process = contamination.LinearContaminationModel(T, random_state=random_state+3)
         Yt = contamination_process.sample_labels(Y)
         print("Done.")
         sys.stdout.flush()

         # Estimate (if applicable) the label contamination model
         if estimate=="none":
             rho_hat = rho
             rho_tilde_hat = rho_tilde
             M_hat = M
         elif estimate=="rho":
             rho_tilde_hat = estimate_rho(Yt, K)
             rho_hat = np.dot(M.T, rho_tilde_hat)
             M_hat = M        
         else:
             print("Unknown estimation option!")
             sys.stdout.flush()
             exit(-1)


         # Apply standard method to corrupted labels (for training)
         print("Training the predictive model...", end=' ')
         sys.stdout.flush()
         method_train = arc.methods.SplitConformal(X, Yt, black_box, K, 0.1, n_cal=n_cal, random_state=random_state)
         print("Done.")
         sys.stdout.flush()

         # Extract the pre-trained model
         black_box_pt = method_train.black_box

         res = pd.DataFrame({})
         for alpha in [0.1]:
             for guarantee in ['lab-cond', 'marginal']:

                 print("\nSeeking {:s} coverage at level {:.2f}.".format(guarantee, 1-alpha))

                 if guarantee=='lab-cond':
                     label_conditional = True
                 else:
                     label_conditional = False

                 # Apply standard method to corrupted labels
                 print("Applying standard method (with model training)...", end=' ')
                 sys.stdout.flush()
                 method_sc = arc.methods.SplitConformal(X, Yt, black_box_pt, K, alpha, n_cal=n_cal, label_conditional=label_conditional,
                                                        allow_empty=allow_empty, pre_trained=True, random_state=random_state)
                 S_sc = method_sc.predict(X_test)
                 print("Done.")
                 sys.stdout.flush()


                 # Apply label-noise method to corrupted labels (pessimistic)
                 print("Applying adaptive method...", end=' ')
                 sys.stdout.flush()
                 method_ln_pes = LabelNoiseConformal(X, Yt, black_box_pt, K, alpha, n_cal=n_cal, M=M_hat, rho_tilde=rho_tilde_hat, label_conditional=label_conditional,
                                                 optimistic=False, improved=False, allow_empty=allow_empty, verbose=False, pre_trained=True, random_state=random_state)
                 S_ln_pes = method_ln_pes.predict(X_test)
                 print("Done.")
                 sys.stdout.flush()

                 # Apply label-noise method to corrupted labels (optimistic)
                 print("Applying adaptive (optimistic) method...", end=' ')
                 sys.stdout.flush()
                 method_ln_opt = LabelNoiseConformal(X, Yt, black_box_pt, K, alpha, n_cal=n_cal, M=M_hat, rho_tilde=rho_tilde_hat, label_conditional=label_conditional,
                                                 optimistic=True, improved=False, allow_empty=allow_empty, verbose=False, pre_trained=True, random_state=random_state)
                 S_ln_opt = method_ln_opt.predict(X_test)
                 print("Done.")
                 sys.stdout.flush()

                 # Evaluate methods
                 res_sc = evaluate_predictions(S_sc, X_test, Y_test, K, verbose=False)
                 res_ln_pes = evaluate_predictions(S_ln_pes, X_test, Y_test, K, verbose=False)
                 res_ln_opt = evaluate_predictions(S_ln_opt, X_test, Y_test, K, verbose=False)

                 # Combine results
                 res_sc['Method'] = "Standard"
                 res_ln_pes['Method'] = "Adaptive (pessimistic)"
                 res_ln_opt['Method'] = "Adaptive (optimistic)"
                 res_new = pd.concat([res_sc, res_ln_pes, res_ln_opt])
                 res_new['Guarantee'] = guarantee
                 res_new['Alpha'] = alpha
                 res_new['random_state'] = random_state
                 res = pd.concat([res, res_new])

         print(res)

         return res
    
     # Run all experiments
     results = pd.DataFrame({})
     for batch in np.arange(1,batch_size+1):
         res = run_experiment(1000*seed+batch-1000)
         results = pd.concat([results, res])

         # Save results
         outfile = "results/" + outfile_prefix + ".txt"
         results_out = pd.concat([header,results], axis=1)
         results_out.to_csv(outfile, index=False, float_format="%.5f")

     #print("\nPreview of results:")
     #print(results)
     #sys.stdout.flush()

     #print("\nSummary of results:")
     #summary = results.groupby(['Alpha', 'Guarantee', 'Method', 'Label']).agg(['mean','std']).reset_index()
     #print(summary)
     #sys.stdout.flush()


     #print("\nFinished.\nResults written to {:s}\n".format(outfile))
     #sys.stdout.flush()
     print(n_cal)

"""

'\nfor n_cal in n_cal_vals:\n     # Add important parameters to table of results\n     header = pd.DataFrame({\'data\':[data_name], \'num_var\':[num_var], \'K\':[K],\n                             \'signal\':[signal], \'n_train\':[n_train], \'n_cal\':[n_cal],\n                             \'epsilon\':[epsilon], \'contamination\':[contamination_model],\n                             \'model_name\':[model_name], \'estimate\':[estimate], \'seed\':[seed]})\n     # Output file\n     outfile_prefix = "exp"+str(exp_num) + "/" + data_name + "_p" + str(num_var)\n     outfile_prefix += "_K" + str(K) + "_signal" + str(signal) + "_" + model_name\n     outfile_prefix += "_eps" + str(epsilon) + "_" + contamination_model\n     outfile_prefix += "_nt" + str(n_train) + "_nc" + str(n_cal) + "_est" + estimate + "_seed" + str(seed)\n     print("Output file: {:s}.".format("results/"+outfile_prefix), end="\n")\n     sys.stdout.flush()\n    \n     # Describe the experiment\n     def run_experiment(random_state

Let us repeat the experiment, this time considering only marginal coverage and adding the improved variant of the adaptive method.

In [105]:
def run_experiment(random_state):
    """Run one replication of the marginal-coverage comparison.

    Samples a data set, contaminates the non-test labels, trains the black-box model
    once on the contaminated labels, and then evaluates three conformal predictors on
    the clean test set: the standard method, the adaptive (pessimistic) method, and the
    adaptive (pessimistic) method with ``improved=True``.

    The function reads the notebook-level settings at call time (``n_cal``, ``n_train``,
    ``n_test``, ``K``, ``T``, ``M``, ``rho``, ``rho_tilde``, ``estimate``, ``black_box``,
    ``allow_empty``, ``data_distribution``). In particular ``n_cal`` must be set by the
    enclosing loop before each call.

    Parameters
    ----------
    random_state : int
        Base seed of the replication. Offsets of it seed the data sampling (+1), the
        train/test split (+2) and the label contamination (+3); the value itself is
        passed to the conformal methods.

    Returns
    -------
    pandas.DataFrame
        The rows returned by ``evaluate_predictions`` for each method, with the added
        columns ``Method``, ``Guarantee``, ``Alpha`` and ``random_state``.

    Raises
    ------
    ValueError
        If ``estimate`` is neither ``"none"`` nor ``"rho"``.
    """
    print("\nRunning experiment in batch {:d}...".format(random_state), flush=True)

    # Generate a large data set
    print("\nGenerating data...", end=' ', flush=True)
    data_distribution.set_seed(random_state+1)
    X_all, Y_all = data_distribution.sample(n_train+n_cal+n_test)
    print("Done.", flush=True)

    # Separate the test set
    X, X_test, Y, Y_test = train_test_split(X_all, Y_all, test_size=n_test, random_state=random_state+2)

    # Generate the contaminated labels (only for the non-test part; Y_test stays clean)
    print("Generating contaminated labels...", end=' ', flush=True)
    contamination_process = contamination.LinearContaminationModel(T, random_state=random_state+3)
    Yt = contamination_process.sample_labels(Y)
    print("Done.", flush=True)

    # Estimate (if applicable) the label contamination model.
    # rho_hat is kept for completeness; it is not consumed by the methods below.
    if estimate=="none":
        rho_hat = rho
        rho_tilde_hat = rho_tilde
        M_hat = M
    elif estimate=="rho":
        rho_tilde_hat = estimate_rho(Yt, K)
        rho_hat = np.dot(M.T, rho_tilde_hat)
        M_hat = M
    else:
        # Raise instead of calling exit() so the cell stops with a traceback and the
        # notebook session is left intact.
        raise ValueError("Unknown estimation option! Got: {!r}".format(estimate))

    # Apply standard method to corrupted labels (for training)
    print("Training the predictive model...", end=' ', flush=True)
    method_train = arc.methods.SplitConformal(X, Yt, black_box, K, 0.1, n_cal=n_cal, random_state=random_state)
    print("Done.", flush=True)

    # Extract the pre-trained model so the methods below can reuse it (pre_trained=True)
    black_box_pt = method_train.black_box

    # One frame per (alpha, guarantee) pair; concatenated once at the end instead of
    # growing a DataFrame that starts out empty.
    frames = []
    for alpha in [0.1]:
        for guarantee in ['marginal']:

            print("\nSeeking {:s} coverage at level {:.2f}.".format(guarantee, 1-alpha))

            label_conditional = (guarantee == 'lab-cond')

            # Apply standard method to corrupted labels
            print("Applying standard method (with model training)...", end=' ', flush=True)
            method_sc = arc.methods.SplitConformal(X, Yt, black_box_pt, K, alpha, n_cal=n_cal, label_conditional=label_conditional,
                                                   allow_empty=allow_empty, pre_trained=True, random_state=random_state)
            S_sc = method_sc.predict(X_test)
            print("Done.", flush=True)

            # Apply label-noise method to corrupted labels (pessimistic)
            print("Applying adaptive method...", end=' ', flush=True)
            method_ln_pes = LabelNoiseConformal(X, Yt, black_box_pt, K, alpha, n_cal=n_cal, M=M_hat, rho_tilde=rho_tilde_hat, label_conditional=label_conditional,
                                                optimistic=False, improved=False, allow_empty=allow_empty, verbose=False, pre_trained=True, random_state=random_state)
            S_ln_pes = method_ln_pes.predict(X_test)
            print("Done.", flush=True)

            # Apply label-noise method to corrupted labels (pessimistic), improved for marginal coverage
            print("Applying adaptive (pessimistic) method improved...", end=' ', flush=True)
            method_ln_imp = LabelNoiseConformal(X, Yt, black_box_pt, K, alpha, n_cal=n_cal, M=M_hat, rho_tilde=rho_tilde_hat, label_conditional=label_conditional,
                                                optimistic=False, improved=True, allow_empty=allow_empty, verbose=False, pre_trained=True, random_state=random_state)
            S_ln_imp = method_ln_imp.predict(X_test)
            print("Done.", flush=True)

            # Evaluate methods against the clean test labels
            res_sc = evaluate_predictions(S_sc, X_test, Y_test, K, verbose=False)
            res_ln_pes = evaluate_predictions(S_ln_pes, X_test, Y_test, K, verbose=False)
            res_ln_imp = evaluate_predictions(S_ln_imp, X_test, Y_test, K, verbose=False)

            # Combine results
            res_sc['Method'] = "Standard"
            res_ln_pes['Method'] = "Adaptive (pessimistic)"
            res_ln_imp['Method'] = "Adaptive (pessimistic) improved"
            res_new = pd.concat([res_sc, res_ln_pes, res_ln_imp])
            res_new['Guarantee'] = guarantee
            res_new['Alpha'] = alpha
            res_new['random_state'] = random_state
            frames.append(res_new)

    res = pd.concat(frames) if frames else pd.DataFrame({})

    print(res)

    return res


# Sweep over the calibration-set sizes; for each size run `batch_size` replications and
# rewrite the output file after every replication, so partial results survive an interruption.
for n_cal in n_cal_vals:
    # Add important parameters to table of results
    header = pd.DataFrame({'data':[data_name], 'num_var':[num_var], 'K':[K],
                           'signal':[signal], 'n_train':[n_train], 'n_cal':[n_cal],
                           'epsilon':[epsilon], 'contamination':[contamination_model],
                           'model_name':[model_name], 'estimate':[estimate], 'seed':[seed]})
    # Output file
    outfile_prefix = "exp"+str(exp_num) + "/" + data_name + "_p" + str(num_var)
    outfile_prefix += "_K" + str(K) + "_signal" + str(signal) + "_" + model_name
    outfile_prefix += "_eps" + str(epsilon) + "_" + contamination_model
    outfile_prefix += "_nt" + str(n_train) + "_nc" + str(n_cal) + "_est" + estimate + "_seed" + str(seed)
    print("Output file: {:s}.".format("results_improved/"+outfile_prefix), end="\n", flush=True)

    # Make sure the target directory exists before the first write.
    outfile = "results_improved/" + outfile_prefix + ".txt"
    os.makedirs(os.path.dirname(outfile), exist_ok=True)

    # Run all experiments
    results = pd.DataFrame({})
    batch_results = []
    for batch in np.arange(1,batch_size+1):
        res = run_experiment(1000*seed+batch-1000)
        batch_results.append(res)
        results = pd.concat(batch_results)

        # Save results. `header` has a single row labelled 0 and is aligned on the index
        # with `results`; this relies on the rows of `results` carrying index label 0.
        results_out = pd.concat([header,results], axis=1)
        results_out.to_csv(outfile, index=False, float_format="%.5f")

    #print("\nPreview of results:")
    #print(results)
    #sys.stdout.flush()

    #print("\nSummary of results:")
    #summary = results.groupby(['Alpha', 'Guarantee', 'Method', 'Label']).agg(['mean','std']).reset_index()
    #print(summary)
    #sys.stdout.flush()


    #print("\nFinished.\nResults written to {:s}\n".format(outfile))
    #sys.stdout.flush()
    print(n_cal)

Output file: results_improved/exp1/synthetic1_p20_K10_signal1_RFC_eps0.2_random_nt1000_nc500_estnone_seed1.

Running experiment in batch 1...

Generating data... Done.
Generating contaminated labels... Done.
Training the predictive model... Done.

Seeking marginal coverage at level 0.90.
Applying standard method (with model training)... Done.
Applying adaptive method... Done.
Applying adaptive (pessimistic) method improved... Done.
   Coverage       Size     Label                           Method Guarantee   
0  0.956000   7.031500  marginal                         Standard  marginal  \
0  0.974227   6.979381         0                         Standard  marginal   
0  0.961353   7.241546         1                         Standard  marginal   
0  0.958115   6.801047         2                         Standard  marginal   
0  0.960784   6.970588         3                         Standard  marginal   
0  0.970000   6.980000         4                         Standard  marginal   
0  0.949749