In [None]:
import glob
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import pickle
from tqdm import tqdm, tnrange, tqdm_notebook
import os

# Lift pandas' row-display limit so DataFrames are shown in full instead of being truncated.
pd.set_option('display.max_rows', None)
# PARETO FRONT TOOLS
def check_dominance(p1,p2):
    """Compare two objective vectors, assuming every objective is minimized.

    Objectives are paired positionally with ``zip``, so trailing entries of
    the longer vector (if any) are ignored.

    Returns:
        1  if p1 dominates p2 (smaller in at least one pair, larger in none),
        -1 if p2 dominates p1,
        0  otherwise (identical vectors or a trade-off between them).
    """
    paired = list(zip(p1, p2))

    p1_wins_somewhere = any(a < b for a, b in paired)
    p2_wins_somewhere = any(a > b for a, b in paired)

    # Exactly one side winning yields +1 / -1; both or neither cancel out to 0.
    return int(p1_wins_somewhere) - int(p2_wins_somewhere)


def front(obj1,obj2):
    """Return the indices of the points lying on the two-objective Pareto front.

    Both objectives are assumed to be minimized. Point ``i`` is
    ``(obj1[i], obj2[i])``; it is kept when no other point dominates it, using
    the same rule as ``check_dominance``: a point is dominated when another
    point is never larger than it in either objective and strictly smaller in
    at least one. Exact duplicates do not dominate each other, so all copies
    of a non-dominated point are returned.

    Args:
        obj1: sequence or 1-D array with the first objective of every point.
        obj2: sequence or 1-D array with the second objective, same length.

    Returns:
        list of integer positions into ``obj1``/``obj2``, ordered by
        ascending ``obj2``. Empty input gives an empty list.

    Raises:
        AssertionError: if the two inputs differ in length.
    """
    assert len(obj1) == len(obj2), "obj1 and obj2 must have the same length"

    xs = np.asarray(obj1)
    ys = np.asarray(obj2)

    non_dominated = []
    for i in range(len(xs)):
        xi, yi = xs[i], ys[i]
        # Compare point i against all points at once. The conditions are
        # written as negated "<" (rather than ">=") so NaN values are treated
        # exactly as in the pairwise check_dominance comparison.
        others_never_worse = ~(xi < xs) & ~(yi < ys)
        others_better_somewhere = (xi > xs) | (yi > ys)
        if not np.any(others_never_worse & others_better_somewhere):
            non_dominated.append(i)

    # Order the surviving indices along the front by the second objective.
    members = np.asarray(non_dominated, dtype=int)
    order = np.argsort(ys[members])

    return list(members[order])

# Sanity check: only point 0 -> (1, 0) and point 3 -> (0, 1) are non-dominated,
# so the printed front should contain indices 0 and 3 (in that order, sorted by obj2).
print(front([1, 2, 3, 0, 4], [0, 1, 2, 1, 4]))

In [None]:
# Hypervolume of pareto front for different datasets
import numpy as np
import pandas as pd
from deap.tools._hypervolume import pyhv

# Need to optimize this code
def _pareto_front_summary(x_vals, y_vals):
    """Summarise the Pareto front of one set of minimised (x, y) objective pairs.

    ``x_vals`` is expected to hold ``1 - predictive score`` (so it can be
    minimised) and ``y_vals`` the fairness loss. Returns a dict with the
    front's hypervolume w.r.t. the reference point (1, 1), the number of
    points on the front, and the mean predictive score / fairness loss of
    those points.
    """
    pf_idx = front(x_vals, y_vals)
    pf_x = [x_vals[i] for i in pf_idx]
    pf_y = [y_vals[i] for i in pf_idx]
    # NOTE(review): the (1, 1) reference point presumably relies on both
    # objectives lying in [0, 1] -- confirm for every objective in use.
    hv = pyhv.hypervolume(list(zip(pf_x, pf_y)), ref=np.array([1,1]))
    return {
        'hv': hv,
        'num_pts': len(pf_x),
        # Undo the "1 - score" transform to report the score itself.
        'avg_pred_perf': np.mean([1-x for x in pf_x]),
        'avg_fair_perf': np.mean(pf_y),
    }


def save_hv_results(base_result_folder, hv_folder, taskid, experiments, objectives, runs):
    """Collect per-run Pareto-front hypervolumes for one dataset and save them as CSV.

    For every experiment in ``experiments`` and every run index in
    ``range(runs)`` the folder ``{base_result_folder}/{taskid}_{rep}_{exp}``
    is inspected:

    * if it contains ``hv_values.pkl``, the stored ``train_hv`` / ``test_hv``
      entries are reused as-is;
    * otherwise ``scores.pkl`` is loaded and the values are recomputed from
      the Pareto front of its individuals. The columns
      ``'train_' + objectives[i]`` feed the train table and ``objectives[i]``
      the test table.

    Two files are written: ``{hv_folder}/hv_train_{taskid}.csv`` and
    ``{hv_folder}/hv_test_{taskid}.csv``, one row per (experiment, run) with
    the columns hv, dataset, exp, num_pts, avg_pred_perf, avg_fair_perf.

    Args:
        base_result_folder: directory holding the per-run result folders
            (a leading ``~`` is expanded).
        hv_folder: existing directory that receives the CSV files
            (a leading ``~`` is expanded).
        taskid: dataset identifier used in folder names, the ``dataset``
            column and the output file names.
        experiments: iterable of experiment names.
        objectives: two column names; ``objectives[0]`` is a score that is
            converted to ``1 - score`` for minimisation, ``objectives[1]`` is
            used unchanged.
        runs: number of repetitions per experiment.

    Raises:
        AssertionError: if ``hv_folder`` is not an existing directory.
        FileNotFoundError: if a run folder has neither pickle file.
    """
    # os.path.isdir / open do not understand "~", so resolve it up front.
    base_result_folder = os.path.expanduser(base_result_folder)
    hv_folder = os.path.expanduser(hv_folder)
    assert os.path.isdir(hv_folder), f"Folder to save HV values does not exist: {hv_folder}"

    columns = ['hv', 'dataset', 'exp', 'num_pts', 'avg_pred_perf', 'avg_fair_perf']
    metric_keys = ['hv', 'num_pts', 'avg_pred_perf', 'avg_fair_perf']
    train_rows = []  # one record per (experiment, run), from the 'train_'-prefixed score columns
    test_rows = []   # same, from the un-prefixed (test) score columns

    for exp in experiments:
        print("Processing experiment:", exp)
        for rep in range(runs):
            print("Processing run:", rep)

            save_folder = f"{base_result_folder}/{taskid}_{rep}_{exp}"
            cached_file = f"{save_folder}/hv_values.pkl"

            # SECURITY NOTE: unpickling executes arbitrary code; only point this
            # function at result folders produced by trusted runs.
            if os.path.exists(cached_file):
                # Previously computed values exist for this run: reuse them.
                with open(cached_file,'rb') as file:
                    hv_file = pd.read_pickle(file)
                train_summary = {key: hv_file['train_hv'][key] for key in metric_keys}
                test_summary = {key: hv_file['test_hv'][key] for key in metric_keys}
            else:
                with open(f"{save_folder}/scores.pkl",'rb') as file:
                    this_df = pd.read_pickle(file)

                # Each run yields one hypervolume per split, computed from the
                # Pareto front over all individuals stored for that run.
                train_summary = _pareto_front_summary(
                    1 - this_df['train_'+objectives[0]].to_numpy(),
                    this_df['train_'+objectives[1]].to_numpy(),
                )
                test_summary = _pareto_front_summary(
                    1 - this_df[objectives[0]].to_numpy(),
                    this_df[objectives[1]].to_numpy(),
                )

            # Use the `taskid` argument (not a notebook-level global) so the
            # function works regardless of what the calling cell names its loop variable.
            train_rows.append({**train_summary, 'dataset': taskid, 'exp': exp})
            test_rows.append({**test_summary, 'dataset': taskid, 'exp': exp})

    hv_df = pd.DataFrame(train_rows, columns=columns)
    hv_test_df = pd.DataFrame(test_rows, columns=columns)

    hv_train_file = f"{hv_folder}/hv_train_{taskid}.csv"
    hv_test_file = f"{hv_folder}/hv_test_{taskid}.csv"
    hv_df.to_csv(hv_train_file)
    hv_test_df.to_csv(hv_test_file)

# Datasets, experiment variants and objective columns to summarise.
task_ids = ['heart_disease', 'student_math', 'us_crime', 'nlsy', 'compas', 'law_school','pmad_phq', 'pmad_epds']
experiments1 = ['Equal Weights',
               'Deterministic Weights',
              'Evolved Weights']
objective_functions=['accuracy', 'subgroup_FNR_loss']
#objective_functions=['accuracy', 'demographic_parity_difference']
n_runs = 20  # repetitions available per (dataset, experiment)

# os.listdir / os.path.isdir do not expand "~"; resolve the home directory
# explicitly, otherwise the literal path "~/..." is looked up and not found.
results_folder = os.path.expanduser('~/Documents/Results/results10')
hv_folder = os.path.expanduser('~/Documents/Results/hv_values10')

files_dir = [
    f for f in os.listdir(results_folder) if os.path.isdir(os.path.join(results_folder, f))
]
print(files_dir)
if not files_dir:
    raise FileNotFoundError(f"No result sub-folders found in {results_folder}")

# NOTE(review): only the first sub-folder is processed, and os.listdir returns
# entries in arbitrary order -- confirm that results_folder holds exactly one
# sub-folder, or select the intended one by name.
for task_id in task_ids:
    print('Processing task_id:', task_id)
    save_hv_results(f'{results_folder}/{files_dir[0]}', hv_folder, task_id, experiments1,objective_functions, n_runs)

In [25]:
# Home-relative location of the per-dataset hypervolume CSVs (avoids a
# machine-specific absolute path; "~" must be expanded before file access).
hv_folder = os.path.expanduser('~/Documents/Results/hv_values10')

# Per-dataset test-split CSVs to merge, one per entry of `task_ids`
# (`task_ids` is defined in an earlier cell, which must have been run).
csv_files = [f"{hv_folder}/hv_test_{task_id}.csv" for task_id in task_ids]

# Read every file and stack the rows into a single frame with a fresh index.
# NOTE(review): if the per-dataset CSVs were written with the default
# DataFrame.to_csv index, each frame read here carries an extra "Unnamed: 0"
# column that is kept in the merged file -- confirm whether that is wanted.
df_list = [pd.read_csv(file) for file in csv_files]
merged_df = pd.concat(df_list, ignore_index=True)

# Save the merged dataframe into a new CSV file
output_file = f"{hv_folder}/hv_test.csv"
merged_df.to_csv(output_file, index=False)

print(f"Merged CSV file saved as {output_file}")