# In [None]:

import os

from cllm.utils import *
from cllm.curation import *

import pickle
import numpy as np
from copy import deepcopy
from sklearn import preprocessing
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression

# ---------------------------------------------------------------------------
# Experiment configuration
# ---------------------------------------------------------------------------
n_synthetic = 1000        # forwarded to process_gpt(n_synthetic=...)
retrain = False           # forwarded to data_centric_curation(retrain=...)
nest = 100                # n_estimators for XGBoost / random forest, also passed to curation
downstream = "xgb"
curation_ythresh = 0.2    # forwarded to data_centric_curation(curation_ythresh=...)
curation_xthresh = 0.15   # forwarded to data_centric_curation(curation_xthresh=...)
temp = 0.9                # forwarded to process_gpt(temp=...)

# Factors to evaluate
seeds = list(range(10))
n_samples = [10, 20, 50, 100]
datasets = ["covid", "adult", "seer", "cutract", "maggic"]

# Dataset name -> name of its label column.
ylabel_map = dict(
    covid="is_dead",
    adult="salary",
    compas="y",
    seer="mortCancer",
    cutract="mortCancer",
    maggic="death_all",
    support="death",
    bio="y",
    higgs="y",
    drug="target",
)

def _pack_dataset(frame):
    """Build the ``results`` entry for a frame that carries a "target" column."""
    return {"X": frame.drop(columns=["target"]), "y": frame["target"], "df": frame}


# Main sweep. For every dataset and training-set size `ns`, each seed:
#   1. loads the GPT-generated, GReaT and baseline datasets,
#   2. curates every non-Original/Oracle dataset and stores the subsets,
#   3. fits four downstream classifiers on every dataset and scores them on
#      the held-out test split,
#   4. rewrites the summary pickle for this (dataset, ns) pair.
# Any failure inside a seed is logged and that seed is skipped.
for dataset in datasets:

    for ns in n_samples:
        # Accumulators over all successfully processed seeds.
        performance_all = []
        acc_list_all = []
        auc_list_all = []
        f1_list_all = []

        acc_clf_all = []
        auc_clf_all = []
        f1_clf_all = []

        results_plot_all = []
        results_data_all = []
        subset_dict_all = []

        for seed in seeds:
            try:
                print(f"Computing for {dataset} with seed {seed} and ns {ns}")

                dfs_dicts = {}
                results = {}
                subset_dict = {}

                ylabel = ylabel_map[dataset]

                # Get the GPT-4 and GPT-3.5 generated data
                for gpt_model in ("gpt-4", "gpt3"):
                    tmp_df = process_gpt(dataset=dataset, n_synthetic=n_synthetic, temp=temp, gpt_model=gpt_model, ns=ns, seed=seed)
                    tmp_df = tmp_df.reset_index(drop=True)
                    results[gpt_model] = _pack_dataset(tmp_df)

                # Extract the Great datasets.
                # The file is opened unconditionally: a missing pickle must
                # abort this seed (handled below) instead of silently reusing
                # `great_df` left over from a previous iteration.
                # NOTE: pickle.load can execute arbitrary code; only load
                # files produced by this project's own pipeline.
                filename = f"../save_dfs/great_pipeline_{dataset}_{seed}_{ns}.pickle"
                with open(filename, "rb") as f:
                    great_df = pickle.load(f)

                tmp_df = great_df["great"]["X"]
                tmp_df["target"] = great_df["great"]["y"]
                tmp_df = tmp_df.reset_index(drop=True)
                results["great"] = _pack_dataset(tmp_df)

                # Get all the baseline results
                filename = f"../save_dfs/pipeline_{dataset}_{seed}_{ns}.pickle"
                with open(filename, "rb") as f:
                    df = pickle.load(f)

                # "Original" feeds the curation step and "Test" the evaluation;
                # fail fast rather than fall back on values from an earlier seed.
                missing = {"Original", "Test"} - set(df)
                if missing:
                    raise KeyError(f"{filename} is missing required entries: {sorted(missing)}")

                # Extract each baseline method, as well as Dorig, Doracle, Dtest
                for model in df:

                    if "great" in model:
                        continue

                    tmp_df = df[model]["X"]
                    tmp_df["target"] = df[model]["y"]

                    # reset index
                    tmp_df = tmp_df.reset_index(drop=True)

                    if model == "Original":
                        X_train_orig, y_train_orig = tmp_df.drop(columns=["target"]), tmp_df["target"]
                    elif model == "Oracle":
                        X_oracle, y_oracle = tmp_df.drop(columns=["target"]), tmp_df["target"]
                    elif model == "Test":
                        X_test, y_test = tmp_df.drop(columns=["target"]), tmp_df["target"]

                    # The test split is only used for scoring, never for training.
                    if model != "Test":
                        results[model] = _pack_dataset(tmp_df)

                acc_list = []
                auc_list = []
                recall_list = []
                precision_list = []
                f1_list = []
                model_names = []
                acc_per_clf = {}
                auc_per_clf = {}
                f1_per_clf = {}

                print("Applying curation and store subsets...")
                # Snapshot the keys: "<model>_curated" entries are added to
                # `results` while iterating.
                results_models = list(results.keys())

                # enrich results dict
                for model in results_models:
                    if model in ("Original", "Oracle"):
                        continue

                    X_eval, y_eval = results[model]["X"], results[model]["y"]

                    # Work on a copy and cap the dataset at 1000 rows.
                    df_sample = deepcopy(X_eval)
                    df_sample["target"] = y_eval
                    if len(df_sample) > 1000:
                        df_sample = df_sample.sample(n=1000, random_state=seed).reset_index(drop=True)
                    X_eval, y_eval = df_sample.drop(columns=["target"]), df_sample["target"]

                    # CLLM Curation Mechanism; the returned id arrays are used
                    # as positional row indices below.
                    easy_train, ambig_train, hard_train, dataiq_model = data_centric_curation(
                        X_train_orig=X_train_orig,
                        y_train_orig=y_train_orig,
                        X_check=X_eval,
                        y_check=y_eval,
                        retrain=retrain,
                        nest=nest,
                        curation_ythresh=curation_ythresh,
                        curation_xthresh=curation_xthresh,
                    )

                    df_save = deepcopy(X_eval)
                    df_save["target"] = y_eval

                    easy_ambig_ids = np.concatenate((easy_train, ambig_train))

                    # Store data subsets
                    subset_dict[model] = {
                        "easy": df_save.iloc[easy_train, :],
                        "ambig": df_save.iloc[ambig_train, :],
                        "hard": df_save.iloc[hard_train, :],
                        "easy_ambig": df_save.iloc[easy_ambig_ids, :],
                        "all": df_save,
                        "easy_ids": easy_train,
                        "ambig_ids": ambig_train,
                        "hard_ids": hard_train,
                        "easy_ambig_ids": easy_ambig_ids,
                    }

                    # The curated is after we remove "Hard" synthetic samples
                    results[f"{model}_curated"] = {
                        "X": X_eval.iloc[easy_ambig_ids, :],
                        "y": y_eval[easy_ambig_ids],
                        "df": df_save.iloc[easy_ambig_ids, :],
                    }

                print("Fitting downstream models on the different stored datasets...")
                for idx, model in enumerate(results):

                    X_eval, y_eval = results[model]["X"], results[model]["y"]
                    clf1 = XGBClassifier(n_estimators=nest, random_state=seed)
                    clf2 = RandomForestClassifier(n_estimators=nest, random_state=seed)
                    clf3 = LogisticRegression(random_state=seed)
                    clf4 = DecisionTreeClassifier(random_state=seed)

                    try:
                        scaler = preprocessing.StandardScaler().fit(X_eval)
                    except Exception:
                        # Report which dataset failed, then abort the seed:
                        # continuing would score with a scaler fitted on a
                        # different dataset (or none at all).
                        print(model)
                        raise

                    # Scale once and reuse for all four classifiers.
                    X_eval_scaled = scaler.transform(X_eval)
                    X_test_scaled = scaler.transform(X_test)

                    # Perform model evaluation
                    acc1, rec1, prec1, f11, auc1, clf1 = evaluate_model(X_eval_scaled, y_eval, X_test_scaled, y_test, clf1)
                    acc2, rec2, prec2, f12, auc2, clf2 = evaluate_model(X_eval_scaled, y_eval, X_test_scaled, y_test, clf2)
                    acc3, rec3, prec3, f13, auc3, clf3 = evaluate_model(X_eval_scaled, y_eval, X_test_scaled, y_test, clf3)
                    acc4, rec4, prec4, f14, auc4, clf4 = evaluate_model(X_eval_scaled, y_eval, X_test_scaled, y_test, clf4)

                    acc_per_clf[model] = {"xgb": acc1, "rf": acc2, "lr": acc3, "dt": acc4}
                    auc_per_clf[model] = {"xgb": auc1, "rf": auc2, "lr": auc3, "dt": auc4}
                    f1_per_clf[model] = {"xgb": f11, "rf": f12, "lr": f13, "dt": f14}

                    # Averages cover xgb, rf and lr only; the decision tree is
                    # kept in the per-classifier dicts but left out of the mean.
                    # NOTE(review): confirm this exclusion is intended.
                    acc = np.mean([acc1, acc2, acc3])
                    rec = np.mean([rec1, rec2, rec3])
                    prec = np.mean([prec1, prec2, prec3])
                    f1 = np.mean([f11, f12, f13])
                    auc = np.mean([auc1, auc2, auc3])

                    acc_list.append(acc)
                    recall_list.append(rec)
                    precision_list.append(prec)
                    f1_list.append(f1)
                    auc_list.append(auc)
                    model_names.append(model)

                    # Remember where the Oracle entry sits in the metric lists.
                    if model == "Oracle":
                        oracle_idx = idx

                # Store results
                performance_dict = {
                    "Accuracy": acc_list,
                    "AUC": auc_list,
                    "Group": model_names,
                    "Recall": recall_list,
                    "Precision": precision_list,
                    "F1": f1_list,
                    "Dataset": dataset,
                    "Seed": seed,
                    "acc_clf": acc_per_clf,
                    "auc_clf": auc_per_clf,
                    "f1_clf": f1_per_clf,
                }
                performance_all.append(performance_dict)

                acc_list_all.append(acc_list)
                auc_list_all.append(auc_list)
                f1_list_all.append(f1_list)
                acc_clf_all.append(acc_per_clf)
                auc_clf_all.append(auc_per_clf)
                f1_clf_all.append(f1_per_clf)

                results_data_all.append(results)
                subset_dict_all.append(subset_dict)

            except Exception as e:
                import traceback
                print(traceback.format_exc())
                print("Error in fitting models: ", e)
                # Skip the summary update for a failed seed.
                continue

            # Runs after every successful seed: the means are taken over all
            # seeds completed so far and the same pickle file is overwritten.
            # NOTE(review): because this sits inside the seed loop,
            # results_plot_all receives one cumulative frame per seed —
            # confirm this checkpointing is intended.
            try:

                acc_list = np.mean(np.array(acc_list_all), axis=0)
                auc_list = np.mean(np.array(auc_list_all), axis=0)
                f1_list = np.mean(np.array(f1_list_all), axis=0)

                df = pd.DataFrame({"Accuracy": acc_list, "AUC": auc_list, "Group": model_names})
                df = pd.melt(df, id_vars=["Group"], value_vars=["Accuracy", "AUC"])
                df.columns = ["Group", "Metric", "Score"]

                results_plot_all.append(df)

                # create a dict to store all the results
                all_results = {
                    "model_order": list(results.keys()),
                    "performance_all": performance_all,
                    "acc_list_all": acc_list_all,
                    "auc_list_all": auc_list_all,
                    "f1_list_all": f1_list_all,
                    "results_plot_all": results_plot_all,
                    "results_data_all": results_data_all,
                    "subset_dict_all": subset_dict_all,
                }

                # pickle all the results to results summary folder
                filename = f"../results_summary/results_summary_{dataset}_{ns}.pickle"

                print("Saving file as pickle... ")
                with open(filename, "wb") as f:
                    pickle.dump(all_results, f)

            except Exception as e:
                import traceback
                print(traceback.format_exc())
                print("Error in computing performance metrics: ", e)
 