In [3]:
import numpy as np
import pandas as pd
import xgboost as xgb
from sdv.tabular import CTGAN, TVAE
from sklearn.datasets import fetch_openml
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from tqdm import tqdm

import sys
sys.path.append('../')
from src.dataloader import *
from src.models import *
from triage.triage import Triage
from src.utils import *

# Module-level alias that get_uci reads to list the downloadable regression
# datasets.
# NOTE(review): ALL_REGRESSION_DATASETS is not imported by name in this cell;
# presumably it is provided by one of the star imports above -- confirm.
_ALL_REGRESSION_DATASETS = ALL_REGRESSION_DATASETS


# NOTE(review): the experiment cell further down assigns its own value to
# `nest` before building the learner, so the value set here is not the one
# used there.
nest = 20


In [4]:
# Dataset names that the experiment loop loads through get_uci; every other
# dataset name in that loop is loaded through get_medical.
uci_datasets = [
    "boston",
    "star",
    "bio",
    "concrete",
    "protein",
    "bike",
]


In [5]:
def get_uci(dataset):
    """Load one of the UCI regression datasets as features and targets.

    Names that appear among the keys of ``_ALL_REGRESSION_DATASETS`` are
    downloaded into a temporary directory and read with
    ``load_regression_dataset``. Every other name in ``uci_datasets`` is read
    with ``GetDataset`` from ``./data/``.

    Args:
        dataset: Name of the dataset; must be an entry of ``uci_datasets``.

    Returns:
        The ``(X, y)`` pair produced by the underlying loader.

    Raises:
        ValueError: If ``dataset`` is not listed in ``uci_datasets``. The
            previous implementation fell through to ``return X, y`` with both
            names unbound and failed with ``UnboundLocalError``.
    """
    if dataset not in uci_datasets:
        raise ValueError(
            f"Unknown UCI dataset {dataset!r}; expected one of {uci_datasets}"
        )

    regression_datasets = sorted(_ALL_REGRESSION_DATASETS.keys())

    if dataset in regression_datasets:
        # NOTE(review): the original carried the remark "xlrd, openpyxl" here;
        # presumably the loaders need those packages for spreadsheet files.
        import tempfile

        # `mapper` is a notebook-level dict built in a later cell, so that cell
        # must have been executed before this function is called.
        dataset_name = regression_datasets[mapper[dataset]]
        with tempfile.TemporaryDirectory() as data_dir:
            # Download into a throw-away directory, then load from it.
            download_regression_dataset(dataset_name, data_dir)
            X, y = load_regression_dataset(
                dataset_name, data_dir, shuffle_train=True, batch_size=512
            )
    else:
        X, y = GetDataset(dataset, "./data/")

    return X, y


In [6]:
import pandas as pd
from hyperimpute.plugins.imputers import Imputers


def _merge_static_and_temporal(df_static, df_temporal, outcome_variable):
    """Pivot the time-0 temporal rows into columns and merge them onto the static table."""
    ids_static = df_static.id.unique()
    ids_temporal = df_temporal[df_temporal["time"] == 0].id.unique()

    # Keep whichever id set is smaller (static ids on a tie).
    ids = ids_temporal if len(ids_temporal) < len(ids_static) else ids_static

    df_static = df_static[df_static.id.isin(ids)]
    df_temporal = df_temporal[df_temporal.id.isin(ids)]
    df_temporal = df_temporal[df_temporal["time"] == 0]

    # Rows carrying the outcome; fall back to the static table when the
    # temporal table has none.
    df_temporal_label = df_temporal[df_temporal["variable"] == outcome_variable]
    if len(df_temporal_label) == 0:
        df_temporal_label = df_static[[outcome_variable, "id"]]

    if len(df_temporal_label) != len(df_temporal):
        ids = df_temporal_label.id
        df_static = df_static[df_static.id.isin(ids)]
        df_temporal = df_temporal[df_temporal.id.isin(ids)]

    df_temporal = df_temporal.sort_values(by=["id"])
    df_static = df_static.sort_values(by=["id"])

    n_static = df_static.shape[0]
    assert df_temporal.id.unique().shape[0] == n_static

    # One array per temporal variable, first row per id.
    # NOTE(review): the positional alignment with df_static.id below assumes
    # every variable has a row for every id -- confirm against the data.
    feature_dict = {}
    for variable_name in list(df_temporal.variable.unique()):
        tdf = df_temporal[df_temporal["variable"] == variable_name]
        feature_dict[variable_name] = tdf.drop_duplicates(subset=["id"]).value.to_numpy()

    tmp_df = pd.DataFrame.from_dict(
        feature_dict, columns=df_static.id, orient="index"
    ).T.reset_index(level=0)

    df_overall = df_static.merge(tmp_df, on="id", how="left")
    assert df_overall[outcome_variable].shape[0] == n_static
    return df_overall


def get_medical(dataset="mimic_antibiotics", processing_needed=False, seed=42):
    """Load a medical regression dataset as features and a target array.

    Supported values of ``dataset``:

    * ``"mimic_antibiotics"``: static and temporal CSV files are read from
      ``data/``; temporal rows at ``time == 0`` are pivoted into one column per
      variable and merged onto the static table. Target column: ``wbc``.
    * ``"los"``: ``data/LengthOfStay.csv`` with ``eid``, ``vdate``,
      ``discharged`` and ``facid`` dropped, sampled down to at most 10000 rows,
      and ``rcount`` / ``gender`` label-encoded. Target column:
      ``lengthofstay``.
    * ``"cancer"``: delegated to ``load_seer_cutract_dataset(name="seer")``.

    For the two CSV-backed datasets the table is capped at 10000 rows and
    missing values are replaced by the column mean before the target column is
    split off.

    Args:
        dataset: One of the names listed above.
        processing_needed: Kept for backward compatibility. It is overwritten
            for both CSV-backed datasets (True for ``"mimic_antibiotics"``,
            False for ``"los"``) and not read for ``"cancer"``.
        seed: Forwarded to ``load_seer_cutract_dataset`` for ``"cancer"``. The
            row sampling of the other datasets does not receive it and uses
            pandas' default random state.

    Returns:
        ``(X, y)``. For the CSV-backed datasets ``X`` is a DataFrame without
        the target column and ``y`` a NumPy array; for ``"cancer"`` ``X`` is
        whatever the loader returns and ``y`` is the ``.values`` of its target.

    Raises:
        ValueError: If ``dataset`` is not one of the supported names. The
            previous implementation failed later with ``UnboundLocalError``.
    """
    supported = ("mimic_antibiotics", "los", "cancer")
    if dataset not in supported:
        raise ValueError(
            f"Unknown medical dataset {dataset!r}; expected one of {supported}"
        )

    if dataset == "cancer":
        X, y, _ = load_seer_cutract_dataset(name="seer", seed=seed)
        y = y.values
        return X, y

    if dataset == "mimic_antibiotics":
        outcome_variable = "wbc"
        df_static = pd.read_csv("data/mimic_antibiotics_static_train_data.csv")
        df_temporal = pd.read_csv("data/mimic_antibiotics_temporal_train_data_eav.csv")
        processing_needed = True
    else:  # dataset == "los"
        outcome_variable = "lengthofstay"
        df = pd.read_csv("data/LengthOfStay.csv")
        df = df.drop(columns=["eid", "vdate", "discharged", "facid"])
        # Cap the sample size at the table length so that a file with fewer
        # than 10000 rows no longer makes DataFrame.sample raise.
        df = df.sample(n=min(10000, len(df)))

        from sklearn import preprocessing

        df["rcount"] = preprocessing.LabelEncoder().fit_transform(df["rcount"])
        df["gender"] = preprocessing.LabelEncoder().fit_transform(df["gender"])
        processing_needed = False

    if processing_needed:
        df_overall = _merge_static_and_temporal(df_static, df_temporal, outcome_variable)
    else:
        df_overall = df

    ###########################################
    # DATA SETUP
    ###########################################
    print(len(df_overall), dataset)
    cols = df_overall.columns
    if len(df_overall) > 10000:
        df_overall = df_overall.sample(n=10000)

    from sklearn.impute import SimpleImputer

    # The imputer returns a bare array, so the column labels are restored
    # afterwards.
    imp_mean = SimpleImputer(strategy="mean")
    df_overall = pd.DataFrame(imp_mean.fit_transform(df_overall))
    df_overall.columns = cols

    X = df_overall.drop(columns=[outcome_variable])
    y = df_overall[outcome_variable].values

    return X, y


In [7]:
# Lookup table from dataset name to its position in `regression_datasets`;
# get_uci reads it to pick the dataset name to download.
# NOTE(review): `regression_datasets` is not assigned at notebook scope in the
# visible cells; presumably it comes from one of the star imports -- confirm.
mapper = {name: position for position, name in enumerate(regression_datasets)}


In [8]:
import random

import matplotlib.pyplot as plt  # for plotting
import numpy as np
import xgboost as xgb
from sklearn import datasets
from tqdm import tqdm

# Imported once here instead of on every pass through the innermost loop.
from sklearn.metrics import mean_absolute_error

dataset_lists = [
    "bike",
    "boston",
    "concrete",
    "star",
    "mimic_antibiotics",
    "los",
    "cancer",
    "protein",
    "bio",
]
# Number of seeded repetitions per (dataset, generator) pair.
n_runs = 5
# Fraction of the non-test real rows kept as the calibration split.
cal_size = 0.2
# Number of synthetic rows sampled from each fitted generator.
nrows = 10000
# Number of boosting rounds for the XGBoost learner; also handed to Triage.
# This reassigns the notebook-level `nest` set in the configuration cell.
nest = 10

# Thresholds used to split samples into groups from the Triage scores.
percentile_thresh = 75
thresh = 0.33
conf_thresh_low = thresh
conf_thresh_high = 1 - thresh


def _load_dataset(dataset, seed):
    """Return ``(X, y)`` for ``dataset`` from the UCI or the medical loader."""
    if dataset in uci_datasets:
        return get_uci(dataset=dataset)
    return get_medical(dataset=dataset, seed=seed)


def _take_rows(data, ids):
    """Select rows by position from either a pandas object or a NumPy array."""
    if hasattr(data, "iloc"):
        return data.iloc[ids]
    return data[ids]


final_results = {}

for dataset in tqdm(dataset_lists):
    print(dataset)
    try:
        results = {}

        models = [CTGAN(), TVAE()]
        modelnames = ["CTGAN", "TVAE"]

        for modelname, model in zip(modelnames, models):
            prop_discarded = []
            mae = []

            for j in range(n_runs):

                print(modelname, j)

                seed = j * 10
                random.seed(seed)
                seed_everything(seed)

                # Real data used to fit the synthetic-data generator.
                X_prop_train, y_prop_train = _load_dataset(dataset, seed)

                df = pd.DataFrame.from_records(X_prop_train)
                df.columns = [str(col) for col in df.columns]
                df["y"] = y_prop_train

                # Fit synthetic data model
                model.fit(df)

                # Sample data to train
                sampled_data = model.sample(num_rows=nrows, randomize_samples=False)
                X_prop_train = sampled_data.drop(columns=["y"])
                y_prop_train = sampled_data["y"]

                # Min-max scale the targets with the range of the synthetic
                # sample; the same range is applied to the real targets below.
                min_y = y_prop_train.min()
                max_y = y_prop_train.max()
                y_prop_train = ((y_prop_train - min_y) / (max_y - min_y)).to_numpy()

                # Real data used for the test and calibration splits.
                X_eval, y_eval = _load_dataset(dataset, seed)

                y_eval = (np.asarray(y_eval) - min_y) / (max_y - min_y)
                # Flatten so that column-vector targets also fit into a Series.
                y_eval = pd.Series(y_eval.reshape(-1))

                # Half of the real rows form the test set.
                test_ids = random.sample(
                    list(range(len(y_eval))), int(0.5 * len(y_eval))
                )
                X_test = _take_rows(X_eval, test_ids)
                y_test = y_eval.iloc[test_ids]

                remaining_eval_ids = np.setdiff1d(np.arange(len(y_eval)), test_ids)

                # Only the held-out part of this split is kept, as the
                # calibration set.
                _, X_cal, _, y_cal = train_test_split(
                    _take_rows(X_eval, remaining_eval_ids),
                    y_eval.iloc[remaining_eval_ids],
                    test_size=cal_size,
                    random_state=seed,
                )

                X_prop_train, X_cal, X_test = (
                    np.array(X_prop_train),
                    np.array(X_cal),
                    np.array(X_test),
                )
                y_prop_train, y_cal, y_test = (
                    np.array(y_prop_train),
                    np.array(y_cal),
                    np.array(y_test),
                )

                learner = xgb.XGBRegressor(n_estimators=nest, random_state=seed)
                learner.fit(X_prop_train, y_prop_train)

                # Triage scores the synthetic training rows, calibrated on the
                # real calibration split.
                triage = Triage(
                    X_eval=X_prop_train,
                    y_eval=y_prop_train,
                    X_cal=X_cal,
                    y_cal=y_cal,
                    nest=nest,
                    learner=learner,
                )
                groups_ids, raw_metrics = triage.run(
                    compute_cpd=True, compute_crps=True
                )
                triage_array = raw_metrics["score_metric"]

                uncert = np.std(triage_array, axis=-1)
                confidence = np.mean(triage_array, axis=-1)
                low_uncert = uncert <= np.percentile(uncert, percentile_thresh)

                # Get groups and mainly well-estimated groups
                # NOTE(review): "oe"/"ue"/"we" presumably stand for over-,
                # under- and well-estimated -- confirm.
                oe_group = np.where((confidence <= conf_thresh_low) & low_uncert)[0]
                ue_group = np.where((confidence >= conf_thresh_high) & low_uncert)[0]
                combined_group = np.concatenate((oe_group, ue_group))
                # Set difference instead of a per-element membership scan; this
                # also avoids rebinding the builtin `id` at notebook scope.
                we_group = np.setdiff1d(np.arange(len(confidence)), combined_group)

                prop_discarded.append(
                    (len(ue_group) + len(oe_group)) / len(X_prop_train)
                )

                y_pred = learner.predict(X_test)
                mae.append(mean_absolute_error(y_test, y_pred))

            print(modelname)
            results[modelname] = {
                "discard": np.mean(prop_discarded),
                "mae": np.mean(mae),
            }
    except Exception as e:
        # Best effort: a dataset that fails (e.g. its files are missing) is
        # reported and skipped, and none of its results are recorded.
        print(e)
        continue
    print(dataset)
    print(results)
    final_results[dataset] = results


  0%|          | 0/9 [00:00<?, ?it/s]

bike
CTGAN 0
[Errno 2] No such file or directory: './data/bike_train.csv'
boston
CTGAN 0
CTGAN 1
CTGAN 2
CTGAN 3
CTGAN 4
CTGAN
TVAE 0
TVAE 1
TVAE 2
TVAE 3
TVAE 4


 22%|██▏       | 2/9 [01:52<06:32, 56.11s/it]

TVAE
boston
{'CTGAN': {'discard': 0.59248, 'mae': 0.16940937866560452}, 'TVAE': {'discard': 0.40671999999999997, 'mae': 0.12299813702623756}}
concrete
CTGAN 0
CTGAN 1
CTGAN 2
CTGAN 3
CTGAN 4
CTGAN
TVAE 0
TVAE 1
TVAE 2
TVAE 3
TVAE 4


 33%|███▎      | 3/9 [04:22<09:32, 95.40s/it]

TVAE
concrete
{'CTGAN': {'discard': 0.5768600000000002, 'mae': 0.18450783407877333}, 'TVAE': {'discard': 0.5128600000000001, 'mae': 0.11010332626540328}}
star
CTGAN 0
[Errno 2] No such file or directory: './data/STAR.csv'
mimic_antibiotics
CTGAN 0
[Errno 2] No such file or directory: 'data/mimic_antibiotics_static_train_data.csv'
los
CTGAN 0
[Errno 2] No such file or directory: 'data/LengthOfStay.csv'
cancer
CTGAN 0
CTGAN 1
CTGAN 2
CTGAN 3
CTGAN 4
CTGAN
TVAE 0
TVAE 1
TVAE 2
TVAE 3
TVAE 4


 78%|███████▊  | 7/9 [21:36<07:00, 210.28s/it]

TVAE
cancer
{'CTGAN': {'discard': 0.51586, 'mae': 0.19781326705459215}, 'TVAE': {'discard': 0.50706, 'mae': 0.182426155038921}}
protein
CTGAN 0
CTGAN 1
CTGAN 2
CTGAN 3
CTGAN 4
CTGAN
TVAE 0
TVAE 1
TVAE 2
TVAE 3
TVAE 4


100%|██████████| 9/9 [1:03:12<00:00, 421.43s/it]

TVAE
protein
{'CTGAN': {'discard': 0.42779999999999996, 'mae': 0.22943454709065453}, 'TVAE': {'discard': 0.5052000000000001, 'mae': 0.2090141668717247}}
bio
CTGAN 0
[Errno 2] No such file or directory: './data/CASP.csv'





In [9]:
# One summary line per dataset: share of synthetic rows kept (1 - discard) and
# the mean absolute error, for each generator.
for key, per_model in final_results.items():
    ctgan = per_model["CTGAN"]
    tvae = per_model["TVAE"]
    print(
        key,
        "CTGAN:",
        # The explicit format hides float artefacts from the `* 100`
        # (e.g. 56.99999999999999 instead of 57.0).
        f"{np.round(1 - ctgan['discard'], 2) * 100:.1f}%",
        np.round(ctgan["mae"], 2),
        "TVAE: ",
        f"{np.round(1 - tvae['discard'], 2) * 100:.1f}%",
        np.round(tvae["mae"], 2),
    )


boston CTGAN: 41.0% 0.17 TVAE:  59.0% 0.12
concrete CTGAN: 42.0% 0.18 TVAE:  49.0% 0.11
cancer CTGAN: 48.0% 0.2 TVAE:  49.0% 0.18
protein CTGAN: 56.99999999999999% 0.23 TVAE:  49.0% 0.21


In [10]:
# Show the full results dictionary as the cell output.
final_results


{'boston': {'CTGAN': {'discard': 0.59248, 'mae': 0.16940937866560452},
  'TVAE': {'discard': 0.40671999999999997, 'mae': 0.12299813702623756}},
 'concrete': {'CTGAN': {'discard': 0.5768600000000002,
   'mae': 0.18450783407877333},
  'TVAE': {'discard': 0.5128600000000001, 'mae': 0.11010332626540328}},
 'cancer': {'CTGAN': {'discard': 0.51586, 'mae': 0.19781326705459215},
  'TVAE': {'discard': 0.50706, 'mae': 0.182426155038921}},
 'protein': {'CTGAN': {'discard': 0.42779999999999996,
   'mae': 0.22943454709065453},
  'TVAE': {'discard': 0.5052000000000001, 'mae': 0.2090141668717247}}}