In [1]:
# importing required libraries
%pip install ganite
import os, sys
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import math as m
from ganite import Ganite
from ganite.utils.metrics import sqrt_PEHE_with_diff
from tqdm import tqdm

Note: you may need to restart the kernel to use updated packages.


In [2]:
# stdlib
import os
import random
from pathlib import Path
from typing import Any, Tuple

import ganite.logger as log

# third party
import numpy as np

# from .network import download_if_needed

# One seed shared by every random number generator the experiment touches,
# so that Restart Kernel -> Run All reproduces the reported scores.
SEED = 0
np.random.seed(SEED)
random.seed(SEED)

# The GANITE model returns torch tensors (see `.cpu().numpy()` in the
# experiment loop), so PyTorch's global RNG has to be seeded as well;
# seeding only NumPy and `random` leaves the training runs non-deterministic.
import torch  # noqa: E402  (kept next to the seeding it enables)

torch.manual_seed(SEED)

# File names of the IHDP train/test archives, resolved relative to the
# working directory by the loaders below.
TRAIN_DATASET = "ihdp_npci_1-1000.train.npz"
TEST_DATASET = "ihdp_npci_1-1000.test.npz"



# helper functions
def load_data_npz(fname: Path, get_po: bool = True) -> dict:
    """
    Helper for loading the IHDP data set from an ``.npz`` archive
    (adapted from https://github.com/clinicalml/cfrnet).

    Parameters
    ----------
    fname: Path
        Path of the ``.npz`` archive. It must contain the arrays ``x``, ``t``
        and ``yf``; ``ycf`` is optional.
    get_po: bool, default True
        If True, also read the ``mu0`` and ``mu1`` arrays from the archive.

    Returns
    -------
    data: dict
        Dict with the keys ``X`` (from ``x``), ``w`` (from ``t``), ``y``
        (from ``yf``) and ``ycf`` (``None`` when the archive has no ``ycf``
        entry), plus ``mu0``/``mu1`` when ``get_po`` is True, and the derived
        entries ``HAVE_TRUTH`` (whether ``ycf`` is present), ``dim``
        (``X.shape[1]``) and ``n`` (``X.shape[0]``).

    Raises
    ------
    KeyError
        If a required array is missing from the archive.
    """
    # np.load returns a lazily-read NpzFile that keeps the archive open; the
    # context manager closes it once every array has been materialised.
    with np.load(fname) as data_in:
        data = {"X": data_in["x"], "w": data_in["t"], "y": data_in["yf"]}

        # Only an absent entry means "no counterfactuals". Any other failure
        # (corrupt archive, interrupt, ...) must surface instead of silently
        # turning into ycf=None.
        data["ycf"] = data_in["ycf"] if "ycf" in data_in.files else None

        if get_po:
            data["mu0"] = data_in["mu0"]
            data["mu1"] = data_in["mu1"]

    data["HAVE_TRUTH"] = data["ycf"] is not None
    data["dim"] = data["X"].shape[1]
    data["n"] = data["X"].shape[0]

    return data


def prepare_ihdp_data(
    data_train: dict,
    data_test: dict,
    rescale: bool = True,
    setting: str = "C",
    return_pos: bool = False,
) -> Tuple:
    """
    Helper for preprocessing one IHDP experiment.

    The inputs are never modified; every adjusted array is a new object.

    Parameters
    ----------
    data_train: pd.DataFrame or dict
        Train dataset, indexable by "X", "y", "w", "mu0" and "mu1".
    data_test: pd.DataFrame or dict
        Test dataset, indexable by the same keys.
    rescale: bool, default True
        If True and the standard deviation of the training CATE exceeds 1,
        divide the train and test potential outcomes by that standard
        deviation and rebuild the training outcome from the rescaled
        potential outcomes plus the original residual.
    setting: str, default C
        Experiment setting. With "D", ``mu0`` is added to the treated
        outcomes and to ``mu1`` (train and test) before any rescaling.
        Any other value leaves the outcomes as loaded.
    return_pos: bool
        If True, additionally return the potential outcomes.

    Returns
    -------
    X: dict or pd.DataFrame
        Training feature set
    y: pd.DataFrame or list
        Training outcomes (possibly adjusted/rescaled)
    w: pd.DataFrame or list
        Training treatment indicators
    cate_true_in: pd.DataFrame or list
        True conditional treatment effects ``mu1 - mu0`` on the training set
    X_t: pd.DataFrame or list
        Test feature set
    cate_true_out: pd.DataFrame or list
        True conditional treatment effects ``mu1_t - mu0_t`` on the test set
    mu0, mu1, mu0_t, mu1_t:
        Train and test potential outcomes; only returned (appended in this
        order) when ``return_pos`` is True.
    """

    X, y, w, mu0, mu1 = (
        data_train["X"],
        data_train["y"],
        data_train["w"],
        data_train["mu0"],
        data_train["mu1"],
    )

    X_t, _, _, mu0_t, mu1_t = (
        data_test["X"],
        data_test["y"],
        data_test["w"],
        data_test["mu0"],
        data_test["mu1"],
    )
    if setting == "D":
        # Copy before the masked assignment: `y` may be a view into the raw
        # dataset, and writing into it would corrupt the caller's data (and
        # compound on every repeated call).
        y = y.copy()
        treated = w == 1
        y[treated] = y[treated] + mu0[treated]
        mu1 = mu0 + mu1
        mu1_t = mu0_t + mu1_t

    if rescale:
        # rescale all outcomes to have similar scale of CATEs if sd_cate > 1
        # (same sign convention as cate_true_in below; the variance is
        # unaffected by the sign)
        cate_in = mu1 - mu0
        sd_cate = np.sqrt(cate_in.var())

        if sd_cate > 1:
            # training data: keep the residual untouched, rescale only the
            # potential outcomes it is attached to
            error = y - w * mu1 - (1 - w) * mu0
            mu0 = mu0 / sd_cate
            mu1 = mu1 / sd_cate
            y = w * mu1 + (1 - w) * mu0 + error

            # test data: scaled by the *training* standard deviation
            mu0_t = mu0_t / sd_cate
            mu1_t = mu1_t / sd_cate

    cate_true_in = mu1 - mu0
    cate_true_out = mu1_t - mu0_t

    if return_pos:
        return X, y, w, cate_true_in, X_t, cate_true_out, mu0, mu1, mu0_t, mu1_t

    return X, y, w, cate_true_in, X_t, cate_true_out


def get_one_data_set(D: dict, i_exp: int, get_po: bool = True) -> dict:
    """
    Extract a single experiment from the stacked IHDP data.
    Adapted from https://github.com/clinicalml/cfrnet

    Parameters
    ----------
    D: dict or pd.DataFrame
        All experiments, with the experiment index on the last axis.
    i_exp: int
        Experiment number, counted from 1.
    get_po: bool, default True
        Also extract the potential outcomes "mu0" and "mu1".

    Returns
    -------
    data: dict or pd.DataFrame
        Dict with "X", "w", "y" and "ycf" for the requested experiment
        ("ycf" is None when D["HAVE_TRUTH"] is falsy), plus "mu0" and "mu1"
        when get_po is True.
    """
    # A length-one slice (rather than a plain index) keeps the trailing axis,
    # so the per-experiment vectors stay two-dimensional columns.
    column = slice(i_exp - 1, i_exp)

    experiment = {
        "X": D["X"][:, :, i_exp - 1],
        "w": D["w"][:, column],
        "y": D["y"][:, column],
        "ycf": D["ycf"][:, column] if D["HAVE_TRUTH"] else None,
    }

    if get_po:
        experiment["mu0"] = D["mu0"][:, column]
        experiment["mu1"] = D["mu1"][:, column]

    return experiment


def load_ihdp(i):
    """
    Load and preprocess one IHDP experiment.

    Parameters
    ----------
    i: int
        Zero-based experiment index; it is shifted by one before being passed
        to get_one_data_set, which counts experiments from 1.

    Returns
    -------
    Tuple of, in this order: training features, training treatments,
    training outcomes, training potential outcomes, test features and test
    potential outcomes. Each potential-outcome array is built by stacking
    [mu0, mu1], squeezing and transposing, so mu0 comes first.
    """
    raw_train, raw_test = load_raw()

    # get_one_data_set numbers experiments from 1
    experiment_number = i + 1
    train_exp = get_one_data_set(raw_train, i_exp=experiment_number, get_po=True)
    test_exp = get_one_data_set(raw_test, i_exp=experiment_number, get_po=True)

    prepared = prepare_ihdp_data(
        train_exp,
        test_exp,
        rescale=True,
        return_pos=True,
    )
    # The two CATE entries (positions 3 and 5) are not part of this
    # function's result.
    X, y, w, _, X_t, _, mu0, mu1, mu0_t, mu1_t = prepared

    po_train = np.asarray([mu0, mu1]).squeeze().T
    po_test = np.asarray([mu0_t, mu1_t]).squeeze().T

    return X, w, y, po_train, X_t, po_test


def load_raw(data_path: Path = Path(".")):
    """
    Get IHDP raw train/test sets.

    Parameters
    ----------
    data_path: Path, default current working directory
        Directory containing the TRAIN_DATASET and TEST_DATASET archives.
        The files must already be present; nothing is downloaded.

    Returns
    -------
    data_train: dict
        Training data, as returned by load_data_npz
    data_test: dict
        Testing data, as returned by load_data_npz
    """
    # Build both paths from the module-level file-name constants instead of
    # repeating the literals here, so there is a single place to change them.
    data_dir = Path(data_path)

    data_train = load_data_npz(data_dir / TRAIN_DATASET, get_po=True)
    data_test = load_data_npz(data_dir / TEST_DATASET, get_po=True)

    return data_train, data_test

In [3]:
# Experiment configuration, named once so the result buffer and the loop
# bound cannot drift apart.
N_EXPERIMENTS = 1000   # number of IHDP experiments evaluated
NUM_ITERATIONS = 1000  # value passed to Ganite as num_iterations
REPORT_EVERY = 20      # print the score of every 20th experiment
dataset = 'ihdp'       # label used in the progress message (loop-invariant)

# rpehe[i] holds the sqrt-PEHE of experiment i on its test split.
rpehe = np.zeros(N_EXPERIMENTS)
for i in range(N_EXPERIMENTS):
    # Y_train_full / Y_test are the potential outcomes returned by load_ihdp.
    X_train, W_train, Y_train, Y_train_full, X_test, Y_test = load_ihdp(i)
    model = Ganite(X_train, W_train, Y_train, num_iterations=NUM_ITERATIONS)
    pred = model(X_test).cpu().numpy()
    # Despite its name, `rmse` holds the value of sqrt_PEHE_with_diff.
    rmse = sqrt_PEHE_with_diff(Y_test, pred)
    if i % REPORT_EVERY == 0:
        print(f"PEHE score for GANITE on {dataset} = {rmse}")
    rpehe[i] = rmse
    

PEHE score for GANITE on ihdp = 1.7384815014766548
PEHE score for GANITE on ihdp = 1.5762207812291553
PEHE score for GANITE on ihdp = 1.7416427090860167
PEHE score for GANITE on ihdp = 1.6823226103611775
PEHE score for GANITE on ihdp = 1.3743827107933329
PEHE score for GANITE on ihdp = 2.072576503449778
PEHE score for GANITE on ihdp = 1.794213501928264
PEHE score for GANITE on ihdp = 2.742754476418878
PEHE score for GANITE on ihdp = 2.5027265022214684
PEHE score for GANITE on ihdp = 2.2581949355193176
PEHE score for GANITE on ihdp = 1.8329250494425395
PEHE score for GANITE on ihdp = 1.5643042606719901
PEHE score for GANITE on ihdp = 1.5330507389277424
PEHE score for GANITE on ihdp = 2.245189642721363
PEHE score for GANITE on ihdp = 1.7386304747910597
PEHE score for GANITE on ihdp = 1.9587468618272528
PEHE score for GANITE on ihdp = 1.9970339810091795
PEHE score for GANITE on ihdp = 1.8469408225587143
PEHE score for GANITE on ihdp = 1.5741026368235422
PEHE score for GANITE on ihdp = 1.3

In [4]:
# Mean sqrt-PEHE over all evaluated experiments.
mean_rpehe = rpehe.mean()
print(mean_rpehe)

1.827239007326103


In [5]:
from scipy.stats import sem

# Standard error of the mean of the per-experiment sqrt-PEHE scores.
rpehe_sem = sem(rpehe, axis=0)
print(rpehe_sem)

0.01388934843300318
