In [1]:
from scripts.transformer_prediction_interface.base import DoPFNRegressor
import torch

## Sales Dataset

In [2]:
from copy import deepcopy
from datasets import load_dataset

dataset = load_dataset(ds_name="sales_cate")
dopfn = DoPFNRegressor()

train_ds, test_ds = dataset.generate_valid_split(n_splits=2)

dopfn.fit(train_ds.x_obs, train_ds.y_obs)

x_int = deepcopy(test_ds.x_int)

y_pred = dopfn.predict_full(x_int)["mean"]

Changed model to be compatible with CPU, this is needed for the current version of PyTorch, see issue: https://github.com/pytorch/pytorch/issues/97128. The model will be slower if reused on GPU.


Running inference: 100%|██████████| 1/1 [00:00<00:00,  2.02batch/s]


In [3]:
(((y_pred - test_ds.y_int.numpy()) / (test_ds.y_int.max() - test_ds.y_int.min())) ** 2).mean()

tensor(0.3136)

## ACIC 2016 Dataset

In [4]:
# The ACIC 2016 challenge dataset
#
# Sources:
# [1] Dorie, Vincent, et al. "Automated versus do-it-yourself methods for causal inference: Lessons learned
# from a data analysis competition." (2019): 43-68.
# [2] https://github.com/BiomedSciAI/causallib/tree/master/causallib/datasets/data/acic_challenge_2016
#
# The challenge includes 10 different datasets.

from typing import Any

import numpy as np
import pandas as pd

from abc import ABC, abstractmethod
from dataclasses import dataclass

import numpy as np


@dataclass
class CATE_Dataset:  # conditional average treatment effect
    X_train: np.ndarray
    t_train: np.ndarray
    y_train: np.ndarray
    X_test: np.ndarray
    true_cate: np.ndarray


class EvalDatasetCatalog(ABC):
    """
    The dataset catalog is a collection of datasets used for evaluating the model.
    """

    def __init__(self, n_tables: int, name: str):
        self.n_tables = n_tables
        self.name = name

    def __len__(self):
        return self.n_tables

    def __str__(self):
        return self.name

    @abstractmethod
    def __getitem__(self, index) -> Any:
        raise NotImplementedError("This method should be implemented by the subclass")


X_CSV_URL = (
    "https://raw.githubusercontent.com/BiomedSciAI/causallib/master/causallib/datasets/data/acic_challenge_2016/x.csv"
)

ZY_CSV_URL = (
    lambda i: f"https://raw.githubusercontent.com/BiomedSciAI/causallib/master/causallib/datasets/data/acic_challenge_2016/zymu_{i}.csv"
)


class ACIC2016Dataset(EvalDatasetCatalog):
    def __init__(self, test_ratio: float = 0.1, seed: int = 42, n_tables: int = 10):
        super().__init__(n_tables, name="ACIC2016")
        self.test_ratio = test_ratio
        self.x_data = pd.read_csv(X_CSV_URL)
        self.rngs = [np.random.default_rng(seed + i) for i in range(n_tables)]
        self.datasets = [self._get_data(i) for i in range(n_tables)]

    def _get_data(self, idx: int) -> CATE_Dataset:
        """Loads and processes a single dataset split."""
        # Download file URLs
        simulation_url = ZY_CSV_URL(idx + 1)

        sim_data = pd.read_csv(simulation_url)

        # Define column names for x.csv and simulation data
        self.x_data.columns = [f"x_{i+1}" for i in range(self.x_data.shape[1])]
        sim_data.columns = ["z", "y0", "y1", "mu0", "mu1"]

        # Handle categorical variables
        categorical_columns = ["x_2", "x_21", "x_24"]
        numerical_columns = [f"x_{i+1}" for i in range(self.x_data.shape[1]) if f"x_{i+1}" not in categorical_columns]
        self.x_data["x_2_numeric"] = self.x_data["x_2"].astype("category").cat.codes
        self.x_data["x_21_numeric"] = self.x_data["x_21"].astype("category").cat.codes
        self.x_data["x_24_numeric"] = self.x_data["x_24"].astype("category").cat.codes
        numerical_columns = numerical_columns + ["x_2_numeric", "x_21_numeric", "x_24_numeric"]
        self.x_data = self.x_data.loc[:, numerical_columns]

        # Convert to tensors
        covariates = self.x_data.values.astype(np.float32)  # Covariates with encoded categorical variables
        treatments = sim_data["z"].values.astype(np.float32)  # Treatment

        y1 = sim_data["y1"].values.astype(np.float32)  # Potential outcomes under treatment
        y0 = sim_data["y0"].values.astype(np.float32)  # Potential outcomes under control
        outcomes = np.where(treatments == 1, y1, y0)

        mu0 = sim_data["mu0"].values.astype(np.float32)
        mu1 = sim_data["mu1"].values.astype(np.float32)
        cate = mu1 - mu0

        # Split the dataset into train and test sets
        indices = self.rngs[idx].permutation(covariates.shape[0])
        split_idx = int(len(indices) * (1 - self.test_ratio))
        train_indices = indices[:split_idx]
        test_indices = indices[split_idx:]
        cate_dataset = CATE_Dataset(
            X_train=covariates[train_indices],
            t_train=treatments[train_indices],
            y_train=outcomes[train_indices],
            X_test=covariates[test_indices],
            true_cate=cate[test_indices],
        )

        return cate_dataset

    def __getitem__(self, index) -> CATE_Dataset:
        return self.datasets[index]

In [5]:
from copy import deepcopy

dataset = ACIC2016Dataset()

pehes = []
for i in range(len(dataset)):
    cate_dset: CATE_Dataset = dataset[i]
    X_train = cate_dset.X_train
    t_train = cate_dset.t_train
    X_t_train = np.concatenate(
        [t_train[:, None], X_train],
        axis=1,
    )
    dopfn = DoPFNRegressor()
    dopfn.fit(X_t_train, cate_dset.y_train)

    x_1, x_0 = deepcopy(cate_dset.X_test), deepcopy(cate_dset.X_test)
    X_test_0 = np.concatenate(
        [
            np.zeros((x_0.shape[0], 1)),
            x_0,
        ],
        axis=1,
    )
    X_test_1 = np.concatenate(
        [
            np.ones((x_1.shape[0], 1)),
            x_1,
        ],
        axis=1,
    )

    y_test_0 = dopfn.predict(torch.from_numpy(X_test_0))
    y_test_1 = dopfn.predict(torch.from_numpy(X_test_1))
    cate_pred = y_test_1 - y_test_0
    pehe = np.sqrt(np.mean((cate_pred - cate_dset.true_cate) ** 2))
    pehes.append(pehe)

avg_pehe = sum(pehes) / len(pehes)

Running inference: 100%|██████████| 1/1 [00:04<00:00,  4.95s/batch]
Running inference: 100%|██████████| 1/1 [00:05<00:00,  5.17s/batch]
Running inference: 100%|██████████| 1/1 [00:05<00:00,  5.12s/batch]
Running inference: 100%|██████████| 1/1 [00:04<00:00,  4.96s/batch]
Running inference: 100%|██████████| 1/1 [00:06<00:00,  6.05s/batch]
Running inference: 100%|██████████| 1/1 [00:10<00:00, 10.27s/batch]
Running inference: 100%|██████████| 1/1 [00:08<00:00,  8.20s/batch]
Running inference: 100%|██████████| 1/1 [00:07<00:00,  7.88s/batch]
Running inference: 100%|██████████| 1/1 [00:05<00:00,  5.08s/batch]
Running inference: 100%|██████████| 1/1 [00:04<00:00,  4.92s/batch]
Running inference: 100%|██████████| 1/1 [00:05<00:00,  5.51s/batch]
Running inference: 100%|██████████| 1/1 [00:05<00:00,  5.43s/batch]
Running inference: 100%|██████████| 1/1 [00:04<00:00,  4.87s/batch]
Running inference: 100%|██████████| 1/1 [00:05<00:00,  5.41s/batch]
Running inference: 100%|██████████| 1/1 [00:05<0

In [6]:
print(f"Average PEHE over {len(dataset)} datasets: {avg_pehe:.4f}")

Average PEHE over 10 datasets: 4.8290
