In [1]:
from fklearn.data.datasets import make_confounded_data

In [2]:
# Number of synthetic samples to generate.
N = 10_000

# Column-name lists: the covariates, and the single treatment column.
features = ["sex", "age", "severity"]
treatment = ["medication"]

In [3]:
# Build the three synthetic frames for N samples. Per the generator's docstring
# they are, in order: randomly assigned treatment, observational (confounded),
# and counterfactual (same as observational but with the treatment flipped).
df_rnd, df_obs, df_cf = make_confounded_data(N)

In [4]:
# Pairwise correlation matrix of the observational (confounded) data.
df_obs.corr()

Unnamed: 0,sex,age,severity,medication,recovery
sex,1.0,0.020541,0.022876,0.226197,0.289954
age,0.020541,1.0,0.557409,0.470338,0.598271
severity,0.022876,0.557409,1.0,0.799194,0.172188
medication,0.226197,0.470338,0.799194,1.0,-0.085889
recovery,0.289954,0.598271,0.172188,-0.085889,1.0


In [5]:
# Preview the first five rows of the observational data.
df_obs.head(5)

Unnamed: 0,sex,age,severity,medication,recovery
0,0,35.857725,0.713511,1.0,17
1,1,26.162705,0.191286,0.0,28
2,1,25.284784,0.010045,0.0,26
3,1,41.616736,0.858321,1.0,35
4,0,22.062427,0.008502,0.0,14


In [None]:
def make_confounded_data(n: int) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    """
    Generates fake data for counterfactual experimentation. The covariates are
    sex, age and severity, the treatment is a binary variable, medication, and
    the response is the number of days until recovery.

    The data is generated from a fixed seed (1111), so the same `n` always
    yields the same three frames. A private ``np.random.RandomState`` is used
    instead of reseeding the global NumPy generator, so calling this function
    does not disturb the caller's random state. The generated values are the
    same as those produced by ``np.random.seed(1111)`` followed by the global
    ``np.random.*`` draws.

    Parameters
    ----------
    n : int
        The number of samples to generate

    Returns
    -------
    df_rnd : pd.DataFrame
        A dataframe where the treatment is randomly assigned.

    df_obs : pd.DataFrame
        A dataframe with confounding.

    df_ctf : pd.DataFrame
        A counter factual dataframe with confounding. Same as df_obs, but
        with the treatment flipped (and recovery re-drawn under the flipped
        treatment).
    """

    # Local generator: same stream as the legacy global seed, no global side effect.
    rng = np.random.RandomState(1111)

    def get_severity(df: pd.DataFrame) -> pd.Series:
        # Severity is drawn from a different beta distribution below / from age 30.
        # Both draws are always made (young first) so the random stream stays fixed.
        n_rows = df.shape[0]
        young = rng.beta(1, 3, size=n_rows) * (df["age"] < 30)
        old = rng.beta(3, 1.5, size=n_rows) * (df["age"] >= 30)
        return young + old

    def get_treatment(df: pd.DataFrame) -> pd.Series:
        # Treatment depends on sex and severity plus gaussian noise: this is
        # what makes the observational data confounded.
        score = (.33 * df["sex"]
                 + 1.5 * df["severity"]
                 + 0.15 * rng.normal(size=df.shape[0]))
        return (score > 0.8).astype(float)

    def get_recovery(df: pd.DataFrame) -> np.ndarray:
        # Poisson-distributed recovery whose log-rate is linear in the
        # covariates; medication lowers the log-rate by 1.
        log_rate = (2
                    + 0.5 * df["sex"]
                    + 0.03 * df["age"]
                    + df["severity"]
                    - df["medication"])
        return rng.poisson(np.exp(log_rate))

    # The order of the draws below defines the output; do not reorder.
    sexes = rng.randint(0, 2, size=n)
    ages = rng.gamma(8, scale=4, size=n)
    meds = rng.randint(0, 2, size=n)

    # random data
    df_rnd = pd.DataFrame(dict(sex=sexes, age=ages, medication=meds))
    df_rnd['severity'] = get_severity(df_rnd)
    df_rnd['recovery'] = get_recovery(df_rnd)

    features = ['sex', 'age', 'severity', 'medication', 'recovery']
    df_rnd = df_rnd[features]  # to enforce column order

    # obs data
    df_obs = df_rnd.copy()
    df_obs['medication'] = get_treatment(df_obs)
    df_obs['recovery'] = get_recovery(df_obs)

    # counterfactual data: flip the 0/1 treatment and re-draw the outcome
    df_ctf = df_obs.copy()
    df_ctf['medication'] = (df_ctf['medication'] != 1).astype(float)
    df_ctf['recovery'] = get_recovery(df_ctf)

    return df_rnd, df_obs, df_ctf
