
# Confounded vs deconfounded model comparison

Train multiple classifiers on a confounded dataset and several pre-generated deconfounded datasets (e.g., backdoor, frontdoor, truncated factorisation). Compare their predictive metrics on a shared observational hold-out and on each deconfounded hold-out.



**Workflow**
1. Load confounded CSV and multiple deconfounded CSVs (already generated; no bootstrap here).
2. Restrict every dataset to the feature columns common to all of them, then split each dataset into train/test sets.
3. Fit a set of common models on each training set.
4. Evaluate on the same confounded hold-out to see how deconfounding affects generalization to observational data.
5. Also evaluate on each deconfounded hold-out for interventional-like performance.

> Set `DECONFOUNDED_DATASETS` below to your deconfounded files (same schema and target column as the confounded dataset).


In [None]:

import numpy as np
import pandas as pd
from pathlib import Path

from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    accuracy_score,
    roc_auc_score,
    average_precision_score,
    brier_score_loss,
    log_loss,
)
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.base import clone


In [None]:

# === Paths & columns ===
# Observational (confounded) dataset.
CONFOUNDED_PATH = 'heart_disease_preprocessed.csv'
# Pre-generated deconfounded datasets; 'name' is used as the key/label downstream.
DECONFOUNDED_DATASETS = [
    {'name': 'Backdoor', 'path': 'heart_disease_deconf_backdoor.csv'},
    {'name': 'Frontdoor', 'path': 'heart_disease_deconf_frontdoor.csv'},
    {'name': 'TruncatedFactorisation', 'path': 'heart_disease_deconf_truncated.csv'},
]

TARGET = 'heartdiseasepresence'  # label column expected in every dataset
TEST_SIZE = 0.3                  # fraction of each dataset held out for testing
RANDOM_SEED = 42                 # shared seed for splits and models

# Fail fast with an explicit exception instead of `assert`, which is
# stripped when Python runs with -O and would let a missing file slip through.
if not Path(CONFOUNDED_PATH).exists():
    raise FileNotFoundError(f"Confounded file not found: {CONFOUNDED_PATH}")
for ds in DECONFOUNDED_DATASETS:
    if not Path(ds['path']).exists():
        raise FileNotFoundError(f"Deconfounded file not found: {ds['path']}")


In [None]:

# Load the confounded dataset and every configured deconfounded dataset
df_conf = pd.read_csv(CONFOUNDED_PATH)
deconf_dfs = {ds['name']: pd.read_csv(ds['path']) for ds in DECONFOUNDED_DATASETS}

# Keep only the columns present in every dataset so all models see the
# same feature set and results stay comparable.
common_cols = set(df_conf.columns).intersection(
    *(frame.columns for frame in deconf_dfs.values())
)
assert TARGET in common_cols, f"Target '{TARGET}' must exist in all datasets"

# Sorted for a deterministic feature order (sets are unordered).
feature_cols = sorted(common_cols - {TARGET})
kept_cols = feature_cols + [TARGET]

df_conf = df_conf[kept_cols]
deconf_dfs = {name: frame[kept_cols] for name, frame in deconf_dfs.items()}

# Quick sanity report: dimensions first, then class balance per dataset.
print('Common features:', len(feature_cols))
print('Confounded shape:', df_conf.shape)
for name, df in deconf_dfs.items():
    print(f"Deconfounded ({name}) shape: {df.shape}")

print('Confounded target balance:', df_conf[TARGET].value_counts())
for name, df in deconf_dfs.items():
    print(f"Deconfounded ({name}) target balance:", df[TARGET].value_counts())

df_conf.head()

In [None]:

# Stratified train/test split of every dataset, using the same test
# fraction and seed throughout. `splits` maps a dataset name to its
# train frame, test frame and a human-readable label.
conf_train, conf_test = train_test_split(
    df_conf,
    test_size=TEST_SIZE,
    random_state=RANDOM_SEED,
    stratify=df_conf[TARGET],
)
splits = {
    'Confounded': {
        'train': conf_train,
        'test': conf_test,
        'label': 'Confounded (observational)',
    },
}

for ds_name, frame in deconf_dfs.items():
    part_train, part_test = train_test_split(
        frame,
        test_size=TEST_SIZE,
        random_state=RANDOM_SEED,
        stratify=frame[TARGET],
    )
    splits[ds_name] = {
        'train': part_train,
        'test': part_test,
        'label': f'Deconfounded ({ds_name})',
    }

for ds_name, parts in splits.items():
    print(ds_name, 'train/test sizes:', len(parts['train']), len(parts['test']))



### Models and evaluation helpers

Metrics: Accuracy, ROC AUC, PR AUC (average precision), Brier score, and log loss.


In [None]:

# Hyperparameters below are the ones that performed well in hyperparameter optimisation (HPO) on the confounded data.

# Unfitted estimator prototypes keyed by display name; every estimator is
# seeded with RANDOM_SEED.
models = {
    'LogisticRegression': LogisticRegression(
        max_iter=2000,
        solver='liblinear',
        random_state=RANDOM_SEED,
    ),
    'RandomForest': RandomForestClassifier(
        n_estimators=300,
        random_state=RANDOM_SEED,
    ),
    'GradientBoosting': GradientBoostingClassifier(
        random_state=RANDOM_SEED,
    ),
    'SVC_RBF': SVC(
        probability=True,  # enables predict_proba, which get_probas prefers
        kernel='rbf',
        C=2.0,
        gamma='scale',
        random_state=RANDOM_SEED,
    ),
}

def get_probas(model, X):
    """Return probability-like scores for the samples in ``X``.

    Prefers ``model.predict_proba`` and returns its second column (index 1).
    If the model only exposes ``decision_function``, the raw scores are
    squashed into (0, 1) with the logistic sigmoid.

    Args:
        model: Fitted estimator exposing ``predict_proba`` or
            ``decision_function``.
        X: Samples to score, passed through to the estimator unchanged.

    Returns:
        Array of scores in [0, 1], one per sample.

    Raises:
        ValueError: If the model offers neither ``predict_proba`` nor
            ``decision_function``.
    """
    if hasattr(model, 'predict_proba'):
        return model.predict_proba(X)[:, 1]
    if hasattr(model, 'decision_function'):
        # Local import keeps this helper self-contained.
        from scipy.special import expit

        scores = model.decision_function(X)
        # expit is the logistic sigmoid 1 / (1 + exp(-x)), evaluated in a
        # numerically stable way: the naive form overflows in exp() for
        # large negative scores and emits a RuntimeWarning.
        return expit(scores)
    raise ValueError('Model does not support probability-like outputs')

def evaluate_model(model, X_train, y_train, X_eval, y_eval):
    """Fit a fresh clone of ``model`` and score it on an evaluation set.

    The passed-in ``model`` is never fitted itself; a clone is trained on
    ``(X_train, y_train)`` and evaluated on ``(X_eval, y_eval)``.

    Args:
        model: Estimator prototype to clone.
        X_train: Training features.
        y_train: Training labels.
        X_eval: Evaluation features.
        y_eval: Evaluation labels.

    Returns:
        Dict with keys ``accuracy``, ``roc_auc``, ``pr_auc``, ``brier`` and
        ``log_loss``. Accuracy uses hard predictions; the other metrics use
        the scores from ``get_probas``.
    """
    estimator = clone(model)
    estimator.fit(X_train, y_train)

    hard_labels = estimator.predict(X_eval)
    pos_scores = get_probas(estimator, X_eval)

    scores = {'accuracy': accuracy_score(y_eval, hard_labels)}
    # All remaining metrics are score-based and share the same call shape.
    score_based_metrics = (
        ('roc_auc', roc_auc_score),
        ('pr_auc', average_precision_score),
        ('brier', brier_score_loss),
        ('log_loss', log_loss),
    )
    for key, metric_fn in score_based_metrics:
        scores[key] = metric_fn(y_eval, pos_scores)
    return scores


In [None]:

# Score every (model, training set, evaluation set) combination.
# NOTE(review): evaluate_model clones and refits on each call, so every
# model/training-set pair is refit once per evaluation set.

# Evaluation sets: the confounded hold-out first, then each deconfounded hold-out.
eval_sets = [('Confounded', splits['Confounded']['test'])]
eval_sets += [
    (name, parts['test'])
    for name, parts in splits.items()
    if name != 'Confounded'
]

results = []
for model_name, model in models.items():
    for train_name, split in splits.items():
        train_frame = split['train']
        X_train = train_frame[feature_cols]
        y_train = train_frame[TARGET]

        for eval_name, eval_df in eval_sets:
            if eval_name == 'Confounded':
                eval_label = 'Confounded test'
            else:
                eval_label = f'Deconfounded test ({eval_name})'

            row = evaluate_model(
                model,
                X_train,
                y_train,
                eval_df[feature_cols],
                eval_df[TARGET],
            )
            # Tag the metrics with the combination they belong to.
            row['model'] = model_name
            row['training_data'] = splits[train_name]['label']
            row['evaluation'] = eval_label
            results.append(row)

results_df = pd.DataFrame(results)
results_df



### Combined, tidy comparison table


In [None]:

def tidy(df):
    """Return the results table in a fixed column order, sorted for comparison.

    Selects the identifier and metric columns, sorts rows by evaluation set,
    then model, then training data, and renumbers the index from zero.

    Args:
        df: Results frame containing at least the selected columns.

    Returns:
        A new DataFrame; ``df`` itself is not modified.
    """
    ordered_columns = [
        'model', 'training_data', 'evaluation',
        'accuracy', 'roc_auc', 'pr_auc', 'brier', 'log_loss',
    ]
    sort_keys = ['evaluation', 'model', 'training_data']

    selected = df[ordered_columns]
    ranked = selected.sort_values(sort_keys)
    return ranked.reset_index(drop=True)

# Last expression of the cell, so the notebook renders the sorted comparison table.
tidy(results_df)



### Visualize observational (confounded) test accuracy


In [None]:

import matplotlib.pyplot as plt

# Grouped bar chart: one group per training dataset, one bar per model,
# showing accuracy on the confounded (observational) test set only.
plot_df = results_df[results_df['evaluation'] == 'Confounded test'].copy()

train_labels = plot_df['training_data'].unique()
train_labels_sorted = sorted(train_labels)
model_names = sorted(plot_df['model'].unique())

x_pos = np.arange(len(train_labels_sorted))
width = 0.15

fig, ax = plt.subplots(figsize=(8, 4))
for idx, model_name in enumerate(model_names):
    per_model = plot_df.loc[plot_df['model'] == model_name].set_index('training_data')
    # Reindex by the sorted labels so bars line up with the x tick order.
    heights = per_model.loc[train_labels_sorted, 'accuracy']
    bars = ax.bar(x_pos + idx * width, heights, width=width, label=model_name)

    # Annotate each bar with its value just above the top edge.
    for bar in bars:
        bar_height = bar.get_height()
        ax.text(
            bar.get_x() + bar.get_width() / 2,
            bar_height + 0.005,
            f"{bar_height:.3f}",
            ha='center',
            va='bottom',
            fontsize=7,
        )

# Centre each tick under its group of bars.
ax.set_xticks(x_pos + width * (len(model_names) - 1) / 2)
ax.set_xticklabels(train_labels_sorted, rotation=15, ha='right')
ax.set_ylabel('Accuracy on confounded test')
ax.set_ylim(0, 1)
ax.legend(title='Model')
plt.tight_layout()
plt.show()
