# Experiment 1: the influence of a confounder on classification performance

In this experiment, we use a simplified causal graph consisting of three nodes: immune state, confounder, and AIRR, to show how confounders might influence the prediction task.

Immune state is a binary variable that takes the value `True` or `False` to indicate whether an individual is diseased or healthy. In this setting, the confounder also has two values: `C1` and `C2`. AIRR is a set of sequences simulated based on the values of the immune state and the confounder for the given individual.

Steps:

1. Simulate training and test datasets from a causal graph that includes: the confounder (implemented by implanting the 3-mer `ADR` in the repertoire) and the immune state (implemented by implanting a signal in the repertoire, where the exact 3-mer depends on the value of the confounder; it is either `QPR` or `EQY`).

2. Train an ML model (here: logistic regression on repertoires represented by k-mer frequencies) on the training set and assess its performance on the test set in the presence of a confounder with a stable distribution across train and test.

Software used: 

- DagSim for simulation of the causal graph; 
- immuneML v2.1 for implanting signal in AIRRs and for training and assessing machine learning classifiers; 
- OLGA for simulation of naive AIRRs

In [None]:
import yaml
import numpy as np
from util.dataset_util import setup_path
from util.implanting import make_immune_state_signals, make_confounder_signal
from util.experiment1 import make_graph

## Experiment 1a - Stable confounder distribution is not a problem for prediction

### Step 1: AIRR simulation from a causal graph

In [None]:
# configuration of experiment 1a: every value below is read by the following cells
experiment_name = "1a"

# output folders are derived from the experiment name (same scheme as the ML result folder
# in the specs cell), so the name and the folders cannot drift apart
result_path = setup_path(f"./experiment{experiment_name}/")
data_path = setup_path(f"./experiment{experiment_name}/data/")

# how many repertoires to make for training and testing
train_example_count = 200
test_example_count = 100

# immune state: probability of success p, chosen depending on the confounder value
immune_state_p_conf1 = 0.8  # for confounder = C1
immune_state_p_conf2 = 0.2  # for confounder = C2

# confounder: probability of success p, identical for train and test in this experiment
confounder_p_train = 0.5  # passed to make_graph below
confounder_p_test = 0.5  # passed to simulate_dataset in the simulation cell

# other parameters
immune_state_implanting_rate = 0.02  # fraction (not percentage) of repertoire sequences to include immune state signal
confounder_implanting_rate = 0.2  # presumably the analogous fraction for the confounder signal - TODO confirm
# NOTE(review): experiments 1b and 1c use 500 sequences per repertoire - confirm that 100 is intended here
sequence_count = 100  # number of sequences in one repertoire

immune_state_signal_name = "immune_state"
immune_state_signals = make_immune_state_signals(signal_name=immune_state_signal_name)
confounder_signal = make_confounder_signal()

# define nodes of the causal graph
graph = make_graph(
    confounder_p_train=confounder_p_train,
    immune_state_p_conf1=immune_state_p_conf1,
    immune_state_p_conf2=immune_state_p_conf2,
    data_path=data_path,
    sequence_count=sequence_count,
    immune_state_signals=immune_state_signals,
    confounder_signal=confounder_signal,
    immune_state_implanting_rate=immune_state_implanting_rate,
    confounder_implanting_rate=confounder_implanting_rate,
    experiment_name=experiment_name,
)

graph.draw()

In [None]:
# generate the train and test repertoires from the causal graph defined above;
# confounder_p_test is handed over separately from the graph, which was built with confounder_p_train

from util.experiment1 import simulate_dataset

simulate_dataset(
    train_example_count=train_example_count,
    test_example_count=test_example_count,
    data_path=data_path,
    graph=graph,
    confounder_p_test=confounder_p_test,
    sequence_count=sequence_count,
    immune_state_signals=immune_state_signals,
    confounder_signal=confounder_signal,
    immune_state_implanting_rate=immune_state_implanting_rate,
    confounder_implanting_rate=confounder_implanting_rate,
    experiment_name=experiment_name,
)



In [None]:
import pandas as pd
from glob import glob
from sklearn.metrics import matthews_corrcoef
from pathlib import Path


def print_all_simulation_stats(start_path: Path):
    """Print the simulation statistics of every metadata file below ``start_path``.

    The folder is searched recursively for files whose name ends in
    ``metadata.csv``. For each match the file name is printed, followed by
    every entry returned by ``get_simulation_stats`` (title first, value on
    the next line).

    Args:
        start_path: root folder of the search; a ``Path`` or a plain string.
    """
    pattern = str(Path(start_path) / "**/*metadata.csv")

    # glob yields matches in filesystem-dependent order; sorting keeps the printed report reproducible
    for metadata_file in sorted(glob(pattern, recursive=True)):
        print(f"\n\n{Path(metadata_file).name}\n")
        for title, value in get_simulation_stats(metadata_file).items():
            print(title)
            print(f"{value}\n")


def get_simulation_stats(metadata_path: Path):
    """Summarise the ``confounder`` and ``immune_state`` columns of one metadata CSV.

    Args:
        metadata_path: location of the CSV file; it must contain the columns
            ``confounder`` and ``immune_state``.

    Returns:
        A dict with four entries: the Matthews correlation coefficient between
        the two columns, the number of rows per confounder value, the number
        of rows per immune state value, and the row counts for every
        (confounder, immune state) combination.
    """
    metadata = pd.read_csv(metadata_path)
    confounder = metadata['confounder']
    immune_state = metadata['immune_state']

    confounder_labels, confounder_totals = np.unique(confounder, return_counts=True)
    immune_labels, immune_totals = np.unique(immune_state, return_counts=True)

    stats = {}
    # NOTE(review): the notebook introduction describes immune state as True/False and the confounder
    # as C1/C2; both columns are passed to matthews_corrcoef as-is - confirm the label encodings
    # stored in the CSV make this a meaningful correlation
    stats["Matthews correlation coefficient between immune state and confounder"] = matthews_corrcoef(immune_state, confounder)
    stats["Confounder stats"] = dict(zip(confounder_labels, confounder_totals))
    stats["Immune state stats"] = dict(zip(immune_labels, immune_totals))
    stats["Immune state per confounder value"] = metadata[['confounder', 'immune_state']].value_counts()
    return stats

# print the label statistics for every metadata file found under the data folder of the current experiment
print_all_simulation_stats(data_path)

### Step 2: train an ML model and assess performance

In [None]:
from util.experiment1 import define_specs

# build the analysis specification for the data of the current experiment
specs = define_specs(data_path, experiment_name=experiment_name)

# the specification file is stored in the folder that also receives the ML results
ml_result_path = setup_path(f"./experiment{experiment_name}/ml_result/")
specification_path = ml_result_path / "specs.yaml"

# serialise the specification to YAML so that immuneML can read it from disk
with specification_path.open("w") as file:
    yaml.dump(specs, stream=file)

In [None]:
# execute the analysis described in the specs file with immuneML

from immuneML.app.ImmuneMLApp import ImmuneMLApp

# immuneML output goes into a subfolder of the ML result folder
output_path = ml_result_path / "result/"

app = ImmuneMLApp(specification_path=specification_path, result_path=output_path)
# the returned object is handed to the plotting cell that follows
result = app.run()

In [None]:
from util.plotting import plot_balanced_error_rate

# pass the immuneML run result and the ML result folder to the plotting helper
# (presumably it plots the balanced error rate and stores the figure under result_path - TODO confirm in util.plotting)
plot_balanced_error_rate(iml_result=result, result_path=ml_result_path)

In [None]:
# inspect the trained model: embed the HTML report of the largest 25 coefficients

from IPython.display import IFrame

coefficients_report = ml_result_path / "result/HTML_output/train_ml_split_1_immune_state_kmer_frequency_logistic_regression_optimal_reports_ml_method_coefficients_largest_25_coefficients.html"

IFrame(src=str(coefficients_report), width=700, height=600)

## Experiment 1b - Minor changes in confounder distribution do not always influence performance

### Step 1: AIRR simulation from the causal graph

In [None]:
# configuration of experiment 1b: every value below is read by the following cells
experiment_name = "1b"

# output folders are derived from the experiment name (same scheme as the ML result folder
# in the specs cell), so the name and the folders cannot drift apart
result_path = setup_path(f"./experiment{experiment_name}/")
data_path = setup_path(f"./experiment{experiment_name}/data/")

# how many repertoires to make for training and testing
train_example_count = 200
test_example_count = 100

# immune state: probability of success p, chosen depending on the confounder value
immune_state_p_conf1 = 0.8  # for confounder = C1
immune_state_p_conf2 = 0.2  # for confounder = C2

# confounder: probability of success p, slightly different between train and test in this experiment
confounder_p_train = 0.4  # passed to make_graph below
confounder_p_test = 0.5  # passed to simulate_dataset in the simulation cell

# other parameters
immune_state_implanting_rate = 0.02  # fraction (not percentage) of repertoire sequences to include immune state signal
confounder_implanting_rate = 0.2  # presumably the analogous fraction for the confounder signal - TODO confirm
sequence_count = 500  # number of sequences in one repertoire

immune_state_signal_name = "immune_state"
immune_state_signals = make_immune_state_signals(signal_name=immune_state_signal_name)
confounder_signal = make_confounder_signal()

# define nodes of the causal graph
graph = make_graph(
    confounder_p_train=confounder_p_train,
    immune_state_p_conf1=immune_state_p_conf1,
    immune_state_p_conf2=immune_state_p_conf2,
    data_path=data_path,
    sequence_count=sequence_count,
    immune_state_signals=immune_state_signals,
    confounder_signal=confounder_signal,
    immune_state_implanting_rate=immune_state_implanting_rate,
    confounder_implanting_rate=confounder_implanting_rate,
    experiment_name=experiment_name,
)

graph.draw()

In [None]:
# generate the train and test repertoires from the causal graph defined above;
# confounder_p_test is handed over separately from the graph, which was built with confounder_p_train

from util.experiment1 import simulate_dataset

simulate_dataset(
    train_example_count=train_example_count,
    test_example_count=test_example_count,
    data_path=data_path,
    graph=graph,
    confounder_p_test=confounder_p_test,
    sequence_count=sequence_count,
    immune_state_signals=immune_state_signals,
    confounder_signal=confounder_signal,
    immune_state_implanting_rate=immune_state_implanting_rate,
    confounder_implanting_rate=confounder_implanting_rate,
    experiment_name=experiment_name,
)



### Step 2: train an ML model and assess performance

In [None]:
from util.experiment1 import define_specs

# build the analysis specification for the data of the current experiment
specs = define_specs(data_path, experiment_name=experiment_name)

# the specification file is stored in the folder that also receives the ML results
ml_result_path = setup_path(f"./experiment{experiment_name}/ml_result/")
specification_path = ml_result_path / "specs.yaml"

# serialise the specification to YAML so that immuneML can read it from disk
with specification_path.open("w") as file:
    yaml.dump(specs, stream=file)

In [None]:
# execute the analysis described in the specs file with immuneML

from immuneML.app.ImmuneMLApp import ImmuneMLApp

# immuneML output goes into a subfolder of the ML result folder
output_path = ml_result_path / "result/"

app = ImmuneMLApp(specification_path=specification_path, result_path=output_path)
# the returned object is handed to the plotting cell that follows
result = app.run()

In [None]:
from util.plotting import plot_balanced_error_rate

# pass the immuneML run result and the ML result folder to the plotting helper
# (presumably it plots the balanced error rate and stores the figure under result_path - TODO confirm in util.plotting)
plot_balanced_error_rate(iml_result=result, result_path=ml_result_path)

In [None]:
# inspect the trained model: embed the HTML report of the largest 25 coefficients

from IPython.display import IFrame

coefficients_report = ml_result_path / "result/HTML_output/train_ml_split_1_immune_state_kmer_frequency_logistic_regression_optimal_reports_ml_method_coefficients_largest_25_coefficients.html"

IFrame(src=str(coefficients_report), width=700, height=600)

## Experiment 1c - Changes in confounder distribution can degrade performance

### Step 1: AIRR simulation from the causal graph

In [None]:
# configuration of experiment 1c: every value below is read by the following cells
experiment_name = "1c"

# output folders are derived from the experiment name (same scheme as the ML result folder
# in the specs cell), so the name and the folders cannot drift apart
result_path = setup_path(f"./experiment{experiment_name}/")
data_path = setup_path(f"./experiment{experiment_name}/data/")

# how many repertoires to make for training and testing
train_example_count = 200
test_example_count = 100

# immune state: probability of success p, chosen depending on the confounder value
immune_state_p_conf1 = 0.8  # for confounder = C1
immune_state_p_conf2 = 0.2  # for confounder = C2

# confounder: probability of success p, shifted between train and test in this experiment
confounder_p_train = 0.4  # passed to make_graph below
confounder_p_test = 0.6  # passed to simulate_dataset in the simulation cell

# other parameters
immune_state_implanting_rate = 0.02  # fraction (not percentage) of repertoire sequences to include immune state signal
confounder_implanting_rate = 0.2  # presumably the analogous fraction for the confounder signal - TODO confirm
sequence_count = 500  # number of sequences in one repertoire

immune_state_signal_name = "immune_state"
immune_state_signals = make_immune_state_signals(signal_name=immune_state_signal_name)
confounder_signal = make_confounder_signal()

# define nodes of the causal graph
graph = make_graph(
    confounder_p_train=confounder_p_train,
    immune_state_p_conf1=immune_state_p_conf1,
    immune_state_p_conf2=immune_state_p_conf2,
    data_path=data_path,
    sequence_count=sequence_count,
    immune_state_signals=immune_state_signals,
    confounder_signal=confounder_signal,
    immune_state_implanting_rate=immune_state_implanting_rate,
    confounder_implanting_rate=confounder_implanting_rate,
    experiment_name=experiment_name,
)

graph.draw()

In [None]:
# generate the train and test repertoires from the causal graph defined above;
# confounder_p_test is handed over separately from the graph, which was built with confounder_p_train

from util.experiment1 import simulate_dataset

simulate_dataset(
    train_example_count=train_example_count,
    test_example_count=test_example_count,
    data_path=data_path,
    graph=graph,
    confounder_p_test=confounder_p_test,
    sequence_count=sequence_count,
    immune_state_signals=immune_state_signals,
    confounder_signal=confounder_signal,
    immune_state_implanting_rate=immune_state_implanting_rate,
    confounder_implanting_rate=confounder_implanting_rate,
    experiment_name=experiment_name,
)


### Step 2: train an ML model and assess performance

In [None]:
from util.experiment1 import define_specs

# build the analysis specification for the data of the current experiment
specs = define_specs(data_path, experiment_name=experiment_name)

# the specification file is stored in the folder that also receives the ML results
ml_result_path = setup_path(f"./experiment{experiment_name}/ml_result/")
specification_path = ml_result_path / "specs.yaml"

# serialise the specification to YAML so that immuneML can read it from disk
with specification_path.open("w") as file:
    yaml.dump(specs, stream=file)

In [None]:
# execute the analysis described in the specs file with immuneML

from immuneML.app.ImmuneMLApp import ImmuneMLApp

# immuneML output goes into a subfolder of the ML result folder
output_path = ml_result_path / "result/"

app = ImmuneMLApp(specification_path=specification_path, result_path=output_path)
# the returned object is handed to the plotting cell that follows
result = app.run()

In [None]:
from util.plotting import plot_balanced_error_rate

# pass the immuneML run result and the ML result folder to the plotting helper
# (presumably it plots the balanced error rate and stores the figure under result_path - TODO confirm in util.plotting)
plot_balanced_error_rate(iml_result=result, result_path=ml_result_path)

In [None]:
# inspect the trained model: embed the HTML report of the largest 25 coefficients

from IPython.display import IFrame

coefficients_report = ml_result_path / "result/HTML_output/train_ml_split_1_immune_state_kmer_frequency_logistic_regression_optimal_reports_ml_method_coefficients_largest_25_coefficients.html"

IFrame(src=str(coefficients_report), width=700, height=600)