# Experiment 3: the influence of batch effects in AIRR data on classification performance

In this experiment, we show how the presence of batch effects can make disease prediction poor if the disease status and batch effect are correlated.

Steps:

...

The epitope-specific signal:

<table style='border: 1px solid; border-collapse: collapse; margin-left: 0'>
    <tr style='border: 1px solid;'>
        <td style='border: 1px solid;'></td>
        <td colspan="3" style='border: 1px solid;'>specificity label</td>
    </tr>
    <tr style='border: 1px solid;'>
        <td rowspan="3" style='border: 1px solid;'>motif presence</td>
        <td style='border: 1px solid;'></td>
        <td style='border: 1px solid;'>negative</td>
        <td style='border: 1px solid;'>positive</td>
    </tr>
    <tr style='border: 1px solid;'>
        <td style='border: 1px solid;'>negative</td>
        <td style='border: 1px solid;'>45</td>
        <td style='border: 1px solid;'>15</td>
    </tr>
    <tr style='border: 1px solid;'>
        <td style='border: 1px solid;'>positive</td>
        <td style='border: 1px solid;'>5</td>
        <td style='border: 1px solid;'>35</td>
    </tr>
</table>

Software used: 

- OLGA for simulation of naive AIRRs,
- immuneML v2.1 for implanting signal in AIRRs.

In [3]:
from datetime import datetime
from pathlib import Path
from IPython.display import IFrame

from sklearn.linear_model import Ridge, LinearRegression

from causal_airr_scripts.dataset_util import setup_path
from causal_airr_scripts.experiment3.SimConfig import SimConfig, ImplantingConfig, ImplantingSetting, ImplantingUnit, ImplantingGroup
from causal_airr_scripts.experiment3.experiment3 import Experiment3
from causal_airr_scripts.util import write_config

In [9]:
# --- Run settings ---
# Base directory; the run's output directory is placed beneath it.
result_path = Path("./")
# Passed to Experiment3 as its `num_processes` argument.
num_processes = 2
# Label embedded in the name of the output directory.
name = "full_run"

# --- Signal definitions ---
# Both dictionaries are handed to SimConfig (`signal=` and `batch_signal=`).
# NOTE(review): the meaning of the individual keys (sequence positions, Hamming
# distance weights, ...) is defined by the simulation code -- confirm there.
signal = {
    "motif_seeds": ["YEQ", "PQH", "LFF"],
    "seq_position_weights": {108: 0.5, 109: 0.5},
    "hamming_dist_weights": {1: 0.8, 0: 0.2},
    "position_weights": {1: 1.0},
}

batch_signal = {
    "motif_seeds": ["CAA"],
    "hamming_dist_weights": {1: 0.8, 0: 0.2},
    "position_weights": {1: 1},
    "seq_position_weights": {104: 1.0},
    "signal_name": "batch_signal",
}

# Implanting probabilities for each data-generating setting.
# According to the recorded run log, the four ImplantingUnit values are, in order:
#   label_implanting_prob, label_given_no_motif_prob,
#   label_given_motif_prob, batch_implanting_prob.
# The 'control' values agree with the contingency table in the introduction:
#   40/100 = 0.4, 15/60 = 0.25, 35/40 = 0.875.
# NOTE(review): if the first value is the probability that the motif is present
# (as the table suggests), the implied positive-label rates are
#   batch0: 0.84 * 3/84 + 0.16 * 7/16  = 0.10
#   batch1: 0.36 * 3/4  + 0.64 * 63/64 = 0.90
# i.e. equal to the respective batch_implanting_prob -- confirm this is intended.
probability_setup = dict(
    control=ImplantingUnit(0.4, 0.25, 0.875, 0),
    batch0=ImplantingUnit(0.16, 3 / 84, 7 / 16, 0.1),
    batch1=ImplantingUnit(0.64, 3 / 4, 63 / 64, 0.9),
    batch_test=ImplantingUnit(0.4, 0.25, 0.875, batch_implanting_prob=0.5),
)

# Used as `seq_count` for every ImplantingGroup in the configuration.
sequence_count = 500

# Full experiment configuration.
# Two implanting settings are defined:
#   * 'control': train and test both use the 'control' probabilities for the
#     baseline and the modified group;
#   * 'batch':   training uses 'batch0' (baseline) vs 'batch1' (modified), while
#     the test data uses 'batch_test' for both groups.
config = SimConfig(
    k=3,
    repetitions=5,
    olga_model_name='humanTRB',
    sequence_encoding='continuous_kmer',
    signal=signal,
    batch_signal=batch_signal,
    # No correction, ordinary least squares, then ridge with increasing alpha.
    batch_corrections=[None, LinearRegression()]
                      + [Ridge(alpha=alpha) for alpha in (1e2, 1e4, 1e6, 1e8)],
    implanting_config=ImplantingConfig(
        control=ImplantingSetting(
            name='control',
            train=ImplantingGroup(
                baseline=probability_setup['control'],
                modified=probability_setup['control'],
                seq_count=sequence_count,
            ),
            test=ImplantingGroup(
                baseline=probability_setup['control'],
                modified=probability_setup['control'],
                seq_count=sequence_count,
            ),
        ),
        batch=ImplantingSetting(
            name='batch',
            train=ImplantingGroup(
                baseline=probability_setup['batch0'],
                modified=probability_setup['batch1'],
                seq_count=sequence_count,
            ),
            test=ImplantingGroup(
                baseline=probability_setup['batch_test'],
                modified=probability_setup['batch_test'],
                seq_count=sequence_count,
            ),
        ),
    ),
)

# Each run gets its own timestamped output directory under `result_path`.
# The timestamp is formatted explicitly instead of interpolating `datetime.now()`
# directly: the default string form ("2023-02-03 18:05:41.520760") contains a
# space and colons, which are not valid in Windows file names and are awkward in
# the relative URLs used to embed the HTML reports further down.
# Microseconds are kept so that two runs started in quick succession still get
# distinct directory names.
run_timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S-%f")
path = setup_path(result_path / f"experiment3_AIRR_{name}_seqcount_{sequence_count}_{run_timestamp}")

# Store the configuration alongside the results of this run.
write_config(config, path)


In [10]:

# Build the experiment from the configuration above and run it, using `path` as
# the output location (the log below shows a cache directory being set up under it).
# In the recorded output, consecutive "Starting run" log lines are roughly three
# minutes apart, so expect a full run to take a while.
experiment = Experiment3(config, num_processes=num_processes)
experiment.run(path)


2023-02-03 18:05:45,951 INFO: Setting temporary cache path to experiment3_AIRR_full_run_seqcount_500_2023-02-03 18:05:41.520760/cache


2023-02-03 18:05:45.952833: Setting temporary cache path to experiment3_AIRR_full_run_seqcount_500_2023-02-03 18:05:41.520760/cache


2023-02-03 18:05:45,955 INFO: Starting run for implanting_group: {'name': 'batch', 'train': {'baseline': {'label_implanting_prob': 0.16, 'label_given_no_motif_prob': 0.03571428571428571, 'label_given_motif_prob': 0.4375, 'batch_implanting_prob': 0.1}, 'modified': {'label_implanting_prob': 0.64, 'label_given_no_motif_prob': 0.75, 'label_given_motif_prob': 0.984375, 'batch_implanting_prob': 0.9}, 'seq_count': 500}, 'test': {'baseline': {'label_implanting_prob': 0.4, 'label_given_no_motif_prob': 0.25, 'label_given_motif_prob': 0.875, 'batch_implanting_prob': 0.5}, 'modified': {'label_implanting_prob': 0.4, 'label_given_no_motif_prob': 0.25, 'label_given_motif_prob': 0.875, 'batch_implanting_prob': 0.5}, 'seq_count': 500}}, correct=no_correction
  n_jobs = self._backend.configure(n_jobs=self.n_jobs, parallel=self,

ImportHelper: 1 sequences were removed from the dataset because their amino acid sequence contained illegal characters. 


np.matrix usage is deprecated in 1.0 and will raise a 

2023-02-03 18:08:27,814 INFO: Starting run for implanting_group: {'name': 'batch', 'train': {'baseline': {'label_implanting_prob': 0.16, 'label_given_no_motif_prob': 0.03571428571428571, 'label_given_motif_prob': 0.4375, 'batch_implanting_prob': 0.1}, 'modified': {'label_implanting_prob': 0.64, 'label_given_no_motif_prob': 0.75, 'label_given_motif_prob': 0.984375, 'batch_implanting_prob': 0.9}, 'seq_count': 500}, 'test': {'baseline': {'label_implanting_prob': 0.4, 'label_given_no_motif_prob': 0.25, 'label_given_motif_prob': 0.875, 'batch_implanting_prob': 0.5}, 'modified': {'label_implanting_prob': 0.4, 'label_given_no_motif_prob': 0.25, 'label_given_motif_prob': 0.875, 'batch_implanting_prob': 0.5}, 'seq_count': 500}}, correct=linear_reg
  n_jobs = self._backend.configure(n_jobs=self.n_jobs, parallel=self,

ImportHelper: 1 sequences were removed from the dataset because their amino acid sequence contained illegal characters. 


np.matrix usage is deprecated in 1.0 and will raise a Typ

2023-02-03 18:11:29,487 INFO: Starting run for implanting_group: {'name': 'batch', 'train': {'baseline': {'label_implanting_prob': 0.16, 'label_given_no_motif_prob': 0.03571428571428571, 'label_given_motif_prob': 0.4375, 'batch_implanting_prob': 0.1}, 'modified': {'label_implanting_prob': 0.64, 'label_given_no_motif_prob': 0.75, 'label_given_motif_prob': 0.984375, 'batch_implanting_prob': 0.9}, 'seq_count': 500}, 'test': {'baseline': {'label_implanting_prob': 0.4, 'label_given_no_motif_prob': 0.25, 'label_given_motif_prob': 0.875, 'batch_implanting_prob': 0.5}, 'modified': {'label_implanting_prob': 0.4, 'label_given_no_motif_prob': 0.25, 'label_given_motif_prob': 0.875, 'batch_implanting_prob': 0.5}, 'seq_count': 500}}, correct=ridge_1e+02
  n_jobs = self._backend.configure(n_jobs=self.n_jobs, parallel=self,

ImportHelper: 1 sequences were removed from the dataset because their amino acid sequence contained illegal characters. 


np.matrix usage is deprecated in 1.0 and will raise a Ty

2023-02-03 18:14:10,438 INFO: Starting run for implanting_group: {'name': 'batch', 'train': {'baseline': {'label_implanting_prob': 0.16, 'label_given_no_motif_prob': 0.03571428571428571, 'label_given_motif_prob': 0.4375, 'batch_implanting_prob': 0.1}, 'modified': {'label_implanting_prob': 0.64, 'label_given_no_motif_prob': 0.75, 'label_given_motif_prob': 0.984375, 'batch_implanting_prob': 0.9}, 'seq_count': 500}, 'test': {'baseline': {'label_implanting_prob': 0.4, 'label_given_no_motif_prob': 0.25, 'label_given_motif_prob': 0.875, 'batch_implanting_prob': 0.5}, 'modified': {'label_implanting_prob': 0.4, 'label_given_no_motif_prob': 0.25, 'label_given_motif_prob': 0.875, 'batch_implanting_prob': 0.5}, 'seq_count': 500}}, correct=ridge_1e+04
  n_jobs = self._backend.configure(n_jobs=self.n_jobs, parallel=self,

ImportHelper: 1 sequences were removed from the dataset because their amino acid sequence contained illegal characters. 


np.matrix usage is deprecated in 1.0 and will raise a Ty

2023-02-03 18:16:55,025 INFO: Starting run for implanting_group: {'name': 'batch', 'train': {'baseline': {'label_implanting_prob': 0.16, 'label_given_no_motif_prob': 0.03571428571428571, 'label_given_motif_prob': 0.4375, 'batch_implanting_prob': 0.1}, 'modified': {'label_implanting_prob': 0.64, 'label_given_no_motif_prob': 0.75, 'label_given_motif_prob': 0.984375, 'batch_implanting_prob': 0.9}, 'seq_count': 500}, 'test': {'baseline': {'label_implanting_prob': 0.4, 'label_given_no_motif_prob': 0.25, 'label_given_motif_prob': 0.875, 'batch_implanting_prob': 0.5}, 'modified': {'label_implanting_prob': 0.4, 'label_given_no_motif_prob': 0.25, 'label_given_motif_prob': 0.875, 'batch_implanting_prob': 0.5}, 'seq_count': 500}}, correct=ridge_1e+06
  n_jobs = self._backend.configure(n_jobs=self.n_jobs, parallel=self,

ImportHelper: 1 sequences were removed from the dataset because their amino acid sequence contained illegal characters. 


np.matrix usage is deprecated in 1.0 and will raise a Ty

2023-02-03 18:19:42,298 INFO: Starting run for implanting_group: {'name': 'batch', 'train': {'baseline': {'label_implanting_prob': 0.16, 'label_given_no_motif_prob': 0.03571428571428571, 'label_given_motif_prob': 0.4375, 'batch_implanting_prob': 0.1}, 'modified': {'label_implanting_prob': 0.64, 'label_given_no_motif_prob': 0.75, 'label_given_motif_prob': 0.984375, 'batch_implanting_prob': 0.9}, 'seq_count': 500}, 'test': {'baseline': {'label_implanting_prob': 0.4, 'label_given_no_motif_prob': 0.25, 'label_given_motif_prob': 0.875, 'batch_implanting_prob': 0.5}, 'modified': {'label_implanting_prob': 0.4, 'label_given_no_motif_prob': 0.25, 'label_given_motif_prob': 0.875, 'batch_implanting_prob': 0.5}, 'seq_count': 500}}, correct=ridge_1e+08
  n_jobs = self._backend.configure(n_jobs=self.n_jobs, parallel=self,

ImportHelper: 1 sequences were removed from the dataset because their amino acid sequence contained illegal characters. 


np.matrix usage is deprecated in 1.0 and will raise a Ty

2023-02-03 18:22:31,372 INFO: Starting run for implanting_group: {'name': 'control', 'train': {'baseline': {'label_implanting_prob': 0.4, 'label_given_no_motif_prob': 0.25, 'label_given_motif_prob': 0.875, 'batch_implanting_prob': 0}, 'modified': {'label_implanting_prob': 0.4, 'label_given_no_motif_prob': 0.25, 'label_given_motif_prob': 0.875, 'batch_implanting_prob': 0}, 'seq_count': 500}, 'test': {'baseline': {'label_implanting_prob': 0.4, 'label_given_no_motif_prob': 0.25, 'label_given_motif_prob': 0.875, 'batch_implanting_prob': 0}, 'modified': {'label_implanting_prob': 0.4, 'label_given_no_motif_prob': 0.25, 'label_given_motif_prob': 0.875, 'batch_implanting_prob': 0}, 'seq_count': 500}}, correct=no_correction
  n_jobs = self._backend.configure(n_jobs=self.n_jobs, parallel=self,

ImportHelper: 1 sequences were removed from the dataset because their amino acid sequence contained illegal characters. 


np.matrix usage is deprecated in 1.0 and will raise a TypeError in 1.2. Please co

In [11]:
# Embed the metrics summary report: performance on the test set, where the
# batch effect is not correlated with receptor sequence specificity.
metrics_report = path / "summary_metrics.html"
IFrame(src=str(metrics_report), width=900, height=600)


In [12]:
# Embed the enriched k-mers report: the top 30 coefficients from the logistic
# regression models and how much they overlap with the true signal.
enriched_kmers_report = path / "summary_enriched_kmers.html"
IFrame(src=str(enriched_kmers_report), width=900, height=800)