In [1]:
# NOTE(review): `x` is not referenced by any later cell visible here — looks
# like a leftover scratch cell; confirm and remove.
x=1

In [2]:
import random
import scanpy as sc
import numpy as np
from sklearn.model_selection import KFold

In [3]:
# Load the input AnnData object.
# The integrated latent space is expected to be stored in .X of this file.
latent_h5ad_path = '../../../pipeline_mil/data/raw/pbmc_latent_cell.h5ad'
adata = sc.read(latent_h5ad_path)
adata

  utils.warn_names_duplicates("obs")


AnnData object with n_obs × n_vars = 1000000 × 30
    obs: 'dataset', 'sample_ID_czi', 'subject_ID_czi', 'cell_type_ontology_term_id', 'cell_type', 'total_counts_lisa', 'batch_czi', 'assay_ontology_term_id', 'assay', 'development_stage_ontology_term_id', 'development_stage', 'disease_ontology_term_id', 'disease', 'ethnicity_ontology_term_id', 'ethnicity', 'organism_ontology_term_id', 'organism', 'sex_ontology_term_id', 'sex', 'tissue_ontology_term_id', 'tissue', 'cell_type_for_integration', 'sample_ID_lataq', 'batch', 'batch_clean'
    uns: 'cell_type_colors', 'dataset_colors', 'neighbors', 'umap'
    obsm: 'X_umap'
    obsp: 'connectivities', 'distances'

# subset data

In [4]:
# Parameters used by the filtering, subsampling and split cells below.
condition_key = 'disease' # what to predict (column of adata.obs)
sample_key = 'sample_ID_lataq' # donors or samples (column of adata.obs)
conditions_to_exclude = ['influenza', 'respiratory system disease', 'CD3-CD28-stimulated'] # leave empty if you want to include all conditions
n_samples_per_condition = 190 # if subsetting the data to have balanced classes, set to the min number of samples per condition

In [5]:
# Number of cells (rows of .obs) per condition, before any condition is excluded.
adata.obs[condition_key].value_counts()

COVID-19                        476668
normal                          398207
systemic lupus erythematosus    118301
influenza                         3764
respiratory system disease        1879
CD3-CD28-stimulated               1181
Name: disease, dtype: int64

In [6]:
# Number of distinct samples per condition: de-duplicate the
# (condition, sample) pairs, then count the remaining rows per condition.
adata.obs[[condition_key, sample_key]].drop_duplicates().groupby(condition_key).agg('count')

Unnamed: 0_level_0,sample_ID_lataq
disease,Unnamed: 1_level_1
CD3-CD28-stimulated,2
COVID-19,818
influenza,17
normal,1343
respiratory system disease,5
systemic lupus erythematosus,190


In [7]:
# Drop every cell whose condition is listed in `conditions_to_exclude`.
is_excluded = adata.obs[condition_key].isin(conditions_to_exclude)
adata = adata[~is_excluded].copy()
adata

  utils.warn_names_duplicates("obs")


AnnData object with n_obs × n_vars = 993176 × 30
    obs: 'dataset', 'sample_ID_czi', 'subject_ID_czi', 'cell_type_ontology_term_id', 'cell_type', 'total_counts_lisa', 'batch_czi', 'assay_ontology_term_id', 'assay', 'development_stage_ontology_term_id', 'development_stage', 'disease_ontology_term_id', 'disease', 'ethnicity_ontology_term_id', 'ethnicity', 'organism_ontology_term_id', 'organism', 'sex_ontology_term_id', 'sex', 'tissue_ontology_term_id', 'tissue', 'cell_type_for_integration', 'sample_ID_lataq', 'batch', 'batch_clean'
    uns: 'cell_type_colors', 'dataset_colors', 'neighbors', 'umap'
    obsm: 'X_umap'
    obsp: 'connectivities', 'distances'

In [8]:
# do not execute this cell if you want to keep all samples
# Draw `n_samples_per_condition` samples from every remaining condition so
# that the conditions are balanced at the sample level.
random.seed(123)  # fixed seed so the drawn subset is reproducible
samples_to_keep = []

for cond in np.unique(adata.obs[condition_key]):
    # Only the sample column of .obs is needed here, so index .obs directly
    # instead of building an AnnData view for every condition.
    is_cond = (adata.obs[condition_key] == cond).to_numpy()
    cond_samples = adata.obs.loc[is_cond, sample_key].drop_duplicates()
    # random.sample would raise a generic ValueError here; raise the same
    # exception type with a message that names the offending condition.
    if len(cond_samples) < n_samples_per_condition:
        raise ValueError(
            f"condition {cond!r} has only {len(cond_samples)} samples, "
            f"but n_samples_per_condition={n_samples_per_condition}"
        )
    cond_samples_to_keep = random.sample(list(cond_samples), n_samples_per_condition)
    samples_to_keep.extend(cond_samples_to_keep)
len(samples_to_keep)

570

In [9]:
# do not execute this cell if you want to keep all samples
# Keep only the cells that belong to one of the drawn samples.
in_kept_sample = adata.obs[sample_key].isin(samples_to_keep)
adata = adata[in_kept_sample].copy()
adata

  utils.warn_names_duplicates("obs")


AnnData object with n_obs × n_vars = 278357 × 30
    obs: 'dataset', 'sample_ID_czi', 'subject_ID_czi', 'cell_type_ontology_term_id', 'cell_type', 'total_counts_lisa', 'batch_czi', 'assay_ontology_term_id', 'assay', 'development_stage_ontology_term_id', 'development_stage', 'disease_ontology_term_id', 'disease', 'ethnicity_ontology_term_id', 'ethnicity', 'organism_ontology_term_id', 'organism', 'sex_ontology_term_id', 'sex', 'tissue_ontology_term_id', 'tissue', 'cell_type_for_integration', 'sample_ID_lataq', 'batch', 'batch_clean'
    uns: 'cell_type_colors', 'dataset_colors', 'neighbors', 'umap'
    obsm: 'X_umap'
    obsp: 'connectivities', 'distances'

# add splits

In [10]:
# Number of cross-validation folds; one `split{i}` column is added to
# adata.obs per fold.
n_splits = 5 # or 3

In [11]:
# Add one categorical column `split{i}` per fold to adata.obs, labelling each
# cell 'train' or 'val'. Folds are drawn over samples (not cells), so all
# cells of one sample end up on the same side of a split.
train_val_sizes = []  # (n_train_cells, n_val_cells) for every fold
samples = np.unique(adata.obs[sample_key])
kf = KFold(n_splits=n_splits, random_state=1, shuffle=True)

for i, (train_index, val_index) in enumerate(kf.split(samples)):
    train_samples = samples[train_index]
    val_samples = samples[val_index]
    is_train = adata.obs[sample_key].isin(train_samples).to_numpy()
    is_val = adata.obs[sample_key].isin(val_samples).to_numpy()

    # Build the labels in an object array first. Creating a float NaN column
    # and then writing strings into it through .loc is an incompatible-dtype
    # assignment that recent pandas versions deprecate.
    split_labels = np.full(len(adata.obs), np.nan, dtype=object)
    split_labels[is_train] = 'train'
    split_labels[is_val] = 'val'
    adata.obs[f'split{i}'] = split_labels
    adata.obs[f'split{i}'] = adata.obs[f'split{i}'].astype('category')

    # Read conditions and sizes from .obs via the masks; copying the whole
    # AnnData object twice per fold is not needed for this check.
    train_conditions = set(adata.obs.loc[is_train, condition_key].dropna().unique())
    val_conditions = set(adata.obs.loc[is_val, condition_key].dropna().unique())
    # check that all conditions in the validation set are also in the training set
    assert val_conditions.issubset(train_conditions), (
        f"split{i}: validation conditions missing from training set: "
        f"{val_conditions - train_conditions}"
    )
    train_val_sizes.append((int(is_train.sum()), int(is_val.sum())))

train_val_sizes

  utils.warn_names_duplicates("obs")
  utils.warn_names_duplicates("obs")
  utils.warn_names_duplicates("obs")
  utils.warn_names_duplicates("obs")
  utils.warn_names_duplicates("obs")


[(222769, 55588),
 (222075, 56282),
 (221436, 56921),
 (221802, 56555),
 (225346, 53011)]

In [12]:
# The obs names contain duplicates (see the warn_names_duplicates("obs")
# warnings emitted by the cells above); make them unique before saving.
adata.obs_names_make_unique()

In [13]:
# Save the processed object; this file will be the input to the model.
model_input_path = '../../../pipeline_mil/data/pp/scpoli_covid_lupus_healthy_balanced.h5ad'
adata.write(model_input_path)