In [15]:
import scanpy as sc
import numpy as np

import random

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [3]:
rna = sc.read('../../../pipeline/data/pp/pbmc_full_rna.h5ad')
rna

AnnData object with n_obs × n_vars = 647366 × 2000
    obs: 'sample_id', 'n_genes', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'full_clustering', 'initial_clustering', 'Resample', 'Collection_Day', 'Sex', 'Age_interval', 'Swab_result', 'Status', 'Smoker', 'Status_on_day_collection', 'Status_on_day_collection_summary', 'Days_from_onset', 'Site', 'time_after_LPS', 'Worst_Clinical_Status', 'Outcome', 'patient_id', 'split0', 'split1', 'split2', 'split3', 'split4'
    var: 'feature_types', 'means', 'variances', 'residual_variances', 'highly_variable_rank', 'highly_variable_nbatches', 'highly_variable_intersection', 'highly_variable'
    uns: 'hvg', 'leiden', 'neighbors', 'pca', 'umap'
    obsm: 'X_pca', 'X_pca_harmony', 'X_umap'
    layers: 'counts'

In [4]:
adt = sc.read('../../../pipeline/data/pp/pbmc_full_adt.h5ad')
adt

AnnData object with n_obs × n_vars = 647366 × 192
    obs: 'sample_id', 'n_genes', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'full_clustering', 'initial_clustering', 'Resample', 'Collection_Day', 'Sex', 'Age_interval', 'Swab_result', 'Status', 'Smoker', 'Status_on_day_collection', 'Status_on_day_collection_summary', 'Days_from_onset', 'Site', 'time_after_LPS', 'Worst_Clinical_Status', 'Outcome', 'patient_id', 'split0', 'split1', 'split2', 'split3', 'split4'
    var: 'feature_types'
    uns: 'hvg', 'leiden', 'neighbors', 'pca', 'umap'
    obsm: 'X_pca', 'X_pca_harmony', 'X_umap'
    layers: 'counts'

# Adjust conditions

In [5]:
rna.obs['Status_on_day_collection_summary'].cat.categories

Index(['Asymptomatic', 'Critical', 'Healthy', 'LPS_10hours', 'LPS_90mins',
       'Mild', 'Moderate', 'Non_covid', 'Severe'],
      dtype='object')

In [6]:
drop_conditions = ['LPS_10hours', 'LPS_90mins', 'Non_covid', 'Asymptomatic']

In [7]:
rna = rna[~rna.obs['Status_on_day_collection_summary'].isin(drop_conditions)]
adt = adt[~adt.obs['Status_on_day_collection_summary'].isin(drop_conditions)]
(rna.shape, adt.shape)

((590724, 2000), (590724, 192))

In [9]:
rna.obs['Status_on_day_collection_summary'] = rna.obs['Status_on_day_collection_summary'].str.replace('Critical', 'Severe')
rna.obs['Status_on_day_collection_summary'] = rna.obs['Status_on_day_collection_summary'].str.replace('Moderate', 'Mild')
rna.obs['Status_on_day_collection_summary'] = rna.obs['Status_on_day_collection_summary'].astype('category')

  rna.obs['Status_on_day_collection_summary'] = rna.obs['Status_on_day_collection_summary'].str.replace('Critical', 'Severe')


In [10]:
rna.obs['Status_on_day_collection_summary']

covid_index
AAACCTGAGAAACCTA-MH9179824       Mild
AAACCTGAGAGTAATC-MH9179824       Mild
AAACCTGAGAGTGAGA-MH9179824       Mild
AAACCTGAGGAATCGC-MH9179824       Mild
AAACCTGAGTGTTGAA-MH9179824       Mild
                               ...   
BGCV15_TTTGTCAAGGCGATAC-1      Severe
BGCV15_TTTGTCACAAGCCATT-1     Healthy
BGCV15_TTTGTCACAGACACTT-1        Mild
BGCV15_TTTGTCAGTTACGGAG-1        Mild
BGCV15_TTTGTCATCGAATGGG-1      Severe
Name: Status_on_day_collection_summary, Length: 590724, dtype: category
Categories (3, object): ['Healthy', 'Mild', 'Severe']

In [11]:
adt.obs = rna.obs

In [12]:
rna

AnnData object with n_obs × n_vars = 590724 × 2000
    obs: 'sample_id', 'n_genes', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'full_clustering', 'initial_clustering', 'Resample', 'Collection_Day', 'Sex', 'Age_interval', 'Swab_result', 'Status', 'Smoker', 'Status_on_day_collection', 'Status_on_day_collection_summary', 'Days_from_onset', 'Site', 'time_after_LPS', 'Worst_Clinical_Status', 'Outcome', 'patient_id', 'split0', 'split1', 'split2', 'split3', 'split4'
    var: 'feature_types', 'means', 'variances', 'residual_variances', 'highly_variable_rank', 'highly_variable_nbatches', 'highly_variable_intersection', 'highly_variable'
    uns: 'hvg', 'leiden', 'neighbors', 'pca', 'umap'
    obsm: 'X_pca', 'X_pca_harmony', 'X_umap'
    layers: 'counts'

In [13]:
rna.obs[['patient_id', 'Status_on_day_collection_summary']].drop_duplicates().value_counts('Status_on_day_collection_summary')

Status_on_day_collection_summary
Mild       53
Severe     28
Healthy    23
Name: count, dtype: int64

In [16]:
mild_samples = np.unique(rna[rna.obs['Status_on_day_collection_summary'] == 'Mild'].obs['patient_id'])
mild_samples_to_keep = random.sample(list(mild_samples), 23)
severe_samples = np.unique(rna[rna.obs['Status_on_day_collection_summary'] == 'Severe'].obs['patient_id'])
severe_samples_to_keep = random.sample(list(severe_samples), 23)
rna = rna[(rna.obs['Status_on_day_collection_summary'] == 'Healthy') | (rna.obs['patient_id'].isin(mild_samples_to_keep)) | (rna.obs['patient_id'].isin(severe_samples_to_keep))].copy()
rna

AnnData object with n_obs × n_vars = 365712 × 2000
    obs: 'sample_id', 'n_genes', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'full_clustering', 'initial_clustering', 'Resample', 'Collection_Day', 'Sex', 'Age_interval', 'Swab_result', 'Status', 'Smoker', 'Status_on_day_collection', 'Status_on_day_collection_summary', 'Days_from_onset', 'Site', 'time_after_LPS', 'Worst_Clinical_Status', 'Outcome', 'patient_id', 'split0', 'split1', 'split2', 'split3', 'split4'
    var: 'feature_types', 'means', 'variances', 'residual_variances', 'highly_variable_rank', 'highly_variable_nbatches', 'highly_variable_intersection', 'highly_variable'
    uns: 'hvg', 'leiden', 'neighbors', 'pca', 'umap'
    obsm: 'X_pca', 'X_pca_harmony', 'X_umap'
    layers: 'counts'

In [17]:
rna.obs[['patient_id', 'Status_on_day_collection_summary']].drop_duplicates().value_counts('Status_on_day_collection_summary')

Status_on_day_collection_summary
Healthy    23
Mild       23
Severe     23
Name: count, dtype: int64

In [18]:
adt = adt[rna.obs_names].copy()
adt

AnnData object with n_obs × n_vars = 365712 × 192
    obs: 'sample_id', 'n_genes', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'full_clustering', 'initial_clustering', 'Resample', 'Collection_Day', 'Sex', 'Age_interval', 'Swab_result', 'Status', 'Smoker', 'Status_on_day_collection', 'Status_on_day_collection_summary', 'Days_from_onset', 'Site', 'time_after_LPS', 'Worst_Clinical_Status', 'Outcome', 'patient_id', 'split0', 'split1', 'split2', 'split3', 'split4'
    var: 'feature_types'
    uns: 'hvg', 'leiden', 'neighbors', 'pca', 'umap'
    obsm: 'X_pca', 'X_pca_harmony', 'X_umap'
    layers: 'counts'

In [19]:
rna.write('../../../pipeline/data/pp/pbmc_healthy_covid_rna_3cond_balanced.h5ad')
adt.write('../../../pipeline/data/pp/pbmc_healthy_covid_adt_3cond_balanced.h5ad')