In [16]:
import anndata as ad
import numpy as np
import pandas as pd
import scanpy as sc
import scipy
from numpy.random import default_rng
from scipy import sparse

In [17]:
data_path = '../../../../lustre/groups/ml01/datasets/projects/202201212_psngraph_fabiola.curion/data.dir/'
!ls $data_path

all_proteins_V1.txt  combat_rna.h5ad		 cytof_full.h5ad
cite_cells.csv	     cytof_cells.csv		 facs_full.h5ad
combat_adt.h5ad      cytof_cells_harmonised.csv


## Loading adatas

In [18]:
# Read the four input AnnData files (same order as the variables on the left);
# file names are appended to `data_path`.
rna, adt, cytof, facs = (
    ad.read_h5ad(data_path + file_name)
    for file_name in (
        "combat_rna.h5ad",
        "combat_adt.h5ad",
        "cytof_full.h5ad",
        "facs_full.h5ad",
    )
)

  utils.warn_names_duplicates("obs")


## Fixing warnings

In [19]:
# Give every FACS cell a unique observation name: its row position followed by '_facs'.
facs.obs_names = [str(position) + '_facs' for position in range(facs.n_obs)]
# Drop the first three characters of every ADT variable name.
# NOTE(review): presumably this removes a fixed 3-character prefix -- confirm.
# The slice is unconditional, so running this cell twice strips six characters.
adt.var.index = [var_name[3:] for var_name in adt.var.index]

In [20]:
# (n_obs, n_vars) of each object, in the order rna, adt, cytof, facs.
tuple(adata.shape for adata in (rna, adt, cytof, facs))

((836148, 20615), (836148, 192), (7118158, 48), (131920, 12))

## Loading label harmonization files

In [21]:
# Label tables used for the cell-type harmonization further down.
cite_cells = pd.read_csv(data_path + 'cite_cells.csv')
cytof_cells = pd.read_csv(data_path + 'cytof_cells.csv')
cytof_cells_harmonised = pd.read_csv(
    data_path + 'cytof_cells_harmonised.csv',
    sep=';',  # this file is read as semicolon-delimited; the two above use the default separator
)

## Removing original obsm

In [22]:
# Discard whatever per-cell multidimensional annotations came with the input files.
for adata in (rna, adt, cytof, facs):
    adata.obsm = None

## Check variable distributions

In [23]:
#plt.hist(rna.X.data[:100])

In [24]:
#plt.hist(rna.layers['raw'].data[:100])

In [25]:
#plt.hist(adt.X.data[:100])

In [26]:
#plt.hist(adt.layers['raw'].data[:100])

In [27]:
#plt.hist(cytof.X[:,9])

In [28]:
#plt.hist(facs.X[:,1])

In [29]:
#plt.hist(facs.X[:,1])

## Subsampling cytof

In [30]:
# Randomly subsample the CyTOF cells without replacement.
# NOTE: the target is 1e6 cells -- this is NOT the size of the smallest dataset
# (FACS has 131,920 cells according to the shapes displayed earlier in the notebook).
# The size is clamped to the number of available cells so the draw cannot fail
# when fewer than 1e6 cells are present (e.g. if this cell is re-run).
n_cytof_subset = min(round(1e6), cytof.n_obs)
rng = default_rng(seed=100)  # fixed seed: a fresh run always draws the same cells
cytof_subset = rng.choice(cytof.n_obs, size=n_cytof_subset, replace=False)
# Materialize the selection so `cytof` is a standalone object rather than a view.
cytof = cytof[cytof_subset,].copy()
print(cytof.shape)

(1000000, 48)


In [31]:
# Check that RNA and ADT list the same cell names in the same order
# (element-wise comparison of the two obs_names indexes).
(rna.obs_names == adt.obs_names).all()

True

## RNA variable selection and pca

In [33]:
# Select 4000 highly variable genes; the flag is read back from rna.var['highly_variable'].
sc.pp.highly_variable_genes(rna, n_top_genes=4000)
highly_variable_genes = rna.var['highly_variable']
# Keep only the flagged genes; .copy() turns the sliced view into a standalone object.
rna = rna[:,highly_variable_genes].copy()
# PCA with 50 components on the reduced gene set.
# NOTE(review): re-running this cell would select genes again from the already reduced matrix.
sc.tl.pca(rna, n_comps=50, svd_solver="auto")

## Use normalized expression as facs.X

In [34]:
# Back up the matrix that came with the file, then use the `exprs` layer as facs.X.
# The backup is guarded: without the check, re-running this cell would overwrite
# 'original_X' with the `exprs` values that were promoted to X on the first run.
if 'original_X' not in facs.layers:
    facs.layers['original_X'] = facs.X.copy()
facs.X = facs.layers['exprs'].copy()

## Check data matrix sparsification and sparsify

In [35]:
# Is X stored as a scipy sparse matrix? Order: rna, adt, cytof, facs.
tuple(scipy.sparse.issparse(adata.X) for adata in (rna, adt, cytof, facs))

(True, True, False, False)

In [36]:
# Convert the CyTOF matrix to CSR sparse format.
cytof.X = sparse.csr_matrix(cytof.X)

In [37]:
# Convert the FACS matrix to CSR sparse format.
facs.X = sparse.csr_matrix(facs.X)

In [38]:
# Re-check after the conversions: is X sparse for rna, adt, cytof, facs (in that order)?
tuple(scipy.sparse.issparse(adata.X) for adata in (rna, adt, cytof, facs))

(True, True, True, True)

## Cell type harmonization

In [39]:
def get_map_raw(l1, l2):
    """Build a label map over the union of two label collections.

    A label of ``l1`` that also occurs in ``l2`` maps to itself; every other
    label, from either collection, maps to ``None``.

    Parameters
    ----------
    l1, l2
        Iterables of hashable labels; ``l2`` must also support ``in``.

    Returns
    -------
    dict
        Keys are the labels of ``l1`` in order, followed by the labels that
        occur only in ``l2``.
    """
    shared_or_none = {name: (name if name in l2 else None) for name in l1}
    # Labels seen only in l2 have no counterpart: register them with None.
    for name in l2:
        shared_or_none.setdefault(name, None)
    return shared_or_none

In [40]:
#cite_cells.Annotation_major_subset.unique(), cytof_cells_harmonised.harmonized_major_subset.unique()

In [41]:
#cite_cells.Annotation_cell_type.unique(), cytof_cells_harmonised.harmonized_cell_type.unique()

In [42]:
# Compare the CITE-seq labels with the harmonized CyTOF labels at both annotation levels.
cell_type_harm_map = get_map_raw(
    cite_cells['Annotation_cell_type'].unique(),
    cytof_cells_harmonised['harmonized_cell_type'].unique(),
)
major_subset_harm_map = get_map_raw(
    cite_cells['Annotation_major_subset'].unique(),
    cytof_cells_harmonised['harmonized_major_subset'].unique(),
)

#### Note: 

The maps show that the CyTOF and CITE-seq labels are already harmonized. The only step left is to rename the CyTOF cell labels to the corresponding harmonized labels listed in the `cytof_cells_harmonised` file.
Note that some cell types are identified by only one of the two assays.

In [43]:
# Lookup tables that translate the raw CyTOF `major_cell_type` labels into the
# harmonized labels of cytof_cells_harmonised (the same vocabulary as cite_cells).

dic_major = {}
dic_type = {}
for key, value_major, value_type in zip(
    cytof_cells_harmonised['major_cell_type'],
    cytof_cells_harmonised['harmonized_major_subset'],
    cytof_cells_harmonised['harmonized_cell_type'],
):
    # Guard on the raw label (the dict key). The previous check tested the
    # harmonized *value* against the dict *keys*, which silently dropped a raw
    # label whenever its harmonized value equalled an already stored raw label.
    # If a raw label appears on several rows, the first row wins.
    if key not in dic_major:
        dic_major[key] = value_major
    if key not in dic_type:
        dic_type[key] = value_type

In [44]:
# Translate each cell's raw CyTOF label through the two lookup tables;
# labels missing from a table become NaN.
raw_cytof_labels = cytof.obs['major_cell_type']
cytof.obs['harmonized_major_subset'] = raw_cytof_labels.map(dic_major)
cytof.obs['harmonized_cell_type'] = raw_cytof_labels.map(dic_type)

In [45]:
# Use the same annotation column names as the RNA object.
harmonized_to_annotation = {
    'harmonized_major_subset': 'Annotation_major_subset',
    'harmonized_cell_type': 'Annotation_cell_type',
}
cytof.obs.rename(columns=harmonized_to_annotation, inplace=True)
cytof.obs.columns

Index(['sample_id', 'condition', 'patient_id', 'batch', 'cellID',
       'COMBAT_ID_Time', 'CyTOF_priority', 'major_cell_type',
       'fine_cluster_id', 'Annotation_major_subset', 'Annotation_cell_type'],
      dtype='object')

In [46]:
# Display the major-subset categories present in the RNA annotations.
rna.obs['Annotation_major_subset'].cat.categories

Index(['B', 'CD4', 'CD8', 'DC', 'DN', 'DP', 'GDT', 'HSC', 'MAIT', 'Mast', 'NK',
       'PB', 'PLT', 'RET', 'cMono', 'iNKT', 'nan', 'ncMono'],
      dtype='object')

In [47]:
# Store both CyTOF annotation columns as pandas categoricals.
for column in ('Annotation_major_subset', 'Annotation_cell_type'):
    cytof.obs[column] = cytof.obs[column].astype('category')

In [48]:
# Rename the 'UNCLASSIFIED' category to the string 'nan' in both annotation columns
# (the RNA major-subset categories also contain a 'nan' entry).
unclassified_to_nan = {'UNCLASSIFIED': 'nan'}
for column in ('Annotation_major_subset', 'Annotation_cell_type'):
    cytof.obs[column] = cytof.obs[column].cat.rename_categories(unclassified_to_nan)
#cytof.obs['Annotation_major_subset'].unique(), cytof.obs['Annotation_cell_type'].unique()

In [49]:
# Label every FACS cell as 'CD4' at both annotation levels.
# NOTE(review): the earlier inline comment said "to change to CD4"; the value already is 'CD4'.
facs.obs['Annotation_major_subset'] = 'CD4'
facs.obs['Annotation_cell_type'] = 'CD4'

## Assign domain to each adata

In [50]:
# Tag every cell with its source. `Domain` gives RNA and ADT the shared label
# 'cite'; `Domain_major` keeps the two apart.
domain_labels = (
    (rna, 'cite', 'rna'),
    (adt, 'cite', 'adt'),
    (cytof, 'cytof', 'cytof'),
    (facs, 'facs', 'facs'),
)
for adata, domain, domain_major in domain_labels:
    adata.obs['Domain'] = domain
    adata.obs['Domain_major'] = domain_major

## Write preprocessed, harmonized files

In [52]:
# Persist the four processed objects as gzip-compressed .h5ad files under all_samples/.
output_files = (
    (rna, "all_samples/rna-pp-harm-sub.h5ad"),
    (adt, "all_samples/adt-pp-harm-sub.h5ad"),
    (cytof, "all_samples/cytof-pp-harm-sub.h5ad"),
    (facs, "all_samples/facs-pp-harm-sub.h5ad"),
)
for adata, output_file in output_files:
    adata.write(output_file, compression="gzip")

In [55]:
# List the current working directory.
!ls 

adt-pp-harm.h5ad		     facs-pp-harm.h5ad
adt-pp-harm-sub.h5ad		     facs-pp-harm-sub.h5ad
adt_query-pp-harm-sub.h5ad	     fulldata-preprocessing.ipynb
all_samples			     pp_data_show_umaps.ipynb
check harmonized labels.ipynb	     rna-pp-harm.h5ad
COMBAT-CITESeq-DATA.h5ad?download=1  rna-pp-harm-sub.h5ad
compute_umap_pp_data.ipynb	     rna_query-pp-harm-sub.h5ad
cytof-pp-harm.h5ad		     show_pp_data_distribution.ipynb
cytof-pp-harm-sub.h5ad		     umaps
data_pp_harm.ipynb


In [56]:
# Load the COMBAT CITE-seq file from the working directory and display its summary.
# `ad.read` is a deprecated alias; `ad.read_h5ad` matches the loading cell at the top.
# NOTE: this rebinds `rna`, so the processed RNA object built earlier is no longer
# reachable under that name after this cell runs.
rna = ad.read_h5ad("COMBAT-CITESeq-DATA.h5ad?download=1")
rna

AnnData object with n_obs × n_vars = 836148 × 20807
    obs: 'Annotation_cluster_id', 'Annotation_cluster_name', 'Annotation_minor_subset', 'Annotation_major_subset', 'Annotation_cell_type', 'GEX_region', 'QC_ngenes', 'QC_total_UMI', 'QC_pct_mitochondrial', 'QC_scrub_doublet_scores', 'TCR_chain_composition', 'TCR_clone_ID', 'TCR_clone_count', 'TCR_clone_proportion', 'TCR_contains_unproductive', 'TCR_doublet', 'TCR_chain_TRA', 'TCR_v_gene_TRA', 'TCR_d_gene_TRA', 'TCR_j_gene_TRA', 'TCR_c_gene_TRA', 'TCR_productive_TRA', 'TCR_cdr3_TRA', 'TCR_umis_TRA', 'TCR_chain_TRA2', 'TCR_v_gene_TRA2', 'TCR_d_gene_TRA2', 'TCR_j_gene_TRA2', 'TCR_c_gene_TRA2', 'TCR_productive_TRA2', 'TCR_cdr3_TRA2', 'TCR_umis_TRA2', 'TCR_chain_TRB', 'TCR_v_gene_TRB', 'TCR_d_gene_TRB', 'TCR_j_gene_TRB', 'TCR_c_gene_TRB', 'TCR_productive_TRB', 'TCR_chain_TRB2', 'TCR_v_gene_TRB2', 'TCR_d_gene_TRB2', 'TCR_j_gene_TRB2', 'TCR_c_gene_TRB2', 'TCR_productive_TRB2', 'TCR_cdr3_TRB2', 'TCR_umis_TRB2', 'BCR_umis_HC', 'BCR_contig_qc_H