In [None]:
import anndata as ad
import pandas as pd
import numpy as np
from numpy.random import default_rng
import scanpy as sc
import scipy
from scipy import sparse
import os
import matplotlib.pyplot as plt

In [None]:
# Directory holding the input files. The trailing slash is required: later cells build
# file paths by plain string concatenation (data_path + "<file name>").
data_path = '../../../../lustre/groups/ml01/datasets/projects/202201212_psngraph_fabiola.curion/data.dir/'
# IPython shell escape: list the contents of the data directory.
!ls $data_path

## Loading adatas

In [None]:
# Read the CITE-seq AnnData file and display its summary as the cell output.
cite_path = "../data_extra_modalities/COMBAT-CITESeq-DATA.h5ad"
cite = ad.read_h5ad(cite_path)
cite

In [None]:
# Display the variable (column) names of the CITE-seq object.
cite.var_names

In [None]:
# Report whether the CITE-seq data matrix is stored in a scipy sparse format.
scipy.sparse.issparse(cite.X)

In [None]:
# Load one AnnData object per modality from the shared data directory.
# data_path already ends with a path separator, so the file name is appended directly.
rna = ad.read_h5ad(f"{data_path}combat_rna.h5ad")
adt = ad.read_h5ad(f"{data_path}combat_adt.h5ad")
cytof = ad.read_h5ad(f"{data_path}cytof_full.h5ad")
facs = ad.read_h5ad(f"{data_path}facs_full.h5ad")

In [None]:
# Display the 'raw' layer of the RNA object (a later cell copies this layer into rna.X).
rna.layers['raw']

## Fixing warnings

In [None]:
# Name every FACS observation "<row position>_facs", which makes the names unique strings.
facs.obs_names = [f"{position}_facs" for position in range(len(facs))]
# Drop the first three characters of every ADT variable name.
# NOTE: not idempotent - re-running this cell strips three more characters each time.
adt.var.index = [var_name[3:] for var_name in adt.var.index]

In [None]:
# Show the shape of each modality as a tuple of tuples, in the order rna, adt, cytof, facs.
tuple(modality.shape for modality in (rna, adt, cytof, facs))

## Loading label harmonization files

In [None]:
# Read the label tables used for harmonisation.
# Only the harmonised CyTOF table is read with ';' as separator; the other two use the default.
cite_cells = pd.read_csv(f"{data_path}cite_cells.csv")
cytof_cells = pd.read_csv(f"{data_path}cytof_cells.csv")
cytof_cells_harmonised = pd.read_csv(f"{data_path}cytof_cells_harmonised.csv", sep=';')

## Removing original obsm

In [None]:
# Reset the obsm slot of every loaded object by assigning None to it.
for adata in (rna, adt, cytof, facs):
    adata.obsm = None

## Check variable distributions

In [None]:
#plt.hist(rna.X.data[:100])

In [None]:
#plt.hist(rna.layers['raw'].data[:100])

In [None]:
#plt.hist(adt.X.data[:100])

In [None]:
#plt.hist(adt.layers['raw'].data[:100])

In [None]:
#plt.hist(cytof.X[:,9])

In [None]:
# Histogram of the variable stored at column index 1 of facs.X.
fig, ax = plt.subplots()
ax.hist(facs.X[:, 1])
# Label the figure so it can be read on its own; the variable name is taken from var_names.
# The trailing semicolon suppresses the textual repr of the return value.
ax.set(title=f"facs.X - {facs.var_names[1]}", xlabel="value", ylabel="number of observations");

In [None]:
# Histogram of the variable stored at column index 1 of the 'exprs' layer of facs.
fig, ax = plt.subplots()
ax.hist(facs.layers['exprs'][:, 1])
# Label the figure so it can be read on its own; the variable name is taken from var_names.
# The trailing semicolon suppresses the textual repr of the return value.
ax.set(title=f"facs.layers['exprs'] - {facs.var_names[1]}", xlabel="value", ylabel="number of observations");

## Subsampling

In [None]:
# Disabled step: the subsampling code is wrapped in a string literal, so none of it is
# executed and rna_subset / cytof_subset / facs_subset are never created.
"""#subset size is taken to equal the length of the smallest dataset: i.e. facs
rng = default_rng(seed=100)
rna_subset = rng.choice(len(rna), size=131920, replace=False) 
cytof_subset = rng.choice(len(cytof), size=131920, replace=False)
facs_subset = rng.choice(len(facs), size=131920, replace=False)"""

In [None]:
# Disabled step: wrapped in a string literal, so nothing here is executed.
# NOTE(review): if re-enabled, `rna_query_subset` is not defined in any visible cell, and the
# other *_subset index arrays only exist inside the (also disabled) subsampling cell.
"""rna_query = rna[rna_query_subset,].copy()
adt_query = adt[rna_query_subset,].copy()
rna = rna[rna_subset,].copy()
adt = adt[rna_subset,].copy() # keep the same observations of the rna assay to provide paired cite seq information
cytof = cytof[cytof_subset,].copy()
facs = facs[facs_subset,].copy()"""

## RNA variable selection and pca

In [None]:
#sc.pp.highly_variable_genes(rna, layer = 'raw', n_top_genes=4000, flavor="seurat_v3")
#highly_variable_genes = rna.var['highly_variable']
#rna = rna[:,highly_variable_genes].copy()
#sc.tl.pca(rna, n_comps=50, svd_solver="auto")

## Use unnormalized expression values (the `raw` layer) for rna and adt

In [None]:
# For rna and adt: keep the current X in layers['original_X'], then make X a copy of the
# 'raw' layer.
for adata in (rna, adt):
    # Only stash X the first time. Without this guard a re-run of the cell would overwrite
    # the stashed matrix with the raw values that were placed in X by the previous run.
    if 'original_X' not in adata.layers:
        adata.layers['original_X'] = adata.X.copy()
    adata.X = adata.layers['raw'].copy()

## Check data matrix sparsity and convert to dense

In [None]:
# One boolean per modality (rna, adt, cytof, facs): is its X a scipy sparse matrix?
tuple(scipy.sparse.issparse(modality.X) for modality in (rna, adt, cytof, facs))

In [None]:
# Convert rna.X to a dense array.
# .toarray() returns a plain numpy.ndarray, whereas .todense() returns a numpy.matrix.
# The issparse guard makes the cell a no-op when X is already dense, so it can be re-run.
if sparse.issparse(rna.X):
    rna.X = rna.X.toarray()

In [None]:
# Convert adt.X to a dense array.
# .toarray() returns a plain numpy.ndarray, whereas .todense() returns a numpy.matrix.
# The issparse guard makes the cell a no-op when X is already dense, so it can be re-run.
if sparse.issparse(adt.X):
    adt.X = adt.X.toarray()

In [None]:
# Repeat the sparsity check after the conversion: one boolean per modality (rna, adt, cytof, facs).
tuple(scipy.sparse.issparse(modality.X) for modality in (rna, adt, cytof, facs))

## Cell type harmonization

In [None]:
def get_map_raw(l1, l2):
    """Build a label-agreement map over two collections of labels.

    Every label of ``l1`` becomes a key; its value is the label itself when the
    label also occurs in ``l2`` and ``None`` otherwise. Labels that occur only in
    ``l2`` are appended afterwards as keys with value ``None``.

    Parameters
    ----------
    l1, l2 : iterables of hashable labels that also support the ``in`` operator.

    Returns
    -------
    dict
        Keys in order of first appearance (``l1`` first, then the remaining
        labels of ``l2``); values are the shared label or ``None``.
    """
    # Labels of l1: shared labels map onto themselves, the rest onto None.
    mapping = {label: (label if label in l2 else None) for label in l1}

    # Labels seen only in l2 have no counterpart in l1.
    for label in l2:
        mapping.setdefault(label, None)

    return mapping

In [None]:
#cite_cells.Annotation_major_subset.unique(), cytof_cells_harmonised.harmonized_major_subset.unique()

In [None]:
#cite_cells.Annotation_cell_type.unique(), cytof_cells_harmonised.harmonized_cell_type.unique()

In [None]:
# Compare the unique CITE-seq labels with the unique harmonised CyTOF labels,
# once at the cell-type level and once at the major-subset level.
cell_type_harm_map = get_map_raw(
    cite_cells['Annotation_cell_type'].unique(),
    cytof_cells_harmonised['harmonized_cell_type'].unique(),
)
major_subset_harm_map = get_map_raw(
    cite_cells['Annotation_major_subset'].unique(),
    cytof_cells_harmonised['harmonized_major_subset'].unique(),
)

#### Note: 

The maps show that the labels of CyTOF and CITE-seq are already harmonized. All that remains at this point is to rename the CyTOF cell labels to the corresponding harmonized labels listed in the `cytof_cells_harmonised` file.
Note that some cell types identified with one assay are not identified with the other, and vice versa.

In [None]:
# Lookup tables from the original CyTOF label (column 'major_cell_type' of
# cytof_cells_harmonised) to its harmonised major subset and harmonised cell type.
# They are used to relabel the CyTOF cells with the names shared with CITE-seq.

dic_major = {}
dic_type = {}
for key, value_major, value_type in zip(
    cytof_cells_harmonised['major_cell_type'],
    cytof_cells_harmonised['harmonized_major_subset'],
    cytof_cells_harmonised['harmonized_cell_type'],
):
    # The duplicate guard must look at the key. The earlier check tested whether the
    # harmonised *value* was already a key, which silently dropped a mapping whenever a
    # harmonised label coincided with an original label inserted before it.
    # setdefault keeps the first mapping seen for each original label.
    dic_major.setdefault(key, value_major)
    dic_type.setdefault(key, value_type)

In [None]:
# Translate each cell's original CyTOF label into its two harmonised labels.
# Labels without an entry in the lookup dictionaries become missing values.
original_labels = cytof.obs['major_cell_type']
cytof.obs['harmonized_major_subset'] = original_labels.map(dic_major)
cytof.obs['harmonized_cell_type'] = original_labels.map(dic_type)

In [None]:
# Rename the harmonised columns to the annotation column names used by the rna object,
# then display the resulting column index.
harmonized_to_annotation = {
    'harmonized_major_subset': 'Annotation_major_subset',
    'harmonized_cell_type': 'Annotation_cell_type',
}
cytof.obs.rename(columns=harmonized_to_annotation, inplace=True)
cytof.obs.columns

In [None]:
# Display the categories of the major-subset annotation in the RNA object.
rna.obs['Annotation_major_subset'].cat.categories

In [None]:
# Store both CyTOF annotation columns with the pandas 'category' dtype.
for column in ('Annotation_major_subset', 'Annotation_cell_type'):
    cytof.obs[column] = cytof.obs[column].astype('category')

In [None]:
# Rename the category 'UNCLASSIFIED' to the literal string 'nan' (a string label,
# not a missing value) in both CyTOF annotation columns.
for column in ('Annotation_major_subset', 'Annotation_cell_type'):
    cytof.obs[column] = cytof.obs[column].cat.rename_categories({'UNCLASSIFIED': 'nan'})
#cytof.obs['Annotation_major_subset'].unique(), cytof.obs['Annotation_cell_type'].unique()

In [None]:
# Every FACS cell receives the same constant label at both annotation levels.
# Open note carried over for the major-subset label: "to change to CD4".
for column in ('Annotation_major_subset', 'Annotation_cell_type'):
    facs.obs[column] = 'WBCs'

## Assign domain to each adata

In [None]:
# Tag every observation with its domain: 'Domain' groups rna and adt together as 'cite',
# while 'Domain_major' keeps them apart.
for adata, domain, domain_major in (
    (rna, 'cite', 'rna'),
    (adt, 'cite', 'adt'),
    (cytof, 'cytof', 'cytof'),
    (facs, 'facs', 'facs'),
):
    adata.obs['Domain'] = domain
    adata.obs['Domain_major'] = domain_major

## Assign reference or query label to each adata

In [None]:
# Mark all observations of every modality as belonging to the 'reference' set.
for adata in (rna, adt, cytof, facs):
    adata.obs['Framework'] = 'reference'

## Write preprocessed, harmonized files

In [None]:
# Output directory for the processed files. The trailing slash is required because the
# write cell appends file names to it by string concatenation.
filepath = "../data_extra_modalities/"
# Create the directory (and any missing parents); do nothing if it already exists.
os.makedirs(filepath, exist_ok=True)

In [None]:
# Write the rna, adt and cytof objects as gzip-compressed h5ad files into the output
# directory. The facs object is not written: its write call is commented out.
for adata, file_name in (
    (rna, "rna_cite-pp-harm.h5ad"),
    (adt, "adt_cite-pp-harm.h5ad"),
    (cytof, "cytof-pp-harm.h5ad"),
):
    adata.write(filepath + file_name, compression="gzip")
#facs.write("facs-pp-harm-sub.h5ad", compression="gzip")