### Prepare the RNA AnnData object for SCENIC+

- see here for example: https://scenicplus.readthedocs.io/en/latest/human_cerebellum_scRNA_pp.html

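Since we are interested in the development + disease contrast, we combine the cell type, the binary disease status, and the age status into a single label of the form `cell_type:disease_binary:age_status` (e.g. `Cardiomyocyte:Y:postnatal`) and store it in `adata.obs['updated_cell_type']`.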

In [1]:
import scanpy as sc 
import pandas as pd
import numpy as np
from collections import Counter

In [2]:
# Load the subsampled RNA AnnData object from disk.
adata = sc.read_h5ad(filename="01_subsampled_RNA.h5ad")

# Observation names (barcodes) must be unique before any per-cell labelling.
adata.obs_names_make_unique()

In [3]:
# Inspect the per-observation metadata table (pandas truncates the display).
adata.obs

Unnamed: 0,age,donor_id,sex,region,cell_type,disease,consistent_cell_type,study,technology,cell_or_nuclei,...,_scvi_labels,leiden_scVI,scvi_cell_type,redo_leiden_0.5,UMAP1,UMAP2,v2_scvi_cell_type,final_cell_type,disease_and_age_status,group
Fetal_LV_18wk_e1-run1n2:CCGCGTATTGGG,18.0,Penn:Penn_F1,female,LV,Lymphoid,ND,Lymphoid,Penn,Dropseq,Nuclei,...,0,18,Endothelial,8,-3.197946,6.218067,Endothelial,Endothelial,N:fetal,N:fetal_Endothelial
Fetal-Atria-18wk:TCCGTCCAACAC,18.0,Penn:Penn_F1,female,Atria,Lymphoid,ND,Lymphoid,Penn,Dropseq,Nuclei,...,0,18,Endothelial,2,-2.106596,6.544215,Endothelial,Endothelial,N:fetal,N:fetal_Endothelial
Fetal-1st-LV-0315-1-run1n2:CGGAGATAACTG,18.0,Penn:Penn_F1,female,LV,Lymphoid,ND,Lymphoid,Penn,Dropseq,Nuclei,...,0,18,Endothelial,3,-0.987720,6.037060,Endothelial,Endothelial,N:fetal,N:fetal_Endothelial
Fetal_LV_18wk_e1-run1n2:TTGGGCCGGAAA,18.0,Penn:Penn_F1,female,LV,Lymphoid,ND,Lymphoid,Penn,Dropseq,Nuclei,...,0,18,Endothelial,2,-2.163236,6.179525,Endothelial,Endothelial,N:fetal,N:fetal_Endothelial
ENCFF849ALE:CGTGTTACAGTACCGT,16.0,ENCODE v4 (Snyder):ENCFF849ALE,female,WH,Endothelial,ND,Endothelial,ENCODE v4 (Snyder),Multiome-v1,Nuclei,...,0,18,Endothelial,2,-1.239180,5.651179,Endothelial,Endothelial,N:fetal,N:fetal_Endothelial
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TWCM-11-74:TTCGGTCGTTCGAATC,65.0,Koenig 2022:TWCM-11-74,male,LV,Myeloid,ND,Myeloid,Koenig 2022,5prime-v1,Nuclei,...,0,14,Myeloid,1,1.887421,12.988435,Myeloid,Myeloid,N:postnatal,N:postnatal_Myeloid
P1516:TTGTGTTCAGTGACCC-1-10,66.0,Chaffin 2022:P1516,female,LV,Macrophage,ND,Myeloid,Chaffin 2022,3prime-v3,Nuclei,...,0,14,Myeloid,8,1.123286,15.893510,Myeloid,Myeloid,N:postnatal,N:postnatal_Myeloid
ENCSR762LML:GGTCCTGCATCAGCAC,54.0,ENCODE v4 (Snyder):ENCSR762LML,male,LV,Myeloid,ND,Myeloid,ENCODE v4 (Snyder),Multiome-v1,Nuclei,...,0,14,Myeloid,9,3.155972,10.162045,Myeloid,Myeloid,N:postnatal,N:postnatal_Myeloid
ND15755-LV-2nd-run1234:AACTCAGTGGTA,65.0,Penn:ND15755,male,LV,Cardiomyocyte,ND,Cardiomyocyte,Penn,Dropseq,Nuclei,...,0,14,Myeloid,9,3.288646,11.371865,Myeloid,Myeloid,N:postnatal,N:postnatal_Myeloid


In [4]:
# Tally how many observations fall into each age_status category.
Counter(adata.obs["age_status"])

Counter({'postnatal': 9000, 'fetal': 4500})

### SCENIC+ requires the raw data to be stored in `adata.raw`

In [5]:
# Snapshot the current expression matrix and var annotations into adata.raw.
# NOTE(review): assumes adata.X still holds the raw (un-normalized) data at
# this point, as nothing in this notebook transforms it after loading - confirm
# against how 01_subsampled_RNA.h5ad was produced.
adata.raw = adata

In [6]:
# Composite per-observation label "<final_cell_type>:<disease_binary>:<age_status>",
# e.g. "Cardiomyocyte:Y:postnatal". Every component is cast to str first so that
# categorical / non-string columns concatenate as plain text.
adata.obs['updated_cell_type'] = (
    adata.obs['final_cell_type']
    .astype(str)
    .str.cat(
        [
            adata.obs['disease_binary'].astype(str),
            adata.obs['age_status'].astype(str),
        ],
        sep=":",
    )
)

In [7]:
# Condition-only label "<disease_binary>:<age_status>" (no cell type), e.g. "Y:postnatal".
# NOTE(review): adata.obs already shows a 'disease_and_age_status' column with
# similar-looking values - confirm whether this new column is redundant.
adata.obs['age_disease_status'] = (
    adata.obs['disease_binary']
    .astype(str)
    .str.cat(adata.obs['age_status'].astype(str), sep=":")
)

In [8]:
# Sanity check: number of observations per composite cell-type label.
Counter(adata.obs["updated_cell_type"])

Counter({'Endothelial:N:fetal': 750,
         'Endothelial:Y:postnatal': 750,
         'Endothelial:N:postnatal': 750,
         'Lymphoid:N:fetal': 750,
         'Lymphoid:Y:postnatal': 750,
         'Lymphoid:N:postnatal': 750,
         'Fibroblast:N:fetal': 750,
         'Fibroblast:Y:postnatal': 750,
         'Fibroblast:N:postnatal': 750,
         'Cardiomyocyte:N:fetal': 750,
         'Cardiomyocyte:Y:postnatal': 750,
         'Cardiomyocyte:N:postnatal': 750,
         'Pericyte:N:fetal': 750,
         'Pericyte:Y:postnatal': 750,
         'Pericyte:N:postnatal': 750,
         'Myeloid:N:fetal': 750,
         'Myeloid:Y:postnatal': 750,
         'Myeloid:N:postnatal': 750})

In [9]:
# Sanity check: number of observations per disease/age condition.
Counter(adata.obs["age_disease_status"])

Counter({'N:fetal': 4500, 'Y:postnatal': 4500, 'N:postnatal': 4500})

In [10]:
%%time
adata.write("03_RNA_adata_for_scenicplus.h5ad")

CPU times: user 288 ms, sys: 522 ms, total: 811 ms
Wall time: 809 ms
