In [1]:
import anndata
import numpy as np
import scipy as sp
import matplotlib.pyplot as plt
import scanpy as sc
import scvelo as scv
import pandas as pd
import pyreadr

# Mouse Embryo Dataset
Cao, J., Spielmann, M., Qiu, X. et al. The single-cell transcriptional landscape of mammalian organogenesis. Nature 566, 496–502 (2019). https://doi.org/10.1038/s41586-019-0969-x

In [4]:
# Load the gene-name and cell-name tables. The `_u` / `_s` suffixes presumably
# pair with the unspliced / spliced matrices assembled further down.
genes_u, cells_u = (
    pd.read_csv("/scratch/blaauw_root/blaauw1/gyichen/gene_name_u.csv"),
    pd.read_csv("/scratch/blaauw_root/blaauw1/gyichen/cell_name_u.csv"),
)
genes_s, cells_s = (
    pd.read_csv("/scratch/blaauw_root/blaauw1/gyichen/gene_name_s.csv"),
    pd.read_csv("/scratch/blaauw_root/blaauw1/gyichen/cell_name_s.csv"),
)

In [6]:
# Replace each loaded table with the numpy array of its single ID column.
# Note: the names are rebound in place, so this cell is not re-runnable
# without first re-running the loading cell.
genes_u, genes_s = [table['genes'].to_numpy() for table in (genes_u, genes_s)]
cells_u, cells_s = [table['cells'].to_numpy() for table in (cells_u, cells_s)]

In [7]:
# Sanity check: both files list the same genes and the same cells, in the same order.
genes_match = np.all(genes_u == genes_s)
cells_match = np.all(cells_u == cells_s)
print(genes_match, cells_match)

True True


In [8]:
# Row-index / value tables of the two count matrices; rows are genes.
U_ix = pd.read_csv("/scratch/blaauw_root/blaauw1/gyichen/U_ix.csv")
S_ix = pd.read_csv("/scratch/blaauw_root/blaauw1/gyichen/S_ix.csv")
# Pull the gene (row) index and the stored value of every nonzero entry.
row_u, val_u = U_ix['row'].to_numpy(), U_ix['val'].to_numpy()
row_s, val_s = S_ix['row'].to_numpy(), S_ix['val'].to_numpy()

In [9]:
# Largest gene (row) index present among the stored entries.
np.max(row_s)

24551

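$U_j$ and $S_j$ contain the cumulative number of nonzero elements up to each column; differencing consecutive entries therefore gives the number of nonzero elements in each column (cell).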

In [10]:
# Column-pointer tables of the two count matrices; columns are cells.
U_j = pd.read_csv("/scratch/blaauw_root/blaauw1/gyichen/U_j.csv")
S_j = pd.read_csv("/scratch/blaauw_root/blaauw1/gyichen/S_j.csv")
# Differencing consecutive entries of the 'col' column turns it into one
# count per column (the text above describes 'col' as cumulative).
col_u_sum = np.diff(U_j['col'].to_numpy())
col_s_sum = np.diff(S_j['col'].to_numpy())

In [36]:
# Read the per-cell annotation from the RDS file; the data frame is taken
# from the returned mapping under the key None.
moca_rds = pyreadr.read_r("/nfs/turbo/umms-welchjd/yichen/data/scRNA/MOCA_df_cell.rds")
anno = moca_rds[None]
# Give the two annotation columns the names used in the rest of the notebook.
anno.columns = ['clusters', 'day']

In [37]:
# Sanity check: the annotation rows are in the same order as the cell IDs.
anno_cell_ids = np.array(anno.index)
print(np.all(anno_cell_ids == cells_s))

True


In [15]:
# Matrix dimensions: N cells (observations) by G genes (variables).
N, G = len(cells_s), len(genes_s)
print(f"Number of cells: {N}, Number of Genes: {G}")

Number of cells: 1393565, Number of Genes: 24552


In [16]:
# Assemble the cell-by-gene sparse count matrices U and S.
#
# col_*_sum[i] is the number of stored values belonging to cell i, and the
# stored values are laid out cell after cell. Repeating each cell index by its
# count therefore gives the cell index of every stored value. np.repeat does
# this in one vectorized call, replacing a Python loop over all N cells, and
# produces integer indices rather than a float64 array.
cell_index = np.arange(N)
col_u = np.repeat(cell_index, col_u_sum[:N])
col_s = np.repeat(cell_index, col_s_sum[:N])

# Fail loudly if the per-cell counts do not account for every stored value.
# The loop-based version would silently have left unassigned entries at cell 0.
if len(col_u) != len(row_u) or len(col_s) != len(row_s):
    raise ValueError(
        "Per-cell nonzero counts do not match the number of stored values: "
        f"unspliced {len(col_u)} vs {len(row_u)}, spliced {len(col_s)} vs {len(row_s)}"
    )

# Cells index the matrix rows and genes the matrix columns, hence shape (N, G).
U = sp.sparse.csr_matrix((val_u, (col_u, row_u)), shape=(N,G))
S = sp.sparse.csr_matrix((val_s, (col_s, row_s)), shape=(N,G))

In [17]:
# Element-wise sum of the unspliced (U) and spliced (S) matrices; used below as the main AnnData matrix.
X = U+S

In [38]:
# Gene table without columns, indexed by gene name.
gene_table = pd.DataFrame({}, index=pd.Index(genes_s))
# Keep the two components of X available as named layers.
count_layers = {'unspliced': U, 'spliced': S}
# Cells are observations (annotated by `anno`), genes are variables.
adata = anndata.AnnData(X=X, obs=anno, var=gene_table, layers=count_layers)

In [39]:
# Display the per-cell annotation table attached to the AnnData object.
adata.obs

Unnamed: 0_level_0,clusters,day
rownames,Unnamed: 1_level_1,Unnamed: 2_level_1
sci3-me-001.ATTAGTCTGTGTATAATACG,E11.5:Endothelium,E11.5
sci3-me-001.GAGGAACTTAATACCATCC,E10.5:Early chondrocytes,E10.5
sci3-me-001.TGGTCGAATGATCGCTTCT,E10.5:Limb mesenchyme progenitors,E10.5
sci3-me-001.GTCGGAGTTTAGACTTCTT,E11.5:Limb mesenchyme progenitors,E11.5
sci3-me-001.CTTCATGCTTTACGATGAA,E11.5:Myocytes,E11.5
...,...,...
sci3-me-760.ATTACCATCTACTTCCGAAT,E13.5:Connective tissue progenitors,E13.5
sci3-me-760.TCAGGAGATCGTAATGCAG,E11.5:Spinal cord inhibitory neurons,E11.5
sci3-me-760.ATTCGCAATTGCCGCAACGA,E9.5:Di/telencephalon,E9.5
sci3-me-760.CTAGTACGTCGTAGTTACC,E10.5:Chondrocyte and osteoblast progenitors,E10.5


In [43]:
# Save the assembled AnnData object to disk in .h5ad format.
adata.write_h5ad("/nfs/turbo/umms-welchjd/yichen/data/scRNA/mouse_E9_13.h5ad")

... storing 'clusters' as categorical
... storing 'day' as categorical


## Add Major Cell Type Annotation

In [2]:
# Load the cell annotation table from CSV.
cell_anno = pd.read_csv("/scratch/blaauw_root/blaauw1/gyichen/cell_annotate.csv")

In [4]:
# List the column names available in the annotation table.
cell_anno.keys()

Index(['sample', 'all_exon_count', 'all_intron_count', 'all_read_count',
       'intergenic_rate', 'embryo_id', 'embryo_sex', 'nuclei_extraction_date',
       'development_stage', 'Total_mRNAs', 'num_genes_expressed',
       'Size_Factor', 'Main_Cluster', 'Main_cluster_tsne_1',
       'Main_cluster_tsne_2', 'Sub_cluster', 'Sub_cluster_tsne_1',
       'Sub_cluster_tsne_2', 'doublet_score', 'detected_doublet',
       'doublet_cluster', 'sub_cluster_id', 'Main_cell_type',
       'Main_trajectory', 'Main_trajectory_umap_1', 'Main_trajectory_umap_2',
       'Main_trajectory_umap_3', 'Main_trajectory_refined_by_cluster',
       'Main_trajectory_refined_umap_1', 'Main_trajectory_refined_umap_2',
       'Main_trajectory_refined_umap_3', 'Sub_trajectory_name',
       'Sub_trajectory_umap_1', 'Sub_trajectory_umap_2',
       'Sub_trajectory_louvain_component', 'Sub_trajectory_Pseudotime'],
      dtype='object')

In [17]:
# Count the distinct main-trajectory labels, ignoring non-string entries.
x = cell_anno["Main_trajectory"].to_numpy()
# Boolean mask marking the entries that are actual strings.
isstr = np.array([isinstance(label, str) for label in x])
x = x[isstr]
distinct_trajectories = np.unique(x)
len(distinct_trajectories)

10

In [21]:
# Load the AnnData object from the .h5ad file (same path as the one written earlier in this notebook).
adata = anndata.read_h5ad("/nfs/turbo/umms-welchjd/yichen/data/scRNA/mouse_E9_13.h5ad")

In [29]:
# Cell identifiers as listed in the annotation table ('sample' column) ...
sample_column = cell_anno['sample']
cell_id_anno = sample_column.to_numpy()
# ... and as stored in the index of the AnnData observations.
obs_index = adata.obs.index
cell_id = obs_index.to_numpy()

In [None]:
# Flag the annotation rows whose cell identifier is present in adata.
# Testing `x in cell_id` against a numpy array scans the whole array for every
# annotation row (quadratic overall). Series.isin does hash-based lookups and
# produces the same boolean mask.
is_in_adata = pd.Series(cell_id_anno).isin(cell_id).to_numpy()
# Main-trajectory label of every retained annotation row, in cell_anno order.
# NOTE(review): this order is not guaranteed to match adata.obs; align by cell
# identifier before attaching these labels to adata. Confirm against downstream use.
cell_labels = cell_anno["Main_trajectory"][is_in_adata].to_numpy()