In [None]:
# Set the inline plotting backend's figure format to 'retina'.
%config InlineBackend.figure_format='retina'

In [None]:
import os

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scanpy as sc

from cytofuture_data.gene_name_mapping import GeneNameMapper

In [None]:
#%%time
#import anndata
#
#data_collection_name = 'Qiu_Organogenesis_MM_2022'
#data_path = os.path.join('/home/xingjie/Data/data2/cytofuture/datasets/scRNAseq/', data_collection_name)
#
#ages = ['3.5', '4.5', '5.25', '5.5', '6.25', '6.5', '6.75', '7.0', '7.25', '7.5', '7.75', '8.0',
#       '8.25', '8.5a', '8.5b', '9.5', '10.5', '11.5', '12.5', '13.5']
#
#adata_list = []
#
#for age in ages:
#    print(age)
#    adata_tmp = sc.read_h5ad(os.path.join(data_path, 'TOME', f'E{age}.h5ad'))
#    meta_df = pd.read_csv(os.path.join(data_path, 'TOME', f'metadata_E{age}.csv'), index_col=0)
#    adata_tmp.obs = meta_df
#    adata_tmp.var = adata_tmp.var.set_index('features')
#    
#    adata_list.append(adata_tmp)
#    
#adata = anndata.concat(adata_list)
#adata.obs = adata.obs.drop(columns='group')
#
#adata.write_h5ad(os.path.join(data_path, 'TOME', 'adata_all.h5ad'), compression='gzip')

In [None]:
# Build the gene name mapper from the four CSV tables in the standard gene-name
# directory. The files are passed positionally in this order: human genes,
# mouse genes, human->mouse orthologue map, mouse->human orthologue map.
gene_names_dir = '../standard_genes/gene_names'
gene_table_files = (
    'human_genes.csv',
    'mouse_genes.csv',
    'orthologue_map_human2mouse_best.csv',
    'orthologue_map_mouse2human_best.csv',
)
gene_name_mapper = GeneNameMapper(*(f'{gene_names_dir}/{name}' for name in gene_table_files))

In [None]:
# Load the data
data_collection_name = 'Qiu_Organogenesis_MM_2022'

# Root directory holding the scRNA-seq data collections. The original absolute
# path remains the default; set the CYTOFUTURE_SCRNASEQ_DIR environment variable
# to run this notebook on a machine with a different directory layout.
scrnaseq_root = os.environ.get('CYTOFUTURE_SCRNASEQ_DIR',
                               '/home/xingjie/Data/data2/cytofuture/datasets/scRNAseq/')
data_path = os.path.join(scrnaseq_root, data_collection_name)
dataset_name = 'all'

# Organism and identifier type of the genes in adata.var; both are passed to
# the gene name mapper when the genes are mapped to human gene IDs.
query_organism = 'mouse'
query_var_id_type = 'id'

# Read the merged AnnData file from the collection's TOME sub-directory.
adata = sc.read_h5ad(os.path.join(data_path, 'TOME', 'adata_all.h5ad'))
adata

In [None]:
# Compute per-cell / per-gene QC metrics in place (stored on adata), then show
# the distributions of detected genes and total counts on a log scale.
sc.pp.calculate_qc_metrics(
    adata,
    percent_top=None,
    log1p=False,
    inplace=True,
)

qc_keys = ['n_genes_by_counts', 'total_counts']
sc.pl.violin(
    adata,
    qc_keys,
    jitter=0,
    multi_panel=True,
    log=True,
)

In [None]:
# Remove cells that do not reach the minimum number of detected genes.
# The return value is not used: adata itself is filtered.
min_genes_per_cell = 200
sc.pp.filter_cells(adata, min_genes=min_genes_per_cell)

In [None]:
# Show every category of the 'cell_type' annotation; these are the labels
# that cell_ontology_map has to cover.
cell_type_column = adata.obs['cell_type']
cell_type_column.cat.categories.tolist()

In [None]:
# Lookup table from the dataset's original 'cell_type' labels (keys) to the
# standardized cell type names (values) stored in adata_std.obs['cell_type'].
cell_ontology_map = {
    'Allantois': 'allantois',
    'Amniochorionic mesoderm': 'amniochorionic mesoderm',
    'Amniochorionic mesoderm A': 'amniochorionic mesoderm A',
    'Amniochorionic mesoderm B': 'amniochorionic mesoderm B',
    'Anterior floor plate': 'anterior floor plate',
    'Anterior primitive streak': 'anterior primitive streak',
    'Apical ectodermal ridge': 'apical ectodermal ridge',
    'Blood progenitors': 'blood progenitors',
    'Brain endothelium': 'brain endothelium',
    'Branchial arch epithelium': 'branchial arch epithelium',
    'Cardiomyocytes': 'fetal cardiomyocyte',
    'Caudal lateral epiblast': 'caudal lateral epiblast',
    'Caudal neuroectoderm': 'caudal neuroectoderm',
    'Chondrocyte and osteoblast progenitors': 'chondrocyte and osteoblast progenitors',
    'Connective tissue progenitors': 'connective tissue progenitors',
    'Definitive endoderm': 'definitive endoderm',
    'Definitive erythroid cells': 'definitive erythroid cells',
    'Di/mesencephalon excitatory neurons': 'di/mesencephalon excitatory neurons',
    'Di/mesencephalon inhibitory neurons': 'di/mesencephalon inhibitory neurons',
    'Di/telencephalon': 'di/telencephalon',
    'Early chondrocytes': 'early chondrocytes',
    'Embryonic visceral endoderm': 'embryonic visceral endoderm',
    'Endothelium': 'endothelial cell',
    'Epiblast': 'epiblast cell',
    'Epidermis': 'epidermal cell',
    'Extraembryonic ectoderm': 'extraembryonic ectoderm',
    'Extraembryonic mesoderm': 'extraembryonic mesoderm',
    'Extraembryonic visceral endoderm': 'extraembryonic visceral endoderm',
    'First heart field': 'first heart field',
    'Forebrain/midbrain': 'forebrain/midbrain',
    'Foregut epithelium': 'foregut epithelium',
    'Fusing epithelium': 'fusing epithelium',
    'Gut': 'gut',
    'Gut and lung epithelium': 'gut and lung epithelium',
    'Hematoendothelial progenitors': 'hematoendothelial progenitors',
    'Hepatocytes': 'hepatocyte',
    'Hindbrain': 'hindbrain',
    'Hypoblast': 'hypoblast',
    'Inhibitory interneurons': 'inhibitory interneuron',
    'Inner cell mass': 'inner cell mass cell',
    'Intermediate mesoderm': 'intermediate mesodermal cell',
    'Intermediate progenitor cells': 'intermediate progenitor cell',
    'Limb mesenchyme progenitors': 'limb mesenchyme progenitors',
    'Liver endothelium': 'liver endothelium',
    'Lung epithelium': 'lung epithelium',
    'Megakaryocytes': 'megakaryocyte',
    'Mesencephalon/MHB': 'mesencephalon/mhb',
    'Mesenchymal stromal cells': 'mesenchymal stromal cell',
    'Midgut/Hindgut epithelium': 'midgut/hindgut epithelium',
    'Mixed mesoderm': 'mixed mesoderm',
    'Motor neurons': 'motor neuron',
    'Myocytes': 'myocyte',
    'Nascent mesoderm': 'nascent mesoderm',
    'Neural crest': 'neural crest cell',
    'Neural crest (PNS glia)': 'neural crest (pns glia)',
    'Neural crest (PNS neurons)': 'neural crest (pns neurons)',
    'Neuromesodermal progenitors': 'neuromesodermal progenitors',
    'Neuron progenitor cells': 'neural progenitor cell',
    'Noradrenergic neurons': 'noradrenergic neuron',
    'Notochord': 'notochordal cell',
    'Olfactory epithelium': 'olfactory epithelium',
    'Olfactory sensory neurons': 'olfactory sensory neuron',
    'Osteoblast progenitors A': 'osteoblast progenitors A',
    'Osteoblast progenitors B': 'osteoblast progenitors B',
    'Otic epithelium': 'otic epithelium',
    'Pancreatic epithelium': 'pancreatic epithelium',
    'Paraxial mesoderm A': 'paraxial mesoderm A',
    'Paraxial mesoderm B': 'paraxial mesoderm B',
    'Paraxial mesoderm C': 'paraxial mesoderm C',
    'Parietal endoderm': 'parietal endoderm',
    'Placodal area': 'placodal area',
    'Posterior floor plate': 'posterior floor plate',
    'Pre-epidermal keratinocytes': 'pre-epidermal keratinocyte',
    'Primitive erythroid cells': 'primitive erythroid cells',
    'Primitive streak and adjacent ectoderm': 'primitive streak and adjacent ectoderm',
    'Primordial germ cells': 'primordial germ cell',
    'Renal epithelium': 'renal epithelium',
    'Retinal neurons': 'retinal neuron',
    'Retinal pigment cells': 'retinal pigment cell',
    'Retinal primordium': 'retinal primordium',
    'Roof plate': 'roof plate',
    'Rostral neuroectoderm': 'rostral neuroectoderm',
    'Second heart field': 'second heart field',
    'Skeletal muscle progenitors': 'skeletal muscle progenitors',
    'Somatic mesoderm': 'somatic mesoderm',
    'Spinal cord': 'spinal cord',
    'Spinal cord (dorsal)': 'spinal cord (dorsal)',
    'Spinal cord (ventral)': 'spinal cord (ventral)',
    'Spinal cord excitatory neurons': 'spinal cord excitatory neuron',
    'Spinal cord inhibitory neurons': 'spinal cord inhibitory neuron',
    'Splanchnic mesoderm': 'splanchnic mesoderm',
    'Surface ectoderm': 'surface ectoderm',
    'Visceral endoderm': 'visceral endoderm',
    'White blood cells': 'white blood cell',
}

In [None]:
# Append the dataset name to the cell IDs.
# adata.obs.index is modified in place, so the prefix is only added when it is
# missing; re-running this cell therefore does not stack the prefix twice.
cell_id_prefix = ':'.join([data_collection_name, dataset_name]) + ':'
adata.obs.index = [i if i.startswith(cell_id_prefix) else cell_id_prefix + i
                   for i in adata.obs.index]

# Fail early with a readable message if a cell type label has no entry in
# cell_ontology_map; .map() below would otherwise silently produce NaN for it.
unmapped_cell_types = sorted(set(adata.obs['cell_type'].dropna().unique()) - set(cell_ontology_map))
if unmapped_cell_types:
    raise ValueError(f'cell_ontology_map has no entry for cell types: {unmapped_cell_types}')

# Copy the standard meta-data.
# X is cast to float32 explicitly because the `dtype` argument of AnnData is
# deprecated; copy=False avoids duplicating the matrix when it is already float32.
# obs/var start as empty frames that only carry the cell / gene indices.
adata_std = sc.AnnData(X=adata.X.astype(np.float32, copy=False),
                       obs=adata.obs[[]], var=adata.var[[]])
adata_std.obs['dataset_id'] = ':'.join([data_collection_name, dataset_name])
adata_std.obs['assay'] = 'seq'
adata_std.obs['batch'] = [':'.join([data_collection_name, dataset_name, a])
                          for a in adata.obs['orig.ident']]
adata_std.obs['development_stage'] = adata.obs['day']
# The original cell type label is kept as 'tissue'; 'cell_type' holds the mapped name.
adata_std.obs['tissue'] = adata.obs['cell_type']
adata_std.obs['cell_type'] = adata.obs['cell_type'].map(cell_ontology_map)

# Map the genes to human gene IDs
adata_std.var['human_gene_id'] = gene_name_mapper.map_gene_names(list(adata_std.var.index),
                                                             query_organism, 'human', query_var_id_type, 'id')

In [None]:
# Sanity check: print the distinct standardized cell types and development stages.
for obs_column in ('cell_type', 'development_stage'):
    print(np.unique(adata_std.obs[obs_column]))

# Show the full standardized object, then the subset of genes whose
# 'human_gene_id' is not the placeholder 'na'.
has_human_gene_id = adata_std.var['human_gene_id'] != 'na'
display(adata_std)
display(adata_std[:, has_human_gene_id])

In [None]:
%%time
adata_std.write_h5ad(os.path.join(data_path, f'standard_adata_{data_collection_name}_{dataset_name}.h5ad'),
                compression='gzip')