In [None]:
# Have the inline matplotlib backend emit figures in 'retina' format
%config InlineBackend.figure_format='retina'

In [None]:
import os

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scanpy as sc

from cytofuture_data.gene_name_mapping import GeneNameMapper

In [None]:
# Build the gene name mapper from the reference gene tables.
# The tables are handed over positionally, in this order: human genes,
# mouse genes, human->mouse orthologues, mouse->human orthologues.
gene_table_paths = (
    '../standard_genes/gene_names/human_genes.csv',
    '../standard_genes/gene_names/mouse_genes.csv',
    '../standard_genes/gene_names/orthologue_map_human2mouse_best.csv',
    '../standard_genes/gene_names/orthologue_map_mouse2human_best.csv',
)
gene_name_mapper = GeneNameMapper(*gene_table_paths)

In [None]:
# Load the data
# The dataset root defaults to the original location but can be redirected
# with the CYTOFUTURE_SCRNASEQ_DIR environment variable, so the notebook is
# not tied to a single user's home directory. An unset or empty variable
# falls back to the default.
scrnaseq_root = (os.environ.get('CYTOFUTURE_SCRNASEQ_DIR')
                 or '/home/xingjie/Data/data2/cytofuture/datasets/scRNAseq/')

data_collection_name = 'Xu_HS_early_organogenesis_2023'
data_path = os.path.join(scrnaseq_root, data_collection_name)
dataset_name = 'all'
# Organism and identifier type of the input genes; both are forwarded to
# gene_name_mapper.map_gene_names() when the var index is mapped.
query_organism, query_var_id_type = 'human', 'id'

adata = sc.read_h5ad(os.path.join(data_path, 'download', 'adata_all.h5ad'))

# Show the AnnData summary as the cell output
adata

In [None]:
# Inspect the metadata rows whose 'developmental system' label is 'IM'
is_im_cell = adata.obs['developmental system'] == 'IM'
adata.obs.loc[is_im_cell]

In [None]:
# List every category label declared for the 'final_annotation' column
final_annotation_labels = adata.obs['final_annotation'].cat.categories
[*final_annotation_labels]

In [None]:
# Lookup table: 'developmental system' label from the source annotation
# -> standardized 'cell_type' name written to the output AnnData.
# (Target names are presumably meant to follow Cell Ontology naming - TODO confirm.)
cell_ontology_map = {
    'IM': 'intermediate mesodermal cell',
    'PGC': 'primordial germ cell',
    'blood': 'blood cell',
    'craniofacial': 'craniofacial cell',
    'endoderm': 'endodermal cell',
    'endothelium': 'endothelial cell',
    'epidermis': 'epidermal cell',
    'epithelium': 'epithelial cell',
    'fibroblast': 'fibroblast',
    'head mesoderm': 'head mesodermal cell',
    'limb': 'limb cell',
    'miscellaneous': 'miscellaneous',
    'neural progenitor': 'neural progenitor cell',
    'neuron': 'neuron',
    'schwann': 'Schwann cell',
    'sensory neuron': 'sensory neuron',
    # Both LPM labels collapse onto the same target name.
    'somatic LPM': 'lateral mesodermal cell',
    'somite': 'somite',
    'splanchnic LPM': 'lateral mesodermal cell',
}

In [None]:
# Prefix the cell IDs with "<data collection>:<dataset>:".
# The startswith guard keeps this cell idempotent: re-running it does not
# stack a second prefix onto IDs that were already renamed in place.
dataset_id = ':'.join([data_collection_name, dataset_name])
cell_id_prefix = dataset_id + ':'
adata.obs.index = [
    str(cell_id) if str(cell_id).startswith(cell_id_prefix) else cell_id_prefix + str(cell_id)
    for cell_id in adata.obs.index
]

# Copy the standard meta-data
# Start from empty obs/var frames (index only) so that only the standardized
# columns assigned below end up in the new object.
adata_std = sc.AnnData(X=adata.X, obs=adata.obs[[]], var=adata.var[[]])
adata_std.obs['dataset_id'] = dataset_id
adata_std.obs['assay'] = '10x'
# f-string formatting also copes with sample IDs that are not plain strings
adata_std.obs['batch'] = [f'{cell_id_prefix}{sample}' for sample in adata.obs['sample']]
adata_std.obs['development_stage'] = adata.obs['stage']
adata_std.obs['tissue'] = adata.obs['dissection_part']
adata_std.obs['cell_type'] = adata.obs['developmental system'].map(cell_ontology_map)

# Labels that are missing from cell_ontology_map turn into NaN in 'cell_type';
# report them instead of letting them pass silently.
unmapped_systems = sorted(
    set(adata.obs['developmental system'].dropna().unique()) - set(cell_ontology_map),
    key=str,
)
if unmapped_systems:
    print(f'WARNING: no cell_type mapping for developmental system(s): {unmapped_systems}')

# Map the genes to human gene IDs
adata_std.var['human_gene_id'] = gene_name_mapper.map_gene_names(
    list(adata_std.var.index), query_organism, 'human', query_var_id_type, 'id')

In [None]:
# Sanity checks on the standardized object: distinct values of the key
# annotation columns, then the object itself.
for obs_column in ('cell_type', 'development_stage'):
    print(np.unique(adata_std.obs[obs_column]))
display(adata_std)

# View restricted to genes whose mapped ID is not 'na'
# (presumably the mapper's placeholder for unmapped genes - TODO confirm).
has_human_gene_id = adata_std.var['human_gene_id'] != 'na'
display(adata_std[:, has_human_gene_id])

In [None]:
%%time
adata_std.write_h5ad(os.path.join(data_path, f'standard_adata_{data_collection_name}_{dataset_name}.h5ad'),
                compression='gzip')