In [None]:
# Render inline matplotlib figures in the high-resolution 'retina' format.
%config InlineBackend.figure_format='retina'

In [None]:
import os

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scanpy as sc

from cytofuture_data.gene_name_mapping import GeneNameMapper

In [None]:
# Build the gene name mapper from the gene tables and the orthologue maps,
# all of which live in the same directory.
gene_names_dir = '../standard_genes/gene_names'
gene_name_mapper = GeneNameMapper(
    f'{gene_names_dir}/human_genes.csv',
    f'{gene_names_dir}/mouse_genes.csv',
    f'{gene_names_dir}/orthologue_map_human2mouse_best.csv',
    f'{gene_names_dir}/orthologue_map_mouse2human_best.csv',
)

In [None]:
# Load the data
data_collection_name = 'LaManno_WBDev_MM_2021'

# Root directory holding the scRNA-seq data collections. It defaults to the
# original location; set the CYTOFUTURE_SCRNASEQ_DIR environment variable to
# run the notebook on another machine without editing this cell.
scrnaseq_root = os.environ.get(
    'CYTOFUTURE_SCRNASEQ_DIR',
    '/home/xingjie/Data/data2/cytofuture/datasets/scRNAseq/',
)
data_path = os.path.join(scrnaseq_root, data_collection_name)
dataset_name = 'all'
query_organism, query_var_id_type = 'mouse', 'id'

# Fail with an actionable message when the download is not where we expect it.
input_path = os.path.join(data_path, 'download', 'adata_wb_dev.h5ad')
if not os.path.exists(input_path):
    raise FileNotFoundError(
        f'Input file not found: {input_path} '
        '(set CYTOFUTURE_SCRNASEQ_DIR to the directory containing the data collections)'
    )

adata = sc.read_h5ad(input_path)
# Index the genes by their 'Accession' column; these IDs are what gets passed
# to the gene name mapper later on.
adata.var = adata.var.set_index('Accession')

adata

In [None]:
# The dataset ships a precomputed embedding under obsm['TSNE'] (presumably
# t-SNE coordinates, going by the key name). Expose it under the key scanpy's
# t-SNE plotter expects instead of storing it as 'X_umap', so the figure axes
# are not mislabelled as UMAP.
adata.obsm['X_tsne'] = adata.obsm['TSNE']
sc.pl.tsne(adata, color='Class', palette='tab20')

In [None]:
# Compute per-cell / per-gene QC metrics in place (stored on adata.obs / adata.var).
sc.pp.calculate_qc_metrics(
    adata,
    percent_top=None,
    log1p=False,
    inplace=True,
)

# Show the distributions of detected genes and total counts on a log scale.
qc_keys = ['n_genes_by_counts', 'total_counts']
sc.pl.violin(
    adata,
    qc_keys,
    jitter=0,
    multi_panel=True,
    log=True,
)

In [None]:
# Keep only cells that pass scanpy's min_genes filter (applied to adata in place).
min_genes_per_cell = 200
sc.pp.filter_cells(adata, min_genes=min_genes_per_cell)

In [None]:
# Drop cells whose class is flagged as unusable ('Bad cells', 'Undefined') or missing.
cell_class = adata.obs['Class']
keep_cell = ~cell_class.isin(['Bad cells', 'Undefined']) & cell_class.notna()

# Subsetting an AnnData returns a view. Materialize it with .copy() so that the
# later in-place edits (e.g. renaming the cell IDs) act on a real object instead
# of triggering an implicit copy or, depending on the anndata version, being lost.
adata = adata[keep_cell].copy()

In [None]:
# Show the categories of the 'Class' annotation (the keys needed in cell_ontology_map).
adata.obs['Class'].cat.categories.tolist()

In [None]:
# Translation table from the dataset's 'Class' labels (keys) to the
# standardized cell type names (values) written to the output 'cell_type' column.
cell_ontology_map = {
    'Blood': 'blood cell',
    'Choroid plexus': 'choroid plexus epithelial cell',
    'Ectoderm': 'ectodermal cell',
    'Endoderm': 'endodermal cell',
    'Ependymal': 'ependymal cell',
    'Fibroblast': 'fibroblast',
    'Gastrulation': 'gastrula cell',
    'Glioblast': 'glioblast',
    'Immune': 'defensive cell',
    'Mesenchyme': 'mesenchymal cell',
    'Mesoderm': 'mesodermal cell',
    'Neural crest': 'neural crest cell',
    'Neural tube': 'neural stem cell',
    'Neuroblast': 'neural progenitor cell',
    'Neuron': 'neuron',
    'Olfactory ensheathing cell': 'olfactory ensheathing cell',
    'Oligodendrocyte': 'oligodendrocyte',
    'Pineal gland': 'interstitial cell of pineal gland',
    'Radial glia': 'radial glial cell',
    'Schwann cell': 'Schwann cell',
    'Subcommissural organ': 'hypendymal cell',
    'Vascular': 'brain vascular cell',
}

In [None]:
# Build the standardized AnnData (`adata_std`): same expression matrix, but only
# the standard obs/var metadata columns.

# `adata` may still be a view if it was produced by subsetting; take a real copy
# so the cell-ID rename below is applied to the object itself.
if adata.is_view:
    adata = adata.copy()

# Append the dataset name into cell IDs. Only IDs that do not carry the prefix
# yet are changed, so re-running this cell does not stack the prefix twice.
dataset_id = ':'.join([data_collection_name, dataset_name])
id_prefix = dataset_id + ':'
adata.obs_names = [i if i.startswith(id_prefix) else id_prefix + i for i in adata.obs_names]

# Series.map() silently yields NaN for keys missing from the dict; fail early
# with the offending class names instead of writing unlabeled cells.
unmapped_classes = sorted(set(adata.obs['Class'].dropna().astype(str)) - set(cell_ontology_map))
if unmapped_classes:
    raise ValueError(f'No cell_ontology_map entry for class(es): {unmapped_classes}')

# Copy the standard meta-data
adata_std = sc.AnnData(X=adata.X, obs=adata.obs[[]], var=adata.var[[]])
adata_std.obs['dataset_id'] = dataset_id
adata_std.obs['assay'] = '10x'
adata_std.obs['batch'] = [':'.join([data_collection_name, dataset_name, a])
                          for a in adata.obs['Sample_Index']]
adata_std.obs['development_stage'] = adata.obs['Age']
adata_std.obs['tissue'] = adata.obs['Tissue']
adata_std.obs['cell_type'] = adata.obs['Class'].map(cell_ontology_map)

# Map the genes to human gene IDs
adata_std.var['human_gene_id'] = gene_name_mapper.map_gene_names(
    list(adata_std.var.index), query_organism, 'human', query_var_id_type, 'id')

In [None]:
# Sanity check: list the distinct values of the standardized annotations.
for obs_column in ('cell_type', 'development_stage'):
    print(np.unique(adata_std.obs[obs_column]))

# Show the full object, then the subset of genes whose human ID is not 'na'
# ('na' looks like the mapper's placeholder for unmapped genes — TODO confirm).
has_human_gene_id = adata_std.var['human_gene_id'] != 'na'
display(adata_std)
display(adata_std[:, has_human_gene_id])

In [None]:
%%time
adata_std.write_h5ad(os.path.join(data_path, f'standard_adata_{data_collection_name}_{dataset_name}.h5ad'),
                compression='gzip')