In [None]:
# Set the inline plotting backend's figure format to 'retina'
%config InlineBackend.figure_format='retina'

In [None]:
import os

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scanpy as sc

from cytofuture_data.gene_name_mapping import GeneNameMapper

In [None]:
# Build the gene name mapper from the four gene-name tables, passed positionally
# in this order: human genes, mouse genes, human->mouse and mouse->human
# orthologue maps.
gene_name_files = (
    '../standard_genes/gene_names/human_genes.csv',
    '../standard_genes/gene_names/mouse_genes.csv',
    '../standard_genes/gene_names/orthologue_map_human2mouse_best.csv',
    '../standard_genes/gene_names/orthologue_map_mouse2human_best.csv',
)
gene_name_mapper = GeneNameMapper(*gene_name_files)

In [None]:
# Identify the data collection / dataset; data_path is the directory the
# standardized AnnData is written to at the end of the notebook.
data_collection_name = 'AllenBrain_WB_MM_2023'
dataset_name = 'all'
data_path = os.path.join('/home/xingjie/Data/data2/cytofuture/datasets/scRNAseq/', data_collection_name)

# Organism and identifier type of this dataset's genes, handed to the gene
# name mapper when the genes are mapped to human gene IDs.
query_organism = 'mouse'
query_var_id_type = 'name'

# Read the source AnnData object from disk
adata = sc.read_h5ad('/home/xingjie/Data/data2/whole_brain/scRNA_seq/Allen_full_set/adata_WB_seq_anno_20230131_downsample_500K.h5ad')

# Show a summary of the loaded object as the cell output
adata

In [None]:
# Compute the QC metrics and store them on adata (inplace=True), then plot the
# two count-based metrics as log-scaled violins, one panel per metric.
sc.pp.calculate_qc_metrics(
    adata,
    percent_top=None,
    log1p=False,
    inplace=True,
)
qc_keys = ['n_genes_by_counts', 'total_counts']
sc.pl.violin(adata, qc_keys, jitter=0, multi_panel=True, log=True)

In [None]:
# Remove low-quality cells: keep only cells with at least 200 detected genes.
# The return value is not used, so this relies on filter_cells modifying adata in place.
sc.pp.filter_cells(adata, min_genes=200)

In [None]:
# List the class labels present as categories of the 'class_label' column;
# these are the keys that cell_ontology_map below is expected to cover.
adata.obs['class_label'].cat.categories

In [None]:
# Lookup table from the dataset's 'class_label' values (keys) to the
# standardized cell type names (values) written to adata_std.obs['cell_type'].
# Several class labels intentionally collapse onto the same cell type name.
# NOTE(review): values are presumably Cell Ontology term names - confirm
# against the ontology before relying on exact spelling.
cell_ontology_map = {
    'Astro-Epen' : 'astrocyte', 
    'CB GABA' : 'cerebellar inhibitory GABAergic interneuron', 
    'CB Glut' : 'cerebellum glutamatergic neuron', 
    'CGE GABA' : 'GABAergic neuron', 
    'CNU GABA' : 'GABAergic neuron',
    'CNU-HYa GABA' : 'GABAergic neuron', 
    'CNU-HYa Glut' : 'glutamatergic neuron', 
    'HY GABA' : 'GABAergic neuron', 
    'HY Glut' : 'glutamatergic neuron', 
    'HY Gnrh1 Glut' : 'glutamatergic neuron',
    'HY MM Glut' : 'glutamatergic neuron', 
    'IT-ET Glut' : 'glutamatergic neuron', 
    'Immune' : 'defensive cell', 
    'LSX GABA' : 'GABAergic neuron', 
    'MB Dopa' : 'dopaminergic neuron', 
    'MB GABA' : 'GABAergic neuron',
    'MB Glut' : 'glutamatergic neuron', 
    'MB-HB Sero' : 'serotonergic neuron', 
    'MGE GABA' : 'GABAergic neuron', 
    'MH-LH Glut' : 'glutamatergic neuron', 
    'MOB-CR Glut' : 'glutamatergic neuron',
    'MOB-DG-IMN' : 'neural progenitor cell', 
    'MY GABA' : 'GABAergic neuron', 
    'MY Glut' : 'glutamatergic neuron', 
    'NP-CT-L6b Glut' : 'glutamatergic neuron', 
    'OEG' : 'olfactory ensheathing cell', 
    'Oligo' : 'oligodendrocyte',
    'P GABA' : 'GABAergic neuron', 
    'P Glut' : 'glutamatergic neuron', 
    'Pineal Glut' : 'glutamatergic neuron', 
    'TH Glut' : 'glutamatergic neuron', 
    'Vascular' : 'brain vascular cell',
}

In [None]:
# Build the standardized AnnData: same expression matrix, dataset-qualified
# cell IDs, a fixed set of metadata columns, and human gene IDs for the genes.
dataset_id = ':'.join([data_collection_name, dataset_name])

# Prepend the dataset ID to the cell IDs. IDs that already carry the prefix are
# left untouched so that re-running this cell does not prefix them a second time.
cell_id_prefix = dataset_id + ':'
adata.obs.index = [i if i.startswith(cell_id_prefix) else cell_id_prefix + i
                   for i in adata.obs.index]

# Fail loudly when a class label has no entry in cell_ontology_map; Series.map
# would otherwise silently turn those cells' cell_type into NaN.
unmapped_labels = sorted(
    set(adata.obs['class_label'].dropna().unique()) - set(cell_ontology_map),
    key=str,
)
if unmapped_labels:
    raise ValueError(f'class_label values missing from cell_ontology_map: {unmapped_labels}')

# Copy the standard meta-data.
# obs/var start as column-less frames that only keep the cell / gene index.
adata_std = sc.AnnData(X=adata.X, obs=adata.obs[[]], var=adata.var[[]])
adata_std.obs['dataset_id'] = dataset_id
adata_std.obs['assay'] = adata.obs['method']
# One batch per assay within this dataset
adata_std.obs['batch'] = [':'.join([dataset_id, a])
                          for a in adata_std.obs['assay']]
adata_std.obs['development_stage'] = adata.obs['age'].astype(str)
adata_std.obs['tissue'] = adata.obs['roi']
adata_std.obs['cell_type'] = adata.obs['class_label'].map(cell_ontology_map)

# Map the genes to human gene IDs
adata_std.var['human_gene_id'] = gene_name_mapper.map_gene_names(
    list(adata_std.var.index), query_organism, 'human', query_var_id_type, 'id')

In [None]:
# Show the standardized object, followed by the view restricted to genes whose
# human gene ID is not the 'na' placeholder.
has_human_gene_id = adata_std.var['human_gene_id'] != 'na'
display(adata_std, adata_std[:, has_human_gene_id])

In [None]:
%%time
adata_std.write_h5ad(os.path.join(data_path, f'standard_adata_{data_collection_name}_{dataset_name}.h5ad'),
                compression='gzip')