In [None]:
# Set the inline plotting backend's figure format to 'retina'
%config InlineBackend.figure_format='retina'

In [None]:
import os

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import anndata
import scanpy as sc

from cytofuture_data.gene_name_mapping import GeneNameMapper

In [None]:
# Build the gene name mapper from the gene tables and orthologue maps.
# The paths are passed positionally in this exact order:
# human genes, mouse genes, human->mouse orthologues, mouse->human orthologues.
gene_name_files = [
    '../standard_genes/gene_names/human_genes.csv',
    '../standard_genes/gene_names/mouse_genes.csv',
    '../standard_genes/gene_names/orthologue_map_human2mouse_best.csv',
    '../standard_genes/gene_names/orthologue_map_mouse2human_best.csv',
]
gene_name_mapper = GeneNameMapper(*gene_name_files)

In [None]:
# Read the downloaded expression matrix and the per-cell annotation table,
# then assemble them into an AnnData object.
data_path = '/home/xingjie/Data/data2/cytofuture/datasets/scRNAseq/Tyser_Embryo_HS_2021/download'

exp_df = pd.read_csv(os.path.join(data_path, 'expression_values.csv'), index_col=0)
anno_df = pd.read_csv(os.path.join(data_path, 'annot_umap.csv'), index_col=0)
# Put the annotation rows in the same order as the expression matrix rows
anno_df = anno_df.loc[exp_df.index]

# Invert a natural-log(1 + x) transform. expm1 computes exp(x) - 1 without the
# precision loss that the explicit subtraction suffers for values close to zero.
# NOTE(review): presumably the downloaded values are log1p-transformed -- confirm.
X = np.expm1(exp_df.values)

# Rows of exp_df become obs and its columns become var. The var frame only needs
# the column labels as its index, so build it directly instead of transposing
# the whole expression matrix.
var_df = pd.DataFrame(index=exp_df.columns)
adata = anndata.AnnData(X=X, obs=anno_df, var=var_df)

# The annotation table carries the 2D embedding in its 'X0' and 'X1' columns
adata.obsm['X_umap'] = adata.obs[['X0', 'X1']].values
adata

In [None]:
# Names identifying the data collection and the dataset within it
data_collection_name = 'Tyser_Embryo_HS_2021'
dataset_name = 'all'

# Organism and identifier type of the gene labels, used as the query side of the gene mapping
query_organism = 'human'
query_var_id_type = 'name'

# Directory of the data collection (this rebinds data_path)
data_path = os.path.join('/home/xingjie/Data/data2/cytofuture/datasets/scRNAseq/', data_collection_name)

adata

In [None]:
# Plot the embedding stored in adata.obsm['X_umap'], colored by the 'cluster_id' annotation
sc.pl.umap(adata, color='cluster_id')

In [None]:
# Prefix the cell IDs with the data collection and dataset names.
# IDs that already carry the prefix are left untouched, so re-running this cell
# does not stack the prefix a second time. IDs are coerced to str so that
# non-string index labels do not break the concatenation.
cell_id_prefix = ':'.join([data_collection_name, dataset_name]) + ':'
adata.obs.index = [
    str(i) if str(i).startswith(cell_id_prefix) else cell_id_prefix + str(i)
    for i in adata.obs.index
]

# Start the standardized object from the same matrix with empty obs/var tables
# (only the indices are kept), then fill in the standard meta-data columns.
adata_std = anndata.AnnData(X=adata.X, obs=adata.obs[[]], var=adata.var[[]])
adata_std.obs['dataset_id'] = ':'.join([data_collection_name, dataset_name])
adata_std.obs['assay'] = 'smart-seq2'
adata_std.obs['batch'] = ':'.join([data_collection_name, dataset_name, '0'])

adata_std.obs['development_stage'] = 'embryo'
adata_std.obs['tissue'] = 'embryo'
# Assigned by index label; adata_std.obs shares its index with adata.obs
adata_std.obs['cell_type'] = adata.obs['cluster_id']

# Map the genes to human gene IDs
adata_std.var['human_gene_id'] = gene_name_mapper.map_gene_names(
    list(adata_std.var.index),
    query_organism, 'human', query_var_id_type, 'id')

In [None]:
# Show the distinct values of the standardized annotation columns
for obs_key in ('cell_type', 'development_stage'):
    print(np.unique(adata_std.obs[obs_key]))

# Show the full object, then the view restricted to genes whose mapped ID is not 'na'
display(adata_std)
has_human_gene_id = adata_std.var['human_gene_id'] != 'na'
display(adata_std[:, has_human_gene_id])

In [None]:
%%time
adata_std.write_h5ad(os.path.join(data_path, f'standard_adata_{data_collection_name}_{dataset_name}.h5ad'),
                compression='gzip')