In [1]:
import scanpy as sc
import pandas as pd
import numpy as np
from collections import defaultdict
from scipy.io import mmread
from scipy.sparse import csr_matrix

import gc

from matplotlib import rcParams

In [138]:
# Directory layout used throughout the notebook (all below one data root).
path_data = '/net/bmc-lab6/data/lab/kellis/users/khrovati/data/'
# Input: mouse dataset read in the "Mouse" section.
path_mm = path_data + 'datasets/mouse_brain_devel_genus/'
# Input: human dataset read in the "Human" section.
path_hs = path_data + 'datasets/human_brain_organoid_DS/'
# Input: gene annotation tables (orthologue table is read from here).
path_genes = path_data + 'gene_info/'
# Output: where the combined mouse+human object is written.
path_save = path_data + 'cross_species_prediction/brain_mmEmbryo_hsOrganoid/'

In [3]:
# Mouse -> human orthologue table. Columns are renamed to short,
# species-suffixed names: gs_* = gene symbol, eid_* = gene stable ID
# (mm = mouse, hs = human).
orthology_info = pd.read_table(
    path_genes + 'orthologues_ORGmus_musculus_ORG2homo_sapiens_V109.tsv'
).rename(columns={
    'Gene name': 'gs_mm',
    'Human gene name': 'gs_hs',
    'Gene stable ID': 'eid_mm',
    'Human gene stable ID': 'eid_hs',
})

In [4]:
# One-to-one orthologues: discard every row whose mouse gene ID or human gene
# symbol appears more than once in the table (keep=False flags all copies).
# Human genes are identified by symbol here because EIDs are not available
# (presumably in the human expression data - confirm).
dup_mm = orthology_info.duplicated('eid_mm', keep=False).values
dup_hs = orthology_info.duplicated('gs_hs', keep=False).values
oto_orthologues = orthology_info[~(dup_mm | dup_hs)]

In [5]:
# Inspect the filtered one-to-one orthologue table.
oto_orthologues

Unnamed: 0,eid_mm,gs_mm,eid_hs,gs_hs
0,ENSMUSG00000064341,mt-Nd1,ENSG00000198888,MT-ND1
1,ENSMUSG00000064345,mt-Nd2,ENSG00000198763,MT-ND2
2,ENSMUSG00000064351,mt-Co1,ENSG00000198804,MT-CO1
3,ENSMUSG00000064354,mt-Co2,ENSG00000198712,MT-CO2
4,ENSMUSG00000064356,mt-Atp8,ENSG00000228253,MT-ATP8
...,...,...,...,...
25713,ENSMUSG00000027133,Nop10,ENSG00000182117,NOP10
25714,ENSMUSG00000027596,a,ENSG00000101440,ASIP
25716,ENSMUSG00000027454,Gins1,ENSG00000101003,GINS1
25717,ENSMUSG00000068115,Ninl,ENSG00000101004,NINL


In [53]:
# Are mouse gene symbols unique across the one-to-one table?
# (The filter above used mouse gene IDs, not symbols, so this is not guaranteed.)
oto_orthologues['gs_mm'].nunique()==oto_orthologues.shape[0]

False

In [56]:
# Most frequent mouse symbols - reveals which symbols are repeated.
oto_orthologues['gs_mm'].value_counts().head(3)

Gm26457    2
mt-Nd1     1
Bpgm       1
Name: gs_mm, dtype: int64

In [54]:
# Are human gene symbols unique across the one-to-one table?
oto_orthologues['gs_hs'].nunique()==oto_orthologues.shape[0]

True

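C: Mouse gene symbols are not unique among the one-to-one orthologues (e.g. Gm26457 appears twice). This is left unresolved for now, as the duplicates may drop out after HVG selection.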

## Mouse

In [120]:
# Load the mouse dataset.
adata=sc.read(path_mm+'filtered_celltype_subset_labelled.h5ad')

In [121]:
# (n_obs, n_vars) of the loaded mouse data.
adata.shape

(23376, 20478)

In [122]:
# Normalise the mouse data.
# First keep a copy of the current X in the 'counts' layer (presumably raw
# counts - confirm), then rescale every cell to a total of 1e4 and
# log1p-transform X in place.
adata.layers['counts'] = adata.X.copy()
sc.pp.normalize_total(adata, target_sum=1e4)
sc.pp.log1p(adata)

In [123]:
# Subset to mouse genes whose symbol is present in the one-to-one orthologue table.
# An explicit .copy() is taken because slicing an AnnData only yields a view;
# the in-place steps that follow (HVG subsetting, clearing uns/obsm/var/obs)
# would otherwise force an implicit view-to-copy conversion with a warning.
oto = set(oto_orthologues['gs_mm'])
adata = adata[:, adata.var_names.isin(oto)].copy()
adata.shape

(23376, 14168)

In [124]:
# Keep only the top highly variable genes; samples (obs column 'ID') are
# treated as batches. subset=True drops all other genes from adata in place.
n_hvg = 4000
sc.pp.highly_variable_genes(
    adata,
    n_top_genes=n_hvg,
    flavor='cell_ranger',
    batch_key='ID',
    subset=True,
)

  self.data[key] = value


In [125]:
# Strip the mouse object down to a harmonised set of obs columns.
# Drop unstructured, embedding and gene-level annotations.
del adata.uns, adata.obsm, adata.var
# Keep the original cell metadata aside, then clear obs on the object so that
# only the columns re-added below remain.
obs = adata.obs
del adata.obs
# new (harmonised) column name -> original mouse column name
for new_col, old_col in (
    ('cell_type', 'celltype'),
    ('condition', 'Genotype_Treatment'),
    ('sample', 'ID'),
):
    adata.obs[new_col] = obs[old_col]
# system flag: 0 marks mouse cells (human cells get 1 further below).
adata.obs['system'] = 0

In [126]:
# Inspect the processed mouse object.
adata

AnnData object with n_obs × n_vars = 23376 × 4000
    obs: 'cell_type', 'condition', 'sample', 'system'
    layers: 'counts'

In [127]:
# Keep the processed mouse object under its own name; `adata` is rebound to
# the human data in the next section. This is a reference, not a copy.
adata_mm=adata

## Human

In [128]:
# Load the human dataset (rebinds `adata`; the mouse data lives on in adata_mm).
adata=sc.read(path_hs+'all_combined_cycling.h5ad')

In [129]:
# (n_obs, n_vars) of the loaded human data.
adata.shape

(49320, 15706)

In [130]:
# Normalise the human data, same recipe as for mouse.
# First keep a copy of the current X in the 'counts' layer (presumably raw
# counts - confirm), then rescale every cell to a total of 1e4 and
# log1p-transform X in place.
adata.layers['counts'] = adata.X.copy()
sc.pp.normalize_total(adata, target_sum=1e4)
sc.pp.log1p(adata)

In [131]:
# Subset to human genes with a one-to-one mouse orthologue and rename them to
# the corresponding mouse symbols, so both species share one gene namespace.
# An explicit .copy() is taken because slicing an AnnData only yields a view,
# and var_names is reassigned right below.
oto = set(oto_orthologues['gs_hs'])
adata = adata[:, adata.var_names.isin(oto)].copy()
# Human symbol -> mouse symbol look-up. Built as a local Series rather than by
# overwriting oto_orthologues.index, so the shared orthologue table is not
# mutated by this cell. gs_hs is unique in the table (checked above).
hs_to_mm = oto_orthologues.set_index('gs_hs')['gs_mm']
# NOTE: mouse symbols are not guaranteed to be unique (see the gs_mm check
# above), so the renamed var_names may contain duplicates at this point.
adata.var_names = hs_to_mm.loc[adata.var_names]
adata.shape

(49320, 12341)

In [132]:
# Keep only the top highly variable genes; samples (obs column 'Sample') are
# treated as batches. subset=True drops all other genes from adata in place.
n_hvg = 4000
sc.pp.highly_variable_genes(
    adata,
    n_top_genes=n_hvg,
    flavor='cell_ranger',
    batch_key='Sample',
    subset=True,
)

In [133]:
# Strip the human object down to a harmonised set of obs columns.
# Drop unstructured, embedding and gene-level annotations.
del adata.uns, adata.obsm, adata.var
# Keep the original cell metadata aside, then clear obs on the object so that
# only the columns re-added below remain.
obs = adata.obs
del adata.obs
# new (harmonised) column name -> original human column name
for new_col, old_col in (
    ('cell_type', 'LineComp'),
    ('cell_type_fine', 'FullLineage'),
):
    adata.obs[new_col] = obs[old_col]
# condition = "<Genotype>_<Time>", built row by row from the original metadata.
adata.obs['condition'] = obs.apply(
    lambda row: str(row['Genotype']) + '_' + str(row['Time']), axis=1)
adata.obs['sample'] = obs['Sample']
# system flag: 1 marks human cells (mouse cells were given 0).
adata.obs['system'] = 1

In [134]:
# Inspect the processed human object.
adata

AnnData object with n_obs × n_vars = 49320 × 4000
    obs: 'cell_type', 'cell_type_fine', 'condition', 'sample', 'system'
    layers: 'counts'

In [135]:
# Keep the processed human object under its own name; `adata` is rebound to
# the combined data in the next section. This is a reference, not a copy.
adata_hs=adata

## Combine adatas

In [136]:
# Genes that survived HVG selection in both species (human genes already carry
# mouse symbols). Sorted so that the gene order of the combined object is
# deterministic: iterating a set of strings can give a different order in each
# Python session, which would change the var order of the saved file.
genes = sorted(set(adata_mm.var_names) & set(adata_hs.var_names))
# Stack mouse and human cells over the shared genes. obs columns present in
# only one species (e.g. cell_type_fine) are kept.
adata = sc.concat([adata_mm[:, genes], adata_hs[:, genes]], join='outer')

In [137]:
# Inspect the combined mouse+human object.
adata

AnnData object with n_obs × n_vars = 72696 × 1347
    obs: 'cell_type', 'condition', 'sample', 'system', 'cell_type_fine'
    layers: 'counts'

In [139]:
# Save the combined mouse+human object to the output directory.
adata.write(path_save+'combined_orthologuesHVG.h5ad')