In [None]:
import h5py
from scipy.sparse import csr_matrix
from scipy.io import mmwrite
import numpy as np
import pandas as pd
import anndata as ad
import scanpy as sc
from tqdm.auto import tqdm

# Base directories used by the cells below. Both end with a slash because
# file paths are built from them by plain f-string concatenation.
# outdir: destination of the human_dev.h5ad download
outdir = '/scratch/xoel/'
# indir: location of the HumanFetalBrainPool.h5 input file
indir = '/projects_ng/SC_DATA/LinnarsonBraun/'

In [2]:
# Source URL and local destination (under indir) of the HumanFetalBrainPool HDF5 file.
data_url = 'https://storage.googleapis.com/linnarsson-lab-human/HumanFetalBrainPool.h5'
data_fname = f"{indir}HumanFetalBrainPool.h5"

# Create the destination directory if it is missing, then download.
# -nc makes wget skip the transfer when the destination file already exists.
# NOTE(review): because -O writes straight to the final path, an interrupted
# download presumably leaves a partial file that -nc will not replace on the
# next run -- delete the file manually if loading it fails.
!mkdir -p $(dirname {data_fname})
!wget -nc -O {data_fname} {data_url}

File ‘/projects_ng/SC_DATA/LinnarsonBraun/HumanFetalBrainPool.h5’ already there; not retrieving.


In [None]:
# Source URL and local destination (under outdir) of the human_dev.h5ad file.
# Note: this overwrites the data_url / data_fname values set for the .h5 download.
data_url = 'https://storage.googleapis.com/linnarsson-lab-human/human_dev.h5ad'
data_fname = f"{outdir}human_dev.h5ad"


# Create the destination directory if it is missing, then download.
# -nc makes wget skip the transfer when the destination file already exists.
!mkdir -p $(dirname {data_fname})
!wget -nc -O {data_fname} {data_url}

File ‘/scratch/xoel/human_dev.h5ad’ already there; not retrieving.


# .h5 load

In [4]:
# Open the pooled HDF5 file read-only. The handle `f` stays open for the
# following cells and is closed explicitly once the data has been read.
filename = f"{indir}HumanFetalBrainPool.h5"
f = h5py.File(filename, mode="r")


In [None]:
# atts: one row per dataset found under the 'shoji' group, holding the dataset
# key ('key') and its shape tuple ('dims'), ordered by shape.
shape_by_key = {key: f['shoji'][key].shape for key in f['shoji'].keys()}
atts = (
    pd.DataFrame([shape_by_key])
    .T
    .reset_index()
    .rename({'index': 'key', 0: 'dims'}, axis=1)
    .sort_values('dims')
)

In [None]:
# dim_names: labels for the dimension sizes known to occur in this file,
# keyed by the size written as a string.
dim_names = {
    '51': 'annotation',
    '617': 'clusters',
    '59480': 'genes',
    '1665937': 'ncells'
}


def _label_dims(shape):
    """Render a shape tuple as text, swapping known sizes for their labels.

    Every dimension is looked up in ``dim_names`` as a whole number, so a size
    that merely contains a known size as a substring (e.g. 5100 vs. 51) is
    left untouched. Unknown sizes are kept as digits.

    Parameters
    ----------
    shape : tuple of int
        Shape of one dataset, as stored in ``atts['dims']``.

    Returns
    -------
    str
        The shape in Python tuple notation, e.g. '(ncells,)' or
        '(ncells, genes)'.
    """
    labels = [dim_names.get(str(size), str(size)) for size in shape]
    if len(labels) == 1:
        # Keep the trailing comma that str() puts on one-element tuples;
        # later cells look groups up by keys such as '(ncells,)'.
        return f'({labels[0]},)'
    return f"({', '.join(labels)})"


atts['dim_names'] = atts['dims'].map(_label_dims)
# att_groups: Dictionary grouping HDF5 keys by dimension names (e.g., 'genes', 'ncells')
att_groups = {k: v.values for k, v in atts.groupby('dim_names')['key']}


In [None]:
# Pull the per-cell and per-gene annotations plus the expression matrix out of
# the open HDF5 file.
# dfs: dictionary of metadata DataFrames, filled with 'CellInfo' and 'GeneInfo'
dfs = dict()


def _decode_object_columns(frame):
    """Return `frame` with every object-dtype column decoded from UTF-8 bytes.

    Columns of any other dtype are passed through unchanged.
    """
    return frame.apply(
        lambda col: col.str.decode("utf-8") if col.dtype == 'O' else col,
        0,
    )


# CellInfo: every dataset whose shape is (ncells,), read fully into memory
CellInfo = {key: f['shoji'][key][()] for key in att_groups['(ncells,)']}
dfs['CellInfo'] = _decode_object_columns(pd.DataFrame(CellInfo))

# GeneInfo: every dataset whose shape is (genes,), read fully into memory
GeneInfo = {key: f['shoji'][key][()] for key in att_groups['(genes,)']}
dfs['GeneInfo'] = _decode_object_columns(pd.DataFrame(GeneInfo))

# Expression: the first (ncells, genes) dataset, converted to CSR.
# NOTE(review): [()] reads the whole dataset as a dense array before the
# sparse conversion, so peak memory is the dense matrix size -- confirm the
# machine has enough RAM or switch to reading in row chunks.
expression_key = att_groups['(ncells, genes)'][0]
Expression = csr_matrix(f['shoji'][expression_key][()])

In [8]:
# Close the HDF5 handle `f`; the metadata tables and the expression matrix
# were already read into memory with [()] in the cell above.
f.close()

# Disease expression

In [None]:
# Input tables for the disease analysis.
disease_info_path = './DiseaseInfo.csv'
disease_gene_path = '/users/genomics/xoel/codebases/gene_disease_asociation/parsed_lists_with_nicola.union.csv'

# dis_info: per-disease metadata. The first CSV column becomes the index and
# the next three columns are dropped by position.
dis_info_all_columns = pd.read_csv(disease_info_path, index_col=0)
dis_info = dis_info_all_columns.iloc[:, 3:]

# diseases: gene (index) x disease (columns) table read from the CSV; a
# truthy entry marks the gene as associated with the disease.
diseases = pd.read_csv(disease_gene_path, index_col=0)
# 'All': 1 for genes whose row sum over the disease columns is positive.
in_any_disease = diseases.sum(1) > 0
diseases['All'] = in_any_disease.astype(int)

# disease_genes: disease name -> list of the genes flagged for that disease
disease_genes = {
    name: diseases.index[flags.astype(bool)].tolist()
    for name, flags in diseases.items()
}

In [None]:
# Annotation values that define the cell subset of interest.
selected_regions = ['Forebrain', 'Telencephalon']
selected_subregions = ['Forebrain', 'Cortex', 'Striatum', 'Telencephalon']

# rg_mask: True for cells that are radial glia AND lie in one of the selected
# regions AND in one of the selected subregions.
cell_info = dfs['CellInfo']
in_region = cell_info['Region'].isin(selected_regions)
in_subregion = cell_info['Subregion'].isin(selected_subregions)
is_radial_glia = cell_info['CellClass'].isin(['Radial glia'])
rg_mask = in_region & in_subregion & is_radial_glia

# adataRG: AnnData restricted to the masked cells; genes are indexed by the
# 'Gene' column and repeated names are disambiguated with a '--' suffix join.
adataRG = ad.AnnData(
    X=Expression[rg_mask, :],
    obs=cell_info.loc[rg_mask, :],
    var=dfs['GeneInfo'].set_index('Gene'),
)
adataRG.var_names_make_unique(join='--')
adataRG

  utils.warn_names_duplicates("var")


AnnData object with n_obs × n_vars = 193379 × 59480
    obs: 'SampleID', 'UnsplicedFraction', 'Age', 'TotalUMIs', 'TopLevelCluster', 'Tissue', 'Subregion', 'Subdivision', 'CellClass', 'CellCycleFraction', 'CellID', 'Sex', 'Chemistry', 'PrevClusters', 'Region', 'Clusters', 'Donor', 'NGenes', 'DoubletFlag', 'MitoFraction', 'DoubletScore', 'DropletClass', 'ValidCells'
    var: 'SelectedFeatures', 'Start', 'StdevExpression', 'GeneTotalUMIs', 'Accession', 'Chromosome', 'GeneNonzeros', 'End', 'ValidGenes'

In [None]:
# Per-gene summary statistics over the selected cells (axis 0 = cells).
rg_counts = adataRG.X
# texp: summed expression of each gene
texp = rg_counts.sum(axis=0)
# pctexp: fraction (0-1) of cells with a non-zero value for each gene
pctexp = (rg_counts > 0).mean(axis=0)
# avgexp: mean expression of each gene
avgexp = rg_counts.mean(axis=0)

In [None]:
# expdata: genes (rows) x statistics (columns) table built from the three
# per-gene vectors computed above.
stat_rows = {
    'Total expression': texp,
    'Percentage expression': pctexp,
    'Average expression': avgexp,
}
expdata = pd.concat(
    [
        pd.DataFrame(values, columns=adataRG.var_names, index=[label])
        for label, values in stat_rows.items()
    ],
    axis=0,
).T


In [35]:
# Display the disease metadata table (dis_info) for inspection.
dis_info

Unnamed: 0,Disease,DiseaseLabel,DiseaseGroup,Dataset
1,All,All,All,in vitro\n(Micali et al)
2,Microcephaly,MIC,Cortical Malformations,in vitro\n(Micali et al)
3,Lissencephaly,LIS,Cortical Malformations,in vitro\n(Micali et al)
4,Cobblestone,COB,Cortical Malformations,in vitro\n(Micali et al)
5,Heterotopia,HET,Cortical Malformations,in vitro\n(Micali et al)
6,Polymicrogyria,POLY,Cortical Malformations,in vitro\n(Micali et al)
7,Hydrocephaly,HYD,Cortical Malformations,in vitro\n(Micali et al)
8,RareMCD,MCD,Cortical Malformations,in vitro\n(Micali et al)
9,FCDandmTOR,FCD & mTOR,Cortical Malformations,in vitro\n(Micali et al)
10,DevDyslexia,DevDys,Cortical Malformations,in vitro\n(Micali et al)


In [None]:
## Using at least 1 count

dis_exp = {}
for disease in diseases.columns:
    
    
    dis_genes = diseases.index[diseases.loc[:, disease].astype(bool)]
    dis_genes_clean = dis_genes[dis_genes.isin(adataRG.var_names)]
    n_exp = (expdata.loc[dis_genes_clean,'Total expression'] > 0).sum()
    
    
    dis_exp[disease] = {
        'n.exp': n_exp,
        'n.present': len(dis_genes_clean),
        'n.NOTpresent': len(dis_genes) - len(dis_genes_clean),
        'Disease': disease
    }

# dis_exp: Dictionary storing expression summary for each disease (using >=1 count threshold)
dis_exp = pd.DataFrame(dis_exp).T
dis_exp = pd.merge(dis_exp, dis_info, on = 'Disease')

output_csv_path_1count = './disease_expression.get1count.Linnarsson.RadialGlia.FT.FCST.csv'
dis_exp.to_csv(output_csv_path_1count)

In [None]:
## Using at least 5 count

dis_exp = {}
for disease in diseases.columns:
    
    
    dis_genes = diseases.index[diseases.loc[:, disease].astype(bool)]
    dis_genes_clean = dis_genes[dis_genes.isin(adataRG.var_names)]
    n_exp = (expdata.loc[dis_genes_clean,'Total expression'] >= 5).sum()
    
    dis_exp[disease] = {
        'n.exp': n_exp,
        'n.present': len(dis_genes_clean),
        'n.NOTpresent': len(dis_genes) - len(dis_genes_clean),
        'Disease': disease
    }

# dis_exp: Dictionary storing expression summary for each disease (using >=5 count threshold)
dis_exp = pd.DataFrame(dis_exp).T
dis_exp = pd.merge(dis_exp, dis_info, on = 'Disease')

output_csv_path_5count = './disease_expression.get5count.Linnarsson.RadialGlia.FT.FCST.csv'
dis_exp.to_csv(output_csv_path_5count)

In [None]:
## Using at least 5 percent of cells

dis_exp = {}
for disease in diseases.columns:
    
    
    dis_genes = diseases.index[diseases.loc[:, disease].astype(bool)]
    dis_genes_clean = dis_genes[dis_genes.isin(adataRG.var_names)]
    n_exp = (expdata.loc[dis_genes_clean,'Percentage expression'] >= .05).sum()
    
    
    dis_exp[disease] = {
        'n.exp': n_exp,
        'n.present': len(dis_genes_clean),
        'n.NOTpresent': len(dis_genes) - len(dis_genes_clean),
        'Disease': disease
    }

# dis_exp: Dictionary storing expression summary for each disease (using >=5% cell threshold)
dis_exp = pd.DataFrame(dis_exp).T
dis_exp = pd.merge(dis_exp, dis_info, on = 'Disease')

output_csv_path_5pct = './disease_expression.get5pct.Linnarsson.RadialGlia.FT.FCST.csv'
dis_exp.to_csv(output_csv_path_5pct)