In [1]:
import anndata as ad
from pathlib import Path
from scipy.sparse import csr_array
import numpy as np
from tqdm.auto import tqdm

In [2]:
import gc

In [3]:
obs_map = {
    "Age": "age",
    "CellCycle": "cell_cycle",
    "CellID": "cell_id",
    "Chemistry": "chemistry",
    "Clusters": "clusters",
    "Donor": "donor",
    "NGenes": "n_genes",
    "ROIGroupCoarse": "roi_group_coarse",
    "ROIGroupFine": "roi_group_fine",
    "Roi": "roi",
    "SampleID": "sample_id",
    "Sex": "sex",
    "Subclusters": "subclusters",
    "Tissue": "tissue",
    "TotalUMI": "total_umi",
}

var_map = {
    "Accession": "accession",
    "Gene": "gene",
    "Valid": "valid",
}

In [4]:
adatas_paths = list(
    Path("/project/genomics/ayshan/ldsc_analysis/data_2/Single_cell_Siletti_Duncan/h5ad_chunks_new").glob("*.h5ad")
)


def read_h5ad(apath):
    adata = ad.read_h5ad(apath)
    adata.X = csr_array(adata.X.astype(np.uint16, copy=False))
    adata.obs = adata.obs.rename(columns=obs_map)
    adata.var = adata.var.rename(columns=var_map)
    adata.obs["chunk"] = apath.stem
    adata.obs = adata.obs.set_index("cell_id")
    return adata


adata = read_h5ad(adatas_paths[0])
adata.write_h5ad("/lustre/groups/shared/scgenetics/single_cell_siletti_duncan_single_chunk.h5ad")
adata

AnnData object with n_obs × n_vars = 50000 × 59480
    obs: 'age', 'cell_cycle', 'chemistry', 'clusters', 'donor', 'n_genes', 'roi_group_coarse', 'roi_group_fine', 'roi', 'sample_id', 'sex', 'subclusters', 'tissue', 'total_umi', 'chunk'
    var: 'accession', 'gene', 'valid'

In [None]:
for apath in tqdm(adatas_paths[1:]):
    try:
        adata2 = read_h5ad(apath)
        adata = ad.concat([adata, adata2], join="outer", merge="same")
        gc.collect()
    except Exception as e:
        print(f"Error merging {apath}")
        continue
adata

  0%|          | 0/67 [00:00<?, ?it/s]

Error merging /project/genomics/ayshan/ldsc_analysis/data_2/Single_cell_Siletti_Duncan/h5ad_chunks_new/adata_chunk_3250000_3300000.h5ad
Error merging /project/genomics/ayshan/ldsc_analysis/data_2/Single_cell_Siletti_Duncan/h5ad_chunks_new/adata_chunk_1050000_1100000.h5ad
Error merging /project/genomics/ayshan/ldsc_analysis/data_2/Single_cell_Siletti_Duncan/h5ad_chunks_new/adata_chunk_2150000_2200000.h5ad


In [None]:
adata

AnnData object with n_obs × n_vars = 1150000 × 59480
    obs: 'age', 'cell_cycle', 'chemistry', 'clusters', 'donor', 'n_genes', 'roi_group_coarse', 'roi_group_fine', 'roi', 'sample_id', 'sex', 'subclusters', 'tissue', 'total_umi', 'chunk'
    var: 'accession', 'gene', 'valid'

In [None]:
adata.write_h5ad("/lustre/groups/shared/scgenetics/single_cell_siletti_duncan_merged.h5ad")

KeyboardInterrupt: 