In [1]:
# Kernel sanity check (presumably left over from setup; prints a fixed greeting).
print("hi")

hi


In [2]:
import os
import scanpy as sc
import pandas as pd



In [3]:
# Directory that contains the dataset.
input_dir = "/projects/bioinformatics/DB/scRNAseq_parkinson"
# Name of the AnnData file stored inside that directory.
h5ad_filename = "dataset.h5ad"
# Full path to the .h5ad file, assembled from the two pieces above.
h5ad_path = os.path.join(input_dir, h5ad_filename)

In [4]:
# Ask the filesystem for the on-disk size of the .h5ad file, in bytes.
size_bytes = os.path.getsize(h5ad_path)

# Scale to gigabytes for readability; the divisor is 1024**3 bytes
# (i.e. binary gigabytes), matching the "GB" label printed below.
bytes_per_gb = 1024 ** 3
size_gb = size_bytes / bytes_per_gb

print(f"Size in gigabytes: {size_gb:.2f} GB")

Size in gigabytes: 28.45 GB


In [6]:
import zarr

### Size issue 
A file size of 28.45 GB is very large for a single-cell RNA-seq .h5ad dataset and will likely pose significant computational challenges for a standard analysis,
especially when using a Transformer-based model like Geneformer.

Memory (RAM): A 28 GB .h5ad file, which typically stores UMI counts as a sparse matrix, requires a substantial amount of RAM once it is loaded and processed. If the data is converted to a dense matrix, or when intermediate processing steps are performed (such as calculating nearest neighbors or running PCA), the memory footprint can easily exceed 100–200 GB. Without access to a high-memory computing environment (e.g., a powerful server or cloud instance), the analysis will likely crash with an out-of-memory error.

Processing Time: Even with sufficient RAM, a dataset of this size (likely involving millions of cells) will result in long processing times for any single-cell workflow, particularly for computationally intensive steps such as training a large neural network model like Geneformer.

In [None]:
import os
import anndata
import shutil

# --- Configuration ---
# Convert the on-disk .h5ad dataset to a Zarr store located next to it.
# The paths are (re)defined here, so this cell does not rely on the
# variables created by the earlier cells.
input_dir = "/projects/bioinformatics/DB/scRNAseq_parkinson"
h5ad_path = os.path.join(input_dir, "dataset.h5ad")
zarr_path = os.path.join(input_dir, "dataset.zarr")  # The new Zarr directory

print(f"File to process: {h5ad_path}")

try:
    # 1. Load the AnnData object in 'backed' mode
    # This reads metadata but keeps the 28 GB expression matrix on disk.
    # NOTE(review): the backed file handle stays open after this cell; call
    # adata_h5ad.file.close() once no later cell needs the object.
    adata_h5ad = anndata.read_h5ad(h5ad_path, backed='r')
    print("Successfully loaded AnnData in memory-safe (backed) mode.")
    print(f"Data dimensions (Cells, Genes): {adata_h5ad.shape}")

    # 2. Cleanup: Remove existing Zarr directory to ensure a clean start
    if os.path.exists(zarr_path):
        print(f"Removing existing Zarr directory at {zarr_path}")
        shutil.rmtree(zarr_path)

    # 3. Write to Zarr
    # The target was removed in step 2, so no 'overwrite' flag is needed.
    # Writing exactly once (instead of retrying on TypeError) ensures that an
    # unrelated TypeError raised mid-write cannot silently restart the whole
    # conversion on top of a partially written store.
    # NOTE(review): confirm how anndata interprets chunks=True for a backed
    # matrix before relying on the resulting chunk layout.
    print(f"\nStarting conversion to Zarr at: {zarr_path}...")
    adata_h5ad.write_zarr(zarr_path, chunks=True)

    print("Conversion complete! The scalable Zarr format has been created.")

except Exception as e:
    # Report a short summary, then re-raise so the full traceback is shown
    # and "Run All" stops here instead of continuing without a valid store.
    print(f"An error occurred during loading or conversion: {e}")
    raise

File to process: /projects/bioinformatics/DB/scRNAseq_parkinson/dataset.h5ad
Successfully loaded AnnData in memory-safe (backed) mode.
Data dimensions (Cells, Genes): (2096155, 17267)

Starting conversion to Zarr at: /projects/bioinformatics/DB/scRNAseq_parkinson/dataset.zarr...
