In [1]:
# Load IPython's autoreload extension. Mode 2 asks IPython to reload all imported
# modules before running each cell (presumably so edits to the scaleflow sources
# are picked up without restarting the kernel).
%load_ext autoreload
%autoreload 2

In [2]:
from scaleflow.data._datamanager_new import DataManager
from scaleflow.data._anndata_location import AnnDataLocation
from scaleflow.data._data import GroupedDistribution
from pathlib import Path
import anndata as ad
import h5py
from anndata.experimental import read_lazy

In [3]:
# Input .h5ad file; it is opened read-only with h5py when the AnnData is built.
# NOTE(review): absolute, cluster-specific path — the notebook only runs where it is mounted.
DATA_PATH = Path("/lustre/groups/ml01/projects/big_perturbation/datasets/nadig_jurkat.h5ad")
# Destination handed to gd.write_zarr and read back with GroupedDistribution.read_zarr.
OUTPUT_PATH = Path("/lustre/groups/ml01/workspace/100mil/nadig_jurkat.zarr")

In [4]:
# Assemble an AnnData from just three groups of the h5ad file: obs and uns are
# read with ad.io.read_elem, while obsm is passed through read_lazy.
with h5py.File(DATA_PATH, mode="r") as f:
    adata = ad.AnnData(
        obs=ad.io.read_elem(f["obs"]),
        obsm=read_lazy(f["obsm"]),
        uns=ad.io.read_elem(f["uns"]),
    )

# preparing data to be compatible with DataManager
gene_embeddings = adata.uns["gene_embeddings"]  # alias: writes below land in adata.uns

# Every pert_target value that has no entry in gene_embeddings is given the
# embedding stored under 'AAAS' as a stand-in; all of them share that one object.
# NOTE(review): 'AAAS' looks like an arbitrary donor gene — confirm this is intended.
dum = gene_embeddings["AAAS"]
missing_genes = set(adata.obs["pert_target"].unique()).difference(gene_embeddings)
gene_embeddings.update(dict.fromkeys(missing_genes, dum))

# Boolean obs column: True where the cell has zero perturbations.
adata.obs["control"] = adata.obs["nperts"].eq(0)


In [5]:
adl = AnnDataLocation()

# Locator for obsm['X_pca'] restricted by the slice [:, :50]
# (presumably the first 50 PCA components — confirm against AnnDataLocation).
pca_location = adl.obsm['X_pca'][:, :50]

# 'control' is the boolean obs column used as the distribution flag; cell_line
# keys the source side and pert_target the target side. rep_keys maps each of
# those obs columns to the uns entry holding its embeddings.
dm = DataManager(
    dist_flag_key="control",
    src_dist_keys=["cell_line"],
    tgt_dist_keys=["pert_target"],
    rep_keys=dict(
        cell_line='cell_line_embeddings',
        pert_target='gene_embeddings',
    ),
    data_location=pca_location,
)

# Produce the grouped data object that is written to zarr further down.
gd = dm.prepare_data(adata=adata)

In [6]:
# Chunk and shard sizes forwarded to gd.write_zarr.
chunk_size = 128 * 1024        # 131072
shard_size = 8 * chunk_size    # 1048576: eight times chunk_size

In [7]:
# Write the prepared data to OUTPUT_PATH; the progress output shows
# /data/src_data, /data/tgt_data and /data/conditions being written.
# NOTE(review): max_workers=14 is presumably the parallel writer count, tuned to
# the machine this ran on — confirm against write_zarr before reusing elsewhere.
gd.write_zarr(
    path=OUTPUT_PATH, chunk_size=chunk_size, shard_size=shard_size, max_workers=14
)

Writing /data/src_data: 100%|██████████| 1/1 [00:00<00:00, 12.83it/s]
Writing /data/tgt_data: 100%|██████████| 2394/2394 [00:06<00:00, 387.59it/s]
Writing /data/conditions: 100%|██████████| 2393/2393 [00:00<00:00, 47371.90it/s]


In [8]:
# Load the store back from OUTPUT_PATH. This rebinds `gd`: from here on it is the
# object read from disk, not the one returned by dm.prepare_data.
gd = GroupedDistribution.read_zarr(OUTPUT_PATH)

In [11]:
# Display the conditions entry at index/key 0 of the reloaded data; the saved
# output is a dict whose 'cell_line' key holds a float array.
gd.data.conditions[0]

{'cell_line': array([ 1.66789598e+00, -6.56532124e-01, -1.00463417e+00, -1.19014874e+00,
        -8.53267474e-01,  3.32409789e-01, -2.97095274e-02, -4.95185602e-01,
        -3.54348622e-01,  9.78880380e-01,  4.14514446e-01,  7.03965328e-01,
        -4.75822789e-01, -4.75968444e-01, -1.93356999e+00,  6.52297708e-01,
        -8.92273607e-01,  1.95919096e+00,  6.31954716e-01,  3.35955476e-03,
         9.87745313e-01, -7.73558064e-02,  2.56089784e-01,  3.04381787e+00,
         2.91866682e+00, -2.53171483e+00, -2.33287315e+00, -8.80268042e-01,
        -9.74720532e-01,  1.70958069e+00,  1.54763252e+00,  1.19906683e+00,
         4.17385465e-01, -1.88251178e+00, -5.61469324e-01, -1.56020347e+00,
         1.05911435e+00, -2.37674126e+00, -1.28641138e+00, -5.67946763e-01,
         2.68989785e-01, -1.68478377e+00,  5.78521147e-01,  9.07055194e-01,
        -5.67860358e-01, -1.86321523e-01, -3.52974710e-01,  1.16166815e+00,
        -1.20251333e+00, -3.27250885e-01,  9.68835748e-01,  3.70503538e-01,