In [1]:
# NOTE(review): `x` is not referenced anywhere else in the visible notebook;
# this looks like a leftover scratch cell.
x = 1

In [None]:
# env muon
import scanpy as sc
import pandas as pd
import muon
import numpy as np
import anndata as ad

In [3]:
def make_nested_train_test_splits(
    adata,
    train_fracs=(0.75, 0.5, 0.25),
    seed=0
):
    """
    Create nested train/test splits of the cells in an AnnData object.

    One random permutation of the cells is drawn, and the train set for every
    fraction is a prefix of that permutation. The train set of a smaller
    fraction is therefore always a subset of the train set of a larger one.

    Parameters
    ----------
    adata
        Object exposing ``n_obs``, ``copy()`` and an ``obs`` table (an AnnData
        in this notebook). The input itself is not modified; each split is
        written to a fresh copy.
    train_fracs : iterable of float
        Fractions of cells to label ``'train'``. Each must lie in [0, 1].
        The number of train cells is ``floor(frac * n_obs)``.
    seed : int
        Seed passed to ``np.random.default_rng``.

    Returns
    -------
    dict
        Keys are train fractions (inserted largest first), values are copies
        of ``adata`` with obs['split'] = 'train' or 'test'.

    Raises
    ------
    ValueError
        If any fraction lies outside [0, 1] (NaN included).
    """
    fracs = sorted(train_fracs, reverse=True)

    # Validate before any (potentially expensive) copy is made. Without this,
    # a negative fraction becomes a negative slice bound and silently labels
    # "all but the last k" cells as train, and a fraction above 1 silently
    # labels every cell as train.
    bad_fracs = [frac for frac in fracs if not 0 <= frac <= 1]
    if bad_fracs:
        raise ValueError(
            f"train_fracs must all lie in [0, 1]; got {bad_fracs}"
        )

    rng = np.random.default_rng(seed)

    n_cells = adata.n_obs
    all_idx = np.arange(n_cells)

    # global random ordering -> ensures nesting
    permuted_idx = rng.permutation(all_idx)

    out = {}

    for frac in fracs:
        n_train = int(np.floor(frac * n_cells))
        train_idx = permuted_idx[:n_train]

        # Everything defaults to "test"; the permutation prefix becomes "train".
        split = np.full(n_cells, "test", dtype=object)
        split[train_idx] = "train"

        adata_split = adata.copy()
        adata_split.obs["split"] = split

        out[frac] = adata_split

    return out


## cite

In [4]:
# Read the processed CITE BMMC object from disk.
adata = sc.read('/lustre/groups/ml01/projects/2022_multigrate_anastasia.litinetskaya/snakemake/trimodal/data/raw/GSE194122_openproblems_neurips2021_cite_BMMC_processed.h5ad')

# Take one independent copy per feature type (GEX first, then ADT), then drop
# the reference to the combined object.
adata_gene, adata_protein = (
    adata[:, adata.var['feature_types'] == feature_type].copy()
    for feature_type in ('GEX', 'ADT')
)
del adata

  utils.warn_names_duplicates("var")


In [5]:
# Reset X from the 'counts' layer, then normalise each cell to a total of 1e4
# and apply log1p.
adata_gene.X = adata_gene.layers['counts'].copy()
sc.pp.normalize_total(adata_gene, target_sum=1e4)
sc.pp.log1p(adata_gene)

# The feature list is a header-less one-column text file; read it as strings.
hvf_mod1 = (
    pd.read_csv('/lustre/groups/ml01/code/anastasia.litinetskaya/multigrate_reproducibility/imputation/pipeline/data/input/cite/hvf_mod1.txt', header=None)[0]
    .astype(str)
    .tolist()
)

# Keep only the listed features, in the order given by the file.
adata_gene = adata_gene[:, hvf_mod1].copy()

In [6]:
# Reset X to a copy of the 'counts' layer so the transform below starts from it.
adata_protein.X = adata_protein.layers['counts'].copy()
# muon's CLR transform for protein data. The return value is not used, so it
# presumably modifies adata_protein in place -- confirm against the muon docs.
# The recorded warning shows X is converted to CSC format during this call.
muon.prot.pp.clr(adata_protein)

  warn("adata.X is sparse but not in CSC format. Converting to CSC.")


In [7]:
# One seeded call produces every fraction; pull the three copies out by key.
splits = make_nested_train_test_splits(adata_gene, seed=0)

adata_gene_75, adata_gene_50, adata_gene_25 = (
    splits[frac] for frac in (0.75, 0.5, 0.25)
)

In [8]:
# Cells per split label for the 0.5 train fraction.
adata_gene_50.obs['split'].value_counts()

split
test     45131
train    45130
Name: count, dtype: int64

In [9]:
# Cells per split label for the 0.25 train fraction.
adata_gene_25.obs['split'].value_counts()

split
test     67696
train    22565
Name: count, dtype: int64

In [10]:
# Cells per split label for the 0.75 train fraction.
adata_gene_75.obs['split'].value_counts()

split
train    67695
test     22566
Name: count, dtype: int64

In [11]:
# Number of positions where protein and gene (25% split) cell names differ.
(adata_protein.obs_names != adata_gene_25.obs_names).sum()

np.int64(0)

In [12]:
# Number of positions where protein and gene (50% split) cell names differ.
(adata_protein.obs_names != adata_gene_50.obs_names).sum()

np.int64(0)

In [13]:
# Number of positions where protein and gene (75% split) cell names differ.
(adata_protein.obs_names != adata_gene_75.obs_names).sum()

np.int64(0)

In [14]:
# Protein counterpart of adata_gene_25: a fresh copy carrying the same labels.
adata_protein_25 = adata_protein.copy()
# Assigning a Series aligns on the obs index; the obs_names comparison run
# earlier in this notebook reported 0 mismatches for this pair.
adata_protein_25.obs['split'] = adata_gene_25.obs['split'].copy()

In [15]:
# Protein counterpart of adata_gene_50: a fresh copy carrying the same labels.
adata_protein_50 = adata_protein.copy()
# Assigning a Series aligns on the obs index; the obs_names comparison run
# earlier in this notebook reported 0 mismatches for this pair.
adata_protein_50.obs['split'] = adata_gene_50.obs['split'].copy()

In [16]:
# Protein counterpart of adata_gene_75: a fresh copy carrying the same labels.
adata_protein_75 = adata_protein.copy()
# Assigning a Series aligns on the obs index; the obs_names comparison run
# earlier in this notebook reported 0 mismatches for this pair.
adata_protein_75.obs['split'] = adata_gene_75.obs['split'].copy()

In [17]:
# Persist every CITE split object; the pairs are listed in the order they are
# written (gene 25/50/75, then protein 25/50/75).
for adata_to_save, target_file in (
    (adata_gene_25, '/lustre/groups/ml01/code/anastasia.litinetskaya/multigrate_reproducibility/imputation/pipeline/data/input/cite/adata_gene_25.h5ad'),
    (adata_gene_50, '/lustre/groups/ml01/code/anastasia.litinetskaya/multigrate_reproducibility/imputation/pipeline/data/input/cite/adata_gene_50.h5ad'),
    (adata_gene_75, '/lustre/groups/ml01/code/anastasia.litinetskaya/multigrate_reproducibility/imputation/pipeline/data/input/cite/adata_gene_75.h5ad'),
    (adata_protein_25, '/lustre/groups/ml01/code/anastasia.litinetskaya/multigrate_reproducibility/imputation/pipeline/data/input/cite/adata_protein_25.h5ad'),
    (adata_protein_50, '/lustre/groups/ml01/code/anastasia.litinetskaya/multigrate_reproducibility/imputation/pipeline/data/input/cite/adata_protein_50.h5ad'),
    (adata_protein_75, '/lustre/groups/ml01/code/anastasia.litinetskaya/multigrate_reproducibility/imputation/pipeline/data/input/cite/adata_protein_75.h5ad'),
):
    adata_to_save.write(target_file)

## multiome

In [4]:
# Read the processed multiome BMMC object from disk.
adata = sc.read('/lustre/groups/ml01/projects/2022_multigrate_anastasia.litinetskaya/snakemake/trimodal/data/raw/GSE194122_openproblems_neurips2021_multiome_BMMC_processed.h5ad')

# Take one independent copy per feature type (GEX first, then ATAC), then drop
# the reference to the combined object.
adata_gene, adata_atac = (
    adata[:, adata.var['feature_types'] == feature_type].copy()
    for feature_type in ('GEX', 'ATAC')
)
del adata

In [5]:
# Reset X from the 'counts' layer, then normalise each cell to a total of 1e4
# and apply log1p.
adata_gene.X = adata_gene.layers['counts'].copy()
sc.pp.normalize_total(adata_gene, target_sum=1e4)
sc.pp.log1p(adata_gene)

# The feature list is a header-less one-column text file; read it as strings.
hvf_mod1 = (
    pd.read_csv('/lustre/groups/ml01/code/anastasia.litinetskaya/multigrate_reproducibility/imputation/pipeline/data/input/multiome/hvf_mod1.txt', header=None)[0]
    .astype(str)
    .tolist()
)

# Keep only the listed features, in the order given by the file.
adata_gene = adata_gene[:, hvf_mod1].copy()

In [6]:
# Same recipe as for the gene object: reset X from the 'counts' layer,
# normalise each cell to a total of 1e4 and apply log1p.
adata_atac.X = adata_atac.layers['counts'].copy()
sc.pp.normalize_total(adata_atac, target_sum=1e4)
sc.pp.log1p(adata_atac)

# The feature list is a header-less one-column text file; read it as strings.
hvf_mod2 = (
    pd.read_csv('/lustre/groups/ml01/code/anastasia.litinetskaya/multigrate_reproducibility/imputation/pipeline/data/input/multiome/hvf_mod2.txt', header=None)[0]
    .astype(str)
    .tolist()
)

# Keep only the listed features, in the order given by the file.
adata_atac = adata_atac[:, hvf_mod2].copy()

In [7]:
# One seeded call produces every fraction; pull the three copies out by key.
splits = make_nested_train_test_splits(adata_gene, seed=0)

adata_gene_75, adata_gene_50, adata_gene_25 = (
    splits[frac] for frac in (0.75, 0.5, 0.25)
)

In [8]:
# Three independent copies of the ATAC object, one per train fraction.
adata_atac_25, adata_atac_50, adata_atac_75 = (
    adata_atac.copy() for _ in range(3)
)

# Carry the split labels over from the gene object of the matching fraction.
# NOTE(review): assigning a Series aligns on the obs index, so this relies on
# the ATAC and gene objects sharing obs_names; that is not checked in this
# section.
adata_atac_25.obs['split'] = adata_gene_25.obs['split'].copy()
adata_atac_50.obs['split'] = adata_gene_50.obs['split'].copy()
adata_atac_75.obs['split'] = adata_gene_75.obs['split'].copy()

In [9]:
# Persist every multiome split object; the pairs are listed in the order they
# are written (gene 75/50/25, then ATAC 25/50/75).
for adata_to_save, target_file in (
    (adata_gene_75, '/lustre/groups/ml01/code/anastasia.litinetskaya/multigrate_reproducibility/imputation/pipeline/data/input/multiome/adata_gene_75.h5ad'),
    (adata_gene_50, '/lustre/groups/ml01/code/anastasia.litinetskaya/multigrate_reproducibility/imputation/pipeline/data/input/multiome/adata_gene_50.h5ad'),
    (adata_gene_25, '/lustre/groups/ml01/code/anastasia.litinetskaya/multigrate_reproducibility/imputation/pipeline/data/input/multiome/adata_gene_25.h5ad'),
    (adata_atac_25, '/lustre/groups/ml01/code/anastasia.litinetskaya/multigrate_reproducibility/imputation/pipeline/data/input/multiome/adata_atac_25.h5ad'),
    (adata_atac_50, '/lustre/groups/ml01/code/anastasia.litinetskaya/multigrate_reproducibility/imputation/pipeline/data/input/multiome/adata_atac_50.h5ad'),
    (adata_atac_75, '/lustre/groups/ml01/code/anastasia.litinetskaya/multigrate_reproducibility/imputation/pipeline/data/input/multiome/adata_atac_75.h5ad'),
):
    adata_to_save.write_h5ad(target_file)

In [None]:
# Re-read each written multiome gene file and report its split sizes.
props = ['25', '50', '75']
base_path = '/lustre/groups/ml01/code/anastasia.litinetskaya/multigrate_reproducibility/imputation/pipeline/data/input/multiome/adata_gene_{}.h5ad'
for path in map(base_path.format, props):
    adata = sc.read_h5ad(path)
    # One print with a newline separator emits the path line followed by the
    # counts, exactly like two consecutive prints.
    print(path, adata.obs['split'].value_counts(), sep='\n')