# Preliminaries

## Import packages

In [1]:
# import standard packages
from pathlib import Path
from scipy.io import mmwrite
import numpy as np
import pandas as pd 

import sys

# import single-cell packages
import scanpy as sc
import scvelo as scv

# set the logging verbosity of scanpy and scvelo
# NOTE(review): presumably higher values produce more log output — confirm the
# meaning of each level in the scanpy / scvelo settings documentation
sc.settings.verbosity = 2
scv.settings.verbosity = 3 

## Set up paths

In [3]:
# Put the directory three levels up at the front of the module search path so that
# the `paths` module imported below can be found. This must run before the import.
# NOTE(review): the relative entry is presumably resolved against the kernel's
# working directory — confirm the notebook is started from its own folder.
sys.path.insert(0, "../../..")  # this depends on the notebook depth and must be adapted per notebook

# NOTE(review): `paths` is presumably a project-local module in that directory — confirm
from paths import DATA_DIR

## Load the data

In [9]:
# Load the AnnData object from the `morris_data` folder.
# It is expected to have the same number of cells as adata_preprocessed.h5ad.
# NOTE(review): the recorded cell output reports n_obs = 104679, whereas this comment
# previously stated 104678 — confirm the count against adata_preprocessed.h5ad.
adata = sc.read(DATA_DIR / "morris_data" / "adata.h5ad")
adata

AnnData object with n_obs × n_vars = 104679 × 22630
    obs: 'batch'
    layers: 'spliced', 'unspliced'

## Write annotations and the sparse raw data for FateID

In [10]:
# Export the inputs for FateID: `adata.X` (transposed) in Matrix Market format, plus
# the observation names and variable names as CSV files.
morris_dir = DATA_DIR / "morris_data"
annotations_dir = morris_dir / "annotations"
# `to_csv` does not create missing parent directories, so make sure the target
# folder exists before writing (no-op if it is already there)
annotations_dir.mkdir(parents=True, exist_ok=True)

mmwrite(morris_dir / "raw.mtx", adata.X.T)  # write genes x cells
# the DataFrame index and header are written as well (pandas defaults are kept)
pd.DataFrame(adata.obs_names).to_csv(annotations_dir / "obs_names.csv")
pd.DataFrame(adata.var_names).to_csv(annotations_dir / "genes.csv")

## Create random dataset subsets

### Set seed and the number of splits

Subsample the dataset to 10k, 20k, ..., 100k cells; for each subset size, draw 10 different random subsets (splits) of cell indices without replacement.

In [11]:
# seed NumPy's global random number generator so that the random draws are repeatable
np.random.seed(42)
# number of random subsets drawn for each subset size
n_splits = 10

### Write the splits

In [12]:
# For every subset size, draw `n_splits` random subsets of positional cell indices
# and store them in one CSV file per size (one column per split).
splits_dir = DATA_DIR / "morris_data" / "splits"
# `to_csv` does not create missing parent directories, so make sure the target
# folder exists before writing (no-op if it is already there)
splits_dir.mkdir(parents=True, exist_ok=True)

for size in range(10_000, 100_001, 10_000):
    # sample without replacement; the calls and their order are unchanged, so a
    # seeded global NumPy RNG yields the same indices as before
    sampled = [np.random.choice(np.arange(adata.n_obs), size=size, replace=False)
               for _ in range(n_splits)]
    # transpose so that each split becomes a column of length `size`
    pd.DataFrame(sampled).T.to_csv(splits_dir / f'size_{size}.csv')