# Preliminaries

## Import packages

In [8]:
# import standard packages
from pathlib import Path
import numpy as np
import pandas as pd 

import os
import sys

# import single-cell packages
import scanpy as sc
import scvelo as scv

# configure how much each package logs (scVelo: level 3, scanpy: level 2)
scv.settings.verbosity = 3
sc.settings.verbosity = 2

## Print package versions for reproducibility

In [9]:
# print the installed versions of scVelo and the packages it relies on
scv.logging.print_versions()

scvelo==0.2.2  scanpy==1.6.0  anndata==0.7.4  loompy==3.0.6  numpy==1.19.2  scipy==1.5.2  matplotlib==3.3.2  sklearn==0.23.2  pandas==1.1.3  
 Your version: 		 0.2.2 
Latest version: 	 modeling


## Set up paths

In [10]:
# Put the directory three levels up at the front of the module search path
# (presumably where the `paths` module lives) before importing from it.
# The number of levels depends on the notebook depth and must be adapted per notebook.
paths_module_dir = "../../.."
sys.path.insert(0, paths_module_dir)

from paths import DATA_DIR

### Define path for the Morris dataset

This dataset comes from [Biddy, B.A., Kong, W., Kamimoto, K. et al. Single-cell mapping of lineage and identity in direct reprogramming. Nature 564, 219–224 (2018)](https://doi.org/10.1038/s41586-018-0744-4).

In [11]:
# location of the velocyto output, relative to the data directory
velocyto_subdir = "20200507_morris_celltagging_10x/rev7/velocyto"
root = DATA_DIR / velocyto_subdir

## Load the data

In [4]:
# Read one velocyto loom file from every sample directory ("hf*") under `root`.
#
# `os.listdir` returns entries in arbitrary order, so the listing is sorted:
# a fixed load order fixes the row order of the concatenated object, which the
# index-based random subsets written later in this notebook rely on.
adatas = []
for dirname in sorted(os.listdir(root)):
    if not dirname.startswith("hf"):
        continue

    loom_files = sorted(f for f in os.listdir(root / dirname) if f.endswith(".loom"))
    if not loom_files:
        raise FileNotFoundError(f"No .loom file found in {root / dirname}")

    # if a directory holds several loom files, the first one (by name) is used
    adata_part = scv.read_loom(root / dirname / loom_files[0])
    adata_part.var_names_make_unique()  # for merging
    adata_part.obs_names_make_unique()
    adatas.append(adata_part)

Variable names are not unique. To make them unique, call `.var_names_make_unique`.
Variable names are not unique. To make them unique, call `.var_names_make_unique`.
Variable names are not unique. To make them unique, call `.var_names_make_unique`.
Variable names are not unique. To make them unique, call `.var_names_make_unique`.
Variable names are not unique. To make them unique, call `.var_names_make_unique`.
Variable names are not unique. To make them unique, call `.var_names_make_unique`.
Variable names are not unique. To make them unique, call `.var_names_make_unique`.
Variable names are not unique. To make them unique, call `.var_names_make_unique`.
Variable names are not unique. To make them unique, call `.var_names_make_unique`.
Variable names are not unique. To make them unique, call `.var_names_make_unique`.
Variable names are not unique. To make them unique, call `.var_names_make_unique`.
Variable names are not unique. To make them unique, call `.var_names_make_unique`.
Vari

## Concatenate the objects

The dataset is saved in multiple parts, so we have to merge the objects.

In [5]:
# Merge the per-sample objects: the first one is the anchor onto which all
# remaining ones are concatenated.
first_part, remaining_parts = adatas[0], adatas[1:]
adata = first_part.concatenate(remaining_parts)

# make sure observation and variable names are unique in the merged object
adata.obs_names_make_unique()
adata.var_names_make_unique()

adata

## Write the data

In [None]:
# store the merged object next to the split files
adata_path = DATA_DIR / "morris_data" / "adata.h5ad"
sc.write(adata_path, adata)

# Create random dataset subsets

## Set seed and the number of splits

Subset the dataset to 10k, 20k, ... 100k cells and for each subset, create 10 different splits.

In [None]:
# number of independent random splits drawn for every subset size
n_splits = 10

# fix the state of NumPy's global random number generator
np.random.seed(42)

## Write the splits

In [None]:
# Draw `n_splits` random subsets of cell indices for every subset size
# (10k, 20k, ..., 100k) and write them to one CSV file per size, with one
# column per split.
#
# The generator is re-seeded here (same value as in the seed cell above; keep
# the two in sync) so that re-running this cell on its own reproduces exactly
# the same splits instead of continuing from wherever the global random state
# happens to be.
np.random.seed(42)

splits_dir = DATA_DIR / "morris_data"
splits_dir.mkdir(parents=True, exist_ok=True)  # `to_csv` does not create missing directories

# candidate positions to sample from; identical for every size, so built once
cell_indices = np.arange(adata.n_obs)

for size in range(10_000, 100_001, 10_000):
    # sampling is without replacement, so NumPy raises a ValueError
    # if `size` exceeds the number of cells
    splits = pd.DataFrame(
        [np.random.choice(cell_indices, size=size, replace=False) for _ in range(n_splits)]
    ).T
    splits.to_csv(splits_dir / f"size_{size}.csv")