# Basic preprocessing and analysis of the human hematopoiesis dataset

Notebook for preprocessing the human hematopoiesis dataset.

## Library imports

In [None]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import mplscience

import anndata as ad
import scanpy as sc
import scvelo as scv
from velovi import preprocess_data

from rgv_tools import DATA_DIR, FIG_DIR
from rgv_tools.preprocessing import get_prior_grn

## General settings

In [None]:
# Set the logging verbosity of scanpy and scVelo.
sc.settings.verbosity = 2
scv.settings.verbosity = 3

In [None]:
# Emit text as text elements (not outlined paths) in SVG output, so labels
# stay editable in vector graphics tools.
plt.rcParams["svg.fonttype"] = "none"

## Constants

In [None]:
# Dataset name; used as the subdirectory below DATA_DIR and FIG_DIR.
DATASET = "hematopoiesis"

In [None]:
# When True, the AnnData objects produced by this notebook are written to disk;
# the output directory is created up front so the later writes cannot fail on
# a missing path.
SAVE_DATA = True
if SAVE_DATA:
    (DATA_DIR / DATASET / "processed").mkdir(parents=True, exist_ok=True)

In [None]:
# When True, figures are saved below FIG_DIR / DATASET (created here if missing).
SAVE_FIGURES = False
if SAVE_FIGURES:
    (FIG_DIR / DATASET).mkdir(parents=True, exist_ok=True)

# Used both as the file extension and as the `format` argument of `fig.savefig`.
FIGURE_FORMAT = "svg"

## Data loading

In [None]:
# Load the raw AnnData object; the trailing expression displays its summary
# as the notebook cell output.
adata = ad.io.read_h5ad(DATA_DIR / DATASET / "raw" / "hsc_dynamo_adata.h5ad")
adata

In [None]:
# TF table: read without a header row, so its columns are labelled 0, 1, ...;
# the TF names are taken from the first column further below.
tfs = pd.read_csv(DATA_DIR / DATASET / "raw" / "allTFs_hg38.csv", header=None)
# Network skeleton: first CSV column becomes the index; passed to
# `get_prior_grn` in the RegVelo preprocessing section.
gt_net = pd.read_csv(DATA_DIR / DATASET / "raw" / "skeleton.csv", index_col=0)

## Visualization

## Preprocessing

In [None]:
# scVelo gene filtering and normalization: genes need at least 10 shared
# counts, 2000 top genes are kept, and log-transformation is turned off.
scv.pp.filter_and_normalize(adata, min_shared_counts=10, log=False, n_top_genes=2000)

In [None]:
# Compute the neighbor graph with 50 neighbors per cell.
sc.pp.neighbors(adata, n_neighbors=50)

In [None]:
# Compute scVelo moments. `n_pcs` and `n_neighbors` are passed as None --
# presumably so that the neighbor graph computed in the previous cell is
# reused instead of recomputed (TODO confirm against the scVelo docs).
scv.pp.moments(adata, n_pcs=None, n_neighbors=None)
adata

In [None]:
# Scatter plot of the cells on the "draw_graph_fa" embedding, colored by the
# "cell_type" annotation; drawn (and optionally saved) inside the mplscience
# style context.
with mplscience.style_context():
    fig, ax = plt.subplots(figsize=(6, 4))
    sc.pl.scatter(
        adata,
        basis="draw_graph_fa",
        color="cell_type",
        frameon=False,
        ax=ax,
    )

    if SAVE_FIGURES:
        figure_path = FIG_DIR / DATASET / f"intro_figure.{FIGURE_FORMAT}"
        fig.savefig(figure_path, format=FIGURE_FORMAT, transparent=True, bbox_inches="tight")

In [None]:
# Estimate RNA velocity with scVelo's default settings. The outputs are kept
# in the "full" saved object and removed from `adata` right after saving.
scv.tl.velocity(adata)

In [None]:
# Save the full preprocessed object while it still carries the scVelo
# velocity results.
if SAVE_DATA:
    full_output_path = DATA_DIR / DATASET / "processed" / "adata_preprocessed_full.h5ad"
    adata.write_h5ad(full_output_path)

# Remove the velocity results from the in-memory object again (uns entry,
# then layers, then var columns) -- presumably so the RegVelo preprocessing
# below starts from an object without them.
del adata.uns["velocity_params"]
for layer_name in ("velocity", "variance_velocity"):
    del adata.layers[layer_name]
velocity_var_columns = [
    "velocity_gamma",
    "velocity_qreg_ratio",
    "velocity_r2",
    "velocity_genes",
]
adata.var.drop(columns=velocity_var_columns, inplace=True)

## RegVelo preprocessing

In [None]:
# Attach the prior gene regulatory network to the data.
# NOTE(review): `get_prior_grn` is defined in `rgv_tools.preprocessing`; later
# cells read `adata.uns["skeleton"]` and `adata.uns["regulators"]`, which are
# presumably populated here -- confirm against the helper.
adata = get_prior_grn(adata, gt_net)
adata

In [None]:
# Run velovi's `preprocess_data` on a copy so `adata` itself is left
# untouched; only the names of the genes that survive it are kept.
velocity_genes = preprocess_data(adata.copy()).var_names.tolist()

In [None]:
# Genes whose row in the skeleton has a nonzero sum (`.T.sum(0)` sums each
# row of the untransposed matrix).
tf_grn = adata.var_names[adata.uns["skeleton"].T.sum(0) != 0].tolist()
# Known TFs (first column of the TF table) that are also present in `tf_grn`.
tf = list(set(tfs.iloc[:, 0].tolist()).intersection(tf_grn))
# Flag those genes in `adata.var`. Membership must be tested against the name
# list `tf`: passing the `tfs` DataFrame would compare gene names with its
# column labels (0, ...) and mark every gene as False.
adata.var["tf"] = adata.var_names.isin(tf)

Select the genes that are either flagged as transcription factors (TFs) or contained in `velocity_genes`.

In [None]:
# Genes flagged as TFs in `adata.var["tf"]`.
tf_genes = adata.var_names[adata.var["tf"]]
# Union with the velocity genes; `np.union1d` returns the sorted, unique gene
# names, which also defines the gene order of the subset below.
var_mask = np.union1d(tf_genes, velocity_genes)
adata = adata[:, var_mask].copy()

In [None]:
# Re-run velovi's preprocessing on the gene subset, this time with the
# r2-based gene filter switched off (`filter_on_r2=False`).
adata = preprocess_data(adata, filter_on_r2=False)

In [None]:
# Boolean mask over the current genes: True where the gene is listed in
# `adata.uns["regulators"]`.
mask = adata.var_names.isin(adata.uns["regulators"])

# Restrict the skeleton matrix to the masked rows and columns.
# NOTE(review): `mask` has one entry per *current* gene, whereas the skeleton
# was stored in `uns` before the gene subsetting above; this positional
# indexing assumes both are aligned -- confirm against `get_prior_grn`.
skeleton = adata.uns["skeleton"][np.ix_(mask, mask)]

# Store the filtered skeleton and reset regulators/targets to the current genes.
adata.uns["skeleton"] = skeleton
adata.uns["regulators"] = adata.var_names.tolist()
adata.uns["targets"] = adata.var_names.tolist()

## Save dataset

In [None]:
# Write the RegVelo-ready object (gene subset plus filtered skeleton) to disk.
if SAVE_DATA:
    adata.write_h5ad(DATA_DIR / DATASET / "processed" / "adata_preprocessed.h5ad")