# Calculate velocity and latent time using TFvelo

## Library imports

In [1]:
import os
import sys

import TFvelo as TFv
from paths import DATA_DIR, FIG_DIR

import numpy as np

Global seed set to 0


In [None]:
import scipy

import anndata as ad
import scanpy as sc

# attach your local TFvelo repo directory
sys.path.append("/home/itg/z.xue/VeloBenchmark/TFvelo")

sys.path.append("../..")

## General settings

In [2]:
np.set_printoptions(suppress=True)
SAVE_FIGURES = True
if SAVE_FIGURES:
    os.makedirs(FIG_DIR / "simulation", exist_ok=True)

SAVE_DATASETS = True
if SAVE_DATASETS:
    os.makedirs(DATA_DIR / "simulation", exist_ok=True)

In [3]:
input_path = DATA_DIR
output_path = DATA_DIR / "simulation"
input_files = os.listdir(input_path)

## Function definitions

In [4]:
def run_TFvelo(input_path, output_path, input_file):
    """TODO."""
    adata = ad.read(os.path.join(input_path, input_file))
    print("Start processing " + os.path.join(input_path, input_file))
    adata.layers["spliced"] = adata.layers["counts_spliced"].copy()
    adata.layers["unspliced"] = adata.layers["counts_unspliced"].copy()

    if "spliced" in adata.layers:
        adata.layers["total"] = adata.layers["spliced"] + adata.layers["unspliced"]
    elif "new" in adata.layers:
        adata.layers["total"] = np.array(adata.layers["total"].todense())
    else:
        adata.layers["total"] = adata.X
    adata.layers["count"] = adata.X.copy()
    adata.layers["total_raw"] = adata.layers["total"].copy()
    n_cells, n_genes = adata.X.shape
    sc.pp.filter_genes(adata, min_cells=int(n_cells / 50))
    sc.pp.filter_cells(adata, min_genes=int(n_genes / 50))
    TFv.pp.filter_and_normalize(adata, min_shared_counts=20, n_top_genes=2000, log=True)  # include the following steps
    adata.X = adata.layers["total"].copy()

    gene_names = []
    for tmp in adata.var_names:
        gene_names.append(tmp.upper())
    adata.var_names = gene_names
    adata.var_names_make_unique()
    adata.obs_names_make_unique()

    TFv.pp.moments(adata, n_pcs=30, n_neighbors=30)
    adata.X = adata.X.A
    n_gene = adata.shape[1]
    adata.varm["TFs"] = np.full([n_gene, n_gene], "blank")
    adata.varm["TFs"] = adata.varm["TFs"].astype("U10")

    adata.varm["TFs_id"] = np.full([n_gene, n_gene], -1)
    adata.varm["TFs_times"] = np.full([n_gene, n_gene], 0)
    adata.varm["TFs_correlation"] = np.full([n_gene, n_gene], 0.0)
    adata.varm["knockTF_Log2FC"] = np.full([n_gene, n_gene], 0.0)
    adata.var["n_TFs"] = np.zeros(n_gene, dtype=int)

    gene_names = adata.var_names.tolist()  # all genes as targets
    all_TFs = list(adata.var_names[adata.var["is_tf"]])  # select TFs

    for TF_name in all_TFs:
        TF_idx = gene_names.index(TF_name)
        TF_expression = np.ravel(adata[:, TF_name].X)

        for target in gene_names:
            target_idx = gene_names.index(target)
            if target == TF_name:
                continue

            if TF_name in adata.varm["TFs"][target_idx]:
                ii = list(adata.varm["TFs"][target_idx]).index(TF_name)
                adata.varm["TFs_times"][target_idx, ii] += 1
                continue
            target_expression = np.ravel(adata[:, target].X)
            flag = (TF_expression > 0) & (target_expression > 0)  # consider all possible regulation
            if flag.sum() < 2:
                correlation = 0
            else:
                correlation, _ = scipy.stats.spearmanr(target_expression[flag], TF_expression[flag])

            tmp_n_TF = adata.var["n_TFs"][target_idx]
            adata.varm["TFs"][target_idx][tmp_n_TF] = TF_name
            adata.varm["TFs_id"][target_idx][tmp_n_TF] = TF_idx
            adata.varm["TFs_times"][target_idx, tmp_n_TF] = 1
            adata.varm["TFs_correlation"][target_idx, tmp_n_TF] = correlation
            adata.var["n_TFs"][target_idx] += 1
    TFv.tl.recover_dynamics(
        adata,
        n_jobs=64,
        max_iter=20,
        var_names="all",
        WX_method="lsq_linear",
        WX_thres=20,
        n_top_genes=adata.shape[1],
        fit_scaling=True,
        use_raw=0,
        init_weight_method="ones",
        n_time_points=1000,
    )
    n_cells = adata.shape[0]
    expanded_scaling_y = np.expand_dims(np.array(adata.var["fit_scaling_y"]), 0).repeat(n_cells, axis=0)
    adata.layers["velocity"] = adata.layers["velo_hat"] / expanded_scaling_y
    return adata

## Data loading and processing of one instance

In [5]:
adata = run_TFvelo(input_path, output_path, input_files[0])

Start processing /home/itg/z.xue/VeloBenchmark/50_time_simulations/dataset_sim41.h5ad
Normalized count data: X, spliced, unspliced, total.
Skip filtering by dispersion since number of variables are less than `n_top_genes`.
Logarithmized X.
computing neighbors
    finished (0:00:12) --> added 
    'distances' and 'connectivities', weighted adjacency matrices (adata.obsp)
computing moments based on connectivities
    finished (0:00:00) --> added 
    'M_total', moments of total abundances (adata.layers)
recovering dynamics (using 48/48 cores)


  0%|          | 0/90 [00:00<?, ?gene/s]

Processing 0/2 A2_TF1
Processing 0/2 A1_TF1
Processing 0/2 A4_TF2
Processing 0/2 BURN1_TF1
Processing 0/2 A3_TF1
Processing 0/2 BURN2_TF1
Processing 0/2 B3_TF2
Processing 0/2 BURN4_TF1
Processing 0/2 A3_TF3
Processing 0/2 B5_TF1
Processing 0/2 B1_TF2
Processing 0/2 BURN3_TF1
Processing 0/2 B2_TF2
Processing 0/2 B4_TF2
Processing 0/2 B7_TF1
Processing 0/2 B6_TF1
Processing 0/2 B8_TF1
Processing 0/2 A5_TF2
Processing 0/2 HK1
Processing 0/2 TARGET3
Processing 0/2 TARGET9
Processing 0/1 HK8
Processing 0/2 HK3
Processing 0/1 HK5
Processing 0/1 HK6
Processing 1/2 TARGET8
Processing 0/2 TARGET1
Processing 0/1 HK10
Processing 0/1 HK9
Processing 0/1 HK7
0/2 B6_TF1 FINISHED with n_TFs: 69
Processing 1/2 B6_TF2
0/2 B8_TF1 FINISHED with n_TFs: 69
Processing 1/2 B8_TF2
0/2 A3_TF1 FINISHED with n_TFs: 69
Processing 1/2 A3_TF2
0/2 B5_TF1 FINISHED with n_TFs: 69
Processing 1/2 B5_TF2
0/2 BURN2_TF1 FINISHED with n_TFs: 69
Processing 1/2 BURN2_TF2
0/2 A2_TF1 FINISHED with n_TFs: 69
Processing 1/2 A2_TF2

In [6]:
# fit_t and velocity are the computed results
adata

AnnData object with n_obs × n_vars = 300 × 90
    obs: 'step_ix', 'simulation_i', 'sim_time', 'n_genes', 'initial_size_spliced', 'initial_size_unspliced', 'initial_size_total', 'initial_size', 'n_counts'
    var: 'module_id', 'basal', 'burn', 'independence', 'color', 'is_tf', 'is_hk', 'transcription_rate', 'splicing_rate', 'translation_rate', 'mrna_halflife', 'protein_halflife', 'mrna_decay_rate', 'protein_decay_rate', 'max_premrna', 'max_mrna', 'max_protein', 'mol_premrna', 'mol_mrna', 'mol_protein', 'n_cells', 'n_TFs', 'fit_alpha', 'fit_beta', 'fit_omega', 'fit_theta', 'fit_gamma', 'fit_delta', 'fit_likelihood', 'fit_varx', 'fit_scaling_y'
    uns: 'network', 'regulators', 'regulatory_network', 'regulatory_network_regulators', 'regulatory_network_targets', 'skeleton', 'targets', 'traj_dimred_segments', 'traj_milestone_network', 'traj_progressions', 'pca', 'neighbors', 'recover_dynamics'
    obsm: 'dimred', 'regulatory_network_sc', 'X_pca'
    varm: 'PCs', 'TFs', 'TFs_id', 'TFs_times'

In [7]:
# save the results
if SAVE_DATASETS:
    adata.write_h5ad(DATA_DIR / "simulation" / "c2f_output.h5ad")