Unfortunately the number of raw output files is very large. To make the figures reproducible we created summary statistics using this script. The summaries are available in `data_summary/`.

In [5]:
import numpy as np
import warnings
import json
import os
import pandas as pd

from sklearn.metrics import adjusted_rand_score
from src_python.cell_tree import CellTree
from src_python.utils import load_config_and_set_random_seed, path_len_dist

In [6]:
path = r"../data/simulated_data"
path_consensus = r"../data/results"

n_cells = [50]
n_mut = [500]
n_tests = 100
clones = ["", 5, 10, 20]

n_round = 1 #

config = load_config_and_set_random_seed()

use_summary_statistics = False #True # Uses the summary files saved in the data/ directory. If raw output files were generated, set to False
genotype_differences = {"SCITE-RNA": {}}
path_len_differences = {"SCITE-RNA": {}}

genotype_differences["SCITE-RNA-clustered"] = {}
path_len_differences["SCITE-RNA-clustered"] = {}
adjusted_rand_index= {"SCITE-RNA-clustered": {}}

path_len_differences["SCITE-RNA-consensus"] = {}

genotype_differences["SClineager"] = {}
path_len_differences["SClineager"] = {}
adjusted_rand_index["SClineager"] = {}

genotype_differences["DENDRO"] = {}
path_len_differences["DENDRO"] = {}
adjusted_rand_index["DENDRO"] = {}

In [7]:
for n_c, n_m in zip(n_cells, n_mut):

    path_len_differences["SCITE-RNA"][f"{n_c}_{n_m}"] = {}
    genotype_differences["SCITE-RNA"][f"{n_c}_{n_m}"] = {}

    for clone in clones:
        vaf_differences = []
        path_len_distances = []

        for t in range(n_tests):
            base_path = os.path.join(path, f"{n_c}c{n_m}m{clone}")
            genotype_pred_path = os.path.join(base_path, "sciterna", "sciterna_genotype", f"sciterna_genotype_{n_round}r{t}.txt")
            genotype_path = os.path.join(base_path, "genotype", f"genotype_{t}.txt")
            genotype_pred = np.loadtxt(genotype_pred_path, dtype=str)
            genotype_gt = np.loadtxt(genotype_path, dtype=str)

            true_parent_vec = np.loadtxt(os.path.join(base_path, "parent_vec", f"parent_vec_{t}.txt"), dtype=int)
            sciterna_parent_vec = np.loadtxt(os.path.join(base_path, "sciterna", "sciterna_parent_vec", f"sciterna_parent_vec_{n_round}r{t}.txt"), dtype=int)

            mapping_dict = {'A': 1.0, 'H': 0.5, 'R': 0}
            vectorized_map = np.vectorize(lambda x: float(mapping_dict[x]))
            genotype_predicted = vectorized_map(genotype_pred)
            genotype_gt = vectorized_map(genotype_gt)
            unique_genotypes = np.unique(genotype_gt, axis=0)

            difference = np.mean(np.abs(genotype_predicted - genotype_gt))
            vaf_differences.append(difference)

            ct_gt = CellTree(n_c)
            ct_sciterna = CellTree(n_c, flipped_mutation_direction=True)

            ct_gt.use_parent_vec(true_parent_vec)
            ct_sciterna.use_parent_vec(sciterna_parent_vec)

            path_len_distances.append(path_len_dist(ct_gt, ct_sciterna))

        path_len_differences["SCITE-RNA"][f"{n_c}_{n_m}"][clone] = path_len_distances
        genotype_differences["SCITE-RNA"][f"{n_c}_{n_m}"][clone] = vaf_differences
        print(np.mean(path_len_distances), " mean path length distance")
        print(np.mean(vaf_differences), " mean abs difference of vafs predicted")

2.4021714285714286  mean path length distance
0.030678799999999996  mean abs difference of vafs predicted
4.887208163265306  mean path length distance
0.034108999999999993  mean abs difference of vafs predicted
3.6361714285714286  mean path length distance
0.0327506  mean abs difference of vafs predicted
2.4828979591836733  mean path length distance
0.03191580000000001  mean abs difference of vafs predicted


In [8]:
for n_c, n_m in zip(n_cells, n_mut):
    path_len_differences["SCITE-RNA-consensus"][f"{n_c}_{n_m}"] = {}

    for clone in clones:
        path_len_distances = []

        for t in range(n_tests):
            base_path = os.path.join(path, f"{n_c}c{n_m}m{clone}")
            base_path_consensus = os.path.join(path_consensus, f"{n_c}c{n_m}m{clone}")
            true_parent_vec = np.loadtxt(os.path.join(base_path, "parent_vec", f"parent_vec_{t}.txt"), dtype=int)
            sciterna_parent_vec = np.loadtxt(os.path.join(base_path_consensus,  "sciterna_consensus_parent_vec", f"sciterna_parent_vec_{n_round}r{t}.txt"), dtype=int)

            ct_gt = CellTree(n_c)
            ct_sciterna = CellTree(n_c, flipped_mutation_direction=True)

            ct_gt.use_parent_vec(true_parent_vec)
            ct_sciterna.use_parent_vec(sciterna_parent_vec)

            path_len_distances.append(path_len_dist(ct_gt, ct_sciterna))

        path_len_differences["SCITE-RNA-consensus"][f"{n_c}_{n_m}"][clone] = path_len_distances
        print(np.mean(path_len_distances), " mean path length distance")

2.2206367346938776  mean path length distance
4.076481632653061  mean path length distance
3.384653061224489  mean path length distance
2.4992897959183673  mean path length distance


In [9]:
for n_c, n_m in zip(n_cells, n_mut):
    genotype_differences["SCITE-RNA-clustered"][f"{n_c}_{n_m}"] = {}
    path_len_differences["SCITE-RNA-clustered"][f"{n_c}_{n_m}"] = {}
    adjusted_rand_index["SCITE-RNA-clustered"][f"{n_c}_{n_m}"] = {}
    for clone in clones:
        vaf_differences = []
        path_len_distances = []
        adjusted_rand_scores = []
        for t in range(n_tests):
            base_path = os.path.join(path, f"{n_c}c{n_m}m{clone}")

            genotype_gt = np.loadtxt(os.path.join(base_path, "genotype", f"genotype_{t}.txt"), dtype=str).T

            if genotype_gt.ndim == 1:
                genotype_gt = genotype_gt[:, np.newaxis]

            _, clones_gt = np.unique(genotype_gt, axis=1, return_inverse=True)

            clones_pred_path = os.path.join(base_path, "sciterna", "sciterna_clones", f"sciterna_clones_{t}.txt")
            clones_pred = np.loadtxt(clones_pred_path, dtype=float)
            if len(clones_pred) != n_c:
                print("Cells were filtered out, skipping this test")
                continue
            adjusted_rand_scores.append(adjusted_rand_score(clones_gt, clones_pred))

            ref_path = os.path.join(base_path, "ref", f"ref_{t}.txt")
            alt_path = os.path.join(base_path, "alt", f"alt_{t}.txt")
            true_parent_vec = np.loadtxt(os.path.join(base_path, "parent_vec", f"parent_vec_{t}.txt"), dtype=int)
            sciterna_parent_vec = np.loadtxt(os.path.join(base_path, "sciterna", "sciterna_parent_vec_clustering", f"sciterna_parent_vec_clustering_{t}.txt"), dtype=int)

            alt = np.loadtxt(alt_path)
            ref = np.loadtxt(ref_path)
            alt_t = alt.T
            ref_t = ref.T

            with np.errstate(invalid='ignore'):
                vaf_observed = alt_t/(alt_t + ref_t)

            mapping_dict = {'A': 1.0, 'H': 0.5, 'R': 0}
            vectorized_map = np.vectorize(lambda x: float(mapping_dict[x]))
            genotype_gt = vectorized_map(genotype_gt)

            unique_classes = np.unique(clones_pred)

            # For each unique class, replace column values with the mean of the columns of that class
            for cls in unique_classes:
                class_indices = np.where(clones_pred == cls)[0]
                with warnings.catch_warnings():
                    warnings.simplefilter("ignore", category=RuntimeWarning)
                    mean_values = np.nanmean(vaf_observed[:, class_indices], axis=1)

                # in case the mean is nan replace it with the mean genotype over all cells
                row_nanmean = np.nanmean(vaf_observed, axis=1)
                if np.isnan(row_nanmean).any():
                    raise ValueError("Error: The array contains NaN values.")
                mean_values = np.where(np.isnan(mean_values), row_nanmean, mean_values)
                if np.isnan(mean_values).any():
                    raise ValueError("Error: The array contains NaN values.")

                vaf_observed[:, class_indices] = np.tile(mean_values[:, np.newaxis], len(class_indices))


            genotype_predicted = np.round(vaf_observed * 2) / 2
            if np.isnan(genotype_predicted).any():
                print("Error: The array contains NaN values.")

            vaf_difference = np.mean(np.abs(genotype_predicted - genotype_gt))
            vaf_differences.append(vaf_difference)

            ct_gt = CellTree(n_c)
            ct_sciterna = CellTree(n_c)

            ct_gt.use_parent_vec(true_parent_vec)
            ct_sciterna.use_parent_vec(sciterna_parent_vec)

            path_len_distances.append(path_len_dist(ct_gt, ct_sciterna))

        genotype_differences["SCITE-RNA-clustered"][f"{n_c}_{n_m}"][clone] = vaf_differences
        path_len_differences["SCITE-RNA-clustered"][f"{n_c}_{n_m}"][clone] = path_len_distances
        adjusted_rand_index["SCITE-RNA-clustered"][f"{n_c}_{n_m}"][clone] = adjusted_rand_scores
        print(np.mean(vaf_differences), " mean abs difference of vafs predicted")
        print(np.mean(path_len_distances), " mean path length distance")
        print(np.mean(adjusted_rand_scores), " mean adjusted rand index")

0.0675508  mean abs difference of vafs predicted
3.5893877551020403  mean path length distance
1.0  mean adjusted rand index
0.0501416  mean abs difference of vafs predicted
4.209836734693877  mean path length distance
0.4864087198943913  mean adjusted rand index
0.0794224  mean abs difference of vafs predicted
3.9233632653061226  mean path length distance
0.448898661930177  mean adjusted rand index
0.09255079999999999  mean abs difference of vafs predicted
3.5360244897959183  mean path length distance
0.4214693242006576  mean adjusted rand index


In [10]:
for n_c, n_m in zip(n_cells, n_mut):
    genotype_differences["SClineager"][f"{n_c}_{n_m}"] = {}
    path_len_differences["SClineager"][f"{n_c}_{n_m}"] = {}
    adjusted_rand_index["SClineager"][f"{n_c}_{n_m}"] = {}
    for clone in clones:
        vaf_differences = []
        path_len_distances = []
        adjusted_rand_scores = []
        for t in range(n_tests):
            base_path = os.path.join(path, f"{n_c}c{n_m}m{clone}")

            genotype_gt = np.loadtxt(os.path.join(base_path, "genotype", f"genotype_{t}.txt"), dtype=str).T
            vaf_predicted = np.loadtxt(os.path.join(base_path, "sclineager", "sclineager_vaf", f"sclineager_vaf_{t}.txt"), dtype=float).T

            if genotype_gt.ndim == 1:
                genotype_gt = genotype_gt[:, np.newaxis]

            _, clones_gt = np.unique(genotype_gt, axis=1, return_inverse=True)

            clones_pred_path = os.path.join(base_path, "sclineager", "sclineager_clones", f"sclineager_clones_{t}.txt")
            clones_pred = np.loadtxt(clones_pred_path, dtype=float)

            adjusted_rand_scores.append(adjusted_rand_score(clones_gt, clones_pred))

            true_parent_vec = np.loadtxt(os.path.join(base_path, "parent_vec", f"parent_vec_{t}.txt"), dtype=int)
            sclineager_parent_vec = np.loadtxt(os.path.join(base_path, "sclineager", "sclineager_parent_vec", f"sclineager_parent_vec_{t}.txt"), dtype=int)

            mapping_dict = {'A': 1.0, 'H': 0.5, 'R': 0}
            vectorized_map = np.vectorize(lambda x: float(mapping_dict[x]))
            genotype_gt = vectorized_map(genotype_gt)

            genotype_predicted = np.round(vaf_predicted * 2) / 2
            if genotype_predicted.shape != genotype_gt.shape:
                print("Cells or mutations were filtered out")
                continue
            vaf_difference = np.mean(np.abs(genotype_predicted - genotype_gt))
            vaf_differences.append(vaf_difference)

            ct_gt = CellTree(n_c)
            ct_sclineager = CellTree(n_c)

            ct_gt.use_parent_vec(true_parent_vec)
            ct_sclineager.use_parent_vec(sclineager_parent_vec)

            path_len_distances.append(path_len_dist(ct_gt, ct_sclineager))


        genotype_differences["SClineager"][f"{n_c}_{n_m}"][clone] = vaf_differences
        path_len_differences["SClineager"][f"{n_c}_{n_m}"][clone] = path_len_distances
        adjusted_rand_index["SClineager"][f"{n_c}_{n_m}"][clone] = adjusted_rand_scores
        print(np.mean(vaf_differences), " mean abs difference of vafs predicted")
        print(np.mean(path_len_distances), " mean path length distance")
        print(np.mean(adjusted_rand_scores), " mean adjusted rand index")

Cells or mutations were filtered out
Cells or mutations were filtered out
0.06797775510204082  mean abs difference of vafs predicted
4.651453561016242  mean path length distance
1.0  mean adjusted rand index
0.08357539999999998  mean abs difference of vafs predicted
4.515102040816327  mean path length distance
0.32879471421472906  mean adjusted rand index
Cells or mutations were filtered out
Cells or mutations were filtered out
Cells or mutations were filtered out
0.08701958762886597  mean abs difference of vafs predicted
4.575821586366505  mean path length distance
0.16775850098537823  mean adjusted rand index
Cells or mutations were filtered out
Cells or mutations were filtered out
0.08818346938775509  mean abs difference of vafs predicted
4.443631820074968  mean path length distance
0.08625234856376364  mean adjusted rand index


In [11]:
for n_c, n_m in zip(n_cells, n_mut):
    genotype_differences["DENDRO"][f"{n_c}_{n_m}"] = {}
    path_len_differences["DENDRO"][f"{n_c}_{n_m}"] = {}
    adjusted_rand_index["DENDRO"][f"{n_c}_{n_m}"] = {}
    for clone in clones:
        vaf_differences = []
        path_len_distances = []
        adjusted_rand_scores = []
        for t in range(n_tests):
            base_path = os.path.join(path, f"{n_c}c{n_m}m{clone}")

            genotype_gt = np.loadtxt(os.path.join(base_path, "genotype", f"genotype_{t}.txt"), dtype=str).T

            if genotype_gt.ndim == 1:
                genotype_gt = genotype_gt[:, np.newaxis]

            _, clones_gt = np.unique(genotype_gt, axis=1, return_inverse=True)

            clones_pred_path = os.path.join(base_path, "dendro", "dendro_clones", f"dendro_clones_{t}.txt")
            clones_pred = np.loadtxt(clones_pred_path, dtype=float)
            if len(clones_pred) != n_c:
                print("Cells were filtered out, skipping this test")
                continue
            adjusted_rand_scores.append(adjusted_rand_score(clones_gt, clones_pred))

            ref_path = os.path.join(base_path, "ref", f"ref_{t}.txt")
            alt_path = os.path.join(base_path, "alt", f"alt_{t}.txt")
            true_parent_vec = np.loadtxt(os.path.join(base_path, "parent_vec", f"parent_vec_{t}.txt"), dtype=int)

            sciterna_parent_vec = np.loadtxt(os.path.join(base_path, "dendro", "dendro_parent_vec", f"dendro_parent_vec_{t}.txt"), dtype=int)

            alt = np.loadtxt(alt_path)
            ref = np.loadtxt(ref_path)
            alt_t = alt.T
            ref_t = ref.T

            with np.errstate(invalid='ignore'):
                vaf_observed = alt_t/(alt_t + ref_t)

            mapping_dict = {'A': 1.0, 'H': 0.5, 'R': 0}
            vectorized_map = np.vectorize(lambda x: float(mapping_dict[x]))
            genotype_gt = vectorized_map(genotype_gt)

            unique_classes = np.unique(clones_pred)

            # For each unique class, replace column values with the mean of the columns of that class
            for cls in unique_classes:
                class_indices = np.where(clones_pred == cls)[0]
                with warnings.catch_warnings():
                    warnings.simplefilter("ignore", category=RuntimeWarning)
                    mean_values = np.nanmean(vaf_observed[:, class_indices], axis=1)

                # in case the mean is nan replace it with the mean genotype over all cells
                row_nanmean = np.nanmean(vaf_observed, axis=1)
                if np.isnan(row_nanmean).any():
                    raise ValueError("Error: The array contains NaN values.")
                mean_values = np.where(np.isnan(mean_values), row_nanmean, mean_values)
                if np.isnan(mean_values).any():
                    raise ValueError("Error: The array contains NaN values.")

                vaf_observed[:, class_indices] = np.tile(mean_values[:, np.newaxis], len(class_indices))


            genotype_predicted = np.round(vaf_observed * 2) / 2
            if np.isnan(genotype_predicted).any():
                print("Error: The array contains NaN values.")

            vaf_difference = np.mean(np.abs(genotype_predicted - genotype_gt))
            vaf_differences.append(vaf_difference)

            ct_gt = CellTree(n_c)
            ct_sciterna = CellTree(n_c)

            ct_gt.use_parent_vec(true_parent_vec)
            ct_sciterna.use_parent_vec(sciterna_parent_vec)

            path_len_distances.append(path_len_dist(ct_gt, ct_sciterna))


        genotype_differences["DENDRO"][f"{n_c}_{n_m}"][clone] = vaf_differences
        path_len_differences["DENDRO"][f"{n_c}_{n_m}"][clone] = path_len_distances
        adjusted_rand_index["DENDRO"][f"{n_c}_{n_m}"][clone] = adjusted_rand_scores
        print(np.mean(vaf_differences), " mean abs difference of vafs predicted")
        print(np.mean(path_len_distances), " mean path length distance")
        print(np.mean(adjusted_rand_scores), " mean adjusted rand index")

0.0675508  mean abs difference of vafs predicted
4.732391836734695  mean path length distance
1.0  mean adjusted rand index
0.0751738  mean abs difference of vafs predicted
4.659591836734694  mean path length distance
0.23567379134374583  mean adjusted rand index
0.10883880000000003  mean abs difference of vafs predicted
4.6629632653061215  mean path length distance
0.09482540200326817  mean adjusted rand index
0.11349439999999998  mean abs difference of vafs predicted
4.608440816326531  mean path length distance
0.05038498739179565  mean adjusted rand index


In [19]:
n_cells = [50, 50, 100, 100, 100, 200, 200]
n_mut = [50, 100, 50, 100, 200, 100, 200]
path = "../data/simulated_data/"
clone = ""

models = ["SClineager", "SCITE-RNA", "DENDRO"]
colors = ["lightgreen", "lightblue", "orange"]

# Store runtimes per model per condition
runtime_data = {model: [] for model in models}
conditions = []

# Collect runtime data
for n_c, n_m in zip(n_cells, n_mut):
    condition_label = f"{n_c}c {n_m}m"
    conditions.append(condition_label)

    base_path = os.path.join(path, f"{n_c}c{n_m}m{clone}")
    try:
        scite_rna = np.loadtxt(os.path.join(base_path, "sciterna", "sciterna_runtimes.txt")).tolist()
        sclineager = np.loadtxt(os.path.join(base_path, "sclineager", "sclineager_runtimes.txt")).tolist()
        dendro = np.loadtxt(os.path.join(base_path, "dendro", "dendro_runtimes.txt")).tolist()
    except Exception as e:
        print(f"Error loading data for {condition_label}: {e}")
        continue

    runtime_data["SCITE-RNA"].append(scite_rna)
    runtime_data["SClineager"].append(sclineager)
    runtime_data["DENDRO"].append(dendro)

In [21]:
os.makedirs("../data_summary/simulated_data", exist_ok=True)
with open("../data_summary/simulated_data/genotype_differences.json", "w") as f:
    json.dump(genotype_differences, f, indent=4)
with open("../data_summary/simulated_data/path_len_differences.json", "w") as f:
    json.dump(path_len_differences, f, indent=4)
with open("../data_summary/simulated_data/adjusted_rand_index.json", "w") as f:
    json.dump(adjusted_rand_index, f, indent=4)
with open("../data_summary/simulated_data/runtimes.json", "w") as f:
    json.dump(runtime_data, f, indent=4)

In [None]:
# Summary statistics for multiple myeloma
n_bootstrap = 1000
n_rounds = 2

study_nums = ["mm16", "mm34"]
global_parameters = {}
mean_individual_overdispersions_h = {}
mean_individual_dropouts = {}

for study_num in study_nums:
    path_bootstrap = rf"../data/results/{study_num}/sciterna_bootstrap"
    global_parameters[study_num] = []
    for i in range(n_bootstrap):
        for j in range(n_rounds-1, n_rounds):
            global_param = np.loadtxt(os.path.join(path_bootstrap, "sciterna_global_parameters", f"sciterna_global_parameters_{j}r{i}.txt")).tolist()
            global_parameters[study_num].append(global_param)
    global_parameters[study_num] = global_parameters[study_num] # Dropout, Overdispersion homozygous, Error Rate, Overdispersion heterozygous

    selected = np.loadtxt(os.path.join(path_bootstrap, "selected.txt"), delimiter=',', dtype=int)
    ref = pd.read_csv(os.path.join(f"../data/input_data/{study_num}", "ref.csv"))

    all_individual_overdispersions_h = np.full((n_bootstrap, ref.shape[1]), np.nan)
    all_individual_dropouts = np.full((n_bootstrap, ref.shape[1]), np.nan)

    for i in range(n_bootstrap):
        for j in range(n_rounds-1, n_rounds):
            selected_mutations = np.loadtxt(os.path.join(path_bootstrap, "sciterna_selected_loci", f"sciterna_selected_loci_{j}r{i}.txt"), dtype=int)
            individual_dropouts = np.loadtxt(os.path.join(path_bootstrap, "sciterna_individual_dropout_probs", f"sciterna_individual_dropout_probs_{j}r{i}.txt"))
            individual_overdispersions_h = np.loadtxt(os.path.join(path_bootstrap, "sciterna_individual_overdispersions_h", f"sciterna_individual_overdispersions_h_{j}r{i}.txt"))

            unique_mutations = np.unique(selected_mutations)
            for mut in unique_mutations:
                indices = np.where(selected_mutations == mut)[0]
                mean_dropout = np.mean(individual_dropouts[indices])
                mean_overdispersion = np.mean(individual_overdispersions_h[indices])

                all_individual_dropouts[i, mut] = mean_dropout
                all_individual_overdispersions_h[i, mut] = mean_overdispersion

    global_parameters_sample = np.array(global_parameters[study_num])

    sufficient_data_columns_od = ~np.any(
        all_individual_overdispersions_h == global_parameters_sample[:, 3][:, np.newaxis], axis=0
    )

    sufficient_data_columns_dropout = ~np.any(
        all_individual_dropouts == global_parameters_sample[:, 0][:, np.newaxis], axis=0
    )
    mean_individual_overdispersions_h[study_num] = np.nanmean(all_individual_overdispersions_h[:, sufficient_data_columns_od], axis=0).tolist()
    mean_individual_dropouts[study_num] = np.nanmean(all_individual_dropouts[:, sufficient_data_columns_dropout], axis=0).tolist()

In [40]:
os.makedirs("../data_summary/real_data", exist_ok=True)
with open("../data_summary/real_data/mean_individual_overdispersions_h.json", "w") as f:
    json.dump(mean_individual_overdispersions_h, f, indent=4)
with open("../data_summary/real_data/mean_individual_dropouts.json", "w") as f:
    json.dump(mean_individual_dropouts , f, indent=4)
with open("../data_summary/real_data/global_parameters.json", "w") as f:
    json.dump(global_parameters, f, indent=4)