Fig. 5: Memory performance comparison (1 core)
----

In this notebook, we produce Suppl. Tab. 3 displaying the single-core memory performance differences
between CellRank and Palantir on 100k cells.

# Preliminaries

## Dependencies

1. Please consult the [analysis_files/README.md](analysis_files/README.md) on how to run the memory performance benchmarks.

## Import packages

In [2]:
# import standard packages
from pathlib import Path
import pickle
import os
import sys

import numpy as np
import pandas as pd

## Set up paths

In [3]:
sys.path.insert(0, "../..")  # this depends on the notebook depth and must be adapted per notebook

from paths import DATA_DIR, FIG_DIR

## Set global parameters

In [4]:
root =  DATA_DIR / 'benchmarking' / 'memory_analysis_1_core'
palantir_path = root / "palantir"
cellrank_path = root / "gpcca"

## Load the data

In [3]:
res = {'CellRank (lin. probs.)': [], 'CellRank (macrostates)': [], 'Palantir': []}

for fname in os.listdir(palantir_path):
    with open(palantir_path / fname, 'rb') as fin:
        data = pickle.load(fin)
    res['Palantir'].append(max(data) / 1024)
    
for fname in os.listdir(cellrank_path):
    if not fname.endswith(".pickle"):
        continue
    with open(cellrank_path / fname, 'rb') as fin:
        data = pickle.load(fin)
    print(fname)
    # add macrostates and kernel memory together
    res['CellRank (macrostates)'].append((max(data['macro_mem']) + max(data['kernel_mem'])) / 1024)
    res['CellRank (lin. probs.)'].append(max(data['ap_mem']) / 1024)

100000_8.pickle
100000_0.pickle
100000_9.pickle
100000_6.pickle
100000_2.pickle
100000_3.pickle
100000_4.pickle
100000_5.pickle
100000_7.pickle
100000_1.pickle


### Clean the index

In [5]:
df = pd.DataFrame(res)
df.index = np.arange(1, 11)
df.index.name = 'subset'
df.round(2)

Unnamed: 0_level_0,CellRank (lin. probs.),CellRank (macrostates),Palantir
subset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,14.16,22.52,89.37
2,14.19,22.5,78.21
3,14.17,22.49,88.45
4,14.16,22.51,80.48
5,14.3,22.65,79.8
6,14.27,22.66,79.34
7,14.27,22.62,80.37
8,14.25,22.56,80.55
9,14.16,22.54,88.33
10,14.24,22.55,80.13


# Generate the table

## Calculate mean and standard deviation across the splits

In [7]:
tall_df = df.melt(value_vars=df.columns, var_name='algorithm', value_name='memory')

mean = tall_df.groupby('algorithm').mean().T
mean.index.name = 'size'
mean.columns = [f"{c} mean" for c in mean.columns]

std = tall_df.groupby('algorithm').std().T
std.index.name = 'size'
std.columns = [f"{c} std" for c in std.columns]

stats = pd.concat([mean, std], axis=1)
stats.index = [100_000]
stats.index.name = '#cells (thousands)'
stats = stats.round(2)
stats

Unnamed: 0_level_0,CellRank (lin. probs.) mean,CellRank (macrostates) mean,Palantir mean,CellRank (lin. probs.) std,CellRank (macrostates) std,Palantir std
#cells (thousands),Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
100000,14.22,22.56,82.5,0.05,0.06,4.35


## Reorder the dataframe

In [8]:
order = ['CellRank (macrostates)', 'CellRank (lin. probs.)', 'Palantir']

In [9]:
stats = stats[[f"{c} {s}" for c in order for s in ('mean', 'std')]]
stats

Unnamed: 0_level_0,CellRank (macrostates) mean,CellRank (macrostates) std,CellRank (lin. probs.) mean,CellRank (lin. probs.) std,Palantir mean,Palantir std
#cells (thousands),Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
100000,22.56,0.06,14.22,0.05,82.5,4.35


## Save the results

In [10]:
stats.to_csv(DATA_DIR / "benchmarking_results" / "suppl_tab_memory_benchmark_1_core" / "statistics.csv")