### List datasets that will be used for training with scBasset

In [1]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import scbasset

2024-06-14 09:59:05.194112: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-06-14 09:59:05.244288: I tensorflow/core/util/port.cc:104] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-06-14 09:59:05.246314: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2024-06-14 09:59:05.246324: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudar

In [3]:
# Switch the working directory to the pipeline notebooks folder; relative paths
# used later in this notebook ("results", 'results_scbasset.csv') resolve here.
# NOTE(review): machine-specific path — adjust when running elsewhere.
cd ~/workspace/theislab/mubind-pipeline/notebooks/pipeline

/home/ilibarra/workspace/theislab/mubind-pipeline/notebooks/pipeline


In [4]:
# Imports for loading and scoring scBasset models.
# NOTE: the original relative order of the existing lines is kept on purpose.
# Wildcard imports make ordering significant, and the explicit imports that
# follow them must stay last so their bindings take precedence over any
# same-named symbol the wildcard imports may bring in.
import tensorflow as tf
import scanpy as sc
from scbasset.utils import *
from scbasset.basenji_utils import *
from scbasset.utils import *
import pandas as pd
import glob
import os
import anndata
import pickle
import datetime

# Dependencies of the scoring loop, imported once here instead of inside the loop.
import h5py
import matplotlib.pyplot as plt
import numpy as np
import sklearn
from sklearn.metrics import average_precision_score, r2_score, roc_auc_score
from sklearn.preprocessing import LabelBinarizer



### Load scBasset models and score with them

In [6]:
# When False, a model that already has a `*_scores.csv` next to it is loaded
# from that file instead of being re-scored; set to True to force re-scoring.
overwrite = False


In [7]:
# Score every trained scBasset checkpoint found under `rootdir`.
#
# Expected layout of each matched path (components counted from the end):
#   .../<dataset>/<feat_selection>/scbasset_output/obs<N>_e<E>/<loss>/<fold>/best_model.h5
# For each checkpoint that has a valid `running_time.pkl` next to it, the loop
# either reloads previously saved scores or predicts with the model and computes
# ROC-AUC / PR-AUC / R2 on the train, val and test peak splits.
# The per-model tables are collected in `df` (a list of DataFrames).
#
# Alternative mount point used on another machine:
# rootdir = '/mnt/f/workspace/theislab/mubind/data/*/*/scbasset_output/obs*/*/*/*'
rootdir = '/home/ilibarra/workspace/theislab/mubind/data/*/*/scbasset_output/obs*/*/*/*'

df = []

for p in glob.glob(rootdir):

    res = []
    if not os.path.exists(p):
        continue
    # The glob matches every file in the fold directory; keep checkpoints only.
    if 'best_model.h5' not in p:
        continue

    print(p)

    # Training time is stored next to the checkpoint as a pickled timedelta.
    # NOTE: pickle can execute arbitrary code on load — only read files that
    # were written by our own training pipeline.
    runtime_path = p.replace('best_model.h5', 'running_time.pkl')
    time_diff = None
    if os.path.exists(runtime_path):
        with open(runtime_path, 'rb') as runtime_file:  # close the handle deterministically
            time_diff = pickle.load(runtime_file)

    # Runs without a valid time log are skipped.
    if not isinstance(time_diff, datetime.timedelta):
        continue

    scores_path = p.replace('.h5', '_scores.csv')
    print(os.path.exists(scores_path), scores_path)

    # e.g. 'obs100_e10' -> '100'; attached to both cached and freshly computed scores.
    n_obs_label = scores_path.split('/')[-4].replace('obs', '').split('_')[0]
    elapsed_seconds = time_diff.total_seconds()

    if os.path.exists(scores_path) and not overwrite:
        cached_scores = pd.read_csv(scores_path, index_col=0)
        cached_scores['n_obs'] = n_obs_label
        cached_scores['time'] = elapsed_seconds
        df.append(cached_scores)
        continue

    os.makedirs("results", exist_ok=True)

    path_parts = p.split('/')
    n_obs = path_parts[-4].split('_')[0]  # keeps the 'obs' prefix, e.g. 'obs100'
    dataset_name = path_parts[-7]
    feature_selection_rule = path_parts[-6]
    loss_key = path_parts[-3]

    print(n_obs, dataset_name, feature_selection_rule, loss_key)

    fold_key = path_parts[-2]

    # Inputs live in the sibling `scbasset_input/<n_obs>/<fold>` directory.
    input_dir = os.path.join(os.path.dirname(p),
                             '../../../../scbasset_input/%s/%s' % (n_obs, fold_key))
    ad_path = os.path.join(input_dir, 'ad.h5ad')
    seqs_path = os.path.join(input_dir, 'all_seqs.h5')
    train_seqs = os.path.join(input_dir, 'train_seqs.h5')
    val_seqs = os.path.join(input_dir, 'val_seqs.h5')
    test_seqs = os.path.join(input_dir, 'test_seqs.h5')
    splits_path = os.path.join(input_dir, 'splits.h5')
    print(os.path.exists(ad_path), ad_path)

    # read h5ad file
    ad = anndata.read_h5ad(ad_path)

    # load model
    model = make_model(32, ad.shape[0], show_summary=False)
    model.load_weights(p)

    # Prediction yields a dense matrix, so this will not scale to very large
    # datasets (memory).
    print('get sequences scores')
    all_seqs = seqs_path  # from the preprocess step
    # make tensorflow dataset to feed into model
    m = ad.X.tocoo().transpose().tocsr()
    n_cells = ad.shape[0]
    all_ds = tf.data.Dataset.from_generator(
        generator(all_seqs, m),
        output_signature=(
            tf.TensorSpec(shape=(1344, 4), dtype=tf.int8),
            tf.TensorSpec(shape=(n_cells,), dtype=tf.int8),
        ),
    ).batch(128).prefetch(tf.data.AUTOTUNE)

    # Transpose predictions so they line up with `ad.X` for the column indexing below.
    Y_pred = model.predict(all_ds).T

    # `.toarray()` instead of the deprecated sparse `.A` shortcut.
    Y_true = ad.X.toarray()

    # Column indices of the train / val / test splits.
    with h5py.File(splits_path, 'r') as hf:
        train_ids = hf['train_ids'][:]
        val_ids = hf['val_ids'][:]
        test_ids = hf['test_ids'][:]

    for next_ids, label in zip([train_ids, val_ids, test_ids], ['train', 'val', 'test']):

        y_true_split = Y_true[:, next_ids]
        y_pred_flat = Y_pred[:, next_ids].flatten()

        # Binarize the observed values (anything > 0 becomes 1) for ROC / PR.
        Y_true_roc = np.where(y_true_split > 0, 1, y_true_split)
        y_flatten = Y_true_roc.flatten()

        roc_auc, pr_auc, pr_auc_multi, roc_auc_multi = np.nan, np.nan, np.nan, np.nan
        try:
            roc_auc = roc_auc_score(y_flatten, y_pred_flat)
            pr_auc = average_precision_score(y_flatten, y_pred_flat)

            # Same metrics on hard (rounded) predictions.
            label_binarizer = LabelBinarizer().fit(y_flatten)
            y_onehot_test = label_binarizer.transform(y_flatten)
            pred_onehot = label_binarizer.transform(y_pred_flat.round())

            pr_auc_multi = average_precision_score(y_onehot_test, pred_onehot)
            roc_auc_multi = roc_auc_score(y_onehot_test, pred_onehot)
        except Exception as err:
            # Best effort: metrics that could not be computed stay NaN. Catching
            # Exception (not a bare except) lets Ctrl-C still interrupt the loop,
            # and the actual error is reported instead of being hidden.
            print('problem while calculating PR/AUC (maybe full zero mats)', repr(err))

        r2 = r2_score(y_true_split.flatten(), y_pred_flat)

        for metric, value in [('roc_auc', roc_auc),
                              ('roc_auc_multi', roc_auc_multi),
                              ('pr_auc', pr_auc),
                              ('pr_auc_multi', pr_auc_multi),
                              ('r2', r2)]:
            res.append([dataset_name, feature_selection_rule, loss_key, label,
                        metric, value, p, fold_key])

    res = pd.DataFrame(res, columns=['dataset', 'feat_selection', 'loss_key', 'group',
                                     'metric', 'value', 'model_path', 'fold_key'])
    # The on-disk table keeps its original columns ...
    res.to_csv(scores_path)

    # ... while the in-memory copy gets the same extra columns as the cached
    # branch, so first-run and re-run results are consistent after concatenation.
    res['n_obs'] = n_obs_label
    res['time'] = elapsed_seconds

    df.append(res)

/home/ilibarra/workspace/theislab/mubind/data/pbmc/random/scbasset_output/obs100_e10/poisson/fold_0/best_model.h5
True /home/ilibarra/workspace/theislab/mubind/data/pbmc/random/scbasset_output/obs100_e10/poisson/fold_0/best_model_scores.csv
/home/ilibarra/workspace/theislab/mubind/data/pbmc/random/scbasset_output/obs100_e10/poisson/fold_1/best_model.h5
True /home/ilibarra/workspace/theislab/mubind/data/pbmc/random/scbasset_output/obs100_e10/poisson/fold_1/best_model_scores.csv
/home/ilibarra/workspace/theislab/mubind/data/pbmc/random/scbasset_output/obs100_e10/poisson/fold_2/best_model.h5
True /home/ilibarra/workspace/theislab/mubind/data/pbmc/random/scbasset_output/obs100_e10/poisson/fold_2/best_model_scores.csv
/home/ilibarra/workspace/theislab/mubind/data/pbmc/random/scbasset_output/obs100_e10/poisson/fold_4/best_model.h5
True /home/ilibarra/workspace/theislab/mubind/data/pbmc/random/scbasset_output/obs100_e10/poisson/fold_4/best_model_scores.csv
/home/ilibarra/workspace/theislab/mu

2024-06-14 09:59:16.390979: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2024-06-14 09:59:16.391118: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2024-06-14 09:59:16.391144: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcublas.so.11'; dlerror: libcublas.so.11: cannot open shared object file: No such file or directory
2024-06-14 09:59:16.391161: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcublasLt.so.11'; dlerror: libcublasLt.so.11: cannot open shared object file: No such file or directory
2024-06-14 09:59:16.391177: W tensorflow/c

get sequences scores
/home/ilibarra/workspace/theislab/mubind/data/pbmc/episcanpy/scbasset_output/obs2000_e10/poisson/fold_3/best_model.h5
True /home/ilibarra/workspace/theislab/mubind/data/pbmc/episcanpy/scbasset_output/obs2000_e10/poisson/fold_3/best_model_scores.csv
/home/ilibarra/workspace/theislab/mubind/data/pbmc/episcanpy/scbasset_output/obs2000_e10/bce/fold_0/best_model.h5
True /home/ilibarra/workspace/theislab/mubind/data/pbmc/episcanpy/scbasset_output/obs2000_e10/bce/fold_0/best_model_scores.csv
/home/ilibarra/workspace/theislab/mubind/data/pbmc/episcanpy/scbasset_output/obs2000_e10/bce/fold_1/best_model.h5
True /home/ilibarra/workspace/theislab/mubind/data/pbmc/episcanpy/scbasset_output/obs2000_e10/bce/fold_1/best_model_scores.csv
/home/ilibarra/workspace/theislab/mubind/data/pbmc/episcanpy/scbasset_output/obs2000_e10/bce/fold_2/best_model.h5
True /home/ilibarra/workspace/theislab/mubind/data/pbmc/episcanpy/scbasset_output/obs2000_e10/bce/fold_2/best_model_scores.csv
/home/i

In [11]:
# Stack the per-model score tables into one frame and tag every row with the
# name of the method that produced it.
res = pd.concat(df).assign(model='scBasset')
print('')




In [12]:
# The 4th path component from the end is the run folder (e.g. `obs100_e10`).
# Its second underscore-separated token, minus the leading character, is
# parsed as the number of epochs.
run_dir = res['model_path'].str.split('/').str[-4]
epochs_token = run_dir.str.split('_').str[1]
res['n_epochs'] = epochs_token.str[1:].astype(int)
res

Unnamed: 0,dataset,feat_selection,loss_key,group,metric,value,model_path,fold_key,n_obs,time,model,n_epochs
0,pbmc,random,poisson,train,roc_auc,0.636187,/home/ilibarra/workspace/theislab/mubind/data/...,fold_0,100,30.751385,scBasset,10
1,pbmc,random,poisson,train,roc_auc_multi,0.525161,/home/ilibarra/workspace/theislab/mubind/data/...,fold_0,100,30.751385,scBasset,10
2,pbmc,random,poisson,train,pr_auc,0.107659,/home/ilibarra/workspace/theislab/mubind/data/...,fold_0,100,30.751385,scBasset,10
3,pbmc,random,poisson,train,pr_auc_multi,0.077117,/home/ilibarra/workspace/theislab/mubind/data/...,fold_0,100,30.751385,scBasset,10
4,pbmc,random,poisson,train,r2,-0.121580,/home/ilibarra/workspace/theislab/mubind/data/...,fold_0,100,30.751385,scBasset,10
...,...,...,...,...,...,...,...,...,...,...,...,...
10,noack_2022,episcanpy,bce,test,roc_auc,0.661295,/home/ilibarra/workspace/theislab/mubind/data/...,fold_3,2000,532.165629,scBasset,10
11,noack_2022,episcanpy,bce,test,roc_auc_multi,0.500000,/home/ilibarra/workspace/theislab/mubind/data/...,fold_3,2000,532.165629,scBasset,10
12,noack_2022,episcanpy,bce,test,pr_auc,0.014053,/home/ilibarra/workspace/theislab/mubind/data/...,fold_3,2000,532.165629,scBasset,10
13,noack_2022,episcanpy,bce,test,pr_auc_multi,0.002838,/home/ilibarra/workspace/theislab/mubind/data/...,fold_3,2000,532.165629,scBasset,10


In [None]:
# Write the combined score table to a CSV file in the current working directory.
res.to_csv('results_scbasset.csv')

In [None]:
# Number of score rows per dataset.
res['dataset'].value_counts()

pancreatic_endocrinogenesis    1500
pbmc                           1440
noack_2022                      750
Name: dataset, dtype: int64