### List datasets that will be used for training with scBasset

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import scbasset

2024-06-03 12:50:21.603971: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F AVX512_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-06-03 12:50:21.824487: I tensorflow/core/util/port.cc:104] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-06-03 12:50:21.830174: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2024-06-03 12:50:21.830195: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore 

In [3]:
cd ~/workspace/theislab/mubind-pipeline/notebooks/pipeline

/mnt/c/Users/IgnacioIbarra/Dropbox/workspace/theislab/mubind-pipeline/notebooks/pipeline


In [4]:
# read scbasset
import tensorflow as tf
import scanpy as sc
from scbasset.utils import *
from scbasset.basenji_utils import *
from scbasset.utils import *
import pandas as pd
import glob
import os
import anndata
import pickle
import datetime



### Load scBasset models and score with them

In [5]:
rootdir = '/mnt/f/workspace/theislab/mubind/data/*/*/scbasset_output/obs*/*/*'

df = []

overwrite = False
for p in glob.glob(rootdir):

    # print(p)
    res = []
    if not os.path.exists(p):
        continue
    if not 'best_model.h5' in p:
        continue


    runtime_path = p.replace('best_model.h5', 'running_time.pkl')
    time_diff = pickle.load(open(runtime_path, 'rb')) if os.path.exists(runtime_path) else None

    # print(time_diff, type(time_diff))
    if not isinstance(time_diff, datetime.timedelta):
        # print('time log not ok....skip')
        continue

    # continue

    scores_path = p.replace('.h5', '_scores.csv')
    print(os.path.exists(scores_path), scores_path)
    if os.path.exists(scores_path) and not overwrite:
        # print('here...')
        df2 = pd.read_csv(scores_path, index_col=0)
        df2['n_obs'] = scores_path.split('/')[-3].replace('obs', '').split('_')[0]
        # print(time_diff)
        df2['time'] = time_diff.total_seconds()
        df.append(df2)
        continue
    

    os.makedirs("results", exist_ok=True)

    n_obs = p.split('/')[-3].split('_')[0]
    dataset_name = p.split('/')[-6]
    feature_selection_rule = p.split('/')[-5]
    loss_key = p.split('/')[-2]
    
    ad_path = os.path.join(os.path.dirname(p),
                            '../../../scbasset_input/%s/ad.h5ad' % n_obs)
    seqs_path = os.path.join(os.path.dirname(p),
                                '../../../scbasset_input/%s/all_seqs.h5' % n_obs)
    train_seqs = os.path.join(os.path.dirname(p),
                                '../../../scbasset_input/%s/train_seqs.h5' % n_obs)
    val_seqs = os.path.join(os.path.dirname(p),
                                '../../../scbasset_input/%s/val_seqs.h5' % n_obs)
    test_seqs = os.path.join(os.path.dirname(p),
                                '../../../scbasset_input/%s/test_seqs.h5' % n_obs)
    splits_path = os.path.join(os.path.dirname(p),
                                '../../../scbasset_input/%s/splits.h5' % n_obs)
    print(os.path.exists(ad_path), ad_path)

    # read h5ad file
    ad = anndata.read_h5ad(ad_path)

    # load model
    model = make_model(32, ad.shape[0], show_summary=False)
    model.load_weights(p)

    # since imputation generates a dense cell by peak matrix as output.
    # it won't scale to very large dataset due to memory issue.
    # predict scores
    print('get sequences scores')
    all_seqs = seqs_path # from the preprocess step
    # make tesorflow dataset to feed into model
    m = ad.X.tocoo().transpose().tocsr()
    n_cells = ad.shape[0]
    all_ds = tf.data.Dataset.from_generator(
        generator(all_seqs, m), 
        output_signature=(
                tf.TensorSpec(shape=(1344,4), dtype=tf.int8),
                tf.TensorSpec(shape=(n_cells), dtype=tf.int8),
        )
    ).batch(128).prefetch(tf.data.AUTOTUNE)
    
    # Y_impute = imputation_Y_normalize(all_ds, model)
    Y_pred = model.predict(all_ds)
    Y_pred = Y_pred.T

    Y_true = ad.X.A

    # convert to csr matrix
    import h5py
    with h5py.File(splits_path, 'r') as hf:
        train_ids = hf['train_ids'][:]
        val_ids = hf['val_ids'][:]
        test_ids = hf['test_ids'][:]

    import matplotlib.pyplot as plt
    from sklearn.metrics import r2_score, roc_auc_score
    from sklearn.metrics import average_precision_score
    # plt.scatter(Y_true.A.flatten(), Y_pred.flatten())


    for next_ids, label in zip([train_ids, val_ids, test_ids], ['train', 'val', 'test']):

        # roc auc

        Y_true_roc = np.where(Y_true[:,next_ids] > 0, 1, Y_true[:,next_ids])
        
        roc_auc, pr_auc = np.nan, np.nan
        try:
            roc_auc = roc_auc_score(Y_true_roc.flatten(), Y_pred[:,next_ids].flatten())
            pr_auc = average_precision_score(Y_true_roc.flatten(), Y_pred[:,next_ids].flatten())
        except:
            print('problem while calculating PR/AUC (maybe full zero mats)')

        res.append([dataset_name, feature_selection_rule, loss_key, label, 'roc_auc', roc_auc, p])
        res.append([dataset_name, feature_selection_rule, loss_key, label, 'pr_auc', pr_auc, p])
        
        r2 = r2_score(Y_true[:,next_ids].flatten(), Y_pred[:,next_ids].flatten())
        res.append([dataset_name, feature_selection_rule, loss_key, label, 'r2', r2, p])        
        
    
    res = pd.DataFrame(res, columns=['dataset', 'feat_selection', 'loss_key', 'group', 'metric', 'value', 'model_path'])
    res.to_csv(scores_path)

    df.append(res)

True /mnt/f/workspace/theislab/mubind/data/noack_2022/random/scbasset_output/obs500_e10/poisson/best_model_scores.csv
True /mnt/f/workspace/theislab/mubind/data/noack_2022/random/scbasset_output/obs500_e10/bce/best_model_scores.csv
True /mnt/f/workspace/theislab/mubind/data/noack_2022/random/scbasset_output/obs2000_e10/poisson/best_model_scores.csv
True /mnt/f/workspace/theislab/mubind/data/noack_2022/random/scbasset_output/obs2000_e10/bce/best_model_scores.csv
True /mnt/f/workspace/theislab/mubind/data/noack_2022/random/scbasset_output/obs5000_e10/poisson/best_model_scores.csv
True /mnt/f/workspace/theislab/mubind/data/noack_2022/random/scbasset_output/obs5000_e10/bce/best_model_scores.csv
True /mnt/f/workspace/theislab/mubind/data/noack_2022/random/scbasset_output/obs10000_e10/poisson/best_model_scores.csv
True /mnt/f/workspace/theislab/mubind/data/noack_2022/random/scbasset_output/obs1000_e1/poisson/best_model_scores.csv
True /mnt/f/workspace/theislab/mubind/data/noack_2022/random/s

2024-06-03 12:51:02.180884: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:967] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2024-06-03 12:51:02.181154: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2024-06-03 12:51:02.181298: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcublas.so.11'; dlerror: libcublas.so.11: cannot open shared object file: No such file or directory
2024-06-03 12:51:02.181373: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcublasLt.so.11'; dlerror: libcublasLt.so.11: cannot open shared object file: No such file or directory
2024-06-03 12:51:02.181446: W tensorf

get sequences scores
False /mnt/f/workspace/theislab/mubind/data/pbmc/random/scbasset_output/obs100_e1/bce/best_model_scores.csv
True /mnt/f/workspace/theislab/mubind/data/pbmc/random/scbasset_output/obs100_e1/bce/../../../scbasset_input/obs100/ad.h5ad
get sequences scores
False /mnt/f/workspace/theislab/mubind/data/pbmc/random/scbasset_output/obs100_e10/poisson/best_model_scores.csv
True /mnt/f/workspace/theislab/mubind/data/pbmc/random/scbasset_output/obs100_e10/poisson/../../../scbasset_input/obs100/ad.h5ad
get sequences scores
False /mnt/f/workspace/theislab/mubind/data/pbmc/random/scbasset_output/obs100_e10/bce/best_model_scores.csv
True /mnt/f/workspace/theislab/mubind/data/pbmc/random/scbasset_output/obs100_e10/bce/../../../scbasset_input/obs100/ad.h5ad
get sequences scores
False /mnt/f/workspace/theislab/mubind/data/pbmc/random/scbasset_output/obs100_e20/poisson/best_model_scores.csv
True /mnt/f/workspace/theislab/mubind/data/pbmc/random/scbasset_output/obs100_e20/poisson/../..

In [6]:
res = pd.concat(df)
res['model'] = 'scBasset'
print('')




In [7]:
res['n_epochs'] = res['model_path'].str.split('/').str[-3].str.split('_').str[1].str[1:].astype(int)

In [8]:
res.to_csv('results_scbasset.csv')

In [9]:
res['dataset'].value_counts()

pancreatic_endocrinogenesis    675
pbmc                           468
noack_2022                     315
Name: dataset, dtype: int64

In [10]:
res

Unnamed: 0,dataset,feat_selection,loss_key,group,metric,value,model_path,n_obs,time,model,n_epochs
0,noack_2022,random,poisson,train,roc_auc,,/mnt/f/workspace/theislab/mubind/data/noack_20...,500,187.031928,scBasset,10
1,noack_2022,random,poisson,train,pr_auc,,/mnt/f/workspace/theislab/mubind/data/noack_20...,500,187.031928,scBasset,10
2,noack_2022,random,poisson,train,r2,0.075192,/mnt/f/workspace/theislab/mubind/data/noack_20...,500,187.031928,scBasset,10
3,noack_2022,random,poisson,val,roc_auc,0.698906,/mnt/f/workspace/theislab/mubind/data/noack_20...,500,187.031928,scBasset,10
4,noack_2022,random,poisson,val,pr_auc,0.194494,/mnt/f/workspace/theislab/mubind/data/noack_20...,500,187.031928,scBasset,10
...,...,...,...,...,...,...,...,...,...,...,...
4,pbmc,episcanpy,bce,val,pr_auc,0.568809,/mnt/f/workspace/theislab/mubind/data/pbmc/epi...,,,scBasset,20
5,pbmc,episcanpy,bce,val,r2,-0.027791,/mnt/f/workspace/theislab/mubind/data/pbmc/epi...,,,scBasset,20
6,pbmc,episcanpy,bce,test,roc_auc,0.603354,/mnt/f/workspace/theislab/mubind/data/pbmc/epi...,,,scBasset,20
7,pbmc,episcanpy,bce,test,pr_auc,0.588223,/mnt/f/workspace/theislab/mubind/data/pbmc/epi...,,,scBasset,20
