In [31]:
import numpy as np
import scanpy as sc
import pandas as pd
import anndata as ad
import scib_metrics
from scib_metrics.benchmark import Benchmarker
import faiss
from scipy import sparse

In [32]:
def define_path(model_type, integration_type, base_dir="../results/scglue/point_nine_corr/"):
    """Build the results-directory prefix for one model/integration combination.

    Parameters
    ----------
    model_type : str
        Name of the model sub-directory (``main`` passes ``'unpaired'`` or
        ``'paired'``).
    integration_type : str
        Name of the integration sub-directory (``main`` passes ``'trimodal'``,
        ``'full'`` or ``'cite'``).
    base_dir : str, optional
        Root directory of the results tree. Defaults to the location that was
        previously hard-coded. A missing trailing ``/`` is added automatically;
        an empty string yields a path relative to the working directory.

    Returns
    -------
    str
        ``<base_dir>/<model_type>/<integration_type>/`` including the trailing
        slash.

    Raises
    ------
    TypeError
        If any component is not a string (plain ``+`` concatenation is used).
    """
    # The other helpers in this notebook append file names with plain string
    # concatenation, so every component must be terminated by a slash.
    if base_dir and not base_dir.endswith("/"):
        base_dir = base_dir + "/"
    return base_dir + model_type + "/" + integration_type + "/"

In [33]:
def load_adata(adata_path):
    """Read ``combined.h5ad`` located under ``adata_path`` and return it.

    ``adata_path`` is used as a plain string prefix: the file name is appended
    without inserting a separator, so the prefix must already end with one.
    The object returned is whatever ``ad.read_h5ad`` produces for that file.
    """
    print("loading trimodal adata..\n\n")
    h5ad_file = adata_path + "combined.h5ad"
    return ad.read_h5ad(h5ad_file)

In [34]:
def remove_obs_names_duplicates(combined):
    """Make the observation index of ``combined`` unique, in place.

    Every entry of ``combined.obs.index`` is renamed to ``"<name>_<position>"``
    (position counted from 0), whether or not it was a duplicate. The number
    of duplicated index entries is printed before and after the renaming.
    Nothing is returned.
    """
    def _report_duplicates():
        # Count the index entries that repeat an earlier entry.
        n_dupes = int(combined.obs.index.duplicated().sum())
        print("Number of duplicate observation names:", n_dupes)

    _report_duplicates()

    # Appending the running position guarantees uniqueness.
    renamed = []
    for position, name in enumerate(combined.obs.index):
        renamed.append(f"{name}_{position}")
    combined.obs.index = renamed

    _report_duplicates()

In [35]:
def compute_neighbours(combined, n_neighbors=15, use_rep='X_glue'):
    """Compute the neighbour graph of ``combined`` on a stored embedding.

    Thin wrapper around ``sc.pp.neighbors``. The return value of scanpy is not
    used; the rest of this notebook reads the resulting graph from
    ``combined.obsp['distances']`` / ``combined.obsp['connectivities']``.

    Parameters
    ----------
    combined
        Object passed straight to ``sc.pp.neighbors`` (an AnnData in this
        notebook).
    n_neighbors : int, optional
        Neighbourhood size forwarded to scanpy. Defaults to 15, the value that
        was previously hard-coded.
    use_rep : str, optional
        Name of the representation to build the graph on. Defaults to
        ``'X_glue'``, the key that was previously hard-coded.
    """
    print("computing neighbours on scglue latent space..\n\n")
    sc.pp.neighbors(combined, n_neighbors=n_neighbors, use_rep=use_rep)

In [36]:
def define_variables(combined):
    """Extract the inputs needed by ``run_metrics`` from ``combined``.

    Returns
    -------
    tuple
        ``(X, labels, batches, distances_nn, connectivities)`` where

        * ``X`` is ``combined.obsm['X_glue']``,
        * ``labels`` are the integer category codes of
          ``combined.obs['Annotation_major_subset']``,
        * ``batches`` are the integer category codes of
          ``combined.obs['Domain']``,
        * ``distances_nn`` / ``connectivities`` are the entries of the same
          name in ``combined.obsp``.

    Both ``obs`` columns must be categorical (``.cat`` is used), and the two
    ``obsp`` entries must already exist when this function is called.
    """
    print("defining variables..\n\n")

    embedding = combined.obsm['X_glue']

    # Category codes give one integer per cell for the label and batch columns.
    label_codes = combined.obs['Annotation_major_subset'].cat.codes.to_numpy()
    batch_codes = combined.obs['Domain'].cat.codes.to_numpy()

    knn_distances = combined.obsp['distances']
    knn_connectivities = combined.obsp['connectivities']

    return embedding, label_codes, batch_codes, knn_distances, knn_connectivities

In [37]:
def run_metrics(X, labels, batches, distances_nn, connectivities):
    """Compute the enabled scib-metrics integration scores and print them.

    Parameters
    ----------
    X
        Embedding, passed as ``X=`` to
        ``scib_metrics.utils.principal_component_regression``.
    labels
        Per-cell label codes, passed as ``labels=`` to the label-conservation
        metrics.
    batches
        Per-cell batch codes, passed as ``covariate=`` / ``batches=`` to the
        batch-effect metrics.
    distances_nn
        Neighbour distance graph, passed as ``X=`` to ``ilisi_knn``, ``kbet``
        and ``clisi_knn``.
    connectivities
        Neighbour connectivity graph, passed as ``X=`` to
        ``nmi_ari_cluster_labels_leiden``.

    Returns
    -------
    tuple of (dict, dict)
        ``(batch_effect_metrics, label_conserv_metrics)``. The first has the
        keys ``'pcr'``, ``'ilisi_knn'`` and ``'kbet'``; the second has
        ``'nmi_ari_leiden'`` and ``'clisi_knn'``. Values are whatever the
        corresponding scib_metrics function returned.

    Notes
    -----
    Graph connectivity and the silhouette metrics are deliberately not
    computed here, so they are not reported either. Only names that are
    actually assigned in this function may be placed in the result
    dictionaries.
    """
    print("running metrics..\n\n")

    # --- removal of batch effects -------------------------------------
    print("running pcr..\n\n")
    pcr = scib_metrics.utils.principal_component_regression(X=X, covariate=batches)
    print("running ilisi knn..\n\n")
    # NOTE: this call has been observed to raise
    # "ValueError: Each cell must have the same number of neighbors."
    # so distances_nn must store the same number of neighbours for every cell.
    ilisi_knn = scib_metrics.ilisi_knn(X=distances_nn, batches=batches)
    print("running kbet..\n\n")
    kbet = scib_metrics.kbet(X=distances_nn, batches=batches)

    # --- conservation of variance from cell identity labels -----------
    print("running nmi, ari with leiden..\n\n")
    nmi_ari_leiden = scib_metrics.nmi_ari_cluster_labels_leiden(X=connectivities, labels=labels)
    print("running clisi knn..\n\n")
    clisi_knn = scib_metrics.clisi_knn(X=distances_nn, labels=labels)

    print("generating batch effect metrics dictionary.. \n\n")
    batch_effect_metrics = {
        'pcr': pcr,
        'ilisi_knn': ilisi_knn,
        'kbet': kbet,
    }
    print("generating label conservation metrics dictionary.. \n\n")
    label_conserv_metrics = {
        'nmi_ari_leiden': nmi_ari_leiden,
        'clisi_knn': clisi_knn,
    }

    print("batch effect metrics for current integration coefficient:")
    print(batch_effect_metrics)
    print("label conservation metrics for current integration coefficient:")
    print(label_conserv_metrics)

    return batch_effect_metrics, label_conserv_metrics

In [38]:
def save_metrics(adata_path, batch_effect_metrics, label_conserv_metrics):
    """Write both metric dictionaries to a text report under ``adata_path``.

    The report is written to
    ``adata_path + "integration_metrics_Domain_AnnMajorSubset.txt"``
    (``adata_path`` is a plain string prefix and must already end with a path
    separator). An existing file is overwritten.

    Parameters
    ----------
    adata_path : str
        Directory prefix the report file name is appended to.
    batch_effect_metrics : dict
        Written first, one ``key: value`` line per entry.
    label_conserv_metrics : dict
        Written second, in the same format, after a blank separator line.
    """
    with open(adata_path+"integration_metrics_Domain_AnnMajorSubset.txt", "w") as f:
        f.write("batch_effect_metrics:\n\n")
        for key, value in batch_effect_metrics.items():
            f.write(f"{key}: {value}\n")
        # The blank line separating the two sections belongs in the report
        # itself, not on stdout.
        f.write("\n")
        f.write("label conservation metrics:\n\n")
        for key, value in label_conserv_metrics.items():
            f.write(f"{key}: {value}\n")

In [39]:
def main(model_types = ['unpaired', 'paired'],
         integration_types = ['trimodal', 'full', 'cite']):
    """Compute and save integration metrics for every model/integration pair.

    For each combination of ``model_types`` x ``integration_types`` the
    pipeline is: resolve the results path, load ``combined.h5ad``, make the
    observation names unique, build the neighbour graph, extract the metric
    inputs, run the metrics and write the report next to the input file.

    Parameters
    ----------
    model_types : iterable of str, optional
        Model sub-directories to process.
    integration_types : iterable of str, optional
        Integration sub-directories to process for every model type.
    """
    print("analysis starting..\n\n")
    for model_type in model_types: #run both paired and unpaired
        for integration_type in integration_types: #run full, cite-only, and trimodal models
            print(f"Computing metrics for model type:'{model_type}' and integration type '{integration_type}'\n\n")
            adata_path = define_path(model_type, integration_type)
            combined = load_adata(adata_path)
            remove_obs_names_duplicates(combined)
            # The neighbour graph has to be built BEFORE define_variables reads
            # obsp['distances'] / obsp['connectivities']; otherwise the metrics
            # would run on whatever graph happened to be stored in the file
            # (or fail because none is present).
            compute_neighbours(combined)
            X, labels, batches, distances_nn, connectivities = define_variables(combined)
            batch_effect_metrics, label_conserv_metrics = run_metrics(X, labels, batches, distances_nn, connectivities)
            save_metrics(adata_path, batch_effect_metrics, label_conserv_metrics)

    print("analysis finished")

In [40]:
# Run the whole pipeline with main()'s default model and integration types.
main()

analysis starting..


Computing metrics for model type:'unpaired' and integration type 'trimodal'


loading trimodal adata..




  utils.warn_names_duplicates("obs")


Number of duplicate observation names: 131920
Number of duplicate observation names: 0
defining variables..


computing neighbours on scglue latent space..


running metrics..


running pcr..


running ilisi knn..




ValueError: Each cell must have the same number of neighbors.