In [1]:
import numpy as np
import pandas as pd
import hnswlib
from scipy.sparse import csr_matrix
import igraph as ig
import leidenalg
import time
import os 
import sys
import louvain

  import louvain


In [52]:
def _two_column_frame(df):
    """Return a copy of the first two columns of ``df`` named 'Barcode' and 'labels'."""
    if df.shape[1] < 2:
        raise ValueError(
            'label DataFrames need at least two columns: sample ids and cluster labels')
    # Copy so that later column assignments never touch the caller's frame.
    out = df.iloc[:, :2].copy()
    out.columns = ['Barcode', 'labels']
    return out


def _plain_list(values):
    """Turn a list / numpy array / pandas Series into a plain Python list."""
    return values.tolist() if hasattr(values, 'tolist') else list(values)


def _factorized_list(values):
    """Encode arbitrary labels as integer codes in order of first appearance."""
    # dtype=object keeps mixed values (e.g. 1 and '1') distinct instead of
    # letting numpy coerce everything to strings.
    return pd.factorize(np.asarray(values, dtype=object))[0].tolist()


def calculate_metrics(ref_label, query_labels, experiment_name='clusterEval'):
    """Normalise a reference labelling and several query labellings to integer ids.

    NOTE: Stability and Purity are not computed yet; the function currently
    stops after normalising its inputs and returns them.

    :param ref_label: Reference cluster labels. Either a vector-like object
        (one label per sample, samples identified by position) or a pandas
        DataFrame whose first column holds sample ids and whose second column
        holds cluster labels. Extra DataFrame columns are ignored.
    :type ref_label: List/vector OR pandas DataFrame
    :param query_labels: Labellings to compare against ``ref_label``. Must use
        the same format as ``ref_label``: a list of vectors (each as long as
        ``ref_label``) or a list of two-column DataFrames.
    :type query_labels: List of Lists or List of Pandas DataFrames
    :param experiment_name: name to assign current experiment (currently unused)
    :type experiment_name: str
    :returns: ``(ref, queries)`` where ``ref`` is a dict with keys ``'Barcode'``
        and ``'labels'`` (plain lists) and ``queries`` is a list of such dicts.
        Non-integer sample ids are replaced by the sample's position in the
        reference (query rows whose id is absent from the reference are
        dropped). If the reference labels are not integers, the reference and
        every query are each re-coded independently with ``pd.factorize``
        (order of first appearance; missing labels become -1).
    :rtype: tuple(dict, list of dict)
    :raises ValueError: if a DataFrame has fewer than two columns, or if a
        query vector's length differs from the reference vector's length.
    """
    if isinstance(ref_label, pd.DataFrame):
        if ref_label.shape[1] > 2:
            print('DF longer than 2: extra columns will be ignored')

        ref_label = _two_column_frame(ref_label)
        query_labels = [_two_column_frame(df) for df in query_labels]

        # is_integer_dtype is platform independent, unlike comparing against
        # the string 'int' (which is 32-bit on some numpy/OS combinations).
        is_integer = pd.api.types.is_integer_dtype
        label_converted = not is_integer(ref_label['labels'])

        if not is_integer(ref_label['Barcode']):
            # Map every reference barcode to its row position, then translate
            # the query barcodes through the same table.
            id_conv_df = pd.DataFrame({
                'Barcode': ref_label['Barcode'].to_numpy(),
                'new_bc': np.arange(ref_label.shape[0]),
            })
            ref_label['Barcode'] = id_conv_df['new_bc'].to_numpy()
            query_labels = [
                df.merge(id_conv_df, how='inner', on='Barcode')
                  .drop(columns=['Barcode'])
                  .rename(columns={'new_bc': 'Barcode'})[['Barcode', 'labels']]
                for df in query_labels
            ]

        if label_converted:
            ref_label = ref_label.assign(
                labels=pd.factorize(ref_label['labels'])[0])
            ## query labels are independent from ref labels, so just convert to numeric
            query_labels = [df.assign(labels=pd.factorize(df['labels'])[0])
                            for df in query_labels]

        return ref_label.to_dict('list'), [df.to_dict('list') for df in query_labels]

    ## list mode: samples are identified by their position in the vector
    n_samples = len(ref_label)
    for ql in query_labels:
        if len(ql) != n_samples:
            raise ValueError(
                'every query label vector must have the same length as ref_label')

    # Inspect the data itself; an empty vector is treated as non-integer and
    # simply factorizes to an empty list.
    is_int = np.issubdtype(np.asarray(ref_label).dtype, np.integer)
    encode = _plain_list if is_int else _factorized_list

    ref_label = {'Barcode': list(range(n_samples)), 'labels': encode(ref_label)}
    query_labels = [{'Barcode': list(range(n_samples)), 'labels': encode(ql)}
                    for ql in query_labels]
    return ref_label, query_labels
        

In [58]:
# Smoke-test fixtures for calculate_metrics in DataFrame mode.
# Case 1: integer sample ids and integer cluster labels.
all_int_ref_labels_df = pd.DataFrame(
    {'x': list(range(1, 7)), 'y': [0, 0, 0, 1, 1, 2]}
)
all_int_query_labels_df = [
    pd.DataFrame({'x': list(range(1, 7)), 'y': [0, 0, 0, 1, 1, 6]})
    for _ in range(3)
]
# Case 2: string sample ids and string labels; each query only covers a
# subset of the reference sample ids.
all_string_ref_labels_df = pd.DataFrame(
    {'x': list('123456'), 'y': ['0', '0', '0', '1', 'C', 'D']}
)
all_string_query_labels_df = [
    pd.DataFrame({'x': list('246'), 'y': ['0', '1', 'B']})
    for _ in range(3)
]
# Only the value of the last expression is rendered as the cell output, so
# the result of the integer case is computed but not displayed.
calculate_metrics(all_int_ref_labels_df, all_int_query_labels_df)
calculate_metrics(all_string_ref_labels_df, all_string_query_labels_df)

({'Barcode': [0, 1, 2, 3, 4, 5], 'labels': [0, 0, 0, 1, 2, 3]},
 [{'Barcode': [1, 3, 5], 'labels': [0, 1, 2]},
  {'Barcode': [1, 3, 5], 'labels': [0, 1, 2]},
  {'Barcode': [1, 3, 5], 'labels': [0, 1, 2]}])