In [15]:
import helper_fns
import pandas as pd
import numpy as np
import scanpy as sc
from sklearn.preprocessing import LabelEncoder
import joblib
import ray

In [2]:
# Load the CellTypist dataset and restrict it to the cell types kept for this analysis:
# first drop five explicitly excluded types, then anything annotated as doublets.
dataset_celltypist = sc.read("/Volumes/SSD/global.h5ad")
list_celltypes = [
    ct
    for ct in dataset_celltypist.obs['Manually_curated_celltype'].unique().tolist()
    if ct not in ['Mast cells', 'pDC','Progenitor', 'Erythroid', 'Megakaryocytes']
]
list_filtered_celltypes = [ct for ct in list_celltypes if 'doublets' not in ct]
dataset_celltypist = dataset_celltypist[
    dataset_celltypist.obs['Manually_curated_celltype'].isin(list_filtered_celltypes)
]

# Alternative inputs (currently disabled):
# dataset_celltypist = sc.read("./pre_processed_datasets/celltypist_pca.h5ad")
# dataset_popv = sc.read("./pre_processed_datasets/popv_immune_pca.h5ad")
# list_celltypes = dataset_celltypist.obs['Manually_curated_celltype'].unique().tolist()

# Leaf labels: the cell types that remain after filtering, plus their encoded ids.
encoder_celltype = LabelEncoder().fit(dataset_celltypist.obs['Manually_curated_celltype'])
list_ct = dataset_celltypist.obs['Manually_curated_celltype'].unique().tolist()
list_num_ct = encoder_celltype.transform(list_ct)

# Inner (non-leaf) nodes of the hierarchy get their own encoder.
list_inner_nodes = [
    'Cross-tissue Immune Cell Atlas', 'B cell', 'Germinal center B cell', 'Myeloid', 'Dendritic cell',
    'Macrophages', 'Monocytes', 'T & Innate lymphoid cells', 'CD4', 'T Naive', 'CD8',
    'Tissue-resident memory T (Trm) cells', 'NK',
]
encoder_celltype_inner = LabelEncoder().fit(list_inner_nodes)
all_nodes = list_ct + list_inner_nodes

# encoder_celltype_popv = LabelEncoder()
# encoder_celltype_popv.fit(dataset_popv.obs['cell_type'])

# Build the hierarchy graph and the pairwise distance table between leaf cell types.
graph_celltypist = helper_fns.build_hierarchical_tree_celltypist(
    all_nodes=all_nodes,
    list_ct=list_ct,
    list_inner_nodes=list_inner_nodes,
    encoder_celltype=encoder_celltype,
    encoder_celltype_inner=encoder_celltype_inner,
)
distance_matrix_celltypist = helper_fns.get_dist_df(list_num_ct=list_num_ct, g=graph_celltypist)

# Relabel rows and columns with whatever helper_fns.inverse_transform returns for each
# index entry (presumably the cell-type name for an encoded id -- confirm in helper_fns).
celltypes = [
    helper_fns.inverse_transform(
        distance_matrix_celltypist.index[i],
        list_ct=list_ct,
        encoder_celltype=encoder_celltype,
        encoder_celltype_inner=encoder_celltype_inner,
    )
    for i in range(len(list_ct))
]
distance_matrix_celltypist.index = celltypes
distance_matrix_celltypist.columns = celltypes

In [21]:
# Sanity check: look up the distance from 'Trm_Th1/Th17' to itself in the relabelled matrix.
distance_matrix_celltypist.loc['Trm_Th1/Th17', 'Trm_Th1/Th17']

0

In [25]:
# Print every distinct value of `pr` that has no row in the distance matrix.
# NOTE(review): `pr` is not defined in any visible cell -- it must exist in the kernel already.
known_labels = distance_matrix_celltypist.index.tolist()
for i in pr.unique().tolist():
    if i in known_labels:
        continue
    print(i)

In [17]:
# Load the saved results dictionary (original note: "results_0908"). Later cells index it
# by keys containing 'fold_<n>' to fetch the predicted labels for a model/fold.
# NOTE(review): joblib.load unpickles arbitrary objects -- only load files you trust.
results_dict = joblib.load('./results_celltypist_cv/results_celltypist_dict.pickle')

In [18]:
# Ground-truth label for every cell, in dataset order.
celltypes_celltypist = dataset_celltypist.obs['Manually_curated_celltype'].values

all_models = ['KNN', 'Logistic Regression', 'Net', 'Proto_Net', 'Proto_Net+pl', 'Proto_Net+disto', 'Proto_Net+disto_pl']

# Saved cross-validation splits; each entry is indexed below by its 'test' key.
cv = joblib.load('results_celltypist_cv/cv.pkl')

# Group the result keys by the fold tag embedded in each key.
all_keys = results_dict.keys()
keys_fold_0, keys_fold_1, keys_fold_2, keys_fold_3, keys_fold_4 = (
    [key for key in all_keys if f'fold_{fold}' in key] for fold in range(5)
)

# True labels of the test cells for each of the five folds.
(
    true_labels_fold_0,
    true_labels_fold_1,
    true_labels_fold_2,
    true_labels_fold_3,
    true_labels_fold_4,
) = (celltypes_celltypist[cv[fold]['test']] for fold in range(5))

In [None]:
@ray.remote
def get_distance_matrix_celltypist(model_name, true_labels, keys_fold, fold_str, fold_num):
    """Collect the tree distance between true and predicted labels for one fold.

    For every key in ``keys_fold`` the predictions are read from the global
    ``results_dict`` and each (true, predicted) pair is looked up in the global
    ``distance_matrix_celltypist`` table.

    Parameters
    ----------
    model_name
        Currently unused; kept so existing call sites keep working. The model
        label of each row is derived from the result key instead.
    true_labels
        Indexable collection of true labels, aligned with the predictions
        stored under each key.
    keys_fold
        Iterable of ``results_dict`` keys belonging to this fold.
    fold_str
        Fold suffix inside each key (e.g. ``'_fold_0'``); the text before it
        is used as the model label.
    fold_num
        Value written to the ``fold`` column.

    Returns
    -------
    pandas.DataFrame
        One row per prediction with columns ``model``, ``fold``, ``distance``.

    Raises
    ------
    KeyError
        If a key is missing from ``results_dict`` or a label is missing from
        the distance table.
    """
    # Accumulate plain tuples and build the frame once: concatenating a frame per
    # row is quadratic. The accumulator also must not reuse the name of the global
    # distance table, otherwise the lookups below would hit an empty frame.
    records = []
    for key in keys_fold:
        pred_labels = results_dict[key]
        # Label rows with the model of *this* key, not of the first key in the fold.
        model = key.split(fold_str)[0]
        for i in range(len(pred_labels)):
            dist = distance_matrix_celltypist.loc[true_labels[i], pred_labels[i]]
            records.append((model, fold_num, dist))

    return pd.DataFrame(records, columns=['model', 'fold', 'distance'])

In [26]:
# Model name of the first fold-0 key: the part of the key before the '_fold_0' suffix.
keys_fold_0[0].split('_fold_0')[0]

'proto_disto_pl'

In [29]:
# List the model name encoded in each fold-0 result key (the text before '_fold_0').
# The bare print() only emitted blank lines, which does not match the recorded output.
for key in keys_fold_0:
    print(key.split('_fold_0')[0])

proto_disto_pl
proto_disto
proto_pl
proto
Net
logistic
knn


In [None]:
# Build df_dist: one row per (model, fold, test cell) holding the hierarchy distance
# between the true label and the model's predicted label.
fold_inputs = [
    ('fold_0', keys_fold_0, true_labels_fold_0),
    ('fold_1', keys_fold_1, true_labels_fold_1),
    ('fold_2', keys_fold_2, true_labels_fold_2),
    ('fold_3', keys_fold_3, true_labels_fold_3),
    ('fold_4', keys_fold_4, true_labels_fold_4),
]

# Rows are gathered in a list and turned into a frame once at the end; growing a
# DataFrame with pd.concat per row copies the whole frame every time (quadratic).
records = []
for fold_name, keys_fold, true_labels in fold_inputs:
    print(fold_name.replace('_', ' '))
    fold_suffix = '_' + fold_name
    for key in keys_fold:
        pred_labels = results_dict[key]
        # Use the model of the current key; taking it from keys_fold[0] labelled
        # every row of the fold with the first model's name.
        model = key.split(fold_suffix)[0]
        for i in range(len(pred_labels)):
            dist = distance_matrix_celltypist.loc[true_labels[i], pred_labels[i]]
            records.append((model, fold_name, dist))

df_dist = pd.DataFrame(records, columns=['model', 'fold', 'distance'])

In [5]:
# Load the popv immune dataset and remove the one excluded cell type.
dataset_popv = sc.read("/Volumes/SSD/popv_immune.h5ad")
dataset_popv = dataset_popv[
    dataset_popv.obs['cell_type'] != 'double-positive, alpha-beta thymocyte'
]
list_celltypes = dataset_popv.obs['cell_type'].unique().tolist()

# Leaf labels and their encoded ids.
# NOTE(review): these names (encoder_celltype, list_ct, ...) overwrite the celltypist
# versions defined in an earlier cell.
encoder_celltype = LabelEncoder().fit(dataset_popv.obs['cell_type'])
list_ct = dataset_popv.obs['cell_type'].unique().tolist()
list_num_ct = encoder_celltype.transform(list_ct)

# Inner (non-leaf) nodes of the popv hierarchy, with their own encoder.
list_inner_nodes = ['popv_immune', 'myeloid leukocyte', 'mature B cell', 'NK', 'CD4', 'CD8']
encoder_celltype_inner = LabelEncoder().fit(list_inner_nodes)
all_nodes = list_ct + list_inner_nodes

# encoder_celltype_popv = LabelEncoder()
# encoder_celltype_popv.fit(dataset_popv.obs['cell_type'])

graph_popv = helper_fns.build_hierarchical_tree_popv_immune(
    all_nodes=all_nodes,
    list_ct=list_ct,
    list_inner_nodes=list_inner_nodes,
    encoder_celltype=encoder_celltype,
    encoder_celltype_inner=encoder_celltype_inner,
)