# Use Case: HLCA Datasets

In [1]:
import sys

# Add the directory two levels above the notebook to the import path
# (presumably so the local checkout of the package can be imported - TODO confirm).
# The membership test keeps re-running this cell from stacking duplicate entries.
if "../.." not in sys.path:
    sys.path.append("../..")

In [None]:
import os
import time
import scanpy as sc
import idtrack
import pickle

In [None]:
# Enable IPython's autoreload extension; mode 2 re-imports modules before each
# cell executes, so edits to imported source files take effect without a restart.
%load_ext autoreload
%autoreload 2

Initialize the graph and pathfinder

In [4]:
# Directory handed to idtrack as its local repository; the result pickles
# written further down are stored here as well.
local_dir = "/lustre/groups/ml01/workspace/kemal.inecik/idtrack_temp"

idt = idtrack.API(local_repository=local_dir)
idt.configure_logger()

# Human graph for Ensembl release 107.
# NOTE(review): the effect of `return_test=True` is not visible here - confirm
# against the idtrack API documentation.
idt.initialize_graph(
    organism_name="homo_sapiens",
    ensembl_release=107,
    return_test=True,
)

2023-01-29 23:26:20 INFO:graph_maker: The graph is being read.


In [5]:
# Compute the graph's cached properties now; the log lines below name each one.
idt.calculate_graph_caches()

2023-01-29 23:27:23 INFO:the_graph: Cached properties being calculated: combined_edges
2023-01-29 23:28:07 INFO:the_graph: Cached properties being calculated: combined_edges_assembly_specific_genes
2023-01-29 23:28:11 INFO:the_graph: Cached properties being calculated: combined_edges_genes
2023-01-29 23:28:24 INFO:the_graph: Cached properties being calculated: lower_chars_graph
2023-01-29 23:28:25 INFO:the_graph: Cached properties being calculated: get_active_ranges_of_id
2023-01-29 23:28:48 INFO:the_graph: Cached properties being calculated: available_external_databases
2023-01-29 23:28:50 INFO:the_graph: Cached properties being calculated: external_database_connection_form
2023-01-29 23:29:38 INFO:the_graph: Cached properties being calculated: available_genome_assemblies
2023-01-29 23:29:39 INFO:the_graph: Cached properties being calculated: available_external_databases_assembly
2023-01-29 23:29:41 INFO:the_graph: Cached properties being calculated: node_trios


The datasets of HLCA

In [6]:
# Locations of the HLCA extension datasets on the shared file system.
base_path = "/lustre/groups/ml01/workspace/hlca_lisa.sikkema_malte.luecken/HLCA_reproducibility/data"
dset0_dir = os.path.join(base_path, "HLCA_extended/extension_datasets/ready/full")
dset1_dir = os.path.join(base_path, "HLCA_extended/extension_datasets/raw")


def _ready_files(*stems):
    """Return the ``.h5ad`` paths under ``dset0_dir`` for the given file stems."""
    return [f"{dset0_dir}/{stem}.h5ad" for stem in stems]


def _raw_files(*stems):
    """Return the ``.h5ad`` paths under ``dset1_dir`` for the given (relative) file stems."""
    return [f"{dset1_dir}/{stem}.h5ad" for stem in stems]


# Dataset name -> list of AnnData files belonging to it.
# Insertion order matters: the conversion loops iterate in this order.
adata_dict = {
    "Kaminski_2020": _ready_files("adams"),
    "Meyer_2021": _ready_files("meyer_2021"),
    "MeyerNikolic_unpubl": _ready_files("meyer_nikolic_unpubl"),
    "Barbry_unpubl": _ready_files("barbry"),
    "Regev_2021": _ready_files("delorey_cryo", "delorey_fresh", "delorey_nuclei"),
    "Thienpont_2018": _raw_files("Lambrechts/lambrechts"),
    "Budinger_2020": _ready_files("bharat"),
    "Banovich_Kropski_2020": _ready_files("haberman"),
    "Sheppard_2020": _ready_files("tsukui"),
    "Wunderink_2021": _ready_files("grant_cryo", "grant_fresh"),
    # "wouters_labs" was commented out of this list in the original cell.
    "Lambrechts_2021": _ready_files("wouters"),
    "Zhang_2021": _raw_files("Liao/covid_for_publish"),
    "Duong_lungMAP_unpubl": _ready_files("duong"),
    "Janssen_2020": _ready_files("mould"),
    "Sun_2020": _ready_files(
        "wang_sub_batch1", "wang_sub_batch2", "wang_sub_batch3", "wang_sub_batch4"
    ),
    "Gomperts_2021": _ready_files("carraro_ucla", "carraro_cff", "carraro_csmc"),
    "Eils_2020": _ready_files("lukassen"),
    "Schiller_2020": _ready_files("mayr"),
    "Misharin_Budinger_2018": _ready_files("reyfman_disease"),
    "Shalek_2018": _ready_files("ordovasmontanes"),
    "Schiller_2021": _ready_files("schiller_discovair"),
    "Peer_Massague_2020": _ready_files("laughney"),
    "Lafyatis_2019": _ready_files("valenzi"),
    "Tata_unpubl": _ready_files("tata_unpubl"),
    "Xu_2020": _ready_files("guo"),
    "Sims_2019": _ready_files("szabo"),
    "Schultze_unpubl": _ready_files("schultze_unpubl"),
}

Run the ID conversion with HGNC Symbol (a.k.a. gene name)

In [7]:
# Convert every dataset's gene identifiers to the target database and keep the
# binned outcome per dataset.
final_database = "HGNC Symbol"
result = {}

for dataset_name, dataset_files in adata_dict.items():
    # NOTE(review): only the first file of each dataset is read; datasets listed
    # with several files contribute the gene index of that first file alone.
    adata = sc.read(dataset_files[0])
    gene_list = list(adata.var.index)

    matching = idt.convert_identifier_multiple(
        gene_list,
        final_database=final_database,
        pbar_prefix=dataset_name,
    )
    binned_conversions = idt.classify_multiple_conversion(matching)

    # Per-bin counts, followed by the source idtrack infers for the input IDs
    # (printed as a 2-tuple; presumably (assembly, Ensembl release) - TODO confirm).
    idt.print_binned_conversion(binned_conversions)
    print(f"Source release: {idt.infer_identifier_source(gene_list)}")

    result[dataset_name] = binned_conversions

Kaminski_2020: 100%|██████████| 45947/45947 [05:05<00:00, 150.37it/s, ID:ENSG00000280139]


changed_only_1_to_n: 59
changed_only_1_to_1: 4002
alternative_target_1_to_1: 12331
alternative_target_1_to_n: 13
matching_1_to_0: 98
matching_1_to_1: 33446
matching_1_to_n: 59
input_identifiers: 45947
Source release: (38, 94)


Meyer_2021: 100%|███████████████████| 20922/20922 [04:38<00:00, 75.08it/s, ID:AC240274.1]


changed_only_1_to_n: 3
changed_only_1_to_1: 997
alternative_target_1_to_1: 368
alternative_target_1_to_n: 2
matching_1_to_0: 17
matching_1_to_1: 20532
matching_1_to_n: 3
input_identifiers: 20922
Source release: (38, 84)


MeyerNikolic_unpubl: 100%|███████| 33582/33582 [04:52<00:00, 114.64it/s, ID:hsa-mir-1253]


changed_only_1_to_n: 15
changed_only_1_to_1: 2711
alternative_target_1_to_1: 8342
alternative_target_1_to_n: 10
matching_1_to_0: 106
matching_1_to_1: 25109
matching_1_to_n: 15
input_identifiers: 33582
Source release: (38, 93)


Barbry_unpubl: 100%|██████████████████████| 16859/16859 [04:10<00:00, 67.29it/s, ID:WHRN]


changed_only_1_to_n: 1
changed_only_1_to_1: 972
alternative_target_1_to_1: 1679
alternative_target_1_to_n: 4
matching_1_to_0: 20
matching_1_to_1: 15155
matching_1_to_n: 1
input_identifiers: 16859
Source release: (38, 98)


Regev_2021: 100%|████████████████████████| 30983/30983 [05:14<00:00, 98.64it/s, ID:ZZEF1]


changed_only_1_to_n: 8
changed_only_1_to_1: 2520
alternative_target_1_to_1: 7387
alternative_target_1_to_n: 9
matching_1_to_0: 113
matching_1_to_1: 23466
matching_1_to_n: 8
input_identifiers: 30983
Source release: (38, 93)


Thienpont_2018: 100%|███████████████| 27958/27958 [04:44<00:00, 98.29it/s, ID:AC240274.1]


changed_only_1_to_n: 8
changed_only_1_to_1: 3598
alternative_target_1_to_1: 5342
alternative_target_1_to_n: 43
matching_1_to_0: 171
matching_1_to_1: 22394
matching_1_to_n: 8
input_identifiers: 27958
Source release: (38, 84)


Budinger_2020: 100%|█████████████████████| 26316/26316 [04:39<00:00, 94.27it/s, ID:ZZEF1]


changed_only_1_to_n: 6
changed_only_1_to_1: 2122
alternative_target_1_to_1: 4807
alternative_target_1_to_n: 6
matching_1_to_0: 56
matching_1_to_1: 21441
matching_1_to_n: 6
input_identifiers: 26316
Source release: (38, 93)


Banovich_Kropski_2020: 100%|██████████| 33694/33694 [04:51<00:00, 115.44it/s, ID:FAM231B]


changed_only_1_to_n: 15
changed_only_1_to_1: 4470
alternative_target_1_to_1: 8131
alternative_target_1_to_n: 57
matching_1_to_0: 262
matching_1_to_1: 25229
matching_1_to_n: 15
input_identifiers: 33694
Source release: (38, 84)


Sheppard_2020: 100%|████████████████| 27147/27147 [04:42<00:00, 96.21it/s, ID:AC240274.1]


changed_only_1_to_n: 6
changed_only_1_to_1: 3464
alternative_target_1_to_1: 5020
alternative_target_1_to_n: 40
matching_1_to_0: 166
matching_1_to_1: 21915
matching_1_to_n: 6
input_identifiers: 27147
Source release: (38, 84)


Wunderink_2021: 100%|████████████████████| 21819/21819 [05:00<00:00, 72.65it/s, ID:ZZEF1]


changed_only_1_to_n: 2
changed_only_1_to_1: 1712
alternative_target_1_to_1: 2907
alternative_target_1_to_n: 1
matching_1_to_0: 53
matching_1_to_1: 18856
matching_1_to_n: 2
input_identifiers: 21819
Source release: (38, 93)


Lambrechts_2021: 100%|████████████████| 33538/33538 [04:50<00:00, 115.40it/s, ID:FAM231C]


changed_only_1_to_n: 15
changed_only_1_to_1: 2693
alternative_target_1_to_1: 8342
alternative_target_1_to_n: 10
matching_1_to_0: 80
matching_1_to_1: 25091
matching_1_to_n: 15
input_identifiers: 33538
Source release: (38, 93)


Zhang_2021: 100%|█████████████████████████| 18474/18474 [04:31<00:00, 68.06it/s, ID:DSG1]


changed_only_1_to_n: 1
changed_only_1_to_1: 806
alternative_target_1_to_1: 99
alternative_target_1_to_n: 0
matching_1_to_0: 10
matching_1_to_1: 18364
matching_1_to_n: 1
input_identifiers: 18474
Source release: (38, 87)


Duong_lungMAP_unpubl: 100%|██████████████| 27678/27678 [04:37<00:00, 99.63it/s, ID:ZZEF1]


changed_only_1_to_n: 6
changed_only_1_to_1: 2293
alternative_target_1_to_1: 5994
alternative_target_1_to_n: 10
matching_1_to_0: 51
matching_1_to_1: 21617
matching_1_to_n: 6
input_identifiers: 27678
Source release: (38, 93)


Janssen_2020: 100%|███████████████████| 33538/33538 [04:51<00:00, 115.16it/s, ID:FAM231C]


changed_only_1_to_n: 15
changed_only_1_to_1: 2693
alternative_target_1_to_1: 8342
alternative_target_1_to_n: 10
matching_1_to_0: 80
matching_1_to_1: 25091
matching_1_to_n: 15
input_identifiers: 33538
Source release: (38, 93)


Sun_2020: 100%|█████████████████████| 26578/26578 [05:10<00:00, 85.62it/s, ID:AP006748.1]


changed_only_1_to_n: 6
changed_only_1_to_1: 2192
alternative_target_1_to_1: 5417
alternative_target_1_to_n: 8
matching_1_to_0: 50
matching_1_to_1: 21097
matching_1_to_n: 6
input_identifiers: 26578
Source release: (38, 93)


Gomperts_2021: 100%|███████████████████| 31229/31229 [04:57<00:00, 104.83it/s, ID:WFDC11]


changed_only_1_to_n: 14
changed_only_1_to_1: 4744
alternative_target_1_to_1: 5910
alternative_target_1_to_n: 28
matching_1_to_0: 348
matching_1_to_1: 24929
matching_1_to_n: 14
input_identifiers: 31229
Source release: (38, 89)


Eils_2020: 100%|███████████████████| 32738/32738 [04:55<00:00, 110.71it/s, ID:AC002321.1]


changed_only_1_to_n: 17
changed_only_1_to_1: 5448
alternative_target_1_to_1: 7519
alternative_target_1_to_n: 44
matching_1_to_0: 612
matching_1_to_1: 24546
matching_1_to_n: 17
input_identifiers: 32738
Source release: (37, 85)


Schiller_2020: 100%|█████████████| 32104/32104 [04:49<00:00, 110.80it/s, ID:RP11-205K6.1]


changed_only_1_to_n: 8
changed_only_1_to_1: 4469
alternative_target_1_to_1: 6431
alternative_target_1_to_n: 26
matching_1_to_0: 533
matching_1_to_1: 25106
matching_1_to_n: 8
input_identifiers: 32104
Source release: (38, 89)


Misharin_Budinger_2018: 100%|███████| 27181/27181 [04:45<00:00, 95.28it/s, ID:AC240274.1]


changed_only_1_to_n: 7
changed_only_1_to_1: 3464
alternative_target_1_to_1: 5009
alternative_target_1_to_n: 43
matching_1_to_0: 177
matching_1_to_1: 21945
matching_1_to_n: 7
input_identifiers: 27181
Source release: (38, 84)


Shalek_2018: 100%|██████████████████████| 25328/25328 [04:40<00:00, 90.33it/s, ID:TTTY10]


changed_only_1_to_n: 7
changed_only_1_to_1: 3647
alternative_target_1_to_1: 3642
alternative_target_1_to_n: 33
matching_1_to_0: 492
matching_1_to_1: 21154
matching_1_to_n: 7
input_identifiers: 25328
Source release: (37, 85)


Schiller_2021: 100%|████████████████| 17533/17533 [04:59<00:00, 58.55it/s, ID:AC240274.1]


changed_only_1_to_n: 0
changed_only_1_to_1: 497
alternative_target_1_to_1: 193
alternative_target_1_to_n: 0
matching_1_to_0: 10
matching_1_to_1: 17330
matching_1_to_n: 0
input_identifiers: 17533
Source release: (38, 99)


Peer_Massague_2020: 100%|█████████████████| 19222/19222 [04:31<00:00, 70.76it/s, ID:ZZZ3]


changed_only_1_to_n: 4
changed_only_1_to_1: 1545
alternative_target_1_to_1: 1247
alternative_target_1_to_n: 11
matching_1_to_0: 63
matching_1_to_1: 17897
matching_1_to_n: 4
input_identifiers: 19222
Source release: (38, 86)


Lafyatis_2019: 100%|████████████████| 22164/22164 [04:42<00:00, 78.39it/s, ID:AC213203.1]


changed_only_1_to_n: 6
changed_only_1_to_1: 1032
alternative_target_1_to_1: 442
alternative_target_1_to_n: 2
matching_1_to_0: 21
matching_1_to_1: 21693
matching_1_to_n: 6
input_identifiers: 22164
Source release: (38, 84)


Tata_unpubl: 100%|█████████████████| 31915/31915 [04:45<00:00, 111.79it/s, ID:AC213203.1]


changed_only_1_to_n: 11
changed_only_1_to_1: 1954
alternative_target_1_to_1: 7632
alternative_target_1_to_n: 6
matching_1_to_0: 36
matching_1_to_1: 24230
matching_1_to_n: 11
input_identifiers: 31915
Source release: (38, 93)


Xu_2020: 100%|█████████████████████| 32738/32738 [04:59<00:00, 109.45it/s, ID:AC002321.1]


changed_only_1_to_n: 17
changed_only_1_to_1: 5448
alternative_target_1_to_1: 7519
alternative_target_1_to_n: 44
matching_1_to_0: 612
matching_1_to_1: 24546
matching_1_to_n: 17
input_identifiers: 32738
Source release: (37, 85)


Sims_2019: 100%|████████████████| 60725/60725 [08:50<00:00, 114.41it/s, ID:CTD-2062F14.3]


changed_only_1_to_n: 985
changed_only_1_to_1: 8062
alternative_target_1_to_1: 16431
alternative_target_1_to_n: 265
matching_1_to_0: 1461
matching_1_to_1: 41583
matching_1_to_n: 985
input_identifiers: 60725




Source release: (38, 83)


Schultze_unpubl: 100%|██████████████| 24532/24532 [04:33<00:00, 89.73it/s, ID:AP001269.4]


changed_only_1_to_n: 5
changed_only_1_to_1: 2070
alternative_target_1_to_1: 3820
alternative_target_1_to_n: 6
matching_1_to_0: 46
matching_1_to_1: 20655
matching_1_to_n: 5
input_identifiers: 24532
Source release: (38, 91)


In [8]:
# Persist the binned conversions under a UTC-timestamped file name so earlier
# runs in the same directory are not overwritten.
time_suffix = time.strftime("%Y%m%d-%H%M%S", time.gmtime())
file_name = f"results_for_hlca_datasets_{final_database}_{time_suffix}.pk"
file_path = os.path.join(local_dir, file_name)

with open(file_path, "wb") as handle:
    pickle.dump(result, handle)
    print(f"Saved: {file_path}")

Saved: /lustre/groups/ml01/workspace/kemal.inecik/idtrack_temp/results_for_hlca_datasets_HGNC Symbol_20230130-004722.pk


Run the ID conversion with Ensembl gene ID

In [9]:
# Same conversion as for the HGNC symbols, this time targeting Ensembl gene IDs.
final_database = "ensembl_gene"
result = {}

for dataset_name, dataset_files in adata_dict.items():
    # NOTE(review): only the first file of each dataset is read; datasets listed
    # with several files contribute the gene index of that first file alone.
    adata = sc.read(dataset_files[0])
    gene_list = list(adata.var.index)

    matching = idt.convert_identifier_multiple(
        gene_list,
        final_database=final_database,
        pbar_prefix=dataset_name,
    )
    binned_conversions = idt.classify_multiple_conversion(matching)

    # Per-bin counts, followed by the source idtrack infers for the input IDs
    # (printed as a 2-tuple; presumably (assembly, Ensembl release) - TODO confirm).
    idt.print_binned_conversion(binned_conversions)
    print(f"Source release: {idt.infer_identifier_source(gene_list)}")

    result[dataset_name] = binned_conversions

Kaminski_2020: 100%|██████████| 45947/45947 [03:58<00:00, 193.04it/s, ID:ENSG00000280139]


changed_only_1_to_n: 496
changed_only_1_to_1: 45353
alternative_target_1_to_1: 0
alternative_target_1_to_n: 0
matching_1_to_0: 98
matching_1_to_1: 45353
matching_1_to_n: 496
input_identifiers: 45947
Source release: (38, 94)


Meyer_2021: 100%|███████████████████| 20922/20922 [03:42<00:00, 94.23it/s, ID:AC240274.1]


changed_only_1_to_n: 213
changed_only_1_to_1: 20692
alternative_target_1_to_1: 0
alternative_target_1_to_n: 0
matching_1_to_0: 17
matching_1_to_1: 20692
matching_1_to_n: 213
input_identifiers: 20922
Source release: (38, 84)


MeyerNikolic_unpubl: 100%|███████| 33582/33582 [03:20<00:00, 167.62it/s, ID:hsa-mir-1253]


changed_only_1_to_n: 290
changed_only_1_to_1: 33186
alternative_target_1_to_1: 0
alternative_target_1_to_n: 0
matching_1_to_0: 106
matching_1_to_1: 33186
matching_1_to_n: 290
input_identifiers: 33582
Source release: (38, 93)


Barbry_unpubl: 100%|██████████████████████| 16859/16859 [03:21<00:00, 83.67it/s, ID:WHRN]


changed_only_1_to_n: 101
changed_only_1_to_1: 16738
alternative_target_1_to_1: 0
alternative_target_1_to_n: 0
matching_1_to_0: 20
matching_1_to_1: 16738
matching_1_to_n: 101
input_identifiers: 16859
Source release: (38, 98)


Regev_2021: 100%|███████████████████████| 30983/30983 [03:48<00:00, 135.73it/s, ID:ZZEF1]


changed_only_1_to_n: 222
changed_only_1_to_1: 30648
alternative_target_1_to_1: 0
alternative_target_1_to_n: 0
matching_1_to_0: 113
matching_1_to_1: 30648
matching_1_to_n: 222
input_identifiers: 30983
Source release: (38, 93)


Thienpont_2018: 100%|██████████████| 27958/27958 [03:45<00:00, 123.97it/s, ID:AC240274.1]


changed_only_1_to_n: 246
changed_only_1_to_1: 27541
alternative_target_1_to_1: 0
alternative_target_1_to_n: 0
matching_1_to_0: 171
matching_1_to_1: 27541
matching_1_to_n: 246
input_identifiers: 27958
Source release: (38, 84)


Budinger_2020: 100%|████████████████████| 26316/26316 [03:12<00:00, 136.63it/s, ID:ZZEF1]


changed_only_1_to_n: 194
changed_only_1_to_1: 26066
alternative_target_1_to_1: 0
alternative_target_1_to_n: 0
matching_1_to_0: 56
matching_1_to_1: 26066
matching_1_to_n: 194
input_identifiers: 26316
Source release: (38, 93)


Banovich_Kropski_2020: 100%|██████████| 33694/33694 [03:51<00:00, 145.69it/s, ID:FAM231B]


changed_only_1_to_n: 336
changed_only_1_to_1: 33096
alternative_target_1_to_1: 0
alternative_target_1_to_n: 0
matching_1_to_0: 262
matching_1_to_1: 33096
matching_1_to_n: 336
input_identifiers: 33694
Source release: (38, 84)


Sheppard_2020: 100%|███████████████| 27147/27147 [03:43<00:00, 121.66it/s, ID:AC240274.1]


changed_only_1_to_n: 239
changed_only_1_to_1: 26742
alternative_target_1_to_1: 0
alternative_target_1_to_n: 0
matching_1_to_0: 166
matching_1_to_1: 26742
matching_1_to_n: 239
input_identifiers: 27147
Source release: (38, 84)


Wunderink_2021: 100%|███████████████████| 21819/21819 [03:35<00:00, 101.13it/s, ID:ZZEF1]


changed_only_1_to_n: 169
changed_only_1_to_1: 21597
alternative_target_1_to_1: 0
alternative_target_1_to_n: 0
matching_1_to_0: 53
matching_1_to_1: 21597
matching_1_to_n: 169
input_identifiers: 21819
Source release: (38, 93)


Lambrechts_2021: 100%|████████████████| 33538/33538 [03:53<00:00, 143.91it/s, ID:FAM231C]


changed_only_1_to_n: 290
changed_only_1_to_1: 33168
alternative_target_1_to_1: 0
alternative_target_1_to_n: 0
matching_1_to_0: 80
matching_1_to_1: 33168
matching_1_to_n: 290
input_identifiers: 33538
Source release: (38, 93)


Zhang_2021: 100%|█████████████████████████| 18474/18474 [03:05<00:00, 99.36it/s, ID:DSG1]


changed_only_1_to_n: 159
changed_only_1_to_1: 18305
alternative_target_1_to_1: 0
alternative_target_1_to_n: 0
matching_1_to_0: 10
matching_1_to_1: 18305
matching_1_to_n: 159
input_identifiers: 18474
Source release: (38, 87)


Duong_lungMAP_unpubl: 100%|█████████████| 27678/27678 [03:40<00:00, 125.62it/s, ID:ZZEF1]


changed_only_1_to_n: 180
changed_only_1_to_1: 27447
alternative_target_1_to_1: 0
alternative_target_1_to_n: 0
matching_1_to_0: 51
matching_1_to_1: 27447
matching_1_to_n: 180
input_identifiers: 27678
Source release: (38, 93)


Janssen_2020: 100%|███████████████████| 33538/33538 [03:52<00:00, 143.96it/s, ID:FAM231C]


changed_only_1_to_n: 290
changed_only_1_to_1: 33168
alternative_target_1_to_1: 0
alternative_target_1_to_n: 0
matching_1_to_0: 80
matching_1_to_1: 33168
matching_1_to_n: 290
input_identifiers: 33538
Source release: (38, 93)


Sun_2020: 100%|████████████████████| 26578/26578 [03:42<00:00, 119.21it/s, ID:AP006748.1]


changed_only_1_to_n: 161
changed_only_1_to_1: 26367
alternative_target_1_to_1: 0
alternative_target_1_to_n: 0
matching_1_to_0: 50
matching_1_to_1: 26367
matching_1_to_n: 161
input_identifiers: 26578
Source release: (38, 93)


Gomperts_2021: 100%|███████████████████| 31229/31229 [03:23<00:00, 153.75it/s, ID:WFDC11]


changed_only_1_to_n: 274
changed_only_1_to_1: 30607
alternative_target_1_to_1: 0
alternative_target_1_to_n: 0
matching_1_to_0: 348
matching_1_to_1: 30607
matching_1_to_n: 274
input_identifiers: 31229
Source release: (38, 89)


Eils_2020: 100%|███████████████████| 32738/32738 [03:59<00:00, 136.92it/s, ID:AC002321.1]


changed_only_1_to_n: 284
changed_only_1_to_1: 31842
alternative_target_1_to_1: 0
alternative_target_1_to_n: 0
matching_1_to_0: 612
matching_1_to_1: 31842
matching_1_to_n: 284
input_identifiers: 32738
Source release: (37, 85)


Schiller_2020: 100%|█████████████| 32104/32104 [03:48<00:00, 140.54it/s, ID:RP11-205K6.1]


changed_only_1_to_n: 263
changed_only_1_to_1: 31308
alternative_target_1_to_1: 0
alternative_target_1_to_n: 0
matching_1_to_0: 533
matching_1_to_1: 31308
matching_1_to_n: 263
input_identifiers: 32104
Source release: (38, 89)


Misharin_Budinger_2018: 100%|██████| 27181/27181 [03:44<00:00, 120.93it/s, ID:AC240274.1]


changed_only_1_to_n: 251
changed_only_1_to_1: 26753
alternative_target_1_to_1: 0
alternative_target_1_to_n: 0
matching_1_to_0: 177
matching_1_to_1: 26753
matching_1_to_n: 251
input_identifiers: 27181
Source release: (38, 84)


Shalek_2018: 100%|█████████████████████| 25328/25328 [03:10<00:00, 133.17it/s, ID:TTTY10]


changed_only_1_to_n: 198
changed_only_1_to_1: 24638
alternative_target_1_to_1: 0
alternative_target_1_to_n: 0
matching_1_to_0: 492
matching_1_to_1: 24638
matching_1_to_n: 198
input_identifiers: 25328
Source release: (37, 85)


Schiller_2021: 100%|████████████████| 17533/17533 [03:34<00:00, 81.76it/s, ID:AC240274.1]


changed_only_1_to_n: 139
changed_only_1_to_1: 17384
alternative_target_1_to_1: 0
alternative_target_1_to_n: 0
matching_1_to_0: 10
matching_1_to_1: 17384
matching_1_to_n: 139
input_identifiers: 17533
Source release: (38, 99)


Peer_Massague_2020: 100%|█████████████████| 19222/19222 [03:40<00:00, 87.07it/s, ID:ZZZ3]


changed_only_1_to_n: 155
changed_only_1_to_1: 19004
alternative_target_1_to_1: 0
alternative_target_1_to_n: 0
matching_1_to_0: 63
matching_1_to_1: 19004
matching_1_to_n: 155
input_identifiers: 19222
Source release: (38, 86)


Lafyatis_2019: 100%|███████████████| 22164/22164 [03:13<00:00, 114.67it/s, ID:AC213203.1]


changed_only_1_to_n: 258
changed_only_1_to_1: 21885
alternative_target_1_to_1: 0
alternative_target_1_to_n: 0
matching_1_to_0: 21
matching_1_to_1: 21885
matching_1_to_n: 258
input_identifiers: 22164
Source release: (38, 84)


Tata_unpubl: 100%|█████████████████| 31915/31915 [03:47<00:00, 140.47it/s, ID:AC213203.1]


changed_only_1_to_n: 280
changed_only_1_to_1: 31599
alternative_target_1_to_1: 0
alternative_target_1_to_n: 0
matching_1_to_0: 36
matching_1_to_1: 31599
matching_1_to_n: 280
input_identifiers: 31915
Source release: (38, 93)


Xu_2020: 100%|█████████████████████| 32738/32738 [03:58<00:00, 137.10it/s, ID:AC002321.1]


changed_only_1_to_n: 284
changed_only_1_to_1: 31842
alternative_target_1_to_1: 0
alternative_target_1_to_n: 0
matching_1_to_0: 612
matching_1_to_1: 31842
matching_1_to_n: 284
input_identifiers: 32738
Source release: (37, 85)


Sims_2019: 100%|████████████████| 60725/60725 [06:26<00:00, 157.24it/s, ID:CTD-2062F14.3]


changed_only_1_to_n: 1830
changed_only_1_to_1: 57434
alternative_target_1_to_1: 0
alternative_target_1_to_n: 0
matching_1_to_0: 1461
matching_1_to_1: 57434
matching_1_to_n: 1830
input_identifiers: 60725




Source release: (38, 83)


Schultze_unpubl: 100%|█████████████| 24532/24532 [03:36<00:00, 113.41it/s, ID:AP001269.4]


changed_only_1_to_n: 196
changed_only_1_to_1: 24290
alternative_target_1_to_1: 0
alternative_target_1_to_n: 0
matching_1_to_0: 46
matching_1_to_1: 24290
matching_1_to_n: 196
input_identifiers: 24532
Source release: (38, 91)


In [10]:
# Persist the binned conversions under a UTC-timestamped file name so earlier
# runs in the same directory are not overwritten.
time_suffix = time.strftime("%Y%m%d-%H%M%S", time.gmtime())
file_name = f"results_for_hlca_datasets_{final_database}_{time_suffix}.pk"
file_path = os.path.join(local_dir, file_name)

with open(file_path, "wb") as handle:
    pickle.dump(result, handle)
    print(f"Saved: {file_path}")

Saved: /lustre/groups/ml01/workspace/kemal.inecik/idtrack_temp/results_for_hlca_datasets_ensembl_gene_20230130-023149.pk


Print the processor information

In [11]:
import subprocess

# Print the CPU description reported by `lscpu`.
# The command is passed as an argument list, so no shell is spawned; a
# non-zero exit status still raises CalledProcessError as before.
print(subprocess.check_output(["lscpu"]).strip().decode())

Architecture:          x86_64
CPU op-mode(s):        32-bit, 64-bit
Byte Order:            Little Endian
CPU(s):                112
On-line CPU(s) list:   0-111
Thread(s) per core:    2
Core(s) per socket:    28
Socket(s):             2
NUMA node(s):          2
Vendor ID:             GenuineIntel
CPU family:            6
Model:                 85
Model name:            Intel(R) Xeon(R) Platinum 8280L CPU @ 2.70GHz
Stepping:              7
CPU MHz:               3634.881
CPU max MHz:           4000.0000
CPU min MHz:           1000.0000
BogoMIPS:              5400.00
Virtualization:        VT-x
L1d cache:             32K
L1i cache:             32K
L2 cache:              1024K
L3 cache:              39424K
NUMA node0 CPU(s):     0-27,56-83
NUMA node1 CPU(s):     28-55,84-111
Flags:                 fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush dts acpi mmx fxsr sse sse2 ss ht tm pbe syscall nx pdpe1gb rdtscp lm constant_tsc art arch_perfmon pebs bts rep_go

Plot/Table

In [1]:
import sys

# Add the directory two levels above the notebook to the import path
# (presumably so the local checkout of the package can be imported - TODO confirm).
# The membership test keeps re-running this cell from stacking duplicate entries.
if "../.." not in sys.path:
    sys.path.append("../..")

In [2]:
import os
import time
import pickle

In [3]:
# Directory that holds the pickled conversion results loaded below.
local_dir = "/lustre/groups/ml01/workspace/kemal.inecik/idtrack_temp"

In [4]:
# List the working directory (same path as `local_dir`) with human-readable sizes.
!ls -lh /lustre/groups/ml01/workspace/kemal.inecik/idtrack_temp

total 15G
-rw-rwxr--+ 1 kemal.inecik OG-ICB-User 622M Oct 17 17:18 graph_homo_sapiens_ens107_min79_narrow.pickle
-rw-rwxr--+ 1 kemal.inecik OG-ICB-User 7.2G Oct 17 15:01 homo_sapiens_assembly-37.h5
-rw-rwxr--+ 1 kemal.inecik OG-ICB-User 7.0G Oct 31 09:44 homo_sapiens_assembly-38.h5
-rw-rwxr--+ 1 kemal.inecik OG-ICB-User  45K Aug 12 15:57 homo_sapiens_externals_modified.yml
-rw-rwxr--+ 1 kemal.inecik OG-ICB-User 2.2M Nov 10 10:48 results_for_hlca_core_only_ensembl_gene_20221110-094821.pk
-rw-rwxr--+ 1 kemal.inecik OG-ICB-User 2.4M Nov 10 10:43 results_for_hlca_core_only_HGNC Symbol_20221110-094335.pk
-rw-rwxr--+ 1 kemal.inecik OG-ICB-User  57M Nov  1 20:13 results_for_hlca_datasets_ensembl_gene_20221101-191345.pk
-rw-r--r--. 1 kemal.inecik OG-ICB-User  57M Jan 30 03:31 results_for_hlca_datasets_ensembl_gene_20230130-023149.pk
-rw-rwxr--+ 1 kemal.inecik OG-ICB-User  55M Nov  1 18:50 results_for_hlca_datasets_HGNC Symbol_20221101-175029.pk
-rw-r--r--. 1 kemal.inecik OG-ICB-User  55M Jan 3

In [5]:
# Load the two result files written by the conversion runs (file names as
# printed by the "Saved: ..." lines). pickle.load executes arbitrary code on
# load, so only point this at files you produced yourself.
ens_pickle_path = f"{local_dir}/results_for_hlca_datasets_ensembl_gene_20230130-023149.pk"
hgn_pickle_path = f"{local_dir}/results_for_hlca_datasets_HGNC Symbol_20230130-004722.pk"

with open(ens_pickle_path, "rb") as ens_handle:
    res_ens = pickle.load(ens_handle)

with open(hgn_pickle_path, "rb") as hgn_handle:
    res_hgn = pickle.load(hgn_handle)

In [6]:
# Dataset names present in the Ensembl-target results.
res_ens.keys()

dict_keys(['Kaminski_2020', 'Meyer_2021', 'MeyerNikolic_unpubl', 'Barbry_unpubl', 'Regev_2021', 'Thienpont_2018', 'Budinger_2020', 'Banovich_Kropski_2020', 'Sheppard_2020', 'Wunderink_2021', 'Lambrechts_2021', 'Zhang_2021', 'Duong_lungMAP_unpubl', 'Janssen_2020', 'Sun_2020', 'Gomperts_2021', 'Eils_2020', 'Schiller_2020', 'Misharin_Budinger_2018', 'Shalek_2018', 'Schiller_2021', 'Peer_Massague_2020', 'Lafyatis_2019', 'Tata_unpubl', 'Xu_2020', 'Sims_2019', 'Schultze_unpubl'])

In [11]:
# Conversion bins recorded for a single dataset.
res_ens['MeyerNikolic_unpubl'].keys()

dict_keys(['changed_only_1_to_n', 'changed_only_1_to_1', 'alternative_target_1_to_1', 'alternative_target_1_to_n', 'matching_1_to_0', 'matching_1_to_1', 'matching_1_to_n', 'input_identifiers'])

In [17]:
# Reduce every bin to its size: {dataset: {bin name: number of identifiers}}.
r_ens = {
    dataset: {bin_name: len(entries) for bin_name, entries in bins.items()}
    for dataset, bins in res_ens.items()
}

r_hgn = {
    dataset: {bin_name: len(entries) for bin_name, entries in bins.items()}
    for dataset, bins in res_hgn.items()
}

In [14]:
import pandas as pd

In [153]:
# One row per summary column: (bin name in the results, short label, group header).
# TDM: target database matching, ATM: alternative target matching.
# CI labels the "changed_only" bins.
_column_spec = [
    ("input_identifiers", "", "Input IDs"),
    ("matching_1_to_1", "TDM", "One-to-One"),
    ("changed_only_1_to_1", "CI", "One-to-One"),
    ("alternative_target_1_to_1", "ATM", "One-to-One"),
    ("matching_1_to_n", "TDM", "One-to-Many"),
    ("changed_only_1_to_n", "CI", "One-to-Many"),
    ("alternative_target_1_to_n", "ATM", "One-to-Many"),
    ("matching_1_to_0", "", "One-to-None"),
]

column_order = [bin_name for bin_name, _, _ in _column_spec]
# The spelling "raname" is kept because the table-building cell refers to this name.
raname_columns = {bin_name: label for bin_name, label, _ in _column_spec}
higher_columns = [group for _, _, group in _column_spec]

In [154]:
def _labelled_counts(counts, target_label):
    """Build a datasets-by-bins table with a three-level column header.

    The levels are: the target label (blank for the first and last column),
    the group header from ``higher_columns``, and the short label from
    ``raname_columns``.
    """
    table = pd.DataFrame.from_dict(counts).T
    table = table[column_order].rename(columns=raname_columns)
    top_level = [""] + [target_label] * 6 + [""]
    table.columns = [top_level, higher_columns, table.columns]
    return table


df1 = _labelled_counts(r_hgn, "Target HGNC Names")
df2 = _labelled_counts(r_ens, "Target Ensembl")

In [155]:
# Put the "Target Ensembl" column group of df2 next to df1; the blank-labelled
# columns (input count, one-to-none) are taken from df1 only.
df = pd.concat([df1, df2[["Target Ensembl"]]], axis=1)

In [173]:
# Columns kept in the final table, as (target, cardinality, match type) triples.
# Left out: every "CI" column, and the "ATM" columns of the Ensembl target.
summary_columns = [
    ("", "Input IDs", ""),
    ("Target HGNC Names", "One-to-One", "TDM"),
    ("Target HGNC Names", "One-to-One", "ATM"),
    ("Target HGNC Names", "One-to-Many", "TDM"),
    ("Target HGNC Names", "One-to-Many", "ATM"),
    ("Target Ensembl", "One-to-One", "TDM"),
    ("Target Ensembl", "One-to-Many", "TDM"),
    ("", "One-to-None", ""),
]
df = df[summary_columns]
df

Unnamed: 0_level_0,Unnamed: 1_level_0,Target HGNC Names,Target HGNC Names,Target HGNC Names,Target HGNC Names,Target Ensembl,Target Ensembl,Unnamed: 8_level_0
Unnamed: 0_level_1,Input IDs,One-to-One,One-to-One,One-to-Many,One-to-Many,One-to-One,One-to-Many,One-to-None
Unnamed: 0_level_2,Unnamed: 1_level_2,TDM,ATM,TDM,ATM,TDM,TDM,Unnamed: 8_level_2
Kaminski_2020,45947,33446,12331,59,13,45353,496,98
Meyer_2021,20922,20532,368,3,2,20692,213,17
MeyerNikolic_unpubl,33582,25109,8342,15,10,33186,290,106
Barbry_unpubl,16859,15155,1679,1,4,16738,101,20
Regev_2021,30983,23466,7387,8,9,30648,222,113
Thienpont_2018,27958,22394,5342,8,43,27541,246,171
Budinger_2020,26316,21441,4807,6,6,26066,194,56
Banovich_Kropski_2020,33694,25229,8131,15,57,33096,336,262
Sheppard_2020,27147,21915,5020,6,40,26742,239,166
Wunderink_2021,21819,18856,2907,2,1,21597,169,53


In [176]:
# Dataset names present in the HGNC-target results.
res_hgn.keys()

dict_keys(['Kaminski_2020', 'Meyer_2021', 'MeyerNikolic_unpubl', 'Barbry_unpubl', 'Regev_2021', 'Thienpont_2018', 'Budinger_2020', 'Banovich_Kropski_2020', 'Sheppard_2020', 'Wunderink_2021', 'Lambrechts_2021', 'Zhang_2021', 'Duong_lungMAP_unpubl', 'Janssen_2020', 'Sun_2020', 'Gomperts_2021', 'Eils_2020', 'Schiller_2020', 'Misharin_Budinger_2018', 'Shalek_2018', 'Schiller_2021', 'Peer_Massague_2020', 'Lafyatis_2019', 'Tata_unpubl', 'Xu_2020', 'Sims_2019', 'Schultze_unpubl'])

In [180]:
# Conversion bins recorded for a single dataset.
res_hgn["Kaminski_2020"].keys()

dict_keys(['changed_only_1_to_n', 'changed_only_1_to_1', 'alternative_target_1_to_1', 'alternative_target_1_to_n', 'matching_1_to_0', 'matching_1_to_1', 'matching_1_to_n', 'input_identifiers'])

In [234]:
# Fields stored for one converted identifier (first entry of one bin).
res_hgn["Kaminski_2020"]['changed_only_1_to_n'][0].keys()

dict_keys(['target_id', 'last_node', 'final_database', 'graph_id', 'query_id', 'no_corresponding', 'no_conversion', 'no_target'])

In [235]:
# Original identifiers of every dataset, in input order.
query_ids = {
    dataset: [record["query_id"] for record in bins["input_identifiers"]]
    for dataset, bins in res_hgn.items()
}

# HGNC targets of the one-to-one conversions: direct matches first, then the
# alternative-target matches. Each record contributes the first entry of its `target_id`.
hgnc_ids = {
    dataset: [
        record["target_id"][0]
        for bin_name in ("matching_1_to_1", "alternative_target_1_to_1")
        for record in bins[bin_name]
    ]
    for dataset, bins in res_hgn.items()
}

# Ensembl targets of the one-to-one conversions.
ens_ids = {
    dataset: [record["target_id"][0] for record in bins["matching_1_to_1"]]
    for dataset, bins in res_ens.items()
}

In [236]:
# One line per dataset: number of input IDs, HGNC targets, Ensembl targets.
for dataset, queries in query_ids.items():
    print(len(queries), len(hgnc_ids[dataset]), len(ens_ids[dataset]))

45947 45777 45353
20922 20900 20692
33582 33451 33186
16859 16834 16738
30983 30853 30648
27958 27736 27541
26316 26248 26066
33694 33360 33096
27147 26935 26742
21819 21763 21597
33538 33433 33168
18474 18463 18305
27678 27611 27447
33538 33433 33168
26578 26514 26367
31229 30839 30607
32738 32065 31842
32104 31537 31308
27181 26954 26753
25328 24796 24638
17533 17523 17384
19222 19144 19004
22164 22135 21885
31915 31862 31599
32738 32065 31842
60725 58014 57434
24532 24475 24290


In [237]:
# Concatenate the per-dataset lists (duplicates across datasets are kept).
a1 = [identifier for ids in query_ids.values() for identifier in ids]
a2 = [identifier for ids in hgnc_ids.values() for identifier in ids]
a3 = [identifier for ids in ens_ids.values() for identifier in ids]
len(a1), len(a2), len(a3)

(782442, 774720, 768700)

In [238]:
# Unique identifiers across all datasets: inputs, HGNC targets, Ensembl targets.
b1 = set(a1)
b2 = set(a2)
b3 = set(a3)
len(b1), len(b2), len(b3)

(94538, 56960, 56536)

In [272]:
# Per dataset: (name, union Ensembl IDs it lacks, its unique Ensembl IDs, union size).
[
    (dataset, len(b3 - unique_ids), len(unique_ids), len(b3))
    for dataset, unique_ids in ((key, set(values)) for key, values in ens_ids.items())
]

[('Kaminski_2020', 11756, 44780, 56536),
 ('Meyer_2021', 36051, 20485, 56536),
 ('MeyerNikolic_unpubl', 24123, 32413, 56536),
 ('Barbry_unpubl', 39911, 16625, 56536),
 ('Regev_2021', 26455, 30081, 56536),
 ('Thienpont_2018', 29425, 27111, 56536),
 ('Budinger_2020', 30876, 25660, 56536),
 ('Banovich_Kropski_2020', 24140, 32396, 56536),
 ('Sheppard_2020', 30223, 26313, 56536),
 ('Wunderink_2021', 35219, 21317, 56536),
 ('Lambrechts_2021', 24123, 32413, 56536),
 ('Zhang_2021', 38356, 18180, 56536),
 ('Duong_lungMAP_unpubl', 29509, 27027, 56536),
 ('Janssen_2020', 24123, 32413, 56536),
 ('Sun_2020', 30537, 25999, 56536),
 ('Gomperts_2021', 30995, 25541, 56536),
 ('Eils_2020', 25661, 30875, 56536),
 ('Schiller_2020', 28743, 27793, 56536),
 ('Misharin_Budinger_2018', 30217, 26319, 56536),
 ('Shalek_2018', 32318, 24218, 56536),
 ('Schiller_2021', 39277, 17259, 56536),
 ('Peer_Massague_2020', 37709, 18827, 56536),
 ('Lafyatis_2019', 34924, 21612, 56536),
 ('Tata_unpubl', 25403, 31133, 56536),


In [270]:
# Per dataset: (name, union HGNC symbols it lacks, its unique HGNC symbols, union size).
[
    (dataset, len(b2 - unique_ids), len(unique_ids), len(b2))
    for dataset, unique_ids in ((key, set(values)) for key, values in hgnc_ids.items())
]

[('Kaminski_2020', 11793, 45167, 56960),
 ('Meyer_2021', 36154, 20806, 56960),
 ('MeyerNikolic_unpubl', 24284, 32676, 56960),
 ('Barbry_unpubl', 40223, 16737, 56960),
 ('Regev_2021', 26709, 30251, 56960),
 ('Thienpont_2018', 29588, 27372, 56960),
 ('Budinger_2020', 31120, 25840, 56960),
 ('Banovich_Kropski_2020', 24202, 32758, 56960),
 ('Sheppard_2020', 30402, 26558, 56960),
 ('Wunderink_2021', 35477, 21483, 56960),
 ('Lambrechts_2021', 24284, 32676, 56960),
 ('Zhang_2021', 38556, 18404, 56960),
 ('Duong_lungMAP_unpubl', 29794, 27166, 56960),
 ('Janssen_2020', 24284, 32676, 56960),
 ('Sun_2020', 30828, 26132, 56960),
 ('Gomperts_2021', 31114, 25846, 56960),
 ('Eils_2020', 25741, 31219, 56960),
 ('Schiller_2020', 28974, 27986, 56960),
 ('Misharin_Budinger_2018', 30371, 26589, 56960),
 ('Shalek_2018', 32544, 24416, 56960),
 ('Schiller_2021', 39511, 17449, 56960),
 ('Peer_Massague_2020', 37940, 19020, 56960),
 ('Lafyatis_2019', 34941, 22019, 56960),
 ('Tata_unpubl', 25483, 31477, 56960),


In [271]:
# Per dataset: (name, union input IDs it lacks, its unique input IDs, union size).
[
    (dataset, len(b1 - unique_ids), len(unique_ids), len(b1))
    for dataset, unique_ids in ((key, set(values)) for key, values in query_ids.items())
]

[('Kaminski_2020', 48591, 45947, 94538),
 ('Meyer_2021', 73616, 20922, 94538),
 ('MeyerNikolic_unpubl', 60956, 33582, 94538),
 ('Barbry_unpubl', 77679, 16859, 94538),
 ('Regev_2021', 63555, 30983, 94538),
 ('Thienpont_2018', 66580, 27958, 94538),
 ('Budinger_2020', 68222, 26316, 94538),
 ('Banovich_Kropski_2020', 60844, 33694, 94538),
 ('Sheppard_2020', 67391, 27147, 94538),
 ('Wunderink_2021', 72719, 21819, 94538),
 ('Lambrechts_2021', 61000, 33538, 94538),
 ('Zhang_2021', 76064, 18474, 94538),
 ('Duong_lungMAP_unpubl', 66860, 27678, 94538),
 ('Janssen_2020', 61000, 33538, 94538),
 ('Sun_2020', 67960, 26578, 94538),
 ('Gomperts_2021', 63309, 31229, 94538),
 ('Eils_2020', 61800, 32738, 94538),
 ('Schiller_2020', 62434, 32104, 94538),
 ('Misharin_Budinger_2018', 67357, 27181, 94538),
 ('Shalek_2018', 69210, 25328, 94538),
 ('Schiller_2021', 77005, 17533, 94538),
 ('Peer_Massague_2020', 75316, 19222, 94538),
 ('Lafyatis_2019', 72374, 22164, 94538),
 ('Tata_unpubl', 62623, 31915, 94538),
