# Use Case: HLCA Datasets

In [1]:
import sys
# Add an idtrack source directory to the module search path used by the
# `import idtrack` in the next cell. The path is appended (not prepended), so a
# copy of idtrack found earlier on sys.path would still take precedence.
# NOTE(review): machine-specific absolute path, presumably a local development
# checkout -- adjust before running on another system.
sys.path.append("/home/icb/kemal.inecik/work/codes/idtrack")

In [2]:
import os
import time
import scanpy as sc
import idtrack
import pickle

In [3]:
%load_ext autoreload
%autoreload 2

Initialize the graph and pathfinder

In [4]:
# Directory handed to idtrack as its local repository. The result pickles read in
# the "Plot/Table" part of this notebook are loaded from this same directory.
local_dir = "/lustre/groups/ml01/workspace/kemal.inecik/idtrack_temp"
idt = idtrack.API(local_repository=local_dir)
# Presumably what enables the timestamped INFO lines shown under the next cells
# -- confirm against the idtrack API.
idt.configure_logger()

In [5]:
# Set up the identifier graph for homo_sapiens at Ensembl release 110; the log
# line below reports that the graph is being read.
# NOTE(review): the effect of `return_test=True` cannot be seen from this
# notebook -- confirm against the idtrack API documentation.
idt.initialize_graph(organism_name='homo_sapiens', ensembl_release=110, return_test=True)

2023-12-19 23:59:48 INFO:graph_maker: The graph is being read.


In [6]:
# Compute the graph's cached properties up front; the log below names each one.
# Going by the log timestamps, this took a few minutes in the recorded run.
idt.calculate_graph_caches()

2023-12-20 00:00:39 INFO:the_graph: Cached properties being calculated: combined_edges
2023-12-20 00:01:15 INFO:the_graph: Cached properties being calculated: combined_edges_assembly_specific_genes
2023-12-20 00:01:19 INFO:the_graph: Cached properties being calculated: combined_edges_genes
2023-12-20 00:01:52 INFO:the_graph: Cached properties being calculated: lower_chars_graph
2023-12-20 00:01:53 INFO:the_graph: Cached properties being calculated: get_active_ranges_of_id
2023-12-20 00:02:16 INFO:the_graph: Cached properties being calculated: available_external_databases
2023-12-20 00:02:18 INFO:the_graph: Cached properties being calculated: external_database_connection_form
2023-12-20 00:03:09 INFO:the_graph: Cached properties being calculated: available_genome_assemblies
2023-12-20 00:03:10 INFO:the_graph: Cached properties being calculated: available_external_databases_assembly
2023-12-20 00:03:11 INFO:the_graph: Cached properties being calculated: node_trios


The datasets of HLCA

In [7]:
# Root of the HLCA reproducibility data. `dset0_dir` is the "ready/full" folder of
# the extension datasets and `dset1_dir` is their "raw" folder.
base_path = "/lustre/groups/ml01/workspace/hlca_lisa.sikkema_malte.luecken/HLCA_reproducibility/data"
dset0_dir = os.path.join(base_path, "HLCA_extended/extension_datasets/ready/full")
dset1_dir = os.path.join(base_path, "HLCA_extended/extension_datasets/raw")

# Maps a dataset name to the list of .h5ad files that belong to it.
# NOTE: the conversion cells below only read the first file of each list
# (`adata_dict[dataset_name][0]`); the remaining entries are currently unused there.
adata_dict = {
    "Kaminski_2020": [f"{dset0_dir}/adams.h5ad"],
    "Meyer_2021": [f"{dset0_dir}/meyer_2021.h5ad"],
    "MeyerNikolic_unpubl": [f"{dset0_dir}/meyer_nikolic_unpubl.h5ad"],
    "Barbry_unpubl": [f"{dset0_dir}/barbry.h5ad"],
    "Regev_2021": [
        f"{dset0_dir}/delorey_cryo.h5ad", f"{dset0_dir}/delorey_fresh.h5ad",
        f"{dset0_dir}/delorey_nuclei.h5ad"
    ],
    "Thienpont_2018": [f"{dset1_dir}/Lambrechts/lambrechts.h5ad"],
    "Budinger_2020": [f"{dset0_dir}/bharat.h5ad"],
    "Banovich_Kropski_2020": [f"{dset0_dir}/haberman.h5ad"],
    "Sheppard_2020": [f"{dset0_dir}/tsukui.h5ad"],
    "Wunderink_2021": [
        f"{dset0_dir}/grant_cryo.h5ad", f"{dset0_dir}/grant_fresh.h5ad"
    ],
    "Lambrechts_2021": [
        f"{dset0_dir}/wouters.h5ad" #, f"{dset0_dir}/wouters_labs.h5ad"
    ],
    "Zhang_2021": [f"{dset1_dir}/Liao/covid_for_publish.h5ad"],
    "Duong_lungMAP_unpubl": [f"{dset0_dir}/duong.h5ad"],
    "Janssen_2020": [f"{dset0_dir}/mould.h5ad"],
    "Sun_2020": [
        f"{dset0_dir}/wang_sub_batch1.h5ad", f"{dset0_dir}/wang_sub_batch2.h5ad",
        f"{dset0_dir}/wang_sub_batch3.h5ad", f"{dset0_dir}/wang_sub_batch4.h5ad"],
    "Gomperts_2021": [
        f"{dset0_dir}/carraro_ucla.h5ad", f"{dset0_dir}/carraro_cff.h5ad",
        f"{dset0_dir}/carraro_csmc.h5ad"],
    "Eils_2020": [f"{dset0_dir}/lukassen.h5ad"],
    "Schiller_2020": [f"{dset0_dir}/mayr.h5ad"],
    "Misharin_Budinger_2018": [f"{dset0_dir}/reyfman_disease.h5ad"],
    "Shalek_2018": [f"{dset0_dir}/ordovasmontanes.h5ad"],
    "Schiller_2021": [f"{dset0_dir}/schiller_discovair.h5ad"],
    "Peer_Massague_2020": [f"{dset0_dir}/laughney.h5ad"],
    "Lafyatis_2019": [f"{dset0_dir}/valenzi.h5ad"],
    "Tata_unpubl": [f"{dset0_dir}/tata_unpubl.h5ad"],
    "Xu_2020": [f"{dset0_dir}/guo.h5ad"],
    "Sims_2019": [f"{dset0_dir}/szabo.h5ad"],
    "Schultze_unpubl": [f"{dset0_dir}/schultze_unpubl.h5ad"]
}

Run the ID conversion with HGNC Symbol (a.k.a. gene name) as the target database

In [10]:
# Convert the gene identifiers of every dataset to HGNC symbols and bin the outcomes.
result = dict()
final_database = "HGNC Symbol"

# The run is long (the recorded output shows ~45 minutes for a single dataset, and the
# run was interrupted), so the results are checkpointed to disk instead of living only
# in `result`, which the Ensembl cell below re-initialises. The file name follows the
# scheme of the pickles loaded in the "Plot/Table" section:
#   results_for_hlca_datasets_<final_database>_<YYYYmmdd-HHMMSS>.pk
timestamp = time.strftime("%Y%m%d-%H%M%S")
result_path = os.path.join(local_dir, f"results_for_hlca_datasets_{final_database}_{timestamp}.pk")

for dataset_name, dataset_files in adata_dict.items():

    # NOTE(review): only the first file of each dataset is read; confirm that the other
    # files listed for multi-file datasets are meant to be ignored here.
    adata = sc.read(dataset_files[0])
    gene_list = list(adata.var.index)

    matching = idt.convert_identifier_multiple(
        gene_list, to_release=107, final_database=final_database, pbar_prefix=dataset_name
    )
    binned_conversions = idt.classify_multiple_conversion(matching)

    idt.print_binned_conversion(binned_conversions)
    print(f"Source release: {idt.infer_identifier_source(gene_list)}")

    result[dataset_name] = binned_conversions

    # Re-write the checkpoint after every dataset so an interrupted run keeps what it has
    # finished. Writing to a temporary file first and renaming avoids leaving a truncated
    # pickle behind if the interruption happens mid-write.
    checkpoint_tmp = result_path + ".tmp"
    with open(checkpoint_tmp, "wb") as checkpoint_file:
        pickle.dump(result, checkpoint_file)
    os.replace(checkpoint_tmp, result_path)

Kaminski_2020: 100%|█████████| 45947/45947 [45:09<00:00, 16.96it/s, ID:ENSG00000280139]


changed_only_1_to_n: 55
changed_only_1_to_1: 4607
alternative_target_1_to_1: 11734
alternative_target_1_to_n: 65
matching_1_to_0: 97
matching_1_to_1: 33996
matching_1_to_n: 55
input_identifiers: 45947
Source release: (38, 94)


Meyer_2021:  31%|██████▍              | 6395/20922 [01:19<03:00, 80.66it/s, ID:FAM153B]


KeyboardInterrupt: 

Run the ID conversion with Ensembl gene ID

In [None]:
# Convert the gene identifiers of every dataset to Ensembl gene IDs and bin the outcomes.
result = dict()
final_database = "ensembl_gene"

# Checkpoint the results to disk: `result` is shared with (and re-initialised relative to)
# the HGNC cell above, and the "Plot/Table" section reads its input from pickles named
#   results_for_hlca_datasets_<final_database>_<YYYYmmdd-HHMMSS>.pk
timestamp = time.strftime("%Y%m%d-%H%M%S")
result_path = os.path.join(local_dir, f"results_for_hlca_datasets_{final_database}_{timestamp}.pk")

for dataset_name, dataset_files in adata_dict.items():

    # NOTE(review): only the first file of each dataset is read; confirm that the other
    # files listed for multi-file datasets are meant to be ignored here.
    adata = sc.read(dataset_files[0])
    gene_list = list(adata.var.index)

    # No `to_release` is passed here (the HGNC cell pins `to_release=107`), so whatever
    # idtrack uses by default applies. NOTE(review): confirm the difference is intended.
    matching = idt.convert_identifier_multiple(
        gene_list, final_database=final_database, pbar_prefix=dataset_name
    )
    binned_conversions = idt.classify_multiple_conversion(matching)

    idt.print_binned_conversion(binned_conversions)
    print(f"Source release: {idt.infer_identifier_source(gene_list)}")

    result[dataset_name] = binned_conversions

    # Re-write the checkpoint after every dataset so an interrupted run keeps what it has
    # finished. Writing to a temporary file first and renaming avoids leaving a truncated
    # pickle behind if the interruption happens mid-write.
    checkpoint_tmp = result_path + ".tmp"
    with open(checkpoint_tmp, "wb") as checkpoint_file:
        pickle.dump(result, checkpoint_file)
    os.replace(checkpoint_tmp, result_path)

Print the processor (CPU) information of the machine

In [None]:
import subprocess

# Print the output of `lscpu` for the machine this notebook runs on (presumably to put
# the conversion timings above in context). The command is fixed and takes no arguments,
# so it is executed directly from an argv list rather than through a shell.
cpu_info = subprocess.check_output(["lscpu"])
print(cpu_info.strip().decode())

Plot/Table

In [None]:
import sys
# Appends a path relative to the current working directory to the module search path.
# NOTE(review): `sys` is already imported in the first cell; repeating the setup here
# suggests this section is meant to be runnable on its own -- confirm, and confirm
# what "../.." is expected to point at.
sys.path.append("../..")

In [None]:
import os
import time
import pickle

In [None]:
# Same directory as the `local_dir` of the first part of the notebook; the result
# pickles loaded below are read from here.
local_dir = "/lustre/groups/ml01/workspace/kemal.inecik/idtrack_temp"

In [None]:
!ls -lh /lustre/groups/ml01/workspace/kemal.inecik/idtrack_temp

In [None]:
def _load_pickle(path):
    """Open `path` in binary mode and return the object unpickled from it."""
    # NOTE: unpickling can execute arbitrary code -- only load files from a trusted source.
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# Previously saved conversion results: one file for the Ensembl gene target and one for
# the HGNC symbol target.
res_ens = _load_pickle(local_dir + "/results_for_hlca_datasets_ensembl_gene_20230130-023149.pk")
res_hgn = _load_pickle(local_dir + "/results_for_hlca_datasets_HGNC Symbol_20230130-004722.pk")

In [None]:
# Top-level keys of the loaded Ensembl results (the next cell indexes them by dataset name).
res_ens.keys()

In [None]:
# Keys stored for one dataset; the counting cell below takes len() of each of these entries.
res_ens['MeyerNikolic_unpubl'].keys()

In [None]:
def _bin_sizes(results):
    """Return, per dataset, a dict mapping each stored key to the length of its entry."""
    return {
        dataset: {bin_name: len(entries) for bin_name, entries in bins.items()}
        for dataset, bins in results.items()
    }


# Entry counts per dataset for the Ensembl and the HGNC results, respectively.
r_ens = _bin_sizes(res_ens)
r_hgn = _bin_sizes(res_hgn)

In [None]:
import pandas as pd

In [None]:
# One row per table column: (key in the results, short label, column group).
# TDM: target database matching, ATM: alternative target matching
_column_spec = [
    ("input_identifiers",         "",    "Input IDs"),
    ("matching_1_to_1",           "TDM", "One-to-One"),
    ("changed_only_1_to_1",       "CI",  "One-to-One"),
    ("alternative_target_1_to_1", "ATM", "One-to-One"),
    ("matching_1_to_n",           "TDM", "One-to-Many"),
    ("changed_only_1_to_n",       "CI",  "One-to-Many"),
    ("alternative_target_1_to_n", "ATM", "One-to-Many"),
    ("matching_1_to_0",           "",    "One-to-None"),
]

# Order in which the result keys become table columns.
column_order = [key for key, label, group in _column_spec]
# Result key -> short column label (spelling of the name kept; later cells refer to it).
raname_columns = {key: label for key, label, group in _column_spec}
# Group header for each column, aligned with `column_order`.
higher_columns = [group for key, label, group in _column_spec]

In [None]:
def _labelled_counts(counts, target_label):
    """Turn per-dataset counts into a table (datasets as rows) with three header levels.

    Header levels: `target_label` over the six conversion columns (blank over the first
    and last column), then `higher_columns`, then the short labels from `raname_columns`.
    """
    table = pd.DataFrame.from_dict(counts).T[column_order].rename(columns=raname_columns)
    table.columns = [[""] + [target_label] * 6 + [""], higher_columns, table.columns]
    return table


df1 = _labelled_counts(r_hgn, "Target HGNC Names")
df2 = _labelled_counts(r_ens, "Target Ensembl")

In [None]:
# Side-by-side table: all of df1 (the HGNC columns plus the unlabelled input-ID and
# one-to-none columns) followed by only the "Target Ensembl" columns of df2.
df = pd.concat([df1, df2[["Target Ensembl"]]], axis=1)

In [None]:
# Columns shown in the summary table, in display order. The commented-out entries are
# columns that exist in `df` but are currently excluded from the view.
selected_columns = [
    ('', 'Input IDs', ''),
    ('Target HGNC Names', 'One-to-One', 'TDM'),
    # ('Target HGNC Names', 'One-to-One', 'CI'),
    ('Target HGNC Names', 'One-to-One', 'ATM'),
    ('Target HGNC Names', 'One-to-Many', 'TDM'),
    # ('Target HGNC Names', 'One-to-Many', 'CI'),
    ('Target HGNC Names', 'One-to-Many', 'ATM'),
    ('Target Ensembl', 'One-to-One', 'TDM'),
    # ('Target Ensembl', 'One-to-One', 'CI'),
    # ('Target Ensembl', 'One-to-One', 'ATM'),
    ('Target Ensembl', 'One-to-Many', 'TDM'),
    # ('Target Ensembl', 'One-to-Many', 'CI'),
    # ('Target Ensembl', 'One-to-Many', 'ATM'),
    ('', 'One-to-None', ''),
]
df = df[selected_columns]
df

In [None]:
# Top-level keys of the loaded HGNC results.
res_hgn.keys()

In [None]:
# Keys stored for one dataset of the HGNC results.
res_hgn["Kaminski_2020"].keys()

In [None]:
# Fields of a single conversion record, taken from the first entry (index 0) of the
# 'changed_only_1_to_n' bin; this fails if that bin is empty.
res_hgn["Kaminski_2020"]['changed_only_1_to_n'][0].keys()

In [None]:
# Query identifiers per dataset, read from the "input_identifiers" entries of the HGNC results.
query_ids = {
    dataset: [record["query_id"] for record in bins["input_identifiers"]]
    for dataset, bins in res_hgn.items()
}

# First target ID of every record in the two one-to-one bins of the HGNC results
# ("matching_1_to_1" first, then "alternative_target_1_to_1").
hgnc_ids = {
    dataset: [
        record["target_id"][0]
        for bin_name in ("matching_1_to_1", "alternative_target_1_to_1")
        for record in bins[bin_name]
    ]
    for dataset, bins in res_hgn.items()
}

# First target ID of every "matching_1_to_1" record of the Ensembl results.
ens_ids = {
    dataset: [record["target_id"][0] for record in bins["matching_1_to_1"]]
    for dataset, bins in res_ens.items()
}

In [None]:
# Per dataset: number of query IDs, collected HGNC targets and collected Ensembl targets.
for dataset, queried in query_ids.items():
    print(len(queried), len(hgnc_ids[dataset]), len(ens_ids[dataset]))

In [None]:
# Pool the per-dataset lists into one flat list each (duplicates are kept).
a1 = [identifier for ids in query_ids.values() for identifier in ids]
a2 = [identifier for ids in hgnc_ids.values() for identifier in ids]
a3 = [identifier for ids in ens_ids.values() for identifier in ids]
len(a1), len(a2), len(a3)

In [None]:
# Unique identifiers across all datasets: queries, HGNC targets, Ensembl targets.
b1 = set(a1)
b2 = set(a2)
b3 = set(a3)
len(b1), len(b2), len(b3)

In [None]:
# Per dataset: (name, pooled Ensembl IDs absent from this dataset,
#               unique Ensembl IDs in this dataset, size of the pooled set).
[
    (dataset, len(b3 - unique_ids), len(unique_ids), len(b3))
    for dataset, unique_ids in ((name, set(ids)) for name, ids in ens_ids.items())
]

In [None]:
# Per dataset: (name, pooled HGNC IDs absent from this dataset,
#               unique HGNC IDs in this dataset, size of the pooled set).
[
    (dataset, len(b2 - unique_ids), len(unique_ids), len(b2))
    for dataset, unique_ids in ((name, set(ids)) for name, ids in hgnc_ids.items())
]

In [None]:
# Per dataset: (name, pooled query IDs absent from this dataset,
#               unique query IDs in this dataset, size of the pooled set).
[
    (dataset, len(b1 - unique_ids), len(unique_ids), len(b1))
    for dataset, unique_ids in ((name, set(ids)) for name, ids in query_ids.items())
]