# results_preliminary_for_hlca

In [1]:
import os
import scanpy as sc
import pickle
import sys
from collections import Counter
sys.path.append("../..")

In [2]:
from idtrack import *
from idtrack._track_tests import *

In [3]:
logger_config()
local_dir = "/lustre/groups/ml01/workspace/kemal.inecik/idtrack_temp"  # or any other local directory
dm = DatabaseManager("homo_sapiens", 107, "gene", local_dir, 79) 

In [4]:
tt: TrackTests = TrackTests(dm)

2022-08-16 23:41:21 INFO:graph: The graph is being read.


In [5]:
base_path = "/lustre/groups/ml01/workspace/hlca_lisa.sikkema_malte.luecken/HLCA_reproducibility/data"
dset0_dir = os.path.join(base_path, "HLCA_extended/extension_datasets/ready/full")
dset1_dir = os.path.join(base_path, "HLCA_extended/extension_datasets/raw")

adata_dict = {
    "Kaminski_2020": [f"{dset0_dir}/adams.h5ad"],
    "Meyer_2021": [f"{dset0_dir}/meyer_2021.h5ad"],
    "MeyerNikolic_unpubl": [f"{dset0_dir}/meyer_nikolic_unpubl.h5ad"],
    "Barbry_unpubl": [f"{dset0_dir}/barbry.h5ad"],
    "Regev_2021": [
        f"{dset0_dir}/delorey_cryo.h5ad", f"{dset0_dir}/delorey_fresh.h5ad",
        f"{dset0_dir}/delorey_nuclei.h5ad"
    ],
    "Thienpont_2018": [f"{dset1_dir}/Lambrechts/lambrechts.h5ad"],
    "Budinger_2020": [f"{dset0_dir}/bharat.h5ad"],
    "Banovich_Kropski_2020": [f"{dset0_dir}/haberman.h5ad"],
    "Sheppard_2020": [f"{dset0_dir}/tsukui.h5ad"],
    "Wunderink_2021": [
        f"{dset0_dir}/grant_cryo.h5ad", f"{dset0_dir}/grant_fresh.h5ad"
    ],
    "Lambrechts_2021": [
        f"{dset0_dir}/wouters.h5ad" #, f"{dset0_dir}/wouters_labs.h5ad"
    ],
    "Zhang_2021": [f"{dset1_dir}/Liao/covid_for_publish.h5ad"],
    "Duong_lungMAP_unpubl": [f"{dset0_dir}/duong.h5ad"],
    "Janssen_2020": [f"{dset0_dir}/mould.h5ad"],
    "Sun_2020": [
        f"{dset0_dir}/wang_sub_batch1.h5ad", f"{dset0_dir}/wang_sub_batch2.h5ad",
        f"{dset0_dir}/wang_sub_batch3.h5ad", f"{dset0_dir}/wang_sub_batch4.h5ad"],
    "Gomperts_2021": [
        f"{dset0_dir}/carraro_ucla.h5ad", f"{dset0_dir}/carraro_cff.h5ad",
        f"{dset0_dir}/carraro_csmc.h5ad"],
    "Eils_2020": [f"{dset0_dir}/lukassen.h5ad"],
    "Schiller_2020": [f"{dset0_dir}/mayr.h5ad"],
    "Misharin_Budinger_2018": [f"{dset0_dir}/reyfman_disease.h5ad"],
    "Shalek_2018": [f"{dset0_dir}/ordovasmontanes.h5ad"],
    "Schiller_2021": [f"{dset0_dir}/schiller_discovair.h5ad"],
    "Peer_Massague_2020": [f"{dset0_dir}/laughney.h5ad"],
    "Lafyatis_2019": [f"{dset0_dir}/valenzi.h5ad"],
    "Tata_unpubl": [f"{dset0_dir}/tata_unpubl.h5ad"],
    "Xu_2020": [f"{dset0_dir}/guo.h5ad"],
    "Sims_2019": [f"{dset0_dir}/szabo.h5ad"],
    "Schultze_unpubl": [f"{dset0_dir}/schultze_unpubl.h5ad"]
}

##### Conversion and save

In [6]:
results = dict()

for ddadaata in adata_dict:
    
    adata = sc.read(adata_dict[ddadaata][0])
    gene_list = list(adata.var.index)
    
    entered_gene_list = list()
    return_list = list()
    
    print(ddadaata)
    for ind, gl in enumerate(gene_list):
        if ind % 100 == 0 or ind > len(gene_list)-5:
            progress_bar(ind, len(gene_list)-1)
        
        new_gl, is_converted = tt.unfound_node_solutions(gl)
        if new_gl is None:
            new_gl, is_converted = tt.unfound_node_solutions(f"{DB.synonym_id_nodes_prefix}{gl}")
            is_converted = True if new_gl is not None else False

        if new_gl is None:
            return_list.append([])
        elif new_gl and is_converted:
            conv = tt.convert(new_gl, None, None, 'ensembl_gene', prioritize_to_one_filter=True, return_path=False)
            return_list.append(list(conv) if conv is not None else [])
            entered_gene_list.append(new_gl)
        elif new_gl:
            conv = tt.convert(gl, None, None, 'ensembl_gene', prioritize_to_one_filter=True, return_path=False)
            return_list.append(list(conv) if conv is not None else [])
            entered_gene_list.append(gl)
        else:
            raise ValueError
    print(tt.identify_source(entered_gene_list)[0])
    print(list(Counter([len(i) for i in return_list]).most_common())[:10])
    assert len(gene_list) == len(return_list)
    results[ddadaata] = {i: j for i, j in zip(gene_list, return_list)}
    print()

file_path = os.path.join(local_dir, "results_preliminary_for_hlca.pk")
with open(file_path, 'wb') as handle:
    pickle.dump(result, handle)
    print(f"Saved: {file_path}")

Kaminski_2020
Progress: |████████████████████| 100.0% 
(('HGNC Symbol', 94), 30503)
[(1, 45536), (0, 411)]

Meyer_2021
Progress: |████████████████████| 100.0% 
(('HGNC Symbol', 93), 20407)
[(1, 20898), (0, 24)]

MeyerNikolic_unpubl
Progress: |████████████████████| 100.0% 
(('HGNC Symbol', 93), 23406)
[(1, 33404), (0, 174), (2, 4)]

Barbry_unpubl
Progress: |████████████████████| 100.0% 
(('HGNC Symbol', 98), 14650)
[(1, 16829), (0, 29), (2, 1)]

Regev_2021
Progress: |████████████████████| 100.0% 
(('HGNC Symbol', 93), 21864)
[(1, 30809), (0, 173), (2, 1)]

Thienpont_2018
Progress: |████████████████████| 100.0% 
(('Vega gene', 88), 26131)
[(1, 27630), (0, 327), (2, 1)]

Budinger_2020
Progress: |████████████████████| 100.0% 
(('HGNC Symbol', 93), 20189)
[(1, 26219), (0, 96), (2, 1)]

Banovich_Kropski_2020
Progress: |████████████████████| 100.0% 
(('Vega gene', 84), 31238)
[(1, 33191), (0, 497), (2, 4), (3, 2)]

Sheppard_2020
Progress: |████████████████████| 100.0% 
(('Vega gene', 88), 254

NameError: name 'result' is not defined

In [7]:
file_path = os.path.join(local_dir, "results_preliminary_for_hlca.pk")
with open(file_path, 'wb') as handle:
    pickle.dump(results, handle)
    print(f"Saved: {file_path}")

Saved: /lustre/groups/ml01/workspace/kemal.inecik/idtrack_temp/results_preliminary_for_hlca.pk


##### how many target hit

In [12]:
all_targets = {j for database in results for i in results[database] for j in results[database][i]}
len(all_targets)

57606

In [19]:
for database in results:
    all_targets = Counter(j for i in results[database] for j in results[database][i])
    n_to_1_events = [j for i in results[database] for j in results[database][i] if all_targets[j] != 1]
    one_to_events = [i for i in results[database] if len(results[database][i]) > 1]
    one_to_0_events = [i for i in results[database] if len(results[database][i]) < 1]
    print(f"{database}\nSource_Count:{len(results[database])}\nTarget_Count:{len(all_targets)}")
    print(f"n-to-1_events:{len(n_to_1_events)}\n1-to-n_events:{len(one_to_events)}\n1-to-0_events:{len(one_to_0_events)}")
    print()

Kaminski_2020
Source_Count:45947
Target_Count:45378
n-to-1_events:305
1-to-n_events:0
1-to-0_events:411

Meyer_2021
Source_Count:20922
Target_Count:20801
n-to-1_events:189
1-to-n_events:0
1-to-0_events:24

MeyerNikolic_unpubl
Source_Count:33582
Target_Count:32815
n-to-1_events:1145
1-to-n_events:4
1-to-0_events:174

Barbry_unpubl
Source_Count:16859
Target_Count:16768
n-to-1_events:126
1-to-n_events:1
1-to-0_events:29

Regev_2021
Source_Count:30983
Target_Count:30388
n-to-1_events:826
1-to-n_events:1
1-to-0_events:173

Thienpont_2018
Source_Count:27958
Target_Count:27428
n-to-1_events:393
1-to-n_events:1
1-to-0_events:327

Budinger_2020
Source_Count:26316
Target_Count:25943
n-to-1_events:545
1-to-n_events:1
1-to-0_events:96

Banovich_Kropski_2020
Source_Count:33694
Target_Count:32824
n-to-1_events:731
1-to-n_events:6
1-to-0_events:497

Sheppard_2020
Source_Count:27147
Target_Count:26617
n-to-1_events:409
1-to-n_events:1
1-to-0_events:320

Wunderink_2021
Source_Count:21819
Target_Count:2

##### HGNC vs Ensembl gene as targets

In [None]:
ddadaata = "Kaminski_2020"
adata = sc.read(adata_dict[ddadaata][0])

gene_list = list(adata.var.index)
entered_gene_list = list()
return_list = list()
print(ddadaata, len(gene_list))

for ind, gl in enumerate(gene_list):
    
    if ind % 100 == 0 or ind > len(gene_list)-5:
        progress_bar(ind, len(gene_list)-1)
    
    new_gl, is_converted = tt.unfound_node_solutions(gl)
    if new_gl is None:
        new_gl, is_converted = tt.unfound_node_solutions(f"{DB.synonym_id_nodes_prefix}{gl}")
        is_converted = True if new_gl is not None else False

    if new_gl is None:
        return_list.append([])
    elif new_gl and is_converted:
        conv = tt.convert(new_gl, None, None, 'ensembl_gene', prioritize_to_one_filter=True, return_path=False)
        return_list.append(list(conv) if conv is not None else [])
        entered_gene_list.append(new_gl)
    elif new_gl:
        conv = tt.convert(gl, None, None, 'ensembl_gene', prioritize_to_one_filter=True, return_path=False)
        return_list.append(list(conv) if conv is not None else [])
        entered_gene_list.append(gl)
    else:
        raise ValueError
print(tt.identify_source(entered_gene_list)[0])
print(list(Counter([len(i) for i in return_list]).most_common())[:10])
print()

gene_list = list(adata.var.index)
entered_gene_list = list()
return_list = list()
print(ddadaata, len(gene_list))

for ind, gl in enumerate(gene_list):
    if ind % 100 == 0 or ind > len(gene_list)-5:
        progress_bar(ind, len(gene_list)-1)

    new_gl, is_converted = tt.unfound_node_solutions(gl)
    if new_gl is None:
        new_gl, is_converted = tt.unfound_node_solutions(f"{DB.synonym_id_nodes_prefix}{gl}")
        is_converted = True if new_gl is not None else False

    if new_gl is None:
        return_list.append([])
    elif new_gl and is_converted:
        conv = tt.convert(new_gl, None, None, 'HGNC Symbol', prioritize_to_one_filter=True, return_path=False)
        return_list.append(list(conv) if conv is not None else [])
        entered_gene_list.append(new_gl)
    elif new_gl:
        conv = tt.convert(gl, None, None, 'HGNC Symbol', prioritize_to_one_filter=True, return_path=False)
        return_list.append(list(conv) if conv is not None else [])
        entered_gene_list.append(gl)
    else:
        raise ValueError
print(tt.identify_source(entered_gene_list)[0])
print(list(Counter([len(i) for i in return_list]).most_common())[:10])
print()

In [None]:
[gene_list[ind] for ind, i in enumerate(return_list) if len(i) == 0][:10]