# Initialization and Tests

In [1]:
# NOTE(review): bare literal that only echoes 1; presumably a kernel liveness check — confirm or remove.
1

1

In [2]:
# Load IPython's autoreload extension in mode 2, which reloads imported modules
# before each execution. Presumably used so edits to the local idtrack checkout
# take effect without restarting the kernel — confirm.
%load_ext autoreload
%autoreload 2

In [3]:
import sys

# NOTE(review): hardcoded, user-specific checkout path. It is appended (lowest
# priority), so `import idtrack` below resolves to this working copy only when
# the package is not otherwise importable — confirm and consider making it
# configurable.
sys.path.append("/home/icb/kemal.inecik/work/codes/idtrack")

# NOTE(review): `copy` and `time` are not used in the cells visible here.
import copy
import os
import pickle
import time

import idtrack
from idtrack import DB

In [4]:
# Local repository directory; the listing in the next cell shows it holding the
# graph pickle, the per-assembly HDF5 files and the externals YAML files.
# NOTE(review): absolute, cluster-specific path — adjust when running elsewhere.
local_dir = "/lustre/groups/ml01/workspace/kemal.inecik/idtrack_temp/"
# Pickled results of the conversion tests further down are written here.
test_results_dir = os.path.join(local_dir, "tests")

idt = idtrack.API(local_repository=local_dir)
idt.configure_logger()
# Last expression of the cell: the recorded run logged an Ensembl REST API query
# and returned ('homo_sapiens', 114). The value is displayed, not stored.
idt.get_ensembl_organism("homo sapiens")

2025-05-31 17:19:20 INFO:verify_organism: Ensembl Rest API query to get the organism names and associated releases.


('homo_sapiens', 114)

In [5]:
!ls -l /lustre/groups/ml01/workspace/kemal.inecik/idtrack_temp/

total 42110132
-rw-r--r--. 1 kemal.inecik OG-ICB-User   852126351 May 27 20:30 graph_homo_sapiens_min79_max114_narrow.pickle
-rw-r--r--. 1 kemal.inecik OG-ICB-User 21376563160 May 27 20:28 homo_sapiens_assembly-37.h5
-rw-r--r--. 1 kemal.inecik OG-ICB-User 20891831455 May 27 20:28 homo_sapiens_assembly-38.h5
-rw-r--r--. 1 kemal.inecik OG-ICB-User       65514 May 27 17:59 homo_sapiens_externals_modified.yml
-rw-r--r--. 1 kemal.inecik OG-ICB-User       55077 May 27 17:26 homo_sapiens_externals_template.yml


In [6]:
# Load the graph for homo_sapiens up to Ensembl release 114; the recorded log
# shows it being read from graph_homo_sapiens_min79_max114_narrow.pickle.
# NOTE(review): the effect of return_test=True is not visible here — confirm against the idtrack API.
idt.initialize_graph(organism_name="homo_sapiens", last_ensembl_release=114, return_test=True)

2025-05-31 17:19:23 INFO:graph_maker: The graph is being read: /lustre/groups/ml01/workspace/kemal.inecik/idtrack_temp/graph_homo_sapiens_min79_max114_narrow.pickle


In [7]:
# Compute the graph's cached properties up front; the recorded log lists ten of
# them (combined_edges ... hyperconnective_nodes) over roughly 2.5 minutes.
idt.calculate_graph_caches()

2025-05-31 17:20:12 INFO:the_graph: Cached properties being calculated: combined_edges
2025-05-31 17:20:54 INFO:the_graph: Cached properties being calculated: combined_edges_assembly_specific_genes
2025-05-31 17:20:57 INFO:the_graph: Cached properties being calculated: combined_edges_genes
2025-05-31 17:21:34 INFO:the_graph: Cached properties being calculated: lower_chars_graph
2025-05-31 17:21:35 INFO:the_graph: Cached properties being calculated: get_active_ranges_of_id
2025-05-31 17:21:56 INFO:the_graph: Cached properties being calculated: available_external_databases
2025-05-31 17:21:57 INFO:the_graph: Cached properties being calculated: available_genome_assemblies
2025-05-31 17:21:58 INFO:the_graph: Cached properties being calculated: available_external_databases_assembly
2025-05-31 17:21:59 INFO:the_graph: Cached properties being calculated: node_trios
2025-05-31 17:22:43 INFO:the_graph: Cached properties being calculated: hyperconnective_nodes
2025-05-31 17:22:47 INFO:the_graph:

### Structural Tests

In [7]:
# Structural test; the recorded run iterated 36 items in about 1 min 17 s and returned True.
idt.track.is_id_functions_consistent_ensembl()

100%|██████████████████████████████████████████████████████████████████████████████████████████████| 36/36 [01:17<00:00,  2.14s/it, Item:114]


True

In [8]:
# Structural test; the recorded run iterated 165,093 items in under a second and returned True.
idt.track.is_id_functions_consistent_ensembl_2()

100%|█████████████████████████████████████████████████████████████████████████████| 165093/165093 [00:00<00:00, 176122.22it/s, Item:LRG_99.1]


True

In [9]:
# Structural test; the recorded run iterated 100,842 items in under a second and returned True.
idt.track.is_range_functions_robust()

100%|██████████████████████████████████████████████████████████████████████| 100842/100842 [00:00<00:00, 239189.69it/s, Item:ENSG00000169933]


True

In [10]:
# Structural test; the recorded run iterated 100,842 items in about 3 s and returned True.
idt.track.is_base_is_range_correct()

100%|███████████████████████████████████████████████████████████████████████| 100842/100842 [00:03<00:00, 29035.43it/s, Item:ENSG00000169933]


True

In [11]:
# Structural test; returned True in the recorded run.
idt.track.is_combined_edges_dicts_overlapping_and_complete()

True

In [12]:
# Structural test; returned True in the recorded run.
idt.track.is_edge_with_same_nts_only_at_backbone_nodes()

True

In [14]:
# Structural test; the recorded run iterated 2,507,048 items in about 8 s and returned True.
idt.track.is_node_consistency_robust()

100%|██████████████████████████████████████████████████████████████████████████████████████████| 2507048/2507048 [00:08<00:00, 309828.55it/s]


True

In [20]:
# Structural test, and by far the slowest one: the recorded run took about
# 29 min for assembly 37 and 34 min for assembly 38 (36 items each), then
# returned True.
idt.track.is_id_functions_consistent_external()

Assembly 37: 100%|█████████████████████████████████████████████████████████████████████████████████| 36/36 [28:41<00:00, 47.81s/it, Item:114]
Assembly 38: 100%|█████████████████████████████████████████████████████████████████████████████████| 36/36 [34:12<00:00, 57.00s/it, Item:114]


True

### Conversion Tests

#### External conversion

In [None]:
# Repeat the external-conversion robustness test 12 times on a 25% sample with
# the one-to-one prioritisation filter off, printing each verdict and keeping
# the per-run result.
conversion_kwargs = dict(from_fraction=0.25, prioritize_to_one_filter=False)
results = []
for _ in range(12):
    response, issue_details = idt.track.is_final_external_conversion_robust(**conversion_kwargs)
    # Only the fourth element (the result) is kept; the three issue collections are dropped.
    _, _, _, res = issue_details
    print(response)
    results.append(res)

# Persist all collected results as a single pickle in the test-results directory.
os.makedirs(test_results_dir, exist_ok=True)
output_path = os.path.join(test_results_dir, "travel_v0_conversion_results.pkl")
with open(output_path, "wb") as handle:
    pickle.dump(results, handle)
print(f"Results saved to {output_path}")

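**Note:** the role of `prioritize_to_one_filter` for final external conversion

With `prioritize_to_one_filter=False`, the recorded run below passes the robustness test: the converted targets match the full set of IDs expected from the database. With `prioritize_to_one_filter=True`, the filter can reduce several linked targets to a single one (for example, one of two Vega gene IDs), and the test reports these cases as mismatches against the full expectation. Nevertheless, we recommend using the filter to minimize 1-to-n matching.

The cells below show in more depth what is happening: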

In [88]:
# Reference data for one database at one release: fetch the "external_relevant"
# table for the main assembly at `release` and keep only the rows of `database`.
release = 81
database = "Vega gene"

dm_original = idt.track.db_manager
dm = dm_original.change_release(release).change_assembly(DB.main_assembly)
df_a = dm.get_db("external_relevant")
df = df_a[df_a["name_db"] == database].copy()

# Group the external IDs (id_db) by the graph ID they are attached to.
base_dict: dict[str, set] = {}
for graph_id, external_id in zip(df["graph_id"], df["id_db"]):
    base_dict.setdefault(graph_id, set()).add(external_id)

In [89]:
# Run the external-conversion robustness test for the database and release chosen
# above on a 0.1 fraction of source IDs, with the one-to-one prioritisation
# filter off. Unpacks the overall verdict, three issue collections and the result.
response, (issues_t1, issues_t2, issues_t3, res) = idt.track.is_final_external_conversion_robust(
    from_fraction=0.1,
    database=database,
    ens_rel=release,
    prioritize_to_one_filter=False,
)

2025-05-31 09:50:34 INFO:track_tests: Assembly: 38, Database: Vega gene, Release: 81
Mapping: 100%|█████████████████████████████████████████████████████| 6657/6657 [00:31<00:00, 208.49it/s, Item:LRG_784.1]


In [90]:
# Summarise the run: verdict, number of converted IDs, size of each issue
# collection, and whether none of the issues_t3 IDs occurs as a graph_id in df.
rows_for_t3_ids = df[df["graph_id"].isin(issues_t3)]
(
    response,
    len(res["conversion"]),
    len(issues_t1),
    len(issues_t2),
    len(issues_t3),
    len(rows_for_t3_ids) == 0,
)

(True, 5522, 0, 0, 1135, True)

In [91]:
# Render the collected result as a text report and print it so the multi-line
# layout is preserved in the cell output.
report_text = idt.track.format_history_travel_testing_report(
    res,
    include_header=True,
    printable=True,
    line_separation_at_end=False,
)
print(report_text)

╔═ History-Travel-Testing Report ═╗
Source  : assembly_38_ensembl_gene (Assembly 38, Release 81)
Target  : Vega gene (Release 81)
External: True   1→1-pref.: False
Sample  : 0.1 of source IDs
Failure / Anomaly Counts:
  - Voyage failed (graceful)       : 1,135
  - Voyage failed (unknown)        : 0
  - Query not in graph             : 0
  - Lost item                      : 0
  - Lost item, but ID exists       : 0
  - Found IDs not accurate         : 0
Mapping Statistics:
  - One→one IDs                    : 0
  - One→many IDs                   : 5,522
  - One→many (single conv.)        : 5,522
  - Successfully converted IDs     : 5,522
  - Clash one→one                  : 0
  - Clash many→many                : 74
  - Clash mixed                    : 0
Total runtime: 31.94 s


In [92]:
# Same robustness test as the earlier run, but with the one-to-one prioritisation
# filter on. This rebinds response, issues_t1..t3 and res, so the cells below
# inspect the filtered run.
response, (issues_t1, issues_t2, issues_t3, res) = idt.track.is_final_external_conversion_robust(
    from_fraction=0.1,
    database=database,
    ens_rel=release,
    prioritize_to_one_filter=True,
)

2025-05-31 09:51:15 INFO:track_tests: Assembly: 38, Database: Vega gene, Release: 81
Mapping: 100%|██████████████████████████████████████████████████████| 6657/6657 [00:32<00:00, 203.92it/s, Item:LRG_92.1]


In [93]:
# Summarise the run: verdict, number of converted IDs, size of each issue
# collection, and whether none of the issues_t3 IDs occurs as a graph_id in df.
rows_for_t3_ids = df[df["graph_id"].isin(issues_t3)]
(
    response,
    len(res["conversion"]),
    len(issues_t1),
    len(issues_t2),
    len(issues_t3),
    len(rows_for_t3_ids) == 0,
)

(False, 5498, 510, 0, 1159, True)

In [94]:
# Render the collected result as a text report and print it so the multi-line
# layout is preserved in the cell output.
report_text = idt.track.format_history_travel_testing_report(
    res,
    include_header=True,
    printable=True,
    line_separation_at_end=False,
)
print(report_text)

╔═ History-Travel-Testing Report ═╗
Source  : assembly_38_ensembl_gene (Assembly 38, Release 81)
Target  : Vega gene (Release 81)
External: True   1→1-pref.: True
Sample  : 0.1 of source IDs
Failure / Anomaly Counts:
  - Voyage failed (graceful)       : 1,159
  - Voyage failed (unknown)        : 0
  - Query not in graph             : 0
  - Lost item                      : 0
  - Lost item, but ID exists       : 0
  - Found IDs not accurate         : 0
Mapping Statistics:
  - One→one IDs                    : 510
  - One→many IDs                   : 4,988
  - One→many (single conv.)        : 4,988
  - Successfully converted IDs     : 5,498
  - Clash one→one                  : 0
  - Clash many→many                : 66
  - Clash mixed                    : 0
Total runtime: 32.65 s


In [95]:
# Inspect the first type-1 issue of the filtered run: in the recorded output,
# `converted` holds one of the two IDs listed in `base_expectation`.
issues_t1[0]

{'database': 'Vega gene',
 'asym': 38,
 'ens_rel': 81,
 'id': 'ENSG00000117242.7',
 'converted': ['OTTHUMG00000002840'],
 'base_expectation': {'OTTHUMG00000002840', 'RP11-401M16.7'}}

In [96]:
# Example source ID used by the lookup and conversion cells below.
id_from = "ENSG00000008197.4"

In [97]:
# Reference rows for the example ID; the recorded output shows two Vega gene
# entries (OTTHUMG00000014832 and RP3-336H9.2).
df[df["graph_id"] == id_from]

Unnamed: 0,release,graph_id,id_db,name_db,ensembl_identity,xref_identity
355949,81,ENSG00000008197.4,OTTHUMG00000014832,Vega gene,,
355950,81,ENSG00000008197.4,RP3-336H9.2,Vega gene,,


In [98]:
# Convert the example ID within the same release to the chosen database with the
# one-to-one prioritisation filter on; the recorded output keeps a single final
# element (OTTHUMG00000014832) and includes its filter scores.
idt.track.convert(
    id_from,
    to_release=release,
    from_release=release,
    final_database=database,
    return_path=True,
    prioritize_to_one_filter=True,
)

{'ENSG00000008197.4': {'from_id': 'ENSG00000008197.4',
  'assembly_jump': 0,
  'external_jump': 0,
  'external_step': 0,
  'initial_conversion_conf': 1,
  'edge_scores_reduced': nan,
  'ensembl_step': 1,
  'final_assembly_priority': ([1, 2], 2),
  'the_path': ((None, 'ENSG00000008197.4', None),),
  'final_conversion': {'final_conversion_confidence': 0,
   'final_database': 'Vega gene',
   'final_elements': {'OTTHUMG00000014832': {'final_assembly_priority_count': 2,
     'final_assembly_min_priority': 1,
     'additional_assembly_jump': 0,
     'the_path': (('ENSG00000008197.4', 'OTTHUMG00000014832', 0, 81),),
     'filter_scores': {'initial_filter': [0, 1, 0, 0, 1, -2, -2],
      'same_as_input_filter': False,
      'node_importance_filter': None}}}}}}

In [99]:
# Same conversion with the one-to-one prioritisation filter off; the recorded
# output returns both final elements (OTTHUMG00000014832 and RP3-336H9.2).
idt.track.convert(
    id_from,
    to_release=release,
    from_release=release,
    final_database=database,
    return_path=True,
    prioritize_to_one_filter=False,
)

{'ENSG00000008197.4': {'from_id': 'ENSG00000008197.4',
  'assembly_jump': 0,
  'external_jump': 0,
  'external_step': 0,
  'initial_conversion_conf': 1,
  'edge_scores_reduced': nan,
  'ensembl_step': 1,
  'final_assembly_priority': ([1, 2], 2),
  'the_path': ((None, 'ENSG00000008197.4', None),),
  'final_conversion': {'final_conversion_confidence': 0,
   'final_database': 'Vega gene',
   'final_elements': {'OTTHUMG00000014832': {'final_assembly_priority_count': 2,
     'final_assembly_min_priority': 1,
     'additional_assembly_jump': 0,
     'the_path': (('ENSG00000008197.4', 'OTTHUMG00000014832', 0, 81),)},
    'RP3-336H9.2': {'final_assembly_priority_count': 1,
     'final_assembly_min_priority': 1,
     'additional_assembly_jump': 0,
     'the_path': (('ENSG00000008197.4', 'RP3-336H9.2', 0, 81),)}}}}}

#### History travel only on Ensembl backbone

In [None]:
# Run the randomized history-travel test 12 times on a 0.1 fraction of IDs with
# every include_* option off (Ensembl backbone only, per the section heading),
# keeping each run's returned result.
history_results = []
for _ in range(12):
    r = idt.track.history_travel_testing_random(
        from_fraction=0.1,
        include_external_destination=False,
        include_external_source=False,
        include_ensembl_destination=False,
        include_ensembl_source=False,
        return_result=True,
    )
    history_results.append(r)

# Persist the results collected by THIS cell. Dumping `history_results` (not the
# `results` list from the external-conversion cell) keeps the file content in
# line with its name and lets the cell run without that earlier cell.
os.makedirs(test_results_dir, exist_ok=True)
output_path = os.path.join(test_results_dir, "travel_v1_conversion_results.pkl")
with open(output_path, "wb") as f:
    pickle.dump(history_results, f)
print(f"Results saved to {output_path}")

#### History travel from anywhere to Ensembl backbone

In [None]:
# Run the randomized history-travel test 12 times on a 0.1 fraction of IDs with
# external and Ensembl sources enabled but both destination options off
# (anywhere -> Ensembl backbone, per the section heading), keeping each result.
history_results = []
for _ in range(12):
    r = idt.track.history_travel_testing_random(
        from_fraction=0.1,
        include_external_destination=False,
        include_external_source=True,
        include_ensembl_destination=False,
        include_ensembl_source=True,
        return_result=True,
    )
    history_results.append(r)

# Persist the results collected by THIS cell. Dumping `history_results` (not the
# `results` list from the external-conversion cell) keeps the file content in
# line with its name and lets the cell run without that earlier cell.
os.makedirs(test_results_dir, exist_ok=True)
output_path = os.path.join(test_results_dir, "travel_v2_conversion_results.pkl")
with open(output_path, "wb") as f:
    pickle.dump(history_results, f)
print(f"Results saved to {output_path}")

#### History travel from anywhere to anywhere

In [None]:
# Run the randomized history-travel test 12 times on a 0.1 fraction of IDs with
# all source and destination options enabled (anywhere -> anywhere, per the
# section heading), keeping each run's returned result.
history_results = []
for _ in range(12):
    r = idt.track.history_travel_testing_random(
        from_fraction=0.1,
        include_external_destination=True,
        include_external_source=True,
        include_ensembl_destination=True,
        include_ensembl_source=True,
        return_result=True,
    )
    history_results.append(r)

# Persist the results collected by THIS cell. Dumping `history_results` (not the
# `results` list from the external-conversion cell) keeps the file content in
# line with its name and lets the cell run without that earlier cell.
os.makedirs(test_results_dir, exist_ok=True)
output_path = os.path.join(test_results_dir, "travel_v3_conversion_results.pkl")
with open(output_path, "wb") as f:
    pickle.dump(history_results, f)
print(f"Results saved to {output_path}")