In [1]:
import pykeen.datasets
import pykeen
import os
import subprocess
import sys
import pickle
import shutil
import pandas as pd
import polars as pl
import numpy as np

# Train a Probabilistic Case-based Reasoning model on BioKG

Context:
Rivas-Barrigan implemented a pared-down flavor of BioKG and OpenBioLink containing only 3 node types. This notebook seeks to create case-based reasoning models on the full BioKG and OpenBioLink knowledge graphs for drug repurposing. We don't run probCBR on the modified dataset because it wouldn't make sense: the modified dataset only has one specific schema (Drug - interacts with - Gene - associated with - Disease).

Actions:
* modify the probCBR repository so that it can process only specific edge types; this is controlled by the `--filter_relations` flag
* create probCBR similarity subgraph
* generate entity similarity, vocabulary, subgraphs, priors, and precision file
* run probCBR
* 

Findings:
* Prediction performance seems to be much poorer than on MIND, but it is still better than CBR.
    * MRR:        0.02075
    * Hits_at_1:  0.00464
    * Hits_at_3:  0.01289
    * Hits_at_10: 0.02075


Change to appropriate directory

In [2]:
os.getcwd()

'/home/rogertu/projects/semmed'

In [3]:
# Switch the working directory to the sibling Consilience-Drug-Repurposing
# checkout; the relative paths used by the cells below (path-based/...,
# ./data/...) are resolved against it.
os.chdir("../Consilience-Drug-Repurposing")

In [4]:
# Put the probCBR package directory on the import path of this kernel.
# NOTE(review): no cell visible here imports from it (the scripts are run via
# subprocess) — confirm whether this is still needed.
sys.path.append("./path-based/probCBR/prob_cbr")

# Preprocess probCBR subgraph

## Create Required vocabulary files

In [5]:
# Step 1: build the vocabulary files for BioKG (--create_vocab); the recorded
# log shows entity_vocab.json, relation_vocab.json and eval_vocab.json being
# written under ./data/biokg.
# check=True raises CalledProcessError on a non-zero exit so a failed step
# cannot go unnoticed and feed stale artifacts into the later steps.
subprocess.run(
    [
        "python",
        "path-based/probCBR/prob_cbr/data/preprocessing.py",
        "--dataset_name",
        "biokg",
        "--k_adj",
        "10",
        "--max_path_len",
        "3",
        "--linkage",
        "0",
        "--prevent_loops",
        "--add_inv_edges",
        "--test",
        "--create_vocab",
    ],
    check=True,
)

[2024-12-03 22:14:17 	 COMMAND: path-based/probCBR/prob_cbr/data/preprocessing.py --dataset_name biokg --k_adj 10 --max_path_len 3 --linkage 0 --prevent_loops --add_inv_edges --test --create_vocab]
[2024-12-03 22:14:17 	 Output directory: ./outputs/biokg]
[2024-12-03 22:14:17 	 Loading train map]
2067997it [00:04, 441519.88it/s]
[2024-12-03 22:14:21 	 Loading dev map]
206800it [00:00, 847778.88it/s]
[2024-12-03 22:14:22 	 Loading test map]
206800it [00:00, 748708.09it/s]
2067997it [00:02, 872780.88it/s]
[2024-12-03 22:14:25 	 Saving vocabs...]
[2024-12-03 22:14:25 	 Writing ./data/biokg/entity_vocab.json]
[2024-12-03 22:14:25 	 Writing ./data/biokg/relation_vocab.json]
[2024-12-03 22:14:25 	 Writing ./data/biokg/eval_vocab.json]


CompletedProcess(args=['python', 'path-based/probCBR/prob_cbr/data/preprocessing.py', '--dataset_name', 'biokg', '--k_adj', '10', '--max_path_len', '3', '--linkage', '0', '--prevent_loops', '--add_inv_edges', '--test', '--create_vocab'], returncode=0)

## Calculate entity similarities

In [6]:
# Step 2: compute the entity similarity matrix (--calculate_ent_similarity).
# The recorded log shows this running on CUDA in batches of 128 query entities.
# check=True raises CalledProcessError on a non-zero exit so a failed step
# cannot go unnoticed.
subprocess.run(
    [
        "python",
        "path-based/probCBR/prob_cbr/data/preprocessing.py",
        "--dataset_name",
        "biokg",
        "--k_adj",
        "10",
        "--max_path_len",
        "3",
        "--linkage",
        "0",
        "--prevent_loops",
        "--add_inv_edges",
        "--test",
        "--calculate_ent_similarity",
    ],
    check=True,
)

[2024-12-03 22:14:35 	 COMMAND: path-based/probCBR/prob_cbr/data/preprocessing.py --dataset_name biokg --k_adj 10 --max_path_len 3 --linkage 0 --prevent_loops --add_inv_edges --test --calculate_ent_similarity]
[2024-12-03 22:14:35 	 Output directory: ./outputs/biokg]
[2024-12-03 22:14:35 	 Loading train map]
2067997it [00:04, 458733.93it/s]
[2024-12-03 22:14:40 	 Loading dev map]
206800it [00:00, 730940.97it/s]
[2024-12-03 22:14:40 	 Loading test map]
206800it [00:00, 879729.06it/s]
[2024-12-03 22:14:40 	 Loading vocabs...]
2067997it [00:02, 935980.71it/s]
  adj_mat = adj_mat / l2norm.reshape(l2norm.shape[0], 1)
[2024-12-03 22:14:43 	 Building sparse adjacency matrices]
[2024-12-03 22:14:47 	 Calculating entity similarity matrix...]
[2024-12-03 22:14:47 	 Using device: cuda]
[2024-12-03 22:14:47 	 st: 0, en: 128, query_ind.shape[0]: 56744]
[2024-12-03 22:14:48 	 st: 128, en: 256, query_ind.shape[0]: 56744]
[2024-12-03 22:14:48 	 st: 256, en: 384, query_ind.shape[0]: 56744]
[2024-12-03 

cuda


CompletedProcess(args=['python', 'path-based/probCBR/prob_cbr/data/preprocessing.py', '--dataset_name', 'biokg', '--k_adj', '10', '--max_path_len', '3', '--linkage', '0', '--prevent_loops', '--add_inv_edges', '--test', '--calculate_ent_similarity'], returncode=0)

## Calculate subgraph

The subgraphs are created in parallel, one job per tmux session. Run the command below once per job, replacing `[X]` with the job index (0-9).

```bash
python path-based/probCBR/prob_cbr/data/preprocessing.py \
--dataset_name biokg \
--k_adj 10 \
--max_path_len 3 \
--linkage 0 \
--prevent_loops \
--add_inv_edges \
--test \
--get_paths_parallel \
--total_jobs 10 \
--current_job [X]
```

## Combine subgraphs

In [7]:
# Merge the per-job path files produced by the parallel subgraph jobs into a
# single combined subgraph (--combine_paths); the recorded log shows 10 shards
# being loaded.
# check=True raises CalledProcessError on a non-zero exit, e.g. if a shard is
# missing, instead of silently returning a failed CompletedProcess.
subprocess.run(
    [
        "python",
        "path-based/probCBR/prob_cbr/data/preprocessing.py",
        "--dataset_name",
        "biokg",
        "--k_adj",
        "10",
        "--max_path_len",
        "3",
        "--linkage",
        "0",
        "--prevent_loops",
        "--add_inv_edges",
        "--test",
        "--combine_paths",
    ],
    check=True,
)

[2024-12-03 22:45:10 	 COMMAND: path-based/probCBR/prob_cbr/data/preprocessing.py --dataset_name biokg --k_adj 10 --max_path_len 3 --linkage 0 --prevent_loops --add_inv_edges --test --combine_paths]
[2024-12-03 22:45:10 	 Output directory: ./outputs/biokg]
[2024-12-03 22:45:10 	 Loading train map]
2067997it [00:04, 433789.57it/s]
[2024-12-03 22:45:15 	 Loading dev map]
206800it [00:00, 700436.52it/s]
[2024-12-03 22:45:15 	 Loading test map]
206800it [00:00, 760042.08it/s]
[2024-12-03 22:45:16 	 Loading vocabs...]
2067997it [00:02, 897406.23it/s] 
  adj_mat = adj_mat / l2norm.reshape(l2norm.shape[0], 1)
[2024-12-03 22:45:18 	 Building sparse adjacency matrices]
[2024-12-03 22:45:22 	 Loading cluster assignments of entities from ./data/outputs/biokg/linkage=0.0/cluster_assignments.pkl]
[2024-12-03 22:45:22 	 Loading paths generated in parallel.]
100%|██████████| 10/10 [01:50<00:00, 11.02s/it]
[2024-12-03 22:47:12 	 Combining paths parallel paths.]


CompletedProcess(args=['python', 'path-based/probCBR/prob_cbr/data/preprocessing.py', '--dataset_name', 'biokg', '--k_adj', '10', '--max_path_len', '3', '--linkage', '0', '--prevent_loops', '--add_inv_edges', '--test', '--combine_paths'], returncode=0)

Create prior map

In [None]:
# Compute the path prior map for ONE shard only: job 0 of 10
# (--total_jobs 10 --current_job 0).
# NOTE(review): presumably jobs 1-9 are launched separately, as with the
# subgraph step (the combine step's log reads other shards such as
# 9_path_prior_map.pkl) — confirm.
# check=True raises CalledProcessError on a non-zero exit so a failed shard
# is not silently skipped.
subprocess.run(
    [
        "python",
        "path-based/probCBR/prob_cbr/data/preprocessing.py",
        "--dataset_name",
        "biokg",
        "--k_adj",
        "10",
        "--max_path_len",
        "3",
        "--linkage",
        "0",
        "--prevent_loops",
        "--add_inv_edges",
        "--test",
        "--calculate_prior_map_parallel",
        "--total_jobs",
        "10",
        "--current_job",
        "0",
    ],
    check=True,
)

Create the precision map in parallel (like the subgraph and prior maps)
* There's a bug in the code: if you combine the prior map before manually generating the precision map, a `KeyError` occurs
* Don't run this in parallel; just run it by itself

In [None]:
# Compute the precision map for ONE shard only: job 0 of 10
# (--total_jobs 10 --current_job 0).
# NOTE(review): presumably the remaining job indices are run separately
# before the combine step — confirm.
# check=True raises CalledProcessError on a non-zero exit so a failed shard
# is not silently skipped.
subprocess.run(
    [
        "python",
        "path-based/probCBR/prob_cbr/data/preprocessing.py",
        "--dataset_name",
        "biokg",
        "--k_adj",
        "10",
        "--max_path_len",
        "3",
        "--linkage",
        "0",
        "--prevent_loops",
        "--add_inv_edges",
        "--test",
        "--calculate_precision_map_parallel",
        "--total_jobs",
        "10",
        "--current_job",
        "0",
    ],
    check=True,
)

Now combine the prior map

In [50]:
# Merge the per-job prior-map shards into a single prior map
# (--combine_prior_map); the recorded log shows the shards being read from
# ./data/outputs/biokg/linkage=0.0/path_len_3/prior_maps.
# check=True raises CalledProcessError on a non-zero exit instead of silently
# returning a failed CompletedProcess.
subprocess.run(
    [
        "python",
        "path-based/probCBR/prob_cbr/data/preprocessing.py",
        "--dataset_name",
        "biokg",
        "--k_adj",
        "10",
        "--max_path_len",
        "3",
        "--linkage",
        "0",
        "--prevent_loops",
        "--add_inv_edges",
        "--test",
        "--combine_prior_map",
    ],
    check=True,
)

[2024-12-04 14:17:07 	 COMMAND: path-based/probCBR/prob_cbr/data/preprocessing.py --dataset_name biokg --k_adj 10 --max_path_len 3 --linkage 0 --prevent_loops --add_inv_edges --test --combine_prior_map]
[2024-12-04 14:17:07 	 Output directory: ./outputs/biokg]
[2024-12-04 14:17:07 	 Loading train map]
2067997it [00:04, 455260.80it/s]
[2024-12-04 14:17:12 	 Loading dev map]
206800it [00:00, 816178.31it/s]
[2024-12-04 14:17:12 	 Loading test map]
206800it [00:00, 689187.20it/s]
[2024-12-04 14:17:12 	 Loading vocabs...]
2067997it [00:02, 948728.28it/s] 
  adj_mat = adj_mat / l2norm.reshape(l2norm.shape[0], 1)
[2024-12-04 14:17:15 	 Building sparse adjacency matrices]
[2024-12-04 14:17:19 	 Loading cluster assignments of entities from ./data/outputs/biokg/linkage=0.0/cluster_assignments.pkl]
[2024-12-04 14:17:19 	 Combining prior maps located in ./data/outputs/biokg/linkage=0.0/path_len_3/prior_maps]
[2024-12-04 14:17:19 	 Reading 9_path_prior_map.pkl]
[2024-12-04 14:17:19 	 Reading 8_path

CompletedProcess(args=['python', 'path-based/probCBR/prob_cbr/data/preprocessing.py', '--dataset_name', 'biokg', '--k_adj', '10', '--max_path_len', '3', '--linkage', '0', '--prevent_loops', '--add_inv_edges', '--test', '--combine_prior_map'], returncode=0)

Combine precision map 

In [51]:
# Merge the per-job precision-map shards into a single precision map
# (--combine_precision_map); the recorded log shows the shards being read from
# ./data/outputs/biokg/linkage=0.0/path_len_3/precision_maps.
# check=True raises CalledProcessError on a non-zero exit instead of silently
# returning a failed CompletedProcess.
subprocess.run(
    [
        "python",
        "path-based/probCBR/prob_cbr/data/preprocessing.py",
        "--dataset_name",
        "biokg",
        "--k_adj",
        "10",
        "--max_path_len",
        "3",
        "--linkage",
        "0",
        "--prevent_loops",
        "--add_inv_edges",
        "--test",
        "--combine_precision_map",
    ],
    check=True,
)

[2024-12-04 14:17:57 	 COMMAND: path-based/probCBR/prob_cbr/data/preprocessing.py --dataset_name biokg --k_adj 10 --max_path_len 3 --linkage 0 --prevent_loops --add_inv_edges --test --combine_precision_map]
[2024-12-04 14:17:57 	 Output directory: ./outputs/biokg]
[2024-12-04 14:17:57 	 Loading train map]
2067997it [00:04, 462067.27it/s]
[2024-12-04 14:18:02 	 Loading dev map]
206800it [00:00, 740228.57it/s]
[2024-12-04 14:18:02 	 Loading test map]
206800it [00:00, 731674.70it/s]
[2024-12-04 14:18:02 	 Loading vocabs...]
2067997it [00:02, 869553.74it/s]
  adj_mat = adj_mat / l2norm.reshape(l2norm.shape[0], 1)
[2024-12-04 14:18:05 	 Building sparse adjacency matrices]
[2024-12-04 14:18:09 	 Loading cluster assignments of entities from ./data/outputs/biokg/linkage=0.0/cluster_assignments.pkl]
[2024-12-04 14:18:09 	 Combining precision map located in ./data/outputs/biokg/linkage=0.0/path_len_3/precision_maps]
[2024-12-04 14:18:09 	 Reading filename 0_precision_map.pkl]
[2024-12-04 14:18:0

CompletedProcess(args=['python', 'path-based/probCBR/prob_cbr/data/preprocessing.py', '--dataset_name', 'biokg', '--k_adj', '10', '--max_path_len', '3', '--linkage', '0', '--prevent_loops', '--add_inv_edges', '--test', '--combine_precision_map'], returncode=0)

# Run pCBR model

In [52]:
# Run the modified probCBR script on BioKG using the combined subgraph file,
# restricting the run to DRUG_DISEASE_ASSOCIATION via --filter_relations.
# The relation filter is passed as the literal text of a Python list; the
# recorded log echoes it back as ['DRUG_DISEASE_ASSOCIATION'].
# check=True raises CalledProcessError on a non-zero exit so a crashed run is
# not mistaken for a completed one.
subprocess.run(
    [
        "python",
        "path-based/probCBR/prob_cbr/pr_cbr_mod.py",
        "--dataset_name",
        "biokg",
        "--subgraph_file_name",
        "paths_1000_pathLen_3_noLoops_invEdges_combined.pkl",
        "--k_adj",
        "10",
        "--max_path_len",
        "3",
        "--linkage",
        "0",
        "--add_inv_edges",
        "--test",
        "--filter_relations",
        "['DRUG_DISEASE_ASSOCIATION']",
    ],
    check=True,
)

[2024-12-04 14:18:32] 	 Running ProbCBR for biokg
[2024-12-04 14:18:32] 	 dataset_name: biokg
[2024-12-04 14:18:32] 	 data_dir: ./
[2024-12-04 14:18:32] 	 expt_dir: ./
[2024-12-04 14:18:32] 	 subgraph_file_name: paths_1000_pathLen_3_noLoops_invEdges_combined.pkl
[2024-12-04 14:18:32] 	 test: True
[2024-12-04 14:18:32] 	 add_inv_edges: True
[2024-12-04 14:18:32] 	 test_file_name: test.txt
[2024-12-04 14:18:32] 	 filter_relations: ['DRUG_DISEASE_ASSOCIATION']
[2024-12-04 14:18:32] 	 dump_paths: 1
[2024-12-04 14:18:32] 	 ans_num: 100
[2024-12-04 14:18:32] 	 per_relation_config_file: None
[2024-12-04 14:18:32] 	 output_per_relation_scores: 1
[2024-12-04 14:18:32] 	 print_paths: 1
[2024-12-04 14:18:32] 	 linkage: 0.0
[2024-12-04 14:18:32] 	 k_adj: 10
[2024-12-04 14:18:32] 	 cheat_neighbors: False
[2024-12-04 14:18:32] 	 max_num_programs: 1000
[2024-12-04 14:18:32] 	 max_path_len: 3
[2024-12-04 14:18:32] 	 max_branch: 100
[2024-12-04 14:18:32] 	 aggr_type1: none
[2024-12-04 14:18:32] 	 aggr_

CompletedProcess(args=['python', 'path-based/probCBR/prob_cbr/pr_cbr_mod.py', '--dataset_name', 'biokg', '--subgraph_file_name', 'paths_1000_pathLen_3_noLoops_invEdges_combined.pkl', '--k_adj', '10', '--max_path_len', '3', '--linkage', '0', '--add_inv_edges', '--test', '--filter_relations', "['DRUG_DISEASE_ASSOCIATION']"], returncode=0)

# Debugging

## Inspect the programs

In [30]:
# Programs (relation paths) collected for inspection. Only 12 distinct
# programs occur, so each one is spelled out once and the observed sequence
# refers to it by a one-letter code. Order and multiplicity are preserved, and
# every entry of `all_programs` is its own independent list.
_program_by_code = {
    "A": ("DDI", "inv_DDI", "DRUG_DISEASE_ASSOCIATION"),
    "B": (
        "DRUG_DISEASE_ASSOCIATION",
        "inv_PROTEIN_DISEASE_ASSOCIATION",
        "PROTEIN_DISEASE_ASSOCIATION",
    ),
    "C": ("DDI", "DRUG_DISEASE_ASSOCIATION"),
    "D": ("inv_DDI", "DRUG_DISEASE_ASSOCIATION"),
    "E": ("DDI", "DDI", "DRUG_DISEASE_ASSOCIATION"),
    "F": (
        "DRUG_DISEASE_ASSOCIATION",
        "inv_DRUG_DISEASE_ASSOCIATION",
        "DRUG_DISEASE_ASSOCIATION",
    ),
    "G": ("inv_DDI", "DDI", "DRUG_DISEASE_ASSOCIATION"),
    "H": ("DRUG_TARGET", "PROTEIN_DISEASE_ASSOCIATION"),
    "I": ("DPI", "PROTEIN_DISEASE_ASSOCIATION"),
    "J": (
        "DRUG_PATHWAY_ASSOCIATION",
        "inv_DRUG_PATHWAY_ASSOCIATION",
        "DRUG_DISEASE_ASSOCIATION",
    ),
    "K": ("DPI", "inv_DPI", "DRUG_DISEASE_ASSOCIATION"),
    "L": ("DDI", "DRUG_TARGET", "PROTEIN_DISEASE_ASSOCIATION"),
}

# One code per observed program, in the original order (10 per row, 96 total).
_observed_codes = (
    "ABACBCCCCD"
    "CAAAEFBDGA"
    "BHEBCCCCFF"
    "HABCIHFFGC"
    "BABBBHHFIB"
    "CFBBBABCBI"
    "IHJAACBFFE"
    "ABBDFBABBB"
    "EKCCBFEFAC"
    "DLCCAB"
)

all_programs = [list(_program_by_code[code]) for code in _observed_codes]

# Keep the kernel namespace free of the construction helpers.
del _program_by_code, _observed_codes

In [31]:
all_programs[0:5]

[['DDI', 'inv_DDI', 'DRUG_DISEASE_ASSOCIATION'],
 ['DRUG_DISEASE_ASSOCIATION',
  'inv_PROTEIN_DISEASE_ASSOCIATION',
  'PROTEIN_DISEASE_ASSOCIATION'],
 ['DDI', 'inv_DDI', 'DRUG_DISEASE_ASSOCIATION'],
 ['DDI', 'DRUG_DISEASE_ASSOCIATION'],
 ['DRUG_DISEASE_ASSOCIATION',
  'inv_PROTEIN_DISEASE_ASSOCIATION',
  'PROTEIN_DISEASE_ASSOCIATION']]

In [32]:
# Distinct programs: each program list is converted to a tuple so it is
# hashable. A set comprehension replaces the manual accumulate-in-a-loop form
# and no longer leaves the loop variable behind in the kernel namespace.
unique_programs = {tuple(program) for program in all_programs}

In [33]:
unique_programs

{('DDI', 'DDI', 'DRUG_DISEASE_ASSOCIATION'),
 ('DDI', 'DRUG_DISEASE_ASSOCIATION'),
 ('DDI', 'DRUG_TARGET', 'PROTEIN_DISEASE_ASSOCIATION'),
 ('DDI', 'inv_DDI', 'DRUG_DISEASE_ASSOCIATION'),
 ('DPI', 'PROTEIN_DISEASE_ASSOCIATION'),
 ('DPI', 'inv_DPI', 'DRUG_DISEASE_ASSOCIATION'),
 ('DRUG_DISEASE_ASSOCIATION',
  'inv_DRUG_DISEASE_ASSOCIATION',
  'DRUG_DISEASE_ASSOCIATION'),
 ('DRUG_DISEASE_ASSOCIATION',
  'inv_PROTEIN_DISEASE_ASSOCIATION',
  'PROTEIN_DISEASE_ASSOCIATION'),
 ('DRUG_PATHWAY_ASSOCIATION',
  'inv_DRUG_PATHWAY_ASSOCIATION',
  'DRUG_DISEASE_ASSOCIATION'),
 ('DRUG_TARGET', 'PROTEIN_DISEASE_ASSOCIATION'),
 ('inv_DDI', 'DDI', 'DRUG_DISEASE_ASSOCIATION'),
 ('inv_DDI', 'DRUG_DISEASE_ASSOCIATION')}

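## Check priors and precisions

Note: the cells in this section use `biokg_prior_map` and `biokg_precision_map`, which are loaded further down (in the "Compare Prior Maps" and "compare precision map" sections). On a fresh kernel, run those loading cells first.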

In [36]:
x = 0

In [37]:
# Scratch check: `is None` is an identity test, so a falsy value such as 0 is
# still reported as "not none".
print("x is none" if x is None else "x is not none")

x is not none


In [43]:
biokg_prior_map[0.0]["DRUG_DISEASE_ASSOCIATION"][
    ("DDI", "DDI", "DRUG_DISEASE_ASSOCIATION")
]

0.016640529709008537

In [46]:
# NOTE(review): in the recorded run this lookup raised
# KeyError: 'DRUG_DISEASE_ASSOCIATION' (see the output below), i.e. the loaded
# precision map had no entry for that relation under the 0.0 key.
biokg_precision_map[0.0]["DRUG_DISEASE_ASSOCIATION"]

KeyError: 'DRUG_DISEASE_ASSOCIATION'

## Compare Prior Maps

In [12]:
# Load the combined BioKG path prior map written by the preprocessing step.
biokg_prior_map_path = (
    "./data/outputs/biokg/linkage=0.0/path_len_3/prior_maps/path_prior_map.pkl"
)
with open(biokg_prior_map_path, "rb") as prior_map_file:
    biokg_prior_map = pickle.load(prior_map_file)

In [13]:
# Load the MIND path prior map to compare against the BioKG one.
mind_prior_map_path = (
    "./data/outputs/MIND/linkage=0.0/path_len_3/prior_maps/path_prior_map.pkl"
)
with open(mind_prior_map_path, "rb") as prior_map_file:
    mind_prior_map = pickle.load(prior_map_file)

In [14]:
biokg_prior_map

{0.0: {'COMPLEX_IN_PATHWAY': {('inv_MEMBER_OF_COMPLEX',
    'MEMBER_OF_COMPLEX',
    'COMPLEX_IN_PATHWAY'): 0.1930018810867443,
   ('inv_MEMBER_OF_COMPLEX',
    'PROTEIN_PATHWAY_ASSOCIATION'): 0.5401655673817971,
   ('COMPLEX_TOP_LEVEL_PATHWAY',
    'inv_COMPLEX_TOP_LEVEL_PATHWAY',
    'COMPLEX_IN_PATHWAY'): 0.07119556158773245,
   ('inv_MEMBER_OF_COMPLEX',
    'inv_PPI',
    'PROTEIN_PATHWAY_ASSOCIATION'): 0.020152233096014795,
   ('inv_MEMBER_OF_COMPLEX',
    'PPI',
    'PROTEIN_PATHWAY_ASSOCIATION'): 0.02306117103602638,
   ('COMPLEX_IN_PATHWAY',
    'inv_COMPLEX_IN_PATHWAY',
    'COMPLEX_IN_PATHWAY'): 0.11012691382717814,
   ('COMPLEX_IN_PATHWAY',
    'inv_PROTEIN_PATHWAY_ASSOCIATION',
    'PROTEIN_PATHWAY_ASSOCIATION'): 0.042050622663523586,
   ('COMPLEX_TOP_LEVEL_PATHWAY',
    'inv_PROTEIN_PATHWAY_ASSOCIATION',
    'PROTEIN_PATHWAY_ASSOCIATION'): 0.00024604932098324484},
  'COMPLEX_TOP_LEVEL_PATHWAY': {('COMPLEX_IN_PATHWAY',
    'inv_COMPLEX_IN_PATHWAY',
    'COMPLEX_TOP_LEVEL_PA

In [15]:
mind_prior_map

{0.0: {'negatively_regulates_GnrBP': {('regulates_GrBP',): 0.816309599546072,
   ('part_of_GpoBP',
    'inv_part_of_GpoBP',
    'negatively_regulates_GnrBP'): 0.019025430285940267,
   ('part_of_GpoBP',
    'inv_part_of_GpoBP',
    'regulates_GrBP'): 0.02543888306195086,
   ('part_of_GpoBP',
    'associated_with_BPawD',
    'inv_associated_with_BPawD'): 0.0014314207603294416,
   ('regulates_GrBP',
    'associated_with_BPawD',
    'inv_associated_with_BPawD'): 0.00030089925892811087,
   ('regulates_Gr>G', 'negatively_regulates_GnrBP'): 8.597121683660311e-05,
   ('positively_regulates_GprBP',): 0.02652641895493389,
   ('negatively_regulates_GnrBP',
    'associated_with_BPawD',
    'inv_associated_with_BPawD'): 0.0001547481903058856,
   ('regulates_GrBP',
    'inv_negatively_regulates_GnrBP',
    'negatively_regulates_GnrBP'): 0.0010875358929830293,
   ('regulates_Gr>G', 'regulates_GrBP'): 0.0003266906239790918,
   ('regulates_GrBP',
    'inv_regulates_GrBP',
    'positively_regulates_GprB

## compare precision map

In [16]:
# Load the combined BioKG precision map written by the preprocessing step.
biokg_precision_map_path = (
    "./data/outputs/biokg/linkage=0.0/path_len_3/precision_maps/precision_map.pkl"
)
with open(biokg_precision_map_path, "rb") as precision_map_file:
    biokg_precision_map = pickle.load(precision_map_file)

In [17]:
# Load the MIND precision map to compare against the BioKG one.
mind_precision_map_path = (
    "./data/outputs/MIND/linkage=0.0/path_len_3/precision_maps/precision_map.pkl"
)
with open(mind_precision_map_path, "rb") as precision_map_file:
    mind_precision_map = pickle.load(precision_map_file)

In [18]:
biokg_precision_map

{0.0: {'MEMBER_OF_COMPLEX': {('MEMBER_OF_COMPLEX',
    'inv_MEMBER_OF_COMPLEX',
    'MEMBER_OF_COMPLEX'): 0.052432308398347865,
   ('PROTEIN_PATHWAY_ASSOCIATION',
    'inv_COMPLEX_IN_PATHWAY'): 0.10312426591496358,
   ('PROTEIN_PATHWAY_ASSOCIATION',
    'inv_PROTEIN_PATHWAY_ASSOCIATION',
    'MEMBER_OF_COMPLEX'): 0.009733025837470195,
   ('MEMBER_OF_COMPLEX',
    'COMPLEX_IN_PATHWAY',
    'inv_COMPLEX_IN_PATHWAY'): 0.12386104783599089,
   ('PPI', 'MEMBER_OF_COMPLEX'): 0.10942760942760943,
   ('PPI',
    'PROTEIN_PATHWAY_ASSOCIATION',
    'inv_COMPLEX_IN_PATHWAY'): 0.02152414194299011,
   ('PPI', 'PPI', 'MEMBER_OF_COMPLEX'): 0.026905829596412557,
   ('PPI', 'inv_PPI', 'MEMBER_OF_COMPLEX'): 0.010700389105058366,
   ('MEMBER_OF_COMPLEX',
    'COMPLEX_TOP_LEVEL_PATHWAY',
    'inv_COMPLEX_TOP_LEVEL_PATHWAY'): 0.007915735716565592},
  'PROTEIN_PATHWAY_ASSOCIATION': {('MEMBER_OF_COMPLEX',
    'inv_MEMBER_OF_COMPLEX',
    'PROTEIN_PATHWAY_ASSOCIATION'): 0.057500969117457036,
   ('MEMBER_OF_COM

In [19]:
mind_precision_map

{0.0: {'activates_CaG': {('affects_CafG',): 0.5255603080415752,
   ('affects_CafG', 'inv_activates_CaG', 'affects_CafG'): 0.006391298659802386,
   ('inhibits_CinG',): 0.1607242224160126,
   ('treats_CtD', 'inv_treats_CtD', 'inhibits_CinG'): 0.0029830986523531654,
   ('associated_with_CawPW',
    'inv_associated_with_CawPW',
    'affects_CafG'): 0.0031680567153462495,
   ('activates_CaG',
    'inv_activates_CaG',
    'affects_CafG'): 0.007050201281419131,
   ('activates_CaG',
    'inv_inhibits_CinG',
    'inhibits_CinG'): 0.009811607598996304,
   ('inhibits_CinG',
    'inv_inhibits_CinG',
    'affects_CafG'): 0.019760759266950572,
   ('treats_CtD', 'inv_treats_CtD', 'activates_CaG'): 0.0017700403363254177,
   ('inv_regulates_GrC',): 0.13389765294343978,
   ('inv_in_reaction_with_GrxC',): 0.03952756584335532,
   ('affects_CafG',
    'inv_activates_CaG',
    'activates_CaG'): 0.007643612293463259,
   ('activates_CaG', 'part_of_GpoPW', 'inv_part_of_GpoPW'): 0.0314813881817879,
   ('inhibit

## compare subgraphs

In [21]:
# Load the combined BioKG subgraph (the same file name that is passed to
# pr_cbr_mod.py via --subgraph_file_name).
biokg_subgraph_path = (
    "data/subgraphs/biokg/paths_1000_pathLen_3_noLoops_invEdges_combined.pkl"
)
with open(biokg_subgraph_path, "rb") as subgraph_file:
    biokg_subgraph = pickle.load(subgraph_file)

In [26]:
# Load the combined MIND subgraph to compare against the BioKG one.
mind_subgraph_path = (
    "./data/subgraphs/MIND/paths_1000_pathLen_3_noLoops_invEdges_combined.pkl"
)
with open(mind_subgraph_path, "rb") as subgraph_file:
    mind_subgraph = pickle.load(subgraph_file)

In [22]:
len(biokg_subgraph)

68586

In [24]:
list(biokg_subgraph.keys())[0:5]

['Q6PC78', 'R-HSA-936444', 'Q80VK6', 'Q7TP47', 'R-HSA-71493']

In [25]:
biokg_subgraph["Q6PC78"]

{(('PROTEIN_PATHWAY_ASSOCIATION', 'R-DRE-382556'),
  ('inv_PROTEIN_PATHWAY_ASSOCIATION', 'A3QK16'),
  ('PROTEIN_DISEASE_ASSOCIATION', 'C567628')),
 (('PROTEIN_PATHWAY_ASSOCIATION', 'R-DRE-382556'),
  ('inv_PROTEIN_PATHWAY_ASSOCIATION', 'A3QK16'),
  ('PROTEIN_DISEASE_ASSOCIATION', 'D008607')),
 (('PROTEIN_PATHWAY_ASSOCIATION', 'R-DRE-382556'),
  ('inv_PROTEIN_PATHWAY_ASSOCIATION', 'F1QGH9'),
  ('PROTEIN_PATHWAY_ASSOCIATION', 'R-DRE-1169091')),
 (('PROTEIN_PATHWAY_ASSOCIATION', 'R-DRE-382556'),
  ('inv_PROTEIN_PATHWAY_ASSOCIATION', 'F1QGH9'),
  ('PROTEIN_PATHWAY_ASSOCIATION', 'R-DRE-1236978')),
 (('PROTEIN_PATHWAY_ASSOCIATION', 'R-DRE-382556'),
  ('inv_PROTEIN_PATHWAY_ASSOCIATION', 'F1QGH9'),
  ('PROTEIN_PATHWAY_ASSOCIATION', 'R-DRE-174084')),
 (('PROTEIN_PATHWAY_ASSOCIATION', 'R-DRE-382556'),
  ('inv_PROTEIN_PATHWAY_ASSOCIATION', 'F1QGH9'),
  ('PROTEIN_PATHWAY_ASSOCIATION', 'R-DRE-174154')),
 (('PROTEIN_PATHWAY_ASSOCIATION', 'R-DRE-382556'),
  ('inv_PROTEIN_PATHWAY_ASSOCIATION', 'F1QGH9

In [27]:
len(mind_subgraph)

162302

In [28]:
list(mind_subgraph.keys())[0:5]

['WD:Q59981933',
 'WD:Q59956477',
 'NCBIGene:5967459',
 'NCBITaxon:1481730',
 'REACT:R-CEL-9010869']

In [29]:
mind_subgraph["WD:Q59981933"]

{(('in_taxon_GitT', 'NCBITaxon:5825'),
  ('inv_in_taxon_AitT', 'GO:0005739'),
  ('associated_with_AawD', 'DOID:0060333')),
 (('in_taxon_GitT', 'NCBITaxon:5825'),
  ('inv_in_taxon_AitT', 'GO:0005826'),
  ('in_taxon_AitT', 'NCBITaxon:458650')),
 (('in_taxon_GitT', 'NCBITaxon:5825'),
  ('inv_in_taxon_AitT', 'GO:0020011'),
  ('inv_part_of_GpoA', 'WD:Q59917185')),
 (('in_taxon_GitT', 'NCBITaxon:5825'),
  ('inv_in_taxon_AitT', 'GO:0020026'),
  ('in_taxon_AitT', 'NCBITaxon:310761')),
 (('in_taxon_GitT', 'NCBITaxon:5825'),
  ('inv_in_taxon_AitT', 'GO:0022627'),
  ('in_taxon_AitT', 'NCBITaxon:1187918')),
 (('in_taxon_GitT', 'NCBITaxon:5825'),
  ('inv_in_taxon_BPitT', 'GO:0007053'),
  ('in_taxon_BPitT', 'NCBITaxon:197552')),
 (('in_taxon_GitT', 'NCBITaxon:5825'),
  ('inv_in_taxon_BPitT', 'GO:0031370'),
  ('in_taxon_BPitT', 'NCBITaxon:224454')),
 (('in_taxon_GitT', 'NCBITaxon:5825'),
  ('inv_in_taxon_BPitT', 'GO:0032187'),
  ('in_taxon_BPitT', 'NCBITaxon:1187939')),
 (('in_taxon_GitT', 'NCBITaxon

In [81]:
# Load a single uncombined BioKG subgraph file (the "_0_" one) for inspection.
# NOTE(review): presumably this is the output of parallel job 0 — confirm.
biokg_subgraph_0_path = (
    "data/subgraphs/biokg/paths_1000_pathLen_3_0_noLoops_invEdges.pkl"
)
with open(biokg_subgraph_0_path, "rb") as subgraph_file:
    biokg_subgraph_0 = pickle.load(subgraph_file)

In [None]:
biokg_subgraph_0

x