In [1]:
import pykeen.datasets
import pykeen
import os
import subprocess
import sys
import shutil
import pandas as pd
import polars as pl
import numpy as np

# Train a Case-based Reasoning and model on BioKG

Context:
Rivas-Barrigan implemented a pared down flavor of BioKG and OpenBioLink containing only 3 node types. This notebook seeks to create case-based reasoning models on the full BioKG and OpenBiolink knowledge graphs for drug repurposing. We don't run CBR on the modified dataset because it wouldn't make any sense - the modified dataset only has a specific schema ( Drug - interacts with - Gene - associated with - Disease)

Actions:
* extract and export biokg dataset and graph
* copy to Consilience-based-Drug-Repurposing folder
* modify CBR repository to take a specific edge type to process, this flag is under `--filter_relations`
* create CBR similarity subgraph
* run Case-Based Reasoning

Findings:
* edge types between BioKG and the modified BioKG are not exactly the same. Need to match the results once the predictions are done to shoe-horn the modified KG predictions into CBR
* prediction performance seems to be similar to MIND
    * MRR:        0.05226
    * Hits_at_1:  0.02098
    * Hits_at_3:  0.04735
    * Hits_at_10: 0.10414

## Get BioKG Graph

In [2]:
biokg = pykeen.datasets.BioKG()

In [3]:
train = pl.DataFrame(
    biokg.training.triples,
)
test = pl.DataFrame(biokg.testing.triples)
valid = pl.DataFrame(biokg.validation.triples)

graph = pl.concat([train, test, valid]).unique(["column_0", "column_1", "column_2"])

Reconstructing all label-based triples. This is expensive and rarely needed.
Reconstructing all label-based triples. This is expensive and rarely needed.
Reconstructing all label-based triples. This is expensive and rarely needed.


In [4]:
graph.head()

column_0,column_1,column_2
str,str,str
"""A0A0G2K344""","""PROTEIN_PATHWA…","""R-RNO-912526"""
"""A0JNB0""","""PROTEIN_PATHWA…","""R-BTA-389513"""
"""A0MGZ7""","""PROTEIN_PATHWA…","""R-DRE-2022928"""
"""A1A4J1""","""PROTEIN_PATHWA…","""R-BTA-6798695"""
"""A1XQU1""","""PROTEIN_PATHWA…","""R-SSC-1234176"""


In [None]:
f"Graph size: {graph.shape[0]:,}"

'2,067,997'

In [9]:
graph.write_csv(
    "/home/rogertu/.data/pykeen/datasets/biokg/graph.txt",
    separator="\t",
    has_header=False,
)
train.write_csv(
    "/home/rogertu/.data/pykeen/datasets/biokg/train.txt",
    separator="\t",
    has_header=False,
)
test.write_csv(
    "/home/rogertu/.data/pykeen/datasets/biokg/test.txt",
    separator="\t",
    has_header=False,
)
valid.write_csv(
    "/home/rogertu/.data/pykeen/datasets/biokg/valid.txt",
    separator="\t",
    has_header=False,
)

  graph.write_csv(
  train.write_csv(
  test.write_csv(
  valid.write_csv(


## Train CBR subgraph

In [10]:
os.getcwd()

'/home/rogertu/projects/semmed'

In [11]:
os.chdir("../Consilience-Drug-Repurposing")

In [12]:
sys.path.append("./path-based/CBR/code")

In [None]:
# make sure to move the graph.txt file into the consilience-drug-repurposing directory

In [None]:
subprocess.run(
    [
        "python",
        "path-based/CBR/code/data/get_paths.py",
        "--dataset_name",
        "biokg",
        "--data_dir",
        "./",
        "--num_paths_to_collect",
        "1000",
        "--ignore_sequential_inverse",
        "1",
    ]
)

[2024-11-29 12:20:35 	 {
    "data_dir": "./",
    "dataset_name": "biokg",
    "ignore_sequential_inverse": true,
    "num_paths_to_collect": 1000,
    "use_wandb": 0
}]
2067997it [00:03, 660987.27it/s]
 26%|██▋       | 27709/105524 [16:07<40:41, 31.88it/s]  

## Run CBR

In [70]:
subprocess.run(
    [
        "python",
        "path-based/CBR/code/cbr_mod.py",
        "--dataset_name",
        "biokg",
        "--data_dir",
        "./",
        "--max_num_programs",
        "25",
        "--k_adj",
        "10",
        "--filter_relations",
        "['DRUG_DISEASE_ASSOCIATION']",
        "--output_dir",
        "./data/outputs",
        "--max_answers",
        "100",
        "--output_predictions",
        "--output_per_relation_scores",
        "--test",
    ]
)

[2024-12-02 13:52:45 	 COMMAND: path-based/CBR/code/cbr_mod.py --dataset_name biokg --data_dir ./ --max_num_programs 25 --k_adj 10 --filter_relations ['DRUG_DISEASE_ASSOCIATION'] --output_dir ./data/outputs --max_answers 100 --output_predictions --output_per_relation_scores --test]
[2024-12-02 13:52:45 	 Loading subgraph around entities:]
2067997it [00:01, 1045481.01it/s]
[2024-12-02 13:52:56 	 Loading train map]
2067997it [00:02, 710004.66it/s]
1654397it [00:02, 700018.99it/s]
[2024-12-02 13:53:02 	 Loading dev map]
206800it [00:00, 718221.18it/s]
[2024-12-02 13:53:02 	 Loading test map]
206800it [00:00, 749814.85it/s]
[2024-12-02 13:53:02 	 Filtering relations: ['DRUG_DISEASE_ASSOCIATION']]
2067997it [00:02, 779589.10it/s]
  adj_mat = adj_mat / l2norm.reshape(l2norm.shape[0], 1)
[2024-12-02 13:53:08 	 Using device: cpu]
[2024-12-02 13:53:08 	 {
    "cuda": false,
    "data_dir": "./",
    "dataset_name": "biokg",
    "dev_file": "./data/biokg/dev.txt",
    "dev_file_name": "dev.txt",

CompletedProcess(args=['python', 'path-based/CBR/code/cbr_mod.py', '--dataset_name', 'biokg', '--data_dir', './', '--max_num_programs', '25', '--k_adj', '10', '--filter_relations', "['DRUG_DISEASE_ASSOCIATION']", '--output_dir', './data/outputs', '--max_answers', '100', '--output_predictions', '--output_per_relation_scores', '--test'], returncode=0)

In [59]:
pl.read_csv("/home/rogertu/.data/pykeen/datasets/biokg/test.txt", separator="\t",has_header=False).unique('column_2')['column_2'].to_list()

['DRUG_DISEASE_ASSOCIATION',
 'COMPLEX_IN_PATHWAY',
 'DISEASE_PATHWAY_ASSOCIATION',
 'PROTEIN_PATHWAY_ASSOCIATION',
 'DRUG_TARGET',
 'DDI',
 'DRUG_CARRIER',
 'PPI',
 'RELATED_GENETIC_DISORDER',
 'DRUG_TRANSPORTER',
 'PROTEIN_DISEASE_ASSOCIATION',
 'DISEASE_GENETIC_DISORDER',
 'DPI',
 'MEMBER_OF_COMPLEX',
 'DRUG_PATHWAY_ASSOCIATION',
 'COMPLEX_TOP_LEVEL_PATHWAY',
 'DRUG_ENZYME']

In [63]:
pl.read_csv("/home/rogertu/.data/pykeen/datasets/biokg/test.txt", separator="\t",has_header=False)

column_1,column_2,column_3
str,str,str
"""DB04855""","""DDI""","""DB12248"""
"""DB00767""","""DDI""","""DB00934"""
"""DB01097""","""DDI""","""DB11529"""
"""Q9BY11""","""MEMBER_OF_COMP…","""R-HSA-8871150"""
"""Q9CPX8""","""PROTEIN_PATHWA…","""mmu04932"""
…,…,…
"""O00139""","""PPI""","""P25791"""
"""P61812""","""MEMBER_OF_COMP…","""R-HSA-2467308"""
"""DB00849""","""DDI""","""DB11273"""
"""DB00300""","""DDI""","""DB00722"""
