In [1]:
import pykeen.datasets
import pykeen
import os
import subprocess
import sys
import shutil
import pandas as pd
import polars as pl
import numpy as np

# Train Case-based Reasoning (CBR) and Probabilistic Case-based Reasoning (probCBR) models on OpenBioLink

Context:
Rivas-Barrigan implemented a pared-down flavor of BioKG and OpenBioLink containing only 3 node types. This notebook seeks to create case-based reasoning models on the full BioKG and OpenBioLink knowledge graphs for drug repurposing. We don't run probCBR on the modified dataset because it wouldn't make sense: the modified dataset only has one specific schema (Drug - interacts with - Gene - associated with - Disease).

Actions:
* 

Findings:


## Get OpenBioLink Graph

In [2]:
# Instantiate PyKEEN's OpenBioLink dataset; the training/testing/validation
# splits used below are read from this object.
obl = pykeen.datasets.OpenBioLink()

In [3]:
def _triples_frame(split):
    """Return the label-based triples of one dataset split as a polars frame.

    The frame is built without a schema, so polars assigns the default column
    names (column_0, column_1, column_2) that the rest of the notebook uses.
    """
    return pl.DataFrame(split.triples)


# One frame per split, read in the order training -> testing -> validation.
train = _triples_frame(obl.training)
test = _triples_frame(obl.testing)
valid = _triples_frame(obl.validation)

# Stack the three splits and keep a single row per distinct triple.
# NOTE(review): unique() is called without maintain_order, so the row order of
# `graph` is whatever polars returns — confirm whether ordering matters downstream.
all_triples = pl.concat([train, test, valid])
graph = all_triples.unique(["column_0", "column_1", "column_2"])

You're trying to map triples with 2052 entities and 0 relations that are not in the training set. These triples will be excluded from the mapping.
In total 2047 from 183011 triples were filtered out
Reconstructing all label-based triples. This is expensive and rarely needed.
Reconstructing all label-based triples. This is expensive and rarely needed.
You're trying to map triples with 2099 entities and 0 relations that are not in the training set. These triples will be excluded from the mapping.
In total 2093 from 188394 triples were filtered out
Reconstructing all label-based triples. This is expensive and rarely needed.


In [4]:
# Preview the first rows of the combined graph (columns keep polars' default
# names column_0 / column_1 / column_2).
graph.head()

column_0,column_1,column_2
str,str,str
"""CL:0000011""","""IS_A""","""CL:0000333"""
"""CL:0000037""","""IS_A""","""CL:0008001"""
"""CL:0000151""","""IS_A""","""CL:0000003"""
"""CL:0000359""","""PART_OF""","""UBERON:0002049…"
"""CL:0000438""","""IS_A""","""CL:0000163"""


In [5]:
# Number of rows (distinct triples) in the combined graph, rendered with
# thousands separators as the cell's display value.
n_triples, _n_columns = graph.shape
f"Graph size: {n_triples:,}"

'Graph size: 4,559,267'

In [None]:
# Persist every triple table into the PyKEEN dataset directory as headerless,
# tab-separated text. Mapping: destination file -> frame to write
# (dict insertion order is the write order: graph, train, test, valid).
dataset_storage_targets = {
    "/home/rogertu/.data/pykeen/datasets/openbiolink/graph.txt": graph,
    "/home/rogertu/.data/pykeen/datasets/openbiolink/train.txt": train,
    "/home/rogertu/.data/pykeen/datasets/openbiolink/test.txt": test,
    "/home/rogertu/.data/pykeen/datasets/openbiolink/valid.txt": valid,
}
for destination, triples in dataset_storage_targets.items():
    triples.write_csv(destination, separator="\t", include_header=False)

  graph.write_csv(
  train.write_csv(
  test.write_csv(
  valid.write_csv(


In [8]:
# Create the directory the triple files are written into
# (<repo>/data/openbiolink/). The previous call created <repo>/openbiolink,
# which is not the location the write calls target. makedirs(..., exist_ok=True)
# also creates the intermediate `data/` directory and makes this cell safe to
# re-run when the directory already exists.
os.makedirs(
    "/home/rogertu/projects/Consilience-Drug-Repurposing/data/openbiolink",
    exist_ok=True,
)

In [None]:
# Write the same triple tables into the project's data directory for the
# path-based (CBR) code, again as headerless tab-separated text.
# Note that the validation split is stored under the name dev.txt here.
project_data_targets = {
    "/home/rogertu/projects/Consilience-Drug-Repurposing/data/openbiolink/graph.txt": graph,
    "/home/rogertu/projects/Consilience-Drug-Repurposing/data/openbiolink/train.txt": train,
    "/home/rogertu/projects/Consilience-Drug-Repurposing/data/openbiolink/test.txt": test,
    "/home/rogertu/projects/Consilience-Drug-Repurposing/data/openbiolink/dev.txt": valid,
}
for destination, triples in project_data_targets.items():
    triples.write_csv(destination, separator="\t", include_header=False)

## Train CBR subgraph

In [2]:
# Show the kernel's current working directory; the relative paths used in the
# following cells are resolved against it.
os.getcwd()

'/home/rogertu/projects'

In [4]:
# Move into the repository root so the relative script and data paths used by
# the CBR commands resolve. The guard makes the cell idempotent: re-running it
# once the kernel is already inside the repository no longer raises
# FileNotFoundError.
project_dir = "Consilience-Drug-Repurposing"
if os.path.basename(os.getcwd()) != project_dir:
    os.chdir(project_dir)

# Make the CBR code importable; skip the append if a previous run of this cell
# already registered the entry, so sys.path does not accumulate duplicates.
cbr_code_dir = "./path-based/CBR/code"
if cbr_code_dir not in sys.path:
    sys.path.append(cbr_code_dir)

In [None]:
# Run the CBR path-collection script (get_paths.py) for the openbiolink dataset
# as a child process. Paths are relative, so the working directory must be the
# repository root.
get_paths_cmd = [
    "python",
    "path-based/CBR/code/data/get_paths.py",
    "--dataset_name",
    "openbiolink",
    "--data_dir",
    "./",
    "--num_paths_to_collect",
    "1000",
    "--ignore_sequential_inverse",
    "1",
]
# check=True: a non-zero exit status raises CalledProcessError instead of being
# silently returned, so a failed run cannot go unnoticed before later steps.
subprocess.run(get_paths_cmd, check=True)

## Inspect which relations correspond to 'treats', to filter the test/valid results

In [6]:
# Load the saved test split (headerless TSV, so polars names the columns
# column_1..column_3) and list the distinct values of its middle column,
# which holds the relation labels.
test_triples = pl.read_csv(
    "/home/rogertu/.data/pykeen/datasets/openbiolink/test.txt",
    separator="\t",
    has_header=False,
)
one_row_per_relation = test_triples.unique("column_2")
one_row_per_relation["column_2"].to_list()

['DRUG_REACTION_GENE',
 'DRUG_BINDINH_GENE',
 'GENE_DIS',
 'DIS_PHENOTYPE',
 'PART_OF',
 'DRUG_BINDING_GENE',
 'DRUG_CATALYSIS_GENE',
 'GENE_EXPRESSION_GENE',
 'GENE_PTMOD_GENE',
 'DRUG_PHENOTYPE',
 'GENE_ACTIVATION_GENE',
 'GENE_PATHWAY',
 'GENE_CATALYSIS_GENE',
 'DRUG_BINDACT_GENE',
 'DRUG_INHIBITION_GENE',
 'GENE_GENE',
 'GENE_UNDEREXPRESSED_ANATOMY',
 'GENE_REACTION_GENE',
 'GENE_PHENOTYPE',
 'GENE_DRUG',
 'GENE_BINDING_GENE',
 'GENE_OVEREXPRESSED_ANATOMY',
 'IS_A',
 'GENE_INHIBITION_GENE',
 'DIS_DRUG',
 'GENE_EXPRESSED_ANATOMY',
 'GENE_GO',
 'DRUG_ACTIVATION_GENE']

In [7]:
# Re-read the saved test split and display it to eyeball the stored triples.
test_file = "/home/rogertu/.data/pykeen/datasets/openbiolink/test.txt"
pl.read_csv(test_file, separator="\t", has_header=False)

column_1,column_2,column_3
str,str,str
"""CL:0000005""","""IS_A""","""CL:0000057"""
"""CL:0000022""","""IS_A""","""CL:0000021"""
"""CL:0000038""","""IS_A""","""CL:0000839"""
"""CL:0000047""","""PART_OF""","""UBERON:0001017…"
"""CL:0000060""","""IS_A""","""CL:0000055"""
…,…,…
"""UBERON:6007231…","""IS_A""","""UBERON:6005168…"
"""UBERON:6007233…","""IS_A""","""UBERON:0002536…"
"""UBERON:6007288…","""IS_A""","""UBERON:6007284…"
"""UBERON:6040005…","""IS_A""","""UBERON:0000480…"


## Run CBR

In [8]:
# Run the modified CBR script (cbr_mod.py) on the openbiolink dataset as a
# child process, restricted to the DIS_DRUG relation and evaluated on the test
# split. Paths are relative, so the working directory must be the repository
# root.
cbr_cmd = [
    "python",
    "path-based/CBR/code/cbr_mod.py",
    "--dataset_name",
    "openbiolink",
    "--data_dir",
    "./",
    "--max_num_programs",
    "25",
    "--k_adj",
    "10",
    # Passed through verbatim as the text of a Python list literal.
    "--filter_relations",
    "['DIS_DRUG']",
    "--output_dir",
    "./data/outputs",
    "--max_answers",
    "100",
    "--output_predictions",
    "--output_per_relation_scores",
    "--test",
]
# check=True: a non-zero exit status raises CalledProcessError instead of being
# silently returned; on success the CompletedProcess is still the cell's value.
subprocess.run(cbr_cmd, check=True)

[2024-12-05 07:44:32 	 COMMAND: path-based/CBR/code/cbr_mod.py --dataset_name openbiolink --data_dir ./ --max_num_programs 25 --k_adj 10 --filter_relations ['DIS_DRUG'] --output_dir ./data/outputs --max_answers 100 --output_predictions --output_per_relation_scores --test]
[2024-12-05 07:44:32 	 Loading subgraph around entities:]
4559267it [00:02, 1657464.49it/s]
[2024-12-05 07:45:23 	 Loading train map]
4559267it [00:03, 1263103.21it/s]
4192002it [00:02, 1652976.18it/s]
[2024-12-05 07:45:30 	 Loading dev map]
186301it [00:00, 1270673.21it/s]
[2024-12-05 07:45:30 	 Loading test map]
180964it [00:00, 1266486.51it/s]
[2024-12-05 07:45:30 	 Filtering relations: ['DIS_DRUG']]
4559267it [00:02, 1708971.65it/s]
  adj_mat = adj_mat / l2norm.reshape(l2norm.shape[0], 1)
[2024-12-05 07:45:34 	 Using device: cpu]
[2024-12-05 07:45:34 	 {
    "cuda": false,
    "data_dir": "./",
    "dataset_name": "openbiolink",
    "dev_file": "./data/openbiolink/dev.txt",
    "dev_file_name": "dev.txt",
    "fil

CompletedProcess(args=['python', 'path-based/CBR/code/cbr_mod.py', '--dataset_name', 'openbiolink', '--data_dir', './', '--max_num_programs', '25', '--k_adj', '10', '--filter_relations', "['DIS_DRUG']", '--output_dir', './data/outputs', '--max_answers', '100', '--output_predictions', '--output_per_relation_scores', '--test'], returncode=0)