# Generate CBR predictions on MIND

In [1]:
import subprocess
import wandb
import os
import sys
import shutil
import polars as pl
import pandas as pd

# Work from the repository root so the relative "./data/..." and
# "path-based/..." paths used by the cells below resolve correctly.
# Guarded so that re-running this cell in the same kernel (when the working
# directory is already the repository) does not raise FileNotFoundError.
REPO_DIR = "Consilience-Drug-Repurposing"
if os.path.basename(os.getcwd()) != REPO_DIR:
    os.chdir("./" + REPO_DIR)

# Make the CBR code importable; skip the append on re-runs so sys.path does
# not accumulate duplicate entries.
CBR_CODE_DIR = "./path-based/CBR/code/"
if CBR_CODE_DIR not in sys.path:
    sys.path.append(CBR_CODE_DIR)

## Build graph.txt from the train, valid, and test files

In [None]:
# Location of the combined graph file for the MIND dataset.
graph_dir = os.path.join("./data/MIND", "graph.txt")

if os.path.exists(graph_dir):
    # Nothing to do: a previous run already produced the file.
    print("graph.txt exists. Skipping creation.")
else:
    print("graph.txt does not exist. Creating it now.")
    # Read the three header-less, tab-separated split files and stack them
    # in train -> valid -> test order.
    split_frames = [
        pl.read_csv(split_path, separator="\t", has_header=False)
        for split_path in (
            "./data/MIND/train.txt",
            "./data/MIND/valid.txt",
            "./data/MIND/test.txt",
        )
    ]
    combined = pl.concat(split_frames)
    # Write back in the same header-less, tab-separated layout.
    combined.write_csv("./data/MIND/graph.txt", separator="\t", include_header=False)

### Copy valid.txt to dev.txt (CBR reads the dev split from dev.txt)

In [18]:
# dev.txt is a plain copy of valid.txt; it is only created when missing.
dev_dir = os.path.join("./data/MIND", "dev.txt")

if os.path.exists(dev_dir):
    print("dev.txt exists. Skipping creation.")
else:
    print("dev.txt does not exist. Creating it now.")
    shutil.copy("./data/MIND/valid.txt", dev_dir)

dev.txt does not exist. Creating it now.


## Generate subgraph file
* Essentially the same as running the following on the command line:
```bash
python path-based/CBR/code/data/get_paths.py --dataset_name MIND --data_dir ./ --num_paths_to_collect 1000 --ignore_sequential_inverse 1
```

In [None]:
# Collect paths for the MIND dataset by running get_paths.py as a child process.
# - sys.executable runs the script with the same interpreter (and therefore the
#   same installed packages) as this kernel, instead of whichever "python"
#   happens to be first on PATH.
# - check=True raises CalledProcessError on a non-zero exit status, so a failed
#   run stops the notebook here rather than letting later cells continue with
#   missing or stale subgraph files.
subprocess.run(
    [
        sys.executable,
        "path-based/CBR/code/data/get_paths.py",
        "--dataset_name",
        "MIND",
        "--data_dir",
        "./",
        "--num_paths_to_collect",
        "1000",
        "--ignore_sequential_inverse",
        "1",
    ],
    check=True,
)

## Run CBR
* Per-relation scores are written to `./data/outputs/MIND_CBR_per_relation_scores.json`
* Predictions are written to `./data/outputs/MIND_CBR_query_data.json`

In [32]:
# Run CBR (cbr_mod.py) on the MIND test split as a child process, asking it to
# write predictions and per-relation scores under ./data/outputs.
# - sys.executable runs the script with the same interpreter (and therefore the
#   same installed packages) as this kernel, instead of whichever "python"
#   happens to be first on PATH.
# - check=True raises CalledProcessError on a non-zero exit status, so a failed
#   run is surfaced immediately instead of being silently ignored.
subprocess.run(
    [
        sys.executable,
        "path-based/CBR/code/cbr_mod.py",
        "--dataset_name",
        "MIND",
        "--data_dir",
        "./",
        "--max_num_programs",
        "25",
        "--k_adj",
        "10",
        "--test_file_name",
        "test.txt",
        "--output_dir",
        "./data/outputs",
        "--max_answers",
        "100",
        "--output_predictions",
        "--output_per_relation_scores",
        "--test",
    ],
    check=True,
)

[2024-08-26 15:26:13 	 COMMAND: path-based/CBR/code/cbr_mod.py --dataset_name MIND --data_dir ./ --max_num_programs 25 --k_adj 10 --test_file_name test.txt --output_dir ./data/outputs --max_answers 100 --output_predictions --output_per_relation_scores --test]
[2024-08-26 15:26:13 	 Loading subgraph around entities:]
9652116it [00:04, 2046856.60it/s]
[2024-08-26 15:27:35 	 Loading train map]
9652116it [00:23, 404858.33it/s] 
9651042it [00:15, 627828.74it/s] 
[2024-08-26 15:28:14 	 Loading dev map]
537it [00:00, 1233483.71it/s]
[2024-08-26 15:28:14 	 Loading test map]
537it [00:00, 1274669.64it/s]
9652116it [00:05, 1899637.71it/s]
  adj_mat = adj_mat / l2norm.reshape(l2norm.shape[0], 1)
[2024-08-26 15:28:22 	 Using device:]
[2024-08-26 15:28:22 	 {
    "data_dir": "./",
    "dataset_name": "MIND",
    "dev_file": "./data/MIND/dev.txt",
    "k_adj": 10,
    "max_answers": 100,
    "max_num_programs": 25,
    "output_dir": "./data/outputs",
    "output_per_relation_scores": true,
    "outp

cuda


9651042it [00:17, 555085.09it/s] 
537it [00:00, 1222105.94it/s]
537it [00:00, 1284849.54it/s]
[2024-08-26 15:28:44 	 Loaded...]
100%|██████████| 387/387 [20:11<00:00,  3.13s/it]  
[2024-08-26 15:48:55 	 Writing per-relation scores to ./data/outputs/MIND_CBR_per_relation_scores.json]
[2024-08-26 15:48:55 	 Out of 537 queries, atleast one program was returned for 529 queries]
[2024-08-26 15:48:55 	 Avg number of programs 8.14]
[2024-08-26 15:48:55 	 Avg number of answers after executing the programs: 7491.204134366925]
[2024-08-26 15:48:55 	 Accuracy (Loose): 0.8640595903165735]
[2024-08-26 15:48:55 	 Hits@1 0.00558659217877095]
[2024-08-26 15:48:55 	 Hits@3 0.05772811918063315]
[2024-08-26 15:48:55 	 Hits@5 0.0856610800744879]
[2024-08-26 15:48:55 	 Hits@10 0.12104283054003724]
[2024-08-26 15:48:55 	 MRR 0.048132341014984946]
[2024-08-26 15:48:55 	 Avg number of nn, that do not have the query relation: 0.0]
[2024-08-26 15:48:55 	 Avg num of returned nearest neighbors: 10.0000]
[2024-08-

CompletedProcess(args=['python', 'path-based/CBR/code/cbr_mod.py', '--dataset_name', 'MIND', '--data_dir', './', '--max_num_programs', '25', '--k_adj', '10', '--test_file_name', 'test.txt', '--output_dir', './data/outputs', '--max_answers', '100', '--output_predictions', '--output_per_relation_scores', '--test'], returncode=0)