In [1]:
# to run jupyter from a conda env install
# 'conda install nb_conda_kernels'
# open notebbok and ge to Kernel/Change Kernel and select env

In [21]:
from os import path
from c_clause import QAHandler, Loader, RulesHandler, PredictionHandler
import numpy as np
from clause import Options, Learner
from clause.util.utils import get_base_dir

### Preparation

In [2]:
# ensure that the data is downloaded and the hetionet folder is put into
# PyClause/data/

train = f"{get_base_dir()}/data/hetionet/train.txt"
filter_set = f"{get_base_dir()}/data/hetionet/valid.txt"
test_set = f"{get_base_dir()}/data/hetionet/valid.txt"
opts = Options()
exp_dir = f"{get_base_dir()}/local/paper-demonstration"

In [3]:
# The orginal data comes with entity/relation identifier like so
# (Gene::9282,  GiG, Gene::9441)
# PyClause allows to substitute these with a more readable format
# after the data and rules are loaded
# we do this here and it is used later

entity_names_f = f"{get_base_dir()}/data/hetionet/entity_strings.txt"

entity_names = {}
with open(entity_names_f, 'r') as file:
    for line in file:
        key, value = line.strip().split('\t')
        entity_names[key] = value

relation_names_f = f"{get_base_dir()}/data/hetionet/relation_strings.txt"
relation_names = {}
with open(relation_names_f, 'r') as file:
    for line in file:
        key, value = line.strip().split('\t')
        relation_names[key] = value

### Learn rules for all relations and separately for the target relation

In [4]:
rules_target_rel = path.join(exp_dir, "anyburl-rules-target.txt")
rules_all = path.join(exp_dir, "anyburl-rules-all.txt")

In [5]:
# you don't need to run this again
# rulesets are provided

if False:
    # learn rules for all relations
    opts.set("learner.mode", "anyburl")
    opts.set("learner.anyburl.time", 1000)
    learner = Learner(options=opts.get("learner"))
    learner.learn_rules(path_data=train, path_output=rules_all)

    # learn rules for target relation
    opts.set("learner.mode", "anyburl")
    opts.set("learner.anyburl.time", 1000)
    opts.set("learner.anyburl.raw.SINGLE_RELATIONS", "CtD")
    learner = Learner(options=opts.get("learner"))


## Analyze the general KG (rulset for all relations)

In [7]:
# load data (construct a loader with default options)
# loads more than 1mio triples
loader_1 = Loader(options={})
loader_1.load_data(train)

Loading triples...
Loaded triples.


In [8]:
# load the rulset that learned rules for all relations
# load only rules with constants
opts.set("loader.load_u_d_rules", False)
opts.set("loader.load_u_c_rules", True)
opts.set("loader.c_min_support", 20)
opts.set("loader.load_u_xxd_rules", False)
opts.set("loader.load_u_xxc_rules", False)
opts.set("loader.load_zero_rules", False)
opts.set("loader.load_b_rules", False)

loader_1.set_options(options=opts.get("loader"))
# load rules with original relation string representation
loader_1.load_rules(rules=rules_all)
# substitute relations/entities in loader index with more readable strings
loader_1.replace_rel_strings(relation_names)
loader_1.replace_ent_strings(entity_names)


Setting option load_zero_rules to: False
Setting option z_weight to: 0.1
Setting option z_num_unseen to: 5
Setting option z_min_support to: -1
Setting option z_min_preds to: -1
Setting option z_min_conf to: 0.0001
Setting option load_u_c_rules to: True
Setting option c_num_unseen to: 5
Setting option c_min_support to: 20
Setting option c_min_preds to: -1
Setting option c_min_conf to: 0.0001
Setting option c_max_length to: -1
Setting option load_b_rules to: False
Setting option b_max_branching_factor to: -1
Setting option b_num_unseen to: 5
Setting option b_min_support to: -1
Setting option b_min_preds to: -1
Setting option b_min_conf to: 0.0001
Setting option b_max_length to: -1
Setting option load_u_d_rules to: False
Setting option d_weight to: 0.01
Setting option d_max_branching_factor to: -1
Setting option d_num_unseen to: 5
Setting option d_min_support to: -1
Setting option d_min_preds to: -1
Setting option d_min_conf to: 0.0001
Setting option d_max_length to: -1
Setting option loa

In [9]:
## display the strongest rules with constants
rules = loader_1.get_rules()
rules = [line.split("\t") for line in rules]
rules.sort(key=lambda in_list: in_list[2], reverse=True)
num_rules  = 10000
for i in range(num_rules):
    if "Side" in rules[i][3]:
        print(rules[i][0], rules[i][1],rules[i][2], rules[i][3])

48 48 1.000000 Compound-causes-Side-Effect(Potassium Iodide,Y) <= Compound-causes-Side-Effect(Potassium Canrenoate,Y)
222 222 1.000000 Compound-causes-Side-Effect(Levobupivacaine,Y) <= Compound-causes-Side-Effect(Bupivacaine,Y)
208 208 1.000000 Compound-causes-Side-Effect(Atropine,Y) <= Compound-causes-Side-Effect(Hyoscyamine,Y)
108 108 1.000000 Compound-causes-Side-Effect(Fondaparinux sodium,Y) <= Compound-causes-Side-Effect(Sodium lauryl sulfate,Y)
204 204 1.000000 Gene-participates-Biological_Process(X,nucleobase-containing small molecule metabolic process) <= Gene-participates-Biological_Process(X,nucleoSide_triphosphate metabolic process)
39 39 1.000000 Gene-participates-Biological_Process(X,purine ribonucleoSide_triphosphate metabolic process) <= Gene-participates-Biological_Process(X,ATP generation from ADP)
38 38 1.000000 Gene-participates-Biological_Process(X,purine nucleoSide_monophosphate metabolic process) <= Gene-participates-Biological_Process(X,glycolytic process)
30 30 

In [10]:
# now display the strongest cyclical dependencies
# we have to load the data again because we performed the entity/relation_strings substitutions
# for more redable outputs but the rules appear in raw form
opts.set("loader.load_u_d_rules", False)
opts.set("loader.load_u_c_rules", False)
opts.set("loader.load_u_xxd_rules", False)
opts.set("loader.load_u_xxc_rules", False)
opts.set("loader.load_zero_rules", False)
opts.set("loader.load_b_rules", True)

loader_1 = Loader(opts.get("loader"))
loader_1.load_data(train)
loader_1.load_rules(rules=rules_all)
loader_1.replace_rel_strings(relation_names)
loader_1.replace_ent_strings(entity_names)

rules = loader_1.get_rules()
rules = [line.split("\t") for line in rules]
rules.sort(key=lambda in_list: in_list[2], reverse=True)
num_rules  = 500
for i in range(num_rules):
    print(rules[i][0], rules[i][1],rules[i][2], rules[i][3])


Setting option load_zero_rules to: False
Setting option z_weight to: 0.1
Setting option z_num_unseen to: 5
Setting option z_min_support to: -1
Setting option z_min_preds to: -1
Setting option z_min_conf to: 0.0001
Setting option load_u_c_rules to: False
Setting option c_num_unseen to: 5
Setting option c_min_support to: 20
Setting option c_min_preds to: -1
Setting option c_min_conf to: 0.0001
Setting option c_max_length to: -1
Setting option load_b_rules to: True
Setting option b_max_branching_factor to: -1
Setting option b_num_unseen to: 5
Setting option b_min_support to: -1
Setting option b_min_preds to: -1
Setting option b_min_conf to: 0.0001
Setting option b_max_length to: -1
Setting option load_u_d_rules to: False
Setting option d_weight to: 0.01
Setting option d_max_branching_factor to: -1
Setting option d_num_unseen to: 5
Setting option d_min_support to: -1
Setting option d_min_preds to: -1
Setting option d_min_conf to: 0.0001
Setting option d_max_length to: -1
Setting option loa

In [11]:
# define some custom rules and check their support in the data
# here: search for symmetric relationships
rh = RulesHandler(options=opts.get("rules_handler"))
symmetric_rules = [
    "Compound-resembles-Compound(X,Y) <= Compound-resembles-Compound(Y,X)",
    "Disease-resembles-Disease(X,Y) <= Disease-resembles-Disease(Y,X)"
]
rh.calculate_predictions(rules=symmetric_rules, loader=loader_1)
# no symmetries found
print(rh.get_statistics())


Setting option collect_predictions to: True
Setting option collect_statistics to: True
Setting option num_threads to: -1
Starting materialization of 2 rules.
[[6486, 0], [543, 0]]


## Example: drug repurposing problem

In [12]:
# use a new loader and a new Options instance
opts = Options()

opts.set("loader.load_u_c_rules", True)
opts.set("loader.c_min_support", 5)

loader_2 = Loader(options=opts.get("loader"))
loader_2.load_data(data=train)
loader_2.load_rules(rules=rules_target_rel)
loader_2.replace_ent_strings(entity_names)
loader_2.replace_rel_strings(relation_names)

Setting option load_zero_rules to: True
Setting option z_weight to: 0.1
Setting option z_num_unseen to: 5
Setting option z_min_support to: -1
Setting option z_min_preds to: -1
Setting option z_min_conf to: 0.0001
Setting option load_u_c_rules to: True
Setting option c_num_unseen to: 5
Setting option c_min_support to: 5
Setting option c_min_preds to: -1
Setting option c_min_conf to: 0.0001
Setting option c_max_length to: -1
Setting option load_b_rules to: True
Setting option b_max_branching_factor to: -1
Setting option b_num_unseen to: 5
Setting option b_min_support to: -1
Setting option b_min_preds to: -1
Setting option b_min_conf to: 0.0001
Setting option b_max_length to: -1
Setting option load_u_d_rules to: True
Setting option d_weight to: 0.01
Setting option d_max_branching_factor to: -1
Setting option d_num_unseen to: 5
Setting option d_min_support to: -1
Setting option d_min_preds to: -1
Setting option d_min_conf to: 0.0001
Setting option d_max_length to: -1
Setting option load_u_

In [19]:
# store the rules in a dict
rules_t = [rule_line.split("\t") for rule_line in loader_2.get_rules()]
rules_dict = {l[3]:l[0:3] for l in rules_t}

opts.set("qa_handler.collect_rules", True)
opts.set("qa_handler.aggregation_function", "noisyor")
opts.set("qa_handler.num_top_rules", 5)
# only output candidates that do not form a true answer in
# the loaded data
opts.set("qa_handler.filter_w_data", True)
qa = QAHandler(options=opts.get("qa_handler"))

queries = [
    ("Isoetarine", "Compound-treats-Disease"),
]

qa.calculate_answers(queries=queries, loader=loader_2, direction="tail")
answers = qa.get_answers(as_string=True)
rules = qa.get_rules(as_string=True)
print("\n-- Candidates and rules ---\n")
for i in range(10):
    print(f"Query: {queries[0]}")
    print(f"Answer {i+1}: {answers[0][i]}")
    print("Rules:")
    for rule in rules[0][i]:
        print(rule)
    print("...")



Setting option collect_rules to: True

-- Candidates and rules ---

Query: ('Isoetarine', 'Compound-treats-Disease')
Answer 1: ('asthma', 0.44917384883476097)
Rules:
Compound-treats-Disease(X,asthma) <= Compound-binds-Gene(X,ADRB2)
Compound-treats-Disease(X,Y) <= Compound-resembles-Compound(X,A), Compound-resembles-Compound(B,A), Compound-treats-Disease(B,Y)
Compound-treats-Disease(X,Y) <= Compound-resembles-Compound(A,X), Compound-treats-Disease(A,Y)
Compound-treats-Disease(X,asthma) <= Compound-binds-Gene(X,ADRB1)
Compound-treats-Disease(X,Y) <= Compound-resembles-Compound(A,X), Compound-resembles-Compound(A,B), Compound-treats-Disease(B,Y)
...
Query: ('Isoetarine', 'Compound-treats-Disease')
Answer 2: ('hypertension', 0.4018071031819255)
Rules:
Compound-treats-Disease(X,hypertension) <= Compound-binds-Gene(X,ADRB2)
Compound-treats-Disease(X,hypertension) <= Compound-binds-Gene(X,ADRB1)
Compound-treats-Disease(X,Y) <= Compound-resembles-Compound(A,X), Compound-resembles-Compound(A,B)

In [20]:
# check stats of first rule
rules_dict['Compound-treats-Disease(X,asthma) <= Compound-binds-Gene(X,ADRB2)']

['52', '8', '0.153846']

In [34]:
# analyze the first candidate more closely
opts.set("prediction_handler.collect_explanations", True)
opts.set("prediction_handler.aggregation_function", "noisyor")
opts.set("prediction_handler.num_top_rules", 5)

ph = PredictionHandler(opts.get("prediction_handler"))
triples = [
    ("Isoetarine", "Compound-treats-Disease", "asthma"),
]
ph.calculate_scores(triples=triples, loader=loader_2)
# outputs same score as above fir the first candidate
ph.get_scores(as_string=False)
targets, rules, groundings = ph.get_explanations(as_string=True)

print("\n --- Rule and grounding example --- \n")
# 3rd rule of first target triple (there is only 1)
print(rules[0][2])
# body groundings of 3rd rule of first target triple
# each grounding is a list of triples
for gr in groundings[0][2]:
    print("Next grounding:")
    for triple in gr:
        print(triple)

Setting option aggregation_function to: noisyor

 --- Rule and grounding example --- 

Compound-treats-Disease(X,Y) <= Compound-resembles-Compound(A,X), Compound-treats-Disease(A,Y)
Next grounding:
['Salbutamol', 'Compound-resembles-Compound', 'Isoetarine']
['Salbutamol', 'Compound-treats-Disease', 'asthma']
Next grounding:
['Isoprenaline', 'Compound-resembles-Compound', 'Isoetarine']
['Isoprenaline', 'Compound-treats-Disease', 'asthma']
Next grounding:
['Orciprenaline', 'Compound-resembles-Compound', 'Isoetarine']
['Orciprenaline', 'Compound-treats-Disease', 'asthma']
Setting option collect_explanations to: True
Setting option num_top_rules to: 5
Setting option num_threads to: -1


### Constraint checking

In [41]:
from c_clause import PredictionHandler
opts.set("prediction_handler.num_top_rules", 100)
opts.set("prediction_handler.collect_explanations", True)
opts.set("prediction_handler.aggregation_function", "noisyor")
p = PredictionHandler(opts.get("prediction_handler"))

triples =[
    ("Ibuprofen", "Compound-treats-Disease", "malaria"),
    ("Verapamil", "Compound-treats-Disease", "migraine"),
    ("Enalapril", "Compound-treats-Disease", "hypertension"),
]

rules = ["Compound-treats-Disease(X,Y) <= Compound-treats-Disease(X,A), Compound-treats-Disease(B,A), Compound-treats-Disease(B,Y)"]
stats = [(1, 1)]
print("")
loader_2.set_options(options={"b_num_unseen": "0"})
loader_2.load_rules(rules=rules, stats=stats)

p.calculate_scores(triples=triples, loader=loader_2)
scores = p.get_scores(as_string=True)
print(scores)
filtered = []
for results in scores:
    if float(results[3])>0:
        filtered.append((results[0], results[1], results[2]))


# in short
[triple for triple in scores if float(triple[3])>0.0]



[['Ibuprofen', 'Compound-treats-Disease', 'malaria', '0.000000'], ['Verapamil', 'Compound-treats-Disease', 'migraine', '1.000000'], ['Enalapril', 'Compound-treats-Disease', 'hypertension', '1.000000']]
Setting option aggregation_function to: noisyor
Setting option collect_explanations to: True
Setting option num_top_rules to: 100
Setting option num_threads to: -1
Setting option b_num_unseen to: 0
Loaded 1 rules.


[['Verapamil', 'Compound-treats-Disease', 'migraine', '1.000000'],
 ['Enalapril', 'Compound-treats-Disease', 'hypertension', '1.000000']]