# Implementing EDAM inferences

## Working on a toy example

In [1]:
!pip install rdflib

from rdflib import Graph



In [2]:
# Toy ontology (Turtle) used to exercise the inference rules on a small case.
# It declares five classes and chains OWL "someValuesFrom" restrictions:
#   C subClassOf C_prime -> has_topic Topic_1 -> has_operation Operation_1 -> has_output Data_1
#   Topic_1 subClassOf Topic_2 -> has_operation Operation_2 -> has_input Data_2
# Topic_2, Operation_2 and Data_2 are referenced but not explicitly typed as owl:Class here.
# The "ASSERTION n" comments inside the string state what is expected to be inferred for C.
my_edam_light = """
@prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> .
@prefix owl: <http://www.w3.org/2002/07/owl#> .
@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .
@prefix edam: <http://edamontology.org/> .

edam:C rdf:type owl:Class .
edam:C_prime rdf:type owl:Class .
edam:Topic_1 rdf:type owl:Class .
edam:Operation_1 rdf:type owl:Class .
edam:Data_1 rdf:type owl:Class .

edam:C rdfs:subClassOf edam:C_prime .

## C_prime --> implies --> Topic 1 .
edam:C_prime  rdfs:subClassOf [
        rdf:type owl:Restriction ;
        owl:onProperty edam:has_topic ;
        owl:someValuesFrom edam:Topic_1 ;
    ] .
### --> ASSERTION 1: C should have topic Topic1 (by transitivity of subClassOf)

## Topic1 --> implies --> Operation 1
edam:Topic_1 rdfs:subClassOf [
        rdf:type owl:Restriction ;
        owl:onProperty edam:has_operation ;
        owl:someValuesFrom edam:Operation_1 ;
    ] .
### --> ASSERTION 2: C should have operation Operation_1

## Operation 1 --> implies --> Data 1 as OUTPUT
edam:Operation_1 rdfs:subClassOf [
        rdf:type owl:Restriction ;
        owl:onProperty edam:has_output ;
        owl:someValuesFrom edam:Data_1 ;
    ] .
### --> ASSERTION 3: C should have as output Data_1


edam:Topic_1 rdfs:subClassOf edam:Topic_2 .
edam:Topic_2 rdfs:subClassOf [
        rdf:type owl:Restriction ;
        owl:onProperty edam:has_operation ;
        owl:someValuesFrom edam:Operation_2 ;
    ] .
### --> ASSERTION 4: C should have operation Operation_2

## Operation 2 --> implies --> Data 2 as INPUT
edam:Operation_2 rdfs:subClassOf [
        rdf:type owl:Restriction ;
        owl:onProperty edam:has_input ;
        owl:someValuesFrom edam:Data_2 ;
    ] .
### --> ASSERTION 5: C should have as input Data_2

"""

In [3]:
# Parse the toy ontology into a fresh graph, then show its size followed by
# its Turtle serialization.
kg = Graph()
kg.parse(data=my_edam_light, format="turtle")
print(len(kg), kg.serialize(format="turtle"), sep="\n")

27
@prefix edam: <http://edamontology.org/> .
@prefix owl: <http://www.w3.org/2002/07/owl#> .
@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .

edam:C a owl:Class ;
    rdfs:subClassOf edam:C_prime .

edam:C_prime a owl:Class ;
    rdfs:subClassOf [ a owl:Restriction ;
            owl:onProperty edam:has_topic ;
            owl:someValuesFrom edam:Topic_1 ] .

edam:Data_1 a owl:Class .

edam:Operation_1 a owl:Class ;
    rdfs:subClassOf [ a owl:Restriction ;
            owl:onProperty edam:has_output ;
            owl:someValuesFrom edam:Data_1 ] .

edam:Operation_2 rdfs:subClassOf [ a owl:Restriction ;
            owl:onProperty edam:has_input ;
            owl:someValuesFrom edam:Data_2 ] .

edam:Topic_1 a owl:Class ;
    rdfs:subClassOf [ a owl:Restriction ;
            owl:onProperty edam:has_operation ;
            owl:someValuesFrom edam:Operation_1 ],
        edam:Topic_2 .

edam:Topic_2 rdfs:subClassOf [ a owl:Restriction ;
            owl:onProperty edam:has_operation ;

Expressed as rules:
```
RULE1: C hasTopic T AND T hasOperation O --> C hasOperation O
RULE1bis: C hasOperation O AND O hasTopic T --> C hasTopic T
RULE2: C hasOperation O AND O hasInput D --> C hasInput D
RULE3: C hasOperation O AND O hasOutput D --> C hasOutput D
RULE4: C hasInput D AND D hasTopic T --> C hasTopic T
RULE5: C hasOutput D AND D hasTopic T --> C hasTopic T
RULE6: C hasInput D AND D hasOperation O --> C hasOperation O
RULE7: C hasOutput D AND D hasOperation O --> C hasOperation O
```


In [42]:
import time

# Prefix declarations shared by every rule. Declaring them in the query text
# keeps each rule valid SPARQL on its own, instead of relying on prefixes that
# happen to be bound on the graph the rule is run against.
SPARQL_PREFIXES = """
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX owl: <http://www.w3.org/2002/07/owl#>
PREFIX edam: <http://edamontology.org/>
"""


def _chain_rule(link, inferred):
    """
    Build the CONSTRUCT query for a two-step propagation rule.

    The generated rule reads: if ``?concept`` (an owl:Class) is related to
    ``?via`` through ``edam:<link>``, and ``?via`` is related to ``?target``
    through ``edam:<inferred>``, then ``?concept edam:<inferred> ?target``.

    Parameters
    ----------
    link : str
        Local name of the EDAM property connecting the concept to the
        intermediate node (e.g. ``"has_topic"``).
    inferred : str
        Local name of the EDAM property that is read on the intermediate node
        and asserted on the concept (e.g. ``"has_operation"``).

    Returns
    -------
    str
        A SPARQL CONSTRUCT query.
    """
    return (
        SPARQL_PREFIXES
        + f"""CONSTRUCT {{
  ?concept edam:{inferred} ?target .
}} WHERE {{
  ?concept rdf:type owl:Class .
  ?concept edam:{link} ?via .
  ?via edam:{inferred} ?target .
}}
"""
    )


# A concept receives every topic / operation / input / output attached to
# itself or to any of its (transitive) super-classes.
rule_subClassOf = (
    SPARQL_PREFIXES
    + """CONSTRUCT {
  ?concept edam:has_topic ?topic ;
           edam:has_operation ?operation ;
           edam:has_input ?in_data ;
           edam:has_output ?out_data .

} WHERE {
    ?concept rdfs:subClassOf* ?super_concept .
    OPTIONAL { ?super_concept edam:has_topic ?topic } .
    OPTIONAL { ?super_concept edam:has_operation ?operation } .
    OPTIONAL { ?super_concept edam:has_input ?in_data } .
    OPTIONAL { ?super_concept edam:has_output ?out_data } .
}
"""
)

# Flatten OWL "someValuesFrom" restrictions into direct triples, and type the
# restriction target as owl:Class so that the chain rules below can match it.
rule_0 = (
    SPARQL_PREFIXES
    + """CONSTRUCT {
  ?concept ?rel ?class .
  ?class rdf:type owl:Class .
} WHERE {
    ?concept rdfs:subClassOf [
        rdf:type owl:Restriction ;
        owl:onProperty ?rel ;
        owl:someValuesFrom ?class ;
    ] .
}
"""
)

# topic -> operation
rule_1 = _chain_rule("has_topic", "has_operation")
# operation -> topic
rule_1_bis = _chain_rule("has_operation", "has_topic")
# operation -> input data
rule_2 = _chain_rule("has_operation", "has_input")
# operation -> output data
rule_3 = _chain_rule("has_operation", "has_output")
# input data -> topic
rule_4 = _chain_rule("has_input", "has_topic")
# output data -> topic
rule_5 = _chain_rule("has_output", "has_topic")
# input data -> operation
rule_6 = _chain_rule("has_input", "has_operation")
# output data -> operation
rule_7 = _chain_rule("has_output", "has_operation")

# Order matters for the per-rule report: rules are labelled by their index here.
rules = [rule_0, rule_1, rule_1_bis, rule_2, rule_3, rule_4, rule_5, rule_6, rule_7]


def apply_all_rules(kg, rule_set=None):
    """
    Run every CONSTRUCT rule once, then the subClassOf propagation rule.

    The input graph is not modified: its triples are copied into a working
    graph, each rule is evaluated against that working graph, and the triples
    it constructs are merged in before the next rule runs. The number of new
    triples contributed by each rule is printed.

    Parameters
    ----------
    kg : Graph
        Knowledge graph to run the rules on.
    rule_set : list of str, optional
        SPARQL CONSTRUCT queries to apply, in order. Defaults to the
        module-level ``rules`` list.

    Returns
    -------
    Graph
        A new graph holding the triples of ``kg`` plus the inferred ones.
    """
    if rule_set is None:
        rule_set = rules

    inferred_kg = Graph()
    inferred_kg += kg

    # The subClassOf rule always runs last, after the numbered rules.
    labelled_rules = [(str(i), rule) for i, rule in enumerate(rule_set)]
    labelled_rules.append(("subClassOf", rule_subClassOf))

    for label, rule in labelled_rules:
        res_kg = inferred_kg.query(rule).graph
        # Merge in place and count by size difference; building a full graph
        # union just to count the new triples copies the whole graph each time.
        size_before = len(inferred_kg)
        inferred_kg += res_kg
        print(f"{len(inferred_kg) - size_before} triples generated from rule {label}")

    return inferred_kg


def transitive_closure(kg, max_iterations=10):
    """
    Apply the inference rules repeatedly until no new triple is produced.

    Each iteration calls ``apply_all_rules`` and adds the triples that were not
    yet in ``kg``. The loop stops as soon as an iteration infers nothing new,
    or after ``max_iterations`` iterations, whichever comes first. The number
    of new triples and the duration of each iteration are printed.

    Note that ``kg`` is updated in place: the returned graph is the same
    object as the one passed in.

    Parameters
    ----------
    kg : Graph
        Knowledge graph to saturate; it receives the inferred triples.
    max_iterations : int, optional
        Upper bound on the number of iterations (default 10).

    Returns
    -------
    Graph
        ``kg`` itself, enriched with the inferred triples.
    """
    for iteration in range(1, max_iterations + 1):
        started = time.perf_counter()

        kg_with_inferences = apply_all_rules(kg)

        new_triples = kg_with_inferences - kg
        card_new_triples = len(new_triples)
        print(f"Inferred {card_new_triples} new triples")

        # Only the new triples need to be added; the rest is already in kg.
        kg += new_triples
        duration = time.perf_counter() - started
        print(f"Iteration {iteration} Duration : {round(duration, 2)}s")

        if card_new_triples == 0:
            break
    else:
        # The iteration budget ran out while triples were still being inferred.
        print(
            f"Stopped after {max_iterations} iterations without reaching a fixed point"
        )
    return kg

In [43]:
# Saturate the toy graph with the inference rules, then show the resulting
# size followed by the Turtle serialization.
kg_2 = transitive_closure(kg)
print(len(kg_2), kg_2.serialize(format="turtle"), sep="\n")

0 triples generated from rule 0
0 triples generated from rule 1
0 triples generated from rule 2
0 triples generated from rule 3
0 triples generated from rule 4
0 triples generated from rule 5
0 triples generated from rule 6
0 triples generated from rule 7
0 triples generated from rule 8
0 triples generated from rule subClassOf
Inferred 0 new triples
Iteration 1 Duration : 0.05s
46
@prefix edam: <http://edamontology.org/> .
@prefix owl: <http://www.w3.org/2002/07/owl#> .
@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .

edam:C a owl:Class ;
    edam:has_input edam:Data_2 ;
    edam:has_operation edam:Operation_1,
        edam:Operation_2 ;
    edam:has_output edam:Data_1 ;
    edam:has_topic edam:Topic_1 ;
    rdfs:subClassOf edam:C_prime .

edam:C_prime a owl:Class ;
    edam:has_input edam:Data_2 ;
    edam:has_operation edam:Operation_1,
        edam:Operation_2 ;
    edam:has_output edam:Data_1 ;
    edam:has_topic edam:Topic_1 ;
    rdfs:subClassOf [ a owl:Restriction ;
    

## Applying the same rule set to the full bio.tools and EDAM knowledge graph

In [44]:
# Start the full knowledge graph from the published EDAM ontology (RDF/XML).
full_kg = Graph()
full_kg.parse("https://edamontology.org/EDAM.owl", format="xml")
edam_triple_count = len(full_kg)
print(f"Successfully loaded EDAM: {edam_triple_count} triples")

Successfully loaded EDAM: 38792 triples


In [45]:
# Add the bio.tools Bioschemas dump to the same graph; the count printed below
# is the size of the combined graph (EDAM + bio.tools).
biotools_dump_url = "https://github.com/research-software-ecosystem/content/raw/refs/heads/master/datasets/bioschemas-dump.ttl"
full_kg.parse(biotools_dump_url, format="turtle")
print(f"Successfully loaded bio.tools: {len(full_kg)} triples")

Successfully loaded bio.tools: 672116 triples


In [46]:
# Saturate the combined EDAM + bio.tools graph with the inference rules.
full_inferred_kg = transitive_closure(full_kg)
inferred_triple_count = len(full_inferred_kg)
print(inferred_triple_count)

604 triples generated from rule 0
0 triples generated from rule 1
0 triples generated from rule 2
0 triples generated from rule 3
0 triples generated from rule 4
25 triples generated from rule 5
17 triples generated from rule 6
0 triples generated from rule 7
0 triples generated from rule 8
1549 triples generated from rule subClassOf
Inferred 2195 new triples
Iteration 1 Duration : 140.52s
0 triples generated from rule 0
0 triples generated from rule 1
0 triples generated from rule 2
0 triples generated from rule 3
0 triples generated from rule 4
9 triples generated from rule 5
12 triples generated from rule 6
0 triples generated from rule 7
0 triples generated from rule 8
0 triples generated from rule subClassOf
Inferred 21 new triples
Iteration 2 Duration : 148.26s
0 triples generated from rule 0
0 triples generated from rule 1
0 triples generated from rule 2
0 triples generated from rule 3
0 triples generated from rule 4
0 triples generated from rule 5
0 triples generated from rule 

In [47]:
# Persist the saturated graph. The trailing semicolon keeps the notebook from
# echoing the Graph repr that this call returns.
full_inferred_kg.serialize(destination="out_inf.ttl", format="turtle");


<Graph identifier=N8b99de6896714ad98a9b4c851a927238 (<class 'rdflib.graph.Graph'>)>

In [48]:
# Reload the saturated graph from the Turtle file written earlier.
full_inferred_kg = Graph()
full_inferred_kg.parse("out_inf.ttl", format="turtle")
reloaded_triple_count = len(full_inferred_kg)
print(f"Successfully loaded inferred KG: {reloaded_triple_count} triples")


Successfully loaded inferred KG: 674332 triples


In [63]:
# Describe three bio.tools entries in the saturated graph and print the
# description as Turtle.
desc_q = """
DESCRIBE <https://bio.tools/multiqc> <https://bio.tools/qiime2> <https://bio.tools/frogs> 
"""
res = full_inferred_kg.query(desc_q).graph
described_turtle = res.serialize(format="turtle")
print(described_turtle)


@prefix biotools: <https://bio.tools/ontology/> .
@prefix bsc: <http://bioschemas.org/> .
@prefix bsct: <http://bioschemas.org/types/> .
@prefix dcterms: <http://purl.org/dc/terms/> .
@prefix edam: <http://edamontology.org/> .
@prefix sc: <http://schema.org/> .
@prefix xsd: <http://www.w3.org/2001/XMLSchema#> .

<https://bio.tools/frogs> a sc:SoftwareApplication ;
    dcterms:conformsTo "https://bioschemas.org/profiles/ComputationalTool/0.6-DRAFT" ;
    sc:additionalType "Command-line tool" ;
    sc:applicationSubCategory edam:topic_0637,
        edam:topic_3168,
        edam:topic_3174,
        edam:topic_3299,
        edam:topic_3697 ;
    sc:citation <https://doi.org/10.1093/bib/bbab318>,
        <https://doi.org/10.1093/bioinformatics/btx791>,
        "pubmed:29228191",
        "pubmed:34410336" ;
    sc:description "The user-friendly and Galaxy-supported pipeline FROGS analyses large sets of DNA amplicons sequences accurately and rapidly, essential for microbe community studies." 

In [64]:
# For each selected tool, gather its asserted annotations (topics, operations,
# input and output data) together with the ones reachable through the inferred
# edam:has_topic / has_operation / has_input / has_output links, and attach
# them directly to the tool.
q = """
PREFIX edam: <http://edamontology.org/>
PREFIX bsc: <http://bioschemas.org/>
PREFIX sc: <http://schema.org/>

CONSTRUCT {
  ?X sc:applicationSubCategory ?topic, ?inf_ot, ?inf_odt, ?inf_idt ;
     sc:featureList ?operation, ?inf_to, ?inf_odo, ?inf_ido ;
     bsc:output ?out_data, ?inf_tod, ?inf_ood ;
     bsc:input ?in_data, ?inf_tid, ?inf_oid .
} WHERE {
    VALUES ?X { <https://bio.tools/multiqc> <https://bio.tools/qiime2> <https://bio.tools/frogs> }
    ?X a sc:SoftwareApplication .
    OPTIONAL {?X bsc:output/sc:additionalType ?out_data } .
    OPTIONAL {?X bsc:output/sc:additionalType/edam:has_topic ?inf_odt } .
    OPTIONAL {?X bsc:output/sc:additionalType/edam:has_operation ?inf_odo } .
    OPTIONAL {?X bsc:input/sc:additionalType ?in_data } .
    OPTIONAL {?X bsc:input/sc:additionalType/edam:has_topic ?inf_idt } .
    OPTIONAL {?X bsc:input/sc:additionalType/edam:has_operation ?inf_ido } .
    OPTIONAL {?X sc:applicationSubCategory ?topic } .
    OPTIONAL {?X sc:applicationSubCategory/edam:has_operation ?inf_to } .
    OPTIONAL {?X sc:applicationSubCategory/edam:has_input ?inf_tid } .
    OPTIONAL {?X sc:applicationSubCategory/edam:has_output ?inf_tod } .
    OPTIONAL {?X sc:featureList ?operation } .
    OPTIONAL {?X sc:featureList/edam:has_topic ?inf_ot } .
    OPTIONAL {?X sc:featureList/edam:has_input ?inf_oid } .
    OPTIONAL {?X sc:featureList/edam:has_output ?inf_ood } .
}
"""

# NOTE: despite its name, res_multiqc holds the result for every tool listed in
# the VALUES clause above, not only MultiQC; the printed label reflects that.
res_multiqc = full_inferred_kg.query(q).graph
print(
    f"Inferred knowledge graph for the selected tools has {len(res_multiqc)} triples"
)
print(res_multiqc.serialize(format="turtle"))

MultiQC inferred knowledge graph has 29 triples
@prefix ns1: <http://bioschemas.org/> .
@prefix ns2: <http://schema.org/> .

<https://bio.tools/frogs> ns2:applicationSubCategory <http://edamontology.org/topic_0080>,
        <http://edamontology.org/topic_0637>,
        <http://edamontology.org/topic_3168>,
        <http://edamontology.org/topic_3174>,
        <http://edamontology.org/topic_3299>,
        <http://edamontology.org/topic_3697> ;
    ns2:featureList <http://edamontology.org/operation_3460> .

<https://bio.tools/multiqc> ns1:input <http://edamontology.org/data_2977> ;
    ns1:output <http://edamontology.org/data_0867>,
        <http://edamontology.org/data_3914> ;
    ns2:applicationSubCategory <http://edamontology.org/topic_0077>,
        <http://edamontology.org/topic_0080>,
        <http://edamontology.org/topic_0091>,
        <http://edamontology.org/topic_0622>,
        <http://edamontology.org/topic_3168> ;
    ns2:featureList <http://edamontology.org/operation_2428>,

In [56]:
# List the topics attached to MultiQC. The sc: prefix is declared in the query
# so that it does not depend on the prefixes bound on the queried graph.
desc_q = """
PREFIX sc: <http://schema.org/>
SELECT * WHERE {
<https://bio.tools/multiqc> sc:applicationSubCategory ?inf_tt
}
"""
res = full_inferred_kg.query(desc_q)
for row in res:
    print(f"MultiQC has topic {row.inf_tt}")

MultiQC has topic http://edamontology.org/topic_0080
MultiQC has topic http://edamontology.org/topic_0091
MultiQC has topic http://edamontology.org/topic_0622
MultiQC has topic http://edamontology.org/topic_3168


In [39]:
# Describe EDAM operation_2478 in the saturated graph and print it as Turtle.
desc_q = """
DESCRIBE <http://edamontology.org/operation_2478>
"""
res = full_inferred_kg.query(desc_q).graph
operation_2478_turtle = res.serialize(format="turtle")
print(operation_2478_turtle)

@prefix edam: <http://edamontology.org/> .
@prefix oboInOwl: <http://www.geneontology.org/formats/oboInOwl#> .
@prefix owl: <http://www.w3.org/2002/07/owl#> .
@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .

edam:operation_2478 a owl:Class ;
    rdfs:label "Nucleic acid sequence analysis" ;
    edam:created_in "beta12orEarlier" ;
    edam:has_input edam:data_2977 ;
    edam:has_topic edam:topic_0077,
        edam:topic_0080 ;
    oboInOwl:hasDefinition "Analyse a nucleic acid sequence (using methods that are only applicable to nucleic acid sequences)." ;
    oboInOwl:hasExactSynonym "Sequence analysis (nucleic acid)" ;
    oboInOwl:hasNarrowSynonym "Nucleic acid sequence alignment analysis",
        "Sequence alignment analysis (nucleic acid)" ;
    oboInOwl:inSubset edam:bio,
        edam:operations ;
    rdfs:subClassOf [ a owl:Restriction ;
            owl:onProperty edam:has_topic ;
            owl:someValuesFrom edam:topic_0077 ],
        [ a owl:Restriction ;
            

In [51]:
# Describe EDAM operation_3218 in the saturated graph and print it as Turtle.
desc_q = """
DESCRIBE <http://edamontology.org/operation_3218>
"""
res = full_inferred_kg.query(desc_q).graph
operation_3218_turtle = res.serialize(format="turtle")
print(operation_3218_turtle)


@prefix edam: <http://edamontology.org/> .
@prefix oboInOwl: <http://www.geneontology.org/formats/oboInOwl#> .
@prefix owl: <http://www.w3.org/2002/07/owl#> .
@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .

edam:operation_3218 a owl:Class ;
    rdfs:label "Sequencing quality control" ;
    edam:created_in "1.1" ;
    edam:has_input edam:data_2977 ;
    edam:has_topic edam:topic_0077,
        edam:topic_0080 ;
    oboInOwl:hasDefinition "Raw sequence data quality control." ;
    oboInOwl:hasExactSynonym "Sequencing QC",
        "Sequencing quality assessment" ;
    oboInOwl:inSubset edam:bio,
        edam:operations ;
    rdfs:comment "Analyse raw sequence data from a sequencing pipeline and identify (and possiby fix) problems." ;
    rdfs:subClassOf edam:operation_2428,
        edam:operation_2478 .




In [29]:
# Test of the rule "C hasInput D AND D hasOperation O": start from a graph
# that contains EDAM only.

edam_kg = Graph()
edam_kg.parse("https://edamontology.org/EDAM.owl", format="xml")
edam_only_triple_count = len(edam_kg)
print(f"Successfully loaded EDAM: {edam_only_triple_count} triples")

Successfully loaded EDAM: 38792 triples


In [None]:
# Run rule 0 on EDAM. edam_kg_2 is the graph built by the CONSTRUCT query,
# i.e. the triples produced by the rule rather than EDAM plus those triples.
edam_kg_2 = edam_kg.query(rule_0).graph
print(
    f"EDAM after rule 0 has {len(edam_kg_2)} triples",
    edam_kg_2.serialize(format="turtle"),
    sep="\n",
)

EDAM after rule 0 has 862 triples
@prefix ns1: <http://edamontology.org/> .
@prefix owl: <http://www.w3.org/2002/07/owl#> .
@prefix skos: <http://www.w3.org/2004/02/skos/core#> .

ns1:data_0860 ns1:has_topic ns1:topic_0160 .

ns1:data_0940 ns1:has_topic ns1:topic_1317 .

ns1:data_0942 ns1:has_topic ns1:topic_0121 .

ns1:data_0944 ns1:has_topic ns1:topic_0121 .

ns1:data_0945 ns1:has_topic ns1:topic_0121 .

ns1:data_0956 ns1:has_topic ns1:topic_3489 .

ns1:data_0987 ns1:is_identifier_of ns1:data_0919 .

ns1:data_0989 ns1:is_identifier_of ns1:data_0896 .

ns1:data_0993 ns1:is_identifier_of ns1:data_2851 .

ns1:data_0994 ns1:is_identifier_of ns1:data_2016 .

ns1:data_1008 ns1:is_identifier_of ns1:data_1467 .

ns1:data_1025 ns1:is_identifier_of ns1:data_0916 .

ns1:data_1038 ns1:is_identifier_of ns1:data_1468 .

ns1:data_1048 ns1:is_identifier_of ns1:data_0957 .

ns1:data_1051 ns1:is_identifier_of ns1:data_0582 .

ns1:data_1061 ns1:is_identifier_of ns1:data_0847 .

ns1:data_1063 ns1:is_ide

In [32]:
# SELECT counterpart of the input-data -> operation rule, used to inspect which
# bindings the rule would match. All prefixes used in the pattern are declared
# so that the query does not depend on the prefixes bound on the queried graph.
test_r_6 = """
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX owl: <http://www.w3.org/2002/07/owl#>
PREFIX edam: <http://edamontology.org/>
SELECT * WHERE {
    ?concept rdf:type owl:Class .
    ?concept edam:has_input ?data .
    ?data edam:has_operation ?operation .
}
"""

res = edam_kg_2.query(test_r_6)
for row in res:
    print(f"{row.concept} has operation {row.operation} through input {row.data}")