# Ontology Examples

Using the KGTK query API to find paths between qnodes

### Preamble: set up the environment and files used in the tutorial

In [1]:
import io
import os
import subprocess
import sys

import kgtk.kypher.api as kapi
import numpy as np
import pandas as pd
from kgtk.configure_kgtk_notebooks import ConfigureKGTK
from kgtk.functions import kgtk, kypher

In [2]:
# Parameters

# Local KGTK folder handed to ConfigureKGTK as `kgtk_path`
kgtk_path = "/Users/pedroszekely/Documents/GitHub/kgtk"

# Folder holding the input Wikidata KGTK files.
# This cell used to assign
#   "/Volumes/GoogleDrive/Shared drives/KGTK/datasets/wikidata-20210215/data/"
# first and then immediately overwrite it; only the effective value is kept.
input_path = "/Volumes/GoogleDrive/Shared drives/KGTK/datasets/wikidata-20210215-dwd-v2/"

# Folder on local machine where to create the output and temporary folders
output_path = "/Users/pedroszekely/Downloads/kypher/projects"

# SQLite file used as the Kypher graph cache
graph_cache_path = "/Users/pedroszekely/Downloads/kypher/wikidata.sqlite3.db"

# Project name; the outputs go to a sub-folder of `output_path` with this name
project_name = "browser-profiling"

Our Wikidata distribution partitions the knowledge in Wikidata into smaller files that make it possible for you to pick and choose which files you want to use. Our tutorial KG is a subset of Wikidata, and is partitioned in the same way as the full Wikidata. The cell below lists the subset of those files used in this notebook:

In [3]:
# Short names of the Wikidata files this notebook works with.
files = ["item", "p279star", "label"]

# Build the KGTK configuration helper for those files, then point it at the
# input folder, output folder, graph cache and project chosen in the
# parameters cell.
ck = ConfigureKGTK(files, kgtk_path=kgtk_path)
ck.configure_kgtk(
    input_graph_path=input_path,
    output_path=output_path,
    graph_cache_path=graph_cache_path,
    project_name=project_name,
    debug=True,
)

User home: /Users/pedroszekely
Current dir: /Users/pedroszekely/Documents/GitHub/kgtk-tutorial-files/use-cases
KGTK dir: /Users/pedroszekely/Documents/GitHub/kgtk
Use-cases dir: /Users/pedroszekely/Documents/GitHub/kgtk/use-cases


The KGTK setup command defines environment variables for all the files so that you can reuse the Jupyter notebook when you install it on your local machine.

In [4]:
# Print the environment variables defined by the KGTK configuration step.
ck.print_env_variables()

KGTK_LABEL_FILE: /Volumes/GoogleDrive/Shared drives/KGTK/datasets/wikidata-20210215-dwd-v2//labels.en.tsv.gz
STORE: /Users/pedroszekely/Downloads/kypher/wikidata.sqlite3.db
kgtk: kgtk --debug
KGTK_GRAPH_CACHE: /Users/pedroszekely/Downloads/kypher/wikidata.sqlite3.db
EXAMPLES_DIR: /Users/pedroszekely/Documents/GitHub/kgtk/examples
kypher: kgtk --debug query --graph-cache /Users/pedroszekely/Downloads/kypher/wikidata.sqlite3.db
OUT: /Users/pedroszekely/Downloads/kypher/projects/browser-profiling
KGTK_OPTION_DEBUG: false
GRAPH: /Volumes/GoogleDrive/Shared drives/KGTK/datasets/wikidata-20210215-dwd-v2/
TEMP: /Users/pedroszekely/Downloads/kypher/projects/browser-profiling/temp.browser-profiling
USE_CASES_DIR: /Users/pedroszekely/Documents/GitHub/kgtk/use-cases
item: /Volumes/GoogleDrive/Shared drives/KGTK/datasets/wikidata-20210215-dwd-v2//claims.wikibase-item.tsv.gz
p279star: /Volumes/GoogleDrive/Shared drives/KGTK/datasets/wikidata-20210215-dwd-v2//derived.P279star.tsv.gz
label: /Volumes/

In [5]:
# Load the configured input files into the graph cache; the command echoed in
# the output shows this is done with a single `kgtk query` over all inputs.
ck.load_files_into_cache()

kgtk --debug query --graph-cache /Users/pedroszekely/Downloads/kypher/wikidata.sqlite3.db -i "/Volumes/GoogleDrive/Shared drives/KGTK/datasets/wikidata-20210215-dwd-v2//claims.wikibase-item.tsv.gz" --as item  -i "/Volumes/GoogleDrive/Shared drives/KGTK/datasets/wikidata-20210215-dwd-v2//derived.P279star.tsv.gz" --as p279star  -i "/Volumes/GoogleDrive/Shared drives/KGTK/datasets/wikidata-20210215-dwd-v2//labels.en.tsv.gz" --as label  --limit 3
[2021-12-02 21:18:59 query]: SQL Translation:
---------------------------------------------
  SELECT *
     FROM graph_1 AS graph_1_c1
     LIMIT ?
  PARAS: [3]
---------------------------------------------
node1	label	node2	id	rank	node2;wikidatatype
P10	P1629	Q34508	P10-P1629-Q34508-bcc39400-0	normal	wikibase-item
P10	P1855	Q15075950	P10-P1855-Q15075950-7eff6d65-0	normal	wikibase-item
P10	P1855	Q4504	P10-P1855-Q4504-a69d2c73-0	normal	wikibase-item


In [6]:
# best indexing for this notebook
# Each input gets its own `--idx mode:...` setting, one command per input.
# NOTE: re-indexing can be slow; the log captured below spans about 16 minutes
# (21:18:59 to 21:35:21) of dropping and re-creating indexes.
# Single-command variant, kept for reference:
# !kgtk --debug query -i item --idx mode:graph -i p279star --idx mode:monograph --limit 3
!kgtk --debug query -i item --idx mode:graph --limit 3
!kgtk --debug query -i p279star --idx mode:monograph --limit 3
!kgtk --debug query -i label --idx mode:valuegraph --limit 3

[2021-12-02 21:18:59 query]: SQL Translation:
---------------------------------------------
  SELECT *
     FROM graph_1 AS graph_1_c1
     LIMIT ?
  PARAS: [3]
---------------------------------------------
[2021-12-02 21:18:59 sqlstore]: DROP INDEX "graph_1_node1_label_node2_idx"
[2021-12-02 21:20:11 sqlstore]: DROP INDEX "graph_1_label_idx"
[2021-12-02 21:20:40 sqlstore]: DROP INDEX "graph_1_node2_label_node1_idx"
[2021-12-02 21:21:51 sqlstore]: CREATE INDEX "graph_1_node1_label_node2_idx" ON "graph_1" ("node1", "label", "node2")
[2021-12-02 21:25:00 sqlstore]: ANALYZE "graph_1_node1_label_node2_idx"
[2021-12-02 21:25:34 sqlstore]: CREATE INDEX "graph_1_label_idx" ON "graph_1" ("label")
[2021-12-02 21:28:53 sqlstore]: ANALYZE "graph_1_label_idx"
[2021-12-02 21:29:07 sqlstore]: CREATE INDEX "graph_1_node2_label_node1_idx" ON "graph_1" ("node2", "label", "node1")
[2021-12-02 21:35:21 sqlstore]: ANALYZE "graph_1_node2_label_node1_idx"
node1	label	node2	id	rank	node2;wikidatatype
P10	P16

### Instantiate the Kypher API

In [97]:
# Shared Kypher API handle used by every query below; it opens the graph cache
# whose path is stored in the STORE environment variable.
kypher_api = kapi.KypherApi(
    graphcache=os.environ["STORE"],
    loglevel=0,
    index="auto",
    maxresults=1,
    maxcache=0,
)

### Find superclass links

In [102]:
# Kypher query: p279star edges that connect the two given qnodes, in either
# direction, with labels attached where the label graph has them.
find_superclasses_query = kypher_api.get_query(
    inputs=["p279star", "label"],
    match="p279star: (n1)-[edgeid {label: property}]->(n2)",
    # The pair may be given in either order.
    where="((n1 = $QNODE1 and n2 = $QNODE2) or (n2 = $QNODE1 and n1 = $QNODE2))",
    # Label lookups for both end nodes and for the property.
    opt1="label: (n1)-[]->(n1_label)",
    opt2="label: (property)-[]->(property_label)",
    opt3="label: (n2)-[]->(n2_label)",
    ret="""distinct
        n1 as node1, 
        n1_label as `node1;label`,
        property as label, 
        property_label as `property;label`, 
        n2 as node2, 
        n2_label as `node2;label`""",
    limit="$LIMIT",
)

def find_superclasses(qnode1, qnode2, limit=20):
    """Find P279star links between qnode1 and qnode2, in either direction.

    Runs `find_superclasses_query` with the two qnodes and the row limit bound
    to its $QNODE1, $QNODE2 and $LIMIT parameters, and returns the result in
    'df' format.
    """
    bindings = {'QNODE1': qnode1, 'QNODE2': qnode2, 'LIMIT': limit}
    return kypher_api.execute_query(find_superclasses_query, fmt='df', **bindings)

In [104]:
# Q21546392 is 'garlic' and Q2095 is 'food' (labels as shown in the output).
find_superclasses("Q21546392", "Q2095")

Unnamed: 0,node1,node1;label,label,property;label,node2,node2;label
0,Q21546392,'garlic'@en,P279star,,Q2095,'food'@en


### Find instance to class links

Traverse P279*

In [108]:
# Kypher query for instance-to-class links: n1 has a P31 edge to `class`, and
# a p279star edge leads from `class` to n2.
find_instance_class_links_query = kypher_api.get_query(
    inputs=["item", "p279star", "label"],
    match="""
        item: (n1)-[:P31]->(class),
        p279star: (class)-[edgeid {label: property}]->(n2)""",
    # The pair may be given in either order.
    where="((n1 = $QNODE1 and n2 = $QNODE2) or (n2 = $QNODE1 and n1 = $QNODE2))",
    # Label lookups for both end nodes and for the property.
    opt1="label: (n1)-[]->(n1_label)",
    opt2="label: (property)-[]->(property_label)",
    opt3="label: (n2)-[]->(n2_label)",
    ret="""distinct
        n1 as node1, 
        n1_label as `node1;label`,
        property as label, 
        property_label as `property;label`, 
        n2 as node2, 
        n2_label as `node2;label`""",
    limit="$LIMIT",
)

def find_instance_class_links(qnode1, qnode2, limit=20):
    """Find instance-to-class links between qnode1 and qnode2.

    Runs `find_instance_class_links_query`, which matches a node that has a
    P31 edge to a class from which a p279star edge leads to the other node.
    The two qnodes may be given in either order.

    Args:
        qnode1: first qnode id, bound to $QNODE1.
        qnode2: second qnode id, bound to $QNODE2.
        limit: maximum number of rows, bound to $LIMIT.

    Returns:
        The query result in 'df' format.
    """
    # Previously this called a non-existent `kypher_api.find_instance_class_links_query`
    # method and passed the superclass query; execute this cell's own query instead.
    return kypher_api.execute_query(
        find_instance_class_links_query,
        fmt='df',
        QNODE1=qnode1,
        QNODE2=qnode2,
        LIMIT=limit,
    )

### Find direct links

In [77]:
# Kypher query: single `item` edges that connect the two given qnodes, in
# either direction, skipping edges whose property is P279 or P1889.
find_direct_links_query = kypher_api.get_query(
    inputs=["item", "p279star", "label"],
    match="item: (n1)-[edgeid {label: property}]->(n2)",
    where=(
        '((n1 = $QNODE1 and n2 = $QNODE2) or (n2 = $QNODE1 and n1 = $QNODE2))'
        ' and property != "P279" and property != "P1889"'
    ),
    # Label lookups for both end nodes and for the property.
    opt1="label: (n1)-[]->(n1_label)",
    opt2="label: (property)-[]->(property_label)",
    opt3="label: (n2)-[]->(n2_label)",
    ret="""distinct
        n1 as node1, 
        n1_label as `node1;label`,
        property as label, 
        property_label as `property;label`, 
        n2 as node2, 
        n2_label as `node2;label`""",
    limit="$LIMIT",
)

def find_direct_links(qnode1, qnode2, limit=20):
    """Find single-edge links between qnode1 and qnode2, in either direction.

    Runs `find_direct_links_query` with the qnodes and row limit bound to its
    $QNODE1, $QNODE2 and $LIMIT parameters; the result comes back in 'df' format.
    """
    bindings = {'QNODE1': qnode1, 'QNODE2': qnode2, 'LIMIT': limit}
    return kypher_api.execute_query(find_direct_links_query, fmt='df', **bindings)

In [78]:
# Q81513 is 'Citrus' and Q1429336 is 'hesperidium' (labels as shown in the output).
find_direct_links("Q81513", "Q1429336")

Unnamed: 0,node1,node1;label,label,property;label,node2,node2;label
0,Q81513,'Citrus'@en,P4000,'has fruit type'@en,Q1429336,'hesperidium'@en


### Find class to class links

In [62]:
# Kypher query: follow a p279star edge from n1 to hop1, then one `item` edge
# (other than P279 / P1889) from hop1 to n2.
find_class_class_links_via_superclass_query = kypher_api.get_query(
    inputs=["item", "p279star", "label"],
    match="""
            p279star: (n1)-[]->(hop1),
            item: (hop1)-[edgeid {label: property}]->(n2)""",
    where=(
        '((n1 = $QNODE1 and n2 = $QNODE2) or (n2 = $QNODE1 and n1 = $QNODE2))'
        ' and property != "P279" and property != "P1889"'
    ),
    # Label lookups for the end nodes, the property and the intermediate hop.
    opt1="label: (n1)-[]->(n1_label)",
    opt2="label: (property)-[]->(property_label)",
    opt3="label: (n2)-[]->(n2_label)",
    opt4="label: (hop1)-[]->(hop1_label)",
    ret="""distinct
            n1 as node1, 
            n1_label as `node1;label`,
            property as label, 
            property_label as `property;label`, 
            n2 as node2, 
            n2_label as `node2;label`, 
            hop1 as hop1, 
            hop1_label as `hop1;label`
            """,
    limit="$LIMIT",
)

def find_class_class_links_via_superclass(qnode1, qnode2, limit=20):
    """Class-to-class use case: link a P279* of one qnode to the other qnode.

    Runs `find_class_class_links_via_superclass_query` with the qnodes and row
    limit bound to $QNODE1, $QNODE2 and $LIMIT; the result is in 'df' format.
    """
    bindings = {'QNODE1': qnode1, 'QNODE2': qnode2, 'LIMIT': limit}
    return kypher_api.execute_query(
        find_class_class_links_via_superclass_query, fmt='df', **bindings)

In [63]:
# Q183206 is 'vitamin B' and Q1422299 is 'nutriment' (labels as shown in the output).
find_class_class_links_via_superclass("Q183206", "Q1422299")

Unnamed: 0,node1,node1;label,label,property;label,node2,node2;label,hop1,hop1;label
0,Q183206,'vitamin B'@en,P460,'said to be the same as'@en,Q1422299,'nutriment'@en,Q181394,'nutrient'@en


### Find instance to class links via superclass

In [81]:
# Kypher query: n1 has a P31 edge to `class`, a p279star edge leads from
# `class` to hop1, and one `item` edge (other than P279 / P1889) goes from
# hop1 to n2.
find_instance_class_links_via_superclass_query = kypher_api.get_query(
    inputs=["item", "p279star", "label"],
    match="""
            item: (n1)-[:P31]->(class),
            p279star: (class)-[]->(hop1),
            item: (hop1)-[edgeid {label: property}]->(n2)""",
    where=(
        '((n1 = $QNODE1 and n2 = $QNODE2) or (n2 = $QNODE1 and n1 = $QNODE2))'
        ' and property != "P279" and property != "P1889"'
    ),
    # Label lookups for the end nodes, the property and the intermediate hop.
    opt1="label: (n1)-[]->(n1_label)",
    opt2="label: (property)-[]->(property_label)",
    opt3="label: (n2)-[]->(n2_label)",
    opt4="label: (hop1)-[]->(hop1_label)",
    ret="""distinct
            n1 as node1, 
            n1_label as `node1;label`,
            property as label, 
            property_label as `property;label`, 
            n2 as node2, 
            n2_label as `node2;label`, 
            hop1 as hop1, 
            hop1_label as `hop1;label`
            """,
    limit="$LIMIT",
)

def find_instance_class_links_via_superclass(qnode1, qnode2, limit=20):
    """Instance-to-class use case: link a P31/P279* of one qnode to the other.

    Runs `find_instance_class_links_via_superclass_query` with the qnodes and
    row limit bound to $QNODE1, $QNODE2 and $LIMIT; the result is in 'df' format.
    """
    bindings = {'QNODE1': qnode1, 'QNODE2': qnode2, 'LIMIT': limit}
    return kypher_api.execute_query(
        find_instance_class_links_via_superclass_query, fmt='df', **bindings)

In [82]:
# Same pair as the class-to-class example: Q183206 'vitamin B' and Q1422299
# 'nutriment'. The captured output for this variant has no rows.
find_instance_class_links_via_superclass("Q183206", "Q1422299")

Unnamed: 0,node1,node1;label,label,property;label,node2,node2;label,hop1,hop1;label


### Find class to class links via superclasses
This variant is more generous as it finds relations between the superclasses of both inputs and thus can produce unexpected results.
Suppose the inputs are car and sail: we can go from car up to vehicle, and from sail up to vehicle part, and discover the has-part relation between vehicle and vehicle part. That result would be wrong, as cars don't have sails.

In [83]:
# Kypher query: p279star edges lead from n1 to hop1 and from n2 to hop2, and
# one `item` edge (other than P279 / P1889) connects hop1 to hop2.
find_class_class_links_via_superclass_and_subclass_query = kypher_api.get_query(
    inputs=["item", "p279star", "label"],
    match="""
            p279star: (n1)-[]->(hop1),
            item: (hop1)-[edgeid {label: property}]->(hop2),
            p279star: (n2)-[]->(hop2)""",
    where=(
        '((n1 = $QNODE1 and n2 = $QNODE2) or (n2 = $QNODE1 and n1 = $QNODE2))'
        ' and property != "P279" and property != "P1889"'
    ),
    # Label lookups for the end nodes, the property and both intermediate hops.
    opt1="label: (n1)-[]->(n1_label)",
    opt2="label: (property)-[]->(property_label)",
    opt3="label: (n2)-[]->(n2_label)",
    opt4="label: (hop1)-[]->(hop1_label)",
    opt5="label: (hop2)-[]->(hop2_label)",
    ret="""distinct
            n1 as node1, 
            n1_label as `node1;label`,
            property as label, 
            property_label as `property;label`, 
            n2 as node2, 
            n2_label as `node2;label`,
            hop1 as hop1, 
            hop1_label as `hop1;label`, 
            hop2 as hop2, 
            hop2_label as `hop2;label`
            """,
    limit="$LIMIT",
)
    
def find_class_class_links_via_superclass_and_subclass(qnode1, qnode2, limit=20):
    """Class-to-class use case: link a P279* of qnode1 to a P279* of qnode2.

    Runs `find_class_class_links_via_superclass_and_subclass_query` with the
    qnodes and row limit bound to $QNODE1, $QNODE2 and $LIMIT; the result is
    in 'df' format.
    """
    bindings = {'QNODE1': qnode1, 'QNODE2': qnode2, 'LIMIT': limit}
    return kypher_api.execute_query(
        find_class_class_links_via_superclass_and_subclass_query, fmt='df', **bindings)

In [84]:
# Q199678 is 'vitamin C' and Q1093742 is 'lemon' (labels as shown in later
# outputs). The captured output for this call has no rows.
find_class_class_links_via_superclass_and_subclass("Q199678", "Q1093742")

Unnamed: 0,node1,node1;label,label,property;label,node2,node2;label,hop1,hop1;label,hop2,hop2;label


### Find instance to class links via superclasses

In [68]:
# Kypher query: n1 has a P31 edge to `class`, p279star edges lead from `class`
# to hop1 and from n2 to hop2, and one `item` edge (other than P279 / P1889)
# connects hop1 to hop2.
find_instance_class_links_via_superclass_and_subclass_query = kypher_api.get_query(
    inputs=["item", "p279star", "label"],
    match="""
            item: (n1)-[:P31]->(class),
            p279star: (class)-[]->(hop1),
            item: (hop1)-[edgeid {label: property}]->(hop2),
            p279star: (n2)-[]->(hop2)""",
    where=(
        '((n1 = $QNODE1 and n2 = $QNODE2) or (n2 = $QNODE1 and n1 = $QNODE2))'
        ' and property != "P279" and property != "P1889"'
    ),
    # Label lookups for the end nodes, the property and both intermediate hops.
    opt1="label: (n1)-[]->(n1_label)",
    opt2="label: (property)-[]->(property_label)",
    opt3="label: (n2)-[]->(n2_label)",
    opt4="label: (hop1)-[]->(hop1_label)",
    opt5="label: (hop2)-[]->(hop2_label)",
    ret="""distinct
            n1 as node1, 
            n1_label as `node1;label`,
            property as label, 
            property_label as `property;label`, 
            n2 as node2, 
            n2_label as `node2;label`,
            hop1 as hop1, 
            hop1_label as `hop1;label`, 
            hop2 as hop2, 
            hop2_label as `hop2;label`
            """,
    limit="$LIMIT",
)

def find_instance_class_links_via_superclass_and_subclass(qnode1, qnode2, limit=20):
    """Instance-to-class use case: link a P31/P279* of one qnode to a P279* of the other.

    Runs `find_instance_class_links_via_superclass_and_subclass_query` with the
    qnodes and row limit bound to $QNODE1, $QNODE2 and $LIMIT; the result is
    in 'df' format.
    """
    bindings = {'QNODE1': qnode1, 'QNODE2': qnode2, 'LIMIT': limit}
    return kypher_api.execute_query(
        find_instance_class_links_via_superclass_and_subclass_query, fmt='df', **bindings)

In [74]:
# Q199678 is 'vitamin C' and Q1093742 is 'lemon' (labels as shown in the
# output); limit=5 caps the result at five rows.
find_instance_class_links_via_superclass_and_subclass("Q199678", "Q1093742", limit=5)

Unnamed: 0,node1,node1;label,label,property;label,node2,node2;label,hop1,hop1;label,hop2,hop2;label
0,Q199678,'vitamin C'@en,P527,'has part'@en,Q1093742,'lemon'@en,Q223557,'physical object'@en,Q28732711,'physical substance'@en
1,Q199678,'vitamin C'@en,P2670,'has parts of the class'@en,Q1093742,'lemon'@en,Q28732711,'physical substance'@en,Q28728771,'material substance'@en
2,Q199678,'vitamin C'@en,P361,'part of'@en,Q1093742,'lemon'@en,Q28732711,'physical substance'@en,Q223557,'physical object'@en
3,Q199678,'vitamin C'@en,P1269,'facet of'@en,Q1093742,'lemon'@en,Q4406616,'concrete object'@en,Q488383,'object'@en
4,Q199678,'vitamin C'@en,P460,'said to be the same as'@en,Q1093742,'lemon'@en,Q488383,'object'@en,Q4406616,'concrete object'@en


### Find instance to instance links via superclasses

In [98]:
# Kypher query: n1 and n2 each have a P31 edge to a class (n1_class, n2_class),
# p279star edges lead from those classes to hop1 and hop2 respectively, and
# one `item` edge (other than P279 / P1889) connects hop1 to hop2.
find_instance_instance_links_via_superclass_and_subclass_query = kypher_api.get_query(
    inputs=["item", "p279star", "label"],
    match="""
            item: (n1)-[:P31]->(n1_class),
            p279star: (n1_class)-[]->(hop1),
            item: (hop1)-[edgeid {label: property}]->(hop2),
            item: (n2)-[:P31]->(n2_class),
            p279star: (n2_class)-[]->(hop2)""",
    where=(
        '((n1 = $QNODE1 and n2 = $QNODE2) or (n2 = $QNODE1 and n1 = $QNODE2))'
        ' and property != "P279" and property != "P1889"'
    ),
    # Label lookups for the end nodes, the property and both intermediate hops.
    opt1="label: (n1)-[]->(n1_label)",
    opt2="label: (property)-[]->(property_label)",
    opt3="label: (n2)-[]->(n2_label)",
    opt4="label: (hop1)-[]->(hop1_label)",
    opt5="label: (hop2)-[]->(hop2_label)",
    ret="""distinct
            n1 as node1, 
            n1_label as `node1;label`,
            property as label, 
            property_label as `property;label`, 
            n2 as node2, 
            n2_label as `node2;label`,
            hop1 as hop1, 
            hop1_label as `hop1;label`, 
            hop2 as hop2, 
            hop2_label as `hop2;label`
            """,
    limit="$LIMIT",
)
    
def find_instance_instance_links_via_superclass_and_subclass(qnode1, qnode2, limit=20):
    """Instance-to-instance use case: link a P31/P279* of qnode1 to a P31/P279* of qnode2.

    Runs `find_instance_instance_links_via_superclass_and_subclass_query` with
    the qnodes and row limit bound to $QNODE1, $QNODE2 and $LIMIT; the result
    is in 'df' format.
    """
    bindings = {'QNODE1': qnode1, 'QNODE2': qnode2, 'LIMIT': limit}
    return kypher_api.execute_query(
        find_instance_instance_links_via_superclass_and_subclass_query, fmt='df', **bindings)

In [99]:
# Q199678 is 'vitamin C' and Q1093742 is 'lemon' (labels as shown in the output).
find_instance_instance_links_via_superclass_and_subclass("Q199678", "Q1093742")

Unnamed: 0,node1,node1;label,label,property;label,node2,node2;label,hop1,hop1;label,hop2,hop2;label
0,Q199678,'vitamin C'@en,P527,'has part'@en,Q1093742,'lemon'@en,Q223557,'physical object'@en,Q28732711,'physical substance'@en
1,Q199678,'vitamin C'@en,P2670,'has parts of the class'@en,Q1093742,'lemon'@en,Q28732711,'physical substance'@en,Q28728771,'material substance'@en
2,Q199678,'vitamin C'@en,P361,'part of'@en,Q1093742,'lemon'@en,Q28732711,'physical substance'@en,Q223557,'physical object'@en
3,Q199678,'vitamin C'@en,P1269,'facet of'@en,Q1093742,'lemon'@en,Q4406616,'concrete object'@en,Q488383,'object'@en
4,Q199678,'vitamin C'@en,P460,'said to be the same as'@en,Q1093742,'lemon'@en,Q488383,'object'@en,Q4406616,'concrete object'@en
5,Q199678,'vitamin C'@en,P461,'opposite of'@en,Q1093742,'lemon'@en,Q5127848,'class'@en,Q23958946,'individual entity'@en
6,Q199678,'vitamin C'@en,P461,'opposite of'@en,Q1093742,'lemon'@en,Q7184903,'abstract object'@en,Q4406616,'concrete object'@en
7,Q199678,'vitamin C'@en,P461,'opposite of'@en,Q1093742,'lemon'@en,Q16686448,'artificial entity'@en,Q29651224,'natural object'@en
8,Q199678,'vitamin C'@en,P461,'opposite of'@en,Q1093742,'lemon'@en,Q8205328,'artificial physical object'@en,Q16686022,'natural physical object'@en
9,Q199678,'vitamin C'@en,P2670,'has parts of the class'@en,Q1093742,'lemon'@en,Q16887380,'group'@en,Q488383,'object'@en


In [106]:
# Q21546392 is 'garlic' and Q474191 is 'diet' (labels as shown in the output).
find_instance_class_links_via_superclass_and_subclass("Q21546392",  "Q474191")

Unnamed: 0,node1,node1;label,label,property;label,node2,node2;label,hop1,hop1;label,hop2,hop2;label
0,Q21546392,'garlic'@en,P366,'use'@en,Q474191,'diet'@en,Q1422299,'nutriment'@en,Q2138622,'nutrition'@en
1,Q21546392,'garlic'@en,P1542,'has effect'@en,Q474191,'diet'@en,Q1190554,'occurrence'@en,Q1190554,'occurrence'@en
2,Q21546392,'garlic'@en,P1542,'has effect'@en,Q474191,'diet'@en,Q1190554,'occurrence'@en,Q3249551,'process'@en
3,Q21546392,'garlic'@en,P828,'has cause'@en,Q474191,'diet'@en,Q1190554,'occurrence'@en,Q3249551,'process'@en
4,Q21546392,'garlic'@en,P527,'has part'@en,Q474191,'diet'@en,Q16887380,'group'@en,Q35120,'entity'@en
5,Q21546392,'garlic'@en,P1542,'has effect'@en,Q474191,'diet'@en,Q3249551,'process'@en,Q1190554,'occurrence'@en
6,Q21546392,'garlic'@en,P828,'has cause'@en,Q474191,'diet'@en,Q3249551,'process'@en,Q1190554,'occurrence'@en


In [107]:
# Same pair, Q21546392 'garlic' and Q474191 'diet', run through the
# class-to-class variant for comparison.
find_class_class_links_via_superclass_and_subclass("Q21546392",  "Q474191")

Unnamed: 0,node1,node1;label,label,property;label,node2,node2;label,hop1,hop1;label,hop2,hop2;label
0,Q21546392,'garlic'@en,P366,'use'@en,Q474191,'diet'@en,Q1422299,'nutriment'@en,Q2138622,'nutrition'@en
1,Q474191,'diet'@en,P460,'said to be the same as'@en,Q21546392,'garlic'@en,Q16722960,'phenomenon'@en,Q488383,'object'@en
2,Q474191,'diet'@en,P2670,'has parts of the class'@en,Q21546392,'garlic'@en,Q16887380,'group'@en,Q488383,'object'@en
3,Q474191,'diet'@en,P527,'has part'@en,Q21546392,'garlic'@en,Q16887380,'group'@en,Q35120,'entity'@en
4,Q474191,'diet'@en,P527,'has part'@en,Q21546392,'garlic'@en,Q66615814,"'diet, food and nutrition'@en",Q2095,'food'@en
