In [4]:
# Parameters
# All user-tunable settings of the notebook live in this cell.

# Folder on the local machine under which the output and temporary folders are created
output_path = "/Users/pedroszekely/Downloads/kypher"

# The names of the output and temporary folders (both relative to output_path)
output_folder = "movies"
temp_folder = "temp.movies"

# The location of the input Wikidata files
wikidata_folder = "/Volumes/GoogleDrive/Shared drives/KGTK/datasets/wikidata-20210215-dwd/"
# wikidata_folder = "/Users/pedroszekely/Downloads/kypher/wikidata_os_v1/"
# The wikidata_os files can be downloaded from https://drive.google.com/drive/folders/1V6oAQKmwQ4LJnrBai-uv5gHWphFSCt50?usp=sharing

# Folder that holds the kypher cache database; the file name "wikidata.sqlite3.db"
# is appended to it when the STORE variable is built. If left empty, the database
# is placed in the temporary folder instead.
cache_path = "/Users/pedroszekely/Downloads/kypher/temp.novartis"
# cache_path = "/Users/pedroszekely/Downloads/kypher/temp.useful_wikidata_files_v4/wikidata.sqlite3.db"
# Whether to delete the cache database
# NOTE(review): not referenced in the visible part of the notebook -- confirm it is still used.
delete_database = False

# Shortcut for the kgtk command; it is also exported as the environment variable `kgtk`.
# The `time` prefix makes every command report how long it took.
kgtk = "time kgtk --debug"
# kgtk = "kgtk --debug"

In [5]:
import io
import os
import subprocess
import sys

import numpy as np
import pandas as pd

import altair as alt

import papermill as pm

In [6]:
def kgtk_to_dataframe(kgtk):
    """Convert the captured text output of a KGTK command into a DataFrame.

    Parameters
    ----------
    kgtk : sequence of str
        Lines of tab-separated text. The first line is the header and every
        following line is one data row.

    Returns
    -------
    pandas.DataFrame
        One row per data line, with the header fields as column names and
        every value kept as a string. An empty DataFrame is returned when
        `kgtk` contains no lines at all (e.g. the command printed nothing).
    """
    # Nothing was captured: return an empty frame instead of failing on kgtk[0]
    if not kgtk:
        return pd.DataFrame()

    header, *rows = kgtk
    columns = header.split("\t")
    return pd.DataFrame([row.split("\t") for row in rows], columns=columns)

In [7]:
# The names of files in the KGTK Wikidata distribution that we will use in this notebook.
file_names = {
    "claims": "claims.tsv.gz",
    "quantity": "claims.quantity.tsv.gz",
    "label": "labels.en.tsv.gz",
    "alias": "aliases.en.tsv.gz",
    "description": "descriptions.en.tsv.gz",
    "item": "claims.wikibase-item.tsv.gz",
    "qualifiers": "qualifiers.tsv.gz",
    "sitelinks": "sitelinks.tsv.gz",
    "qualifiers_time": "qualifiers.time.tsv.gz",
    "property_datatypes": "metadata.property.datatypes.tsv.gz",
    "isa": "derived.isa.tsv.gz",
    "p279star": "derived.P279star.tsv.gz",
    "p279": "derived.P279.tsv.gz",
    "p31": "derived.P31.tsv.gz",
    "dwd_isa": "derived.dwd_isa.tsv.gz"
}

# We will define environment variables to hold the full paths to the files as we will use them in the shell commands
kgtk_environment_variables = []

os.environ['WIKIDATA'] = wikidata_folder
kgtk_environment_variables.append('WIKIDATA')

# One variable per input file, named after the upper-cased key (e.g. CLAIMS, LABEL)
for key, value in file_names.items():
    variable = key.upper()
    # os.path.join works whether or not wikidata_folder ends with a slash
    os.environ[variable] = os.path.join(wikidata_folder, value)
    kgtk_environment_variables.append(variable)

# KGTK creates a SQLite database to index the knowledge graph.
# It lives in cache_path when that is set, otherwise in the temporary folder.
if cache_path:
    os.environ['STORE'] = "{}/wikidata.sqlite3.db".format(cache_path)
else:
    os.environ['STORE'] = "{}/{}/wikidata.sqlite3.db".format(output_path, temp_folder)
kgtk_environment_variables.append('STORE')

# We will create many temporary files, so set up a folder for outputs and one for the temporary files.
os.environ['TEMP'] = "{}/{}".format(output_path, temp_folder)
os.environ['OUT'] = "{}/{}".format(output_path, output_folder)
kgtk_environment_variables.append('TEMP')
kgtk_environment_variables.append('OUT')

# Environment variables with shortcuts to the commands we use often
os.environ['kgtk'] = kgtk
# Debugging variant of the kypher shortcut. Use with care: it causes the import of
# query results into dataframes to break. Uncomment to use it instead of the line below.
# os.environ['kypher'] = "time kgtk --debug query --graph-cache " + os.environ['STORE']
os.environ['kypher'] = "kgtk query --graph-cache " + os.environ['STORE']
kgtk_environment_variables.append('kgtk')
kgtk_environment_variables.append('kypher')

# We'll save the current working directory so we can call into other example notebooks later
os.environ["EXAMPLES_DIR"] = os.getcwd()
kgtk_environment_variables.append('EXAMPLES_DIR')

# Print every variable we defined, sorted by name, so the configuration is visible in the notebook
kgtk_environment_variables.sort()
for variable in kgtk_environment_variables:
    print("{}: \"{}\"".format(variable, os.environ[variable]))

ALIAS: "/Volumes/GoogleDrive/Shared drives/KGTK/datasets/wikidata-20210215-dwd/aliases.en.tsv.gz"
CLAIMS: "/Volumes/GoogleDrive/Shared drives/KGTK/datasets/wikidata-20210215-dwd/claims.tsv.gz"
DESCRIPTION: "/Volumes/GoogleDrive/Shared drives/KGTK/datasets/wikidata-20210215-dwd/descriptions.en.tsv.gz"
DWD_ISA: "/Volumes/GoogleDrive/Shared drives/KGTK/datasets/wikidata-20210215-dwd/derived.dwd_isa.tsv.gz"
EXAMPLES_DIR: "/Users/pedroszekely/Documents/GitHub/kgtk/examples"
ISA: "/Volumes/GoogleDrive/Shared drives/KGTK/datasets/wikidata-20210215-dwd/derived.isa.tsv.gz"
ITEM: "/Volumes/GoogleDrive/Shared drives/KGTK/datasets/wikidata-20210215-dwd/claims.wikibase-item.tsv.gz"
LABEL: "/Volumes/GoogleDrive/Shared drives/KGTK/datasets/wikidata-20210215-dwd/labels.en.tsv.gz"
OUT: "/Users/pedroszekely/Downloads/kypher/movies"
P279: "/Volumes/GoogleDrive/Shared drives/KGTK/datasets/wikidata-20210215-dwd/derived.P279.tsv.gz"
P279STAR: "/Volumes/GoogleDrive/Shared drives/KGTK/datasets/wikidata-202102

In [8]:
# Change the working directory to output_path (set in the parameters cell)
%cd {output_path}

/Users/pedroszekely/Downloads/kypher


The ISA file is missing the identifiers and we need them, so this is a patch to add them

In [9]:
# !kgtk add-id --id-style wikidata -i "$ISA" -o "$TEMP"/derived.isa.tsv.gz

In [10]:
# Point ISA at the copy with identifiers in the temporary folder.
# That file is written by the `kgtk add-id` command in the previous cell, which is
# commented out, so the file must already exist from an earlier run.
os.environ['ISA'] = os.environ['TEMP'] + "/derived.isa.tsv.gz"

Define the shortcuts for Kypher

In [11]:
# Give each input file a short alias (--as) so that later queries can refer to it
# by name, e.g. `-i claims` or `-i labels`. --limit 10 keeps the printed output small.
!$kypher \
-i "$CLAIMS" --as claims \
-i "$P31" --as p31 \
-i "$P279" --as p279 \
-i "$ISA" --as isa \
-i "$DWD_ISA" --as dwd_isa \
-i "$LABEL" --as labels \
-i "$ALIAS" --as aliases \
-i "$P279STAR" --as p279star \
-i "$QUALIFIERS" --as qualifiers \
-i "$ITEM" --as items \
-i "$QUANTITY" --as quantities \
--limit 10

id	node1	label	node2	rank	node2;wikidatatype
P10-P1628-32b85d-7927ece6-0	P10	P1628	"http://www.w3.org/2006/vcard/ns#Video"	normal	url
P10-P1628-acf60d-b8950832-0	P10	P1628	"https://schema.org/video"	normal	url
P10-P1629-Q34508-bcc39400-0	P10	P1629	Q34508	normal	wikibase-item
P10-P1659-P1651-c4068028-0	P10	P1659	P1651	normal	wikibase-property
P10-P1659-P18-5e4b9c4f-0	P10	P1659	P18	normal	wikibase-property
P10-P1659-P4238-d21d1ac0-0	P10	P1659	P4238	normal	wikibase-property
P10-P1659-P51-86aca4c5-0	P10	P1659	P51	normal	wikibase-property
P10-P1855-Q15075950-7eff6d65-0	P10	P1855	Q15075950	normal	wikibase-item
P10-P1855-Q4504-a69d2c73-0	P10	P1855	Q4504	normal	wikibase-item
P10-P1855-Q69063653-c8cdb04c-0	P10	P1855	Q69063653	normal	wikibase-item


Mini node browser to help with debugging

In [12]:
def display_node(node):
    """Return the labeled outgoing edges of `node` as a DataFrame.

    Runs a Kypher query over the `claims` and `labels` graphs that selects the
    edges whose node1 is `node`, joined with the labels of the property and of
    node2. Edges whose property or node2 has no entry in `labels` are therefore
    not returned.

    Parameters
    ----------
    node : str
        Identifier of the node to inspect, e.g. "P10".

    Returns
    -------
    pandas.DataFrame
        Columns `label`, `label;label`, `node2` and `node2;label`, ordered by
        property.
    """
    template = """ -i claims -i labels \
    --match '(q)-[l {label: property}]->(n2), labels: (property)-[]->(property_label), labels: (n2)-[]->(n2_label)' \
    --return 'distinct property as label, property_label as `label;label`, n2 as node2, n2_label as `node2;label`' \
    --order-by 'property' \
    --where 'q = "__query__"'""".replace("__query__", node)

    # Use the graph cache configured for this notebook (STORE) rather than a
    # hard-coded path, so the function follows the cache_path parameter.
    # It is passed through a Python local because IPython only expands {template}
    # when every placeholder in the command resolves to a Python name.
    store = os.environ['STORE']
    lines = !kgtk query --graph-cache "{store}" {template}
    return kgtk_to_dataframe(lines)

In [13]:
# Example: browse the labeled statements of property P10
display_node("P10")

Unnamed: 0,label,label;label,node2,node2;label
0,P1629,'subject item of this property'@en,Q34508,'videotape recording'@en
1,P1659,'see also'@en,P1651,'YouTube video ID'@en
2,P1659,'see also'@en,P18,'image'@en
3,P1659,'see also'@en,P4238,'webcam page URL'@en
4,P1659,'see also'@en,P51,'audio'@en
5,P1855,'Wikidata property example'@en,Q4504,'Varanus komodoensis'@en
6,P1855,'Wikidata property example'@en,Q69063653,'Couch Commander'@en
7,P1855,'Wikidata property example'@en,Q7378,'elephant'@en
8,P2302,'property constraint'@en,Q21502404,'format constraint'@en
9,P2302,'property constraint'@en,Q21510851,'allowed qualifiers constraint'@en


# Build Files For Path-Based Similarity

Get the P106 (occupation) and P39 (position held) edges.

In [14]:
# Keep only the P106 (occupation) edges of the wikibase-item claims file
!$kgtk filter -i "$ITEM" -p '; P106 ;' -o "$TEMP"/P106.tsv.gz

      734.09 real       600.22 user        19.11 sys


In [15]:
# Keep only the P39 (position held) edges of the wikibase-item claims file
!$kgtk filter -i "$ITEM" -p '; P39 ;' -o "$TEMP"/P39.tsv.gz

      521.62 real       509.41 user         1.24 sys


Write files with the inverses of the relations we want to use for traversing paths. Designate the inverse property using a trailing underscore, e.g., `P31_` is the inverse of `P31`.

In [77]:
# Inverse of P279: swap node1 and node2 of every edge, label the result P279_, then add edge ids
!$kypher -i p279 \
--match '(n1)-[]->(n2)' \
--return 'n2 as node1, "P279_" as label, n1 as node2' \
/ add-id --id-style wikidata \
-o "$TEMP"/P279_.tsv.gz

In [78]:
# Inverse of P31: swap node1 and node2 of every edge, label the result P31_, then add edge ids
!$kypher -i p31 \
--match '(n1)-[]->(n2)' \
--return 'n2 as node1, "P31_" as label, n1 as node2' \
/ add-id --id-style wikidata \
-o "$TEMP"/P31_.tsv.gz

In [79]:
# Inverse of P106, built from the filtered P106 file written earlier in this section
!$kypher -i "$TEMP"/P106.tsv.gz \
--match '(n1)-[]->(n2)' \
--return 'n2 as node1, "P106_" as label, n1 as node2' \
/ add-id --id-style wikidata \
-o "$TEMP"/P106_.tsv.gz

In [80]:
# Inverse of P39, built from the filtered P39 file written earlier in this section
!$kypher -i "$TEMP"/P39.tsv.gz \
--match '(n1)-[]->(n2)' \
--return 'n2 as node1, "P39_" as label, n1 as node2' \
/ add-id --id-style wikidata \
-o "$TEMP"/P39_.tsv.gz

Connect the occupations and positions held to human

In [81]:
# Hand-written edges that link person and occupation to human (Q5), each with its
# inverse (trailing underscore). The with block closes the file on exit, so no
# explicit close() call is needed.
with open(os.environ['TEMP'] + '/custom-edges.tsv', 'w') as fp:
    fp.write("node1\tlabel\tnode2\n")
    fp.write("Q215627\tdwd_isa\tQ5\n") # person dwd_isa human
    fp.write("Q12737077\tdwd_isa\tQ5\n") # occupation dwd_isa human (perhaps controversial)
    fp.write("Q5\tdwd_isa_\tQ215627\n") # inverse
    fp.write("Q5\tdwd_isa_\tQ12737077\n") # inverse

Concatenating the files gives us all the edges to compute similarity paths

`similarity.P31.P279.P106.P39.tsv.gz` can be used to test similarity of instances as it contains the P31 edges

In [None]:
# Large graph: P279 and P31 with their inverses, P39, P106 and the custom edges;
# duplicates are removed and edge ids are added.
# NOTE(review): P39_.tsv.gz and P106_.tsv.gz are written above but are not part of
# this concatenation -- confirm that leaving them out is intentional.
!$kgtk cat \
    -i "$P279" -i "$TEMP"/P279_.tsv.gz \
    -i "$P31" -i "$TEMP"/P31_.tsv.gz \
    -i "$TEMP"/P39.tsv.gz \
    -i "$TEMP"/P106.tsv.gz \
    -i "$TEMP"/custom-edges.tsv \
/ deduplicate \
/ add-id --id-style wikidata \
-o "$TEMP"/similarity.P31.P279.P106.P39.tsv.gz 

`similarity.P279.P106.P39.tsv.gz` can be used to test similarity of classes; it is much smaller and runs much faster

In [None]:
# Small graph: same inputs as the large one but without the P31 edges and their inverses.
# NOTE(review): P39_.tsv.gz and P106_.tsv.gz are written above but are not part of
# this concatenation -- confirm that leaving them out is intentional.
!$kgtk cat \
    -i "$P279" -i "$TEMP"/P279_.tsv.gz \
    -i "$TEMP"/P39.tsv.gz \
    -i "$TEMP"/P106.tsv.gz \
    -i "$TEMP"/custom-edges.tsv \
/ deduplicate \
/ add-id --id-style wikidata \
-o "$TEMP"/similarity.P279.P106.P39.tsv.gz 

Sample file to test similarity paths

In [None]:
# Pairs of nodes between which paths are computed; one edge per pair, the label is a placeholder.
# Uncomment a line to try another pair. The with block closes the file on exit, so no
# explicit close() call is needed.
with open(os.environ['TEMP'] + '/paths.tsv', 'w') as fp:
    fp.write("node1\tlabel\tnode2\n")
    #fp.write("Q2685\tdummy\tQ483501\n")
    #fp.write("Q2685\tdummy\tQ76\n")
    #fp.write("Q124072\tdummy\tQ22731\n") #rifle, stone
    #fp.write("Q124072\tdummy\tQ728\n")
    #fp.write("Q124072\tdummy\tQ1907525\n")
    fp.write("Q1028181\tdummy\tQ177220\n") #painter, singer
    #fp.write("Q484876\tdummy\tQ30461\n") # ceo, president

Compute paths on the small graph containing only classes. 

Right now we have to guess the max-hops that produces the shortest paths. We need a version of the `paths` command that computes all the shortest paths using the graphtools function that can do it.

In [None]:
# Paths of at most 5 hops on the small (classes-only) graph, for the pairs listed in paths.tsv
!$kgtk paths -i "$TEMP"/similarity.P279.P106.P39.tsv.gz --path-file "$TEMP"/paths.tsv \
    --max_hops 5 \
    --statistics-only True \
    -o "$TEMP"/path_output.tsv 

Compute paths on the large graph

In [66]:
# Paths of at most 4 hops on the large graph (includes the P31 edges), for the same pairs
!$kgtk paths -i "$TEMP"/similarity.P31.P279.P106.P39.tsv.gz --path-file "$TEMP"/paths.tsv \
    --max_hops 4 \
    --statistics-only True \
    -o "$TEMP"/path_output_large.tsv 


	Using the fallback 'C' locale.
      686.81 real       665.24 user        18.58 sys


Query to add labels to the paths so that they are understandable. Note: this query should use optionals as edges in the path will disappear if one of the nodes doesn't have an English label

In [76]:
# Join every step of each path with its edge in the similarity graph and with the
# English labels of both endpoints, then list the steps in path order.
# The similarity file is the one written by the `kgtk cat` cell above (the name ends
# in P106.P39; no cell in this notebook creates a file ending in P106.P31).
!$kypher -i "$TEMP"/path_output.tsv -i "$TEMP"/similarity.P31.P279.P106.P39.tsv.gz -i labels \
--match ' \
    path: (path)-[l {label: index}]->(edge_id), \
    similarity: (source)-[edge_id {label: property}]->(destination), \
    labels: (source)-[]->(source_label), \
    labels: (destination)-[]->(destination_label)' \
--return 'distinct path as path_id, index as index, edge_id as edge_id, source_label as source_label, destination_label as destination_label, property as property' \
--order-by 'path, cast(index, int)'

path_id	index	edge_id	source_label	destination_label	property
p0	0	Q1028181-P279-Q3391743-bf40700d-0	'painter'@en	'visual artist'@en	P279
p0	1	Q3391743-P279-Q483501-9e159a78-0	'visual artist'@en	'artist'@en	P279
p0	2	Q483501-P279_-Q639669	'artist'@en	'musician'@en	P279_
p0	3	Q639669-P279_-Q1369991	'musician'@en	'deejay'@en	P279_
p0	4	Q1369991-P279-Q177220-8854f969-0	'deejay'@en	'singer'@en	P279
p1	0	Q1028181-P279-Q3391743-bf40700d-0	'painter'@en	'visual artist'@en	P279
p1	1	Q3391743-P279-Q483501-9e159a78-0	'visual artist'@en	'artist'@en	P279
p1	2	Q483501-P279_-Q639669	'artist'@en	'musician'@en	P279_
p1	3	Q639669-P279_-Q186370	'musician'@en	'troubadour'@en	P279_
p1	4	Q186370-P279-Q177220-37970699-0	'troubadour'@en	'singer'@en	P279
p2	0	Q1028181-P279-Q3391743-bf40700d-0	'painter'@en	'visual artist'@en	P279
p2	1	Q3391743-P279-Q483501-9e159a78-0	'visual artist'@en	'artist'@en	P279
p2	2	Q483501-P279_-Q639669	'artist'@en	'musician'@en	P279_
p2	3	Q639669-P279_-Q2330480	'musician'@en	'kobzar'@