In [1]:
# Parameters

# Folder on the local machine in which the output and temporary folders live
output_path = "/Users/pedroszekely/Downloads/kypher"

# The names of the output and temporary folders
output_folder = "schwarzenegger"
temp_folder = "temp.schwarzenegger"

# Folder that holds the input graph files
data_folder = output_path + "/" + output_folder

# Location of the cache database for kypher. Derived from the settings above
# rather than repeating the absolute path, so it cannot drift when output_path
# or temp_folder is edited.
cache_path = output_path + "/" + temp_folder

# Shortcut for the kgtk command. Use "kgtk --debug" (without "time") to run
# the same command without timing information.
kgtk = "time kgtk --debug"

In [2]:
import io
import os
import subprocess
import sys

import numpy as np
import pandas as pd

import papermill as pm

In [7]:
# Files from the KGTK Wikidata distribution that this notebook reads, keyed by a short name.
file_names = {
    "all": "Q2685.graph.all.tsv.gz",
}

# Names of every environment variable exported by this cell. The shell commands
# in the notebook refer to files and tools through these variables.
kgtk_environment_variables = []

# One variable per input file: upper-cased key -> full path inside data_folder.
for short_name, file_name in file_names.items():
    env_name = short_name.upper()
    os.environ[env_name] = "{}/{}".format(data_folder, file_name)
    kgtk_environment_variables.append(env_name)

# KGTK creates a SQLite database to index the knowledge graph. It is placed in
# cache_path when that is set, otherwise in the temporary folder.
store_folder = cache_path if cache_path else "{}/{}".format(output_path, temp_folder)
os.environ['STORE'] = "{}/wikidata.sqlite3.db".format(store_folder)

# Folders for the temporary files and for the outputs.
for env_name, folder in (('TEMP', temp_folder), ('OUT', output_folder)):
    os.environ[env_name] = "{}/{}".format(output_path, folder)

# Environment variables with shortcuts to the commands we use often.
os.environ['kgtk'] = kgtk
# The --debug variant is useful for debugging, but careful: it causes import to
# dataframes to break. Alternatives without debug output:
#   "time kgtk query --graph-cache " + os.environ['STORE']
#   "kgtk query --graph-cache " + os.environ['STORE']
os.environ['kypher'] = "time kgtk --debug query --graph-cache " + os.environ['STORE']

# Remember the current working directory so other example notebooks can be called later.
os.environ["EXAMPLES_DIR"] = os.getcwd()

kgtk_environment_variables.extend(['STORE', 'TEMP', 'OUT', 'kgtk', 'kypher', 'EXAMPLES_DIR'])

# Show everything that was exported, in sorted order.
kgtk_environment_variables.sort()
for env_name in kgtk_environment_variables:
    print('{}: "{}"'.format(env_name, os.environ[env_name]))

ALL: "/Users/pedroszekely/Downloads/kypher/schwarzenegger/Q2685.graph.all.tsv.gz"
EXAMPLES_DIR: "/Users/pedroszekely/Downloads/kypher"
OUT: "/Users/pedroszekely/Downloads/kypher/schwarzenegger"
STORE: "/Users/pedroszekely/Downloads/kypher/temp.schwarzenegger/wikidata.sqlite3.db"
TEMP: "/Users/pedroszekely/Downloads/kypher/temp.schwarzenegger"
kgtk: "time kgtk --debug"
kypher: "time kgtk --debug query --graph-cache /Users/pedroszekely/Downloads/kypher/temp.schwarzenegger/wikidata.sqlite3.db"


In [8]:
def kgtk_to_dataframe(kgtk):
    """Convert tab-separated KGTK output lines into a pandas DataFrame.

    Parameters
    ----------
    kgtk : sequence of str
        Lines of tab-separated text. The first line is the header and
        supplies the column names; each remaining line is one row.

    Returns
    -------
    pandas.DataFrame
        One column per header field and one row per remaining line, with
        every value kept as a string. An empty DataFrame is returned when
        ``kgtk`` contains no lines at all.
    """
    if len(kgtk) == 0:
        # Nothing to parse: return an empty frame instead of failing with
        # IndexError on kgtk[0].
        return pd.DataFrame()

    columns = kgtk[0].split("\t")
    # A str needs no UTF-8 encode/decode round trip before it is split.
    rows = [line.split("\t") for line in kgtk[1:]]
    return pd.DataFrame(rows, columns=columns)

In [9]:
# Make output_path the working directory for the cells that follow.
%cd {output_path}

/Users/pedroszekely/Downloads/kypher


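Define the shortcuts for Kypher: load the graph file and register it under the alias `all`, so that later queries can refer to it with `-i all`.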

In [10]:
# Import the graph file "$ALL" into the Kypher graph cache under the alias `all`
# and print its first 3 rows as a quick sanity check.
!$kypher \
-i "$ALL" --as all \
--limit 3

[2021-09-28 08:50:09 sqlstore]: IMPORT graph directly into table graph_1 from /Users/pedroszekely/Downloads/kypher/schwarzenegger/Q2685.graph.all.tsv.gz ...
[2021-09-28 08:50:22 query]: SQL Translation:
---------------------------------------------
  SELECT *
     FROM graph_1 AS graph_1_c1
     LIMIT ?
  PARAS: [3]
---------------------------------------------
node1	label	node2	id
P10	datatype	commonsMedia	P10-datatype
P1000	P31	Q18608871	P1000-P31-Q18608871-093affb5-0
P1001	P1647	P276	P1001-P1647-P276-e4e44f83-0
       14.24 real        22.22 user         0.78 sys


# To Do
- Partition the graph into the usual Wikidata files
- Compute the derived files

# Explore The Arnold Schwarzenegger Graph `Q2685`



In [11]:
# Count the distinct nodes that are an instance of (P31) human (Q5).
!$kypher -i all \
--match '(n1)-[:P31]->(:Q5)' \
--return 'count(distinct n1)'

[2021-09-28 09:17:55 query]: SQL Translation:
---------------------------------------------
  SELECT count(DISTINCT graph_1_c1."node1")
     FROM graph_1 AS graph_1_c1
     WHERE graph_1_c1."label" = ?
        AND graph_1_c1."node2" = ?
  PARAS: ['P31', 'Q5']
---------------------------------------------
[2021-09-28 09:17:55 sqlstore]: CREATE INDEX on table graph_1 column label ...
[2021-09-28 09:17:57 sqlstore]: ANALYZE INDEX on table graph_1 column label ...
[2021-09-28 09:17:57 sqlstore]: CREATE INDEX on table graph_1 column node2 ...
[2021-09-28 09:17:59 sqlstore]: ANALYZE INDEX on table graph_1 column node2 ...
count(DISTINCT graph_1_c1."node1")
18446
        5.47 real         4.27 user         0.71 sys


Count organizations

In [12]:
# Count the distinct nodes that are an instance of (P31) organization (Q43229).
!$kypher -i all \
--match '(n1)-[:P31]->(:Q43229)' \
--return 'count(distinct n1)'

[2021-09-28 09:19:48 query]: SQL Translation:
---------------------------------------------
  SELECT count(DISTINCT graph_1_c1."node1")
     FROM graph_1 AS graph_1_c1
     WHERE graph_1_c1."label" = ?
        AND graph_1_c1."node2" = ?
  PARAS: ['P31', 'Q43229']
---------------------------------------------
count(DISTINCT graph_1_c1."node1")
614
        1.35 real         0.85 user         0.21 sys


Schwarzenegger films. The list may be incomplete: a complete query needs P279star, but we don't have those files yet.

In [15]:
# List the distinct nodes that are an instance of (P31) film (Q11424) and that
# have Arnold Schwarzenegger (Q2685) as a cast member (P161).
!$kypher -i all \
--match ' \
    (film)-[:P31]->(:Q11424), \
    (film)-[:P161]->(:Q2685)' \
--return 'distinct film' 

[2021-09-28 09:24:17 query]: SQL Translation:
---------------------------------------------
  SELECT DISTINCT graph_1_c1."node1"
     FROM graph_1 AS graph_1_c1
     INNER JOIN graph_1 AS graph_1_c2
     ON graph_1_c1."node1" = graph_1_c2."node1"
        AND graph_1_c1."label" = ?
        AND graph_1_c1."node2" = ?
        AND graph_1_c2."label" = ?
        AND graph_1_c2."node2" = ?
  PARAS: ['P31', 'Q11424', 'P161', 'Q2685']
---------------------------------------------
node1
Q15140437
Q162255
Q170564
Q200804
Q29054009
Q309003
Q39072454
Q740516
        1.52 real         0.93 user         0.24 sys
