In [1]:
import io
import os
import subprocess
import sys

import numpy as np
import pandas as pd

from IPython.display import display, HTML

import papermill as pm
from configure_kgtk_notebooks import ConfigureKGTK

User home: /Users/amandeep
Current dir: /Users/amandeep/Github/kgtk/examples
Use-cases dir: /Users/amandeep/Github/kgtk/use-cases


In [2]:
# Parameters
# NOTE(review): these literal defaults are specific to the original author's machine;
# presumably they are meant to be overridden (e.g. via papermill) before running elsewhere.

# Folder on local machine where to create the output and temporary folders
# input_path: folder holding the input graph files (reported as GRAPH by the config step).
input_path = "/Users/amandeep/iswc-2021-tutorial"
# output_path: parent folder of the project output folder (OUT is output_path/project_name).
output_path = "/Users/amandeep/iswc-2021-tutorial"
# project_name: names the output folder and its temp folder (temp.<project_name>).
project_name = "arnold"
# files: comma-separated keys of the graph files to use; split into a list in the next cell.
files = "all,label"

In [3]:
# Turn the comma-separated `files` parameter into a list of file keys.
# The isinstance guard keeps this cell idempotent: on a re-run `files` is
# already a list and is left untouched instead of failing on `.split`.
# Surrounding whitespace and empty entries (e.g. "all, label,") are dropped.
if isinstance(files, str):
    files = [key.strip() for key in files.split(",") if key.strip()]

In [4]:
# Build the KGTK notebook configuration from the parameters above.
ck = ConfigureKGTK()
ck.configure_kgtk(
    input_graph_path=input_path,
    output_path=output_path,
    project_name=project_name,
)

In [5]:
# Print the environment variables prepared by `ck`, including the resolved path for each key in `files`.
ck.print_env_variables(files)

EXAMPLES_DIR: /Users/amandeep/Github/kgtk/examples
USE_CASES_DIR: /Users/amandeep/Github/kgtk/use-cases
GRAPH: /Users/amandeep/iswc-2021-tutorial
OUT: /Users/amandeep/iswc-2021-tutorial/arnold
TEMP: /Users/amandeep/iswc-2021-tutorial/arnold/temp.arnold
STORE: /Users/amandeep/iswc-2021-tutorial/arnold/temp.arnold/wikidata.sqlite3.db
kgtk: kgtk --debug
kypher: kgtk --debug query --graph-cache /Users/amandeep/iswc-2021-tutorial/arnold/temp.arnold/wikidata.sqlite3.db
all: /Users/amandeep/iswc-2021-tutorial/all.tsv.gz
label: /Users/amandeep/iswc-2021-tutorial/labels.en.tsv.gz


In [6]:
# Import the listed files into the graph cache. As the echoed command shows, this runs a
# `--limit 3` Kypher query that registers each file under its key as an alias.
ck.load_files_into_cache(file_list=files)

kgtk --debug query --graph-cache /Users/amandeep/iswc-2021-tutorial/arnold/temp.arnold/wikidata.sqlite3.db -i "/Users/amandeep/iswc-2021-tutorial/all.tsv.gz" --as all  -i "/Users/amandeep/iswc-2021-tutorial/labels.en.tsv.gz" --as label  --limit 3
[2021-10-01 09:50:12 sqlstore]: IMPORT graph directly into table graph_1 from /Users/amandeep/iswc-2021-tutorial/all.tsv.gz ...
[2021-10-01 09:50:20 sqlstore]: IMPORT graph directly into table graph_2 from /Users/amandeep/iswc-2021-tutorial/labels.en.tsv.gz ...
[2021-10-01 09:50:20 query]: SQL Translation:
---------------------------------------------
  SELECT *
     FROM graph_1 AS graph_1_c1
     LIMIT ?
  PARAS: [3]
---------------------------------------------
node1	label	node2	id
P10	P31	Q18610173	P10-P31-Q18610173-85ef4d24-0
P1000	P31	Q18608871	P1000-P31-Q18608871-093affb5-0
P1001	P1647	P276	P1001-P1647-P276-e4e44f83-0


In [7]:
# Expose the label file path under KGTK_LABEL_FILE (presumably read by `add-labels` — confirm).
os.environ['KGTK_LABEL_FILE'] = os.environ['label']

In [8]:
# Shell command prefixes taken from the environment; `$kypher` is interpolated into the `!` shell cells below.
kypher = os.environ['kypher']
kgtk = os.environ['kgtk']

In [9]:
def kgtk_to_dataframe(kgtk):
    """Convert lines of tab-separated KGTK output into a pandas DataFrame.

    Parameters
    ----------
    kgtk : sequence of str
        Tab-separated text lines. The first line is the header row; every
        following line is one data row.

    Returns
    -------
    pandas.DataFrame
        One column per header field, with every value kept as a string.
        An empty input yields an empty DataFrame; a header-only input
        yields a DataFrame with the header's columns and no rows.
    """
    # Guard against empty output (e.g. a command that printed nothing);
    # indexing the first line would otherwise raise IndexError.
    if len(kgtk) == 0:
        return pd.DataFrame()

    header, *rows = kgtk
    return pd.DataFrame(
        [row.split("\t") for row in rows],
        columns=header.split("\t"),
    )

Define the shortcuts for Kypher

# To Do
- Partition the graph into the usual Wikidata files
- Compute the derived files

# Explore The Arnold Schwarzenegger Graph `Q2685`



In [10]:
# Count distinct nodes that have a P31 edge to Q5 (in Wikidata terms: instances of "human").
!$kypher -i all \
--match '(n1)-[:P31]->(:Q5)' \
--return 'count(distinct n1)'

[2021-10-01 09:50:47 query]: SQL Translation:
---------------------------------------------
  SELECT count(DISTINCT graph_1_c1."node1")
     FROM graph_1 AS graph_1_c1
     WHERE graph_1_c1."label" = ?
        AND graph_1_c1."node2" = ?
  PARAS: ['P31', 'Q5']
---------------------------------------------
[2021-10-01 09:50:47 sqlstore]: CREATE INDEX on table graph_1 column label ...
[2021-10-01 09:50:48 sqlstore]: ANALYZE INDEX on table graph_1 column label ...
[2021-10-01 09:50:48 sqlstore]: CREATE INDEX on table graph_1 column node2 ...
[2021-10-01 09:50:50 sqlstore]: ANALYZE INDEX on table graph_1 column node2 ...
count(DISTINCT graph_1_c1."node1")
10918


Count organizations (the `Q43229` query appears further below, after the edge counts)

Number of edges in the graph. Note: this counts every edge in `all`, so the qualifier edges (counted separately below) are included.

In [11]:
# Count distinct edge ids over every edge in `all` (the pattern places no restriction on the edge).
!$kypher -i all \
--match '()-[l]->()' \
--return 'count(distinct l)'

[2021-10-01 09:50:53 query]: SQL Translation:
---------------------------------------------
  SELECT count(DISTINCT graph_1_c1."id")
     FROM graph_1 AS graph_1_c1
  PARAS: []
---------------------------------------------
count(DISTINCT graph_1_c1."id")
1523735


Count the number of qualifier edges

In [12]:
# Count distinct edges `q` whose node1 is the id of another edge `l`, i.e. edges attached to edges.
!$kypher -i all \
--match '()-[l]->(), (l)-[q]->()' \
--return 'count(distinct q)'

[2021-10-01 09:50:57 query]: SQL Translation:
---------------------------------------------
  SELECT count(DISTINCT graph_1_c2."id")
     FROM graph_1 AS graph_1_c1
     INNER JOIN graph_1 AS graph_1_c2
     ON graph_1_c1."id" = graph_1_c2."node1"
  PARAS: []
---------------------------------------------
[2021-10-01 09:50:57 sqlstore]: CREATE INDEX on table graph_1 column node1 ...
[2021-10-01 09:50:58 sqlstore]: ANALYZE INDEX on table graph_1 column node1 ...
[2021-10-01 09:50:58 sqlstore]: CREATE INDEX on table graph_1 column id ...
[2021-10-01 09:51:00 sqlstore]: ANALYZE INDEX on table graph_1 column id ...
count(DISTINCT graph_1_c2."id")
270275


In [13]:
# Count distinct nodes that have a P31 edge to Q43229 (presumably Wikidata's "organization" class — confirm).
!$kypher -i all \
--match '(n1)-[:P31]->(:Q43229)' \
--return 'count(distinct n1)'

[2021-10-01 09:51:02 query]: SQL Translation:
---------------------------------------------
  SELECT count(DISTINCT graph_1_c1."node1")
     FROM graph_1 AS graph_1_c1
     WHERE graph_1_c1."label" = ?
        AND graph_1_c1."node2" = ?
  PARAS: ['P31', 'Q43229']
---------------------------------------------
count(DISTINCT graph_1_c1."node1")
206


Schwarzenegger films. The list may be incomplete because we need to use P279star, but we don't have those files yet.

In [14]:
# Find nodes with a P161 edge to Q2685, add labels, and capture the HTML output as a list of lines in `h`.
h = !$kypher -i all \
--match ' \
    (film)-[:P161]->(:Q2685)' \
--return 'distinct film as id' \
/ add-labels / html

# Join the captured lines back into one string and render it as HTML in the notebook.
display(HTML(" ".join(h)))

id,id;label
Q110397,'True Lies'@en
Q15140437,'Terminator Genisys'@en
Q162255,'The Terminator'@en
Q170564,'Terminator 2: Judgment Day'@en
Q200804,'Predator'@en
Q222018,'Total Recall'@en
Q2842976,'American Masters'@en
Q29054009,'Terminator 3: Rise of the Machines'@en
Q309003,'Conan the Barbarian'@en
Q370326,'Eraser'@en


In [15]:
!$kypher -i all \
--match ' \
    (film)-[:P166]->(:Q630018)' \
--return 'distinct film as id' \
/ add-labels

[2021-10-01 09:51:07 query]: SQL Translation:
---------------------------------------------
  SELECT DISTINCT graph_1_c1."node1" "_aLias.id"
     FROM graph_1 AS graph_1_c1
     WHERE graph_1_c1."label" = ?
        AND graph_1_c1."node2" = ?
  PARAS: ['P166', 'Q630018']
---------------------------------------------
id	id;label
Q11975	'Britney Spears'@en
Q162389	'Tony Curtis'@en
Q212648	'Rudy Giuliani'@en
Q214574	'Jan Josef Liefers'@en
Q221074	'Bud Spencer'@en
Q229760	'Rita Ora'@en
Q243430	'Terence Hill'@en
Q2685	'Arnold Schwarzenegger'@en
Q312674	'Giorgio Moroder'@en
Q342617	'Ben Whishaw'@en
Q450675	'Francis'@en
Q60863	'Nadja Uhl'@en
Q78766	'Elyas M\'Barek'@en
