# Add Derived Graphs To The Tutorial Graph



In [1]:
import io
import os
import subprocess
import sys

import numpy as np
import pandas as pd
from IPython.display import display, HTML

import papermill as pm

sys.path.insert(0,'..')
from configure_kgtk_notebooks import ConfigureKGTK

from kgtk.functions import kgtk, kypher

In [2]:
# Parameters
# NOTE(review): these are machine-specific absolute paths; presumably they are
# defaults meant to be overridden per run (e.g. through papermill) — confirm.

# Root of the local KGTK checkout (its kgtk-properties folder is read further down)
kgtk_path = "/Users/pedroszekely/Documents/GitHub/kgtk"

# Folder that holds the input graph file all.tsv.gz
input_path = "/Users/pedroszekely/Downloads/kypher/projects/build-tutorial"
# Folder on local machine where to create the output and temporary folders
output_path = "/Users/pedroszekely/Downloads/kypher/projects"
# Name of the project; used for the project folder created under output_path
project_name = "tutorial-derived-graphs"
# Folder that receives the files assembled for the tutorial
tutorial_files_path = "/Users/pedroszekely/Downloads/kypher/projects/tutorial-files"

In [3]:
# Graph files (referred to by alias) that this notebook works with
files = ["all"]

# Configure the KGTK environment for this project: where the input graph
# lives and where the output and temporary folders are created
ck = ConfigureKGTK()
ck.configure_kgtk(
    input_graph_path=input_path,
    output_path=output_path,
    project_name=project_name,
)

User home: /Users/pedroszekely
Current dir: /Users/pedroszekely/Documents/GitHub/kgtk/tutorial
KGTK dir: /Users/pedroszekely/Documents/GitHub/kgtk
Use-cases dir: /Users/pedroszekely/Documents/GitHub/kgtk/use-cases


In [4]:
# Print the environment variables set up by ConfigureKGTK, plus the path of each file in `files`
ck.print_env_variables(files)

STORE: /Users/pedroszekely/Downloads/kypher/projects/tutorial-derived-graphs/temp.tutorial-derived-graphs/wikidata.sqlite3.db
TEMP: /Users/pedroszekely/Downloads/kypher/projects/tutorial-derived-graphs/temp.tutorial-derived-graphs
EXAMPLES_DIR: /Users/pedroszekely/Documents/GitHub/kgtk/examples
OUT: /Users/pedroszekely/Downloads/kypher/projects/tutorial-derived-graphs
kypher: kgtk query --graph-cache /Users/pedroszekely/Downloads/kypher/projects/tutorial-derived-graphs/temp.tutorial-derived-graphs/wikidata.sqlite3.db
kgtk: kgtk
GRAPH: /Users/pedroszekely/Downloads/kypher/projects/build-tutorial
USE_CASES_DIR: /Users/pedroszekely/Documents/GitHub/kgtk/use-cases
all: /Users/pedroszekely/Downloads/kypher/projects/build-tutorial/all.tsv.gz


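Export the notebook paths as environment variables for the shell commands below, point KGTK at the graph cache and the English label file, and turn on debugging for kypher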

In [5]:
# Export everything the shell commands in later cells rely on in one step:
#  - the two notebook paths, so `$tutorial_files_path` / `$kgtk_path` resolve in the shell
#  - the graph cache and label file locations (presumably read by kgtk as defaults — confirm)
#  - the debug switch
os.environ.update({
    'tutorial_files_path': tutorial_files_path,
    'kgtk_path': kgtk_path,
    'KGTK_GRAPH_CACHE': os.environ['STORE'],
    'KGTK_LABEL_FILE': tutorial_files_path + "/labels.en.tsv.gz",
    'KGTK_OPTION_DEBUG': "true",
})

Load all my files into the kypher cache so that all graph aliases are defined

In [6]:
# Load every file listed in `files` into the graph cache under its alias
ck.load_files_into_cache(file_list=files)

kgtk query --graph-cache /Users/pedroszekely/Downloads/kypher/projects/tutorial-derived-graphs/temp.tutorial-derived-graphs/wikidata.sqlite3.db -i "/Users/pedroszekely/Downloads/kypher/projects/build-tutorial/all.tsv.gz" --as all  --limit 3
[2021-10-06 19:34:58 query]: SQL Translation:
---------------------------------------------
  SELECT *
     FROM graph_1 AS graph_1_c1
     LIMIT ?
  PARAS: [3]
---------------------------------------------
node1	label	node2	id
P10	P31	Q18610173	P10-P31-Q18610173-85ef4d24-0
P1000	P31	Q18608871	P1000-P31-Q18608871-093affb5-0
P1001	P1647	P276	P1001-P1647-P276-e4e44f83-0


In [7]:
# Make the project output folder the working directory for the rest of the notebook
%cd {os.environ['OUT']}

/Users/pedroszekely/Downloads/kypher/projects/tutorial-derived-graphs


In [8]:
!kgtk filter --verbose=False --use-mgzip=True --first-match-only \
--input-file /Users/pedroszekely/Downloads/kypher/projects/tutorial-derived-graphs/parts/all.tsv.gz \
-p '; datatype ;' \
-o /Users/pedroszekely/Downloads/kypher/projects/tutorial-derived-graphs/parts/metadata.property.datatypes.tsv.gz 

## Run partition notebook

We need the parts to run the Useful Files notebook

In [9]:
# Run the partition notebook through papermill; the executed copy is written under $TEMP.
# The `;` must sit on the closing parenthesis to suppress the returned value: on a line
# of its own IPython treats `;` as an escape and evaluates it to an empty string.
pm.execute_notebook(
    os.environ["EXAMPLES_DIR"] + "/partition-wikidata.ipynb",
    os.environ["TEMP"] + "/partition-wikidata.out.ipynb",
    parameters=dict(
        wikidata_input_path = input_path + "/all.tsv.gz",
        wikidata_parts_path = os.environ["OUT"] + "/parts",
        temp_folder_path = os.environ["OUT"] + "/parts/temp",
        # $OUT is left unexpanded here; presumably the partition notebook's shell expands it
        sort_extras = "--buffer-size 30% --temporary-directory $OUT/parts/temp",
        verbose = False,
        gzip_command = 'gzip'
    )
);

Executing:   0%|          | 0/49 [00:00<?, ?cell/s]

''

Show the files after partition

In [10]:
# List the contents of the parts folder
!ls $OUT/parts

aliases.en.tsv.gz                   metadata.property.datatypes.tsv.gz
aliases.tsv.gz                      metadata.types.tsv.gz
all.tsv.gz                          qualifiers.commonsMedia.tsv.gz
claims.commonsMedia.tsv.gz          qualifiers.external-id.tsv.gz
claims.external-id.tsv.gz           qualifiers.geo-shape.tsv.gz
claims.geo-shape.tsv.gz             qualifiers.globe-coordinate.tsv.gz
claims.globe-coordinate.tsv.gz      qualifiers.math.tsv.gz
claims.math.tsv.gz                  qualifiers.monolingualtext.tsv.gz
claims.monolingualtext.tsv.gz       qualifiers.musical-notation.tsv.gz
claims.musical-notation.tsv.gz      qualifiers.quantity.tsv.gz
claims.other.tsv.gz                 qualifiers.string.tsv.gz
claims.quantity.tsv.gz              qualifiers.tabular-data.tsv.gz
claims.string.tsv.gz                qualifiers.time.tsv.gz
claims.tabular-data.tsv.gz          qualifiers.tsv.gz
claims.time.tsv.gz                  qualifiers.url.tsv.gz
claims.tsv.gz                       quali

Copy the partitioned files we want for the tutorial to the tutorial files directory (`tutorial_files_path`)

In [11]:
!cp -p $OUT/parts/*.tsv.gz $tutorial_files_path

## Run useful files notebook

In [13]:
# Run the "Wikidata Useful Files" notebook through papermill on the partitioned files;
# the executed copy is written under $TEMP.
# The `;` must sit on the closing parenthesis to suppress the returned value: on a line
# of its own IPython treats `;` as an escape and evaluates it to an empty string.
pm.execute_notebook(
    os.environ["USE_CASES_DIR"] + "/Wikidata Useful Files.ipynb",
    os.environ["TEMP"] + "/Wikidata Useful Files Out.ipynb",
    parameters=dict(
        output_path = os.environ["OUT"],
        output_folder = "useful_files",
        temp_folder = "temp.useful_files",
        wiki_root_folder = os.environ["OUT"] + "/parts/",
        cache_path = os.environ["TEMP"],
        languages = 'en',
        compute_pagerank = True,
        compute_degrees = True,
        compute_hits = False,
        delete_database = False,
        debug = "false"
    )
);

Executing:   0%|          | 0/157 [00:00<?, ?cell/s]

''

In [14]:
# Long listing of the files in the useful_files folder
!ls -l $OUT/useful_files

total 43456
-rw-r--r--   1 pedroszekely  staff  1167663 Oct  6 15:24 aliases.en.tsv.gz
-rw-r--r--   1 pedroszekely  staff   326969 Oct  6 15:24 derived.P279.tsv.gz
-rw-r--r--   1 pedroszekely  staff  2649699 Oct  6 15:24 derived.P279star.tsv.gz
-rw-r--r--   1 pedroszekely  staff  1006728 Oct  6 15:24 derived.P31.tsv.gz
-rw-r--r--   1 pedroszekely  staff    46319 Oct  6 15:25 derived.dwd.count.tsv.gz
-rw-r--r--   1 pedroszekely  staff  1540685 Oct  6 15:25 derived.dwd_isa.tsv.gz
-rw-r--r--   1 pedroszekely  staff  1158470 Oct  6 15:24 descriptions.en.tsv.gz
-rw-r--r--   1 pedroszekely  staff  2524967 Oct  6 15:25 dwd_isa_class_count.compact.tsv.gz
-rw-r--r--   1 pedroszekely  staff  1260472 Oct  6 15:26 item.property.count.compact.tsv.gz
-rw-r--r--   1 pedroszekely  staff   886001 Oct  6 15:24 labels.en.tsv.gz
-rw-r--r--   1 pedroszekely  staff   274394 Oct  6 15:24 metadata.in_degree.tsv.gz
-rw-r--r--   1 pedroszekely  staff   496777 Oct  6 15:24 metadata.out_degree.tsv.gz
-rw-r--r--  

### Enhance pagerank files to include ordinal

In [8]:
%%time
directed_pagerank = kgtk("""
    query -i $OUT/useful_files/metadata.pagerank.directed.tsv.gz 
    --match '(n1)-[l:Pdirected_pagerank]->(pagerank)'
""")

directed_pagerank_sorted = directed_pagerank.sort_values("node2", ascending=False)
directed_pagerank_sorted.insert(0, 'P1545', range(1, 1 + len(directed_pagerank_sorted)))
directed_pagerank_sorted

[2021-10-06 19:35:19 query]: SQL Translation:
---------------------------------------------
  SELECT *
     FROM graph_34 AS graph_34_c1
     WHERE graph_34_c1."label" = ?
  PARAS: ['Pdirected_pagerank']
---------------------------------------------

CPU times: user 297 ms, sys: 70.7 ms, total: 368 ms
Wall time: 875 ms


Unnamed: 0,P1545,node1,label,node2,id
23658,1,Q23958852,Pdirected_pagerank,0.074847,Q23958852-Pdirected_pagerank-70976
37555,2,Q23960977,Pdirected_pagerank,0.034511,Q23960977-Pdirected_pagerank-112667
13394,3,Q35120,Pdirected_pagerank,0.030465,Q35120-Pdirected_pagerank-40184
10098,4,Q151885,Pdirected_pagerank,0.027707,Q151885-Pdirected_pagerank-30296
11016,5,Q7184903,Pdirected_pagerank,0.011759,Q7184903-Pdirected_pagerank-33050
...,...,...,...,...,...
32303,55947,Q19345728,Pdirected_pagerank,0.000003,Q19345728-Pdirected_pagerank-96911
32291,55948,Q193369,Pdirected_pagerank,0.000003,Q193369-Pdirected_pagerank-96875
32281,55949,Q193326,Pdirected_pagerank,0.000003,Q193326-Pdirected_pagerank-96845
32279,55950,Q1932957,Pdirected_pagerank,0.000003,Q1932957-Pdirected_pagerank-96839


In [9]:
# Save the ranked pagerank table as a tab-separated file in the temporary folder
# (without the DataFrame index)
ordinal_tsv_path = os.environ['TEMP'] + "/directed-pagerank.ordinal.tsv"
directed_pagerank_sorted.to_csv(ordinal_tsv_path, sep='\t', index=False)

In [23]:
# Preview the first rows of the ordinal file, with the label of node1 added as an extra column
kgtk("""
    head -i "$TEMP"/directed-pagerank.ordinal.tsv / add-labels
""")

Unnamed: 0,P1545,node1,label,node2,id,node1;label
0,1,Q23958852,Pdirected_pagerank,0.074847,Q23958852-Pdirected_pagerank-70976,'variable-order class'@en
1,2,Q23960977,Pdirected_pagerank,0.034511,Q23960977-Pdirected_pagerank-112667,'(meta)class'@en
2,3,Q35120,Pdirected_pagerank,0.030465,Q35120-Pdirected_pagerank-40184,'entity'@en
3,4,Q151885,Pdirected_pagerank,0.027707,Q151885-Pdirected_pagerank-30296,'concept'@en
4,5,Q7184903,Pdirected_pagerank,0.011759,Q7184903-Pdirected_pagerank-33050,'abstract object'@en
5,6,Q5,Pdirected_pagerank,0.011419,Q5-Pdirected_pagerank-13580,'human'@en
6,7,Q30,Pdirected_pagerank,0.008336,Q30-Pdirected_pagerank-497,'United States of America'@en
7,8,Q19478619,Pdirected_pagerank,0.007651,Q19478619-Pdirected_pagerank-14804,'metaclass'@en
8,9,Q24017414,Pdirected_pagerank,0.006919,Q24017414-Pdirected_pagerank-14342,'second-order class'@en
9,10,Q1207505,Pdirected_pagerank,0.006406,Q1207505-Pdirected_pagerank-13835,'quality'@en


In [10]:
kgtk("""
    normalize -i "$TEMP"/directed-pagerank.ordinal.tsv
    / add-id --id-style wikidata 
    -o "$OUT"/directed-pagerank.ordinal.tsv.gz
""")

EmptyDataError: No columns to parse from file

In [23]:
!cp $OUT/useful_files/*.tsv.gz $tutorial_files_path

usage: cp [-R [-H | -L | -P]] [-fi | -n] [-apvXc] source_file target_file
       cp [-R [-H | -L | -P]] [-fi | -n] [-apvXc] source_file ... target_directory


In [26]:
# Count the lines (header included) of the property datatypes file
!zcat < $tutorial_files_path/metadata.property.datatypes.tsv.gz | wc -l

    3507


In [27]:
# Peek at the first lines of the directed pagerank file
# (`head` closes the pipe early, hence the "Broken pipe" message from zcat)
!zcat < $tutorial_files_path/metadata.pagerank.directed.tsv.gz | head

node1	label	node2	id
P10	vertex_in_degree	0	P10-vertex_in_degree-0
P10	vertex_out_degree	1	P10-vertex_out_degree-1
P10	Pdirected_pagerank	2.7060194813141665e-06	P10-Pdirected_pagerank-2
Q18610173	vertex_in_degree	15	Q18610173-vertex_in_degree-3
Q18610173	vertex_out_degree	1	Q18610173-vertex_out_degree-4
Q18610173	Pdirected_pagerank	1.800179459943962e-05	Q18610173-Pdirected_pagerank-5
P1000	vertex_in_degree	0	P1000-vertex_in_degree-6
P1000	vertex_out_degree	1	P1000-vertex_out_degree-7
P1000	Pdirected_pagerank	2.7060194813141665e-06	P1000-Pdirected_pagerank-8
zcat: error writing to output: Broken pipe


In [28]:
!zcat < $tutorial_files/metadata.in_degree.tsv.gz | head

/bin/bash: /metadata.in_degree.tsv.gz: No such file or directory


In [29]:
# Check that kgtk_path resolves in shell commands
!echo $kgtk_path

/Users/pedroszekely/Documents/GitHub/kgtk


In [30]:
cp "$kgtk_path"/kgtk-properties/kgtk.properties.tsv "$tutorial_files_path"

In [31]:
# Show the lines of the KGTK properties file that contain "page"
!grep page "$kgtk_path"/kgtk-properties/kgtk.properties.tsv 

Pdirected_pagerank	label	'pagerank (directed)'@en	Pdirected_pagerank-label-d3bd07
Pdirected_pagerank	alias	'page rank (directed)'@en	Pdirected_pagerank-alias-9d4733
Pdirected_pagerank	description	'pagerank canculated on the directed graph'@en	Pdirected_pagerank-description-b62fff
Pdirected_pagerank	P31	Q18616576	Pdirected_pagerank-P31-Q18616576
Pdirected_pagerank	P31	Q47512165	Pdirected_pagerank-P31-Q47512165
Pdirected_pagerank	P1629	Q184316	Pdirected_pagerank-P1629-Q184316
Pdirected_pagerank	data_type	quantity	Pdirected_pagerank-data_type-1a7b30
Pundirected_pagerank	label	'pagerank (undirected)'@en	Pundirected_pagerank-label-d3bd07
Pundirected_pagerank	alias	'page rank (undirected)'@en	Pundirected_pagerank-alias-9d4733
Pundirected_pagerank	description	'pagerank canculated on the undirected graph'@en	Pundirected_pagerank-description-ee8b1c
Pundirected_pagerank	P31	Q18616576	Pundirected_pagerank-P31-Q18616576
Pundirected_pagerank	P31	Q47512165	Pundirected_pagerank-P31-Q47512165
Pundirec

In [32]:
# List the tutorial files folder, one entry per line
!ls -1 "$tutorial_files_path"

aliases.en.tsv.gz
aliases.tsv.gz
all.tsv.gz
arnold.all.tsv.gz
claims.commonsMedia.tsv.gz
claims.external-id.tsv.gz
claims.geo-shape.tsv.gz
claims.globe-coordinate.tsv.gz
claims.math.tsv.gz
claims.monolingualtext.tsv.gz
claims.musical-notation.tsv.gz
claims.other.tsv.gz
claims.quantity.tsv.gz
claims.string.tsv.gz
claims.tabular-data.tsv.gz
claims.time.tsv.gz
claims.tsv.gz
claims.url.tsv.gz
claims.wikibase-form.tsv.gz
claims.wikibase-item.tsv.gz
claims.wikibase-lexeme.tsv.gz
claims.wikibase-property.tsv.gz
claims.wikibase-sense.tsv.gz
derived.P279.tsv.gz
derived.P279star.tsv.gz
derived.P31.tsv.gz
derived.P31P279star.tsv.gz
derived.P31_39_106_279star.tsv.gz
derived.dwd.count.tsv.gz
derived.dwd_isa.tsv.gz
derived.isa.tsv.gz
derived.isastar.tsv.gz
descriptions.en.tsv.gz
descriptions.tsv.gz
dwd_isa_class_count.compact.tsv.gz
item.property.count.compact.tsv.gz
kgtk.properties.tsv
labels.en.tsv.gz
labels.tsv.gz
metadata.in_degree.tsv.gz
metadata.out_degree.tsv.gz
metadata.pagerank.directed.tsv

In [33]:
# Files from the tutorial files folder that are combined into a single graph
tutorial_files = [
    "aliases.tsv.gz",
    "claims.external-id.tsv.gz",
    "claims.monolingualtext.tsv.gz",
    "claims.quantity.tsv.gz",
    "claims.string.tsv.gz",
    "claims.time.tsv.gz",
    "claims.url.tsv.gz",
    "claims.wikibase-item.tsv.gz",
    "claims.wikibase-property.tsv.gz",
    "derived.P279.tsv.gz",
    "derived.P279star.tsv.gz",
    "derived.P31.tsv.gz",
    "descriptions.tsv.gz",
    "kgtk.properties.tsv",
    "labels.en.tsv.gz",
    "metadata.in_degree.tsv.gz",
    "metadata.out_degree.tsv.gz",
    "metadata.pagerank.directed.tsv.gz",
    "metadata.pagerank.undirected.tsv.gz",
    "metadata.property.datatypes.tsv.gz",
    "metadata.types.tsv.gz",
    "qualifiers.tsv.gz",
]

# Command-line fragment with one " -i <folder>/<file>" option per file, in list order
input_option = " -i " + tutorial_files_path + "/"
tutorial_files_argument = "".join(input_option + file_name for file_name in tutorial_files)

In [34]:
# Display the generated " -i ..." argument string for inspection
tutorial_files_argument

' -i /Users/pedroszekely/Downloads/kypher/projects/tutorial-files/aliases.tsv.gz -i /Users/pedroszekely/Downloads/kypher/projects/tutorial-files/claims.external-id.tsv.gz -i /Users/pedroszekely/Downloads/kypher/projects/tutorial-files/claims.monolingualtext.tsv.gz -i /Users/pedroszekely/Downloads/kypher/projects/tutorial-files/claims.quantity.tsv.gz -i /Users/pedroszekely/Downloads/kypher/projects/tutorial-files/claims.string.tsv.gz -i /Users/pedroszekely/Downloads/kypher/projects/tutorial-files/claims.time.tsv.gz -i /Users/pedroszekely/Downloads/kypher/projects/tutorial-files/claims.url.tsv.gz -i /Users/pedroszekely/Downloads/kypher/projects/tutorial-files/claims.wikibase-item.tsv.gz -i /Users/pedroszekely/Downloads/kypher/projects/tutorial-files/claims.wikibase-property.tsv.gz -i /Users/pedroszekely/Downloads/kypher/projects/tutorial-files/derived.P279.tsv.gz -i /Users/pedroszekely/Downloads/kypher/projects/tutorial-files/derived.P279star.tsv.gz -i /Users/pedroszekely/Downloads/kyphe

In [35]:
# Concatenate all selected tutorial files into arnold.all.tsv.gz in the tutorial files folder;
# {tutorial_files_argument} is interpolated by IPython from the Python variable
!kgtk cat {tutorial_files_argument} -o "$tutorial_files_path"/arnold.all.tsv.gz

In [36]:
# Preview the first rows of the combined graph file
kgtk("""
    head -i "$tutorial_files_path"/arnold.all.tsv.gz
""")

Unnamed: 0,node1,label,node2,id,node2;wikidatatype
0,P10,alias,'gif'@en,P10-alias-en-282226-0,
1,P10,alias,'animation'@en,P10-alias-en-2f86d8-0,
2,P10,alias,'media'@en,P10-alias-en-c1427e-0,
3,P10,alias,'trailer (Commons)'@en,P10-alias-en-c61ab1-0,
4,P1001,alias,'belongs to jurisdiction'@en,P1001-alias-en-0dd7ce-0,
5,P1001,alias,'linked to jurisdiction'@en,P1001-alias-en-106818-0,
6,P1001,alias,'of jurisdiction'@en,P1001-alias-en-7e4abe-0,
7,P1001,alias,'applied to jurisdiction'@en,P1001-alias-en-89ed18-0,
8,P1001,alias,'jurisdiction'@en,P1001-alias-en-a524ab-0,
9,P1001,alias,'valid in jurisdiction'@en,P1001-alias-en-ca2e7c-0,


In [40]:
%%time
# Preview the first rows of the combined graph file with the node1 label added, timing the call
kgtk("""
    head -i "$tutorial_files_path"/arnold.all.tsv.gz / add-labels
""")

CPU times: user 5.64 ms, sys: 13.8 ms, total: 19.4 ms
Wall time: 914 ms


Unnamed: 0,node1,label,node2,id,node2;wikidatatype,node1;label
0,P10,alias,'gif'@en,P10-alias-en-282226-0,,'video'@en
1,P10,alias,'animation'@en,P10-alias-en-2f86d8-0,,'video'@en
2,P10,alias,'media'@en,P10-alias-en-c1427e-0,,'video'@en
3,P10,alias,'trailer (Commons)'@en,P10-alias-en-c61ab1-0,,'video'@en
4,P1001,alias,'belongs to jurisdiction'@en,P1001-alias-en-0dd7ce-0,,'applies to jurisdiction'@en
5,P1001,alias,'linked to jurisdiction'@en,P1001-alias-en-106818-0,,'applies to jurisdiction'@en
6,P1001,alias,'of jurisdiction'@en,P1001-alias-en-7e4abe-0,,'applies to jurisdiction'@en
7,P1001,alias,'applied to jurisdiction'@en,P1001-alias-en-89ed18-0,,'applies to jurisdiction'@en
8,P1001,alias,'jurisdiction'@en,P1001-alias-en-a524ab-0,,'applies to jurisdiction'@en
9,P1001,alias,'valid in jurisdiction'@en,P1001-alias-en-ca2e7c-0,,'applies to jurisdiction'@en


In [9]:
# List the contents of the useful_files folder
!ls "$OUT"/useful_files

aliases.en.tsv.gz                      labels.en.tsv.gz
derived.P279.tsv.gz                    metadata.in_degree.tsv.gz
derived.P279star.tsv.gz                metadata.out_degree.tsv.gz
derived.P31.tsv.gz                     metadata.pagerank.directed.tsv.gz
derived.dwd.count.tsv.gz               metadata.pagerank.undirected.tsv.gz
derived.dwd_isa.tsv.gz                 statistics.in_degree.distribution.tsv
descriptions.en.tsv.gz                 statistics.out_degree.distribution.tsv
dwd_isa_class_count.compact.tsv.gz     [34mtemp.useful_files[m[m
item.property.count.compact.tsv.gz
