In [1]:
# Parameters

# Folder on local machine where to create the output and temporary folders
output_path = "/Users/pedroszekely/Downloads/kypher"

# The names of the output and temporary folders
output_folder = "wikidata_os_v5"
temp_folder = "temp.wikidata_os_v5"

# The location of input Wikidata files
wikidata_folder = "/Volumes/GoogleDrive/Shared drives/KGTK/datasets/wikidata-20200803-v4/"
wikidata_folder = "/Users/pedroszekely/Downloads/kypher/wikidata_os_v1/"
# The wikidata_os files can be downloaded from https://drive.google.com/drive/folders/1V6oAQKmwQ4LJnrBai-uv5gHWphFSCt50?usp=sharing

# Location of the cache database for kypher
cache_path = "/Users/pedroszekely/Downloads/kypher/temp.useful_wikidata_files_v4"
# cache_path = "/Users/pedroszekely/Downloads/kypher/temp.useful_wikidata_files_v4/wikidata.sqlite3.db"
# Whether to delete the cache database
delete_database = False

# shortcuts to commands
kgtk = "time kgtk --debug"
# kgtk = "kgtk --debug"

In [2]:
import io
import os
import subprocess
import sys

import numpy as np
import pandas as pd

import altair as alt

import papermill as pm

In [3]:
def kgtk_to_dataframe(kgtk):
    columns = kgtk[0].split("\t")
    data = []
    for line in kgtk[1:]:
        data.append(line.encode('utf-8').decode('utf-8').split("\t"))
    return pd.DataFrame(data, columns=columns)    

In [4]:
# The names of files in the KGTK Wikidata distirbution that we will use in this notebook.
file_names = {
    "claims": "claims.tsv.gz",
    "label": "labels.en.tsv.gz",
    "alias": "aliases.en.tsv.gz",
    "description": "descriptions.en.tsv.gz",
    "item": "claims.wikibase-item.tsv.gz",
    "qualifiers": "qualifiers.tsv.gz",
    "sitelinks": "sitelinks.tsv.gz",
    "qualifiers_time": "qualifiers.time.tsv.gz",
    "property_datatypes": "metadata.property.datatypes.tsv.gz",
    "isa": "derived.isa.tsv.gz",
    "p279star": "derived.P279star.tsv.gz",
    "p279": "derived.P279.tsv.gz",
    "p31": "derived.P31.tsv.gz"
}

# We will define environment variables to hold the full paths to the files as we will use them in the shell commands
kgtk_environment_variables = []

os.environ['WIKIDATA'] = wikidata_folder
kgtk_environment_variables.append('WIKIDATA')

for key, value in file_names.items():
    variable = key.upper()
    os.environ[variable] = wikidata_folder + value
    kgtk_environment_variables.append(variable)
    
# KGTK creates a SQLite database to index the knowledge graph.
if cache_path:
    os.environ['STORE'] = "{}/wikidata.sqlite3.db".format(cache_path)
else:
    os.environ['STORE'] = "{}/{}/wikidata.sqlite3.db".format(output_path, temp_folder)
kgtk_environment_variables.append('STORE')

# We will create many temporary files, so set up a folder for outputs and one for the temporary files.
os.environ['TEMP'] = "{}/{}".format(output_path, temp_folder) 
os.environ['OUT'] = "{}/{}".format(output_path, output_folder) 
kgtk_environment_variables.append('TEMP')
kgtk_environment_variables.append('OUT')

# Envronment variables with shortcuts to the commands we use often
os.environ['kgtk'] = kgtk
os.environ['kypher'] = "time kgtk --debug query --graph-cache " + os.environ['STORE']
os.environ['kypher'] = "kgtk query --graph-cache " + os.environ['STORE']
kgtk_environment_variables.append('kgtk')
kgtk_environment_variables.append('kypher')

# We'll save the current working directory so we can call into other example notebooks later
os.environ["EXAMPLES_DIR"] = os.getcwd()
kgtk_environment_variables.append('EXAMPLES_DIR')

kgtk_environment_variables.sort()
for variable in kgtk_environment_variables:
    print("{}: \"{}\"".format(variable, os.environ[variable]))

ALIAS: "/Users/pedroszekely/Downloads/kypher/wikidata_os_v1/aliases.en.tsv.gz"
CLAIMS: "/Users/pedroszekely/Downloads/kypher/wikidata_os_v1/claims.tsv.gz"
DESCRIPTION: "/Users/pedroszekely/Downloads/kypher/wikidata_os_v1/descriptions.en.tsv.gz"
EXAMPLES_DIR: "/Users/pedroszekely/Documents/GitHub/kgtk/examples"
ISA: "/Users/pedroszekely/Downloads/kypher/wikidata_os_v1/derived.isa.tsv.gz"
ITEM: "/Users/pedroszekely/Downloads/kypher/wikidata_os_v1/claims.wikibase-item.tsv.gz"
LABEL: "/Users/pedroszekely/Downloads/kypher/wikidata_os_v1/labels.en.tsv.gz"
OUT: "/Users/pedroszekely/Downloads/kypher/wikidata_os_v5"
P279: "/Users/pedroszekely/Downloads/kypher/wikidata_os_v1/derived.P279.tsv.gz"
P279STAR: "/Users/pedroszekely/Downloads/kypher/wikidata_os_v1/derived.P279star.tsv.gz"
P31: "/Users/pedroszekely/Downloads/kypher/wikidata_os_v1/derived.P31.tsv.gz"
PROPERTY_DATATYPES: "/Users/pedroszekely/Downloads/kypher/wikidata_os_v1/metadata.property.datatypes.tsv.gz"
QUALIFIERS: "/Users/pedroszeke

In [5]:
%cd {output_path}

/Users/pedroszekely/Downloads/kypher


In [22]:
!$kypher -i "$CLAIMS" -i "$LABEL" -i "$ISA"\
--match 'claims: (n1)-[:P106]->(n2), claims: (n1)-[l {label: property}]->(), isa: (n2)-[]->(n2_class), label: (n2_class)-[]->(n2_class_label), label: (n2)-[]->(n2_label), label: (property)-[]->(property_label)' \
--return 'distinct n2_class_label as class, property as property, count(n1) as count, property_label as property_label, n2_label as `occupation`' \
-o "$TEMP"/occupations.tsv.gz

     2116.33 real      1024.03 user       487.43 sys


In [34]:
!zcat < "$TEMP"/occupations.tsv.gz | wc

  560818 4634193 39748060


In [69]:
!zcat < "$TEMP"/occupations.tsv.gz | head -10 | column -t -s $'\t'

class        property  count  property_label                                                     occupation
'-elect'@en  P106      3      'occupation'@en                                                    'bishop-elect'@en
zcat: '-elect'@en  P140      2      'religion'@en                                                      'bishop-elect'@en
'-elect'@en  P1871     1      'CERL Thesaurus ID'@en                                             'bishop-elect'@en
'-elect'@en  P19       1      'place of birth'@en                                                'bishop-elect'@en
'-elect'@en  P21       2      'sex or gender'@en                                                 'bishop-elect'@en
'-elect'@en  P214      1      'VIAF ID'@en                                                       'bishop-elect'@en
'-elect'@en  P227      1      'GND ID'@en                                                        'bishop-elect'@en
'-elect'@en  P2580     1      'Baltisches Biographisches Lexikon digital ID (form

In [15]:
# produces an error
!$kypher -i "$CLAIMS" -i "$LABEL" -i "$ISA"\
--match 'claims: (n1)-[:P106]->(n2), claims: (n1)-[l {label: property}]->(), label: (n2)-[]->(n2_label), label: (property)-[]->(property_label)' \
--return 'distinct n2 as occupation_id, property as property, count(n1) as count, property_label as property_label, n2_label as `occupation`' \
--where 'count > 500' \
--order-by 'occupation, cast(count, integer) desc' \
-o "$TEMP"/occupations.class.tsv.gz

misuse of aggregate: count()



In [53]:
!$kypher -i "$CLAIMS" -i "$LABEL" -i "$ISA"\
--match 'claims: (n1)-[:P106]->(n2), claims: (n1)-[l {label: property}]->(), label: (n2)-[]->(n2_label), label: (property)-[]->(property_label)' \
--return 'distinct n2 as subject, property as property, count(n1) as object, property_label as `property;label`, n2_label as `subject;label`' \
--order-by '`subject;label`, cast(object, integer) desc' \
-o "$TEMP"/occupations.class.tsv.gz

      829.63 real       355.43 user       182.87 sys


In [70]:
!zcat < "$TEMP"/occupations.class.tsv.gz | head -10 | column -t -s $'\t'

zcat: error writing to output: Broken pipe
subject   property  object  property;label               subject;label
Q2340985  P1441     33      'present in work'@en         '00 Agent'@en
Q2340985  P31       11      'instance of'@en             '00 Agent'@en
Q2340985  P175      10      'performer'@en               '00 Agent'@en
Q2340985  P4969     9       'derivative work'@en         '00 Agent'@en
Q2340985  P106      7       'occupation'@en              '00 Agent'@en
Q2340985  P27       5       'country of citizenship'@en  '00 Agent'@en
Q2340985  P21       5       'sex or gender'@en           '00 Agent'@en
Q2340985  P941      3       'inspired by'@en             '00 Agent'@en
Q2340985  P735      3       'given name'@en              '00 Agent'@en


In [80]:
!$kgtk rename-columns -i "$TEMP"/occupations.class.tsv.gz  \
--mode NONE \
--output-columns node1 label node2  'label;label' 'node1;label' \
-o "$TEMP"/occupations.class.counts.tsv.gz

        7.44 real         7.28 user         0.14 sys


In [82]:
!$kypher -i "$TEMP"/occupations.class.counts.tsv.gz -i "$PROPERTY_DATATYPES" \
--match 'counts: (n1)-[l {label: property}]->(n2), datatype: (property)-[]->(type)' \
--where 'cast(n2, integer) > 500 and type != "external-id"' \
--return 'n1 as node1, property as label, n2 as node2, label.label as `label;label`, n1.label as `node1;label`' \
--order-by 'n1, cast(n2, integer) desc' \
-o "$TEMP"/occupations.class.counts.filtered.tsv.gz

        1.16 real         0.95 user         0.18 sys


In [17]:
%time df = !$kypher -i "$CLAIMS" -i "$LABEL" -i "$P279STAR" -i "$ISA" -i "$QUALIFIERS" \
--match '\
  claims: (c1)-[:P413]->(c2), \
  isa: (c1)-[isa]->(:Q5), \
  isa: (c2)-[]->(n2), P279star: (n2)-[:P279star]-(:Q694589), \
  claims: (c0:Q1035067)-[c4:P1346]->(c1), \
  qual: (c4)-[:P585]->(d0), \
  label: (c0)-[]->(c0_label), \
  label: (c1)-[]->(c1_label), \
  label: (c2)-[]->(c2_label)' \
--return 'distinct c0_label as award, c1_label as Player, c2_label as Position, d0 as Year' \
--order-by 'Year' 
kgtk_to_dataframe(df)

CPU times: user 2.07 ms, sys: 4.87 ms, total: 6.93 ms
Wall time: 958 ms


Unnamed: 0,Undirected relationships are not (yet) allowed
0,


In [19]:
!$kypher -i "$CLAIMS" -i "$LABEL" -i "$P279STAR" -i "$ISA" -i "$QUALIFIERS" \
--match '\
  claims: (c1)-[:P413]->(c2), \
  isa: (c1)-[isa]->(:Q5), \
  isa: (c2)-[]->(n2), P279star: (n2)-[:P279star]->(c2_1:Q694589), \
  claims: (c0:Q1035067)-[c4:P1346]->(c1), \
  qual: (c4)-[:P585]->(d0), \
  label: (c0)-[]->(c0_label), \
  label: (c1)-[]->(c1_label), \
  label: (c2)-[]->(c2_label)' \
--return 'distinct c0_label as Award, c1_label as Player, c2_label as Position, d0 as Year' \
--order-by 'Year' 

Award	Player	Position	Year
'Heisman Trophy'@en	'Jay Berwanger'@en	'halfback'@en	^1935-00-00T00:00:00Z/9
'Heisman Trophy'@en	'Larry Kelley'@en	'tight end'@en	^1936-00-00T00:00:00Z/9
'Heisman Trophy'@en	'Clint Frank'@en	'halfback'@en	^1937-00-00T00:00:00Z/9
'Heisman Trophy'@en	'Davey O\\'Brien'@en	'quarterback'@en	^1938-00-00T00:00:00Z/9
'Heisman Trophy'@en	'Nile Kinnick'@en	'quarterback'@en	^1939-00-00T00:00:00Z/9
'Heisman Trophy'@en	'Tom Harmon'@en	'halfback'@en	^1940-00-00T00:00:00Z/9
'Heisman Trophy'@en	'Bruce Smith'@en	'halfback'@en	^1941-00-00T00:00:00Z/9
'Heisman Trophy'@en	'Frank Sinkwich'@en	'halfback'@en	^1942-00-00T00:00:00Z/9
'Heisman Trophy'@en	'Angelo Bertelli'@en	'quarterback'@en	^1943-00-00T00:00:00Z/9
'Heisman Trophy'@en	'Les Horvath'@en	'halfback'@en	^1944-00-00T00:00:00Z/9
'Heisman Trophy'@en	'Les Horvath'@en	'quarterback'@en	^1944-00-00T00:00:00Z/9
'Heisman Trophy'@en	'Doc Blanchard'@en	'fullback'@en	^1945-00-00T00:00:00Z/9
'Heisman Trophy'@en	'Glenn Davis'@en	'halfba

In [75]:
!wd u P166

[90mid[39m P166
[42mLabel[49m award received
[44mDescription[49m award or recognition received by a person, organisation or creative work
[30m[47minstance of[49m[39m [90m(P31)[39m[90m: [39mWikidata property for items about people or organisations [90m(Q57955292)[39m | Wikidata property for items about works [90m(Q18618644)[39m | Wikidata property related to awards, prizes and honours [90m(Q56150830)[39m


In [12]:
!$kypher -i "$CLAIMS" -i "$LABEL" \
--match 'claims: (:Q82955)<-[:P106]-(n1), claims: (n1)-[p {label: property}]->(), label: (property)-[]->(property_label)' \
--return 'distinct property as property, count(property) as node2, property_label as `node1;label`' \
--order-by 'node2 desc' \
--limit 50

property	node2	node1;label
P106	888992	'occupation'@en
P39	838822	'position held'@en
P31	617239	'instance of'@en
P21	579438	'sex or gender'@en
P27	534961	'country of citizenship'@en
P569	505487	'date of birth'@en
P735	467038	'given name'@en
P102	330958	'member of political party'@en
P19	311750	'place of birth'@en
P1412	297248	'languages spoken, written or signed'@en
P734	288095	'family name'@en
P570	278213	'date of death'@en
P69	197524	'educated at'@en
P214	154017	'VIAF ID'@en
P18	150674	'image'@en
P20	147454	'place of death'@en
P1559	141629	'name in native language'@en
P937	139276	'work location'@en
P166	131320	'award received'@en
P7859	128177	'WorldCat Identities ID'@en
P213	87226	'ISNI'@en
P40	81986	'child'@en
P373	81115	'Commons category'@en
P646	81046	'Freebase ID'@en
P244	78245	'Library of Congress authority ID'@en
P227	72345	'GND ID'@en
P3602	47459	'candidacy in election'@en
P463	44632	'member of'@en
P2163	43555	'FAST ID'@en
P1343	42457	'described by source'@en
P3373	41146	'sibl

In [90]:
!$kypher -i "$CLAIMS" -i "$LABEL" \
--match 'claims: (n1)-[:P106]->(:Q82955), claims: (n1)-[:P31]->(:Q5), label: (n1)-[]->(n1_label)' \
--limit 20

id	node1	label	node2	rank	node2;wikidatatype	id	node1	label	node2	rank	node2;wikidatatype	id	node1	label	node2
Q1000051-P106-Q82955-1d49c0e3-0	Q1000051	P106	Q82955	normal	wikibase-item	Q1000051-P31-Q5-d4b9989c-0	Q1000051	P31	Q5	normal	wikibase-item	Q1000051-label-en	Q1000051	label	'Joseph C. O\\'Mahoney'@en
Q1000053-P106-Q82955-9ea52895-0	Q1000053	P106	Q82955	normal	wikibase-item	Q1000053-P31-Q5-b45a4215-0	Q1000053	P31	Q5	normal	wikibase-item	Q1000053-label-en	Q1000053	label	'Vasily Nebenzya'@en
Q1000061-P106-Q82955-03428baf-0	Q1000061	P106	Q82955	normal	wikibase-item	Q1000061-P31-Q5-6d7f3e39-0	Q1000061	P31	Q5	normal	wikibase-item	Q1000061-label-en	Q1000061	label	'Valentyn Symonenko'@en
Q1000070-P106-Q82955-eaeeb6df-0	Q1000070	P106	Q82955	normal	wikibase-item	Q1000070-P31-Q5-50486e16-0	Q1000070	P31	Q5	normal	wikibase-item	Q1000070-label-en	Q1000070	label	'Myron V. George'@en
Q1000085-P106-Q82955-b0515d71-0	Q1000085	P106	Q82955	normal	wikibase-item	Q1000085-P31-Q5-3b951a7c-0	Q1000085	P3

In [22]:
!$kypher -i "$CLAIMS" -i "$LABEL" \
--match 'claims: (n1)-[id {label: property}]->(), claims: (n1)-[:P31]->(:Q5), label: (property)-[]->(property_label)' \
--return 'distinct property as property, count(property) as count, property_label as name' \
--order-by 'count desc' \
--limit 100

property	count	name
P31	8077740	'instance of'@en
P106	6540204	'occupation'@en
P21	6343164	'sex or gender'@en
P569	4717853	'date of birth'@en
P735	4311301	'given name'@en
P27	3725745	'country of citizenship'@en
P19	2586707	'place of birth'@en
P734	2458978	'family name'@en
P570	2361434	'date of death'@en
P1087	2338336	'Elo rating'@en
P214	2079758	'VIAF ID'@en
P496	1566234	'ORCID iD'@en
P7859	1510693	'WorldCat Identities ID'@en
P69	1424601	'educated at'@en
P54	1364830	'member of sports team'@en
P40	1306484	'child'@en
P213	1139943	'ISNI'@en
P39	1125879	'position held'@en
P108	1104544	'employer'@en
P227	1017918	'GND ID'@en
P20	996359	'place of death'@en
P166	991018	'award received'@en
P244	974080	'Library of Congress authority ID'@en
P1412	973669	'languages spoken, written or signed'@en
P18	902396	'image'@en
P641	873398	'sport'@en
P22	800065	'father'@en
P4638	718951	'The Peerage person ID'@en
P7902	608818	'Deutsche Biographie ID'@en
P1344	521036	'participant in'@en
P25	507016	'mother'@en
P3

In [35]:
!$kypher -i "$CLAIMS" -i "$LABEL" -i "$PROPERTY_DATATYPES" \
--match 'claims: (n1)-[id {label: property}]->(), claims: (n1)-[:P31]->(:Q5), label: (property)-[]->(property_label), property: (property)-[]->(type)' \
--return 'distinct property as property, count(property) as count, property_label as name, type as type' \
--where 'type != "external-id"' \
--order-by 'count desc' \
--limit 25

^C


In [36]:
!$kypher -i "$CLAIMS" -i "$LABEL" -i "$PROPERTY_DATATYPES" -i "$P279STAR" \
--match 'claims: (n1)-[id {label: property}]->(), claims: (n1)-[:P31]->(c), P279: (c)-[]->(:Q486972), label: (property)-[]->(property_label), property: (property)-[]->(type)' \
--return 'distinct property as property, count(property) as count, property_label as name, type as type' \
--where 'type != "external-id"' \
--order-by 'count desc' \
--limit 10

^C


In [37]:
!$kypher -i "$CLAIMS" -i "$LABEL" -i "$PROPERTY_DATATYPES" \
--match 'claims: (n1)-[id {label: property}]->(), claims: (n1)-[:P31]->(:Q8502), label: (property)-[]->(property_label), property: (property)-[]->(type)' \
--return 'distinct property as property, count(property) as count, property_label as name, type as type' \
--where 'type != "external-id"' \
--order-by 'count desc' \
--limit 10

^C


In [50]:
!$kypher -i "$CLAIMS" -i "$LABEL" -i "$P279STAR" -i "$ISA"  \
--match '\
  isa: (n1)-[]->(n2), \
  P279star: (n2)-[]->(n3)' \
--return 'distinct n1 as node1, "isa_star" as label, n3 as node2' \
-o "$TEMP"/derived.isastar_1.tsv.gz

In [51]:
!zcat < "$TEMP"/derived.isastar_1.tsv.gz | wc

 1092722609 3278167827 30065058032


In [52]:
!$kgtk add-id --id-style wikidata -i "$TEMP"/derived.isastar_1.tsv.gz \
/ sort2 -o "$OUT"/derived.isastar.tsv.gz

    12027.01 real     11761.24 user      1806.49 sys


In [53]:
!$kypher -i "$OUT"/derived.isastar.tsv.gz \
--match '(n1)-[]->()' \
--return 'distinct n1' \
--order-by 'n1' \
-o "$TEMP"/derived.isastar.node1.tsv.gz

In [54]:
!zcat < "$TEMP"/derived.isastar.node1.tsv.gz | wc

 28657085 28657085 280499739


In [48]:
!$kypher -i "$CLAIMS" -i "$LABEL" -i "$P279STAR" -i "$ISA"  \
--match '\
  isa: (n1)-[]->(n2)' \
--return 'distinct n1' \
--order-by 'n1' \
-o "$TEMP"/isa.node1.tsv.gz

In [49]:
!zcat < "$TEMP"/isa.node1.tsv.gz | wc

 28789155 28789155 281777039


In [None]:
!$kypher -i "$CLAIMS" -i "$LABEL" -i "$P279STAR" -i "$ISA"  \
--match '\
  claims: (n1)-[:P31]->(n2), \
  P279star: (n2)-[]->(n3)' \
--return 'distinct n1 as node1, "P31P279star" as label, n3 as node2' \
-o "$TEMP"/derived.P31P279star.gz

In [None]:
!$kgtk add-id --id-style wikidata -i "$TEMP"/derived.P31P279star.gz \
/ sort2 -o "$OUT"/derived.P31P279star.tsv.gz