# Profile The Tutorial Graph



In [1]:
import io
import os
import subprocess
import sys

import numpy as np
import pandas as pd
from IPython.display import display, HTML

import papermill as pm

sys.path.insert(0,'..')
from kgtk.configure_kgtk_notebooks import ConfigureKGTK

from kgtk.functions import kgtk, kypher

In [2]:
# Parameters

kgtk_path = "/Users/pedroszekely/Documents/GitHub/kgtk"

tutorial_deployment_path = "/Users/pedroszekely/Documents/GitHub/kgtk-tutorial-files/datasets"
project_deployment_path = tutorial_deployment_path + "/arnold-profiled"

# Folder on local machine where to create the output and temporary folders
input_path = "/Users/pedroszekely/Documents/GitHub/kgtk-tutorial-files/datasets/arnold"
output_path = "/Users/pedroszekely/Downloads/kypher/projects"
project_name = "tutorial-profiling"

These are all the files that we have, but I am tempted to just use the `all` file as it helps to keep the tutorial simpler

In [3]:
files = [
    "all",
    "label",
    "alias",
    "description",
    "external_id",
    "monolingualtext",
    "quantity",
    "string",
    "time",
    "item",
    "wikibase_property",
    "qualifiers",
    "datatypes",
    "p279",
    "p279star",
    "p31",
    "in_degree",
    "out_degree",
    "pagerank_directed",
    "pagerank_undirected"
]
ck = ConfigureKGTK(files, kgtk_path=kgtk_path)
ck.configure_kgtk(input_graph_path=input_path,
                  output_path=output_path,
                  project_name=project_name)

User home: /Users/pedroszekely
Current dir: /Users/pedroszekely/Documents/GitHub/kgtk/tutorial
KGTK dir: /Users/pedroszekely/Documents/GitHub/kgtk
Use-cases dir: /Users/pedroszekely/Documents/GitHub/kgtk/use-cases


In [4]:
ck.print_env_variables()

STORE: /Users/pedroszekely/Downloads/kypher/projects/tutorial-profiling/temp.tutorial-profiling/wikidata.sqlite3.db
USE_CASES_DIR: /Users/pedroszekely/Documents/GitHub/kgtk/use-cases
kgtk: kgtk
EXAMPLES_DIR: /Users/pedroszekely/Documents/GitHub/kgtk/examples
OUT: /Users/pedroszekely/Downloads/kypher/projects/tutorial-profiling
kypher: kgtk query --graph-cache /Users/pedroszekely/Downloads/kypher/projects/tutorial-profiling/temp.tutorial-profiling/wikidata.sqlite3.db
TEMP: /Users/pedroszekely/Downloads/kypher/projects/tutorial-profiling/temp.tutorial-profiling
GRAPH: /Users/pedroszekely/Documents/GitHub/kgtk-tutorial-files/datasets/arnold
all: /Users/pedroszekely/Documents/GitHub/kgtk-tutorial-files/datasets/arnold/all.tsv.gz
label: /Users/pedroszekely/Documents/GitHub/kgtk-tutorial-files/datasets/arnold/labels.en.tsv.gz
alias: /Users/pedroszekely/Documents/GitHub/kgtk-tutorial-files/datasets/arnold/aliases.en.tsv.gz
description: /Users/pedroszekely/Documents/GitHub/kgtk-tutorial-files/

Set up defaults KGTK

In [5]:
os.environ['kgtk_path'] = kgtk_path
os.environ['KGTK_GRAPH_CACHE'] = os.environ['STORE']
os.environ['KGTK_LABEL_FILE'] = input_path + "/labels.en.tsv.gz"
os.environ['KGTK_OPTION_DEBUG'] = "false"

Load all my files into the kypher cache so that all graph aliases are defined

In [6]:
%%time
ck.load_files_into_cache()

kgtk query --graph-cache /Users/pedroszekely/Downloads/kypher/projects/tutorial-profiling/temp.tutorial-profiling/wikidata.sqlite3.db -i "/Users/pedroszekely/Documents/GitHub/kgtk-tutorial-files/datasets/arnold/all.tsv.gz" --as all  -i "/Users/pedroszekely/Documents/GitHub/kgtk-tutorial-files/datasets/arnold/labels.en.tsv.gz" --as label  -i "/Users/pedroszekely/Documents/GitHub/kgtk-tutorial-files/datasets/arnold/aliases.en.tsv.gz" --as alias  -i "/Users/pedroszekely/Documents/GitHub/kgtk-tutorial-files/datasets/arnold/descriptions.en.tsv.gz" --as description  -i "/Users/pedroszekely/Documents/GitHub/kgtk-tutorial-files/datasets/arnold/claims.external-id.tsv.gz" --as external_id  -i "/Users/pedroszekely/Documents/GitHub/kgtk-tutorial-files/datasets/arnold/claims.monolingualtext.tsv.gz" --as monolingualtext  -i "/Users/pedroszekely/Documents/GitHub/kgtk-tutorial-files/datasets/arnold/claims.quantity.tsv.gz" --as quantity  -i "/Users/pedroszekely/Documents/GitHub/kgtk-tutorial-files/data

In [7]:
%cd {os.environ['OUT']}

/Users/pedroszekely/Downloads/kypher/projects/tutorial-profiling


## Get instance counts



We can compute the instance counts by retrieving all statements that use `instance of (P31)` and counting the instances for each class

In [8]:
%%time
kgtk("""
    query -i all
        --match '(instance)-[:P31]->(class)'
        --return 'class as class, count(distinct instance) as count'
        --order-by 'cast(count, int) desc'
        --limit 10 
    / add-labels
""")

CPU times: user 7.29 ms, sys: 12.9 ms, total: 20.1 ms
Wall time: 2.76 s


Unnamed: 0,class,count,class;label
0,Q5,13870,'human'@en
1,Q15221623,3177,'bilateral relation'@en
2,Q11424,2136,'film'@en
3,Q4022,1550,'river'@en
4,Q3918,815,'university'@en
5,Q4164871,645,'position'@en
6,Q1549591,627,'big city'@en
7,Q3917681,614,'embassy'@en
8,Q19595382,595,'Wikidata property for authority control for p...
9,Q11862829,567,'academic discipline'@en


We want to add the profiling data back into the KG so that we can use it in queries and look at it in the browser.
To do so, we create a KGTK graph by using `node1, label, node2` as column headers:

In [9]:
%%time
kgtk("""
    query -i all
        --match '(instance)-[:P31]->(class)'
        --return 'class as node1, "P31_count" as label, count(distinct instance) as node2'
        --order-by 'cast(node2, int) desc'
    --limit 10 
""")

CPU times: user 4.9 ms, sys: 9.98 ms, total: 14.9 ms
Wall time: 631 ms


Unnamed: 0,node1,label,node2
0,Q5,P31_count,13870
1,Q15221623,P31_count,3177
2,Q11424,P31_count,2136
3,Q4022,P31_count,1550
4,Q3918,P31_count,815
5,Q4164871,P31_count,645
6,Q1549591,P31_count,627
7,Q3917681,P31_count,614
8,Q19595382,P31_count,595
9,Q11862829,P31_count,567


It is good practice to add identifiers to the edges so that we can add qualifiers later if we desire. To add the identifiers, we chain the query output to the `add-id` command:

In [10]:
%%time
kgtk("""
    query -i all
        --match '(instance)-[:P31]->(class)'
        --return 'class as node1, "P31count" as label, count(distinct instance) as node2'
        --order-by 'cast(node2, int) desc' 
    / add-id --id-style wikidata
""")

CPU times: user 51.8 ms, sys: 20 ms, total: 71.8 ms
Wall time: 1.09 s


Unnamed: 0,node1,label,node2,id
0,Q5,P31count,13870,Q5-P31count-9a9c3a
1,Q15221623,P31count,3177,Q15221623-P31count-61d8c4
2,Q11424,P31count,2136,Q11424-P31count-907bdc
3,Q4022,P31count,1550,Q4022-P31count-c27484
4,Q3918,P31count,815,Q3918-P31count-96da2f
...,...,...,...,...
5754,Q99566538,P31count,1,Q99566538-P31count-6b86b2
5755,Q99567847,P31count,1,Q99567847-P31count-6b86b2
5756,Q996839,P31count,1,Q996839-P31count-6b86b2
5757,Q99960791,P31count,1,Q99960791-P31count-6b86b2


Now that we saw the steps to create the graph with the counts, we want to output the results to a file using the `-o` option:

In [11]:
%%time
kgtk("""
    query -i all
        --match '(instance)-[:P31]->(class)'
        --return 'class as node1, "P31count" as label, count(distinct instance) as node2'
        --order-by 'cast(node2, int) desc'
    / add-id --id-style wikidata
    -o $OUT/metadata.p31.count.tsv
""")

CPU times: user 3.2 ms, sys: 10 ms, total: 13.2 ms
Wall time: 1.05 s


Confirm that the output file went to the right place:

In [12]:
!ls -l $OUT

total 129576
-rw-r--r--  1 pedroszekely  staff  29048633 Oct 10 09:19 derived.P1963computed.tsv
-rw-r--r--  1 pedroszekely  staff   4665158 Oct 10 09:16 derived.P31x.tsv
-rw-r--r--  1 pedroszekely  staff     32115 Oct 10 11:25 derived.Paward_count.tsv
-rw-r--r--  1 pedroszekely  staff  31641881 Oct 10 09:19 derived.Pproperty_domain.tsv
-rw-r--r--  1 pedroszekely  staff     30495 Oct 10 11:04 derived.Punits_used.tsv
-rw-r--r--  1 pedroszekely  staff    599740 Oct 10 09:16 metadata.p31.count.transitive.tsv
-rw-r--r--  1 pedroszekely  staff    260003 Oct 10 12:04 metadata.p31.count.tsv
-rw-r--r--  1 pedroszekely  staff     44254 Oct 10 09:17 metadata.p31x.count.transitive.tsv
drwxr-xr-x  7 pedroszekely  staff       224 Oct 10 12:04 [34mtemp.tutorial-profiling[m[m


Load the `P31count` graph in the KGTK cache so that we can use it in queries later

In [13]:
kgtk("""
    query -i $OUT/metadata.p31.count.tsv --as p31count --limit 2
""")

Unnamed: 0,node1,label,node2,id
0,Q5,P31count,13870,Q5-P31count-9a9c3a
1,Q15221623,P31count,3177,Q15221623-P31count-61d8c4


Summary of this section:
- In this section we computed the count of instances for every class in our KG.
- We illustrated the use of `instance of (P31)` to do queries.
- We illustrated common conventions to add identifiers to edges and to save results to files.

## Compute `P31count_transitive`, the count of instances of a class including the instances of all the subclasses

Approach:
- get the class of each instance
- get all the superclass of the class of each instance
- for every superclass, count all the instances

> This query will run at the scale of all Wikidata, which contains millions of classes

We add the labels to see the results, not surprisingly, `entity` has the most instances, and the top classes are those at the top of the Wikidata ontology:

In [14]:
%%time
kgtk("""
    query -i all
        --match '
            (instance)-[:P31]->(class),
            (class)-[:P279star]->(superclass)'
        --return 'superclass as class, count(distinct instance) as count'
        --order-by 'cast(count, int) desc'
    / add-labels
""")

CPU times: user 55.4 ms, sys: 21.1 ms, total: 76.5 ms
Wall time: 13 s


Unnamed: 0,class,count,class;label
0,Q35120,58187,'entity'@en
1,Q99527517,38311,'collection entity'@en
2,Q28813620,35497,'set'@en
3,Q16887380,35476,'group'@en
4,Q58415929,30782,'spatio-temporal entity'@en
...,...,...,...
8897,Q100166391,1,'salt production facility'@en
8898,Q1001059,1,'writ'@en
8899,Q1000660,1,'algebra over a field'@en
8900,Q100052008,1,'anthropomorphic Pantherinae'@en


Store the results in a file using a new property `P31count_transitive`

In [15]:
%%time
kgtk("""
    query -i all 
        --match '
            (instance)-[:P31]->(class),
            (class)-[:P279star]->(superclass)'
        --return 'superclass as node1, "P31count_transitive" as label, count(distinct instance) as node2'
        --order-by 'cast(node2, int) desc'
    / add-id --id-style wikidata
    -o $OUT/metadata.p31.count.transitive.tsv
""")

CPU times: user 6 ms, sys: 12.7 ms, total: 18.7 ms
Wall time: 9.99 s


Find the number of instances of `Q5: human`, `artist: Q483501` and `film director: Q2526255`. There are many instances of human, but only one of artist and zero of film director.

In [16]:
kgtk("""
    filter -i $OUT/metadata.p31.count.transitive.tsv -p "Q5, Q483501, Q2526255 ;;" / add-labels
""")

Unnamed: 0,node1,label,node2,id,node1;label
0,Q5,P31count_transitive,13941,Q5-P31count_transitive-76febd,'human'@en
1,Q483501,P31count_transitive,1,Q483501-P31count_transitive-6b86b2,'artist'@en


The reason there are no instances of `artist: Q483501` or `film director: Q2526255`  is that Wikidata uses the property `occupation: P106` to relate people to their occupations, so the connection between human and artist of director is not `instance of: P31`. It would be nice if the browser page for `artist: Q483501` or `film director: Q2526255` would show the number of people with this occupation. DBpedia uses a different model where humans are instances of artist or film director.


### Summary of this section
In this section we:
- Computed the count of instaces of every class, including all subclasses.
- Introduced `P279star`, the precomputed transitive closure of the Wikidata `subclass of (P279)` property that allows you to conveniently do queries over all super classes or subclasses of an entity.

## Define `P31x`, a generalization of `instance of: P31`

In our KG we are going to define a new property called `instance of (generalized): P31x` that behaves like DBpedia, so that we can ask for instances of `artist: Q483501`.
We do this by generalizing `occupation: P106` abd `position held: 39` to also behave as `P31` statements.

Approach:
- Combine `x P31 y`, `x P106 y` and `x P39 y` statements using a new `P31x` predicate

Use the `filter` to take a peek at the data and see whether our plan makes sense.

In [17]:
kgtk("""
    filter -i $item -p "; P39, P106 ;"
    / head
    / add-labels
""")

Unnamed: 0,node1,label,node2,id,node2;wikidatatype,node1;label,label;label,node2;label
0,Q1000048,P106,Q1622272,Q1000048-P106-Q1622272-3a1be6b5-0,wikibase-item,'Franz Zimmermann'@en,'occupation'@en,'university teacher'@en
1,Q1000048,P106,Q16267607,Q1000048-P106-Q16267607-e13e45d1-0,wikibase-item,'Franz Zimmermann'@en,'occupation'@en,'classical philologist'@en
2,Q100063874,P39,Q1162163,Q100063874-P39-Q1162163-ae076e77-0,wikibase-item,'Catherine Musson'@en,'position held'@en,'director'@en
3,Q100066085,P39,Q1162163,Q100066085-P39-Q1162163-93ac33fd-0,wikibase-item,'Anne-Laurence Mennessier'@en,'position held'@en,'director'@en
4,Q1001,P106,Q11774202,Q1001-P106-Q11774202-45d8eb34-0,wikibase-item,'Mahatma Gandhi'@en,'occupation'@en,'essayist'@en
5,Q1001,P106,Q17351648,Q1001-P106-Q17351648-e64838e9-0,wikibase-item,'Mahatma Gandhi'@en,'occupation'@en,'newspaper editor'@en
6,Q1001,P106,Q1930187,Q1001-P106-Q1930187-6cf568db-0,wikibase-item,'Mahatma Gandhi'@en,'occupation'@en,'journalist'@en
7,Q1001,P106,Q4964182,Q1001-P106-Q4964182-a0867b04-0,wikibase-item,'Mahatma Gandhi'@en,'occupation'@en,'philosopher'@en
8,Q1001,P106,Q808967,Q1001-P106-Q808967-57fe7a7e-0,wikibase-item,'Mahatma Gandhi'@en,'occupation'@en,'barrister'@en
9,Q100159381,P106,Q37226,Q100159381-P106-Q37226-d95f0b81-0,wikibase-item,'Victor Cherner'@en,'occupation'@en,'teacher'@en


Select all the `P31`, `P39` and `P106` statements and rewrite them as `P31x` statements, and also make sure that we do this only for humans:

In [18]:
kgtk("""
    query -i all
        --match '
            (n1)-[:P31]->(:Q5),
            (n1)-[r {label: property}]->(n2)'
        --where 'property in ["P106", "P39", "P31"]'
        --return 'distinct n1 as node1, "P31x" as label, n2 as node2'
        --limit 10
    / add-labels
""")

Unnamed: 0,node1,label,node2,node1;label,node2;label
0,Q1000048,P31x,Q1622272,'Franz Zimmermann'@en,'university teacher'@en
1,Q1000048,P31x,Q16267607,'Franz Zimmermann'@en,'classical philologist'@en
2,Q1000048,P31x,Q5,'Franz Zimmermann'@en,'human'@en
3,Q1000061,P31x,Q5,'Valentyn Symonenko'@en,'human'@en
4,Q100063874,P31x,Q5,'Catherine Musson'@en,'human'@en
5,Q100063874,P31x,Q1162163,'Catherine Musson'@en,'director'@en
6,Q100066085,P31x,Q5,'Anne-Laurence Mennessier'@en,'human'@en
7,Q100066085,P31x,Q1162163,'Anne-Laurence Mennessier'@en,'director'@en
8,Q1001,P31x,Q11774202,'Mahatma Gandhi'@en,'essayist'@en
9,Q1001,P31x,Q17351648,'Mahatma Gandhi'@en,'newspaper editor'@en


The query needs to be more sophisticated, because the previous query adds the extended `instance of` only to humans. If we don't do this, fictional characters that have occupations end up below `human (Q5)` due to the way the Wikidata ontology is structure. The fix is to concatenate (`cat`)the results of the previuos query with the original `instance of (P31)` graph and to deduplicate (`compact`).
The resulting graph goes in file `derived.P31x.tsv`:

In [19]:
%%time
kgtk("""
    query -i item
        --match '
            (n1)-[:P31]->(:Q5),
            (n1)-[r {label: property}]->(n2)'
        --where 'property in ["P106", "P39", "P31"]'
        --return 'distinct n1 as node1, "P31x" as label, n2 as node2'
    / add-id --id-style wikidata
    / cat -i - -i $p31
    / compact
    -o $OUT/derived.P31x.tsv
""")

CPU times: user 3.51 ms, sys: 10.4 ms, total: 13.9 ms
Wall time: 2.5 s


Load the `p31x` graph defining our generalized `instance of` property:

In [20]:
kgtk("""
    query -i $OUT/derived.P31x.tsv --as p31x --limit 2
""")

Unnamed: 0,node1,label,node2,id
0,P10,P31,Q18610173,P10-P31-Q18610173-85ef4d24-0
1,P1000,P31,Q18608871,P1000-P31-Q18608871-093affb5-0


Now we can fix our `P31count_transitive` property to also include classes such as `film director (Q2526255)`. Use the new `P31x` graph to substitute `P31x` for `P31` in our query that computes the class counts:

In [21]:
%%time
kgtk("""
    query -i all -i p31x
        --match '
            p31x: (instance)-[:P31x]->(class),
            all: (class)-[:P279star]->(superclass)'
        --return 'superclass as node1, "P31xcount_transitive" as label, count(distinct instance) as node2'
        --order-by 'cast(node2, int) desc'
    / add-id --id-style wikidata
    -o $OUT/metadata.p31x.count.transitive.tsv
""")

CPU times: user 3.67 ms, sys: 10.8 ms, total: 14.5 ms
Wall time: 2.44 s


Redo our query to get the number of instances of `Q5: human`, `artist: Q483501` and `film director: Q2526255`.
Now we get more reasonable counts for artist and film directors:

In [22]:
kgtk("""
    filter -i $OUT/metadata.p31x.count.transitive.tsv -p "Q5, Q483501, Q2526255 ;;" / add-labels
""")

Unnamed: 0,node1,label,node2,id,node1;label
0,Q5,P31xcount_transitive,10918,Q5-P31xcount_transitive-2bf374,'human'@en
1,Q483501,P31xcount_transitive,2526,Q483501-P31xcount_transitive-565330,'artist'@en
2,Q2526255,P31xcount_transitive,664,Q2526255-P31xcount_transitive-09eac9,'film director'@en


Find out the classes that appear in the new file that didn't appear in the old file. To do this we use the `ifnotexists` command that can be used to subtract the statements of one grpah from the statements from another graph.
> Some classes may appear in both graphs and have their counts updated (e.g., artists appeared with a count of 1 before):

In [23]:
kgtk("""
    ifnotexists -i $OUT/metadata.p31x.count.transitive.tsv
        --filter-on $OUT/metadata.p31.count.transitive.tsv
        --input-keys node1
        --filter-keys node1
    / add-labels
""")

Unnamed: 0,node1,label,node2,id,node1;label
0,Q713200,P31xcount_transitive,1890,Q713200-P31xcount_transitive-532530,'performing artist'@en
1,Q33999,P31xcount_transitive,1889,Q33999-P31xcount_transitive-285f8e,'actor'@en
2,Q15980804,P31xcount_transitive,1117,Q15980804-P31xcount_transitive-e1d9ce,'media professional'@en
3,Q3282637,P31xcount_transitive,877,Q3282637-P31xcount_transitive-30e26c,'film producer'@en
4,Q28389,P31xcount_transitive,801,Q28389-P31xcount_transitive-096012,'screenwriter'@en
...,...,...,...,...,...
229,Q66495020,P31xcount_transitive,1,Q66495020-P31xcount_transitive-6b86b2,'estate owner'@en
230,Q7141,P31xcount_transitive,1,Q7141-P31xcount_transitive-6b86b2,'cell biology'@en
231,Q856887,P31xcount_transitive,1,Q856887-P31xcount_transitive-6b86b2,'security guard'@en
232,Q957729,P31xcount_transitive,1,Q957729-P31xcount_transitive-6b86b2,'photojournalist'@en


### Summary of this section
In this section we:
- Computed  `P31x` representing our generalized instance of property. Results in `derived.P31x.tsv`.
- Computed `P31xcount_transitive` as a revision of `P31count_transitive` to also include counts via occupation and position held links. Results in `metadata.p31x.count.transitive.tsv`.
- Illustrated how to work with precomputed transitive closures (`P279star`), which enables KGTK to efficiently execute queries that otherwise would be very expensive

## Compute the number of times each property appears in a class

In this section we will compute the distribution of the use of properties in every class in th KG. 
We want to know the count of the different properties used in all instance of a class.
For example, if we look at `film (Q11424)` we want to see what properties are used to describe films, including all subclasses of film.

Computing this distirbution is challenging because as the query below shows, there are many classes in our KG:

In [24]:
kgtk("""
    query -i all --match '(entity)-[:P279]->(class)' --return 'count(distinct class) as `count of classes`'
""")

Unnamed: 0,count of classes
0,7431


Approach: we divide the task into two steps:
- For every entity, compute the set of properties used to describe it, and store this information in `item_properties.tsv`
- For every class, collect all the instances below it, and count the number of times each property appears in `item_properties.tsv`

The query for the first step is below. 
The first clause of the match clause gets the properties used in every instance of the KG.
I included a second clause to get the data type of the property, and used the `--where` clause to exlude properties with external identifiers, as there are so many of them, and for the tutorial we want the query to run faster.

In [25]:
%%time
kgtk("""
    query -i all
        --match '
            (entity)-[l {label: property}]->(),
            (property)-[:datatype]->(datatype)'
        --where 'datatype != "external-id"' 
        --return 'distinct entity as node1, "Phas_property" as label, property as node2'
    / add-labels
""")

CPU times: user 4.07 s, sys: 767 ms, total: 4.84 s
Wall time: 11.9 s


Unnamed: 0,node1,label,node2,node1;label,node2;label
0,P8874,Phas_property,P1001,'Hong Kong film rating'@en,'applies to jurisdiction'@en
1,Q1001543,Phas_property,P1001,"'Embassy of Finland, Budapest'@en",'applies to jurisdiction'@en
2,Q100325415,Phas_property,P1001,"'Embassy of Belarus, Budapest'@en",'applies to jurisdiction'@en
3,Q1005422,Phas_property,P1001,"'Federal Office of Bundeswehr Equipment, Infor...",'applies to jurisdiction'@en
4,Q1006360,Phas_property,P1001,'Bundesminister'@en,'applies to jurisdiction'@en
...,...,...,...,...,...
806344,Q7020999,Phas_property,P991,'2017 French presidential election'@en,'successful candidate'@en
806345,Q72251,Phas_property,P991,'1876 United States presidential election'@en,'successful candidate'@en
806346,Q72472,Phas_property,P991,'1892 United States presidential election'@en,'successful candidate'@en
806347,Q72835,Phas_property,P991,'1908 United States presidential election'@en,'successful candidate'@en


The results look good, so we add the identifiers to the edges and store the results in `item_properties.tsv`.

In [26]:
%%time
kgtk("""
    query -i all
        --match '
            (property)-[:datatype]->(datatype), 
            (entity)-[l {label: property}]->()'
        --where 'datatype != "external-id"' 
        --return 'distinct entity as node1, "Phas_property" as label, property as node2'
    / add-id --id-style wikidata
    -o $TEMP/item_properties.tsv
""")

CPU times: user 4.39 ms, sys: 18.5 ms, total: 22.9 ms
Wall time: 5.75 s


In the second step, we use `P279star` to get all the superclasses of each entity, and then look up the entity in the `item_properties` graph to find the properties it uses.
We invent a new property called `P1963computed` to store the counts. Wikidata has a property `properties for this type (P1963)` where editors can manually specify the properties that should be used to describe the instance of a class. We are computing the properties bottom up from the data, so we call the property `P1963computed`.

In the return clause, we list `superclass`, and the value of the `property` variable ahead of the `count` clause to tell KGTK that we want to aggregate by superclass and property. We reuse the Wikidata `quantity (P1114)` to record the counts:

> This query is very expensive to run on the full Wikidata as it touches every entity in Wikidata, but it will complete after many hours.

In [27]:
%%time
kgtk("""
    query -i all -i p31x -i $TEMP/item_properties.tsv
        --match ' 
            p31x: (entity)-[]->(class), 
            all: (class)-[:P279star]->(superclass),
            item_properties: (entity)-[l]->(property)'
        --return 'distinct superclass as node1, "P1963computed" as label, property as node2, count(distinct l) as P1114' \
        --order-by 'cast(P1114, int) desc'
        --limit 100
    / add-labels
""")

CPU times: user 25.2 ms, sys: 26.4 ms, total: 51.6 ms
Wall time: 58 s


Unnamed: 0,node1,label,node2,P1114,node1;label,node2;label
0,Q35120,P1963computed,P31,57569,'entity'@en,'instance of'@en
1,Q99527517,P1963computed,P31,37766,'collection entity'@en,'instance of'@en
2,Q28813620,P1963computed,P31,34979,'set'@en,'instance of'@en
3,Q16887380,P1963computed,P31,34958,'group'@en,'instance of'@en
4,Q58415929,P1963computed,P31,31357,'spatio-temporal entity'@en,'instance of'@en
...,...,...,...,...,...,...
95,Q56061,P1963computed,P31,10533,'administrative territorial entity'@en,'instance of'@en
96,Q4406616,P1963computed,P17,10452,'concrete object'@en,'country'@en
97,Q488383,P1963computed,P373,10326,'object'@en,'Commons category'@en
98,Q58416391,P1963computed,P571,10288,'spatial entity'@en,'inception'@en


The results look good, so we store them in `derived.P1963computed.tsv`

In [28]:
%%time
kgtk("""
    query -i all -i p31x -i $TEMP/item_properties.tsv
        --match ' 
            p31x: (entity)-[]->(class), 
            all: (class)-[:P279star]->(superclass),
            item_properties: (entity)-[l]->(property)'
        --return 'distinct superclass as node1, "P1963computed" as label, property as node2, count(distinct l) as P1114' 
    / add-id --id-style wikidata
    / normalize --add-id True
    -o $OUT/derived.P1963computed.tsv
""")

CPU times: user 19.8 ms, sys: 22.7 ms, total: 42.5 ms
Wall time: 50.7 s


Add the new graph to the databse anbd define alias `p1963computed` for it.

In [29]:
kgtk("""
    query -i $OUT/derived.P1963computed.tsv --as p1963computed --limit 10
""")

Unnamed: 0,node1,label,node2,id
0,Q100039327,P1963computed,P159,Q100039327-P1963computed-P159
1,Q100039327-P1963computed-P159,P1114,1,Q100039327-P1963computed-P159-P1114-1-0000
2,Q100039327,P1963computed,P17,Q100039327-P1963computed-P17
3,Q100039327-P1963computed-P17,P1114,1,Q100039327-P1963computed-P17-P1114-1-0000
4,Q100039327,P1963computed,P1813,Q100039327-P1963computed-P1813
5,Q100039327-P1963computed-P1813,P1114,1,Q100039327-P1963computed-P1813-P1114-1-0000
6,Q100039327,P1963computed,P31,Q100039327-P1963computed-P31
7,Q100039327-P1963computed-P31,P1114,1,Q100039327-P1963computed-P31-P1114-1-0000
8,Q100039327,P1963computed,P373,Q100039327-P1963computed-P373
9,Q100039327-P1963computed-P373,P1114,1,Q100039327-P1963computed-P373-P1114-1-0000


Let' see the distribution of properties for `film (Q11424)`:
> You can try it for `film director (Q2526255)` or `entity (Q35120)`, which gives you the distribution of all properties in the KG:

In [30]:
%%time
kgtk("""
    query -i p1963computed
        --match '
            (class:Q11424)-[l:P1963computed]->(property),
            (l)-[:P1114]->(quantity)'
        --return 'distinct class as class, property as property, quantity as count'
        --order-by 'cast(count, int) desc'
    / add-labels
""")

CPU times: user 5.65 ms, sys: 15.7 ms, total: 21.3 ms
Wall time: 1.64 s


Unnamed: 0,class,property,count,class;label,property;label
0,Q11424,P31,2447,'film'@en,'instance of'@en
1,Q11424,P577,1402,'film'@en,'publication date'@en
2,Q11424,P495,1398,'film'@en,'country of origin'@en
3,Q11424,P1476,1381,'film'@en,'title'@en
4,Q11424,P364,1368,'film'@en,'original language of film or TV show'@en
...,...,...,...,...,...
91,Q11424,P6251,1,'film'@en,'catchphrase'@en
92,Q11424,P641,1,'film'@en,'sport'@en
93,Q11424,P767,1,'film'@en,'contributor to the creative work or subject'@en
94,Q11424,P8411,1,'film'@en,'set in environment'@en


Store the resulting graph in `derived.Pproperty_domain.tsv` and define the alias `property_domain` for it in the database:

In [31]:
%%time
kgtk("""
    query -i p1963computed
        --match '
            (class)-[l:P1963computed]->(property),
            (l)-[:P1114]->(quantity)'
        --return 'distinct property as node1, "Pproperty_domain" as label, class as node2, quantity as P1114'
        --order-by 'property, cast(P1114, int) desc'
    / add-id --id-style wikidata
    / normalize --add-id True
    -o $OUT/derived.Pproperty_domain.tsv
""")

kgtk("query -i $OUT/derived.Pproperty_domain.tsv --as property_domain --limit 10")

CPU times: user 8.94 ms, sys: 24.7 ms, total: 33.7 ms
Wall time: 7.89 s


Unnamed: 0,node1,label,node2,id
0,P1001,Pproperty_domain,Q35120,P1001-Pproperty_domain-Q35120
1,P1001-Pproperty_domain-Q35120,P1114,2317,P1001-Pproperty_domain-Q35120-P1114-2317-0000
2,P1001,Pproperty_domain,Q99527517,P1001-Pproperty_domain-Q99527517
3,P1001-Pproperty_domain-Q99527517,P1114,1940,P1001-Pproperty_domain-Q99527517-P1114-1940-0000
4,P1001,Pproperty_domain,Q16889133,P1001-Pproperty_domain-Q16889133
5,P1001-Pproperty_domain-Q16889133,P1114,1866,P1001-Pproperty_domain-Q16889133-P1114-1866-0000
6,P1001,Pproperty_domain,Q16686448,P1001-Pproperty_domain-Q16686448
7,P1001-Pproperty_domain-Q16686448,P1114,1667,P1001-Pproperty_domain-Q16686448-P1114-1667-0000
8,P1001,Pproperty_domain,Q16887380,P1001-Pproperty_domain-Q16887380
9,P1001-Pproperty_domain-Q16887380,P1114,1410,P1001-Pproperty_domain-Q16887380-P1114-1410-0000


Let's see the distribution of classes for `cast member(P161)`. We restrict the results to be subclasses of `visual artwork (Q4502142)` because otherwise the results contain too many of the abstract classes. We see that property `cast member(P161)` is defined for film and subclasses of film:

In [32]:
kgtk("""
    query -i property_domain -i all
        --match '
            all: (class)-[:P279star]->(:Q4502142), 
            property_domain: (property:P161)-[l:Pproperty_domain]->(class),
            property_domain: (l)-[:P1114]->(quantity)'
        --return 'distinct property as node1, "Pproperty_domain" as label, class as node2, quantity as P1114'
        --order-by 'property, cast(P1114, int) desc'
        --limit 10
    / add-labels
""")

Unnamed: 0,node1,label,node2,P1114,node1;label,node2;label
0,P161,Pproperty_domain,Q11424,1133,'cast member'@en,'film'@en
1,P161,Pproperty_domain,Q4502142,1133,'cast member'@en,'visual artwork'@en
2,P161,Pproperty_domain,Q24869,38,'cast member'@en,'feature film'@en
3,P161,Pproperty_domain,Q229390,36,'cast member'@en,'3D film'@en
4,P161,Pproperty_domain,Q506240,17,'cast member'@en,'television film'@en
5,P161,Pproperty_domain,Q61283808,8,'cast member'@en,'Star Trek film'@en
6,P161,Pproperty_domain,Q202866,5,'cast member'@en,'animated film'@en
7,P161,Pproperty_domain,Q25110269,5,'cast member'@en,'live-action animated film'@en
8,P161,Pproperty_domain,Q517386,5,'cast member'@en,'live action'@en
9,P161,Pproperty_domain,Q24862,4,'cast member'@en,'short film'@en


### Summary of this section
In this section we:
- Computed  `P1963computed`, to record the frequence of the use of properties in every class.
- Used `P1963computed` to see the distribution of properties for a few classes.
- Illustrated the ability to break down very expensive queries into simpler steps.
- Illustrated a KGTK feature that allows you to use the results of one query as a new graph (`$TEMP/item_properties.tsv`) that can be integrated into other queries.

## Compute the distribution of units for quantity properties
This part of the tutorial illustrates how to work with KGTK structured literals:
- quantities: composed of a numeric value followed by the identifier of a unit, quantities can also define tolerances
- dates and times: composed of an ISO-formatted date, followed by a numeric precision indicator, and sometimes by a calendar
- monolingual strings: composed of a unicode string followed by a language tag

Additional documentation on the KGTK file format is in https://kgtk.readthedocs.io/en/latest/specification/
and documentation for the functions to operate on structured literals within queries is in https://kgtk.readthedocs.io/en/latest/transform/query/

Below is a specific example of how to query the units in structured literals. THe objective in the example is to compute a distribution of the units used in all properties that store quantities.
The query uses the `quantity` graph, which contains all properties whose values are quantities. 

The results of the query are interesting as we see some inconsistencies in the data present in our small subset of Wikidata. 
For example, most instances of `population (P1082)` have no units `point in time (Q186408)`, one has unit `Habitants (Q15621516)`, neither of which are units of `unit of measurement (Q47574)`

In [33]:
kgtk("""
    query -i quantity
        --match '(n1)-[l {label: property}]->(quantity)'
        --return 'distinct property as node1, "Pproperty_units_used" as label, kgtk_quantity_wd_units(quantity) as node2, count(distinct l) as P1114'
        --order-by 'property, cast(P1114, int) desc'
    / add-labels
""")

Unnamed: 0,node1,label,node2,P1114,node1;label,node2;label
0,P1081,Pproperty_units_used,,6809,'Human Development Index'@en,
1,P1082,Pproperty_units_used,,45873,'population'@en,
2,P1082,Pproperty_units_used,Q186408,2,'population'@en,'point in time'@en
3,P1082,Pproperty_units_used,Q15621516,1,'population'@en,'Habitants'@en
4,P1082,Pproperty_units_used,Q5727902,1,'population'@en,'circa'@en
...,...,...,...,...,...,...
329,P8476,Pproperty_units_used,,992,'BTI Governance Index'@en,
330,P8477,Pproperty_units_used,,970,'BTI Status Index'@en,
331,P8687,Pproperty_units_used,,5439,'social media followers'@en,
332,P8843,Pproperty_units_used,,201,'poverty incidence'@en,


We will store the units graph in `derived.Pproperty_units_used.tsv`. The final query includes a `where` clause to filter out the NULL values.

In [34]:
kgtk("""
    query -i quantity
        --match '(n1)-[l {label: property}]->(quantity)'
        --where 'kgtk_quantity_wd_units(quantity) IS NOT NULL'
        --return 'distinct property as node1, "Pproperty_units_used" as label, kgtk_quantity_wd_units(quantity) as node2, count(distinct l) as P1114'
        --order-by 'property, cast(P1114, int) desc'
    / add-id --id-style wikidata
    / normalize --add-id True
    -o $OUT/derived.Punits_used.tsv
""")

kgtk("query -i $OUT/derived.Pproperty_units_used.tsv --as property_units_used --limit 10")

[Errno 2] No such file or directory: '/Users/pedroszekely/Downloads/kypher/projects/tutorial-profiling/derived.Pproperty_units_used.tsv'




### Summary of this section
In this section we:
- Computed the distribution of the units used for properties that store quantities
- Found examples of inappropriate use of units of measure in Wikidata
- Illustrated how to use functions in `query` to extract elements from structured literals

## Compute the number of awards by sex or gender of the receiver

First, get a distirbution of the `sex or gender (P21)` of people in our graph.
The distribution is skewed, perhaps because it is skewed in Wikidata or a result of how the tutorial graph was constructed.

In [35]:
kgtk("""
    query -i all
        --match '
            (person)-[:P31]->(:Q5),
            (person)-[:P21]->(sex_or_gender)'
        --return 'distinct sex_or_gender as sex_or_gender, count(distinct person) as count'
    / add-labels
""")

Unnamed: 0,sex_or_gender,count,sex_or_gender;label
0,Q6581072,1783,'female'@en
1,Q6581097,8111,'male'@en


Below, we compute the distirbution of `sex or gender (P21)`  per type of award. We use the property `award received (P166)` to extract the awards that people received.

We create a new property `Paward_count` to record the count, and put the `sex or gender (P21)` as a qualifier.

In [36]:
%%time
kgtk("""
    query -i all
        --match '
            (actor)-[:P31]->(:Q5),
            (actor)-[:P21]->(sex_or_gender),
            (actor)-[:P166]->(award)-[:P31]->(award_type)'
        --return 'distinct award_type as node1, "Paward_count" as label, sex_or_gender as P21, count(distinct actor) as node2'
        --order-by 'award_type'
    / add-labels
""")

CPU times: user 6.83 ms, sys: 14.7 ms, total: 21.5 ms
Wall time: 1.82 s


Unnamed: 0,node1,label,P21,node2,node1;label,P21;label
0,Q101007233,Paward_count,Q6581097,1,'film critics association'@en,'male'@en
1,Q1011547,Paward_count,Q6581072,38,'Golden Globe Award'@en,'female'@en
2,Q1011547,Paward_count,Q6581097,42,'Golden Globe Award'@en,'male'@en
3,Q101251494,Paward_count,Q6581097,24,'star'@en,'male'@en
4,Q1044427,Paward_count,Q6581072,8,'Primetime Emmy Award'@en,'female'@en
...,...,...,...,...,...,...
220,Q96474707,Paward_count,Q6581097,16,'honorary award'@en,'male'@en
221,Q96474709,Paward_count,Q6581072,2,'award for best visual effects'@en,'female'@en
222,Q96474709,Paward_count,Q6581097,121,'award for best visual effects'@en,'male'@en
223,Q973011,Paward_count,Q6581097,18,'campaign medal'@en,'male'@en


Store the new `Paward_count` graph in a file and define the alias `award_count` for it

In [37]:
%%time
kgtk("""
    query -i all
        --match '
            (actor)-[:P31]->(:Q5),
            (actor)-[:P21]->(sex_or_gender),
            (actor)-[:P166]->(award)-[:P31]->(award_type)'
        --return 'distinct award_type as node1, "Paward_count" as label, sex_or_gender as P21, count(distinct actor) as node2'
        --order-by 'award_type'
    / add-id --id-style wikidata
    / normalize --add-id True
    -o $OUT/derived.Paward_count.tsv
""")

kgtk("query -i $OUT/derived.Paward_count.tsv --as award_count --limit 10")

CPU times: user 6.66 ms, sys: 21.7 ms, total: 28.3 ms
Wall time: 2.15 s


Unnamed: 0,node1,label,node2,id
0,Q101007233,Paward_count,1,Q101007233-Paward_count-6b86b2
1,Q101007233-Paward_count-6b86b2,P21,Q6581097,Q101007233-Paward_count-6b86b2-P21-Q6581097-0000
2,Q1011547,Paward_count,38,Q1011547-Paward_count-aea921
3,Q1011547-Paward_count-aea921,P21,Q6581072,Q1011547-Paward_count-aea921-P21-Q6581072-0000
4,Q1011547,Paward_count,42,Q1011547-Paward_count-73475c
5,Q1011547-Paward_count-73475c,P21,Q6581097,Q1011547-Paward_count-73475c-P21-Q6581097-0000
6,Q101251494,Paward_count,24,Q101251494-Paward_count-c23560
7,Q101251494-Paward_count-c23560,P21,Q6581097,Q101251494-Paward_count-c23560-P21-Q6581097-0000
8,Q1044427,Paward_count,8,Q1044427-Paward_count-2c6242
9,Q1044427-Paward_count-2c6242,P21,Q6581072,Q1044427-Paward_count-2c6242-P21-Q6581072-0000


### Summary of this section
In this section we:
- Profiled awards to find the gender or sex of awardees, and found that males appear more frequently. We don't know if it is a skew in Wikidata or the real world.
- Defined a new property to hold the data so that it can be shown in the browser.

In [38]:
kgtk("""
    query -i all
        --match '
            (award)-[P31]->(award_type)-[:P279star]->(:Q4220917)'
        --return 'distinct award_type as award_type'
    / add-labels
""")

Unnamed: 0,award_type,award_type;label
0,Q1011547,'Golden Globe Award'@en
1,Q106301,'Academy Award for Best Supporting Actress'@en
2,Q110145,'MTV Movie Awards'@en
3,Q1111310,'Directors Guild of America Award'@en
4,Q1131772,'Saturn Award for Best Science Fiction Film'@en
...,...,...
90,Q96474700,'award for best screenplay'@en
91,Q96474701,'award for best adapted screenplay'@en
92,Q96474704,'award for best makeup and hairdressing'@en
93,Q96474707,'honorary award'@en


# Deploy the results

Deplot the tutorial files after completing this notebook.

In [47]:
files_to_deploy = [
    "metadata.p31x.count.transitive.tsv",
    "derived.P31x.tsv",
    "derived.P1963computed.tsv",
    "derived.Pproperty_domain.tsv",
    "derived.Punits_used.tsv",
    "derived.Paward_count.tsv"
]

# First copy all the files from the add-derived-graphs, we will overwrite the ones that change, e.g., all.tsv
!cp -p {tutorial_deployment_path + "/arnold"}/*.tsv* {project_deployment_path}

for file in files_to_deploy:
    path = "$OUT/" + file
    !cp -p {path} {project_deployment_path} 

all_file_path = project_deployment_path + "/all.tsv.gz"
if os.path.exists(all_file_path):
    !rm {all_file_path}
!kgtk cat -i {tutorial_deployment_path + "/arnold/all.tsv.gz"} -i {project_deployment_path}/*.tsv -o {all_file_path}

List all the files:

In [48]:
!ls -l {project_deployment_path}

total 320200
-rw-r--r--  1 pedroszekely  staff   1342345 Oct 10 11:52 aliases.en.tsv.gz
-rw-r--r--  1 pedroszekely  staff  52030485 Oct 10 12:21 all.tsv.gz
-rw-r--r--  1 pedroszekely  staff  13620313 Oct 10 11:52 claims.external-id.tsv.gz
-rw-r--r--  1 pedroszekely  staff   1069769 Oct 10 11:52 claims.monolingualtext.tsv.gz
-rw-r--r--  1 pedroszekely  staff   1936951 Oct 10 11:52 claims.quantity.tsv.gz
-rw-r--r--  1 pedroszekely  staff   1095875 Oct 10 11:52 claims.string.tsv.gz
-rw-r--r--  1 pedroszekely  staff    781182 Oct 10 11:52 claims.time.tsv.gz
-rw-r--r--  1 pedroszekely  staff   6332200 Oct 10 11:52 claims.wikibase-item.tsv.gz
-rw-r--r--  1 pedroszekely  staff     97267 Oct 10 11:52 claims.wikibase-property.tsv.gz
-rw-r--r--  1 pedroszekely  staff  29048633 Oct 10 12:07 derived.P1963computed.tsv
-rw-r--r--  1 pedroszekely  staff    390973 Oct 10 11:53 derived.P279.tsv.gz
-rw-r--r--  1 pedroszekely  staff   3325552 Oct 10 11:54 derived.P279star.tsv.gz
-rw-r--r--  1 pedroszekel