# Profile The Tutorial Graph



In [1]:
import io
import os
import subprocess
import sys

import numpy as np
import pandas as pd
from IPython.display import display, HTML

import papermill as pm

sys.path.insert(0,'..')
from configure_kgtk_notebooks import ConfigureKGTK

from kgtk.functions import kgtk, kypher

In [2]:
# Parameters

kgtk_path = "/Users/pedroszekely/Documents/GitHub/kgtk"

# Folder on local machine where to create the output and temporary folders
input_path = "/Users/pedroszekely/Documents/GitHub/kgtk-tutorial-files/datasets/arnold"
output_path = "/Users/pedroszekely/Downloads/kypher/projects"
project_name = "tutorial-profiling"

These are all the files that we have, but I am tempted to just use the `all` file as it helps to keep the tutorial simpler

In [3]:
files = [
    "all",
    "label",
    "alias",
    "description",
    "external_id",
    "monolingualtext",
    "quantity",
    "string",
    "time",
    "item",
    "wikibase_property",
    "qualifiers",
    "datatypes",
    "p279",
    "p279star",
    "p31",
    "in_degree",
    "out_degree",
    "pagerank_directed",
    "pagerank_undirected"
]
ck = ConfigureKGTK(kgtk_path=kgtk_path)
ck.configure_kgtk(input_graph_path=input_path,
                  output_path=output_path,
                  project_name=project_name)

User home: /Users/pedroszekely
Current dir: /Users/pedroszekely/Documents/GitHub/kgtk/tutorial
KGTK dir: /Users/pedroszekely/Documents/GitHub/kgtk
Use-cases dir: /Users/pedroszekely/Documents/GitHub/kgtk/use-cases


In [4]:
ck.print_env_variables(files)

TEMP: /Users/pedroszekely/Downloads/kypher/projects/tutorial-profiling/temp.tutorial-profiling
EXAMPLES_DIR: /Users/pedroszekely/Documents/GitHub/kgtk/examples
USE_CASES_DIR: /Users/pedroszekely/Documents/GitHub/kgtk/use-cases
GRAPH: /Users/pedroszekely/Documents/GitHub/kgtk-tutorial-files/datasets/arnold
kypher: kgtk query --graph-cache /Users/pedroszekely/Downloads/kypher/projects/tutorial-profiling/temp.tutorial-profiling/wikidata.sqlite3.db
OUT: /Users/pedroszekely/Downloads/kypher/projects/tutorial-profiling
STORE: /Users/pedroszekely/Downloads/kypher/projects/tutorial-profiling/temp.tutorial-profiling/wikidata.sqlite3.db
kgtk: kgtk
all: /Users/pedroszekely/Documents/GitHub/kgtk-tutorial-files/datasets/arnold/all.tsv.gz
label: /Users/pedroszekely/Documents/GitHub/kgtk-tutorial-files/datasets/arnold/labels.en.tsv.gz
alias: /Users/pedroszekely/Documents/GitHub/kgtk-tutorial-files/datasets/arnold/aliases.en.tsv.gz
description: /Users/pedroszekely/Documents/GitHub/kgtk-tutorial-files/

Set up defaults KGTK

In [5]:
os.environ['kgtk_path'] = kgtk_path
os.environ['KGTK_GRAPH_CACHE'] = os.environ['STORE']
os.environ['KGTK_LABEL_FILE'] = input_path + "/labels.en.tsv.gz"
os.environ['KGTK_OPTION_DEBUG'] = "false"

Load all my files into the kypher cache so that all graph aliases are defined

In [6]:
%%time
ck.load_files_into_cache(file_list=files)

kgtk query --graph-cache /Users/pedroszekely/Downloads/kypher/projects/tutorial-profiling/temp.tutorial-profiling/wikidata.sqlite3.db -i "/Users/pedroszekely/Documents/GitHub/kgtk-tutorial-files/datasets/arnold/all.tsv.gz" --as all  -i "/Users/pedroszekely/Documents/GitHub/kgtk-tutorial-files/datasets/arnold/labels.en.tsv.gz" --as label  -i "/Users/pedroszekely/Documents/GitHub/kgtk-tutorial-files/datasets/arnold/aliases.en.tsv.gz" --as alias  -i "/Users/pedroszekely/Documents/GitHub/kgtk-tutorial-files/datasets/arnold/descriptions.en.tsv.gz" --as description  -i "/Users/pedroszekely/Documents/GitHub/kgtk-tutorial-files/datasets/arnold/claims.external-id.tsv.gz" --as external_id  -i "/Users/pedroszekely/Documents/GitHub/kgtk-tutorial-files/datasets/arnold/claims.monolingualtext.tsv.gz" --as monolingualtext  -i "/Users/pedroszekely/Documents/GitHub/kgtk-tutorial-files/datasets/arnold/claims.quantity.tsv.gz" --as quantity  -i "/Users/pedroszekely/Documents/GitHub/kgtk-tutorial-files/data

In [7]:
%cd {os.environ['OUT']}

/Users/pedroszekely/Downloads/kypher/projects/tutorial-profiling


## Get instance counts



We can compute the instance counts by retrieving all statements that use `instance of (P31)` and counting the instances for each class

In [8]:
%%time
kgtk("""
    query -i all
        --match '(instance)-[:P31]->(class)'
        --return 'class as class, count(distinct instance) as count'
        --order-by 'cast(count, int) desc'
        --limit 10 
    / add-labels
""")

CPU times: user 8.03 ms, sys: 12.6 ms, total: 20.6 ms
Wall time: 1.45 s


Unnamed: 0,class,count,class;label
0,Q5,10918,'human'@en
1,Q15221623,3176,'bilateral relation'@en
2,Q11424,2126,'film'@en
3,Q4022,1547,'river'@en
4,Q3918,778,'university'@en
5,Q3917681,613,'embassy'@en
6,Q1549591,590,'big city'@en
7,Q19595382,583,'Wikidata property for authority control for p...
8,Q11862829,530,'academic discipline'@en
9,Q15632617,493,'fictional human'@en


We want to add the profiling data back into the KG so that we can use it in queries and look at it in the browser.
To do so, we create a KGTK graph by using `node1, label, node2` as column headers:

In [9]:
%%time
kgtk("""
    query -i all
        --match '(instance)-[:P31]->(class)'
        --return 'class as node1, "P31_count" as label, count(distinct instance) as node2'
        --order-by 'cast(node2, int) desc'
    --limit 10 
""")

CPU times: user 5.36 ms, sys: 11.4 ms, total: 16.8 ms
Wall time: 674 ms


Unnamed: 0,node1,label,node2
0,Q5,P31_count,10918
1,Q15221623,P31_count,3176
2,Q11424,P31_count,2126
3,Q4022,P31_count,1547
4,Q3918,P31_count,778
5,Q3917681,P31_count,613
6,Q1549591,P31_count,590
7,Q19595382,P31_count,583
8,Q11862829,P31_count,530
9,Q15632617,P31_count,493


It is good practice to add identifiers to the edges so that we can add qualifiers later if we desire. To add the identifiers, we chain the query output to the `add-id` command:

In [10]:
%%time
kgtk("""
    query -i all
        --match '(instance)-[:P31]->(class)'
        --return 'class as node1, "P31count" as label, count(distinct instance) as node2'
        --order-by 'cast(node2, int) desc' 
    / add-id --id-style wikidata
""")

CPU times: user 46.5 ms, sys: 18.7 ms, total: 65.2 ms
Wall time: 1.16 s


Unnamed: 0,node1,label,node2,id
0,Q5,P31count,10918,Q5-P31count-2bf374
1,Q15221623,P31count,3176,Q15221623-P31count-73e7f3
2,Q11424,P31count,2126,Q11424-P31count-d8adfb
3,Q4022,P31count,1547,Q4022-P31count-05fb3c
4,Q3918,P31count,778,Q3918-P31count-93411f
...,...,...,...,...
4965,Q996839,P31count,1,Q996839-P31count-6b86b2
4966,Q99934885,P31count,1,Q99934885-P31count-6b86b2
4967,Q99935030,P31count,1,Q99935030-P31count-6b86b2
4968,Q99960791,P31count,1,Q99960791-P31count-6b86b2


Now that we saw the steps to create the graph with the counts, we want to output the results to a file using the `-o` option:

In [11]:
%%time
kgtk("""
    query -i all
        --match '(instance)-[:P31]->(class)'
        --return 'class as node1, "P31count" as label, count(distinct instance) as node2'
        --order-by 'cast(node2, int) desc'
    / add-id --id-style wikidata
    -o $OUT/metadata.p31.count.tsv
""")

CPU times: user 3.17 ms, sys: 10.1 ms, total: 13.3 ms
Wall time: 1.08 s


Confirm that the output file went to the right place:

In [12]:
!ls -l $OUT

total 3208
-rw-r--r--  1 pedroszekely  staff  741783 Oct  8 22:42 derived.P31x.tsv
-rw-r--r--  1 pedroszekely  staff  528980 Oct  8 21:45 metadata.p31.count.transitive.tsv
-rw-r--r--  1 pedroszekely  staff  224219 Oct  8 23:00 metadata.p31.count.tsv
-rw-r--r--  1 pedroszekely  staff   44254 Oct  8 22:59 metadata.p31x.count.transitive.tsv
drwxr-xr-x  6 pedroszekely  staff     192 Oct  8 23:00 [34mtemp.tutorial-profiling[m[m


Load the `P31count` graph in the KGTK cache so that we can use it in queries later

In [13]:
kgtk("""
    query -i $OUT/metadata.p31.count.tsv --as p31count --limit 2
""")

Unnamed: 0,node1,label,node2,id
0,Q5,P31count,10918,Q5-P31count-2bf374
1,Q15221623,P31count,3176,Q15221623-P31count-73e7f3


## Compute `P31count_transitive`, the count of instances of a class including the instances of all the subclasses

Approach:
- get the class of each instance
- get all the superclass of the class of each instance
- for every superclass, count all the instances

> This query will run at the scale of all Wikidata, which contains millions of classes

We add the labels to see the results, not surprisingly, `entity` has the most instances, and the top classes are those at the top of the Wikidata ontology:

In [14]:
%%time
kgtk("""
    query -i all
        --match '
            (instance)-[:P31]->(class),
            (class)-[:P279star]->(superclass)'
        --return 'superclass as class, count(distinct instance) as count'
        --order-by 'cast(count, int) desc'
    / add-labels
""")

CPU times: user 50.8 ms, sys: 21.7 ms, total: 72.5 ms
Wall time: 10.4 s


Unnamed: 0,class,count,class;label
0,Q35120,49231,'entity'@en
1,Q99527517,30567,'collection entity'@en
2,Q28813620,28116,'set'@en
3,Q16887380,28102,'group'@en
4,Q58415929,27411,'spatio-temporal entity'@en
...,...,...,...
7849,Q100166391,1,'salt production facility'@en
7850,Q1001059,1,'writ'@en
7851,Q1000660,1,'algebra over a field'@en
7852,Q100052008,1,'anthropomorphic Pantherinae'@en


Store the results in a file using a new property `P31count_transitive`

In [15]:
%%time
kgtk("""
    query -i all 
        --match '
            (instance)-[:P31]->(class),
            (class)-[:P279star]->(superclass)'
        --return 'superclass as node1, "P31count_transitive" as label, count(distinct instance) as node2'
        --order-by 'cast(node2, int) desc'
    / add-id --id-style wikidata
    -o $OUT/metadata.p31.count.transitive.tsv
""")

CPU times: user 6.49 ms, sys: 12.9 ms, total: 19.3 ms
Wall time: 8.88 s


Find the number of instances of `Q5: human`, `artist: Q483501` and `film director: Q2526255`. There are many instances of human, but only one of artist and zero of film director.

In [16]:
kgtk("""
    filter -i $OUT/metadata.p31.count.transitive.tsv -p "Q5, Q483501, Q2526255 ;;" / add-labels
""")

Unnamed: 0,node1,label,node2,id,node1;label
0,Q5,P31count_transitive,10985,Q5-P31count_transitive-189d78,'human'@en
1,Q483501,P31count_transitive,1,Q483501-P31count_transitive-6b86b2,'artist'@en


The reason there are no instances of `artist: Q483501` or `film director: Q2526255`  is that Wikidata uses the property `occupation: P106` to relate people to their occupations, so the connection between human and artist of director is not `instance of: P31`. It would be nice if the browser page for `artist: Q483501` or `film director: Q2526255` would show the number of people with this occupation. DBpedia uses a different model where humans are instances of artist or film director.


## Define `P31x`, a generalization of `instance of: P31`

In our KG we are going to define a new property called `instance of (generalized): P31x` that behaves like DBpedia, so that we can ask for instances of `artist: Q483501`.
We do this by generalizing `occupation: P106` abd `position held: 39` to also behave as `P31` statements.

Approach:
- Combine `x P31 y`, `x P106 y` and `x P39 y` statements using a new `P31x` predicate

Use the `filter` to take a peek at the data and see whether our plan makes sense.

In [17]:
kgtk("""
    filter -i $item -p "; P39, P106 ;"
    / head
    / add-labels
""")

Unnamed: 0,node1,label,node2,id,node2;wikidatatype,node1;label,label;label,node2;label
0,Q1000048,P106,Q1622272,Q1000048-P106-Q1622272-3a1be6b5-0,wikibase-item,'Franz Zimmermann'@en,'occupation'@en,'university teacher'@en
1,Q1000048,P106,Q16267607,Q1000048-P106-Q16267607-e13e45d1-0,wikibase-item,'Franz Zimmermann'@en,'occupation'@en,'classical philologist'@en
2,Q1001,P106,Q808967,Q1001-P106-Q808967-57fe7a7e-0,wikibase-item,'Mahatma Gandhi'@en,'occupation'@en,'barrister'@en
3,Q100252,P106,Q1622272,Q100252-P106-Q1622272-e0e6768c-0,wikibase-item,'Johann Nepomuk von Ringseis'@en,'occupation'@en,'university teacher'@en
4,Q100749,P106,Q1622272,Q100749-P106-Q1622272-1960268e-0,wikibase-item,'Clemens Fuest'@en,'occupation'@en,'university teacher'@en
5,Q100749,P106,Q188094,Q100749-P106-Q188094-8ae31483-0,wikibase-item,'Clemens Fuest'@en,'occupation'@en,'economist'@en
6,Q100948,P106,Q15839134,Q100948-P106-Q15839134-1fd212b2-0,wikibase-item,'Rachel Carson'@en,'occupation'@en,'ecologist'@en
7,Q1010297,P106,Q1622272,Q1010297-P106-Q1622272-e72eab9d-0,wikibase-item,'Burchard Brentjes'@en,'occupation'@en,'university teacher'@en
8,Q1010297,P39,Q11827483,Q1010297-P39-Q11827483-f6439c09-0,wikibase-item,'Burchard Brentjes'@en,'position held'@en,'Оrdinary professor'@en
9,Q101268,P106,Q1622272,Q101268-P106-Q1622272-b7682325-0,wikibase-item,'August Thiersch'@en,'occupation'@en,'university teacher'@en


Select all the `P31`, `P39` and `P106` statements and rewrite them as `P31x` statements, and also make sure that we do this only for humans:

In [18]:
kgtk("""
    query -i all
        --match '
            (n1)-[:P31]->(:Q5),
            (n1)-[r {label: property}]->(n2)'
        --where 'property in ["P106", "P39", "P31"]'
        --return 'distinct n1 as node1, "P31x" as label, n2 as node2'
        --limit 10
    / add-labels
""")

Unnamed: 0,node1,label,node2,node1;label,node2;label
0,Q1000048,P31x,Q1622272,'Franz Zimmermann'@en,'university teacher'@en
1,Q1000048,P31x,Q16267607,'Franz Zimmermann'@en,'classical philologist'@en
2,Q1000048,P31x,Q5,'Franz Zimmermann'@en,'human'@en
3,Q1001,P31x,Q808967,'Mahatma Gandhi'@en,'barrister'@en
4,Q1001,P31x,Q5,'Mahatma Gandhi'@en,'human'@en
5,Q100148353,P31x,Q5,'Jacqueline Goddet'@en,'human'@en
6,Q100153947,P31x,Q5,'Cathaline Parker Widdoes'@en,'human'@en
7,Q100153956,P31x,Q5,'Ned Chase'@en,'human'@en
8,Q100252,P31x,Q1622272,'Johann Nepomuk von Ringseis'@en,'university teacher'@en
9,Q100252,P31x,Q5,'Johann Nepomuk von Ringseis'@en,'human'@en


The test query looks correct, so write the complete dataset into a graph in file `derived.P31x.tsv`:

In [19]:
%%time
kgtk("""
    query -i item
        --match '
            (n1)-[:P31]->(:Q5),
            (n1)-[r {label: property}]->(n2)'
        --where 'property in ["P106", "P39", "P31"]'
        --return 'distinct n1 as node1, "P31x" as label, n2 as node2'
    / add-id --id-style wikidata
    -o $OUT/derived.P31x.tsv
""")

CPU times: user 3.1 ms, sys: 9.95 ms, total: 13 ms
Wall time: 1.3 s


Load the `p31x` graph defining our generalized `instance of` property:

In [20]:
kgtk("""
    query -i $OUT/derived.P31x.tsv --as p31x --limit 2
""")

Unnamed: 0,node1,label,node2,id
0,Q1000048,P31x,Q1622272,Q1000048-P31x-Q1622272
1,Q1000048,P31x,Q16267607,Q1000048-P31x-Q16267607


Use the new `P31x` graph to substitute `P31x` for `P31` in our query that computes the class counts:

In [21]:
%%time
kgtk("""
    query -i all -i p31x
        --match '
            p31x: (instance)-[:P31x]->(class),
            all: (class)-[:P279star]->(superclass)'
        --return 'superclass as node1, "P31xcount_transitive" as label, count(distinct instance) as node2'
        --order-by 'cast(node2, int) desc'
    / add-id --id-style wikidata
    -o $OUT/metadata.p31x.count.transitive.tsv
""")

CPU times: user 3.43 ms, sys: 10.3 ms, total: 13.8 ms
Wall time: 2.3 s


Redo our query to get the number of instances of `Q5: human`, `artist: Q483501` and `film director: Q2526255`.
Now we get more reasonable counts for artist and film directors:

In [22]:
kgtk("""
    filter -i $OUT/metadata.p31x.count.transitive.tsv -p "Q5, Q483501, Q2526255 ;;" / add-labels
""")

Unnamed: 0,node1,label,node2,id,node1;label
0,Q5,P31xcount_transitive,10918,Q5-P31xcount_transitive-2bf374,'human'@en
1,Q483501,P31xcount_transitive,2526,Q483501-P31xcount_transitive-565330,'artist'@en
2,Q2526255,P31xcount_transitive,664,Q2526255-P31xcount_transitive-09eac9,'film director'@en


Find out the classes that appear in the new file that didn't appear in the old file. To do this we use the `ifnotexists` command that can be used to subtract the statements of one grpah from the statements from another graph.
> Some classes may appear in both graphs and have their counts updated (e.g., artists appeared with a count of 1 before):

In [23]:
kgtk("""
    ifnotexists -i $OUT/metadata.p31x.count.transitive.tsv
        --filter-on $OUT/metadata.p31.count.transitive.tsv
        --input-keys node1
        --filter-keys node1
    / add-labels
""")

Unnamed: 0,node1,label,node2,id,node1;label
0,Q713200,P31xcount_transitive,1890,Q713200-P31xcount_transitive-532530,'performing artist'@en
1,Q33999,P31xcount_transitive,1889,Q33999-P31xcount_transitive-285f8e,'actor'@en
2,Q15980804,P31xcount_transitive,1117,Q15980804-P31xcount_transitive-e1d9ce,'media professional'@en
3,Q131524,P31xcount_transitive,877,Q131524-P31xcount_transitive-30e26c,'entrepreneur'@en
4,Q13235160,P31xcount_transitive,877,Q13235160-P31xcount_transitive-30e26c,'producer'@en
...,...,...,...,...,...
232,Q66495020,P31xcount_transitive,1,Q66495020-P31xcount_transitive-6b86b2,'estate owner'@en
233,Q7141,P31xcount_transitive,1,Q7141-P31xcount_transitive-6b86b2,'cell biology'@en
234,Q856887,P31xcount_transitive,1,Q856887-P31xcount_transitive-6b86b2,'security guard'@en
235,Q957729,P31xcount_transitive,1,Q957729-P31xcount_transitive-6b86b2,'photojournalist'@en


### Summary
In this section we:
- Computed  `P31x` representing our generalized instance of property. Results in `derived.P31x.tsv`.
- Computed `P31xcount_transitive` as a revision of `P31count_transitive` to also include counts via occupation and position held links. Results in `metadata.p31x.count.transitive.tsv`.

## Compute the number of times each property appears in a class

## Compute the distribution of units for quantity properties

## Compute the distribution of years of birthdays

## Compute the number of awards each genre of movie has received

## Compute the gender of award winners, by award type

`film award
(Q4220917)`

In [29]:
kgtk("""
    query -i all
        --match '
            (award)-[P31]->(award_type)-[:P279star]->(:Q4220917)'
        --return 'distinct award_type as award_type'
    / add-labels
""")

Unnamed: 0,award_type,award_type;label
0,Q1011547,'Golden Globe Award'@en
1,Q106301,'Academy Award for Best Supporting Actress'@en
2,Q110145,'MTV Movie Awards'@en
3,Q1111310,'Directors Guild of America Award'@en
4,Q1131772,'Saturn Award for Best Science Fiction Film'@en
...,...,...
90,Q96474700,'award for best screenplay'@en
91,Q96474701,'award for best adapted screenplay'@en
92,Q96474704,'award for best makeup and hairdressing'@en
93,Q96474707,'honorary award'@en
