# Getty Subgraph Tutorial
In this notebook, we build a subgraph based on a set of Getty ULAN explicit files and use it in the hunger-for-knowledge procedure for place of birth in Wikidata and Getty.

In [1]:
import os
import json
import pandas as pd

from kgtk.functions import kgtk, kypher

## Step 0: Set up environment paths

In [2]:
# Export every file/folder path as an environment variable so that the KGTK
# shell commands in the later cells can refer to them as $NAME.
kgtk_environment_variables = []


def register_env(name, value):
    """Set os.environ[name] to value and remember name for the summary printout."""
    os.environ[name] = value
    kgtk_environment_variables.append(name)


# Folder where the database files are stored. The default is the original
# location; set HFK_DATA_PATH to point the notebook at another copy of the data.
# os.path.join(..., "") guarantees the trailing separator that the string
# concatenations below rely on.
data_path = os.path.join(
    os.environ.get("HFK_DATA_PATH", "/nas/home/bohuizha/KG/hunger-for-knowledge/data/"),
    "",
)
register_env('DATABASE', data_path)

# Wikidata edges: the claims file (original note: "all is much less than claims")
register_env('WIKIDATA', data_path + "claims.tsv")

# Label file of Wikidata
register_env('KGTK_LABEL_FILE', data_path + "labels.en.tsv")

# P31
register_env('P31', data_path + "P31.tsv")

# P279star
register_env('P279STAR', data_path + "P279star.tsv")

# Folder of ULAN
ulan_path = data_path + "gvp/ULAN/"
register_env('ULAN', ulan_path)

# Full file provided by Getty ULAN, contains all explicit files
# register_env('ULAN_FULL', ulan_path + "full.tsv")

# Explicit file Subjects
register_env('ULAN_SUBJECTS', ulan_path + "Subjects.tsv")

# Explicit file Terms
register_env('ULAN_TERMS', ulan_path + "Terms.tsv")

# Explicit file AgentMap
register_env('AGENTMAP', ulan_path + "AgentMap.tsv")

# Explicit file Biographies
register_env('BIOGRAPHIES', ulan_path + "Biographies.tsv")

# ULAN <-> Wikidata alignment files
ulan_wikialign_path = ulan_path + "wiki.align.tsv"
register_env('ULAN_ALIGN', ulan_wikialign_path)
register_env('ULAN_FULL_ALIGN', ulan_path + "full.align.tsv")
register_env('ULAN_FULL_ALIGN_ID', ulan_path + "full.align.id.tsv")

# Folder of TGN
tgn_path = data_path + "gvp/TGN/"
register_env('TGN', tgn_path)
register_env('TGN_FULL', tgn_path + "full.tsv")
register_env('TGN_ALIGN', tgn_path + "wiki.align.tsv")
register_env('TGN_FULL_ALIGN', tgn_path + "full.align.tsv")

# Output folder. makedirs(exist_ok=True) also creates missing parent folders
# and is a no-op when the folder already exists.
subgraph_path = data_path + "gvp/subgraph/"
os.makedirs(subgraph_path, exist_ok=True)
register_env('SUBGRAPH', subgraph_path)

# Output files, exported as $<KEY IN UPPER CASE>.
# "subjects_sample" and "agentmap_sample" are required by Steps 1-3, which
# read/write $SUBJECTS_SAMPLE and $AGENTMAP_SAMPLE.
output_names = {
    "wiki_pob": "wiki.pob.tsv",
    "wiki_unknown": "wiki.unknown.tsv",
    "ulan_subgraph": "subgraph.ulan.tsv",
    "ulan_subgraph_id": "subgraph.ulan.id.tsv",
    "wiki_align_sample": "wiki.align.sample.tsv",
    "subjects_sample": "subjects.sample.tsv",
    "subjects_sample_1": "subjects.sample.1.tsv",
    "subjects_sample_2": "subjects.sample.2.tsv",
    "agentmap_sample": "agentmap.sample.tsv",
    "agentmap_sample_1": "agentmap.sample.1.tsv",
    "agentmap_sample_2": "agentmap.sample.2.tsv",
    "mapbio_sample": "mapbio.sample.tsv",
    "bio_sample": "bio.sample.tsv",
    "pobmap_sample": "pobmap.sample.tsv",
    "pairs": "pairs.tsv",
    "paths": "paths.tsv",
    "paths_label": "paths.label.tsv",
    "getty_pob": "getty.pob.tsv",
    "getty_mapped_pob": "getty.mapped.tsv",
    "correct": "correct.tsv",
    "unknown": "unknown.tsv"
}

for key, value in output_names.items():
    register_env(key.upper(), os.path.join(subgraph_path, value))

# Echo everything that was exported so the configuration is visible in the output.
for variable in kgtk_environment_variables:
    print("{}: \"{}\"".format(variable, os.environ[variable]))

DATABASE: "/nas/home/bohuizha/KG/hunger-for-knowledge/data/"
ULAN: "/nas/home/bohuizha/KG/hunger-for-knowledge/data/gvp/ULAN/"
ULAN_SUBJECTS: "/nas/home/bohuizha/KG/hunger-for-knowledge/data/gvp/ULAN/Subjects.tsv"
ULAN_TERMS: "/nas/home/bohuizha/KG/hunger-for-knowledge/data/gvp/ULAN/Terms.tsv"
AGENTMAP: "/nas/home/bohuizha/KG/hunger-for-knowledge/data/gvp/ULAN/AgentMap.tsv"
BIOGRAPHIES: "/nas/home/bohuizha/KG/hunger-for-knowledge/data/gvp/ULAN/Biographies.tsv"
ULAN_ALIGN: "/nas/home/bohuizha/KG/hunger-for-knowledge/data/gvp/ULAN/wiki.align.tsv"
ULAN_FULL_ALIGN: "/nas/home/bohuizha/KG/hunger-for-knowledge/data/gvp/ULAN/full.align.tsv"
ULAN_FULL_ALIGN_ID: "/nas/home/bohuizha/KG/hunger-for-knowledge/data/gvp/ULAN/full.align.id.tsv"
TGN: "/nas/home/bohuizha/KG/hunger-for-knowledge/data/gvp/TGN/"
TGN_FULL: "/nas/home/bohuizha/KG/hunger-for-knowledge/data/gvp/TGN/full.tsv"
TGN_ALIGN: "/nas/home/bohuizha/KG/hunger-for-knowledge/data/gvp/TGN/wiki.align.tsv"
TGN_FULL_ALIGN: "/nas/home/bohuizha/

## Step 1: Sample ULAN IDs
Sample ULAN IDs that appear in both the ULAN Subjects file and the Wikidata alignment file:

In [3]:
%%time
# Join the two inputs on a shared `ulanid` node1: the `w:` clause only requires
# that the ID has some outgoing edge in the alignment graph, the `s:` clause
# collects its outgoing edges from the Subjects graph. The distinct
# (ulanid, edge label, node2) rows are written to $SUBJECTS_SAMPLE.
# The `w:`/`s:` graph handles are expected to resolve to $ULAN_ALIGN and
# $ULAN_SUBJECTS respectively.
# NOTE(review): --limit caps the number of returned rows, not the number of
# ULAN IDs, and there is no --order-by, so which rows end up in the sample is
# presumably not guaranteed to be stable across runs - confirm.
kgtk("""
    query -i $ULAN_ALIGN $ULAN_SUBJECTS 
        --match 'w: (ulanid)-[]->(), s: (ulanid)-[p]->(v)'
        --return 'distinct ulanid, p.label, v'
        --limit 2500
        -o $SUBJECTS_SAMPLE
    """)

CPU times: user 7.18 ms, sys: 21.1 ms, total: 28.3 ms
Wall time: 3.12 s


Count the distinct ULAN IDs in the sample:

In [4]:
# Count the distinct node1 values (ULAN IDs) among the sampled Subjects edges.
# No -o is given, so the one-row result is returned and displayed by the cell.
kgtk("""
    query -i $SUBJECTS_SAMPLE
        --match '(ulanid)-[]->()'
        --return 'count(distinct ulanid) as count'
    """)

Unnamed: 0,count
0,278


## Step 2: Collect Information in Separate Files

### AgentMap

In [5]:
%%time
# For every ULAN ID that occurs as node1 in the Subjects sample (`s:`), collect
# its outgoing edges from the AgentMap graph (`a:`) and write the distinct
# (ulanid, edge label, node2) rows to $AGENTMAP_SAMPLE.
# The trailing backslash inside the (non-raw) triple-quoted string is a Python
# line continuation, so both match clauses reach kgtk on a single line.
kgtk("""
    query -i $SUBJECTS_SAMPLE $AGENTMAP 
        --match 's: (ulanid)-[]->(), \
                 a: (ulanid)-[p]->(v)'
        --return 'distinct ulanid, p.label, v'
        -o $AGENTMAP_SAMPLE
    """)

CPU times: user 4.93 ms, sys: 11.2 ms, total: 16.1 ms
Wall time: 2.24 s


### Biographies

In [124]:
%%time
# First hop into Biographies: take every node2 of the AgentMap sample
# (`agentmap:` clause, any edge label) and collect the Biographies edges that
# start at that node. Output columns are renamed to node1/label/node2 and
# written to $MAPBIO_SAMPLE.
# NOTE(review): the agentmap clause does not restrict the edge label, so every
# node2 value of the sample is used as a join key - confirm this is intended.
kgtk("""
    query -i $AGENTMAP_SAMPLE $BIOGRAPHIES 
        --match 'agentmap: ()-[]->(ulanagent), \
                 Biographies: (ulanagent)-[p]->(v)'
        --return 'distinct ulanagent as node1, p.label as label, v as node2'
        -o $MAPBIO_SAMPLE
    """)

CPU times: user 4.93 ms, sys: 11.1 ms, total: 16 ms
Wall time: 2.38 s


In [125]:
%%time
# Second hop into Biographies: take every node2 of $MAPBIO_SAMPLE (`mapbio:`
# clause, any edge label) and collect the Biographies edges that start at that
# node. The distinct node1/label/node2 rows are written to $BIO_SAMPLE.
kgtk("""
    query -i $MAPBIO_SAMPLE $BIOGRAPHIES 
        --match 'mapbio: ()-[]->(ulanbio), \
                 Biographies: (ulanbio)-[p]->(v)'
        --return 'distinct ulanbio as node1, p.label as label, v as node2'
        -o $BIO_SAMPLE
    """)

CPU times: user 3.28 ms, sys: 13.5 ms, total: 16.8 ms
Wall time: 2.88 s


Check which birthplaces are actually present in this subgraph:

In [126]:
%%time
# Sanity check: list the distinct edges of $BIO_SAMPLE whose label is
# "schema:birthPlace". No -o is given, so the result is returned and displayed
# by the cell instead of being written to a file.
kgtk("""
    query -i $BIO_SAMPLE
        --match '(ulanbio)-[p]->(v)'
        --where 'p.label = "schema:birthPlace"'
        --return 'distinct ulanbio as node1, p.label as label, v as node2'
    """)

CPU times: user 53.2 ms, sys: 20.4 ms, total: 73.6 ms
Wall time: 2.57 s


Unnamed: 0,node1,label,node2
0,ulan_bio:4000000001,schema:birthPlace,tgn:7010794-place
1,ulan_bio:4000000003,schema:birthPlace,tgn:7010794-place
2,ulan_bio:4000000004,schema:birthPlace,tgn:7008546-place
3,ulan_bio:4000000009,schema:birthPlace,tgn:7007269-place
4,ulan_bio:4000000018,schema:birthPlace,tgn:7008775-place
...,...,...,...
3421,ulan_bio:4000711145,schema:birthPlace,tgn:1000062-place
3422,ulan_bio:4000711209,schema:birthPlace,tgn:7012392-place
3423,ulan_bio:4000722129,schema:birthPlace,tgn:7013628-place
3424,ulan_bio:4000724344,schema:birthPlace,tgn:7003980-place


### TGN-Wikidata Alignment

In [127]:
%%time
# Take every node2 of $BIO_SAMPLE (`b:` clause, any edge label) and collect the
# edges of the TGN-Wikidata alignment graph (`w:` clause) that start at that
# node. The distinct node1/label/node2 rows are written to $POBMAP_SAMPLE.
kgtk("""
    query -i $BIO_SAMPLE $TGN_ALIGN 
        --match 'b: ()-[]->(tgnplace), \
                 w: (tgnplace)-[p]->(v)'
        --return 'distinct tgnplace as node1, p.label as label, v as node2'
        -o $POBMAP_SAMPLE
    """)

CPU times: user 4.84 ms, sys: 8.93 ms, total: 13.8 ms
Wall time: 2.26 s


## Step 3: Concatenate into a Subgraph and Add IDs

Concatenate:

In [129]:
%%time
# Concatenate the five sample files produced in Steps 1-2 into a single
# subgraph file, $ULAN_SUBGRAPH.
kgtk("""
    cat -i $SUBJECTS_SAMPLE $AGENTMAP_SAMPLE $MAPBIO_SAMPLE $BIO_SAMPLE $POBMAP_SAMPLE
        -o $ULAN_SUBGRAPH
    """)

CPU times: user 5.32 ms, sys: 8.54 ms, total: 13.9 ms
Wall time: 2.35 s


Add ids:

In [130]:
%%time
# Add an id column to the subgraph and write it to $ULAN_SUBGRAPH_ID. With
# these options the ids are built from node1, label and node2 joined by ":"
# (e.g. "tgn:7008038-place:skos:exactMatch:Q90" in the Step 6 output); Step 6
# parses the property label back out of these ids.
kgtk("""
    add-id -i $ULAN_SUBGRAPH --id-style node1-label-node2 --id-separator ":" -o $ULAN_SUBGRAPH_ID
    """)

CPU times: user 2.39 ms, sys: 11.6 ms, total: 14 ms
Wall time: 2.66 s


In [159]:
# Number of lines in the subgraph file with ids (wc counts every line, so a header line is included).
!wc -l $ULAN_SUBGRAPH_ID

12228876 /nas/home/bohuizha/KG/hunger-for-knowledge/data/gvp/subgraph/subgraph.ulan.id.tsv


## Step 4: Build Pairs (from Wikidata Results)

In [134]:
%%time
# Build the (source, target) pairs for the path search: follow each ULAN ID to
# its aligned Wikidata node (`w:` clause on the alignment graph), then follow
# that node's P19 edge (`a:` clause on the Wikidata claims graph) to `pob`.
# The distinct (ulanid, pob) rows are written to $PAIRS with the column names
# `source` and `target`, which Step 5 refers to.
# NOTE(review): the alignment clause does not restrict the edge label - confirm
# that every edge in $ULAN_ALIGN is an alignment edge.
kgtk("""
    query -i $ULAN_ALIGN $WIKIDATA
        --match 'w: (ulanid)-[]->(qnode), 
                 a: (qnode)-[:P19]->(pob)' 
        --return 'distinct ulanid as source, pob as target'
        -o $PAIRS
    """)

CPU times: user 8.16 ms, sys: 11.2 ms, total: 19.4 ms
Wall time: 13.6 s


## Step 5: `kgtk paths` Query

In [161]:
%%time
# Search $ULAN_SUBGRAPH_ID for paths of at most 4 hops between each
# source/target pair listed in $PAIRS (columns `source` and `target`, as
# written by Step 4) and store the result in $PATHS.
# WARNING: the recorded run of this cell took about 12h 52min of wall time, so
# avoid re-running it unless $PAIRS or the subgraph changed.
# NOTE(review): --path-mode NONE and --statistics-only are passed through to
# `kgtk paths` as-is; check their exact meaning in the KGTK documentation.
kgtk("""
    paths --max_hops 4
        --path-file $PAIRS
        --path-mode NONE 
        --path-source source
        --path-target target
        -i $ULAN_SUBGRAPH_ID
        --statistics-only
        -o $PATHS
    """)

CPU times: user 27.2 s, sys: 13.5 s, total: 40.7 s
Wall time: 12h 51min 56s


## Step 6: Select Top 1 Property Chain

In [162]:
# Load the tab-separated path edges written by `kgtk paths` in Step 5.
# In the recorded output, node1 is the path id, label the hop index and
# node2 the id of the edge traversed at that hop.
paths_file = os.environ['PATHS']
paths = pd.read_csv(paths_file, sep='\t')
paths

Unnamed: 0,node1,label,node2,id
0,p0,0,ulan:500023949:foaf:focus:ulan:500023949-agent,p0-0-0
1,p0,1,ulan:500023949-agent:gvp:biographyPreferred:ul...,p0-1-1
2,p0,2,ulan_bio:4000061264:schema:birthPlace:tgn:7008...,p0-2-2
3,p0,3,tgn:7008038-place:skos:exactMatch:Q90,p0-3-3
4,p1,0,ulan:500023949:foaf:focus:ulan:500023949-agent,p1-0-4
...,...,...,...,...
114715,p28678,3,tgn:7004446-place:skos:exactMatch:Q365,p28678-3-114715
114716,p28679,0,ulan:500026801:foaf:focus:ulan:500026801-agent,p28679-0-114716
114717,p28679,1,ulan:500026801-agent:gvp:biographyNonPreferred...,p28679-1-114717
114718,p28679,2,ulan_bio:4000704540:schema:birthPlace:tgn:7004...,p28679-2-114718


In [163]:
def edge_id_to_property(edge_id):
    """Return the property label embedded in a ':'-joined `node1:label:node2` edge id.

    The label is taken as the 3rd and 4th ':'-separated tokens, e.g.
    'tgn:7008038-place:skos:exactMatch:Q90' -> 'skos:exactMatch'.
    Values with fewer than three tokens are returned unchanged, so a value that
    has already been reduced to a bare label (e.g. 'foaf:focus') is kept as-is
    and re-running this cell does not blank out the column.

    NOTE(review): this assumes node1 contains exactly one ':' (prefix:local-name)
    and that the label has the form prefix:name - confirm for all Getty edges.
    """
    tokens = edge_id.split(':')
    if len(tokens) < 3:
        return edge_id
    return ':'.join(tokens[2:4])


# Replace each traversed edge id by the property label it contains.
paths['node2'] = paths['node2'].map(edge_id_to_property)
paths

Unnamed: 0,node1,label,node2,id
0,p0,0,foaf:focus,p0-0-0
1,p0,1,gvp:biographyPreferred,p0-1-1
2,p0,2,schema:birthPlace,p0-2-2
3,p0,3,skos:exactMatch,p0-3-3
4,p1,0,foaf:focus,p1-0-4
...,...,...,...,...
114715,p28678,3,skos:exactMatch,p28678-3-114715
114716,p28679,0,foaf:focus,p28679-0-114716
114717,p28679,1,gvp:biographyNonPreferred,p28679-1-114717
114718,p28679,2,schema:birthPlace,p28679-2-114718


In [164]:
# Persist the label-only path table as a tab-separated file (without the
# pandas index column) at the location exported as $PATHS_LABEL.
paths_label_file = os.environ['PATHS_LABEL']
paths.to_csv(paths_label_file, sep='\t', index=False)

In [165]:
# Preview the first lines of the saved file, aligned into columns on the tab separator.
!head $PATHS_LABEL | column -ts $'\t'

node1  label  node2                      id
p0     0      foaf:focus                 p0-0-0
p0     1      gvp:biographyPreferred     p0-1-1
p0     2      schema:birthPlace          p0-2-2
p0     3      skos:exactMatch            p0-3-3
p1     0      foaf:focus                 p1-0-4
p1     1      gvp:biographyNonPreferred  p1-1-5
p1     2      schema:deathPlace          p1-2-6
p1     3      skos:exactMatch            p1-3-7
p2     0      foaf:focus                 p2-0-8


In [166]:
# Collapse every path (one group per path id in node1) into a single string:
# its property labels joined by spaces, in the row order of `paths`.
# Counting identical strings then ranks the property chains by frequency.
labels_by_path = paths.groupby('node1')
paths_concat = labels_by_path.agg({'node2': ' '.join})
paths_concat.value_counts()

node2                                                                 
foaf:focus gvp:biographyPreferred schema:birthPlace skos:exactMatch       13978
foaf:focus gvp:biographyNonPreferred schema:birthPlace skos:exactMatch     7932
foaf:focus gvp:biographyPreferred schema:deathPlace skos:exactMatch        4737
foaf:focus gvp:biographyNonPreferred schema:deathPlace skos:exactMatch     2033
dtype: int64

## Step 7: Query Getty Using the Property Chain

Path: `(ulanid)-[foaf:focus]->()-[gvp:biographyPreferred]->()-[schema:birthPlace]->()-[skos:exactMatch]->(Qnode of PoB)`

In [170]:
%%time
# Apply the most frequent property chain from Step 6 to the subgraph: follow
# foaf:focus -> gvp:biographyPreferred -> schema:birthPlace -> skos:exactMatch
# from each ULAN ID and emit one "birthplace" edge (ulanid -> pob) per distinct
# result into $GETTY_POB.
kgtk("""
    query -i $ULAN_SUBGRAPH_ID
        --match '(ulanid)-[p0]->()-[p1]->()-[p2]->()-[p3]->(pob)'
        --where 'p0.label = "foaf:focus" AND
                 p1.label = "gvp:biographyPreferred" AND 
                 p2.label = "schema:birthPlace" AND
                 p3.label = "skos:exactMatch"'
        --return 'distinct ulanid as node1, "birthplace" as label, pob as node2'
        -o $GETTY_POB
    """)

CPU times: user 7.6 ms, sys: 20.4 ms, total: 28 ms
Wall time: 10.1 s


## Step 8: Record and Validate Results

Map back to Wikidata

In [None]:
%%time
# Replace the ULAN ID of every Getty birthplace edge (`g:` clause) by the node
# it points to in the ULAN-Wikidata alignment graph (`w:` clause), keeping the
# edge label and the place. Result: distinct (qnode, label, pob) rows in
# $GETTY_MAPPED_POB.
kgtk("""
    query -i $GETTY_POB $ULAN_ALIGN
        --match 'g: (ulanid)-[p]->(pob), 
                 w: (ulanid)-[]->(qnode)'
        --return 'distinct qnode as node1, p.label as label, pob as node2'
        -o $GETTY_MAPPED_POB
    """)

In [None]:
# Count the distinct node1 values (Wikidata nodes) that received a birthplace
# from Getty. No -o is given, so the result is returned and displayed by the cell.
kgtk("""
    query -i $GETTY_MAPPED_POB
        --match '(qnode)-[]->(pob)'
        --return 'count(distinct qnode)'
    """)

Validate with Wikidata Constraints

In [None]:
%%time
# Keep only the mapped birthplace edges (`g:` clause) whose node2 has an edge
# in the P31 graph to some class `nodex`, where `nodex` in turn has an edge in
# the P279star graph to one of the Q-nodes listed in --where. The surviving
# distinct edges are written to $CORRECT.
# NOTE(review): the six hard-coded Q-nodes are presumably the classes allowed
# as the value type of place of birth - document where the list comes from.
# The trailing backslashes are Python line continuations inside the
# (non-raw) triple-quoted string, so kgtk receives the command on one line.
kgtk("""
    query -i $GETTY_MAPPED_POB $P31 $P279STAR \
        --match 'g: (node1)-[nodeProp]->(node2), P31: (node2)-[]->(nodex), P279star: (nodex)-[]->(par)' \
        --where 'par in ["Q618123", "Q3895768", "Q24334893", "Q2221906", "Q102496", "Q27096213"]' \
        --return 'distinct node1 as `node1`, nodeProp.label as `label`, node2 as `node2`' \
        -o $CORRECT
    """)

In [None]:
%%time
# Compare $GETTY_MAPPED_POB against $CORRECT on the (node1, node2) key pair;
# `ifnotexists` is expected to keep the mapped edges that have no counterpart
# in $CORRECT, i.e. those that did not pass the validation above.
# No -o is given, so the result is returned and displayed by the cell.
# NOTE(review): the setup cell exports $UNKNOWN (unknown.tsv) but nothing in
# this notebook writes it - consider whether this result should be saved there.
kgtk("""
    ifnotexists -i $GETTY_MAPPED_POB \
        --filter-on $CORRECT \
        --input-keys node1 node2 \
        --filter-keys node1 node2
    """)