# Add Derived Graphs To The Tutorial Graph



In [1]:
import io
import os
import subprocess
import sys

import numpy as np
import pandas as pd
from IPython.display import display, HTML

import papermill as pm

sys.path.insert(0,'../..')
from configure_kgtk_notebooks import ConfigureKGTK

from kgtk.functions import kgtk, kypher

In [2]:
# Parameters
# NOTE(review): this looks like the papermill "parameters" cell, so these values are
# presumably defaults that a caller can override -- confirm the cell is tagged.

# Root of the local KGTK checkout. Passed to ConfigureKGTK and later used to locate
# kgtk-properties/kgtk.properties.tsv.
kgtk_path = "/Users/pedroszekely/Documents/GitHub/kgtk"

# Folder that holds the input graph; all.tsv.gz is read from here.
input_path = "/Users/pedroszekely/Downloads/kypher/projects/build-tutorial"
# Folder on the local machine under which the project's output and temporary
# folders are created (in a subfolder named after project_name).
output_path = "/Users/pedroszekely/Downloads/kypher/projects"
project_name = "tutorial-derived-graphs"
# Destination folder the finished tutorial files are copied to.
tutorial_files_path = "/Users/pedroszekely/Documents/GitHub/kgtk-tutorial-files/datasets/arnold"

In [3]:
# Graph files (named without the .tsv.gz extension) that the later cells
# print as environment variables and load into the graph cache.
files = ["all"]

# Build the KGTK configuration helper and point it at this project's folders.
ck = ConfigureKGTK(kgtk_path=kgtk_path)
ck.configure_kgtk(
    input_graph_path=input_path,
    output_path=output_path,
    project_name=project_name,
)

User home: /Users/pedroszekely
Current dir: /Users/pedroszekely/Documents/GitHub/kgtk/tutorial/build-kg
KGTK dir: /Users/pedroszekely/Documents/GitHub/kgtk
Use-cases dir: /Users/pedroszekely/Documents/GitHub/kgtk/use-cases


In [4]:
# Print the KGTK environment variables (kgtk, OUT, TEMP, STORE, GRAPH, ...) together
# with the resolved path for each entry in `files`.
ck.print_env_variables(files)

kgtk: kgtk
USE_CASES_DIR: /Users/pedroszekely/Documents/GitHub/kgtk/use-cases
EXAMPLES_DIR: /Users/pedroszekely/Documents/GitHub/kgtk/examples
TEMP: /Users/pedroszekely/Downloads/kypher/projects/tutorial-derived-graphs/temp.tutorial-derived-graphs
OUT: /Users/pedroszekely/Downloads/kypher/projects/tutorial-derived-graphs
STORE: /Users/pedroszekely/Downloads/kypher/projects/tutorial-derived-graphs/temp.tutorial-derived-graphs/wikidata.sqlite3.db
GRAPH: /Users/pedroszekely/Downloads/kypher/projects/build-tutorial
kypher: kgtk query --graph-cache /Users/pedroszekely/Downloads/kypher/projects/tutorial-derived-graphs/temp.tutorial-derived-graphs/wikidata.sqlite3.db
all: /Users/pedroszekely/Downloads/kypher/projects/build-tutorial/all.tsv.gz


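Export the notebook parameters as environment variables, point KGTK at the graph cache and the English label file, and turn on debugging for kypher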

In [5]:
# Export everything the shell (!) and kgtk(...) cells below read from the environment.
os.environ.update({
    # Notebook parameters, so "$tutorial_files_path" / "$kgtk_path" resolve in commands.
    'tutorial_files_path': tutorial_files_path,
    'kgtk_path': kgtk_path,
    # Reuse the graph cache and label file locations derived from STORE / OUT.
    'KGTK_GRAPH_CACHE': os.environ['STORE'],
    'KGTK_LABEL_FILE': os.environ['OUT'] + "/parts/labels.en.tsv.gz",
    # Debug output for KGTK commands.
    'KGTK_OPTION_DEBUG': "true",
})

Load all my files into the kypher cache so that all graph aliases are defined

In [6]:
# Load every file in `files` into the graph cache under its alias (here: `all`).
ck.load_files_into_cache(file_list=files)

kgtk query --graph-cache /Users/pedroszekely/Downloads/kypher/projects/tutorial-derived-graphs/temp.tutorial-derived-graphs/wikidata.sqlite3.db -i "/Users/pedroszekely/Downloads/kypher/projects/build-tutorial/all.tsv.gz" --as all  --limit 3
[2021-10-10 11:51:26 query]: SQL Translation:
---------------------------------------------
  SELECT *
     FROM graph_27 AS graph_27_c1
     LIMIT ?
  PARAS: [3]
---------------------------------------------
node1	label	node2	id
P10	P31	Q18610173	P10-P31-Q18610173-85ef4d24-0
P1000	P31	Q18608871	P1000-P31-Q18608871-093affb5-0
P1001	P1647	P276	P1001-P1647-P276-e4e44f83-0


In [7]:
# Make the project output folder ($OUT) the working directory for the cells below.
%cd {os.environ['OUT']}

/Users/pedroszekely/Downloads/kypher/projects/tutorial-derived-graphs


## Run partition notebook

We need the parts to run the Useful Files notebook

In [8]:
# Run the partition notebook with papermill. The executed copy is written to $TEMP
# and the partitioned files go to $OUT/parts.
#
# The output-suppressing ";" must end the statement itself. On a line of its own,
# IPython reads ";" as its auto-quote escape and evaluates it to an empty string,
# which is why this cell used to display ''.
pm.execute_notebook(
    os.environ["EXAMPLES_DIR"] + "/partition-wikidata.ipynb",
    os.environ["TEMP"] + "/partition-wikidata.out.ipynb",
    parameters=dict(
        wikidata_input_path = input_path + "/all.tsv.gz",
        wikidata_parts_path = os.environ["OUT"] + "/parts",
        temp_folder_path = os.environ["OUT"] + "/parts/temp",
        # $OUT is passed through literally; presumably it is expanded by the shell
        # inside the partition notebook -- TODO confirm.
        sort_extras = "--buffer-size 30% --temporary-directory $OUT/parts/temp",
        verbose = False,
        gzip_command = 'gzip'
    )
);

Executing:   0%|          | 0/49 [00:00<?, ?cell/s]

''

Show the files after partition

In [9]:
# List the files that the partition notebook wrote to $OUT/parts.
!ls $OUT/parts

aliases.en.tsv.gz                   metadata.property.datatypes.tsv.gz
aliases.tsv.gz                      metadata.types.tsv.gz
all.tsv.gz                          qualifiers.commonsMedia.tsv.gz
claims.commonsMedia.tsv.gz          qualifiers.external-id.tsv.gz
claims.external-id.tsv.gz           qualifiers.geo-shape.tsv.gz
claims.geo-shape.tsv.gz             qualifiers.globe-coordinate.tsv.gz
claims.globe-coordinate.tsv.gz      qualifiers.math.tsv.gz
claims.math.tsv.gz                  qualifiers.monolingualtext.tsv.gz
claims.monolingualtext.tsv.gz       qualifiers.musical-notation.tsv.gz
claims.musical-notation.tsv.gz      qualifiers.quantity.tsv.gz
claims.other.tsv.gz                 qualifiers.string.tsv.gz
claims.quantity.tsv.gz              qualifiers.tabular-data.tsv.gz
claims.string.tsv.gz                qualifiers.time.tsv.gz
claims.tabular-data.tsv.gz          qualifiers.tsv.gz
claims.time.tsv.gz                  qualifiers.url.tsv.gz
claims.tsv.gz                       quali

Deploy the parts to `$tutorial_files_path`

## Run useful files notebook

In [None]:
# Run the "Wikidata Useful Files" notebook with papermill over the partitioned files
# in $OUT/parts. The executed copy is written to $TEMP.
#
# The output-suppressing ";" must end the statement itself. On a line of its own,
# IPython reads ";" as its auto-quote escape and evaluates it to an empty string
# instead of silencing the cell.
pm.execute_notebook(
    os.environ["USE_CASES_DIR"] + "/Wikidata Useful Files.ipynb",
    os.environ["TEMP"] + "/Wikidata Useful Files Out.ipynb",
    parameters=dict(
        output_path = os.environ["OUT"],
        output_folder = "useful_files",
        temp_folder = "temp.useful_files",
        wiki_root_folder = os.environ["OUT"] + "/parts/",
        cache_path = os.environ["TEMP"],
        languages = 'en',
        # Pagerank and degree files are consumed by the cells below; HITS is not.
        compute_pagerank = True,
        compute_degrees = True,
        compute_hits = False,
        delete_database = False,
        debug = "false"
    )
);

Executing:   0%|          | 0/157 [00:00<?, ?cell/s]

In [None]:
# List the files under $OUT/useful_files, the output folder given to the useful-files notebook.
!ls -l $OUT/useful_files

## Enhance pagerank files to include ordinal

Approach:
- Load the `directed_pagerank` from the metadata file into a dataframe (using kypher because somehow cat is broken, sigh)
- Sort the file by pagerank descending
- Add a new column with header `P1545` (ordinal) and store the ranks in this column
- Store the result in a temporary file.

In [None]:
%%time
directed_pagerank = kgtk("""
    query -i $OUT/useful_files/metadata.pagerank.directed.tsv.gz 
    --match '(n1)-[l:Pdirected_pagerank]->(pagerank)'
""")

directed_pagerank_sorted = directed_pagerank.sort_values("node2", ascending=False)
directed_pagerank_sorted.insert(0, 'P1545', range(1, 1 + len(directed_pagerank_sorted)))
directed_pagerank_sorted.to_csv(f"{os.environ['TEMP']}/directed-pagerank.ordinal.tsv", index=False, sep='\t')
directed_pagerank_sorted

The temporary file looks good, next steps:
- `normalize` to put the qualifiers as extra edges so the file has only `node1/label/node2/id`
- `add-id` as we want all edges to have ids

In [None]:
# Pipeline: normalize the temporary ordinal TSV, then add-id (wikidata id style),
# writing the result next to the other useful files.
kgtk("""
    normalize -i "$TEMP"/directed-pagerank.ordinal.tsv
    / add-id --id-style wikidata 
    -o "$OUT"/useful_files/metadata.pagerank.directed.ordinal.tsv.gz
""")

Look at the result to confirm that we are generating the data we want.

In [None]:
# Show the first edges of the generated file, piped through add-labels for readability.
# NOTE(review): add-labels presumably reads the label file from KGTK_LABEL_FILE -- confirm.
kgtk("""
    head -i "$OUT"/useful_files/metadata.pagerank.directed.ordinal.tsv.gz / add-labels
""")

Repeat the same steps for `undirected_pagerank`

In [None]:
%%time
undirected_pagerank = kgtk("""
    query -i $OUT/useful_files/metadata.pagerank.undirected.tsv.gz 
    --match '(n1)-[l:Pundirected_pagerank]->(pagerank)'
""")

undirected_pagerank = undirected_pagerank.sort_values("node2", ascending=False)
undirected_pagerank.insert(0, 'P1545', range(1, 1 + len(undirected_pagerank)))
undirected_pagerank.to_csv(f"{os.environ['TEMP']}/undirected-pagerank.ordinal.tsv", index=False, sep='\t')
undirected_pagerank

In [None]:
# Pipeline: normalize the temporary ordinal TSV, then add-id (wikidata id style),
# writing the result next to the other useful files.
kgtk("""
    normalize -i "$TEMP"/undirected-pagerank.ordinal.tsv
    / add-id --id-style wikidata 
    -o "$OUT"/useful_files/metadata.pagerank.undirected.ordinal.tsv.gz
""")

In [None]:
# Show the first edges of the generated file, piped through add-labels for readability.
# NOTE(review): add-labels presumably reads the label file from KGTK_LABEL_FILE -- confirm.
kgtk("""
    head -i "$OUT"/useful_files/metadata.pagerank.undirected.ordinal.tsv.gz / add-labels
""")

## Deploy the tutorial files to `$tutorial_files_path`

Define the files we want to have in the tutorial

In [None]:
# Files to deploy from $OUT/parts (written by the partition notebook).
tutorial_files_parts = [
    "labels.en.tsv.gz",
    "aliases.en.tsv.gz",
    "descriptions.en.tsv.gz",
    "claims.external-id.tsv.gz",
    "claims.monolingualtext.tsv.gz",
    "claims.quantity.tsv.gz",
    "claims.string.tsv.gz",
    "claims.time.tsv.gz",
    "claims.wikibase-item.tsv.gz",
    "claims.wikibase-property.tsv.gz",
    "qualifiers.tsv.gz"
]

# Files to deploy from $OUT/useful_files (written by the useful-files notebook).
# The pagerank files are not listed here: they are copied separately from the
# versions that include the ordinal.
tutorial_files_useful = [
    "derived.P279.tsv.gz",
    "derived.P279star.tsv.gz",
    "derived.P31.tsv.gz",
    "metadata.in_degree.tsv.gz",
    "metadata.out_degree.tsv.gz"
]

Deploy the files from the partition and useful notebooks. 

In [None]:
for file in tutorial_files_parts:
    path = "$OUT/parts/" + file
    !cp -p {path} $tutorial_files_path

for file in tutorial_files_useful:
    path = "$OUT/useful_files/" + file
    !cp -p {path} $tutorial_files_path

Overwrite the original pagerank files with the ones that include ordinal

In [None]:
# Deploy the ordinal-enriched pagerank files under the plain pagerank file names.
!cp -p $OUT/useful_files/metadata.pagerank.directed.ordinal.tsv.gz $tutorial_files_path/metadata.pagerank.directed.tsv.gz
!cp -p $OUT/useful_files/metadata.pagerank.undirected.ordinal.tsv.gz $tutorial_files_path/metadata.pagerank.undirected.tsv.gz 

It is important to deploy the custom KGTK properties file. Copy using KGTK to conveniently compress the file.

In [None]:
# Concatenate the custom KGTK properties with the property datatypes from the partition
# and write the result, gzip-compressed, into the tutorial folder.
kgtk("""
    cat 
        -i "$kgtk_path"/kgtk-properties/kgtk.properties.tsv 
        -i "$OUT"/parts/metadata.property.datatypes.tsv.gz
        -o "$tutorial_files_path"/metadata.property.datatypes.tsv.gz
""")

In [None]:
# List the deployed tutorial files.
!ls -l "$tutorial_files_path"

Create an `all.tsv.gz` file

In [None]:
%%time
all_file_path = os.environ['tutorial_files_path'] + "/all.tsv.gz"
# Remove any previous all.tsv.gz first: it lives in the same folder as the inputs, so
# the *.tsv.gz glob below would otherwise pick it up and concatenate it into itself.
if os.path.exists(all_file_path):
    !rm {all_file_path}
# Concatenate every deployed .tsv.gz file into a single graph file.
!kgtk cat -i "$tutorial_files_path"/*.tsv.gz -o {all_file_path}

Peek at the file

In [None]:
# Show the first edges of the combined graph file.
kgtk("""
    head -i "$tutorial_files_path"/all.tsv.gz
""")

Run the KGTK validator on the new knowledge graph

In [None]:
%%time
# Validate the combined graph with checks relaxed for Wikidata-style data:
# language-qualified strings are allowed and the year range limits are ignored.
!kgtk validate -i "$tutorial_files_path"/all.tsv.gz \
    --allow-wikidata-lq-strings True \
    --ignore-minimum-year True \
    --ignore-maximum-year True

### Somehow `kgtk cat` is broken, as it returns nothing. 
The command works if I invoke it before running the config cells at the top, but stops working after I run the config cells, very strange.

In [None]:
# Debugging: read the file directly with zcat, bypassing kgtk, to confirm it has content.
!zcat < $OUT/useful_files/metadata.pagerank.directed.tsv.gz | head

In [None]:
# Debugging: the same file read with `kgtk cat` through the Python kgtk() helper.
kgtk("""
    cat -i  $OUT/useful_files/metadata.pagerank.directed.tsv.gz 
""")

In [None]:
# Debugging: the same file read with `kgtk cat` from the shell, using the absolute path
# instead of $OUT.
# NOTE(review): the hard-coded path matches the default OUT folder only; presumably it is
# spelled out to rule out variable expansion as the cause -- confirm before parameterizing.
!kgtk cat -i /Users/pedroszekely/Downloads/kypher/projects/tutorial-derived-graphs/useful_files/metadata.pagerank.directed.tsv.gz  | head