# Class Visualization



### Preamble: set up the environment and files used in the tutorial

In [None]:
import io
import os
import subprocess
import sys

import numpy as np
import pandas as pd
from IPython.display import display, HTML

from graph_tool.all import *

from kgtk.configure_kgtk_notebooks import ConfigureKGTK
from kgtk.functions import kgtk, kypher

In [None]:
# Parameters
# Notebook-level settings; the cells below read these names.

# Path handed to ConfigureKGTK as `kgtk_path`.
kgtk_path = "/Users/amandeep/GitHub/kgtk"

# Folder on local machine where to create the output and temporary folders.
# Each path used to be assigned twice in a row; only the second (effective)
# value is kept, so the configuration is unchanged.
# NOTE(review): these are absolute, machine-specific paths — override them when
# running on another machine.
input_path = "/data/amandeep/wikidata-20211027-dwd-v3"
output_path = "/data/amandeep/wikidata-20211027-dwd-v3"
project_name = "class-visualization"

# Passed to configure_kgtk as `graph_cache_path`.
graph_cache_path = None

# Comma-separated file keys; the next cell splits this into a list.
files = "p279,p279star,label"
# Comma-separated keys to load into the cache; None means "use `files`".
files_for_cache = None
# Gates the inspection cells (`if debug:`) throughout the notebook.
debug = False

In [None]:
# Turn the comma-separated parameter strings into lists.
# The isinstance guards make the cell safe to re-run: on a second run `files`
# is already a list and calling .split on it would raise AttributeError.
if isinstance(files, str):
    files = files.split(',')

# With no explicit cache list, cache exactly the files being configured.
if files_for_cache is None:
    files_for_cache = files
elif isinstance(files_for_cache, str):
    files_for_cache = files_for_cache.split(",")

Our Wikidata distribution partitions the knowledge in Wikidata into smaller files that make it possible for you to pick and choose which files you want to use. Our tutorial KG is a subset of Wikidata, and is partitioned in the same way as the full Wikidata. The following is a partial list of all the files:

In [None]:
# Configure KGTK for this project using the notebook parameters.
# `debug` now follows the notebook-level `debug` parameter instead of being
# hard-coded to True, so a single switch controls the verbosity.
ck = ConfigureKGTK(files, kgtk_path=kgtk_path)
ck.configure_kgtk(input_graph_path=input_path,
                  output_path=output_path,
                  project_name=project_name,
                  graph_cache_path=graph_cache_path,
                  debug=debug
                 )

The KGTK setup command defines environment variables for all the files so that you can reuse the Jupyter notebook when you install it on your local machine.

In [None]:
# Show the environment variables set up by the configuration step.
ck.print_env_variables()

In [None]:
# Load the files listed in `files_for_cache` into the cache.
ck.load_files_into_cache(files=files_for_cache)

In [None]:
# Query p279 with index mode `monograph`, showing at most 5 rows.
# NOTE(review): presumably run mainly to build the index — confirm.
!kgtk --debug query -i p279 --idx mode:monograph --limit 5

In [None]:
# Query p279star with index mode `monograph`, showing at most 5 rows.
!kgtk --debug query -i p279star --idx mode:monograph --limit 5

In [None]:
# Query label with index mode `monograph`, showing at most 5 rows.
!kgtk --debug query -i label --idx mode:monograph --limit 5

## Get a list of all the classes


First get a list of all the `node1` in p279

In [None]:
# Write the distinct source nodes of p279 edges (the subclasses) to
# $TEMP/p279.node1.tsv.gz, in a single column named `id`.
kgtk("""
    query -i p279
        --match '(class)-[]->()'
        --return 'distinct class as id'
    -o $TEMP/p279.node1.tsv.gz
""")

In [None]:
# Debug only: line count of the node1 file.
if debug:
    !zcat < $TEMP/p279.node1.tsv.gz | wc -l

Now get a list of all the node2 in p279

In [None]:
# Write the distinct target nodes of p279 edges (the superclasses) to
# $TEMP/p279.node2.tsv.gz, in a single column named `id`.
kgtk("""
    query -i p279
        --match '()-[]->(class)'
        --return 'distinct class as id'
    -o $TEMP/p279.node2.tsv.gz
""")

In [None]:
# Debug only: line count of the node2 file.
if debug:
    !zcat < $TEMP/p279.node2.tsv.gz | wc -l

In [None]:
# Keep the node2 ids that do not occur in the node1 file, i.e. classes that
# appear only as a superclass and never as a subclass. Both files are matched
# on their `id` column.
kgtk("""
    ifnotexists --mode NONE 
        -i $TEMP/p279.node2.tsv.gz
        --filter-on $TEMP/p279.node1.tsv.gz
        --input-keys id
        --filter-keys id
    -o $TEMP/p279.classes-that-are-not-subclasses.tsv.gz
""")

In [None]:
# Debug only: line count of the classes-that-are-not-subclasses file.
if debug:
    !zcat < $TEMP/p279.classes-that-are-not-subclasses.tsv.gz | wc -l

In [None]:
# Debug only: first 25 classes that are never a subclass, with labels.
# A call nested inside `if` is not the cell's last expression, so Jupyter does
# not render its return value; display() shows it explicitly.
if debug:
    display(kgtk("head -i $TEMP/p279.classes-that-are-not-subclasses.tsv.gz -n 25 / add-labels"))

Concatenate the files to get a list of all the classes

In [None]:
# Concatenate the node1 ids with the never-a-subclass ids and sort by `id`;
# the result, $OUT/classes.tsv.gz, is the list of all classes.
kgtk("""
    cat --mode NONE -i $TEMP/p279.node1.tsv.gz -i $TEMP/p279.classes-that-are-not-subclasses.tsv.gz
    / sort --mode NONE --column id
    -o $OUT/classes.tsv.gz
""")

In [None]:
# Debug only: line count of the class list.
if debug:
    !zcat < $OUT/classes.tsv.gz | wc -l

## Measure the degree of classes

In [None]:
# Compute in/out degree statistics for the p279 graph.
# Pagerank and HITS are switched off; degrees are emitted as Pindegree and
# Poutdegree edges. A stray backslash line continuation after
# `--output-hits False` was removed so every option line is formatted alike.
kgtk("""
    graph-statistics -i "$p279" -o $OUT/statistics.p279.tsv.gz 
    --compute-pagerank False 
    --compute-hits False 
    --page-rank-property Pdirected_pagerank 
    --vertex-in-degree-property Pindegree
    --vertex-out-degree-property Poutdegree
    --output-degrees True 
    --output-pagerank False 
    --output-hits False
    --output-statistics-only 
    --undirected False 
    --log-file $TEMP/statistics.summary.txt
""")

In [None]:
# Sort the statistics file by node2, numerically, in descending order.
# NOTE(review): no label filter is applied, so despite the output name the file
# may hold more than in-degree rows — confirm.
kgtk("sort -i $OUT/statistics.p279.tsv.gz --columns node2 --numeric --reverse -o $TEMP/p279.indegree.tsv.gz")

In [None]:
# Debug only: top 25 rows of the sorted statistics, with labels.
# A call nested inside `if` is not the cell's last expression, so Jupyter does
# not render its return value; display() shows it explicitly.
if debug:
    display(kgtk("head -i $TEMP/p279.indegree.tsv.gz -n 25 / add-labels"))

In [None]:
# List the statistics edges whose value exceeds 500, largest first.
# NOTE(review): the edge pattern has no label, so any statistic matches, not
# only Pindegree — confirm this is intended.
kgtk("""
    query -i $OUT/statistics.p279.tsv.gz 
        --match '(n1)-[eid]->(degree)' 
        --where 'cast(degree, int) > 500' 
        --order-by 'cast(degree, int) desc'
""")

### Create list of high and low `P279` degree classes 

In [None]:
# Classes with an in-degree below 500 are tagged `few_subclasses`.
# Together with the `> 499` filter of the high-degree query this partitions
# the classes at 500.
kgtk("""
    query -i $OUT/statistics.p279.tsv.gz 
        --match '(n1)-[:Pindegree]->(degree)' 
        --where 'cast(degree, int) < 500' 
        --return 'n1 as node1, "few_subclasses" as node_type'
        --order-by 'cast(degree, int) desc'
    -o $OUT/class-browsing.low-degree-nodes.tsv
""")

The `class-browsing.low-degree-nodes.tsv` is simply a list of nodes:

In [None]:
# Debug only: first 5 low-degree nodes, with labels.
# A call nested inside `if` is not the cell's last expression, so Jupyter does
# not render its return value; display() shows it explicitly.
if debug:
    display(kgtk("head -n 5 -i $OUT/class-browsing.low-degree-nodes.tsv / add-labels"))

In [None]:
# Classes with an in-degree of 500 or more (`> 499`) are tagged
# `many_subclasses`; this complements the `< 500` low-degree query.
kgtk("""
    query -i $OUT/statistics.p279.tsv.gz 
        --match '(n1)-[:Pindegree]->(degree)' 
        --where 'cast(degree, int) > 499'
        --return 'n1 as node1, "many_subclasses" as node_type'
        --order-by 'cast(degree, int) desc'
    -o $OUT/class-browsing.high-degree-nodes.tsv
""")

In [None]:
# Debug only: first 5 high-degree nodes, with labels.
# A call nested inside `if` is not the cell's last expression, so Jupyter does
# not render its return value; display() shows it explicitly.
if debug:
    display(kgtk("head -n 5 -i $OUT/class-browsing.high-degree-nodes.tsv / add-labels"))

In [None]:
# Concatenate the low- and high-degree node lists into one file holding every
# class together with its node_type.
kgtk("""
    cat --use-graph-cache-envar False --mode NONE -i $OUT/class-browsing.low-degree-nodes.tsv -i $OUT/class-browsing.high-degree-nodes.tsv
    -o $OUT/class-browsing.all-nodes.tsv
""")

In [None]:
# Debug only: first 4 rows of the combined node list.
# A call nested inside `if` is not the cell's last expression, so Jupyter does
# not render its return value; display() shows it explicitly.
if debug:
    display(kgtk("head -i $OUT/class-browsing.all-nodes.tsv -n 4"))

In [None]:
# Register the combined node list under the alias `browsernodes`, indexed on
# node1 and node_type; later queries refer to it by that alias.
!kgtk --debug query -i $OUT/class-browsing.all-nodes.tsv --as browsernodes --idx index:node1,node_type --limit 3

## Create a P279star file that we will use for visualization.



### First create a complete p279star file containing all classes

First create a complete P279star file that contains all classes as our starting point. We do this because in the browser, users can click on any class.

In [None]:
# Compute the nodes reachable over p279 from every class in classes.tsv.gz.
# Edges are labelled P279star, each root is linked to itself (--selflink), and
# a distance column is emitted (--show-distance).
kgtk("""
    reachable-nodes
        --rootfile $OUT/classes.tsv.gz
        --selflink 
        --breadth-first True
        --show-distance True
        --label P279star
        -i "$p279"
        -o $TEMP/derived.p279star.complete.tsv.gz
""")

In [None]:
# Debug only: first 10 rows of the complete P279star file.
# A call nested inside `if` is not the cell's last expression, so Jupyter does
# not render its return value; display() shows it explicitly.
if debug:
    display(kgtk("head -i $TEMP/derived.p279star.complete.tsv.gz -n 10"))

The complete p279star file has only a few more edges than the default one. We should replace the original one with the complete one in any case.

In [None]:
# Debug only: line count of the default p279star file, for comparison.
if debug:
    !zcat < "$p279star" | wc -l

In [None]:
# Debug only: line count of the complete P279star file.
if debug:
    !zcat < $TEMP/derived.p279star.complete.tsv.gz | wc -l

Add ids and index for use in queries. The new file has a distance column, which we index too so that we can do index queries quickly.

In [None]:
# Add wikidata-style edge ids and move the complete P279star file to $OUT.
kgtk("""
    add-id --id-style wikidata -i $TEMP/derived.p279star.complete.tsv.gz
    -o $OUT/derived.p279star.complete.tsv.gz
""")

In [None]:
# Register the complete P279star file under the alias `p279stard`, indexed on
# node2, node1 and distance.
!kgtk --debug query -i $OUT/derived.p279star.complete.tsv.gz --as p279stard --idx index:node2,node1,distance --limit 3

### Count the number of subclasses 
We eventually want to build the subclass graph for each class, but some may be too large

In [None]:
# For each class, count its distinct proper subclasses (the self links are
# excluded by `subclass != class`) and emit them as Pcount_subclasses edges,
# largest count first.
kgtk("""
    query -i p279stard
        --match '
            (subclass)-[]->(class)'
        --return 'class as node1, "Pcount_subclasses" as label, count(distinct subclass) as node2, class as graph'
        --where 'subclass != class'
        --order-by 'cast(node2, int) desc'
    -o $TEMP/subclass.count.tsv.gz
""")

Get an overview of the file. The top classes have an enormous number of subclasses, which will cause trouble for visualization.
Also, only 126K classes have subclasses, so there are a lot of leaf classes in Wikidata.

In the steps below we exclude the high degree classes, but that won't fix the problem as the top classes have too many subclasses anyway. Sigh. The browser will freeze and the user will be annoyed.

In [None]:
# Read the subclass counts with labels added and show the result as the cell
# output.
df = kgtk("""
    cat -i $TEMP/subclass.count.tsv.gz / add-labels
""")
df

### Create a subset of p279 that excludes high in-degree classes in node2

File `class-browsing.low-degree-nodes.tsv` lists the classes with a low number of subclasses, which we call the low degree nodes. Our low degree P279 file will have all P279 edges that arrive at a low degree class, i.e., where the superclass is a low degree class.

In [None]:
# Keep only the p279 edges whose superclass (node2) is listed in the
# low-degree node file; the `low:` handle refers to that file.
kgtk("""
    query -i p279 -i $OUT/class-browsing.low-degree-nodes.tsv
        --match '
            p279: (class)-[eid]->(superclass),
            low: (superclass)'
        --return 'class as node1, eid.label as label, superclass as node2, eid as id'
    -o $OUT/p279.lowdegree.tsv.gz
""")

In [None]:
# Debug only: line count of the full p279 file, for comparison.
if debug:
    !zcat < "$p279" | wc -l

The low degree P279 file has many fewer edges, which is expected as the high degree classes account for a lot of edges.

In [None]:
# Debug only: line count of the low-degree p279 file.
if debug:
    !zcat < $OUT/p279.lowdegree.tsv.gz | wc -l

### Recompute P279star with the low degree classes
The output will be `derived.p279star.low-degree.complete.tsv.gz`

We start at all classes, and find all superclasses for them, excluding the high degree classes.

In [None]:
# Same reachability computation as for the complete P279star file, but over
# the low-degree p279 edges only. Roots are still all classes.
kgtk("""
    reachable-nodes
        --rootfile $OUT/classes.tsv.gz
        --selflink 
        --breadth-first True
        --show-distance True
        --label P279star
        -i $OUT/p279.lowdegree.tsv.gz
        -o $TEMP/derived.p279star.low-degree.complete.tsv.gz
""")

Add ids

In [None]:
# Add wikidata-style edge ids and move the low-degree P279star file to $OUT.
kgtk("""
    add-id --id-style wikidata -i $TEMP/derived.p279star.low-degree.complete.tsv.gz
    -o $OUT/derived.p279star.low-degree.complete.tsv.gz
""")

Index using node1, node2 and distance. I wonder if we should also index the id column?

In [None]:
# Register the low-degree P279star file under the alias `p279starlow`, indexed
# on node2, node1 and distance.
!kgtk --debug query -i $OUT/derived.p279star.low-degree.complete.tsv.gz --as p279starlow --idx index:node2,node1,distance --limit 3

### Statistics to show in the graph

> We are not computing the statistics file in this notebook as it is computed in the `p1963` project. 
> We need the file here, so Pedro copied it from the `p1963` project and put it in the `$TEMP` folder

File is `statistics.Pinstance_count.tsv.gz`


In [None]:
# Debug only: first rows of the instance-count statistics file.
# A call nested inside `if` is not the cell's last expression, so Jupyter does
# not render its return value; display() shows it explicitly.
if debug:
    display(kgtk("head -i $GRAPH/statistics.Pinstance_count.tsv.gz"))

In [None]:
# Query the instance-count file with index mode `monograph`, showing at most
# 5 rows.
!kgtk --debug query -i $GRAPH/statistics.Pinstance_count.tsv.gz --idx mode:monograph --limit 5

## Compute the edge file that contains the graph we want to visualize for each class

The edge file contains `subclass / P279 / class` edges, but we add two columns to support the visualization:

- `graph`: the id of a class we want to visualize. This column allows us to quickly fetch all the edges to build the visualization of a class.
- `edge_type`: in the visualization we want to distinguish `subclass` and `superclass` edges so the viewer can easily distinguish subclasses and superclasses.

### Compute the subclass edges

For every class (the graph) we want to find all the P279 edges for subclasses of the given class. We use `class-browsing.low-degree-nodes.tsv` so that we don't include high degree classes that will blow up the browser.

In [None]:
# For each class (the `graph`), collect every direct P279 edge between two of
# its low-degree subclasses and tag it with edge_type "subclass".
# Plain string literal: the command has no placeholders to interpolate.
kgtk("""
    query -i p279starlow -i p279 -i $OUT/class-browsing.low-degree-nodes.tsv
        --match '
            p279starlow: (subclass1)-[]->(class),
            p279starlow: (subclass2)-[]->(class),
            low: (subclass1),
            low: (subclass2),
            p279: (subclass1)-[]->(subclass2)'
        --return 'distinct subclass1 as node1, "P279" as label, subclass2 as node2, class as graph, "subclass" as edge_type'
    -o $TEMP/all.graph.low.sub.tsv.gz
""")

In [None]:
# Debug only: line count of the subclass edge file.
if debug:
    !zcat < $TEMP/all.graph.low.sub.tsv.gz | wc -l

We have a lot of edges because we make copies for every graph, i.e., the same edge appears in many graphs. This is annoying, but it allows us to fetch the graphs very quickly, in less than 2 seconds.

In [None]:
# Debug only: first 5 subclass edges.
# A call nested inside `if` is not the cell's last expression, so Jupyter does
# not render its return value; display() shows it explicitly.
if debug:
    display(kgtk("head -n 5 -i $TEMP/all.graph.low.sub.tsv.gz"))

### Compute the superclass edges

The superclass edges are also P279 edges, but they sit above the given class. We don't need to filter to low degree classes because we are going up the P279 hierarchy.

In [None]:
# For each class (the `graph`), collect every direct P279 edge between two of
# its superclasses and tag it with edge_type "superclass".
# Plain string literal: the command has no placeholders to interpolate.
kgtk("""
    query -i p279stard -i p279
        --match '
            p279stard: (class)-[]->(superclass1),
            p279stard: (class)-[]->(superclass2),
            p279: (superclass1)-[]->(superclass2)'
        --return 'distinct superclass1 as node1, "P279" as label, superclass2 as node2, class as graph, "superclass" as edge_type'
    -o $TEMP/all.graph.low.super.tsv.gz
""")

In [None]:
# Debug only: line count of the superclass edge file.
if debug:
    !zcat < $TEMP/all.graph.low.super.tsv.gz | wc -l

In [None]:
# Debug only: first 5 superclass edges.
# A call nested inside `if` is not the cell's last expression, so Jupyter does
# not render its return value; display() shows it explicitly.
if debug:
    display(kgtk("head -n 5 -i $TEMP/all.graph.low.super.tsv.gz"))

### Concatenate the subclass and superclass files, and store in `$TEMP/graph.low.tsv.gz`

We keep the file in `$TEMP` because for the final file we want to add the high degree nodes so that the user sees that they exist (we will not add the subclasses). Once we have the complete file, we will put it in `$OUT`.

In [None]:
# Merge the subclass and superclass edge files into one low-degree graph file.
# Plain string literal: the command has no placeholders to interpolate.
kgtk("""
    cat --use-graph-cache-envar False -i $TEMP/all.graph.low.sub.tsv.gz -i $TEMP/all.graph.low.super.tsv.gz
    -o $TEMP/graph.low.tsv.gz
""")

Index the file to allow fast queries on all columns

In [None]:
# Register the low-degree graph file under the alias `graphbrowser`, indexed
# on node1, node2, graph and edge_type.
!kgtk --debug query -i $TEMP/graph.low.tsv.gz --as graphbrowser --idx index:node1,node2,graph,edge_type --limit 3

## Compute the node file for visualization

The node file for visualization needs the labels for the nodes, and the `graph` to pull it out quickly. We add:

- `instance_count`: the number of direct instances of the class, as it is interesting for the user to see this information.

### Extract the nodes from the edge file

The reason to use the edge file is that we need the `graph` id. We do it in two steps, first extract `node1` and then extract `node2`

In [None]:
# Build one node row per (node, graph) pair for every node that is the source
# (node1) of an edge in `graphbrowser`. node_type comes from `browsernodes`;
# label and instance count are optional, with a missing count reported as 0.
# The instance-count file is read from $GRAPH, the location the other cells
# that use this file read it from (this query previously pointed at $TEMP).
kgtk("""
    query -i label -i $GRAPH/statistics.Pinstance_count.tsv.gz -i graphbrowser -i browsernodes
        --match '
            graphbrowser: (c)-[{graph: graph}]->(),
            browsernodes: (c)-[{node_type: nt}]->()'
        --opt 'label: (c)-[]->(class_label)'
        --opt 'Pinstance_count: (c)-[:Pinstance_count]->(instance_count)'
        --return 'distinct c as node1, graph as graph, coalesce(instance_count,0) as instance_count, nt as node_type, class_label as label'
    -o $TEMP/graph.low.node1.tsv.gz
""")


This is what our node file looks like:

In [None]:
# Debug only: first 5 rows of the node1-derived node file.
# A call nested inside `if` is not the cell's last expression, so Jupyter does
# not render its return value; display() shows it explicitly.
if debug:
    display(kgtk("head -n 5 -i $TEMP/graph.low.node1.tsv.gz"))

In [None]:
# Build one node row per (node, graph) pair for every node that is the target
# (node2) of an edge in `graphbrowser`. node_type comes from `browsernodes`;
# label and instance count are optional, with a missing count reported as 0.
# The instance-count file is read from $GRAPH, the location the other cells
# that use this file read it from (this query previously pointed at $TEMP).
kgtk("""
    query -i label -i $GRAPH/statistics.Pinstance_count.tsv.gz -i graphbrowser -i browsernodes
        --match '
            graphbrowser: ()-[{graph: graph}]->(c),
            browsernodes: (c)-[{node_type: nt}]->()'
        --opt 'label: (c)-[]->(class_label)'
        --opt 'Pinstance_count: (c)-[:Pinstance_count]->(instance_count)'
        --return 'distinct c as node1, graph as graph, coalesce(instance_count,0) as instance_count, nt as node_type, class_label as label'
    -o $TEMP/graph.low.node2.tsv.gz
""")

### Concatenate the two node files, deduplicate and index

To-do: try presorting the files to see if compact will run faster; as it is, this command takes over 2.5 hours.

In [None]:
# Concatenate the node1- and node2-derived node files, then compact on the
# (node1, graph) key columns to remove duplicates.
kgtk("""
    cat --use-graph-cache-envar False --mode NONE -i $TEMP/graph.low.node1.tsv.gz -i $TEMP/graph.low.node2.tsv.gz
    / compact --mode NONE  --columns node1 graph
    -o $TEMP/graph.low.node.tsv.gz
""")

We only need to index on `graph` as we will not do node queries on it:

## Special handling of high degree nodes

In [None]:
# Debug only: first 5 high-degree nodes.
# A call nested inside `if` is not the cell's last expression, so Jupyter does
# not render its return value; display() shows it explicitly.
if debug:
    display(kgtk("head -n 5 -i $OUT/class-browsing.high-degree-nodes.tsv"))

### Make a graph file with the `P279` edges where the subclass is a high degree class

Do this only to add edges that connect to the subclasses of our target node, so `class` has to be in `$TEMP/all.graph.low.sub.tsv.gz`

In [None]:
# P279 edges from a high-degree subclass to a class that is the source
# (node1) of an edge in the low-degree subclass graph. Each edge inherits the
# `graph` of that low-degree edge. Handles: `low:` is all.graph.low.sub.tsv.gz,
# `high:` is the high-degree node file.
kgtk("""
    query --debug -i $OUT/class-browsing.high-degree-nodes.tsv -i p279 -i $TEMP/all.graph.low.sub.tsv.gz
        --match '
            low: (class)-[{graph: graph}]->(),
            high: (subclass),
            p279: (subclass)-[]->(class)'
        --where 'subclass != class'
        --return 'distinct subclass as node1, "P279" as label, class as node2, graph as graph, "subclass" as edge_type'
    -o $TEMP/graph.high1.tsv.gz
""")

In [None]:
# P279 edges from a high-degree subclass to a class that is the target
# (node2) of an edge in the low-degree subclass graph. Each edge inherits the
# `graph` of that low-degree edge.
kgtk("""
    query --debug -i $OUT/class-browsing.high-degree-nodes.tsv -i p279 -i $TEMP/all.graph.low.sub.tsv.gz
        --match '
            low: ()-[{graph: graph}]->(class),
            high: (subclass),
            p279: (subclass)-[]->(class)'
        --where 'subclass != class'
        --return 'distinct subclass as node1, "P279" as label, class as node2, graph as graph, "subclass" as edge_type'
    -o $TEMP/graph.high2.tsv.gz
""")

In [None]:
# Merge the two high-degree edge files into one.
# Plain string literal: the command has no placeholders to interpolate.
kgtk("""
    cat --use-graph-cache-envar False -i $TEMP/graph.high1.tsv.gz -i  $TEMP/graph.high2.tsv.gz
    -o $TEMP/graph.high.tsv.gz
""")

In [None]:
# Debug only: first 5 high-degree edges.
# A call nested inside `if` is not the cell's last expression, so Jupyter does
# not render its return value; display() shows it explicitly.
if debug:
    display(kgtk("head -n 5 -i $TEMP/graph.high.tsv.gz"))

### Make a node file with the high degree nodes

We use the edge file because we need to put the `graph` in the node file too.

In [None]:
# Node rows for the source nodes of the high-degree edge file, one per
# (node, graph) pair. node_type is the constant "many_subclasses"; label and
# instance count are optional, with a missing count reported as 0.
kgtk("""
    query -i label -i $GRAPH/statistics.Pinstance_count.tsv.gz -i $TEMP/graph.high.tsv.gz
        --match 'high: (c)-[{graph: graph}]->()'
        --opt 'label: (c)-[]->(class_label)'
        --opt 'Pinstance_count: (c)-[:Pinstance_count]->(instance_count)'
        --return 'distinct c as node1, graph as graph, coalesce(instance_count,0) as instance_count, "many_subclasses" as node_type, class_label as label'
    -o $TEMP/graph.high.node.tsv.gz
""")

In [None]:
# Debug only: first 5 high-degree node rows.
# A call nested inside `if` is not the cell's last expression, so Jupyter does
# not render its return value; display() shows it explicitly.
if debug:
    display(kgtk("head -n 5 -i $TEMP/graph.high.node.tsv.gz"))

Just to make sure, count the number of subclasses of one of our supposedly high degree nodes: it looks innocent with only one instance, but it does indeed have many subclasses.

In [None]:
# Debug only: number of distinct direct subclasses of Q10267817.
# A call nested inside `if` is not the cell's last expression, so Jupyter does
# not render its return value; display() shows it explicitly.
if debug:
    display(kgtk("query -i p279 --match '(subclass)-[]->(:Q10267817)' --return 'count(distinct subclass)'"))

In [None]:
# Debug only: number of distinct direct subclasses of Q30185.
# A call nested inside `if` is not the cell's last expression, so Jupyter does
# not render its return value; display() shows it explicitly.
if debug:
    display(kgtk("query -i p279 --match '(subclass)-[]->(:Q30185)' --return 'count(distinct subclass)'"))

### Augment the low degree edge and node files with the high degree info

Concatenating without deduplication is sufficient as the files cannot have duplicate edges or nodes.

In [None]:
# Final edge file: the high-degree edges followed by the low-degree graph edges.
kgtk("""
    cat --use-graph-cache-envar False -i $TEMP/graph.high.tsv.gz -i $TEMP/graph.low.tsv.gz
    -o $OUT/class-visualization.edge.tsv.gz
""")

In [None]:
# Debug only: first 5 rows of the final edge file.
# A call nested inside `if` is not the cell's last expression, so Jupyter does
# not render its return value; display() shows it explicitly.
if debug:
    display(kgtk("head -n 5 -i $OUT/class-visualization.edge.tsv.gz"))

Index the file for query using the `graph` column:

In [None]:
# Register the final edge file under the alias `classvizedge`, indexed on graph.
# NOTE(review): this runs only when debug is set, so the alias and index do not
# exist after a non-debug run — confirm that is intended.
if debug:
    !kgtk query -i $OUT/class-visualization.edge.tsv.gz --as classvizedge --idx index:graph --limit 3

Concatenate the node files:

In [None]:
# Concatenate the high- and low-degree node files. The result stays in $TEMP
# because the tooltip column is added in the next step.
kgtk("""
    cat --use-graph-cache-envar False --mode NONE -i $TEMP/graph.high.node.tsv.gz -i $TEMP/graph.low.node.tsv.gz
    -o $TEMP/class-visualization.node.tsv.gz
""")

Add a tooltip with meaningful information

In [None]:
# Final node file: the same columns plus a `tooltip` built with printf from the
# label text, the node id, the instance count (cast to int) and the node type,
# separated by <BR/> tags.
kgtk("""
    query -i $TEMP/class-visualization.node.tsv.gz
        --match '(node)-[{graph: g, instance_count: ic, node_type: nt, label: l}]->()'
        --return 'distinct
            node as node1, g as graph, ic as instance_count, nt as node_type, l as label,
            printf("%s (%s)<BR/>instance count: %s<BR/>node type: %s", kgtk_lqstring_text(l), node, cast(ic, int), nt) as tooltip'
    -o $OUT/class-visualization.node.tsv.gz
""")

In [None]:
# Debug only: first 5 rows of the final node file.
# A call nested inside `if` is not the cell's last expression, so Jupyter does
# not render its return value; display() shows it explicitly.
if debug:
    display(kgtk("head -n 5 -i $OUT/class-visualization.node.tsv.gz"))

In [None]:
# Debug only: number of edges per graph, written as `count` edges.
if debug:
    kgtk("""
        query -i $OUT/class-visualization.edge.tsv.gz
        --match 'edge:()-[label{graph:g}]->()'
        --return 'g as node1, "count" as label, COUNT(g) as node2'
        -o $OUT/class-visualization.edge.count.tsv.gz
    """)

In [None]:
# Debug only: number of "subclass" edges per graph, written as `count` edges.
if debug:
    kgtk("""
        query -i $OUT/class-visualization.edge.tsv.gz
        --match '()-[label{graph:g, edge_type:et}]->()'
        --where 'et = "subclass"'
        --return 'g as node1, "count" as label, COUNT(g) as node2'
        -o $OUT/class-visualization.edge.sub.count.tsv.gz
    """)

In [None]:
if debug:
    kgtk("""
        head -i $OUT/class-visualization.edge.superclass.tsv.gz
    """)

In [None]:
if debug:
    kgtk("""
        query -i $OUT/class-visualization.edge.tsv.gz
        --match '()-[label{graph:g, edge_type:et}]->()'
        --where 'et = "superclass"'
        --return 'g as node1, "count" as label, COUNT(g) as node2'
        -o $OUT/class-visualization.edge.super.count.tsv.gz
    """)

In [None]:
if debug:
    kgtk("""
        query -i $OUT/class-visualization.edge.tsv.gz
        --match 'edge:()-[label{edge_type:t}]->()'
        --where 't = "subclass"'
        -o $OUT/class-visualization.edge.subclass.tsv.gz
    """)

In [None]:
if debug:
    kgtk("""
        query -i $OUT/class-visualization.edge.tsv.gz
        --match 'edge:()-[label{edge_type:t}]->()'
        --where 't = "superclass"'
        -o $OUT/class-visualization.edge.superclass.tsv.gz
    """)

Index the file for query using the `graph` column:

In [None]:
# Register the final node file under the alias `classviznode`, indexed on graph.
# NOTE(review): this runs only when debug is set, so the alias and index do not
# exist after a non-debug run — confirm that is intended.
if debug:
    !kgtk query -i $OUT/class-visualization.node.tsv.gz --as classviznode --idx index:graph --limit 3

Temporary: we need this file for my current version of visualize because it needs labels in the edge file, the new version can have the labels in the node file

Test creation of the node file:

In [None]:
# Debug only: fetch the node rows of one example graph from `classviznode`.
if debug:
    # Example class to inspect. Other roots that were tried: Q11424, Q391342,
    # Q1107, Q889821, Q1549591, Q188724, Q946808. The dead assignment that was
    # immediately overwritten has been removed.
    root = "Q1420"
    # A call nested inside `if` is not the cell's last expression, so Jupyter
    # does not render its return value; display() shows it explicitly.
    display(kgtk(f"""
        query -i classviznode
            --match '(class)-[{{graph: "{root}", instance_count: instance_count, label: label}}]->()'
    """))

## Test creation of visualizations

In [None]:
# Debug only: export the edge and node files for a handful of example classes.
if debug:
    roots = [
        "Q11424",
        "Q391342",
        "Q1420",
        "Q1107",
        "Q889821",
        "Q1549591",
        "Q188724",
        "Q946808",
        "Q33999",
        "Q483501",
        "Q2221906",
        "Q144",
        "Q516021",
        "Q10494269"
    ]

    # NOTE(review): assumes the $TEMP/browser folder already exists — confirm.
    for root in roots:
        # Edges of the graph rooted at `root`. The edge file is registered
        # under the alias `classvizedge`; the previous alias
        # `classvizedgetest` is not defined anywhere in this notebook.
        kgtk(f"""
            query -i classvizedge
                --match '(class)-[{{label: property, graph: "{root}", edge_type: edge_type}}]->(superclass)'
            -o $TEMP/browser/{root}.graph.low.tsv
        """)

        # Nodes of the graph rooted at `root`.
        kgtk(f"""
            query -i classviznode
                --match '(class)-[{{graph: "{root}", instance_count: instance_count, label: label}}]->()'
            -o $TEMP/browser/{root}.node.graph.low.tsv
        """)

        # kgtk(f"""
        #     visualize-force-graph -i $TEMP/browser/{root}.graph.low.tsv
        #         --direction arrow
        #         -o $TEMP/browser/{root}.graph.low.html
        # """)

## Tests for individual files

In [None]:
# Debug only: run the same per-graph lookup against every intermediate and
# final file (or alias) that carries a `graph` column. This replaces ten
# copy-pasted cells; the second, identical query of graph.low.node.tsv.gz was
# dropped.
if debug:
    test_graph = "Q1420"
    test_inputs = [
        "$TEMP/graph.low.node.tsv.gz",
        "$TEMP/graph.high.node.tsv.gz",
        "$TEMP/class-visualization.node.tsv.gz",
        "classviznode",
        "graphbrowser",
        "$TEMP/graph.high.tsv.gz",
        "$TEMP/graph.low.tsv.gz",
        "$TEMP/all.graph.low.sub.tsv.gz",
        "$TEMP/all.graph.low.super.tsv.gz",
    ]

    for test_input in test_inputs:
        # Name the input so the outputs can be told apart.
        print(test_input)
        # display() is needed because a call nested inside a block is not
        # rendered automatically by Jupyter.
        display(kgtk(f"""
            query -i {test_input}
                --match '(node)-[{{graph: "{test_graph}"}}]->()'
                --order-by 'node'
        """))

### In progress: Trim the subclasses based on the levels

The idea is to also trim the graph based on the number of levels, this may be difficult as I think some small graphs may have lots of levels, and some graphs may become large with just a small number of levels.

This is our starting point:

In [None]:
# Debug only: first 5 rows of the complete P279star file.
# A call nested inside `if` is not the cell's last expression, so Jupyter does
# not render its return value; display() shows it explicitly.
if debug:
    display(kgtk("head -i $OUT/derived.p279star.complete.tsv.gz -n 5"))

Let's look at the distribution of distances

In [None]:
# Debug only: number of P279star edges per distance value, most frequent first.
# The complete P279star file is registered under the alias `p279stard`; the
# previous alias `p279starcomplete` is not defined anywhere in this notebook.
# display() is needed because a call nested inside `if` is not rendered
# automatically.
if debug:
    display(kgtk("""
        query -i p279stard
            --match '(class)-[eid {distance: d}]->(superclass)'
            --return 'distinct d as distance, count(eid) as count'
            --order-by 'cast(count, int) desc'
    """))

Filter the complete P279star file (indexed as `p279stard`) to keep only the subclasses with distance < K=10

In [None]:
# Debug only: count the distinct proper subclasses of each class, keeping only
# those at distance < 10.
# The distance is cast to int, as the other numeric comparisons in this
# notebook do, so the comparison is numeric. The bound is 10, matching the
# stated K=10 and the `d10` output name (it was `d < 9` without a cast).
if debug:
    kgtk("""
        query -i p279stard
            --match '(subclass)-[eid {distance: d}]->(class)'
            --return 'class as node1, "Pcount_subclasses" as label, count(distinct subclass) as node2'
            --where 'subclass != class and cast(d, int) < 10'
            --order-by 'cast(node2, int) desc'
        -o $TEMP/subclass.count.d10.tsv.gz
    """)

`kgtk add-labels` drives me crazy, as it takes sooooo long.

In [None]:
# Debug only: first 20 lines of the distance-limited subclass counts, with
# labels added and rendered as a table. Only the head is piped into add-labels.
if debug:
    !zcat < $TEMP/subclass.count.d10.tsv.gz | head -20 | kgtk add-labels / table