In [1]:
import pandas as pd
import os
import sys
from kgtk.configure_kgtk_notebooks import ConfigureKGTK
from kgtk.functions import kgtk, kypher
from kgtk.io.kgtkreader import KgtkReader
from kgtk.io.kgtkwriter import KgtkWriter
from pathlib import Path

In [2]:
# Parameters
# NOTE(review): the paths below are absolute and machine-specific; override
# them when running this notebook on another host.

# Directory with the input graph files (handed to configure_kgtk as input_graph_path).
input_path = "/data/amandeep/wikidata-20211027-dwd-v3"
# Directory under which the project output is created (handed to configure_kgtk).
output_path = "/data/amandeep/wikidata-20211027-dwd-v3"
# Location of the kgtk checkout (handed to ConfigureKGTK).
kgtk_path = "/Users/amandeep/Github/kgtk"

# Optional graph cache location; None is passed through to configure_kgtk as-is.
graph_cache_path = None

# Project name handed to configure_kgtk.
project_name = "browser-claims-file"
# Comma-separated file keys; split into a list in the next cell.
files = "claims"
# Debug flag handed to configure_kgtk.
debug = True

In [3]:
# Turn the comma-separated `files` parameter into a list of file keys.
# The isinstance guard makes the cell safe to re-run: once `files` is a list,
# calling .split on it again would raise AttributeError.
if isinstance(files, str):
    files = files.split(',')
# Drop surrounding whitespace and empty entries (e.g. from "a, b" or a trailing comma).
files = [key.strip() for key in files if key.strip()]

In [4]:
# Create the notebook configuration object for the requested file keys.
ck = ConfigureKGTK(files, kgtk_path=kgtk_path)

# Apply the values from the parameters cell; the environment variables this
# produces are listed by ck.print_env_variables() in the next cell.
ck.configure_kgtk(
    input_graph_path=input_path,
    output_path=output_path,
    project_name=project_name,
    graph_cache_path=graph_cache_path,
    debug=debug,
)

User home: /nas/home/amandeep
Current dir: /data/amandeep/github/kgtk-browser
KGTK dir: /Users/amandeep/Github/kgtk
Use-cases dir: /Users/amandeep/Github/kgtk/use-cases


In [5]:
# Print the environment variables set up through `ck` (see the output below).
# Later cells refer to several of them as $GRAPH, $TEMP and $OUT inside kgtk
# commands, and read TEMP via os.environ.
ck.print_env_variables()

GRAPH: /data/amandeep/wikidata-20211027-dwd-v3
KGTK_LABEL_FILE: /data/amandeep/wikidata-20211027-dwd-v3/labels.en.tsv.gz
kypher: kgtk --debug query --graph-cache /data/amandeep/wikidata-20211027-dwd-v3/browser-claims-file/temp.browser-claims-file/wikidata.sqlite3.db
STORE: /data/amandeep/wikidata-20211027-dwd-v3/browser-claims-file/temp.browser-claims-file/wikidata.sqlite3.db
KGTK_GRAPH_CACHE: /data/amandeep/wikidata-20211027-dwd-v3/browser-claims-file/temp.browser-claims-file/wikidata.sqlite3.db
TEMP: /data/amandeep/wikidata-20211027-dwd-v3/browser-claims-file/temp.browser-claims-file
OUT: /data/amandeep/wikidata-20211027-dwd-v3/browser-claims-file
USE_CASES_DIR: /Users/amandeep/Github/kgtk/use-cases
EXAMPLES_DIR: /Users/amandeep/Github/kgtk/examples
kgtk: kgtk --debug
KGTK_OPTION_DEBUG: false
claims: /data/amandeep/wikidata-20211027-dwd-v3/claims.tsv.gz


In [6]:
def separate_edges_and_qualifiers(input_file, output_edge_file, output_qualifier_file):
    """Split a KGTK file into plain edges and qualifier edges.

    The input is read twice. The first pass collects the `id` value of every
    row. The second pass routes each row: a row whose `node1` value is one of
    the collected ids is written to ``output_qualifier_file``; every other row
    is written to ``output_edge_file``. Both outputs use the input's column
    names and are written with a header.

    Args:
        input_file: path of the KGTK file to split.
        output_edge_file: path that receives the rows whose node1 is not a row id.
        output_qualifier_file: path that receives the rows whose node1 is a row id.

    Raises:
        ValueError: if the input has no `id` or no `node1` column.
    """
    source = Path(input_file)

    # Pass 1: gather every row id so that qualifier rows can be recognised.
    id_reader: KgtkReader = KgtkReader.open(source, error_file=sys.stderr)
    try:
        id_idx = id_reader.id_column_idx
        # NOTE(review): assumes KgtkReader reports a missing column as a negative
        # index; without this check row[-1] would silently read the last column.
        if id_idx < 0:
            raise ValueError(f"{input_file}: no id column, cannot identify qualifier rows")
        edge_ids = {row[id_idx] for row in id_reader}
    finally:
        id_reader.close()

    # Pass 2: re-read the input and route each row to one of the two outputs.
    kr: KgtkReader = KgtkReader.open(source, error_file=sys.stderr)
    kw_edge = None
    kw_qualifier = None
    try:
        node1_idx = kr.node1_column_idx
        if node1_idx < 0:
            raise ValueError(f"{input_file}: no node1 column, cannot identify qualifier rows")

        kw_edge = KgtkWriter.open(file_path=Path(output_edge_file),
                                  error_file=sys.stderr,
                                  column_names=kr.column_names,
                                  mode=KgtkWriter.Mode.EDGE,
                                  no_header=False)
        kw_qualifier = KgtkWriter.open(file_path=Path(output_qualifier_file),
                                       error_file=sys.stderr,
                                       column_names=kr.column_names,
                                       mode=KgtkWriter.Mode.EDGE,
                                       no_header=False)

        for row in kr:
            # A row that hangs off another row's id is a qualifier.
            if row[node1_idx] in edge_ids:
                kw_qualifier.write(row)
            else:
                kw_edge.write(row)
    finally:
        # Release whatever was opened, even when routing failed part-way.
        for handle in (kr, kw_edge, kw_qualifier):
            if handle is not None:
                handle.close()

In [7]:
!curl https://raw.githubusercontent.com/usc-isi-i2/kgtk/dev/kgtk-properties/kgtk.properties.tsv -o $TEMP/kgtk.properties.tsv

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 11367  100 11367    0     0  26973      0 --:--:-- --:--:-- --:--:-- 26936


In [8]:
# Run `kgtk filter` with pattern ';label;' over the downloaded
# $TEMP/kgtk.properties.tsv and write the result to
# $TEMP/kgtk.properties.labels.tsv (later combined with $GRAPH/labels.en.tsv.gz).
# NOTE(review): presumably the pattern keeps the rows whose label column is
# `label` — confirm against the kgtk filter documentation.
kgtk("""filter -i $TEMP/kgtk.properties.tsv
        -p ';label;'
        -o $TEMP/kgtk.properties.labels.tsv
""")

In [9]:
# Run `kgtk filter` with pattern ';alias;' over the downloaded
# $TEMP/kgtk.properties.tsv and write the result to
# $TEMP/kgtk.properties.aliases.tsv (later combined with $GRAPH/aliases.en.tsv.gz).
# NOTE(review): presumably the pattern keeps the rows whose label column is
# `alias` — confirm against the kgtk filter documentation.
kgtk("""filter -i $TEMP/kgtk.properties.tsv
        -p ';alias;'
        -o $TEMP/kgtk.properties.aliases.tsv
""")

In [10]:
# Run `kgtk filter` with pattern ';description;' over the downloaded
# $TEMP/kgtk.properties.tsv and write the result to
# $TEMP/kgtk.properties.descriptions.tsv (later combined with
# $GRAPH/descriptions.en.tsv.gz).
# NOTE(review): presumably the pattern keeps the rows whose label column is
# `description` — confirm against the kgtk filter documentation.
kgtk("""filter -i $TEMP/kgtk.properties.tsv
        -p ';description;'
        -o $TEMP/kgtk.properties.descriptions.tsv
""")

In [11]:
# Run `kgtk filter` with pattern ';datatype;' over the downloaded
# $TEMP/kgtk.properties.tsv and write the result to
# $TEMP/kgtk.properties.datatypes.tsv (later combined with
# $GRAPH/metadata.property.datatypes.tsv.gz).
# NOTE(review): presumably the pattern keeps the rows whose label column is
# `datatype` — confirm against the kgtk filter documentation.
kgtk("""filter -i $TEMP/kgtk.properties.tsv
        -p ';datatype;'
        -o $TEMP/kgtk.properties.datatypes.tsv
""")

In [12]:
# Run `kgtk filter` with --invert and a pattern naming label, alias,
# description and datatype, writing the result to $TEMP/kgtk.properties.claims.tsv
# (one of the inputs to the `cat` in the next cell).
# NOTE(review): presumably --invert keeps exactly the rows that the four
# preceding filter cells did not select — confirm against the kgtk filter docs.
kgtk("""filter -i $TEMP/kgtk.properties.tsv
        -p ';label,alias,description,datatype;'
        --invert
        -o $TEMP/kgtk.properties.claims.tsv
""")

In [13]:
# Combine six statistics/derived files from $GRAPH with the property claims
# extracted above into $TEMP/derived.claims.augmented.tsv.gz. That file is the
# input of separate_edges_and_qualifiers in the next cell.
kgtk("""cat 
    -i $GRAPH/statistics.Pinstance_count.star.tsv.gz
    -i $GRAPH/statistics.Pinstance_count.tsv.gz
    -i $GRAPH/statistics.Psubclass_count.star.tsv.gz
    -i $GRAPH/derived.class.P1963computed.count.tsv.gz
    -i $GRAPH/derived.P1963computed.subclass.count.star.tsv.gz
    -i $GRAPH/derived.Pproperty_domain.tsv.gz
    -i $TEMP/kgtk.properties.claims.tsv
    -o $TEMP/derived.claims.augmented.tsv.gz
""")

In [14]:
# Split the augmented claims into an edge file and a qualifier file in $TEMP;
# both are merged into the final outputs by the cells further down.
separate_edges_and_qualifiers(
    input_file=f"{os.environ['TEMP']}/derived.claims.augmented.tsv.gz",
    output_edge_file=f"{os.environ['TEMP']}/derived.claims.edges.tsv.gz",
    output_qualifier_file=f"{os.environ['TEMP']}/derived.claims.qualifiers.tsv.gz",
)

In [15]:
# Combine $GRAPH/labels.en.tsv.gz with the labels extracted from
# kgtk.properties.tsv, sort the result, and write it to $OUT/labels.en.tsv.gz.
# The sort's temporary directory is filled in by the f-string from os.environ['TEMP'].
# NOTE(review): `--parallel 24 --buffer-size 30%` are hard-coded tuning values,
# presumably forwarded to the underlying sort via --extra — confirm they suit the host.
kgtk(f"""cat 
        -i $GRAPH/labels.en.tsv.gz
        -i $TEMP/kgtk.properties.labels.tsv
        / sort
         --extra '--parallel 24 --buffer-size 30% --temporary-directory {os.environ['TEMP']}'
        -o $OUT/labels.en.tsv.gz
""")

In [16]:
# Combine $GRAPH/aliases.en.tsv.gz with the aliases extracted from
# kgtk.properties.tsv, sort the result, and write it to $OUT/aliases.en.tsv.gz.
# The sort's temporary directory is filled in by the f-string from os.environ['TEMP'].
# NOTE(review): `--parallel 24 --buffer-size 30%` are hard-coded tuning values,
# presumably forwarded to the underlying sort via --extra — confirm they suit the host.
kgtk(f"""cat 
        -i $GRAPH/aliases.en.tsv.gz
        -i $TEMP/kgtk.properties.aliases.tsv
        / sort
         --extra '--parallel 24 --buffer-size 30% --temporary-directory {os.environ['TEMP']}'
        -o $OUT/aliases.en.tsv.gz
""")

In [17]:
# Combine $GRAPH/descriptions.en.tsv.gz with the descriptions extracted from
# kgtk.properties.tsv, sort the result, and write it to $OUT/descriptions.en.tsv.gz.
# The sort's temporary directory is filled in by the f-string from os.environ['TEMP'].
# NOTE(review): `--parallel 24 --buffer-size 30%` are hard-coded tuning values,
# presumably forwarded to the underlying sort via --extra — confirm they suit the host.
kgtk(f"""cat 
        -i $GRAPH/descriptions.en.tsv.gz
        -i $TEMP/kgtk.properties.descriptions.tsv
        / sort
         --extra '--parallel 24 --buffer-size 30% --temporary-directory {os.environ['TEMP']}'
        -o $OUT/descriptions.en.tsv.gz
""")

In [18]:
# Combine $GRAPH/metadata.property.datatypes.tsv.gz with the datatypes extracted
# from kgtk.properties.tsv, sort the result, and write it to
# $OUT/metadata.property.datatypes.tsv.gz.
# The sort's temporary directory is filled in by the f-string from os.environ['TEMP'].
# NOTE(review): `--parallel 24 --buffer-size 30%` are hard-coded tuning values,
# presumably forwarded to the underlying sort via --extra — confirm they suit the host.
kgtk(f"""cat 
        -i $GRAPH/metadata.property.datatypes.tsv.gz
        -i $TEMP/kgtk.properties.datatypes.tsv
        / sort
         --extra '--parallel 24 --buffer-size 30% --temporary-directory {os.environ['TEMP']}'
        -o $OUT/metadata.property.datatypes.tsv.gz
""")

In [19]:
# Combine the qualifier rows produced by separate_edges_and_qualifiers with
# $GRAPH/qualifiers.tsv.gz, sort the result, and write it to $OUT/qualifiers.tsv.gz.
# The sort's temporary directory is filled in by the f-string from os.environ['TEMP'].
# NOTE(review): `--parallel 24 --buffer-size 30%` are hard-coded tuning values,
# presumably forwarded to the underlying sort via --extra — confirm they suit the host.
kgtk(f"""cat
        -i $TEMP/derived.claims.qualifiers.tsv.gz
        -i $GRAPH/qualifiers.tsv.gz
        / sort 
         --extra '--parallel 24 --buffer-size 30% --temporary-directory {os.environ['TEMP']}'
        -o $OUT/qualifiers.tsv.gz
""")

In [20]:
# Combine the edge rows produced by separate_edges_and_qualifiers with
# $GRAPH/claims.tsv.gz and $GRAPH/downloaded.wikipedia.short_abstracts.tsv.gz,
# sort the result, and write it to $OUT/claims.tsv.gz.
# The sort's temporary directory is filled in by the f-string from os.environ['TEMP'].
# NOTE(review): `--parallel 24 --buffer-size 30%` are hard-coded tuning values,
# presumably forwarded to the underlying sort via --extra — confirm they suit the host.
kgtk(f"""cat
        -i $TEMP/derived.claims.edges.tsv.gz
        -i $GRAPH/claims.tsv.gz
        -i $GRAPH/downloaded.wikipedia.short_abstracts.tsv.gz
        / sort 
         --extra '--parallel 24 --buffer-size 30% --temporary-directory {os.environ['TEMP']}'
        -o $OUT/claims.tsv.gz
""")