# Import Wikidata

This notebook assumes the file `latest-all.json.bz2` has already been [downloaded](https://dumps.wikimedia.org/wikidatawiki/entities/) and stored in the folder given by `input_path` in the cell marked `# Parameters`.

You can also download the `gz` version of the dump; in that case, update the variable `wikidata_json_file` with the correct file name.

In [3]:
import os

from kgtk.configure_kgtk_notebooks import ConfigureKGTK
from kgtk.functions import kgtk, kypher

In [4]:
# Parameters

# Folder that already contains the downloaded Wikidata JSON dump
# (passed to ConfigureKGTK as `input_graph_path`).
input_path = "/data/amandeep/wikidata-20220505"
# Folder under which the output and temporary folders are created.
output_path = "/data/amandeep/wikidata-20220505"
# Project name passed to ConfigureKGTK.
project_name = "import-wikidata"

# KGTK directory, passed to ConfigureKGTK as `kgtk_path`.
kgtk_path = "/data/amandeep/Github/kgtk"
# File name of the Wikidata dump; it is joined with the GRAPH folder to build WIKIDATA_ALL_JSON.
wikidata_json_file = "latest-all.json.bz2"
# Sort executable exported as SORT_COMMAND and handed to `kgtk sort --sort-command`.
# Alternative: sort_command = 'gsort'
sort_command = 'sort'

In [5]:
# Empty file list handed to ConfigureKGTK.
files = []

# Configure the KGTK notebook environment from the values of the Parameters cell.
ck = ConfigureKGTK(files, kgtk_path=kgtk_path)
ck.configure_kgtk(input_graph_path=input_path,
                  output_path=output_path,
                  project_name=project_name)

User home: /nas/home/amandeep
Current dir: /data/amandeep/Github/kgtk/use-cases
KGTK dir: /data/amandeep/Github/kgtk
Use-cases dir: /data/amandeep/Github/kgtk/use-cases


In [6]:
# Show the environment variables (OUT, TEMP, GRAPH, ...) that the shell cells below refer to.
ck.print_env_variables()

OUT: /data/amandeep/wikidata-20220505/import-wikidata
KGTK_GRAPH_CACHE: /data/amandeep/wikidata-20220505/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db
kypher: kgtk query --graph-cache /data/amandeep/wikidata-20220505/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db
EXAMPLES_DIR: /data/amandeep/Github/kgtk/examples
KGTK_OPTION_DEBUG: false
USE_CASES_DIR: /data/amandeep/Github/kgtk/use-cases
STORE: /data/amandeep/wikidata-20220505/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db
kgtk: kgtk
GRAPH: /data/amandeep/wikidata-20220505
TEMP: /data/amandeep/wikidata-20220505/import-wikidata/temp.import-wikidata
KGTK_LABEL_FILE: /data/amandeep/wikidata-20220505/labels.en.tsv.gz


## Define some environment variables (users can simply run this step; no changes are required)

In [7]:
# Full path of the Wikidata JSON dump inside the GRAPH folder.
os.environ['WIKIDATA_ALL_JSON'] = "{}/{}".format(os.environ['GRAPH'], wikidata_json_file)

# Constant settings read by the shell cells below.
os.environ.update({
    # Work file extensions.
    'UNSORTED_KGTK': "unsorted.tsv.gz",
    'SORTED_KGTK': "tsv.gz",
    # Value passed to --use-mgzip in the kgtk commands.
    'USE_MGZIP': "TRUE",
    # Compression program handed to `kgtk sort --gzip-command`.
    'GZIP_CMD': "pigz",
    # Common flags; a more verbose alternative for KGTK_FLAGS is
    # "--debug --timing --progress --progress-tty `tty`".
    'KGTK_FLAGS': "--debug --timing",
    'VERBOSE': "--verbose",
})

# Extra options forwarded to the sort program; its temporary files go to TEMP.
os.environ['SORT_EXTRAS'] = "--parallel 6 --buffer-size 50% -T " + os.environ['TEMP']

# The Wikidata datatypes ("other" names the catch-all claims/qualifiers split).
WIKIDATATYPES = [
    "commonsMedia",
    "external-id",
    "geo-shape",
    "globe-coordinate",
    "math",
    "monolingualtext",
    "musical-notation",
    "quantity",
    "string",
    "tabular-data",
    "time",
    "url",
    "wikibase-form",
    "wikibase-item",
    "wikibase-lexeme",
    "wikibase-property",
    "wikibase-sense",
    "other",
]

# Base names of the import split files that the sort step iterates over.
WIKIDATA_IMPORT_SPLIT_FILES = [
    "claims",
    "claims.badvalue",
    "claims.novalue",
    "claims.somevalue",
    "qualifiers",
    "qualifiers.badvalue",
    "qualifiers.badvalueClaims",
    "qualifiers.novalue",
    "qualifiers.novalueClaims",
    "qualifiers.somevalue",
    "qualifiers.somevalueClaims",
    "aliases",
    "aliases.en",
    "descriptions",
    "descriptions.en",
    "labels",
    "labels.en",
    "sitelinks",
    "sitelinks.en",
    "sitelinks.en.qualifiers",
    "sitelinks.qualifiers",
    "metadata.node",
    "metadata.property.datatypes",
    "metadata.types",
]

# Export the sort executable chosen in the Parameters cell.
os.environ['SORT_COMMAND'] = sort_command

## Run the `import-wikidata` command

**NOTE**:
As written, this command sets `--all-languages True`, so labels/aliases/descriptions are imported in all languages; the English ones are additionally written to the separate `*.en` files.

If you wish to import only English labels/aliases/descriptions, simply set `--all-languages False` and keep `--lang en`.

In [9]:
# Run `kgtk import-wikidata` on the JSON dump (${WIKIDATA_ALL_JSON}) and write the
# unsorted node, claim, qualifier, alias, description, label, datatype, type and
# sitelink files into ${TEMP}. The console output is also saved with `tee` to
# ${TEMP}/import-split-wikidata.log.
# Note: `--all-languages True` is set below.
!kgtk ${KGTK_FLAGS} \
     import-wikidata \
     -i ${WIKIDATA_ALL_JSON} \
     --node-file ${TEMP}/metadata.node.${UNSORTED_KGTK} \
     --minimal-edge-file ${TEMP}/claims.raw.${UNSORTED_KGTK} \
     --minimal-qual-file ${TEMP}/qualifiers.raw.${UNSORTED_KGTK} \
     --invalid-edge-file ${TEMP}/claims.badvalue.${UNSORTED_KGTK} \
     --invalid-qual-file ${TEMP}/qualifiers.badvalue.${UNSORTED_KGTK} \
     --node-file-id-only \
     --explode-values False \
     --all-languages True \
     --lang en \
     --alias-edges True \
     --split-alias-file ${TEMP}/aliases.${UNSORTED_KGTK} \
     --split-en-alias-file ${TEMP}/aliases.en.${UNSORTED_KGTK} \
     --description-edges True \
     --split-description-file ${TEMP}/descriptions.${UNSORTED_KGTK} \
     --split-en-description-file ${TEMP}/descriptions.en.${UNSORTED_KGTK} \
     --label-edges True \
     --split-label-file ${TEMP}/labels.${UNSORTED_KGTK} \
     --split-en-label-file ${TEMP}/labels.en.${UNSORTED_KGTK} \
     --datatype-edges True \
     --split-datatype-file ${TEMP}/metadata.property.datatypes.${UNSORTED_KGTK} \
     --entry-type-edges True \
     --split-type-file ${TEMP}/metadata.types.${UNSORTED_KGTK} \
     --sitelink-edges True \
     --sitelink-verbose-edges True \
     --split-sitelink-file ${TEMP}/sitelinks.raw.${UNSORTED_KGTK} \
     --split-en-sitelink-file ${TEMP}/sitelinks.en.raw.${UNSORTED_KGTK} \
     --value-hash-width 6 \
     --claim-id-hash-width 8 \
     --use-kgtkwriter True \
     --use-mgzip-for-input False \
     --use-mgzip-for-output False \
     --use-shm True \
     --procs 12 \
     --mapper-batch-size 5 \
     --max-size-per-mapper-queue 3 \
     --single-mapper-queue True \
     --collect-results True \
     --collect-seperately True\
     --collector-batch-size 5 \
     --collector-queue-per-proc-size 3 \
     --progress-interval 500000 \
     --clean \
     --allow-end-of-day False \
     --repair-month-or-day-zero \
     --minimum-valid-year 1 \
     --maximum-valid-year 9999 \
     --validate-fromisoformat \
     --repair-lax-coordinates \
     --allow-language-suffixes \
     --allow-wikidata-lq-strings \
    | tee ${TEMP}/import-split-wikidata.log


kgtk import-wikidata version: 2021-11-17T01:38:17.437678+00:00#9z/aARcXhiV2hPdyVXjAREcpZwh2MawWFp6numz8GZBCtAg2WypLYAFpHjP43k97Zj8VHVaoel0oEit9KHXH0w==
Starting main process (pid 118098).
Processing.
Processing wikidata file /data/amandeep/wikidata-20220505/latest-all.json.bz2
Decompressing (bz2)
Creating the collector queue.
The collector node queue has been created (maxsize=36).
Creating the node_collector.
Creating the node collector process.
Starting the node collector process.
Started the node collector process.
The node collector is starting (pid 118140).
The collector edge queue has been created (maxsize=36).
Creating the edge_collector.
Creating the edge collector process.
Starting the edge collector process.
Started the edge collector process.
The edge collector is starting (pid 118141).
The collector qual queue has been created (maxsize=36).
Creating the qual_collector.
Creating the qual collector process.
Starting the qual collector process.
Started the qual collector proces

## Split `somevalue` and `novalue` from `claims.raw.unsorted.tsv.gz`

In [10]:
# Split the raw claims: edges matching the `;; novalue` and `;; somevalue` patterns go
# to their own files (--first-match-only, so each edge lands in at most one of them);
# all remaining edges are written to ${TEMP}/claims.${UNSORTED_KGTK} via --reject-file.
!kgtk ${KGTK_FLAGS} \
     filter ${VERBOSE} --use-mgzip ${USE_MGZIP} \
     --input-file ${TEMP}/claims.raw.${UNSORTED_KGTK} \
     --first-match-only \
     --pattern ";; novalue"  -o ${TEMP}/claims.novalue.${UNSORTED_KGTK} \
     --pattern ";; somevalue"  -o ${TEMP}/claims.somevalue.${UNSORTED_KGTK} \
     --reject-file ${TEMP}/claims.${UNSORTED_KGTK} \
    | tee ${TEMP}/split-claims-missing-values.log

kgtk filter version: 2021-09-24T02:35:27.840163+00:00#gysblgql6Q7482L14Zozt/ne8Owd497FJa7MVp92+UbmixJKElkfg/GY5UmGBsog86NPtmYy+dXWa6PRMIyuIw==
Opening the input file: /data/amandeep/wikidata-20220505/import-wikidata/temp/claims.raw.unsorted.tsv.gz
input format: kgtk
Using KGTK_GRAPH_CACHE='/data/amandeep/wikidata-20220505/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db'
Graph cache '/data/amandeep/wikidata-20220505/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db': file '/data/amandeep/wikidata-20220505/import-wikidata/temp/claims.raw.unsorted.tsv.gz' not found in the cache.
KgtkReader: OK to use the fast read path.
KgtkReader: File_path.suffix: .gz
KgtkReader: reading mgzip with 3 threads: /data/amandeep/wikidata-20220505/import-wikidata/temp/claims.raw.unsorted.tsv.gz
header: id	node1	label	node2	rank	node2;wikidatatype
node1 column found, this is a KGTK edge file
KgtkReader: is_edge_file=True is_node_file=False
KgtkReader: Special columns: node1=1 label=2 node2=3 id

## Split `somevalue` and `novalue` from `qualifiers.raw.unsorted.tsv.gz`

In [11]:
# Split the raw qualifiers in one kgtk pipeline (stages are chained with `/`):
#   1. filter: qualifiers matching `;; novalue` / `;; somevalue` go to their own files;
#      the rejects are passed on (`--reject-file -`).
#   2-4. ifexists: qualifiers whose node1 matches an `id` in claims.novalue,
#      claims.somevalue and claims.badvalue are written to qualifiers.novalueClaims,
#      qualifiers.somevalueClaims and qualifiers.badvalueClaims respectively.
# Whatever is rejected by the last stage ends up in ${TEMP}/qualifiers.${UNSORTED_KGTK}.
!kgtk ${KGTK_FLAGS} \
     filter ${VERBOSE} --use-mgzip ${USE_MGZIP} \
     --input-file ${TEMP}/qualifiers.raw.${UNSORTED_KGTK} \
     --first-match-only \
     --pattern ";; novalue"  -o ${TEMP}/qualifiers.novalue.${UNSORTED_KGTK} \
     --pattern ";; somevalue"  -o ${TEMP}/qualifiers.somevalue.${UNSORTED_KGTK} \
     --reject-file - \
     / ifexists ${VERBOSE} \
     --input-keys node1 \
     --filter-file ${TEMP}/claims.novalue.${UNSORTED_KGTK} \
     --filter-keys id \
     --output-file ${TEMP}/qualifiers.novalueClaims.${UNSORTED_KGTK} \
     --reject-file - \
     / ifexists ${VERBOSE} \
     --input-keys node1 \
     --filter-file ${TEMP}/claims.somevalue.${UNSORTED_KGTK} \
     --filter-keys id \
     --output-file ${TEMP}/qualifiers.somevalueClaims.${UNSORTED_KGTK} \
     --reject-file - \
     / ifexists ${VERBOSE} \
     --input-keys node1 \
     --filter-file ${TEMP}/claims.badvalue.${UNSORTED_KGTK} \
     --filter-keys id \
     --output-file ${TEMP}/qualifiers.badvalueClaims.${UNSORTED_KGTK} \
     --reject-file ${TEMP}/qualifiers.${UNSORTED_KGTK} \
    | tee ${TEMP}/split-qualifiers-missing-values.log

KgtkIfEfexists version: 2020-12-03T17:23:24.872146+00:00#U5P2iPrj3w+Az10+UMbGGMcK/SHBl0wuwe3R1sFky9gXILt9e5oSjHFhPMQEWYVnQtoPd7FUqsZZqR3PfFWaAg==
Opening the input file: -
input format: kgtk
Using KGTK_GRAPH_CACHE='/data/amandeep/wikidata-20220505/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db'
kgtk filter version: 2021-09-24T02:35:27.840163+00:00#gysblgql6Q7482L14Zozt/ne8Owd497FJa7MVp92+UbmixJKElkfg/GY5UmGBsog86NPtmYy+dXWa6PRMIyuIw==
Opening the input file: /data/amandeep/wikidata-20220505/import-wikidata/temp/qualifiers.raw.unsorted.tsv.gz
input format: kgtk
Using KGTK_GRAPH_CACHE='/data/amandeep/wikidata-20220505/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db'
KgtkIfEfexists version: 2020-12-03T17:23:24.872146+00:00#U5P2iPrj3w+Az10+UMbGGMcK/SHBl0wuwe3R1sFky9gXILt9e5oSjHFhPMQEWYVnQtoPd7FUqsZZqR3PfFWaAg==
Opening the input file: -
input format: kgtk
Using KGTK_GRAPH_CACHE='/data/amandeep/wikidata-20220505/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db'
Gr

## Split `sitelinks.raw.unsorted.tsv.gz`

In [12]:
# Split the raw sitelink file: edges matching the sitelink-badge / sitelink-language /
# sitelink-site / sitelink-title pattern go to sitelinks.qualifiers, all other edges
# are written to ${TEMP}/sitelinks.${UNSORTED_KGTK} via --reject-file.
!kgtk ${KGTK_FLAGS} \
     filter ${VERBOSE} --use-mgzip=${USE_MGZIP} \
     --input-file ${TEMP}/sitelinks.raw.${UNSORTED_KGTK} \
     --pattern "; sitelink-badge,sitelink-language,sitelink-site,sitelink-title ;" \
     --output-file ${TEMP}/sitelinks.qualifiers.${UNSORTED_KGTK} \
     --reject-file ${TEMP}/sitelinks.${UNSORTED_KGTK} \
    | tee ${TEMP}/split-sitelink-qualifiers.log

kgtk filter version: 2021-09-24T02:35:27.840163+00:00#gysblgql6Q7482L14Zozt/ne8Owd497FJa7MVp92+UbmixJKElkfg/GY5UmGBsog86NPtmYy+dXWa6PRMIyuIw==
Opening the input file: /data/amandeep/wikidata-20220505/import-wikidata/temp/sitelinks.raw.unsorted.tsv.gz
input format: kgtk
Using KGTK_GRAPH_CACHE='/data/amandeep/wikidata-20220505/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db'
Graph cache '/data/amandeep/wikidata-20220505/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db': file '/data/amandeep/wikidata-20220505/import-wikidata/temp/sitelinks.raw.unsorted.tsv.gz' not found in the cache.
KgtkReader: OK to use the fast read path.
KgtkReader: File_path.suffix: .gz
KgtkReader: reading mgzip with 3 threads: /data/amandeep/wikidata-20220505/import-wikidata/temp/sitelinks.raw.unsorted.tsv.gz
header: id	node1	label	node2	lang
node1 column found, this is a KGTK edge file
KgtkReader: is_edge_file=True is_node_file=False
KgtkReader: Special columns: node1=1 label=2 node2=3 id=0
KgtkRea

## Split `sitelinks.en.raw.unsorted.tsv.gz`

In [13]:
# Same split as for sitelinks.raw, applied to the English sitelink file: matching edges
# go to sitelinks.en.qualifiers, the rest to ${TEMP}/sitelinks.en.${UNSORTED_KGTK}.
!kgtk ${KGTK_FLAGS} \
     filter ${VERBOSE} --use-mgzip=${USE_MGZIP} \
     --input-file ${TEMP}/sitelinks.en.raw.${UNSORTED_KGTK} \
     --pattern "; sitelink-badge,sitelink-language,sitelink-site,sitelink-title ;" \
     --output-file ${TEMP}/sitelinks.en.qualifiers.${UNSORTED_KGTK} \
     --reject-file ${TEMP}/sitelinks.en.${UNSORTED_KGTK} \
    | tee ${TEMP}/split-sitelink-en-qualifiers.log

kgtk filter version: 2021-09-24T02:35:27.840163+00:00#gysblgql6Q7482L14Zozt/ne8Owd497FJa7MVp92+UbmixJKElkfg/GY5UmGBsog86NPtmYy+dXWa6PRMIyuIw==
Opening the input file: /data/amandeep/wikidata-20220505/import-wikidata/temp/sitelinks.en.raw.unsorted.tsv.gz
input format: kgtk
Using KGTK_GRAPH_CACHE='/data/amandeep/wikidata-20220505/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db'
Graph cache '/data/amandeep/wikidata-20220505/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db': file '/data/amandeep/wikidata-20220505/import-wikidata/temp/sitelinks.en.raw.unsorted.tsv.gz' not found in the cache.
KgtkReader: OK to use the fast read path.
KgtkReader: File_path.suffix: .gz
KgtkReader: reading mgzip with 3 threads: /data/amandeep/wikidata-20220505/import-wikidata/temp/sitelinks.en.raw.unsorted.tsv.gz
header: id	node1	label	node2
node1 column found, this is a KGTK edge file
KgtkReader: is_edge_file=True is_node_file=False
KgtkReader: Special columns: node1=1 label=2 node2=3 id=0
Kgt

## Sort the files from `TEMP` to `OUT` folder

In [14]:
for TARGET in WIKIDATA_IMPORT_SPLIT_FILES:
    print(f"Sort the {TARGET} file.")
    input_file = f"{os.environ['TEMP']}/{TARGET}.{os.environ['UNSORTED_KGTK']}"
    output_file = f"{os.environ['OUT']}/{TARGET}.{os.environ['SORTED_KGTK']}"
    logfile = f"{os.environ['TEMP']}/{TARGET}-sorted.log"
    sort_command = f"""kgtk {os.environ['KGTK_FLAGS']} \
    sort {os.environ['VERBOSE']} \
    --input-file  {input_file} \
    --output-file {output_file} \
    --gzip-command {os.environ['GZIP_CMD']} \
    --sort-command {os.environ['SORT_COMMAND']} \
    --extra '{os.environ['SORT_EXTRAS']}' | tee {logfile}"""
    !$sort_command


Sort the claims file.
Using the sort command 'sort'
header pipe: read_fd=4 write_fd=5
sort options pipe: read_fd=6 write_fd=7
gzip output file: '/data/amandeep/wikidata-20220505/import-wikidata/data/claims.tsv.gz'
sort command: { IFS= read -r header ;  { printf "%s\n" "$header" >&5 ; } ;  printf "%s\n" "$header" ;  IFS= read -u 6 -r options ;  LC_ALL=C sort -t '	' $options ; }  | pigz - > '/data/amandeep/wikidata-20220505/import-wikidata/data/claims.tsv.gz'
gunzip input file: '/data/amandeep/wikidata-20220505/import-wikidata/temp/claims.unsorted.tsv.gz'
full command: pigz -dc '/data/amandeep/wikidata-20220505/import-wikidata/temp/claims.unsorted.tsv.gz' | { IFS= read -r header ;  { printf "%s\n" "$header" >&5 ; } ;  printf "%s\n" "$header" ;  IFS= read -u 6 -r options ;  LC_ALL=C sort -t '	' $options ; }  | pigz - > '/data/amandeep/wikidata-20220505/import-wikidata/data/claims.tsv.gz'
Monitoring the cat command (pid=175774).
Running the sort script (pid=175778).
Reading the KGTK input 

## Build the `all.tsv.gz` file

In [16]:
# Concatenate the unsorted claim, qualifier, alias, description, label, sitelink and
# metadata edge files from TEMP and sort the result into ${OUT}/all.${SORTED_KGTK}.
# The sort step uses the executable selected in the Parameters cell (${SORT_COMMAND}),
# like the per-file sort cell, so that a non-default choice such as `gsort` is also
# honoured here together with ${SORT_EXTRAS}.
!kgtk ${KGTK_FLAGS} \
     cat ${VERBOSE} --use-mgzip=${USE_MGZIP} \
     --input-file ${TEMP}/claims.${UNSORTED_KGTK} \
     --input-file ${TEMP}/qualifiers.${UNSORTED_KGTK} \
     --input-file ${TEMP}/aliases.${UNSORTED_KGTK} \
     --input-file ${TEMP}/descriptions.${UNSORTED_KGTK} \
     --input-file ${TEMP}/labels.${UNSORTED_KGTK} \
     --input-file ${TEMP}/sitelinks.${UNSORTED_KGTK} \
     --input-file ${TEMP}/sitelinks.qualifiers.${UNSORTED_KGTK} \
     --input-file ${TEMP}/metadata.types.${UNSORTED_KGTK} \
     --input-file ${TEMP}/metadata.property.datatypes.${UNSORTED_KGTK} \
   / sort ${VERBOSE} \
     --gzip-command ${GZIP_CMD} \
     --sort-command ${SORT_COMMAND} \
     --extra "${SORT_EXTRAS}" \
     --output-file ${OUT}/all.${SORTED_KGTK} \
| tee ${TEMP}/build-all-edges.log

Using the sort command 'sort'
header pipe: read_fd=4 write_fd=5
sort options pipe: read_fd=6 write_fd=7
gzip output file: '/data/amandeep/wikidata-20220505/import-wikidata/data/all.tsv.gz'
sort command: { IFS= read -r header ;  { printf "%s\n" "$header" >&5 ; } ;  printf "%s\n" "$header" ;  IFS= read -u 6 -r options ;  LC_ALL=C sort -t '	' $options ; }  | pigz - > '/data/amandeep/wikidata-20220505/import-wikidata/data/all.tsv.gz'
Running the sort script (pid=158825).
Reading the KGTK input file header line with KgtkReader
input format: kgtk
Using KGTK_GRAPH_CACHE='/data/amandeep/wikidata-20220505/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db'
Starting kgtkcat pid=158741
Opening the 9 input files.
Opening file 1: /data/amandeep/wikidata-20220505/import-wikidata/temp/claims.unsorted.tsv.gz
input format: kgtk
Using KGTK_GRAPH_CACHE='/data/amandeep/wikidata-20220505/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db'
Graph cache '/data/amandeep/wikidata-20220505/import-wik

## Check for unclaimed qualifiers

In [15]:
# Write the qualifiers whose node1 does not match any `id` in the sorted claims file
# to ${OUT}/qualifiers.unclaimed.${SORTED_KGTK}. Both inputs are declared --presorted,
# so this relies on the sorted files produced by the sort step.
!kgtk ${KGTK_FLAGS} \
     ifnotexists $VERBOSE --use-mgzip=$USE_MGZIP --presorted \
     --input-file ${OUT}/qualifiers.${SORTED_KGTK} \
     --input-keys node1 \
     --filter-file ${OUT}/claims.${SORTED_KGTK} \
     --filter-keys id \
     --output-file ${OUT}/qualifiers.unclaimed.${SORTED_KGTK} \
| tee ${TEMP}/qualifiers.unclaimed.log

KgtkIfEfexists version: 2020-12-03T17:23:24.872146+00:00#U5P2iPrj3w+Az10+UMbGGMcK/SHBl0wuwe3R1sFky9gXILt9e5oSjHFhPMQEWYVnQtoPd7FUqsZZqR3PfFWaAg==
Opening the input file: /data/amandeep/wikidata-20220505/import-wikidata/data/qualifiers.tsv.gz
input format: kgtk
Using KGTK_GRAPH_CACHE='/data/amandeep/wikidata-20220505/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db'
Graph cache '/data/amandeep/wikidata-20220505/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db': file '/data/amandeep/wikidata-20220505/import-wikidata/data/qualifiers.tsv.gz' not found in the cache.
KgtkReader: OK to use the fast read path.
KgtkReader: File_path.suffix: .gz
KgtkReader: reading mgzip with 3 threads: /data/amandeep/wikidata-20220505/import-wikidata/data/qualifiers.tsv.gz
header: id	node1	label	node2	node2;wikidatatype
node1 column found, this is a KGTK edge file
KgtkReader: is_edge_file=True is_node_file=False
KgtkReader: Special columns: node1=1 label=2 node2=3 id=0
KgtkReader: Reading a kgtk

## Split edges by datatype

In [17]:
# Split the sorted claims into one file per Wikidata datatype
# (${OUT}/claims.<datatype>.${SORTED_KGTK}); edges that match none of the patterns are
# written to claims.other via --reject-file.
# NOTE(review): `--obj "node2;wikidatatype"` presumably makes the object part of each
# `;;<datatype>` pattern match the `node2;wikidatatype` column - confirm against the
# kgtk filter documentation.
!kgtk ${KGTK_FLAGS} \
     filter ${VERBOSE} \
     --input-file ${OUT}/claims.${SORTED_KGTK} \
     --obj "node2;wikidatatype" \
     --first-match-only \
     --pattern ";;commonsMedia" \
     --output-file ${OUT}/claims.commonsMedia.${SORTED_KGTK} \
     --pattern ";;external-id" \
     --output-file ${OUT}/claims.external-id.${SORTED_KGTK} \
     --pattern ";;geo-shape" \
     --output-file ${OUT}/claims.geo-shape.${SORTED_KGTK} \
     --pattern ";;globe-coordinate" \
     --output-file ${OUT}/claims.globe-coordinate.${SORTED_KGTK} \
     --pattern ";;math" \
     --output-file ${OUT}/claims.math.${SORTED_KGTK} \
     --pattern ";;monolingualtext" \
     --output-file ${OUT}/claims.monolingualtext.${SORTED_KGTK} \
     --pattern ";;musical-notation" \
     --output-file ${OUT}/claims.musical-notation.${SORTED_KGTK} \
     --pattern ";;quantity" \
     --output-file ${OUT}/claims.quantity.${SORTED_KGTK} \
     --pattern ";;string" \
     --output-file ${OUT}/claims.string.${SORTED_KGTK} \
     --pattern ";;tabular-data" \
     --output-file ${OUT}/claims.tabular-data.${SORTED_KGTK} \
     --pattern ";;time" \
     --output-file ${OUT}/claims.time.${SORTED_KGTK} \
     --pattern ";;url" \
     --output-file ${OUT}/claims.url.${SORTED_KGTK} \
     --pattern ";;wikibase-form" \
     --output-file ${OUT}/claims.wikibase-form.${SORTED_KGTK} \
     --pattern ";;wikibase-item" \
     --output-file ${OUT}/claims.wikibase-item.${SORTED_KGTK} \
     --pattern ";;wikibase-lexeme" \
     --output-file ${OUT}/claims.wikibase-lexeme.${SORTED_KGTK} \
     --pattern ";;wikibase-property" \
     --output-file ${OUT}/claims.wikibase-property.${SORTED_KGTK} \
     --pattern ";;wikibase-sense" \
     --output-file ${OUT}/claims.wikibase-sense.${SORTED_KGTK} \
     --reject-file ${OUT}/claims.other.${SORTED_KGTK} \
     --use-mgzip ${USE_MGZIP} \
    | tee ${TEMP}/edge-datatype-split.log

kgtk filter version: 2021-09-24T02:35:27.840163+00:00#gysblgql6Q7482L14Zozt/ne8Owd497FJa7MVp92+UbmixJKElkfg/GY5UmGBsog86NPtmYy+dXWa6PRMIyuIw==
Opening the input file: /data/amandeep/wikidata-20220505/import-wikidata/data/claims.tsv.gz
input format: kgtk
Using KGTK_GRAPH_CACHE='/data/amandeep/wikidata-20220505/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db'
Graph cache '/data/amandeep/wikidata-20220505/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db': file '/data/amandeep/wikidata-20220505/import-wikidata/data/claims.tsv.gz' not found in the cache.
KgtkReader: OK to use the fast read path.
KgtkReader: File_path.suffix: .gz
KgtkReader: reading mgzip with 3 threads: /data/amandeep/wikidata-20220505/import-wikidata/data/claims.tsv.gz
header: id	node1	label	node2	rank	node2;wikidatatype
node1 column found, this is a KGTK edge file
KgtkReader: is_edge_file=True is_node_file=False
KgtkReader: Special columns: node1=1 label=2 node2=3 id=0
KgtkReader: Reading a kgtk file usin

## Extract qualifiers for edge datatype splits

In [10]:
# For every datatype split of the claims, write the qualifiers whose node1 matches an
# `id` in ${OUT}/claims.<datatype>.${SORTED_KGTK} to the corresponding
# ${OUT}/qualifiers.<datatype>.${SORTED_KGTK} file.
for TARGET in WIKIDATATYPES:
    print(f"Extract any qualifiers for the properties in claims.{TARGET}")
    # Export the current datatype so the shell command below can read it as ${TARGET}.
    os.environ['TARGET'] = TARGET
    !kgtk ${KGTK_FLAGS} \
	 ifexists ${VERBOSE} \
	 --input-file ${OUT}/qualifiers.${SORTED_KGTK} \
	 --filter-on ${OUT}/claims.${TARGET}.${SORTED_KGTK} \
	 --output-file ${OUT}/qualifiers.${TARGET}.${SORTED_KGTK} \
	 --input-keys node1 \
	 --filter-keys id \
	 --presorted \
	 --use-mgzip ${USE_MGZIP} \
	| tee ${TEMP}/qualifiers.${TARGET}.log

Extract any qualifiers for the properties in claims.commonsMedia
KgtkIfEfexists version: 2020-12-03T17:23:24.872146+00:00#U5P2iPrj3w+Az10+UMbGGMcK/SHBl0wuwe3R1sFky9gXILt9e5oSjHFhPMQEWYVnQtoPd7FUqsZZqR3PfFWaAg==
Opening the input file: /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/qualifiers.tsv.gz
input format: kgtk
Using KGTK_GRAPH_CACHE='/data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db'
Graph cache '/data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db': file '/data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/qualifiers.tsv.gz' not found in the cache.
KgtkReader: OK to use the fast read path.
KgtkReader: File_path.suffix: .gz
KgtkReader: reading mgzip with 3 threads: /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/qualifiers.tsv.gz
header: id	node1	label	node2	node2;wikidatatype
node1 column found, this is a KGT

## Extract claims with a property in the node1 column

In [11]:
# Copy the claims matching the regular-expression pattern '^P ;;' (subject part `^P`)
# from the sorted claims file into ${OUT}/claims.properties.${SORTED_KGTK}.
!kgtk $KGTK_FLAGS filter $VERBOSE --use-mgzip=$USE_MGZIP --regex\
     --input-file $OUT/claims.$SORTED_KGTK \
     -p '^P ;;' -o $OUT/claims.properties.$SORTED_KGTK \
     | tee ${TEMP}/claims.properties.log

kgtk filter version: 2021-09-24T02:35:27.840163+00:00#gysblgql6Q7482L14Zozt/ne8Owd497FJa7MVp92+UbmixJKElkfg/GY5UmGBsog86NPtmYy+dXWa6PRMIyuIw==
Opening the input file: /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/claims.tsv.gz
input format: kgtk
Using KGTK_GRAPH_CACHE='/data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db'
Graph cache '/data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db': file '/data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/claims.tsv.gz' not found in the cache.
KgtkReader: OK to use the fast read path.
KgtkReader: File_path.suffix: .gz
KgtkReader: reading mgzip with 3 threads: /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/claims.tsv.gz
header: id	node1	label	node2	rank	node2;wikidatatype
node1 column found, this is a KGTK edge file
KgtkReader: is_edge_file=True is_node_file=False
KgtkReader: Sp

## Extract qualifiers for claims with a property in node1 column

In [12]:
# Copy the qualifiers matching the regular-expression pattern '^P ;;' into
# ${OUT}/qualifiers.properties.${SORTED_KGTK}.
# NOTE(review): the pattern is applied to the qualifier's own node1; this presumably
# relies on claim ids of property claims starting with "P" - confirm.
!kgtk $KGTK_FLAGS filter $VERBOSE --use-mgzip=$USE_MGZIP --regex \
     --input-file $OUT/qualifiers.$SORTED_KGTK \
     -p '^P ;;' -o $OUT/qualifiers.properties.$SORTED_KGTK \
     | tee ${TEMP}/qualifiers.properties.log

kgtk filter version: 2021-09-24T02:35:27.840163+00:00#gysblgql6Q7482L14Zozt/ne8Owd497FJa7MVp92+UbmixJKElkfg/GY5UmGBsog86NPtmYy+dXWa6PRMIyuIw==
Opening the input file: /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/qualifiers.tsv.gz
input format: kgtk
Using KGTK_GRAPH_CACHE='/data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db'
Graph cache '/data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db': file '/data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/qualifiers.tsv.gz' not found in the cache.
KgtkReader: OK to use the fast read path.
KgtkReader: File_path.suffix: .gz
KgtkReader: reading mgzip with 3 threads: /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/qualifiers.tsv.gz
header: id	node1	label	node2	node2;wikidatatype
node1 column found, this is a KGTK edge file
KgtkReader: is_edge_file=True is_node_file=False
KgtkRea

## Files in the output data Folder

In [13]:
# List the generated files, oldest first, with human-readable sizes.
# NOTE(review): the cells above write their results to ${OUT}/..., while this lists
# ${OUT}/data - confirm which folder holds the output files in your setup.
!ls -lrth $OUT/data

total 115G
-rw-r--r-- 1 amandeep isdstaff  28G Apr 15 11:15 claims.tsv.gz
-rw-r--r-- 1 amandeep isdstaff 994K Apr 15 11:16 claims.badvalue.tsv.gz
-rw-r--r-- 1 amandeep isdstaff 1.7M Apr 15 11:16 claims.novalue.tsv.gz
-rw-r--r-- 1 amandeep isdstaff 5.4M Apr 15 11:16 claims.somevalue.tsv.gz
-rw-r--r-- 1 amandeep isdstaff 5.4G Apr 15 11:20 qualifiers.tsv.gz
-rw-r--r-- 1 amandeep isdstaff 592K Apr 15 11:20 qualifiers.badvalue.tsv.gz
-rw-r--r-- 1 amandeep isdstaff 242K Apr 15 11:20 qualifiers.badvalueClaims.tsv.gz
-rw-r--r-- 1 amandeep isdstaff 862K Apr 15 11:20 qualifiers.novalue.tsv.gz
-rw-r--r-- 1 amandeep isdstaff 573K Apr 15 11:20 qualifiers.novalueClaims.tsv.gz
-rw-r--r-- 1 amandeep isdstaff 2.1M Apr 15 11:20 qualifiers.somevalue.tsv.gz
-rw-r--r-- 1 amandeep isdstaff 6.7M Apr 15 11:20 qualifiers.somevalueClaims.tsv.gz
-rw-r--r-- 1 amandeep isdstaff 184M Apr 15 11:20 aliases.tsv.gz
-rw-r--r-- 1 amandeep isdstaff 181M Apr 15 11:20 aliases.en.tsv.gz
-rw-r--r-- 1 amandeep isdstaff 694M Ap