# Import Wikidata

This notebook assumes the file `latest-all.json.bz2` has already been [downloaded](https://dumps.wikimedia.org/wikidatawiki/entities/) and stored in the folder set as `input_path` in the cell marked `# Parameters`.

You can download the `gz` version instead; in that case, update the variable `wikidata_json_file` with the correct file name.

In [1]:
import os
import shutil

from kgtk.configure_kgtk_notebooks import ConfigureKGTK
from kgtk.functions import kgtk, kypher

In [2]:
# Parameters
# NOTE: the values below are machine-specific; adjust them before running.

# Folder that holds the downloaded Wikidata JSON dump (exported as GRAPH).
input_path = "/Volumes/saggu-ssd/wikidata-2021-10-27"
# Folder under which the project's output and temporary folders are created.
output_path = "/Volumes/saggu-ssd/wikidata-2021-10-27-out"
# Name of the project; it appears as the sub-folder of output_path used as OUT.
project_name = "import-wikidata"

# Local KGTK checkout; the pattern directory is looked up beneath it.
kgtk_path = "/Users/amandeep/Github/kgtk"
# File name of the Wikidata dump inside input_path.
wikidata_json_file = "latest-all.json.bz2"

In [3]:
# This notebook registers no additional input files with ConfigureKGTK.
files = []

# Set up the KGTK notebook environment from the parameters above.
# NOTE(review): configure_kgtk presumably exports GRAPH, OUT, TEMP, etc. to
# os.environ -- later cells read os.environ['OUT'] and os.environ['GRAPH'].
ck = ConfigureKGTK(files, kgtk_path=kgtk_path)
ck.configure_kgtk(input_graph_path=input_path,
                  output_path=output_path,
                  project_name=project_name)

User home: /Users/amandeep
Current dir: /Users/amandeep/Github/kgtk/use-cases
KGTK dir: /Users/amandeep/Github/kgtk
Use-cases dir: /Users/amandeep/Github/kgtk/use-cases


In [4]:
# Print the environment variables configured for this notebook.
ck.print_env_variables()

kgtk: kgtk
GRAPH: /Volumes/saggu-ssd/wikidata-2021-10-27
KGTK_GRAPH_CACHE: /Volumes/saggu-ssd/wikidata-2021-10-27-out/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db
KGTK_OPTION_DEBUG: false
OUT: /Volumes/saggu-ssd/wikidata-2021-10-27-out/import-wikidata
kypher: kgtk query --graph-cache /Volumes/saggu-ssd/wikidata-2021-10-27-out/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db
KGTK_LABEL_FILE: /Volumes/saggu-ssd/wikidata-2021-10-27/labels.en.tsv.gz
EXAMPLES_DIR: /Users/amandeep/Github/kgtk/examples
USE_CASES_DIR: /Users/amandeep/Github/kgtk/use-cases
STORE: /Volumes/saggu-ssd/wikidata-2021-10-27-out/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db
TEMP: /Volumes/saggu-ssd/wikidata-2021-10-27-out/import-wikidata/temp.import-wikidata


## Define some ENV variables (users can simply run this step; no changes are required)

In [5]:
# Publish the working locations and tool settings through os.environ so that
# the `!` shell cells below can expand them as ${NAME}.

# KGTK pattern directory inside the KGTK checkout.
os.environ['PATTERNDIR'] = f"{kgtk_path}/wikidata/patterns"

# Working folders, all located under OUT:
#   DATADIR    - data folder (the sort steps write their results here)
#   TEMPDIR    - temporary (unsorted) files
#   LOGDIR     - working log files
#   COUNTDIR   - count validation files
#   PRODUCTDIR - completed data products
for env_name, subfolder in (
    ('DATADIR', 'data'),
    ('TEMPDIR', 'temp'),
    ('LOGDIR', 'logs'),
    ('COUNTDIR', 'counts'),
    ('PRODUCTDIR', 'product'),
):
    os.environ[env_name] = f"{os.environ['OUT']}/{subfolder}"

# Full path of the Wikidata JSON dump inside the GRAPH folder.
os.environ['WIKIDATA_ALL_JSON'] = f"{os.environ['GRAPH']}/{wikidata_json_file}"

# Alternative, more talkative value for KGTK_FLAGS:
#KGTK_FLAGS="--debug --timing --progress --progress-tty `tty`"
os.environ.update({
    # Work file extensions.
    'UNSORTED_KGTK': "unsorted.tsv.gz",
    'SORTED_KGTK': "tsv.gz",
    # Use mgzip in some cases?
    'USE_MGZIP': "TRUE",
    # Compression program handed to `kgtk sort --gzip-command`.
    'GZIP_CMD': "pigz",
    # Some common flags.
    'KGTK_FLAGS': "--debug --timing",
    'VERBOSE': "--verbose",
    # Extra options forwarded to the sort program; -T points at TEMPDIR.
    'SORT_EXTRAS': f"--parallel 6 --buffer-size 50% -T {os.environ['TEMPDIR']}",
})

# The Wikidata datatypes, one per line. The per-datatype qualifier step loops
# over this list; "other" names the file that receives claims matching none of
# the listed datatypes.
WIKIDATATYPES = """
    commonsMedia
    external-id
    geo-shape
    globe-coordinate
    math
    monolingualtext
    musical-notation
    quantity
    string
    tabular-data
    time
    url
    wikibase-form
    wikibase-item
    wikibase-lexeme
    wikibase-property
    wikibase-sense
    other
""".split()

# Base names of the import split files that the sort step reads from TEMPDIR
# and writes, sorted, to DATADIR.
WIKIDATA_IMPORT_SPLIT_FILES = [
    # Claims.
    "claims",
    "claims.badvalue",
    "claims.novalue",
    "claims.somevalue",
    # Qualifiers.
    "qualifiers",
    "qualifiers.badvalue",
    "qualifiers.badvalueClaims",
    "qualifiers.novalue",
    "qualifiers.novalueClaims",
    "qualifiers.somevalue",
    "qualifiers.somevalueClaims",
    # Aliases, descriptions and labels.
    "aliases",
    "aliases.en",
    "descriptions",
    "descriptions.en",
    "labels",
    "labels.en",
    # Sitelinks.
    "sitelinks",
    "sitelinks.en",
    "sitelinks.en.qualifiers",
    "sitelinks.qualifiers",
    # Metadata.
    "metadata.node",
    "metadata.property.datatypes",
    "metadata.types",
]


# Sort program handed to `kgtk sort --sort-command`. Use `gsort` (the name GNU
# sort is commonly installed under on macOS) when it is on PATH; otherwise fall
# back to the system `sort` so the notebook also runs where no `gsort` exists.
os.environ['SORT_COMMAND'] = "gsort" if shutil.which("gsort") else "sort"

In [6]:
# Create the data, temp, log and count folders (no error if they already exist).
!mkdir -p ${DATADIR} ${TEMPDIR} ${LOGDIR} ${COUNTDIR}

## Run the `import-wikidata` command

**NOTE**:
This command is set to import only English labels/aliases/descriptions, controlled by the parameters `--all-languages False` and `--lang en`.

If you wish to import all languages, simply set `--all-languages True`.

In [None]:
# Run `kgtk import-wikidata` on the JSON dump (WIKIDATA_ALL_JSON) and write the
# unsorted split files into TEMPDIR: the node file, raw claims and qualifiers,
# invalid ("badvalue") claims and qualifiers, aliases, descriptions, labels,
# property datatypes, entry types and raw sitelinks (each text/sitelink split
# also has an English-only file).
# The raw claims, qualifiers and sitelinks are split further by the next cells.
# The command's output is also copied to LOGDIR/import-split-wikidata.log.
!kgtk ${KGTK_FLAGS} \
     import-wikidata \
     -i ${WIKIDATA_ALL_JSON} \
     --node-file ${TEMPDIR}/metadata.node.${UNSORTED_KGTK} \
     --minimal-edge-file ${TEMPDIR}/claims.raw.${UNSORTED_KGTK} \
     --minimal-qual-file ${TEMPDIR}/qualifiers.raw.${UNSORTED_KGTK} \
     --invalid-edge-file ${TEMPDIR}/claims.badvalue.${UNSORTED_KGTK} \
     --invalid-qual-file ${TEMPDIR}/qualifiers.badvalue.${UNSORTED_KGTK} \
     --node-file-id-only \
     --explode-values False \
     --all-languages False \
     --lang en \
     --alias-edges True \
     --split-alias-file ${TEMPDIR}/aliases.${UNSORTED_KGTK} \
     --split-en-alias-file ${TEMPDIR}/aliases.en.${UNSORTED_KGTK} \
     --description-edges True \
     --split-description-file ${TEMPDIR}/descriptions.${UNSORTED_KGTK} \
     --split-en-description-file ${TEMPDIR}/descriptions.en.${UNSORTED_KGTK} \
     --label-edges True \
     --split-label-file ${TEMPDIR}/labels.${UNSORTED_KGTK} \
     --split-en-label-file ${TEMPDIR}/labels.en.${UNSORTED_KGTK} \
     --datatype-edges True \
     --split-datatype-file ${TEMPDIR}/metadata.property.datatypes.${UNSORTED_KGTK} \
     --entry-type-edges True \
     --split-type-file ${TEMPDIR}/metadata.types.${UNSORTED_KGTK} \
     --sitelink-edges True \
     --sitelink-verbose-edges True \
     --split-sitelink-file ${TEMPDIR}/sitelinks.raw.${UNSORTED_KGTK} \
     --split-en-sitelink-file ${TEMPDIR}/sitelinks.en.raw.${UNSORTED_KGTK} \
     --value-hash-width 6 \
     --claim-id-hash-width 8 \
     --use-kgtkwriter True \
     --use-mgzip-for-input False \
     --use-mgzip-for-output False \
     --use-shm True \
     --procs 6 \
     --mapper-batch-size 5 \
     --max-size-per-mapper-queue 3 \
     --single-mapper-queue True \
     --collect-results True \
     --collect-seperately True\
     --collector-batch-size 5 \
     --collector-queue-per-proc-size 3 \
     --progress-interval 500000 \
     --clean \
     --allow-end-of-day False \
     --repair-month-or-day-zero \
     --minimum-valid-year 1 \
     --maximum-valid-year 9999 \
     --validate-fromisoformat \
     --repair-lax-coordinates \
     --allow-language-suffixes \
     --allow-wikidata-lq-strings \
    | tee ${LOGDIR}/import-split-wikidata.log


## Split `somevalue` and `novalue` from `claims.raw.unsorted.tsv.gz`

In [None]:
# Split claims.raw: rows matching the ";; novalue" pattern go to
# claims.novalue, rows matching ";; somevalue" go to claims.somevalue
# (--first-match-only), and every remaining row is written to the reject
# file, claims.unsorted.tsv.gz.
# Output is also copied to LOGDIR/split-claims-missing-values.log.
!kgtk ${KGTK_FLAGS} \
     filter ${VERBOSE} --use-mgzip ${USE_MGZIP} \
     --input-file ${TEMPDIR}/claims.raw.${UNSORTED_KGTK} \
     --first-match-only \
     --pattern ";; novalue"  -o ${TEMPDIR}/claims.novalue.${UNSORTED_KGTK} \
     --pattern ";; somevalue"  -o ${TEMPDIR}/claims.somevalue.${UNSORTED_KGTK} \
     --reject-file ${TEMPDIR}/claims.${UNSORTED_KGTK} \
    | tee ${LOGDIR}/split-claims-missing-values.log

## Split `somevalue` and `novalue` from `qualifiers.raw.unsorted.tsv.gz`

In [None]:
# Split qualifiers.raw with one chained KGTK command; "/" separates the stages
# and each stage passes its rejects on via "--reject-file -":
#   1. filter: rows matching ";; novalue" / ";; somevalue" go to
#      qualifiers.novalue / qualifiers.somevalue.
#   2. ifexists: rows whose node1 matches an id in claims.novalue go to
#      qualifiers.novalueClaims.
#   3. ifexists: same check against claims.somevalue -> qualifiers.somevalueClaims.
#   4. ifexists: same check against claims.badvalue -> qualifiers.badvalueClaims.
# Whatever is left after stage 4 is written to qualifiers.unsorted.tsv.gz.
# This cell needs the claims.novalue / claims.somevalue files from the
# previous cell and claims.badvalue from the import step.
!kgtk ${KGTK_FLAGS} \
     filter ${VERBOSE} --use-mgzip ${USE_MGZIP} \
     --input-file ${TEMPDIR}/qualifiers.raw.${UNSORTED_KGTK} \
     --first-match-only \
     --pattern ";; novalue"  -o ${TEMPDIR}/qualifiers.novalue.${UNSORTED_KGTK} \
     --pattern ";; somevalue"  -o ${TEMPDIR}/qualifiers.somevalue.${UNSORTED_KGTK} \
     --reject-file - \
     / ifexists ${VERBOSE} \
     --input-keys node1 \
     --filter-file ${TEMPDIR}/claims.novalue.${UNSORTED_KGTK} \
     --filter-keys id \
     --output-file ${TEMPDIR}/qualifiers.novalueClaims.${UNSORTED_KGTK} \
     --reject-file - \
     / ifexists ${VERBOSE} \
     --input-keys node1 \
     --filter-file ${TEMPDIR}/claims.somevalue.${UNSORTED_KGTK} \
     --filter-keys id \
     --output-file ${TEMPDIR}/qualifiers.somevalueClaims.${UNSORTED_KGTK} \
     --reject-file - \
     / ifexists ${VERBOSE} \
     --input-keys node1 \
     --filter-file ${TEMPDIR}/claims.badvalue.${UNSORTED_KGTK} \
     --filter-keys id \
     --output-file ${TEMPDIR}/qualifiers.badvalueClaims.${UNSORTED_KGTK} \
     --reject-file ${TEMPDIR}/qualifiers.${UNSORTED_KGTK} \
    | tee ${LOGDIR}/split-qualifiers-missing-values.log

## Split `sitelinks.raw.unsorted.tsv.gz`

In [None]:
# Split sitelinks.raw: rows matching the sitelink-badge / sitelink-language /
# sitelink-site / sitelink-title pattern are written to sitelinks.qualifiers;
# all other rows are written to the reject file, sitelinks.unsorted.tsv.gz.
# Output is also copied to LOGDIR/split-sitelink-qualifiers.log.
!kgtk ${KGTK_FLAGS} \
     filter ${VERBOSE} --use-mgzip=${USE_MGZIP} \
     --input-file ${TEMPDIR}/sitelinks.raw.${UNSORTED_KGTK} \
     --pattern "; sitelink-badge,sitelink-language,sitelink-site,sitelink-title ;" \
     --output-file ${TEMPDIR}/sitelinks.qualifiers.${UNSORTED_KGTK} \
     --reject-file ${TEMPDIR}/sitelinks.${UNSORTED_KGTK} \
    | tee ${LOGDIR}/split-sitelink-qualifiers.log

## Split `sitelinks.en.raw.unsorted.tsv.gz`

In [None]:
# Same split as for sitelinks.raw, applied to the English-only file
# sitelinks.en.raw: matching rows go to sitelinks.en.qualifiers, all other
# rows go to the reject file, sitelinks.en.unsorted.tsv.gz.
# Output is also copied to LOGDIR/split-sitelink-en-qualifiers.log.
!kgtk ${KGTK_FLAGS} \
     filter ${VERBOSE} --use-mgzip=${USE_MGZIP} \
     --input-file ${TEMPDIR}/sitelinks.en.raw.${UNSORTED_KGTK} \
     --pattern "; sitelink-badge,sitelink-language,sitelink-site,sitelink-title ;" \
     --output-file ${TEMPDIR}/sitelinks.en.qualifiers.${UNSORTED_KGTK} \
     --reject-file ${TEMPDIR}/sitelinks.en.${UNSORTED_KGTK} \
    | tee ${LOGDIR}/split-sitelink-en-qualifiers.log

## Sort the files from `TEMPDIR` to `DATADIR` folder

In [None]:
for TARGET in WIKIDATA_IMPORT_SPLIT_FILES:
    print(f"Sort the {TARGET} file.")
    input_file = f"{os.environ['TEMPDIR']}/{TARGET}.{os.environ['UNSORTED_KGTK']}"
    output_file = f"{os.environ['DATADIR']}/{TARGET}.{os.environ['SORTED_KGTK']}"
    logfile = f"{os.environ['LOGDIR']}/{TARGET}-sorted.log"
    sort_command = f"""kgtk {os.environ['KGTK_FLAGS']} \
    sort {os.environ['VERBOSE']} \
    --input-file  {input_file} \
    --output-file {output_file} \
    --gzip-command {os.environ['GZIP_CMD']} \
    --sort-command {os.environ['SORT_COMMAND']} \
    --extra '{os.environ['SORT_EXTRAS']}' | tee {logfile}"""
    !$sort_command


## Build the `all.tsv.gz` file

In [None]:
# Concatenate the unsorted claims, qualifiers, aliases, descriptions, labels,
# sitelinks, sitelink qualifiers, entry types and property datatypes files from
# TEMPDIR, then sort the combined stream into DATADIR/all.tsv.gz.
# The sort stage is given the configured SORT_COMMAND, exactly like the
# per-file sort step, so that SORT_EXTRAS is interpreted by the same sort
# program in both places.
# Output is also copied to LOGDIR/build-all-edges.log.
!kgtk ${KGTK_FLAGS} \
     cat ${VERBOSE} --use-mgzip=${USE_MGZIP} \
     --input-file ${TEMPDIR}/claims.${UNSORTED_KGTK} \
     --input-file ${TEMPDIR}/qualifiers.${UNSORTED_KGTK} \
     --input-file ${TEMPDIR}/aliases.${UNSORTED_KGTK} \
     --input-file ${TEMPDIR}/descriptions.${UNSORTED_KGTK} \
     --input-file ${TEMPDIR}/labels.${UNSORTED_KGTK} \
     --input-file ${TEMPDIR}/sitelinks.${UNSORTED_KGTK} \
     --input-file ${TEMPDIR}/sitelinks.qualifiers.${UNSORTED_KGTK} \
     --input-file ${TEMPDIR}/metadata.types.${UNSORTED_KGTK} \
     --input-file ${TEMPDIR}/metadata.property.datatypes.${UNSORTED_KGTK} \
   / sort ${VERBOSE} \
     --gzip-command ${GZIP_CMD} \
     --sort-command ${SORT_COMMAND} \
     --extra "${SORT_EXTRAS}" \
     --output-file ${DATADIR}/all.${SORTED_KGTK} \
| tee ${LOGDIR}/build-all-edges.log

## Check for unclaimed qualifiers

In [None]:
# Check for unclaimed qualifiers: compare node1 of the sorted qualifiers file
# with the id column of the sorted claims file (both declared --presorted) and
# write the `ifnotexists` result to DATADIR/qualifiers.unclaimed.tsv.gz.
# Output is also copied to LOGDIR/qualifiers.unclaimed.log.
!kgtk ${KGTK_FLAGS} \
     ifnotexists $VERBOSE --use-mgzip=$USE_MGZIP --presorted \
     --input-file ${DATADIR}/qualifiers.${SORTED_KGTK} \
     --input-keys node1 \
     --filter-file ${DATADIR}/claims.${SORTED_KGTK} \
     --filter-keys id \
     --output-file ${DATADIR}/qualifiers.unclaimed.${SORTED_KGTK} \
| tee ${LOGDIR}/qualifiers.unclaimed.log

## Split edges by datatype

In [None]:
# Split the sorted claims file by Wikidata datatype. Each ";;<datatype>"
# pattern is paired with the --output-file that follows it, and with
# --first-match-only a row goes to the first pattern it matches. Rows that
# match none of the patterns are written to claims.other.
# NOTE(review): --obj "node2;wikidatatype" presumably makes the object slot of
# each pattern test the node2;wikidatatype column -- confirm against the
# `kgtk filter` documentation.
# The datatypes listed here must stay in step with WIKIDATATYPES, which the
# qualifier extraction step uses to find these claims.<datatype> files.
# Output is also copied to LOGDIR/edge-datatype-split.log.
!kgtk ${KGTK_FLAGS} \
     filter ${VERBOSE} \
     --input-file ${DATADIR}/claims.${SORTED_KGTK} \
     --obj "node2;wikidatatype" \
     --first-match-only \
     --pattern ";;commonsMedia" \
     --output-file ${DATADIR}/claims.commonsMedia.${SORTED_KGTK} \
     --pattern ";;external-id" \
     --output-file ${DATADIR}/claims.external-id.${SORTED_KGTK} \
     --pattern ";;geo-shape" \
     --output-file ${DATADIR}/claims.geo-shape.${SORTED_KGTK} \
     --pattern ";;globe-coordinate" \
     --output-file ${DATADIR}/claims.globe-coordinate.${SORTED_KGTK} \
     --pattern ";;math" \
     --output-file ${DATADIR}/claims.math.${SORTED_KGTK} \
     --pattern ";;monolingualtext" \
     --output-file ${DATADIR}/claims.monolingualtext.${SORTED_KGTK} \
     --pattern ";;musical-notation" \
     --output-file ${DATADIR}/claims.musical-notation.${SORTED_KGTK} \
     --pattern ";;quantity" \
     --output-file ${DATADIR}/claims.quantity.${SORTED_KGTK} \
     --pattern ";;string" \
     --output-file ${DATADIR}/claims.string.${SORTED_KGTK} \
     --pattern ";;tabular-data" \
     --output-file ${DATADIR}/claims.tabular-data.${SORTED_KGTK} \
     --pattern ";;time" \
     --output-file ${DATADIR}/claims.time.${SORTED_KGTK} \
     --pattern ";;url" \
     --output-file ${DATADIR}/claims.url.${SORTED_KGTK} \
     --pattern ";;wikibase-form" \
     --output-file ${DATADIR}/claims.wikibase-form.${SORTED_KGTK} \
     --pattern ";;wikibase-item" \
     --output-file ${DATADIR}/claims.wikibase-item.${SORTED_KGTK} \
     --pattern ";;wikibase-lexeme" \
     --output-file ${DATADIR}/claims.wikibase-lexeme.${SORTED_KGTK} \
     --pattern ";;wikibase-property" \
     --output-file ${DATADIR}/claims.wikibase-property.${SORTED_KGTK} \
     --pattern ";;wikibase-sense" \
     --output-file ${DATADIR}/claims.wikibase-sense.${SORTED_KGTK} \
     --reject-file ${DATADIR}/claims.other.${SORTED_KGTK} \
     --use-mgzip ${USE_MGZIP} \
    | tee ${LOGDIR}/edge-datatype-split.log

## Extract qualifiers for edge datatype splits

In [None]:
for TARGET in WIKIDATATYPES:
    print(f"Extract any qualifiers for the properties in claims.{TARGET}")
    os.environ['TARGET'] = TARGET
    !kgtk ${KGTK_FLAGS} \
	 ifexists ${VERBOSE} \
	 --input-file ${DATADIR}/qualifiers.${SORTED_KGTK} \
	 --filter-on ${DATADIR}/claims.${TARGET}.${SORTED_KGTK} \
	 --output-file ${DATADIR}/qualifiers.${TARGET}.${SORTED_KGTK} \
	 --input-keys node1 \
	 --filter-keys id \
	 --presorted \
	 --use-mgzip ${USE_MGZIP} \
	| tee ${LOGDIR}/qualifiers.${TARGET}.log

## Extract claims with a property in the node1 column

In [None]:
# Extract the claims selected by the regex pattern '^P ;;' (a property in the
# node1 column) from the sorted claims file into claims.properties.
# Output is also copied to LOGDIR/claims.properties.log.
!kgtk ${KGTK_FLAGS} \
     filter ${VERBOSE} --use-mgzip=${USE_MGZIP} --regex \
     --input-file ${DATADIR}/claims.${SORTED_KGTK} \
     --pattern '^P ;;' \
     --output-file ${DATADIR}/claims.properties.${SORTED_KGTK} \
    | tee ${LOGDIR}/claims.properties.log

## Extract qualifiers for claims with a property in node1 column

In [None]:
# Extract the qualifiers selected by the regex pattern '^P ;;' from the sorted
# qualifiers file into qualifiers.properties.
# Output is also copied to LOGDIR/qualifiers.properties.log.
!kgtk ${KGTK_FLAGS} \
     filter ${VERBOSE} --use-mgzip=${USE_MGZIP} --regex \
     --input-file ${DATADIR}/qualifiers.${SORTED_KGTK} \
     --pattern '^P ;;' \
     --output-file ${DATADIR}/qualifiers.properties.${SORTED_KGTK} \
    | tee ${LOGDIR}/qualifiers.properties.log

## Files in the output data Folder

In [55]:
# List the files in the data folder ($OUT/data is the folder exported as DATADIR).
!ls -lrth $OUT/data

total 213464
-rw-r--r--  1 amandeep  staff   9.2M Nov  2 15:38 claims.tsv.gz
-rw-r--r--  1 amandeep  staff   4.7K Nov  2 15:38 claims.badvalue.tsv.gz
-rw-r--r--  1 amandeep  staff   3.2K Nov  2 15:38 claims.novalue.tsv.gz
-rw-r--r--  1 amandeep  staff   1.4K Nov  2 15:38 claims.somevalue.tsv.gz
-rw-r--r--  1 amandeep  staff   2.5M Nov  2 15:38 qualifiers.tsv.gz
-rw-r--r--  1 amandeep  staff   2.0K Nov  2 15:38 qualifiers.badvalue.tsv.gz
-rw-r--r--  1 amandeep  staff   2.3K Nov  2 15:38 qualifiers.badvalueClaims.tsv.gz
-rw-r--r--  1 amandeep  staff   1.9K Nov  2 15:38 qualifiers.novalue.tsv.gz
-rw-r--r--  1 amandeep  staff   1.4K Nov  2 15:38 qualifiers.novalueClaims.tsv.gz
-rw-r--r--  1 amandeep  staff   4.9K Nov  2 15:38 qualifiers.somevalue.tsv.gz
-rw-r--r--  1 amandeep  staff   1.7K Nov  2 15:38 qualifiers.somevalueClaims.tsv.gz
-rw-r--r--  1 amandeep  staff   2.4M Nov  2 15:38 aliases.tsv.gz
-rw-r--r--  1 amandeep  staff   190K Nov  2 15:38 aliases.en.tsv.gz
-rw-r--r--  1 amandeep 