# Creating a subset of Wikidata

This notebook illustrates how to partition a Wikidata KGTK edges file.

Parameters are set up in the first cell so that we can run this notebook in batch mode. Example invocation command:

```
papermill partition-wikidata.ipynb partition-wikidata.out.ipynb \
-p wikidata_input_path /data4/rogers/elicit/cache/datasets/wikidata-20200803/data/everything.tsv.gz \
-p wikidata_parts_path /data4/rogers/elicit/cache/datasets/wikidata-20200803/parts
```

### Parameters for invoking the notebook

- `wikidata_input_path`: The KGTK edges file (for example `everything.tsv.gz`) containing the Wikidata edges to partition. 
- `wikidata_parts_path`: The folder that receives the part files of Wikidata, such as `part.wikibase-item.tsv.gz`.
- `temp_folder_path`:    A folder that may be used for temporary files. 
- `gzip_command`:        The compression command for sorting. (default: pigz). 
- `sort_extras`:         Extra parameters for the sort program. 
- `unsorted_extension`:  The file extension for unsorted files, without a leading dot. (default: unsorted.tsv.gz) 
- `sorted_extension`:    The file extension for sorted files, without a leading dot. (default: tsv.gz)  
- `use_mgzip`:           When True, use the mgzip program where appropriate for faster compression. (default: True) 
- `verbose`:             When True, see additional feedback messages. (default: True) 


In [1]:
# Parameters
# Defaults for the notebook parameters; each one may be overridden with
# `papermill -p <name> <value>`.
# NOTE(review): papermill is understood to inject overrides in a new cell
# placed after this one, so temp_folder_path and sort_extras would still be
# derived from the default wikidata_parts_path below unless they are
# overridden too -- confirm when running in batch mode.

# Input edges file and the folder that receives the partitioned files.
wikidata_input_path = '/data4/rogers/elicit/cache/datasets/wikidata-20200803/data/everything.tsv.gz'
wikidata_parts_path = '/data4/rogers/elicit/cache/datasets/wikidata-20200803/parts'

# Scratch folder for the unsorted intermediate files.
temp_folder_path = f'{wikidata_parts_path}/temp'

# Compression program and extra options handed to the sort step.
gzip_command = 'pigz'
sort_extras = f'-T {wikidata_parts_path}'

# File name extensions; no leading dot, the commands supply it themselves.
unsorted_extension = 'unsorted.tsv.gz'
sorted_extension = 'tsv.gz'

# Flags are strings because they are exported as environment variables.
use_mgzip = 'True'
verbose = 'True'


### Import the Python modules we will use in this script.

In [3]:
import os

### Set up environment variables and folders that we need
Define environment variables to pass the script parameters to the KGTK commands.

In [5]:
# Export the notebook parameters as environment variables so that the
# `!` shell commands in the following cells can refer to them as $NAME.
os.environ.update({
    # File containing the Wikidata edges.
    'WIKIDATA_INPUT': wikidata_input_path,
    # Folder to receive Wikidata broken down into smaller files.
    'WIKIDATA_PARTS': wikidata_parts_path,
    # Temporary folder.
    'TEMP': temp_folder_path,
    # KGTK command to run, here wrapped in `time` with debug and timing
    # options (plain "kgtk" is the alternative).
    'kgtk': "time kgtk --debug --timing",
    # Compression command to run.
    'gzip': gzip_command,
    # Extra parameters for sort.
    'SORT_EXTRAS': sort_extras,
    # Extension of unsorted files.
    'UNSORTED_EXTENSION': unsorted_extension,
    # Extension of sorted files.
    'SORTED_EXTENSION': sorted_extension,
    # The use_mgzip flag.
    'USE_MGZIP': use_mgzip,
    # The verbose flag.
    'VERBOSE': verbose,
})


In [6]:
# Create the output and temporary folders.
# -p creates missing parent folders and does not complain when a folder
# already exists, so this cell can be re-run safely.
!mkdir -p "$WIKIDATA_PARTS"
!mkdir -p "$TEMP"

In [7]:
# Remove TSV files left over from a previous run in the output and temp folders.
# -f keeps rm quiet when a pattern matches nothing (for example on a first
# run with freshly created folders).
!rm -f "$WIKIDATA_PARTS"/*.tsv "$WIKIDATA_PARTS"/*.tsv.gz
!rm -f "$TEMP"/*.tsv "$TEMP"/*.tsv.gz

In [8]:
# Split the input edges into unsorted part files in the temp folder.
# Each -p pattern is paired with the -o file that follows it. Because
# --first-match-only is given, the order of the pairs is presumably
# significant, so it is kept as is. Edges matching no pattern go to the
# --reject-file (part.claims).
!$kgtk filter \
    --verbose=$VERBOSE \
    --use-mgzip=$USE_MGZIP \
    --first-match-only \
    --regex \
    --input-file $WIKIDATA_INPUT \
    -p '; ^datatype$ ;' \
    -o $TEMP/property.datatype.$UNSORTED_EXTENSION \
    -p '; ^alias$ ;' \
    -o $TEMP/part.alias.$UNSORTED_EXTENSION \
    -p '; ^description$ ;' \
    -o $TEMP/part.description.$UNSORTED_EXTENSION \
    -p '; ^label$ ;' \
    -o $TEMP/part.label.$UNSORTED_EXTENSION \
    -p '; ^(addl_wikipedia_sitelink|sitelink-badge|sitelink-language|sitelink-site|sitelink-title|wikipedia_sitelink)$ ;' \
    -o $TEMP/part.wikipedia_sitelink.$UNSORTED_EXTENSION \
    -p '; ^type$ ;' \
    -o $TEMP/types.$UNSORTED_EXTENSION \
    -p '^.*-.*-.*-.*-.*$ ;;' \
    -o $TEMP/part.qual.$UNSORTED_EXTENSION \
    --reject-file $TEMP/part.claims.$UNSORTED_EXTENSION

In [9]:
# Sort the property datatype edges from the temp folder into the parts
# folder, using the columns id, node1, label, node2.
!$kgtk sort2 \
    --verbose=$VERBOSE \
    --gzip-command=$gzip \
    --input-file $TEMP/property.datatype.$UNSORTED_EXTENSION \
    --output-file $WIKIDATA_PARTS/property.datatype.$SORTED_EXTENSION \
    --columns id node1 label node2 \
    --extra "$SORT_EXTRAS"


In [9]:
# Sort the alias edges from the temp folder into the parts folder,
# using the columns id, node1, label, node2.
!$kgtk sort2 \
    --verbose=$VERBOSE \
    --gzip-command=$gzip \
    --input-file $TEMP/part.alias.$UNSORTED_EXTENSION \
    --output-file $WIKIDATA_PARTS/part.alias.$SORTED_EXTENSION \
    --columns id node1 label node2 \
    --extra "$SORT_EXTRAS"


In [9]:
# Extract the English aliases: keep the sorted alias edges whose object
# value matches the regular expression @en$.
!$kgtk filter \
    --verbose=$VERBOSE \
    --use-mgzip=$USE_MGZIP \
    --regex \
    --input-file $WIKIDATA_PARTS/part.alias.$SORTED_EXTENSION \
    -p ';; @en$' \
    -o $WIKIDATA_PARTS/part.alias.en.$SORTED_EXTENSION


In [9]:
# Sort the description edges from the temp folder into the parts folder,
# using the columns id, node1, label, node2.
!$kgtk sort2 \
    --verbose=$VERBOSE \
    --gzip-command=$gzip \
    --input-file $TEMP/part.description.$UNSORTED_EXTENSION \
    --output-file $WIKIDATA_PARTS/part.description.$SORTED_EXTENSION \
    --columns id node1 label node2 \
    --extra "$SORT_EXTRAS"


In [9]:
# Extract the English descriptions: keep the sorted description edges whose
# object value matches the regular expression @en$.
!$kgtk filter \
    --verbose=$VERBOSE \
    --use-mgzip=$USE_MGZIP \
    --regex \
    --input-file $WIKIDATA_PARTS/part.description.$SORTED_EXTENSION \
    -p ';; @en$' \
    -o $WIKIDATA_PARTS/part.description.en.$SORTED_EXTENSION


In [9]:
# Sort the label edges from the temp folder into the parts folder,
# using the columns id, node1, label, node2.
!$kgtk sort2 \
    --verbose=$VERBOSE \
    --gzip-command=$gzip \
    --input-file $TEMP/part.label.$UNSORTED_EXTENSION \
    --output-file $WIKIDATA_PARTS/part.label.$SORTED_EXTENSION \
    --columns id node1 label node2 \
    --extra "$SORT_EXTRAS"


In [9]:
# Extract the English labels: keep the sorted label edges whose object
# value matches the regular expression @en$.
!$kgtk filter \
    --verbose=$VERBOSE \
    --use-mgzip=$USE_MGZIP \
    --regex \
    --input-file $WIKIDATA_PARTS/part.label.$SORTED_EXTENSION \
    -p ';; @en$' \
    -o $WIKIDATA_PARTS/part.label.en.$SORTED_EXTENSION


In [9]:
# Sort the Wikipedia sitelink edges from the temp folder into the parts
# folder, using the columns id, node1, label, node2.
!$kgtk sort2 \
    --verbose=$VERBOSE \
    --gzip-command=$gzip \
    --input-file $TEMP/part.wikipedia_sitelink.$UNSORTED_EXTENSION \
    --output-file $WIKIDATA_PARTS/part.wikipedia_sitelink.$SORTED_EXTENSION \
    --columns id node1 label node2 \
    --extra "$SORT_EXTRAS"


In [9]:
# Extract the English sitelinks. --obj=lang presumably makes the object
# pattern test the `lang` column, which must equal en here (no --regex).
!$kgtk filter \
    --verbose=$VERBOSE \
    --use-mgzip=$USE_MGZIP \
    --obj=lang \
    --input-file $WIKIDATA_PARTS/part.wikipedia_sitelink.$SORTED_EXTENSION \
    -p ';; en' \
    -o $WIKIDATA_PARTS/part.wikipedia_sitelink.en.$SORTED_EXTENSION


In [9]:
# Sort the type edges from the temp folder into the parts folder,
# using the columns id, node1, label, node2.
!$kgtk sort2 \
    --verbose=$VERBOSE \
    --gzip-command=$gzip \
    --input-file $TEMP/types.$UNSORTED_EXTENSION \
    --output-file $WIKIDATA_PARTS/types.$SORTED_EXTENSION \
    --columns id node1 label node2 \
    --extra "$SORT_EXTRAS"


In [9]:
# Sort the qualifier edges from the temp folder into the parts folder,
# using the columns id, node1, label, node2.
# NOTE(review): the later ifexists cells read part.qual with --presorted and
# --input-keys node1, while this sort lists id before node1 -- confirm that
# the resulting order is what ifexists expects.
!$kgtk sort2 \
    --verbose=$VERBOSE \
    --gzip-command=$gzip \
    --input-file $TEMP/part.qual.$UNSORTED_EXTENSION \
    --output-file $WIKIDATA_PARTS/part.qual.$SORTED_EXTENSION \
    --columns id node1 label node2 \
    --extra "$SORT_EXTRAS"


In [9]:
# Sort the claim edges (the rejects of the label split) from the temp folder
# into the parts folder, using the columns id, node1, label, node2.
!$kgtk sort2 \
    --verbose=$VERBOSE \
    --gzip-command=$gzip \
    --input-file $TEMP/part.claims.$UNSORTED_EXTENSION \
    --output-file $WIKIDATA_PARTS/part.claims.$SORTED_EXTENSION \
    --columns id node1 label node2 \
    --extra "$SORT_EXTRAS"


In [9]:
# List the unique node1 values of the claims, written in node-only format
# to part.claims.entities.
!$kgtk unique \
    --verbose=$VERBOSE \
    --format=node-only \
    --use-mgzip=$USE_MGZIP \
    --input-file $WIKIDATA_PARTS/part.claims.$SORTED_EXTENSION \
    --output-file $WIKIDATA_PARTS/part.claims.entities.$SORTED_EXTENSION \
    --column node1


In [9]:
# Summarize the unique values of the node2;wikidatatype column of the claims
# into part.claims.datatypes.
!$kgtk unique \
    --verbose=$VERBOSE \
    --use-mgzip=$USE_MGZIP \
    --input-file $WIKIDATA_PARTS/part.claims.$SORTED_EXTENSION \
    --output-file $WIKIDATA_PARTS/part.claims.datatypes.$SORTED_EXTENSION \
    --column 'node2;wikidatatype'


In [9]:
# Split the sorted claims into one part file per Wikidata datatype.
# --obj selects node2;wikidatatype as the column tested by the object
# patterns; each -p is paired with the -o that follows it, and edges that
# match none of the datatypes go to part.other.
!$kgtk filter \
    --verbose=$VERBOSE \
    --use-mgzip=$USE_MGZIP \
    --first-match-only \
    --input-file $WIKIDATA_PARTS/part.claims.$SORTED_EXTENSION \
    --obj 'node2;wikidatatype' \
    -p ';; commonsMedia' \
    -o $WIKIDATA_PARTS/part.commonsMedia.$SORTED_EXTENSION \
    -p ';; external-id' \
    -o $WIKIDATA_PARTS/part.external-id.$SORTED_EXTENSION \
    -p ';; geo-shape' \
    -o $WIKIDATA_PARTS/part.geo-shape.$SORTED_EXTENSION \
    -p ';; globe-coordinate' \
    -o $WIKIDATA_PARTS/part.globe-coordinate.$SORTED_EXTENSION \
    -p ';; math' \
    -o $WIKIDATA_PARTS/part.math.$SORTED_EXTENSION \
    -p ';; monolingualtext' \
    -o $WIKIDATA_PARTS/part.monolingualtext.$SORTED_EXTENSION \
    -p ';; musical-notation' \
    -o $WIKIDATA_PARTS/part.musical-notation.$SORTED_EXTENSION \
    -p ';; quantity' \
    -o $WIKIDATA_PARTS/part.quantity.$SORTED_EXTENSION \
    -p ';; string' \
    -o $WIKIDATA_PARTS/part.string.$SORTED_EXTENSION \
    -p ';; tabular-data' \
    -o $WIKIDATA_PARTS/part.tabular-data.$SORTED_EXTENSION \
    -p ';; time' \
    -o $WIKIDATA_PARTS/part.time.$SORTED_EXTENSION \
    -p ';; url' \
    -o $WIKIDATA_PARTS/part.url.$SORTED_EXTENSION \
    -p ';; wikibase-form' \
    -o $WIKIDATA_PARTS/part.wikibase-form.$SORTED_EXTENSION \
    -p ';; wikibase-item' \
    -o $WIKIDATA_PARTS/part.wikibase-item.$SORTED_EXTENSION \
    -p ';; wikibase-lexeme' \
    -o $WIKIDATA_PARTS/part.wikibase-lexeme.$SORTED_EXTENSION \
    -p ';; wikibase-property' \
    -o $WIKIDATA_PARTS/part.wikibase-property.$SORTED_EXTENSION \
    -p ';; wikibase-sense' \
    -o $WIKIDATA_PARTS/part.wikibase-sense.$SORTED_EXTENSION \
    --reject-file $WIKIDATA_PARTS/part.other.$SORTED_EXTENSION

In [9]:
# Extract the qualifiers of the commonsMedia claims: keep rows of part.qual
# whose node1 matches the id of an edge in part.commonsMedia.
!$kgtk ifexists \
    --verbose=$VERBOSE \
    --use-mgzip=$USE_MGZIP \
    --presorted \
    --input-file $WIKIDATA_PARTS/part.qual.$SORTED_EXTENSION \
    --filter-on $WIKIDATA_PARTS/part.commonsMedia.$SORTED_EXTENSION \
    --output-file $WIKIDATA_PARTS/part.commonsMedia.qual.$SORTED_EXTENSION \
    --input-keys node1 \
    --filter-keys id


In [9]:
# Extract the qualifiers of the external-id claims: keep rows of part.qual
# whose node1 matches the id of an edge in part.external-id.
!$kgtk ifexists \
    --verbose=$VERBOSE \
    --use-mgzip=$USE_MGZIP \
    --presorted \
    --input-file $WIKIDATA_PARTS/part.qual.$SORTED_EXTENSION \
    --filter-on $WIKIDATA_PARTS/part.external-id.$SORTED_EXTENSION \
    --output-file $WIKIDATA_PARTS/part.external-id.qual.$SORTED_EXTENSION \
    --input-keys node1 \
    --filter-keys id


In [9]:
# Extract the qualifiers of the geo-shape claims: keep rows of part.qual
# whose node1 matches the id of an edge in part.geo-shape.
!$kgtk ifexists \
    --verbose=$VERBOSE \
    --use-mgzip=$USE_MGZIP \
    --presorted \
    --input-file $WIKIDATA_PARTS/part.qual.$SORTED_EXTENSION \
    --filter-on $WIKIDATA_PARTS/part.geo-shape.$SORTED_EXTENSION \
    --output-file $WIKIDATA_PARTS/part.geo-shape.qual.$SORTED_EXTENSION \
    --input-keys node1 \
    --filter-keys id


In [9]:
# Extract the qualifiers of the globe-coordinate claims: keep rows of
# part.qual whose node1 matches the id of an edge in part.globe-coordinate.
!$kgtk ifexists \
    --verbose=$VERBOSE \
    --use-mgzip=$USE_MGZIP \
    --presorted \
    --input-file $WIKIDATA_PARTS/part.qual.$SORTED_EXTENSION \
    --filter-on $WIKIDATA_PARTS/part.globe-coordinate.$SORTED_EXTENSION \
    --output-file $WIKIDATA_PARTS/part.globe-coordinate.qual.$SORTED_EXTENSION \
    --input-keys node1 \
    --filter-keys id


In [9]:
# Extract the qualifiers of the math claims: keep rows of part.qual
# whose node1 matches the id of an edge in part.math.
!$kgtk ifexists \
    --verbose=$VERBOSE \
    --use-mgzip=$USE_MGZIP \
    --presorted \
    --input-file $WIKIDATA_PARTS/part.qual.$SORTED_EXTENSION \
    --filter-on $WIKIDATA_PARTS/part.math.$SORTED_EXTENSION \
    --output-file $WIKIDATA_PARTS/part.math.qual.$SORTED_EXTENSION \
    --input-keys node1 \
    --filter-keys id


In [9]:
# Extract the qualifiers of the monolingualtext claims: keep rows of
# part.qual whose node1 matches the id of an edge in part.monolingualtext.
!$kgtk ifexists \
    --verbose=$VERBOSE \
    --use-mgzip=$USE_MGZIP \
    --presorted \
    --input-file $WIKIDATA_PARTS/part.qual.$SORTED_EXTENSION \
    --filter-on $WIKIDATA_PARTS/part.monolingualtext.$SORTED_EXTENSION \
    --output-file $WIKIDATA_PARTS/part.monolingualtext.qual.$SORTED_EXTENSION \
    --input-keys node1 \
    --filter-keys id


In [9]:
# Extract the qualifiers of the musical-notation claims: keep rows of
# part.qual whose node1 matches the id of an edge in part.musical-notation.
!$kgtk ifexists \
    --verbose=$VERBOSE \
    --use-mgzip=$USE_MGZIP \
    --presorted \
    --input-file $WIKIDATA_PARTS/part.qual.$SORTED_EXTENSION \
    --filter-on $WIKIDATA_PARTS/part.musical-notation.$SORTED_EXTENSION \
    --output-file $WIKIDATA_PARTS/part.musical-notation.qual.$SORTED_EXTENSION \
    --input-keys node1 \
    --filter-keys id


In [9]:
# Extract the qualifiers of the quantity claims: keep rows of part.qual
# whose node1 matches the id of an edge in part.quantity.
!$kgtk ifexists \
    --verbose=$VERBOSE \
    --use-mgzip=$USE_MGZIP \
    --presorted \
    --input-file $WIKIDATA_PARTS/part.qual.$SORTED_EXTENSION \
    --filter-on $WIKIDATA_PARTS/part.quantity.$SORTED_EXTENSION \
    --output-file $WIKIDATA_PARTS/part.quantity.qual.$SORTED_EXTENSION \
    --input-keys node1 \
    --filter-keys id


In [9]:
# Extract the qualifiers of the string claims: keep rows of part.qual
# whose node1 matches the id of an edge in part.string.
!$kgtk ifexists \
    --verbose=$VERBOSE \
    --use-mgzip=$USE_MGZIP \
    --presorted \
    --input-file $WIKIDATA_PARTS/part.qual.$SORTED_EXTENSION \
    --filter-on $WIKIDATA_PARTS/part.string.$SORTED_EXTENSION \
    --output-file $WIKIDATA_PARTS/part.string.qual.$SORTED_EXTENSION \
    --input-keys node1 \
    --filter-keys id


In [9]:
# Extract the qualifiers of the tabular-data claims: keep rows of part.qual
# whose node1 matches the id of an edge in part.tabular-data.
!$kgtk ifexists \
    --verbose=$VERBOSE \
    --use-mgzip=$USE_MGZIP \
    --presorted \
    --input-file $WIKIDATA_PARTS/part.qual.$SORTED_EXTENSION \
    --filter-on $WIKIDATA_PARTS/part.tabular-data.$SORTED_EXTENSION \
    --output-file $WIKIDATA_PARTS/part.tabular-data.qual.$SORTED_EXTENSION \
    --input-keys node1 \
    --filter-keys id


In [9]:
# Extract the qualifiers of the time claims: keep rows of part.qual
# whose node1 matches the id of an edge in part.time.
!$kgtk ifexists \
    --verbose=$VERBOSE \
    --use-mgzip=$USE_MGZIP \
    --presorted \
    --input-file $WIKIDATA_PARTS/part.qual.$SORTED_EXTENSION \
    --filter-on $WIKIDATA_PARTS/part.time.$SORTED_EXTENSION \
    --output-file $WIKIDATA_PARTS/part.time.qual.$SORTED_EXTENSION \
    --input-keys node1 \
    --filter-keys id


In [9]:
# Extract the qualifiers of the url claims: keep rows of part.qual
# whose node1 matches the id of an edge in part.url.
!$kgtk ifexists \
    --verbose=$VERBOSE \
    --use-mgzip=$USE_MGZIP \
    --presorted \
    --input-file $WIKIDATA_PARTS/part.qual.$SORTED_EXTENSION \
    --filter-on $WIKIDATA_PARTS/part.url.$SORTED_EXTENSION \
    --output-file $WIKIDATA_PARTS/part.url.qual.$SORTED_EXTENSION \
    --input-keys node1 \
    --filter-keys id


In [9]:
# Extract the qualifiers of the wikibase-form claims: keep rows of part.qual
# whose node1 matches the id of an edge in part.wikibase-form.
!$kgtk ifexists \
    --verbose=$VERBOSE \
    --use-mgzip=$USE_MGZIP \
    --presorted \
    --input-file $WIKIDATA_PARTS/part.qual.$SORTED_EXTENSION \
    --filter-on $WIKIDATA_PARTS/part.wikibase-form.$SORTED_EXTENSION \
    --output-file $WIKIDATA_PARTS/part.wikibase-form.qual.$SORTED_EXTENSION \
    --input-keys node1 \
    --filter-keys id


In [9]:
# Extract the qualifiers of the wikibase-item claims: keep rows of part.qual
# whose node1 matches the id of an edge in part.wikibase-item.
!$kgtk ifexists \
    --verbose=$VERBOSE \
    --use-mgzip=$USE_MGZIP \
    --presorted \
    --input-file $WIKIDATA_PARTS/part.qual.$SORTED_EXTENSION \
    --filter-on $WIKIDATA_PARTS/part.wikibase-item.$SORTED_EXTENSION \
    --output-file $WIKIDATA_PARTS/part.wikibase-item.qual.$SORTED_EXTENSION \
    --input-keys node1 \
    --filter-keys id


In [9]:
# Extract the qualifiers of the wikibase-lexeme claims: keep rows of
# part.qual whose node1 matches the id of an edge in part.wikibase-lexeme.
!$kgtk ifexists \
    --verbose=$VERBOSE \
    --use-mgzip=$USE_MGZIP \
    --presorted \
    --input-file $WIKIDATA_PARTS/part.qual.$SORTED_EXTENSION \
    --filter-on $WIKIDATA_PARTS/part.wikibase-lexeme.$SORTED_EXTENSION \
    --output-file $WIKIDATA_PARTS/part.wikibase-lexeme.qual.$SORTED_EXTENSION \
    --input-keys node1 \
    --filter-keys id


In [9]:
# Extract the qualifiers of the wikibase-property claims: keep rows of
# part.qual whose node1 matches the id of an edge in part.wikibase-property.
!$kgtk ifexists \
    --verbose=$VERBOSE \
    --use-mgzip=$USE_MGZIP \
    --presorted \
    --input-file $WIKIDATA_PARTS/part.qual.$SORTED_EXTENSION \
    --filter-on $WIKIDATA_PARTS/part.wikibase-property.$SORTED_EXTENSION \
    --output-file $WIKIDATA_PARTS/part.wikibase-property.qual.$SORTED_EXTENSION \
    --input-keys node1 \
    --filter-keys id


In [9]:
# Extract the qualifiers of the wikibase-sense claims: keep rows of part.qual
# whose node1 matches the id of an edge in part.wikibase-sense.
!$kgtk ifexists \
    --verbose=$VERBOSE \
    --use-mgzip=$USE_MGZIP \
    --presorted \
    --input-file $WIKIDATA_PARTS/part.qual.$SORTED_EXTENSION \
    --filter-on $WIKIDATA_PARTS/part.wikibase-sense.$SORTED_EXTENSION \
    --output-file $WIKIDATA_PARTS/part.wikibase-sense.qual.$SORTED_EXTENSION \
    --input-keys node1 \
    --filter-keys id


In [9]:
# Extract the claims about properties: keep the claim edges whose subject
# (first pattern field) matches the regular expression ^P.
!$kgtk filter \
    --verbose=$VERBOSE \
    --use-mgzip=$USE_MGZIP \
    --regex \
    --input-file $WIKIDATA_PARTS/part.claims.$SORTED_EXTENSION \
    -p '^P ;;' \
    -o $WIKIDATA_PARTS/part.property.$SORTED_EXTENSION


In [9]:
# Extract the qualifiers of property claims: keep the qualifier edges whose
# subject (first pattern field) matches the regular expression ^P.
!$kgtk filter \
    --verbose=$VERBOSE \
    --use-mgzip=$USE_MGZIP \
    --regex \
    --input-file $WIKIDATA_PARTS/part.qual.$SORTED_EXTENSION \
    -p '^P ;;' \
    -o $WIKIDATA_PARTS/part.property.qual.$SORTED_EXTENSION


In [9]:
# Summarize the unique values of the node2;wikidatatype column of the
# property claims into part.property.datatypes.
!$kgtk unique \
    --verbose=$VERBOSE \
    --use-mgzip=$USE_MGZIP \
    --input-file $WIKIDATA_PARTS/part.property.$SORTED_EXTENSION \
    --output-file $WIKIDATA_PARTS/part.property.datatypes.$SORTED_EXTENSION \
    --column 'node2;wikidatatype'


In [9]:
# Summarize the unique values of the label column of the property claims
# into part.property.counts, using total-count as the output edge label.
!$kgtk unique \
    --verbose=$VERBOSE \
    --use-mgzip=$USE_MGZIP \
    --input-file $WIKIDATA_PARTS/part.property.$SORTED_EXTENSION \
    --output-file $WIKIDATA_PARTS/part.property.counts.$SORTED_EXTENSION \
    --column label \
    --label total-count


In [9]:
# Attach English labels to the node1 values of the property counts.
# The label edges are supplied with --label-file. Passing them as a second
# --input-file would replace the counts file as the input, leaving lift
# without a separate label source.
!$kgtk lift --verbose=$VERBOSE --use-mgzip=$USE_MGZIP \
 --input-file       $WIKIDATA_PARTS/part.property.counts.$SORTED_EXTENSION \
 --label-file       $WIKIDATA_PARTS/part.label.en.$SORTED_EXTENSION \
 --output-file      $WIKIDATA_PARTS/part.property.counts-with-labels.$SORTED_EXTENSION \
 --columns-to-lift  node1 \
 --prefilter-labels
