# Subset preprocessing
**GOAL:** Preprocess the Wikidata input by running the `partition-wikidata` and `Wikidata Useful Files` notebooks. This must be done before calling the `subsets` notebook.

In [1]:
import os
import papermill as pm

In [7]:
# Where the input files are read from
wiki_root_folder = "/nas/home/mbmann/kgtk/datasets/wikidataos-v4-mm-2/"
# Alternative input location (disabled):
# wiki_root_folder = "/nas/home/mbmann/KGTK-public-graphs2/wikidata-20201130/data/"

# Parent folder for everything this notebook writes
output_folder = "/nas/home/mbmann/useful_files_output"

# Temp and output directories, both located under output_folder
temp = f"{output_folder}/output.temp"
out = f"{output_folder}/output"

In [None]:
# Create the temp and output directories.
# os.makedirs also creates a missing parent folder, and exist_ok=True keeps the
# cell safe to re-run; a plain `mkdir` fails in both of those situations.
os.makedirs(temp, exist_ok=True)
os.makedirs(out, exist_ok=True)

Filepath variables

In [4]:
# Names of the required input files, expected inside wiki_root_folder
claims_file = "claims.tsv.gz"
label_file = "labels.en.tsv.gz"
alias_file = "aliases.en.tsv.gz"
description_file = "descriptions.en.tsv.gz"
item_file = "claims.wikibase-item.tsv.gz"
qual_file = "qualifiers.tsv.gz"
property_datatypes_file = "metadata.property.datatypes.tsv.gz"
metadata_file = "metadata.types.tsv.gz"

# Full paths to the input files.
# os.path.join adds the separator only when it is missing, so wiki_root_folder
# works with or without a trailing "/" (bare concatenation required one).
claims = os.path.join(wiki_root_folder, claims_file)
labels = os.path.join(wiki_root_folder, label_file)
aliases = os.path.join(wiki_root_folder, alias_file)
descriptions = os.path.join(wiki_root_folder, description_file)
items = os.path.join(wiki_root_folder, item_file)
quals = os.path.join(wiki_root_folder, qual_file)
datatypes = os.path.join(wiki_root_folder, property_datatypes_file)
metadata = os.path.join(wiki_root_folder, metadata_file)

# Useful files Jupyter notebook
useful_files_notebook = "Wikidata Useful Files.ipynb"
notebooks_folder = "/nas/home/mbmann/kgtk_subset/kgtk/examples/"

Create `all.tsv.gz` by concatenating the input files

In [None]:
!{kgtk} cat \
-i {aliases} \
-i {descriptions} \
-i {quals} \
-i {claims} \
-i {labels} \
-i {datatypes} \
-i {metadata} \
-o {out}/all.tsv.gz

Set the `os.environ` variables (notebook directories, temp/output folders, and input file paths) used by the papermill calls

In [5]:
# Base folder under which the examples/ and use-cases/ directories are located
kgtk_scripts_path = "/nas/home/mbmann/kgtk_subset/kgtk"

# Publish all locations as environment variables in one update
os.environ.update(
    {
        "EXAMPLES_DIR": f"{kgtk_scripts_path}/examples",
        "USECASE_DIR": f"{kgtk_scripts_path}/use-cases",
        "TEMP": temp,
        "OUT": out,
        "DATATYPES": datatypes,
        "METADATA": metadata,
    }
)

Call the `partition-wikidata` notebook

In [None]:
# Execute partition-wikidata.ipynb with papermill; the executed copy is written
# to the TEMP directory as partition-wikidata.out.ipynb.
#
# The trailing `;` must sit on the closing line of the call to suppress the
# cell's output. On a line of its own IPython treats `;` as its auto-quote
# escape, evaluates `("")`, and the cell displays '' instead.
pm.execute_notebook(
    os.environ["EXAMPLES_DIR"] + "/partition-wikidata.ipynb",
    os.environ["TEMP"] + "/partition-wikidata.out.ipynb",
    parameters=dict(
        wikidata_input_path = os.environ["OUT"] + "/all.tsv.gz",
        wikidata_parts_path = os.environ["OUT"] + "/parts",
        temp_folder_path = os.environ["OUT"] + "/parts/temp",
        # NOTE(review): "$OUT" is passed literally; presumably it is expanded
        # from the OUT environment variable inside the executed notebook - confirm.
        sort_extras = "--buffer-size 30% --temporary-directory $OUT/parts/temp",
        verbose = False
    )
);

Call the `useful files` notebook

In [6]:
# Execute "Wikidata Useful Files.ipynb" with papermill; the executed copy is
# written to the TEMP directory as "Wikidata Useful Files Out.ipynb".
# Its input root is the parts/ folder under OUT.
#
# The trailing `;` must sit on the closing line of the call to suppress the
# cell's output. On a line of its own IPython treats `;` as its auto-quote
# escape, evaluates `("")`, and the cell displays '' instead.
pm.execute_notebook(
    os.environ["USECASE_DIR"] + "/Wikidata Useful Files.ipynb",
    os.environ["TEMP"] + "/Wikidata Useful Files Out.ipynb",
    parameters=dict(
        output_path = os.environ["OUT"],
        output_folder = "useful_files",
        temp_folder = "temp.useful_files",
        wiki_root_folder = os.environ["OUT"] + "/parts/",
        languages = 'en',
        compute_pagerank = True,
        delete_database = False
    )
);

Executing:   0%|          | 0/103 [00:00<?, ?cell/s]

''