In [1]:
%%time
import papermill as pm

from kgtk.configure_kgtk_notebooks import ConfigureKGTK
from kgtk.functions import kgtk, kypher
import os

CPU times: user 829 ms, sys: 181 ms, total: 1.01 s
Wall time: 784 ms


**NOTE: downloaded.wikipedia.short_abstracts.tsv.gz**

This file is available to be downloaded from `https://drive.google.com/drive/folders/1UkvFFLWbfjJtSw767IKYPfZiFsqUFu5n`

The location on `ckg07` is `/data/amandeep/downloaded.wikipedia.short_abstracts.tsv.gz`

This file is required for building the cache file for KGTK Browser.

In [2]:
# Input/output locations and project name handed to ConfigureKGTK below.
input_path = '/data/amandeep'
output_path = '/data/amandeep'
project_name = 'create-wikidata-dwd'

# Repository checkouts, the kernel used for the papermill runs, and the
# Wikipedia short-abstracts file described in the note above.
kgtk_path = '/data/amandeep/Github/kgtk'
kgtk_browser_path = '/data/amandeep/Github/kgtk-browser'
kernel_name = 'kgtk-env-ckg07'
wikipedia_short_abstracts_path = "/data/amandeep/downloaded.wikipedia.short_abstracts.tsv.gz"

In [None]:
%%time
ck = ConfigureKGTK([], kgtk_path=kgtk_path)
ck.configure_kgtk(input_graph_path=input_path,
                  output_path=output_path,
                  project_name=project_name,
                 graph_cache_path=None)

In [4]:
# Print the KGTK environment variables through the ConfigureKGTK helper
# (the listing appears in the cell output below).
ck.print_env_variables()

GRAPH: /data/amandeep
KGTK_OPTION_DEBUG: false
USE_CASES_DIR: /data/amandeep/Github/kgtk/use-cases
EXAMPLES_DIR: /data/amandeep/Github/kgtk/examples
KGTK_LABEL_FILE: /data/amandeep/labels.en.tsv.gz


In [None]:
!git clone https://github.com/usc-isi-i2/kgtk-browser $OUT/kgtk-browser

## Run the Import Wikidata Notebook 

In [4]:
# Settings for the import-wikidata.ipynb run.
# The JSON dump is read from, and the import output written to, the same folder.
json_file_path = import_wikidata_path = '/data/amandeep/wikidata-20220519'
wikidata_project_name = 'import-wikidata'
wikidata_json_file = 'latest-all.json.bz2'
sort_command = "sort"

In [None]:
%%time
pm.execute_notebook(
    "import-wikidata.ipynb",
    os.environ["TEMP"] + "/import-wikidata.out.ipynb",
    kernel_name=kernel_name,
    parameters=dict(
        input_path = json_file_path,
        output_path = import_wikidata_path,
        project_name = wikidata_project_name,
        wikidata_json_file = wikidata_json_file,
        kgtk_path = kgtk_path,
        sort_command = sort_command
    )
)

## Run the Useful Files Notebook to compute `isa` and `p279star` files only

In [5]:
# Settings for the first Wikidata-Useful-Files.ipynb pass.
# It reads the import-wikidata output and writes next to it.
first_useful_files_input_path = "/".join((import_wikidata_path, wikidata_project_name))
first_useful_files_output_path = import_wikidata_path
first_useful_files_project_name = 'useful-files'

# Comma-separated file lists passed as `files` / `files_for_cache`.
first_useful_files = "claims,label_all,alias_all,description_all"
first_useful_files_for_cache = "claims"

In [None]:
%%time
pm.execute_notebook(
    "Wikidata-Useful-Files.ipynb",
    os.environ["TEMP"] + "/Wikidata-Useful-Files.out.ipynb",
    kernel_name=kernel_name,
    parameters=dict(
        input_path = first_useful_files_input_path,
        output_path = first_useful_files_output_path,
        project_name = first_useful_files_project_name,
        kgtk_path = kgtk_path,
        files = first_useful_files,
        files_for_cache=first_useful_files_for_cache,
        compute_pagerank=False,
        compute_degrees=False,
        debug=False,
        compute_isa_star=False,
        compute_p31p279_star=False
    )
)

In [None]:
!cp $import_wikidata_path/$first_useful_files_project_name/derived.isa.tsv.gz $import_wikidata_path/$wikidata_project_name
!cp $import_wikidata_path/$first_useful_files_project_name/derived.P279star.tsv.gz $import_wikidata_path/$wikidata_project_name

## Run Wikidata Subsets Notebook

The `Wikidata-Subsets.ipynb` notebook will run the following notebooks:

1. `./partition-wikidata.ipynb`
The output will be at the following path (this example follows the parameters specified in the cell below):
`/data/amandeep/wikidata-20220519-dwd-v5/parts`

2. `./Wikidata-Useful-Files.ipynb`

The output will be at the path:
`/data/amandeep/wikidata-20220519-dwd-v5/useful-files`

We will move the output files from the above 2 notebooks to the path `/data/amandeep/wikidata-20220519-dwd-v5` at the end of execution of the `Wikidata-Subsets.ipynb` notebook.

In [6]:
# The subset step reads the import-wikidata project folder and writes a new
# project folder under subset_output_path.
subset_input_path = "/".join((import_wikidata_path, wikidata_project_name))
subset_output_path = '/data/amandeep'
subset_project_name = 'wikidata-20220519-dwd-v5'

# Comma-separated list passed to Wikidata-Subsets.ipynb as `files`.
subset_files = "isa,p279star"

# Classes to remove
remove_classes = 'Q7318358,Q13442814'

# Language codes, passed as `languages` to the subset and search notebooks.
languages = 'en,ru,es,zh-cn,de,it,nl,pl,fr,pt,sv'

In [None]:
%%time
pm.execute_notebook(
    "Wikidata-Subsets.ipynb",
    os.environ["TEMP"] + "/Wikidata-Subsets.out.ipynb",
    kernel_name=kernel_name,
    parameters=dict(
        input_path = subset_input_path,
        output_path = subset_output_path,
        project_name = subset_project_name,
        kgtk_path = kgtk_path,
        files = subset_files,
        remove_classes = remove_classes,
        languages = languages,
        kernel_name = kernel_name
    )
)

In [10]:
!mv $subset_output_path/$subset_project_name/parts/*tsv.gz $subset_output_path/$subset_project_name
!mv $subset_output_path/$subset_project_name/useful-files/*tsv.gz $subset_output_path/$subset_project_name

## Create JSON file for KGTK-Search

The following notebook will create the following file:

`/data/amandeep/wikidata-20220519-dwd-v5/kgtk-search/wikidata.dwd.all.kgtk.search.sorted.jl`

We will then split the JSON Lines file into 1M-line partitions and load them into the ES index.

In [7]:
# The KGTK-Search step reads from and writes to the subset project folder.
search_input_path = search_output_path = "/".join((subset_output_path, subset_project_name))
search_project_name = 'kgtk-search'

# Switches and settings forwarded to Embeddings-Elasticsearch-Triples.ipynb.
compute_embeddings = False
generate_triples = False
generate_kgtk_search = True
datatype_property = 'datatype'

# Elasticsearch endpoint and the index used when loading the search files.
es_url = 'http://ckg07:9200'
es_index = 'wikidata-dwd-kgtk-search-04'

In [None]:
%%time
pm.execute_notebook(
    "Embeddings-Elasticsearch-Triples.ipynb",
    os.environ["TEMP"] + "/Embeddings-Elasticsearch-Triples.out.ipynb",
    kernel_name=kernel_name,
    parameters=dict(
        input_path = search_input_path,
        output_path = search_output_path,
        project_name = search_project_name,
        kgtk_path = kgtk_path,
        compute_embeddings = compute_embeddings,
        generate_triples = generate_triples,
        generate_kgtk_search = generate_kgtk_search,
        datatype_property = datatype_property,
        languages = languages
    )
)

### Split the output JSON Lines file into 1M-line partitions

In [13]:
!mkdir -p $search_output_path/$search_project_name/es_split/

In [14]:
!split $search_output_path/$search_project_name/wikidata.dwd.all.kgtk.search.sorted.jl \
    -l 1000000 \
    $search_output_path/$search_project_name/es_split/

## Run Properties-for-this-type-notebook

This notebook is in the [kgtk-notebooks](https://github.com/usc-isi-i2/kgtk-notebooks) repo.

In [8]:
# The properties step reads from and writes to the subset project folder.
p_input_path = p_output_path = "/".join((subset_output_path, subset_project_name))

# Reuse the graph cache created by the useful-files notebook. It already
# holds the `claims` file, so only the additional files listed below have
# to be loaded, which saves time.
p_graph_cache_path = "/".join(
    (
        subset_output_path,
        subset_project_name,
        "useful-files",
        "temp.useful-files",
        "wikidata.sqlite3.db",
    )
)
files_for_cache = 'item,datatypes,p279,p279star'

p_project_name = 'p1963'
debug = False

In [None]:
%%time
pm.execute_notebook(
    "properties-for-this-type.ipynb",
    os.environ["TEMP"] + "/properties-for-this-type.out.ipynb",
    kernel_name=kernel_name,
    parameters=dict(
        input_path = p_input_path,
        output_path = p_output_path,
        project_name = p_project_name,
        graph_cache_path = p_graph_cache_path,
        debug = debug,
        files_for_cache=files_for_cache   
    )
)

**Move the files out into the root folder**

In [17]:
!mv $p_output_path/$p_project_name/*tsv.gz $p_output_path

## Run class-visualization notebook

In [9]:
# The class-visualization step reads from and writes to the subset project folder.
c_input_path = c_output_path = "/".join((subset_output_path, subset_project_name))
c_project_name = 'class-visualization'

# Same graph cache as the properties step. By now it already contains
# claims, item, datatypes, p279 and p279star, so only `label` has to be
# added.
c_graph_cache_path = p_graph_cache_path
files_for_cache = 'label'
debug = False

In [None]:
%%time
pm.execute_notebook(
    "class-visualization.ipynb",
    os.environ["TEMP"] + "/class-visualization.out.ipynb",
    kernel_name=kernel_name,
    parameters=dict(
        input_path = c_input_path,
        output_path = c_output_path,
        project_name = c_project_name,
        graph_cache_path = c_graph_cache_path,
        debug = debug,
        files_for_cache=files_for_cache   
    )
)

In [10]:
!mv $c_output_path/$c_project_name/class-visualization.node.tsv.gz $c_output_path
!mv $c_output_path/$c_project_name/class-visualization.edge.tsv.gz $c_output_path

## Run Create-claims-augmented-for-browser notebook

In [11]:
# The augmented-claims step reads from and writes to the subset project folder.
a_input_path = a_output_path = "/".join((subset_output_path, subset_project_name))
a_project_name = 'browser-claims-file'

In [13]:
!cp $wikipedia_short_abstracts_path $subset_output_path/$subset_project_name

In [None]:
%%time
pm.execute_notebook(
    "Create-claims-augmented-for-browser.ipynb",
    os.environ["TEMP"] + "/Create-claims-augmented-for-browser.out.ipynb",
    kernel_name=kernel_name,
    parameters=dict(
        input_path = a_input_path,
        output_path = a_output_path,
        project_name = a_project_name
    )
)

## Run KGTK-Query-Text-Search-Setup Notebook

In [12]:
# The text-search setup reads from and writes to the browser-claims project
# folder produced by the previous step.
q_input_path = q_output_path = "/".join((a_output_path, a_project_name))
q_project_name = 'kgtk-browser-files'

In [15]:
!cp $subset_output_path/$subset_project_name/class-visualization.edge.tsv.gz $q_input_path
!cp $subset_output_path/$subset_project_name/class-visualization.node.tsv.gz $q_input_path
!cp $subset_output_path/$subset_project_name/metadata.pagerank.undirected.tsv.gz $q_input_path

In [None]:
%%time
pm.execute_notebook(
    f"{os.environ['OUT']}/kgtk-browser/KGTK-Query-Text-Search-Setup.ipynb",
    os.environ["TEMP"] + "/KGTK-Query-Text-Search-Setup.ipynb",
    kernel_name=kernel_name,
    parameters=dict(
        input_path = q_input_path,
        output_path = q_output_path,
        project_name = q_project_name
    )
)

## LOAD ES Index

**Make sure [table-linker](https://github.com/usc-isi-i2/table-linker) is installed**

In [21]:
!curl -H "Content-Type: application/json" \
-XPUT $es_url/$es_index -d @$search_output_path/$search_project_name/wikidata_dwd_v3_mapping.json

{"acknowledged":true,"shards_acknowledged":true,"index":"wikidata-dwd-kgtk-search-04"}

In [13]:
# Export the search and Elasticsearch settings as environment variables so
# that the %%bash cell below can read them.
os.environ.update(
    search_output_path=search_output_path,
    search_project_name=search_project_name,
    es_url=es_url,
    es_index=es_index,
)

In [None]:
%%bash
# Load every partition in es_split/ into the Elasticsearch index with
# table-linker (`tl`). The paths and connection settings are read from the
# environment variables exported in the previous cell; $TEMP must already be
# set in the environment.
log_file="$TEMP/load_es.log"

# Start from an empty log, then append per partition. Redirecting with `>`
# inside the loop would keep only the output of the last partition.
: > "$log_file"

for f in "$search_output_path/$search_project_name"/es_split/* ;
do
 echo "$f"
 tl load-elasticsearch-index --es-url "$es_url" --es-index "$es_index" --es-version 7 --kgtk-jl-path "$f" >> "$log_file"
 # Pause between partitions (presumably to avoid overloading Elasticsearch; confirm).
 sleep 60
done