In [1]:
%%time
import papermill as pm

from kgtk.configure_kgtk_notebooks import ConfigureKGTK
from kgtk.functions import kgtk, kypher
import os

CPU times: user 1.78 s, sys: 1.95 s, total: 3.73 s
Wall time: 10.8 s


**NOTE: downloaded.wikipedia.short_abstracts.tsv.gz**

This file is available to be downloaded from `https://drive.google.com/drive/folders/1UkvFFLWbfjJtSw767IKYPfZiFsqUFu5n`

The location on `ckg07` is `/data/amandeep/downloaded.wikipedia.short_abstracts.tsv.gz`

This file is required for building the cache file for KGTK Browser.

In [2]:
# Top-level locations and names for this run.
input_path = "/data/amandeep"
output_path = "/data/amandeep"
project_name = "create-wikidata-dwd-20220623"

# Local checkouts of the KGTK repositories.
kgtk_path = "/data/amandeep/Github/kgtk"
kgtk_browser_path = "/data/amandeep/Github/kgtk-browser"

# Jupyter kernel handed to papermill for every child notebook executed below.
kernel_name = "kgtk-env-ckg07"

# Wikipedia short abstracts file; copied into the subset folder before the
# Create-claims-augmented-for-browser notebook is run.
wikipedia_short_abstracts_path = "/data/amandeep/downloaded.wikipedia.short_abstracts.tsv.gz"

In [3]:
%%time
ck = ConfigureKGTK([], kgtk_path=kgtk_path)
ck.configure_kgtk(input_graph_path=input_path,
                  output_path=output_path,
                  project_name=project_name,
                 graph_cache_path=None)

User home: /nas/home/amandeep
Current dir: /data/amandeep/Github/kgtk-notebooks/use-cases/create_wikidata
KGTK dir: /data/amandeep/Github/kgtk
Use-cases dir: /data/amandeep/Github/kgtk/use-cases
CPU times: user 1.22 ms, sys: 152 µs, total: 1.37 ms
Wall time: 1.02 ms


In [4]:
# Print the environment variables set up by the configuration step
# (e.g. OUT, TEMP, GRAPH and the graph-cache location); TEMP is used below
# as the destination for the executed copies of the child notebooks.
ck.print_env_variables()

kgtk: kgtk
USE_CASES_DIR: /data/amandeep/Github/kgtk/use-cases
kypher: kgtk query --graph-cache /data/amandeep/create-wikidata-dwd-20220623/temp.create-wikidata-dwd-20220623/wikidata.sqlite3.db
KGTK_GRAPH_CACHE: /data/amandeep/create-wikidata-dwd-20220623/temp.create-wikidata-dwd-20220623/wikidata.sqlite3.db
STORE: /data/amandeep/create-wikidata-dwd-20220623/temp.create-wikidata-dwd-20220623/wikidata.sqlite3.db
EXAMPLES_DIR: /data/amandeep/Github/kgtk/examples
OUT: /data/amandeep/create-wikidata-dwd-20220623
TEMP: /data/amandeep/create-wikidata-dwd-20220623/temp.create-wikidata-dwd-20220623
GRAPH: /data/amandeep
KGTK_LABEL_FILE: /data/amandeep/labels.en.tsv.gz
KGTK_OPTION_DEBUG: false


## Run the Import Wikidata Notebook 

In [5]:
# Parameters for Import Wikidata
# The JSON dump is read from, and the imported files are written under,
# the same dated folder.
json_file_path = "/data/amandeep/wikidata-20220623"
import_wikidata_path = "/data/amandeep/wikidata-20220623"

wikidata_project_name = "import-wikidata"
wikidata_json_file = "latest-all.json.bz2"
sort_command = "sort"

In [None]:
%%time
pm.execute_notebook(
    "import-wikidata.ipynb",
    os.environ["TEMP"] + "/import-wikidata.out.ipynb",
    kernel_name=kernel_name,
    parameters=dict(
        input_path = json_file_path,
        output_path = import_wikidata_path,
        project_name = wikidata_project_name,
        wikidata_json_file = wikidata_json_file,
        kgtk_path = kgtk_path,
        sort_command = sort_command
    )
)

## Run the Useful Files Notebook to compute `isa` and `p279star` files only

In [6]:
# Parameters for First run on Useful Files
# Input is the folder written by the import-wikidata run; the output goes
# next to it, under the "useful-files" project folder.
first_useful_files_input_path = "/".join([import_wikidata_path, wikidata_project_name])
first_useful_files_output_path = import_wikidata_path
first_useful_files_project_name = "useful-files"

# Comma-separated file keys passed to the notebook as `files` and
# `files_for_cache` respectively.
first_useful_files = "claims,label_all,alias_all,description_all"
first_useful_files_for_cache = "claims"

In [None]:
%%time
pm.execute_notebook(
    "Wikidata-Useful-Files.ipynb",
    os.environ["TEMP"] + "/Wikidata-Useful-Files.out.ipynb",
    kernel_name=kernel_name,
    parameters=dict(
        input_path = first_useful_files_input_path,
        output_path = first_useful_files_output_path,
        project_name = first_useful_files_project_name,
        kgtk_path = kgtk_path,
        files = first_useful_files,
        files_for_cache=first_useful_files_for_cache,
        compute_pagerank=False,
        compute_degrees=False,
        debug=False,
        compute_isa_star=False,
        compute_p31p279_star=False
    )
)

In [9]:
!cp $import_wikidata_path/$first_useful_files_project_name/derived.isa.tsv.gz $import_wikidata_path/$wikidata_project_name
!cp $import_wikidata_path/$first_useful_files_project_name/derived.P279star.tsv.gz $import_wikidata_path/$wikidata_project_name

## Run Wikidata Subsets Notebook

The `Wikidata-Subsets.ipynb` notebook runs the following notebooks:

1. `./partition-wikidata.ipynb`
With the parameters specified in the cell below, the output will be at the path
`/data/amandeep/wikidata-20220623-dwd-v6/parts`

2. `./Wikidata-Useful-Files.ipynb`

With the parameters specified in the cell below, the output will be at the path
`/data/amandeep/wikidata-20220623-dwd-v6/useful-files`

After the `Wikidata-Subsets.ipynb` notebook has finished, we move the output files of the above two notebooks up into `/data/amandeep/wikidata-20220623-dwd-v6`.

In [7]:
# Input is the import-wikidata folder (which now also holds the copied
# derived.isa / derived.P279star files).
subset_input_path = "/".join([import_wikidata_path, wikidata_project_name])
subset_output_path = "/data/amandeep"
subset_project_name = "wikidata-20220623-dwd-v6"

subset_files = "isa,p279star"

# Classes to remove
remove_classes = "Q7318358,Q13442814"

# Comma-separated language codes; also passed to the KGTK-Search notebook.
languages = "en,ru,es,zh-cn,de,it,nl,pl,fr,pt,sv"

In [None]:
%%time
pm.execute_notebook(
    "Wikidata-Subsets.ipynb",
    os.environ["TEMP"] + "/Wikidata-Subsets.out.ipynb",
    kernel_name=kernel_name,
    parameters=dict(
        input_path = subset_input_path,
        output_path = subset_output_path,
        project_name = subset_project_name,
        kgtk_path = kgtk_path,
        files = subset_files,
        remove_classes = remove_classes,
        languages = languages,
        kernel_name = kernel_name
    )
)

In [12]:
!mv $subset_output_path/$subset_project_name/parts/*tsv.gz $subset_output_path/$subset_project_name
!mv $subset_output_path/$subset_project_name/useful-files/*tsv.gz $subset_output_path/$subset_project_name

## Create JSON file for KGTK-Search

The following notebook will create the following file (with the parameters specified in the cell below):

`/data/amandeep/wikidata-20220623-dwd-v6/kgtk-search/wikidata.dwd.all.kgtk.search.sorted.jl`

We will then split this JSON lines file into partitions of 1M lines each and load them into the ES index.

In [8]:
# The KGTK-Search notebook reads from and writes to the subset project folder.
search_input_path = search_output_path = "/".join([subset_output_path, subset_project_name])

search_project_name = "kgtk-search"

# Only the KGTK-Search JSON lines file is generated in this run.
compute_embeddings = False
generate_triples = False
generate_kgtk_search = True
datatype_property = "datatype"

# Elasticsearch target: the new index to create and load, and the old index
# that the alias is removed from at the end of the notebook.
es_url = "http://ckg07:9200"
es_index = "wikidata-dwd-kgtk-search-06"
old_es_index = "wikidata-dwd-kgtk-search-05"

In [None]:
%%time
pm.execute_notebook(
    "Embeddings-Elasticsearch-Triples.ipynb",
    os.environ["TEMP"] + "/Embeddings-Elasticsearch-Triples.out.ipynb",
    kernel_name=kernel_name,
    parameters=dict(
        input_path = search_input_path,
        output_path = search_output_path,
        project_name = search_project_name,
        kgtk_path = kgtk_path,
        compute_embeddings = compute_embeddings,
        generate_triples = generate_triples,
        generate_kgtk_search = generate_kgtk_search,
        datatype_property = datatype_property,
        languages = languages
    )
)

### Split the output json lines file to 1M lines partitions

In [15]:
!mkdir -p $search_output_path/$search_project_name/es_split/

In [16]:
!split $search_output_path/$search_project_name/wikidata.dwd.all.kgtk.search.sorted.jl \
    -l 1000000 \
    $search_output_path/$search_project_name/es_split/

## Run Properties-for-this-type-notebook

This notebook is in the [kgtk-notebooks](https://github.com/usc-isi-i2/kgtk-notebooks) repo.

In [9]:
# properties-for-this-type reads from and writes to the subset project folder.
p_input_path = p_output_path = "/".join([subset_output_path, subset_project_name])

# we will re use graph cache from the useful-files notebook
# at this point it already has the `claims` file loaded into cache.
# we will only load the required files into the cache, save time
p_graph_cache_path = f"{p_output_path}/useful-files/temp.useful-files/wikidata.sqlite3.db"

# NOTE: `files_for_cache` and `debug` are assigned again by the
# class-visualization parameter cell further down, so this cell must be run
# right before the properties-for-this-type execution cell.
files_for_cache = "item,datatypes,p279,p279star"

p_project_name = "p1963"
debug = False

In [None]:
%%time
pm.execute_notebook(
    "properties-for-this-type.ipynb",
    os.environ["TEMP"] + "/properties-for-this-type.out.ipynb",
    kernel_name=kernel_name,
    parameters=dict(
        input_path = p_input_path,
        output_path = p_output_path,
        project_name = p_project_name,
        graph_cache_path = p_graph_cache_path,
        debug = debug,
        files_for_cache=files_for_cache   
    )
)

**Move the output files out of the `p1963` folder into the subset root folder (`p_output_path`)**

In [20]:
!mv $p_output_path/$p_project_name/statistics.Pinstance_count.star.tsv.gz $p_output_path
!mv $p_output_path/$p_project_name/statistics.Pinstance_count.tsv.gz $p_output_path
!mv $p_output_path/$p_project_name/statistics.Psubclass_count.star.tsv.gz $p_output_path
!mv $p_output_path/$p_project_name/derived.class.P1963computed.count.tsv.gz $p_output_path
!mv $p_output_path/$p_project_name/derived.P1963computed.subclass.count.star.tsv.gz $p_output_path
!mv $p_output_path/$p_project_name/derived.Pproperty_domain.tsv.gz $p_output_path
!mv $p_output_path/$p_project_name/derived.P1963computed.count.star.tsv.gz $p_output_path

mv: cannot stat ‘/data/amandeep/wikidata-20220623-dwd-v6/p1963/statistics.Pinstance_count.star.tsv.gz’: No such file or directory
mv: cannot stat ‘/data/amandeep/wikidata-20220623-dwd-v6/p1963/statistics.Pinstance_count.tsv.gz’: No such file or directory
mv: cannot stat ‘/data/amandeep/wikidata-20220623-dwd-v6/p1963/statistics.Psubclass_count.star.tsv.gz’: No such file or directory
mv: cannot stat ‘/data/amandeep/wikidata-20220623-dwd-v6/p1963/derived.class.P1963computed.count.tsv.gz’: No such file or directory
mv: cannot stat ‘/data/amandeep/wikidata-20220623-dwd-v6/p1963/derived.Pproperty_domain.tsv.gz’: No such file or directory
mv: cannot stat ‘/data/amandeep/wikidata-20220623-dwd-v6/p1963/derived.P1963computed.count.star.tsv.gz’: No such file or directory


## Run class-visualization notebook

In [10]:
# class-visualization reads from and writes to the subset project folder.
c_input_path = c_output_path = "/".join([subset_output_path, subset_project_name])
c_project_name = "class-visualization"

# re use the graph cache, at this point the cache has the following files loaded
# claims,item,datatypes,p279,p279star
# we only need to load label
c_graph_cache_path = p_graph_cache_path

# These two names are shared with the properties-for-this-type section and
# are overwritten here.
files_for_cache = "label"
debug = False

In [None]:
%%time
pm.execute_notebook(
    "class-visualization.ipynb",
    os.environ["TEMP"] + "/class-visualization.out.ipynb",
    kernel_name=kernel_name,
    parameters=dict(
        input_path = c_input_path,
        output_path = c_output_path,
        project_name = c_project_name,
        graph_cache_path = c_graph_cache_path,
        debug = debug,
        files_for_cache=files_for_cache   
    )
)

In [23]:
!mv $c_output_path/$c_project_name/class-visualization.node.tsv.gz $c_output_path
!mv $c_output_path/$c_project_name/class-visualization.edge.tsv.gz $c_output_path

## Run Create-claims-augmented-for-browser notebook

In [11]:
# Create-claims-augmented-for-browser reads from and writes to the subset
# project folder.
a_input_path = a_output_path = "/".join([subset_output_path, subset_project_name])
a_project_name = "browser-claims-file"

In [25]:
!cp $wikipedia_short_abstracts_path $subset_output_path/$subset_project_name

In [None]:
%%time
pm.execute_notebook(
    "Create-claims-augmented-for-browser.ipynb",
    os.environ["TEMP"] + "/Create-claims-augmented-for-browser.out.ipynb",
    kernel_name=kernel_name,
    parameters=dict(
        input_path = a_input_path,
        output_path = a_output_path,
        project_name = a_project_name
    )
)

## Run KGTK-Query-Text-Search-Setup Notebook

In [12]:
# The text-search setup works inside the browser-claims-file project folder.
q_input_path = q_output_path = "/".join([a_output_path, a_project_name])

q_project_name = "kgtk-browser-files"

In [28]:
!cp $subset_output_path/$subset_project_name/class-visualization.edge.tsv.gz $q_input_path
!cp $subset_output_path/$subset_project_name/class-visualization.node.tsv.gz $q_input_path
!cp $subset_output_path/$subset_project_name/metadata.pagerank.undirected.tsv.gz $q_input_path

In [None]:
%%time
pm.execute_notebook(
    "KGTK-Query-Text-Search-Setup.ipynb",
    os.environ["TEMP"] + "/KGTK-Query-Text-Search-Setup.ipynb",
    kernel_name=kernel_name,
    parameters=dict(
        input_path = q_input_path,
        output_path = q_output_path,
        project_name = q_project_name
    )
)

## LOAD ES Index

**Make sure [table-linker](https://github.com/usc-isi-i2/table-linker) is installed**

In [14]:
!curl -H "Content-Type: application/json" \
-XPUT $es_url/$es_index -d @$search_output_path/$search_project_name/wikidata_dwd_v3_mapping.json

{"acknowledged":true,"shards_acknowledged":true,"index":"wikidata-dwd-kgtk-search-06"}

In [15]:
# Export the Python settings as environment variables so that the %%bash
# cells below can read them as $search_output_path, $es_url, etc.
os.environ.update(
    search_output_path=search_output_path,
    search_project_name=search_project_name,
    es_url=es_url,
    es_index=es_index,
    old_es_index=old_es_index,
)

In [16]:
%%bash
# Load every partition produced by `split` into the new Elasticsearch index.
# Reads search_output_path, search_project_name, es_url, es_index and TEMP
# from the environment.

# Start this run with an empty log file.
: > "$TEMP/load_es.log"

for f in "$search_output_path/$search_project_name"/es_split/* ;
do
 echo "$f"
 # Append (>>) instead of truncating (>): with ">" each iteration wiped the
 # log, so only the output of the last partition survived.
 # A failed load is reported on stderr, and the loop carries on with the
 # remaining partitions as before.
 tl load-elasticsearch-index --es-url "$es_url" --es-index "$es_index" --es-version 7 --kgtk-jl-path "$f" >> "$TEMP/load_es.log" \
   || echo "load-elasticsearch-index FAILED for $f" >&2
 # Pause between partitions -- presumably to let Elasticsearch catch up
 # between bulk loads; TODO confirm.
 sleep 60
done

/data/amandeep/wikidata-20220623-dwd-v6/kgtk-search/es_split/aa


load-elasticsearch-index Time: 203.23125886917114s


/data/amandeep/wikidata-20220623-dwd-v6/kgtk-search/es_split/ab


load-elasticsearch-index Time: 212.19348120689392s


/data/amandeep/wikidata-20220623-dwd-v6/kgtk-search/es_split/ac


load-elasticsearch-index Time: 232.39738059043884s


/data/amandeep/wikidata-20220623-dwd-v6/kgtk-search/es_split/ad


load-elasticsearch-index Time: 223.13047671318054s


/data/amandeep/wikidata-20220623-dwd-v6/kgtk-search/es_split/ae


load-elasticsearch-index Time: 207.98568725585938s


/data/amandeep/wikidata-20220623-dwd-v6/kgtk-search/es_split/af


load-elasticsearch-index Time: 209.52932262420654s


/data/amandeep/wikidata-20220623-dwd-v6/kgtk-search/es_split/ag


load-elasticsearch-index Time: 217.15551614761353s


/data/amandeep/wikidata-20220623-dwd-v6/kgtk-search/es_split/ah


load-elasticsearch-index Time: 205.30692982673645s


/data/amandeep/wikidata-20220623-dwd-v6/kgtk-search/es_split/ai


load-elasticsearch-index Time: 280.6385817527771s


/data/amandeep/wikidata-20220623-dwd-v6/kgtk-search/es_split/aj


load-elasticsearch-index Time: 245.82577466964722s


/data/amandeep/wikidata-20220623-dwd-v6/kgtk-search/es_split/ak


load-elasticsearch-index Time: 273.07805943489075s


/data/amandeep/wikidata-20220623-dwd-v6/kgtk-search/es_split/al


load-elasticsearch-index Time: 288.3985483646393s


/data/amandeep/wikidata-20220623-dwd-v6/kgtk-search/es_split/am


load-elasticsearch-index Time: 290.35138273239136s


/data/amandeep/wikidata-20220623-dwd-v6/kgtk-search/es_split/an


load-elasticsearch-index Time: 247.05698943138123s


/data/amandeep/wikidata-20220623-dwd-v6/kgtk-search/es_split/ao


load-elasticsearch-index Time: 226.00112390518188s


/data/amandeep/wikidata-20220623-dwd-v6/kgtk-search/es_split/ap


load-elasticsearch-index Time: 238.58124351501465s


/data/amandeep/wikidata-20220623-dwd-v6/kgtk-search/es_split/aq


load-elasticsearch-index Time: 250.91750574111938s


/data/amandeep/wikidata-20220623-dwd-v6/kgtk-search/es_split/ar


load-elasticsearch-index Time: 285.9588119983673s


/data/amandeep/wikidata-20220623-dwd-v6/kgtk-search/es_split/as


load-elasticsearch-index Time: 233.44917464256287s


/data/amandeep/wikidata-20220623-dwd-v6/kgtk-search/es_split/at


load-elasticsearch-index Time: 232.89905071258545s


/data/amandeep/wikidata-20220623-dwd-v6/kgtk-search/es_split/au


load-elasticsearch-index Time: 217.68211150169373s


/data/amandeep/wikidata-20220623-dwd-v6/kgtk-search/es_split/av


load-elasticsearch-index Time: 228.22214603424072s


/data/amandeep/wikidata-20220623-dwd-v6/kgtk-search/es_split/aw


load-elasticsearch-index Time: 223.08515882492065s


/data/amandeep/wikidata-20220623-dwd-v6/kgtk-search/es_split/ax


load-elasticsearch-index Time: 218.168771982193s


/data/amandeep/wikidata-20220623-dwd-v6/kgtk-search/es_split/ay


load-elasticsearch-index Time: 211.16736125946045s


/data/amandeep/wikidata-20220623-dwd-v6/kgtk-search/es_split/az


load-elasticsearch-index Time: 214.17090511322021s


/data/amandeep/wikidata-20220623-dwd-v6/kgtk-search/es_split/ba


load-elasticsearch-index Time: 235.91955542564392s


/data/amandeep/wikidata-20220623-dwd-v6/kgtk-search/es_split/bb


load-elasticsearch-index Time: 315.8065674304962s


/data/amandeep/wikidata-20220623-dwd-v6/kgtk-search/es_split/bc


load-elasticsearch-index Time: 258.2485029697418s


/data/amandeep/wikidata-20220623-dwd-v6/kgtk-search/es_split/bd


load-elasticsearch-index Time: 231.61155581474304s


/data/amandeep/wikidata-20220623-dwd-v6/kgtk-search/es_split/be


load-elasticsearch-index Time: 234.4430947303772s


/data/amandeep/wikidata-20220623-dwd-v6/kgtk-search/es_split/bf


load-elasticsearch-index Time: 239.02663493156433s


/data/amandeep/wikidata-20220623-dwd-v6/kgtk-search/es_split/bg


load-elasticsearch-index Time: 222.19299221038818s


/data/amandeep/wikidata-20220623-dwd-v6/kgtk-search/es_split/bh


load-elasticsearch-index Time: 226.2885115146637s


/data/amandeep/wikidata-20220623-dwd-v6/kgtk-search/es_split/bi


load-elasticsearch-index Time: 211.87452960014343s


/data/amandeep/wikidata-20220623-dwd-v6/kgtk-search/es_split/bj


load-elasticsearch-index Time: 213.76228952407837s


/data/amandeep/wikidata-20220623-dwd-v6/kgtk-search/es_split/bk


load-elasticsearch-index Time: 252.84073138237s


/data/amandeep/wikidata-20220623-dwd-v6/kgtk-search/es_split/bl


load-elasticsearch-index Time: 241.09837555885315s


/data/amandeep/wikidata-20220623-dwd-v6/kgtk-search/es_split/bm


load-elasticsearch-index Time: 230.67660188674927s


/data/amandeep/wikidata-20220623-dwd-v6/kgtk-search/es_split/bn


load-elasticsearch-index Time: 226.34756469726562s


/data/amandeep/wikidata-20220623-dwd-v6/kgtk-search/es_split/bo


load-elasticsearch-index Time: 198.8803005218506s


/data/amandeep/wikidata-20220623-dwd-v6/kgtk-search/es_split/bp


load-elasticsearch-index Time: 191.73241519927979s


/data/amandeep/wikidata-20220623-dwd-v6/kgtk-search/es_split/bq


load-elasticsearch-index Time: 203.71557426452637s


/data/amandeep/wikidata-20220623-dwd-v6/kgtk-search/es_split/br


load-elasticsearch-index Time: 216.29501128196716s


/data/amandeep/wikidata-20220623-dwd-v6/kgtk-search/es_split/bs


load-elasticsearch-index Time: 210.23845601081848s


/data/amandeep/wikidata-20220623-dwd-v6/kgtk-search/es_split/bt


load-elasticsearch-index Time: 214.7675621509552s


/data/amandeep/wikidata-20220623-dwd-v6/kgtk-search/es_split/bu


load-elasticsearch-index Time: 233.4191062450409s


/data/amandeep/wikidata-20220623-dwd-v6/kgtk-search/es_split/bv


load-elasticsearch-index Time: 203.50434851646423s


/data/amandeep/wikidata-20220623-dwd-v6/kgtk-search/es_split/bw


load-elasticsearch-index Time: 202.7918300628662s


/data/amandeep/wikidata-20220623-dwd-v6/kgtk-search/es_split/bx


load-elasticsearch-index Time: 210.43698024749756s


/data/amandeep/wikidata-20220623-dwd-v6/kgtk-search/es_split/by


load-elasticsearch-index Time: 200.0101158618927s


/data/amandeep/wikidata-20220623-dwd-v6/kgtk-search/es_split/bz


load-elasticsearch-index Time: 193.1346881389618s


/data/amandeep/wikidata-20220623-dwd-v6/kgtk-search/es_split/ca


load-elasticsearch-index Time: 198.98675346374512s


/data/amandeep/wikidata-20220623-dwd-v6/kgtk-search/es_split/cb


load-elasticsearch-index Time: 208.5802345275879s


/data/amandeep/wikidata-20220623-dwd-v6/kgtk-search/es_split/cc


load-elasticsearch-index Time: 209.33298254013062s


/data/amandeep/wikidata-20220623-dwd-v6/kgtk-search/es_split/cd


load-elasticsearch-index Time: 219.51308798789978s


/data/amandeep/wikidata-20220623-dwd-v6/kgtk-search/es_split/ce


load-elasticsearch-index Time: 231.2949652671814s


/data/amandeep/wikidata-20220623-dwd-v6/kgtk-search/es_split/cf


load-elasticsearch-index Time: 43.602638483047485s


In [23]:
%%bash
# Repoint the `wikidata-dwd-kgtk-search` alias in one _aliases request:
# remove it from the old index and add it to the newly loaded one.
# $es_url, $old_es_index and $es_index come from the environment.
curl --request POST "$es_url/_aliases?pretty" --header 'Content-Type: application/json' --data "
{
  \"actions\": [
    {
      \"remove\": {
        \"index\": \"$old_es_index\",
        \"alias\": \"wikidata-dwd-kgtk-search\"
      }
    },
    {
      \"add\": {
        \"index\": \"$es_index\",
        \"alias\": \"wikidata-dwd-kgtk-search\"
      }
    }
  ]
}
"

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100   310  100    28  100   282     78    795 --:--:-- --:--:-- --:--:--   796


{
  "acknowledged" : true
}
