# Build Wikidata Infobox

In [1]:
import os
import re
import csv
import json
import numpy as np
import pandas as pd

from kgtk.functions import kgtk, kypher

## Datasets
1. Sitelinks: <div/>
`id                          node1    label                 node2                                ` <div/>
`Q45-wikipedia_sitelink-1    Q45      wikipedia_sitelink    http://en.wikipedia.org/wiki/Portugal`
2. DBpedia infobox: <div/>
`<http://dbpedia.org/resource/!!!>    <http://dbpedia.org/property/alias>    "Chk Chk Chk"@en `.

In [2]:
# Parameters

# Folder where the database files are stored.
# NOTE(review): this is a machine-specific absolute path; change it to run the
# notebook on another machine.
data_path = "/nas/home/bohuizha/KG/hunger-for-knowledge/data/"

# The names of the files (relative to data_path) that we will use in this notebook.
data_file_names = {
    "wiki_info": "wikidata_infobox.tsv",
    "wiki_site": "wikidata-20211027.sitelinks.en.tsv",
    "sitelinks": "infobox/sitelinks.tsv",
    "infobox_ttl": "infobox-properties_en_2021_12_01.ttl",
    "infobox_tsv": "infobox-properties_en_2021_12_01.tsv",
    "qnodes": "infobox/qnodes.tsv",
    "not_dbnode": "infobox/wikidata_infobox_not_dbnode.tsv",
    "structured_literals": "infobox/structured_literals.tsv",
    "namespaces": "namespaces.tsv",
    "wiki_info_mapped": "infobox/wikidata_infobox_mapped.tsv",
    "head_mapped": "infobox/wikidata_infobox_head_mapped.tsv"
}

# Define one environment variable per file (the upper-cased key) holding its full
# path, so that the kgtk commands below can refer to the files as $WIKI_INFO,
# $SITELINKS, and so on.
kgtk_environment_variables = []

os.environ['DATABASE'] = data_path
kgtk_environment_variables.append('DATABASE')

for key, value in data_file_names.items():
    variable = key.upper()
    # os.path.join adds the separator only when it is missing, so the full paths
    # stay valid even if data_path is edited to drop its trailing slash.
    os.environ[variable] = os.path.join(data_path, value)
    kgtk_environment_variables.append(variable)

# Echo every variable so the resolved paths are visible in the notebook output.
for variable in kgtk_environment_variables:
    print("{}: \"{}\"".format(variable, os.environ[variable]))

DATABASE: "/nas/home/bohuizha/KG/hunger-for-knowledge/data/"
WIKI_INFO: "/nas/home/bohuizha/KG/hunger-for-knowledge/data/wikidata_infobox.tsv"
WIKI_SITE: "/nas/home/bohuizha/KG/hunger-for-knowledge/data/wikidata-20211027.sitelinks.en.tsv"
SITELINKS: "/nas/home/bohuizha/KG/hunger-for-knowledge/data/infobox/sitelinks.tsv"
INFOBOX_TTL: "/nas/home/bohuizha/KG/hunger-for-knowledge/data/infobox-properties_en_2021_12_01.ttl"
INFOBOX_TSV: "/nas/home/bohuizha/KG/hunger-for-knowledge/data/infobox-properties_en_2021_12_01.tsv"
QNODES: "/nas/home/bohuizha/KG/hunger-for-knowledge/data/infobox/qnodes.tsv"
NOT_DBNODE: "/nas/home/bohuizha/KG/hunger-for-knowledge/data/infobox/wikidata_infobox_not_dbnode.tsv"
STRUCTURED_LITERALS: "/nas/home/bohuizha/KG/hunger-for-knowledge/data/infobox/structured_literals.tsv"
NAMESPACES: "/nas/home/bohuizha/KG/hunger-for-knowledge/data/namespaces.tsv"
WIKI_INFO_MAPPED: "/nas/home/bohuizha/KG/hunger-for-knowledge/data/infobox/wikidata_infobox_mapped.tsv"
HEAD_MAPPED: "/nas/home/bohuizha/KG/hunger-for-knowledge/data/infobox/wikidata_infobox_head_mapped.tsv"

## Procedure

### Step 1 Build Wikidata nodes to DBpedia nodes link
Convert `node2` from a Wikipedia URL such as `http://en.wikipedia.org/wiki/Portugal` into the prefixed DBpedia form `dbpedia-resource:Portugal`, using the `dbpedia-resource` namespace prefix.

In [5]:
# Read the Wikidata sitelinks edge file. Quoting is disabled on read so that the
# fields round-trip unchanged through the unquoted write at the end of this cell.
df = pd.read_csv(os.environ["WIKI_SITE"], sep='\t', quoting=csv.QUOTE_NONE)

# Rename the edge ids, e.g. Q45-wikipedia_sitelink-1 -> Q45-dbpedia_sitelink-1.
df['id'] = df['id'].str.replace('wiki', 'db', regex=False)
# Rewrite the Wikipedia URL prefix as the `dbpedia-resource:` namespace prefix.
# The replacement is literal (regex=False) so that the dots in the URL are not
# interpreted as regex wildcards.
df['node2'] = df['node2'].str.replace(
    'http://en.wikipedia.org/wiki/', 'dbpedia-resource:', regex=False)

# Append the prefix-expansion row for the `dbpedia-resource` namespace.
# NOTE(review): the seven values assume the input file has exactly seven
# columns - confirm against the sitelinks file header.
df.loc[len(df.index)] = ['prefix', 'dbpedia-resource', 'prefix_expansion', '"http://dbpedia.org/resource/"', 'en', np.nan, np.nan]

# Write without any quoting or escaping. `escapechar=''` / `quotechar=''` are
# deliberately not passed: the csv module rejects empty strings for both since
# Python 3.11 (TypeError), and with QUOTE_NONE the defaults already mean
# "no quote character, no escape character".
df.to_csv(os.environ["SITELINKS"], sep='\t', index=False,
          quoting=csv.QUOTE_NONE)

### Step 2 Build namespaces

In [6]:
# Prefix expansions written to the namespace file, one [node1, label, node2] row
# per prefix. The expansion IRIs keep their surrounding double quotes.
namespaces = [
    [
        'dbpedia-resource',
        'prefix_expansion',
        '"http://dbpedia.org/resource/"'
    ],
    [
        'property',
        'prefix_expansion',
        '"http://dbpedia.org/property/"'
    ],
    [
        'dbpedia-datatype',
        'prefix_expansion',
        '"http://dbpedia.org/datatype/"'
    ],
    [
        'rdf',
        'prefix_expansion',
        '"http://www.w3.org/1999/02/22-rdf-syntax-ns#"'
    ],
    [
        'xml-schema-type',
        'prefix_expansion',
        '"http://www.w3.org/2001/XMLSchema#"'
    ]
]
# The file is only written, never read back, so plain 'w' is sufficient.
with open(os.environ["NAMESPACES"], 'w', newline='') as f:
    fieldnames = ['node1', 'label', 'node2']
    # quotechar=None (not '') disables quote handling: an empty string is rejected
    # by the csv module since Python 3.11, while None is accepted on every version
    # when quoting is QUOTE_NONE. With no quote character set, the literal double
    # quotes in the expansion values are written through unchanged.
    writer = csv.writer(f, delimiter='\t', quoting=csv.QUOTE_NONE, quotechar=None)
    writer.writerow(fieldnames)
    writer.writerows(namespaces)

### Step 3 Import DBpedia infobox
Use `kgtk import-ntriples` to convert the DBpedia infobox `.ttl` file into a KGTK `.tsv` edge file.

In [7]:
%%time
# Convert the DBpedia infobox dump ($INFOBOX_TTL) into a KGTK edge file
# ($INFOBOX_TSV), passing the prefix expansions written to $NAMESPACES.
# `--newnode-prefix node` is presumably the prefix given to nodes generated by
# the import; Step 5 later selects rows by matching node1 against `node`.
# This is a slow step: the recorded run took about 44 minutes of wall time.
kgtk("""
    import-ntriples -i $INFOBOX_TTL 
        -o $INFOBOX_TSV 
        --namespace-file $NAMESPACES 
        --namespace-id-use-uuid True 
        --build-new-namespaces False 
        --output-only-used-namespaces True 
        --structured-value-label dbpedia:structured_value 
        --structured-uri-label dbpedia:structured_uri 
        --newnode-prefix node 
        --newnode-use-uuid True
    """)

CPU times: user 1.12 s, sys: 780 ms, total: 1.9 s
Wall time: 43min 51s


### Step 4 Map node2 to Wikidata nodes
For records of the form `<Wikidata node> <property> <DBpedia node>`, map the value node (`node2`) to its Wikidata node as well, producing `<Wikidata node> <property> <Wikidata node>`.

In [8]:
%%time
# Replace the subject of every infobox edge with the node that links to it:
#   i: (w)-[p]->(v)   an infobox edge whose subject is w
#   s: (q)-[]->(w)    a sitelinks edge from q to that same w
# The output edge is (q, p.label, v), written to $HEAD_MAPPED.
# NOTE(review): the `i:` / `s:` graph handles are presumably resolved against
# the two input files ($INFOBOX_TSV, $SITELINKS) - confirm in the Kypher manual.
kgtk("""
    query -i $INFOBOX_TSV $SITELINKS
        --match 'i: (w)-[p]->(v), s: (q)-[]->(w)' 
        --return 'q as node1, p.label as label, v as node2'
        -o $HEAD_MAPPED 
    """)

CPU times: user 451 ms, sys: 375 ms, total: 826 ms
Wall time: 14min 9s


In [9]:
%%time
# Replace the value of every head-mapped edge with the node that links to it:
#   wi: (w)-[p]->(v)  an edge from $HEAD_MAPPED whose value is v
#   s:  (q)-[]->(v)   a sitelinks edge from q to that same v
# The output edge is (w, p.label, q), written to $QNODES. Edges whose value has
# no matching sitelinks edge do not satisfy the match and are not emitted here.
# NOTE(review): the `wi:` / `s:` graph handles are presumably resolved against
# the two input files ($HEAD_MAPPED, $SITELINKS) - confirm in the Kypher manual.
kgtk("""
    query -i $HEAD_MAPPED $SITELINKS 
        --match 'wi: (w)-[p]->(v), s: (q)-[]->(v)' 
        --return 'w as node1, p.label as label, q as node2' 
        -o $QNODES
    """) 

CPU times: user 407 ms, sys: 382 ms, total: 790 ms
Wall time: 10min 56s


In [10]:
%%time
# Keep the $HEAD_MAPPED edges that do NOT start with `dbpedia-resource:` in the
# field constrained by the pattern (the regex uses a negative lookahead), and
# write them to $NOT_DBNODE.
# NOTE(review): assumes the -p pattern has the form `node1;label;node2`, so
# `;;<regex>` constrains only node2 - confirm in the kgtk filter documentation.
kgtk("""
    filter -i $HEAD_MAPPED 
        --regex --match-type match 
        -p ';;^(?!dbpedia-resource:).*' 
        -o $NOT_DBNODE
    """)

CPU times: user 233 ms, sys: 294 ms, total: 527 ms
Wall time: 6min 43s


In [11]:
%%time
# Combine the two partial results, $NOT_DBNODE (left) and $QNODES (right), into
# $WIKI_INFO_MAPPED. Both --left-join and --right-join are passed.
# NOTE(review): presumably this keeps the rows of both inputs (an outer join) -
# confirm in the kgtk join documentation.
kgtk("""
    join --left-join --right-join 
        --left-file $NOT_DBNODE
        --right-file $QNODES 
        -o $WIKI_INFO_MAPPED
    """)

CPU times: user 131 ms, sys: 361 ms, total: 493 ms
Wall time: 5min 47s


### Step 5 Add structured literals
Concatenate the structured literals from the original DBpedia infobox with the generated Wikidata infobox.

In [12]:
%%time
# Select from $INFOBOX_TSV the edges matching the pattern `node;;` and write
# them to $STRUCTURED_LITERALS. `node` is the --newnode-prefix that was passed
# to import-ntriples in Step 3.
# NOTE(review): assumes the -p pattern has the form `node1;label;node2`, so
# `node;;` constrains only node1 - confirm in the kgtk filter documentation.
kgtk("""
    filter -i $INFOBOX_TSV 
        --regex --match-type match 
        -p "node;;" 
        -o $STRUCTURED_LITERALS
    """)

CPU times: user 81.8 ms, sys: 278 ms, total: 360 ms
Wall time: 5min 38s


In [13]:
%%time
# Concatenate $STRUCTURED_LITERALS and $WIKI_INFO_MAPPED into the final output
# file, $WIKI_INFO.
kgtk("""
    cat -i $STRUCTURED_LITERALS $WIKI_INFO_MAPPED 
        -o $WIKI_INFO
    """)

CPU times: user 37.4 ms, sys: 199 ms, total: 236 ms
Wall time: 1min 34s


In [None]:
# Disabled sanity check: query $WIKI_INFO for edges whose node1 is the empty string.
# kgtk("""
#     query -i $WIKI_INFO 
#         --match '(q)-[]->()' 
#         --where 'q = ""'
#     """)