# Step 1 User Query

In [1]:
import os
import re
import time
import json
import subprocess
import pandas as pd

from tqdm import tqdm
from collections import defaultdict

## Define alias and variables

In [2]:
# Parameters

# Directory that holds the database files read by this notebook.
data_path = "/nas/home/bohuizha/KG/hunger-for-knowledge/data/"

# Files of the KGTK Wikidata distribution used in this notebook, keyed by a short alias.
data_file_names = dict(
    claims="claims.tsv",
    wiki_info="wikidata_infobox.tsv",
    p31="P31.tsv",
    p279star="P279star.tsv",
    labels="labels.en.tsv",
    constraints="value_type_constraint.json",
)

# Every alias is exported as an upper-cased environment variable holding the full path,
# so the shell commands issued later can refer to the files as $CLAIMS, $P31, ...
# The list keeps track of the variable names defined by the notebook.
os.environ['DATABASE'] = data_path
kgtk_environment_variables = ['DATABASE']

for alias, file_name in data_file_names.items():
    env_name = alias.upper()
    os.environ[env_name] = data_path + file_name
    kgtk_environment_variables.append(env_name)

# Echo the resulting environment, one `NAME: "value"` line per variable.
print("\n".join('{}: "{}"'.format(env_name, os.environ[env_name])
                for env_name in kgtk_environment_variables))

DATABASE: "/nas/home/bohuizha/KG/hunger-for-knowledge/data/"
CLAIMS: "/nas/home/bohuizha/KG/hunger-for-knowledge/data/claims.tsv"
WIKI_INFO: "/nas/home/bohuizha/KG/hunger-for-knowledge/data/wikidata_infobox.tsv"
P31: "/nas/home/bohuizha/KG/hunger-for-knowledge/data/P31.tsv"
P279STAR: "/nas/home/bohuizha/KG/hunger-for-knowledge/data/P279star.tsv"
LABELS: "/nas/home/bohuizha/KG/hunger-for-knowledge/data/labels.en.tsv"
CONSTRAINTS: "/nas/home/bohuizha/KG/hunger-for-knowledge/data/value_type_constraint.json"


**Method 1: query from Wikidata**
- property constraint (P2302)
- value-type constraint (Q21510865)

In [None]:
# property_list = !kgtk query -i $CLAIMS --match 'c: (q)-[p:P2302]->(v:Q21510865)' --return 'q' --limit 100
# property_list = property_list[1:5]

**Method 2: preprocess Kartik's work**

In [3]:
# Read the constraint JSON whose path was exported as $CONSTRAINTS.
# Its top-level keys are the property ids iterated over by the rest of the notebook.
with open(os.environ['CONSTRAINTS']) as constraints_file:
    property_dict = json.loads(constraints_file.read())

In [4]:
# Timing table: property_runtime[prop][stage name] -> seconds.
# Stages that were never timed read as 0.0 thanks to defaultdict(float).
property_runtime = {prop_id: defaultdict(float) for prop_id in property_dict}

In [5]:
# Counting tables, one integer-defaulting dict per property:
#   property_lines_count[prop][label]  -> number of rows
#   property_qnodes_count[prop][label] -> number of distinct qnodes
property_lines_count = {}
property_qnodes_count = {}
for prop_id in property_dict:
    property_lines_count[prop_id] = defaultdict(int)
    property_qnodes_count[prop_id] = defaultdict(int)

In [6]:
# Parameters

# Folder on the local machine where the output and temporary folders are created.
output_path = "/nas/home/bohuizha/KG/hunger-for-knowledge/batch_output/"
# exist_ok=True replaces the racy "check, then create" pattern.
os.makedirs(output_path, exist_ok=True)

# One sub-folder per pipeline stage. Each folder is created if needed and its path is
# exported as an environment variable named after the folder in upper case.
# 'infers' is listed in addition to 'infer' because the per-property $<PROP>_INFERS
# files defined below live under "infers/"; previously that directory was never created.
output_list = ['results', 'infobox_results', 'new_results', 'samples',
               'unknown', 'entity', 'class', 'query', 'agree',
               'direct_infer', 'indirect_infer', 'infer', 'infers',
               'structured_literals', 'nodes', 'qnodes',
               'correct_temp_1', 'correct_temp_2', 'incorrect_temp',
               'correct', 'incorrect']
for folder_name in output_list:
    folder_path = os.path.join(output_path, folder_name)
    os.makedirs(folder_path, exist_ok=True)
    variable = folder_name.upper()
    os.environ[variable] = folder_path
    kgtk_environment_variables.append(variable)

os.environ['OUTPUT'] = output_path
kgtk_environment_variables.append('OUTPUT')

# Notebook-wide summary files. They are derived from output_path so that changing
# the output folder in one place is enough (the resulting values are unchanged).
summary_file_names = {
    'PROPERTY_MAPPING': "property_mapping_full.json",
    'RUNTIME': "runtime.json",
    'LINE_STATISTICS': "line.statistics.json",
    'QNODE_STATISTICS': "qnode.statistics.json",
}
for variable, file_name in summary_file_names.items():
    os.environ[variable] = os.path.join(output_path, file_name)
    kgtk_environment_variables.append(variable)

# Per-property output files: environment-variable suffix -> path template relative
# to output_path (the single %s is replaced by the property id).
# Disabled entries: unknown ("unknown/unknown.%s.tsv"), class ("class/class.%s.tsv"),
# entity ("entity/entity.%s.tsv").
output_file_templates = {
    "results": "results/results.%s.tsv",
    "infobox_results": "infobox_results/infobox.results.%s.tsv",
    "new_results": "new_results/new.results.%s.tsv",
    "samples": "samples/samples.%s.tsv",
    "agree": "agree/agree.%s.tsv",
    "query_file": "query/query.file.%s.tsv",
    "direct_infer": "direct_infer/direct.infer.%s.tsv",
    "indirect_infer": "indirect_infer/indirect.infer.%s.tsv",
    "infers": "infers/infers.%s.tsv",
    "structured_literals": "structured_literals/structured.literals.%s.tsv",
    "nodes": "nodes/nodes.%s.tsv",
    "qnodes": "qnodes/qnodes.%s.tsv",
    "correct_temp_1": "correct_temp_1/correct_temp_1.%s.tsv",
    "correct_temp_2": "correct_temp_2/correct_temp_2.%s.tsv",
    "incorrect_temp": "incorrect_temp/incorrect_temp.%s.tsv",
    "correct": "correct/correct.%s.tsv",
    "incorrect": "incorrect/incorrect.%s.tsv",
}

for prop in property_dict.keys():
    # e.g. {"P452_results": "results/results.P452.tsv", ...}
    output_file_names = {
        "%s_%s" % (prop, suffix): template % prop
        for suffix, template in output_file_templates.items()
    }
    # Export each file as $<PROP>_<SUFFIX>, e.g. $P452_RESULTS.
    for key, value in output_file_names.items():
        variable = key.upper()
        os.environ[variable] = os.path.join(output_path, value)
        kgtk_environment_variables.append(variable)

# kgtk_environment_variables.sort()
# for variable in kgtk_environment_variables:
#     print("{}: \"{}\"".format(variable, os.environ[variable]))

# Step 2: Wikidata Results

Generate a property file for each property using `kgtk filter` (filtering on the property). **Note:** This step has already been run in the background; since it is relatively time-consuming, it is better not to run it again.

In [None]:
# for prop, val in property_dict.items():
#     command = "kgtk filter -i $CLAIMS -p \" ; %s ; \" -o $%s_RESULTS" % (prop, prop) 
#     # print(command)
#     code = os.system(command)
#     print(code, prop) 

In [7]:
# code from Kartik
# Single pass over $CLAIMS that copies every row whose property is a key of
# property_dict into that property's $<PROP>_RESULTS file. Each output file is
# opened lazily on the first matching row and starts with the claims header line.
# NOTE(review): one file handle stays open per matched property for the whole pass;
# confirm the process file-descriptor limit is high enough for the property set.
propFileDict = {}
try:
    with open(os.environ['CLAIMS'], 'r') as f:
        headerLine = next(f)
        for line in tqdm(f):
            lineP = line.rstrip().split("\t")
            # Skip blank or truncated rows instead of crashing on lineP[2].
            if len(lineP) < 3:
                continue
            # Column 2 is presumably the edge label (id, node1, label, ...) - verify against the header.
            prop = lineP[2]
            if prop in property_dict:
                if prop not in propFileDict:
                    propFileDict[prop] = open(os.environ["%s_RESULTS" % prop], "w")
                    propFileDict[prop].write(headerLine)
                propFileDict[prop].write(line)
finally:
    # Close (and thereby flush) every per-property file even if the scan fails midway.
    for file1 in propFileDict.values():
        file1.close()

491297975it [20:32, 398776.68it/s] 


## Count known results in Wikidata database:

In [9]:
# For every property, record under the 'Known' key how many rows and how many distinct
# subject qnodes its $<PROP>_RESULTS file holds; missing or empty files count as 0.
for prop in property_dict.keys():
    results_file = os.environ['%s_RESULTS' % prop]
    if not os.path.exists(results_file) or os.path.getsize(results_file) == 0:
        property_lines_count[prop]['Known'] = 0
        property_qnodes_count[prop]['Known'] = 0
        continue

    # Row count: physical lines reported by wc, minus the header line.
    raw_line_count = subprocess.check_output("wc -l < $%s_RESULTS" % prop, shell=True)
    property_lines_count[prop]['Known'] = int(raw_line_count.decode("utf-8").strip()) - 1

    # Distinct qnodes: the kgtk reply is a header line followed by the count.
    count_query = "kgtk query -i $%s_RESULTS --match '(qnode)-[]->()' --return 'count(distinct qnode)'" % prop
    raw_table = subprocess.check_output(count_query, shell=True)
    distinct_qnodes = int(raw_table.decode("utf-8").strip().split('\n')[1])
    property_qnodes_count[prop]['Known'] = distinct_qnodes
    print("%s -> %d" % (prop, distinct_qnodes))

P1000 -> 174
P1001 -> 715765
P1002 -> 800
P1018 -> 154
P102 -> 392278
P1026 -> 7327
P1028 -> 4183
P1029 -> 795
P103 -> 106113
P1035 -> 506
P1037 -> 9215
P1038 -> 23053
P1039 -> 35
P1040 -> 45481
P1041 -> 65
P1049 -> 1643
P105 -> 2892666
P1050 -> 176676
P1056 -> 22639
P1057 -> 592250
P106 -> 6339032
P1060 -> 186
P1064 -> 22745
P1066 -> 40075
P1068 -> 187
P1071 -> 49684
P1072 -> 1092
P1073 -> 658
P1075 -> 577
P1078 -> 74
P1079 -> 170
P108 -> 1001482
P1080 -> 25336
P110 -> 8499
P111 -> 2626
P112 -> 50162
P113 -> 2627
P1136 -> 88
P114 -> 275
P1142 -> 13170
P1145 -> 5775
P115 -> 30528
P1151 -> 1626
P1165 -> 1294
P1170 -> 3
P1171 -> 1
P118 -> 160085
P119 -> 180870
P1192 -> 12181
P1194 -> 1
P1199 -> 78
P1200 -> 2659
P1201 -> 400
P1202 -> 40
P121 -> 4714
P1210 -> 6
P1211 -> 25
P122 -> 1510
P1221 -> 2
P1227 -> 11
P123 -> 232055
P126 -> 302852
P127 -> 399839
P128 -> 12547
P1283 -> 1379
P129 -> 2771
P1290 -> 547
P1299 -> 5960
P1303 -> 170552
P1304 -> 160
P1308 -> 5833
P131 -> 9789120
P1312 -> 635

P541 -> 31640
P542 -> 868
P5425 -> 2
P543 -> 55
P5444 -> 60
P545 -> 55
P5460 -> 148
P5475 -> 62
P548 -> 3
P550 -> 48
P551 -> 162223
P552 -> 1999
P5522 -> 31977
P5523 -> 44
P553 -> 1111
P5537 -> 53
P556 -> 5284
P5572 -> 487
P559 -> 25027
P560 -> 24
P562 -> 379
P5642 -> 116
P5658 -> 245
P57 -> 267822
P5707 -> 68
P5753 -> 136
P5769 -> 1060
P58 -> 132118
P5800 -> 3451
P5804 -> 88
P5805 -> 60
P5816 -> 8029
P5817 -> 3919
P5873 -> 919
P589 -> 273
P59 -> 7377061
P5911 -> 1
P5940 -> 123
P5970 -> 10543
P598 -> 5928
P5995 -> 231
P6 -> 23243
P6022 -> 1
P607 -> 153435
P608 -> 11736
P6087 -> 6730
P609 -> 19140
P61 -> 72751
P610 -> 5104
P6104 -> 10290
P611 -> 27481
P6112 -> 2884
P6116 -> 5
P6118 -> 9
P612 -> 748
P6149 -> 2
P6153 -> 107745
P6166 -> 102
P618 -> 1126
P6185 -> 231
P6186 -> 254
P6191 -> 3
P6193 -> 60
P6237 -> 15
P624 -> 191
P6241 -> 5034
P6243 -> 1
P6275 -> 77591
P629 -> 71498
P6338 -> 1195
P634 -> 332
P636 -> 902
P6364 -> 1356
P6365 -> 3303
P6379 -> 72651
P641 -> 1735488
P6426 -> 148
P64

## Find unknown results in Wikidata database:

Find the most frequent class among the entities that already have the property, and use all entities of that class as the entity set we are going to query. **Note:** Currently we don't apply this step.

In [None]:
# for prop, val in property_dict.items():
#     command = "kgtk query -i $%s_RESULTS $P31 \
#         --match 'r: (entity)-[]->(), P31: (entity)-[]->(class)' \
#         --return 'distinct entity as node1, \"P31\" as label, class as node2' \
#         -o $%s_CLASS" % (prop, prop)
#     print(command)
#     start = time.time()
#     code = os.system(command)
#     runtime = time.time() - start
#     print(code, runtime)
#     property_runtime[prop].append(runtime)

In [None]:
# entity_class_map = dict()
# for prop, _ in property_dict.items():
#     command = "kgtk query -i $%s_CLASS \
#         --match 'c: ()-[]->(class)' \
#         --return 'class, count(class) as N' \
#         --order-by 'N desc' \
#         --limit 1" % prop
#     start = time.time()
#     output = subprocess.check_output(command, shell=True)
#     runtime = time.time() - start
#     property_runtime[prop].append(runtime)
#     output = output.decode("utf-8").strip().split()
#     print("%s" % prop, output, runtime)
#     if len(output) == 4:
#         entity_class_map[prop] = output[2]

In [None]:
# class_entity_map = dict()
# for key, value in entity_class_map.items():
#     class_entity_map[value] = []
# for key, value in entity_class_map.items():
#     class_entity_map[value].append(key)
# class_entity_map

- Find all entities

In [None]:
# for cls, prop in class_entity_map.items():
#     command = "kgtk filter -i $P31 -p \";; %s\" -o $%s_ENTITY" % (cls, prop[0])
#     print(command)
#     # If ${val} already in folder, no need to query
#     # if os.path.exists(os.environ['%s_ENTITY' % prop[0]]):
#     #     print(0)
#     #     runtime = 0
#     #     propperty_runtime[prop].append(runtime)
#     # else:
#     start = time.time()
#     code = os.system(command)
#     runtime = time.time() - start
#     property_runtime[prop[0]].append(runtime)
#     print(code, runtime)
#     if len(prop) > 1:
#         for p in prop[1:]:
#             os.system('cp $%s_ENTITY $%s_ENTITY' % (prop[0], p))
#             property_runtime[p].append(runtime)

In [None]:
# directly filter
# !kgtk query -i $%s_CLASS $P31 \
#         --match 'c: (n)-[]->(class), P31: (entity)-[]->(class)' \
#         --where 'n != entity' \
#         --return 'distinct entity as node1, \"P31\" as label, class as node2' \
#         -o $%s_ENTITY" % (prop, prop)

- Eliminate entities that already have known values for the property

In [None]:
# for prop, val in property_dict.items():
#     command = "kgtk ifnotexists -i $%s_ENTITY \
#         --filter-on $%s_RESULTS \
#         --input-keys node1 \
#         --filter-keys node1 \
#         -o $%s_QUERY_FILE" % (prop, prop, prop)
#     print(command)
#     # if os.path.exists(os.environ['%s_QUERY_FILE' % prop[0]]):
#     #     print(0)
#     # else:
#     #     code = os.system(command)
#     #     print(code)
#     start = time.time()
#     code = os.system(command)
#     runtime = time.time() - start
#     property_runtime[prop].append(runtime)
#     print(code, runtime)

## Count unknown results

- rows

In [None]:
# for prop, val in property_dict.items():
#     output = subprocess.check_output("wc -l < $%s_QUERY_FILE" % prop, shell=True)
#     output = output.decode("utf-8").strip()
#     output = int(output) - 1
#     print("%s: %s" % (prop, output))

- entities

In [None]:
# for prop, val in property_dict.items():
#     command = "kgtk query -i $%s_QUERY_FILE \
#         --match '(p)-[]->()' \
#         --return 'count(distinct p) as N'" % prop
#     output = subprocess.check_output(command, shell=True)
#     output = output.decode("utf-8").strip().split()[-1]
#     print("%s: %s" % (prop, output))

# Step 3 Selection of Additional KG(s)

Currently we use Wikidata Infobox generated from DBpedia.

# Step 4 Schema Alignment

## Entity resolution

Use query results from Wikidata database to infer properties in Wikidata infobox and return the most frequent property.

**Direct Infer:** Query for qnode;

In [10]:
# Copy at most 200000 edges of each property's results file into $<PROP>_SAMPLES.
for prop in property_dict:
    results_file = os.environ['%s_RESULTS' % prop]
    # Same guard as the counting / mapping cells: there is nothing to sample when the
    # property never occurred in the claims file, so do not launch kgtk at all.
    if not os.path.exists(results_file) or os.path.getsize(results_file) == 0:
        continue
    command = "kgtk query -i $%s_RESULTS --match '()-[]->()' --limit 200000 -o $%s_SAMPLES" % (prop, prop)
    code = os.system(command)
    # Report failures instead of silently discarding the exit status.
    if code != 0:
        print("sampling failed for %s (exit status %d)" % (prop, code))

In [11]:
# Direct inference: join each property's sample edges with the infobox edges that share
# the same subject and the same value, writing (entity, infobox label, value) rows to
# $<PROP>_DIRECT_INFER. Both %s placeholders receive the property id.
direct_infer_template = "kgtk query -i $%s_SAMPLES -i $WIKI_INFO \
        --match 's: (entity)-[]->(v), w: (entity)-[p]->(v)' \
        --return 'entity, p.label, v as node2' \
        -o $%s_DIRECT_INFER"

for prop in property_dict:
    started_at = time.time()
    exit_status = os.system(direct_infer_template % (prop, prop))
    elapsed = time.time() - started_at
    # Wall-clock seconds spent in the kgtk call, kept per property.
    property_runtime[prop]['Entity Resolution'] = elapsed
    print(prop, exit_status, elapsed)

P1000 0 10.041428804397583
P1001 0 322.13322043418884
P1002 0 10.839092493057251
P1016 256 8.117747068405151
P1018 0 9.149420976638794
P102 0 63.906991481781006
P1026 0 11.410224437713623
P1028 0 9.474342107772827
P1029 0 9.05444884300232
P103 0 53.48261094093323
P1035 0 12.982839584350586
P1037 0 10.177430391311646
P1038 0 10.079153060913086
P1039 0 9.55798864364624
P1040 0 10.276310205459595
P1041 0 9.397211790084839
P1049 0 10.302280187606812
P105 0 111.18825650215149
P1050 0 14.877598524093628
P1056 0 10.252161979675293
P1057 0 10.561971664428711
P106 0 19.398312091827393
P1060 0 9.291229009628296
P1064 0 11.386128425598145
P1066 0 9.769603252410889
P1068 0 9.105098962783813
P1071 0 49.23881959915161
P1072 0 11.4793062210083
P1073 0 8.15464997291565
P1075 0 8.910903692245483
P1078 0 8.856094360351562
P1079 0 8.238034963607788
P108 0 16.184264183044434
P1080 0 10.862496852874756
P110 0 9.186925888061523
P111 0 7.453509092330933
P112 0 10.776747465133667
P113 0 9.54660177230835
P1136

P2701 0 9.946199417114258
P2702 0 9.741891384124756
P2715 0 10.3737473487854
P272 0 14.722517728805542
P275 0 11.390317916870117
P276 0 23.425618171691895
P277 0 9.830408334732056
P2813 0 8.760682344436646
P2817 0 10.477635383605957
P282 0 27.537697792053223
P2825 0 10.3822181224823
P2827 0 9.753999948501587
P2828 0 9.88401174545288
P2838 0 10.03147292137146
P2839 0 10.032243728637695
P2841 0 9.65330696105957
P2842 256 9.357136487960815
P2851 0 11.034689903259277
P2852 0 9.718152284622192
P2853 0 9.415270805358887
P286 0 9.263758420944214
P287 0 9.745319128036499
P2872 0 8.615444421768188
P2875 0 8.34372878074646
P2876 0 10.621078968048096
P2881 0 9.420355319976807
P2882 0 9.869870901107788
P289 0 8.63316559791565
P291 0 169.47089529037476
P2922 0 12.42552375793457
P2935 0 9.309314250946045
P2936 0 16.013731479644775
P2937 0 8.481384515762329
P2962 0 9.639266729354858
P2974 0 9.32970666885376
P2975 0 11.318980932235718
P2976 0 9.225704908370972
P2978 0 9.547550439834595
P2989 0 8.96659

P548 0 16.523427963256836
P550 0 12.870888710021973
P551 0 84.96336817741394
P552 0 15.737269401550293
P5522 0 11.599603652954102
P5523 0 11.595025777816772
P553 0 11.874096155166626
P5537 0 10.296377897262573
P556 0 13.418043851852417
P5560 256 10.586738348007202
P5572 0 10.52715015411377
P559 0 9.978651523590088
P560 0 9.354942560195923
P562 0 10.198023319244385
P5642 0 13.336499452590942
P5658 0 11.912300825119019
P57 0 13.05836296081543
P5707 0 8.06743836402893
P5753 0 9.569319248199463
P5769 0 8.851454019546509
P58 0 12.268966436386108
P5800 0 10.185478448867798
P5804 0 10.609423398971558
P5805 0 9.484817266464233
P5816 0 10.196296691894531
P5817 0 9.01341962814331
P5824 256 12.117036581039429
P5873 0 10.475674867630005
P5886 256 9.050994157791138
P589 0 9.471725463867188
P59 0 16.211346864700317
P5911 0 8.90455675125122
P5940 0 12.288085699081421
P5970 0 10.891029357910156
P598 0 10.622703790664673
P5995 0 9.495558977127075
P6 0 14.242581367492676
P6022 0 10.70628547668457
P607 0

P941 0 12.185081005096436
P942 0 8.904331922531128
P945 0 146.66596007347107
P967 0 10.673198699951172
P97 0 10.2600679397583
P972 0 9.721731901168823
P974 0 9.680016994476318
P98 0 9.797102689743042
P991 0 10.590015649795532


## Property mapping

In [12]:
# {Wikidata property: DBpedia property}, e.g. {'P452': 'property:industry'}
property_mapping = dict()
for prop, val in property_dict.items():
    if not os.path.exists(os.environ['%s_DIRECT_INFER' % prop]):
        continue
    if os.path.getsize(os.environ['%s_DIRECT_INFER' % prop]) == 0:
        continue
    command = "kgtk query -i $%s_DIRECT_INFER \
        --match '(q)-[p]->(v)' \
        --return 'p.label, count(v) as N' \
        --order-by 'N desc' \
        --limit 1" % prop
    start = time.time()
    # print(command)
    output = subprocess.check_output(command, shell=True)
    runtime = time.time() - start
    property_runtime[prop]['Property Mapping'] = runtime
    output = output.decode("utf-8").strip().split()
    print(output, runtime)
    if len(output) == 4:
        property_mapping[prop] = output[2]
        print("%s -> %s" % (prop, output[2]))

['label', 'N', 'property:title', '72'] 13.821938037872314
P1000 -> property:title
['label', 'N', 'property:jurisdiction', '557'] 9.913676261901855
P1001 -> property:jurisdiction
['label', 'N', 'property:configuration', '57'] 8.738563299179077
P1002 -> property:configuration
['label', 'N', 'property:agency', '69'] 9.810288429260254
P1018 -> property:agency
['label', 'N', 'property:party', '35148'] 9.685633182525635
P102 -> property:party
['label', 'N'] 9.628532409667969
['label', 'N', 'property:founder', '4'] 11.657528162002563
P1028 -> property:founder
['label', 'N', 'property:crewMembers', '1496'] 8.454116821289062
P1029 -> property:crewMembers
['label', 'N', 'property:language', '288'] 9.70067286491394
P103 -> property:language
['label', 'N', 'property:honorificSuffix', '142'] 8.882179021835327
P1035 -> property:honorificSuffix
['label', 'N', 'property:director', '198'] 9.348042964935303
P1037 -> property:director
['label', 'N', 'property:relatives', '749'] 9.292741060256958
P1038 ->

['label', 'N', 'property:series', '3268'] 10.579928874969482
P1441 -> property:series
['label', 'N', 'property:destination', '562'] 10.100733518600464
P1444 -> property:destination
['label', 'N', 'property:films', '3'] 9.958155632019043
P1445 -> property:films
['label', 'N', 'property:type', '4484'] 10.151318788528442
P1454 -> property:type
['label', 'N', 'property:works', '7'] 9.194974899291992
P1455 -> property:works
['label', 'N', 'property:exImageCap', '16'] 9.309904098510742
P1456 -> property:exImageCap
['label', 'N', 'property:organization', '22'] 9.14981460571289
P1462 -> property:organization
['label', 'N'] 11.060544490814209
['label', 'N'] 10.295295476913452
['label', 'N', 'property:architecture', '7378'] 11.202978134155273
P149 -> property:architecture
['label', 'N', 'property:p', '391'] 9.280627727508545
P150 -> property:p
['label', 'N', 'property:birthPlace', '14298'] 9.666303157806396
P1532 -> property:birthPlace
['label', 'N', 'property:seealso', '8'] 8.2710542678833
P153

['label', 'N', 'property:inflow', '1644'] 10.699360370635986
P200 -> property:inflow
['label', 'N', 'property:outflow', '1980'] 10.923537254333496
P201 -> property:outflow
['label', 'N', 'property:foodType', '56'] 13.46058440208435
P2012 -> property:foodType
['label', 'N', 'property:basinCountries', '2198'] 9.792478799819946
P205 -> property:basinCountries
['label', 'N', 'property:depositor', '68'] 10.010003089904785
P2058 -> property:depositor
['label', 'N', 'property:location', '2393'] 10.447643756866455
P206 -> property:location
['label', 'N', 'property:pictureFormat', '2'] 9.224528789520264
P2061 -> property:pictureFormat
['label', 'N', 'property:type', '36'] 10.587560653686523
P2079 -> property:type
['label', 'N', 'property:leaderTitle', '27'] 11.337594032287598
P208 -> property:leaderTitle
['label', 'N', 'property:judiciary', '50'] 9.01697039604187
P209 -> property:judiciary
['label', 'N', 'property:weight', '2191'] 9.834899425506592
P2094 -> property:weight
['label', 'N', 'prope

['label', 'N'] 9.811660289764404
['label', 'N', 'property:connectivity', '13'] 9.919489622116089
P2935 -> property:connectivity
['label', 'N', 'property:language', '526'] 10.467952251434326
P2936 -> property:language
['label', 'N', 'property:legislatureTerm', '1'] 9.856929063796997
P2937 -> property:legislatureTerm
['label', 'N', 'property:title', '1091'] 11.194416761398315
P2962 -> property:title
['label', 'N', 'property:region', '2'] 10.464884281158447
P2974 -> property:region
['label', 'N'] 10.096591472625732
['label', 'N'] 9.96108603477478
['label', 'N', 'property:uicclass', '300'] 9.706227779388428
P2978 -> property:uicclass
['label', 'N'] 10.519083499908447
['label', 'N'] 11.33467721939087
['label', 'N', 'property:location', '1283'] 10.190248489379883
P30 -> property:location
['label', 'N', 'property:caption', '1'] 9.508178949356079
P3005 -> property:caption
['label', 'N', 'property:lawsapplied', '11'] 9.305595397949219
P3014 -> property:lawsapplied
['label', 'N', 'property:crew1

['label', 'N'] 11.70696234703064
['label', 'N', 'property:basedOn', '1'] 9.880049705505371
P4099 -> property:basedOn
['label', 'N', 'property:rank', '16112'] 9.965999126434326
P410 -> property:rank
['label', 'N', 'property:party', '4'] 9.995343923568726
P4100 -> property:party
['label', 'N', 'property:almaMater', '4'] 9.666962623596191
P4101 -> property:almaMater
['label', 'N', 'property:honorificPrefix', '339'] 10.714900493621826
P411 -> property:honorificPrefix
['label', 'N', 'property:occupation', '108'] 11.551725387573242
P412 -> property:occupation
['label', 'N', 'property:position', '100653'] 10.449514865875244
P413 -> property:position
['label', 'N', 'property:exchanges', '15'] 9.916820287704468
P414 -> property:exchanges
['label', 'N'] 9.488610744476318
['label', 'N'] 9.394213914871216
['label', 'N', 'property:format', '2711'] 10.5820894241333
P415 -> property:format
['label', 'N', 'property:patron', '450'] 11.831217050552368
P417 -> property:patron
['label', 'N', 'property:mot

['label', 'N'] 10.572020769119263
['label', 'N'] 10.419256448745728
['label', 'N', 'property:eng1Name', '402'] 9.573585510253906
P516 -> property:eng1Name
['label', 'N', 'property:interaction', '48'] 11.100606441497803
P517 -> property:interaction
['label', 'N'] 9.497198820114136
['label', 'N', 'property:weapons', '27'] 9.982557535171509
P520 -> property:weapons
['label', 'N', 'property:writer', '25'] 10.459373474121094
P5202 -> property:writer
['label', 'N'] 11.062926292419434
['label', 'N'] 10.004748582839966
['label', 'N', 'property:regions', '2'] 10.96304702758789
P521 -> property:regions
['label', 'N', 'property:orbitRegime', '772'] 9.42908763885498
P522 -> property:orbitRegime
['label', 'N', 'property:age', '8'] 9.406508445739746
P523 -> property:age
['label', 'N'] 10.597260475158691
['label', 'N', 'property:age', '9'] 10.880054712295532
P524 -> property:age
['label', 'N'] 10.525991678237915
['label', 'N', 'property:league', '18'] 11.265305042266846
P5249 -> property:league
['lab

['label', 'N', 'property:writer', '5612'] 7.184404611587524
P676 -> property:writer
['label', 'N'] 7.27603006362915
['label', 'N'] 7.0465087890625
['label', 'N', 'property:employer', '18'] 7.1677186489105225
P6872 -> property:employer
['label', 'N'] 7.255326747894287
['label', 'N', 'property:subdivisionName', '1'] 7.3217175006866455
P6885 -> property:subdivisionName
['label', 'N', 'property:language', '778'] 7.259411334991455
P6886 -> property:language
['label', 'N', 'property:architecture', '16'] 7.368269920349121
P6889 -> property:architecture
['label', 'N', 'property:almaMater', '213'] 7.335354804992676
P69 -> property:almaMater
['label', 'N'] 7.196554899215698
['label', 'N', 'property:eraName', '4'] 7.183977842330933
P6902 -> property:eraName
['label', 'N'] 7.2014124393463135
['label', 'N', 'property:animator', '123'] 7.464440584182739
P6942 -> property:animator
['label', 'N'] 6.923638105392456
['label', 'N'] 7.635486125946045
['label', 'N'] 7.0911705493927
['label', 'N'] 7.1718540

['label', 'N', 'property:architect', '12989'] 7.203981876373291
P84 -> property:architect
['label', 'N', 'property:country', '739'] 6.820436954498291
P840 -> property:country
['label', 'N', 'property:products', '1'] 7.295710325241089
P8402 -> property:products
['label', 'N'] 7.193270444869995
['label', 'N'] 7.474035739898682
['label', 'N', 'property:workplaces', '1'] 7.65239143371582
P8413 -> property:workplaces
['label', 'N', 'property:listing', '188'] 7.333124399185181
P8450 -> property:listing
['label', 'N', 'property:publictransit', '1'] 7.407061815261841
P8453 -> property:publictransit
['label', 'N'] 7.291511297225952
['label', 'N'] 7.443562030792236
['label', 'N', 'property:nationalAnthem', '51'] 7.494327545166016
P85 -> property:nationalAnthem
['label', 'N'] 7.323984861373901
['label', 'N'] 7.393872022628784
['label', 'N'] 7.1161909103393555
['label', 'N'] 7.2422003746032715
['label', 'N'] 7.337549448013306
['label', 'N'] 7.341680288314819
['label', 'N', 'property:sponsor', '534

In [13]:
# save property_mapping
with open(os.environ['PROPERTY_MAPPING'], 'w+') as f:
    json.dump(property_mapping, f, indent=4)

In [14]:
# Reverse lookup {DBpedia property: [Wikidata properties]}.
# The mapping is not required to be one-to-one: several Wikidata properties may share
# one DBpedia property, so every value is a list.
wikidata_props = set(property_mapping.keys())
dbpedia_props = set(property_mapping.values())
print('Number of Wikidata properties: %d' % len(wikidata_props))
print('Number of DBpedia properties: %d' % len(dbpedia_props))

property_mapping_inverted = defaultdict(list)
for wikidata_prop, dbpedia_prop in property_mapping.items():
    property_mapping_inverted[dbpedia_prop].append(wikidata_prop)

Number of Wikidata properties: 582
Number of DBpedia properties: 411


# Step 5 Results from other KG(s)

For entities that don't have a value for the property, query the Wikidata infobox:

In [16]:
# Single pass over $WIKI_INFO that copies every row whose DBpedia property is aligned
# with at least one Wikidata property into that property's $<PROP>_INFOBOX_RESULTS file.
# A row is written once per aligned Wikidata property; each output file is opened
# lazily and starts with the infobox header line.
DBpediaResultsFileDict = {}
try:
    with open(os.environ['WIKI_INFO'], 'r') as f:
        headerLine = next(f)
        for line in tqdm(f):
            lineP = line.rstrip().split("\t")
            # Skip blank or truncated rows instead of crashing on lineP[1].
            if len(lineP) < 2:
                continue
            # Column 1 is presumably the edge label (node1, label, ...) - verify against the header.
            dbpedia_prop = lineP[1]
            if dbpedia_prop in property_mapping_inverted:
                for wiki_prop in property_mapping_inverted[dbpedia_prop]:
                    if wiki_prop not in DBpediaResultsFileDict:
                        DBpediaResultsFileDict[wiki_prop] = open(os.environ["%s_INFOBOX_RESULTS" % wiki_prop], "w")
                        DBpediaResultsFileDict[wiki_prop].write(headerLine)
                    DBpediaResultsFileDict[wiki_prop].write(line)
finally:
    # Close (and thereby flush) every per-property file even if the scan fails midway.
    for file1 in DBpediaResultsFileDict.values():
        file1.close()

121503677it [02:53, 700511.14it/s]


## Filter new results found from additional KG

In [17]:
# Keep only the infobox rows whose (node1, node2) pair does not already occur in the
# Wikidata results of the same property; the survivors go to $<PROP>_NEW_RESULTS.
# All three %s placeholders receive the property id.
new_results_template = "kgtk ifnotexists -i $%s_INFOBOX_RESULTS \
        --filter-on $%s_RESULTS \
        --input-keys node1 node2 \
        --filter-keys node1 node2 \
        -o $%s_NEW_RESULTS"

for prop in property_mapping:
    infobox_file = os.environ['%s_INFOBOX_RESULTS' % prop]
    # Properties without any infobox rows have nothing to filter.
    if not os.path.exists(infobox_file) or os.path.getsize(infobox_file) == 0:
        continue
    started_at = time.time()
    exit_status = os.system(new_results_template % (prop, prop, prop))
    elapsed = time.time() - started_at
    property_runtime[prop]['Knowledge Retrieval'] = elapsed
    print(prop, exit_status, elapsed)

P1000 0 14.239500761032104
P1001 0 7.0099499225616455
P1002 0 5.170429229736328
P1018 0 5.499737501144409
P102 0 8.02161955833435
P1028 0 5.845066070556641
P1029 0 5.515587329864502
P103 0 7.0580878257751465
P1035 0 6.119622230529785
P1037 0 6.890001535415649
P1038 0 5.538966655731201
P1040 0 6.140905141830444
P1041 0 5.61583399772644
P1049 0 5.523047685623169
P1050 0 5.715090751647949
P1056 0 5.429327011108398
P106 0 25.056373119354248
P1064 0 5.966238021850586
P1066 0 5.7146430015563965
P1068 0 5.5681235790252686
P1071 0 5.87145209312439
P1072 0 5.485913991928101
P1073 0 5.565650224685669
P1075 0 5.369428873062134
P1078 0 5.385726451873779
P1079 0 5.456471681594849
P108 0 8.877389669418335
P1080 0 5.985927581787109
P110 0 5.889227390289307
P111 0 5.4259514808654785
P112 0 5.8835718631744385
P113 0 5.2775561809539795
P1136 0 5.542478799819946
P114 0 5.667109966278076
P1142 0 5.768745183944702
P115 0 7.546982526779175
P1165 0 5.34912896156311
P1170 0 5.023123025894165
P118 0 6.29208946

P408 0 5.460708379745483
P4099 0 5.468592166900635
P410 0 6.22787070274353
P4100 0 7.136110305786133
P4101 0 7.041820287704468
P411 0 6.9082701206207275
P412 0 8.363350629806519
P413 0 7.466332674026489
P414 0 5.654929876327515
P415 0 4.973104000091553
P417 0 5.533447265625
P418 0 5.634929895401001
P421 0 10.780673742294312
P4220 0 5.98402214050293
P425 0 5.3832032680511475
P427 0 6.703229188919067
P4290 0 5.8421008586883545
P4322 0 5.538647890090942
P4345 0 5.327486276626587
P4353 0 5.449854135513306
P437 0 5.80517053604126
P4379 0 5.619333267211914
P4428 0 5.514729738235474
P449 0 5.916743278503418
P450 0 5.489628314971924
P451 0 5.55169677734375
P452 0 5.961020231246948
P4552 0 6.280843734741211
P457 0 5.607668161392212
P4584 0 6.592576503753662
P4586 0 5.514540433883667
P4608 0 6.40437650680542
P4614 0 5.70348858833313
P462 0 5.940125226974487
P463 0 7.2791056632995605
P4647 0 5.392583131790161
P466 0 5.5777671337127686
P4661 0 5.828569412231445
P467 0 5.436765193939209
P468 0 13.2

- Count rows of new findings:

In [None]:
# for prop, val in property_mapping.items():
#     output = subprocess.check_output("wc -l < $%s_NEW_RESULTS" % prop, shell=True)
#     output = output.decode("utf-8").strip()
#     output = int(output) - 1
#     print("%s: %s" % (prop, output))

- Count unique entities of new findings:

In [None]:
# for prop, val in property_mapping.items():
#     command = "kgtk query -i $%s_NEW_RESULTS \
#         --match 'n: (p)-[]->()' \
#         --return 'count(distinct p) as N'" % prop
#     output = subprocess.check_output(command, shell=True)
#     output = output.decode("utf-8").strip().split()[1]
#     # print("%s: %s" % (prop, output))

## Filter out entities (rows) we still don't know

In [None]:
# for prop, val in property_mapping.items():
#     command = "kgtk ifnotexists -i $%s_QUERY_FILE \
#         --filter-on $%s_NEW_RESULTS \
#         --input-keys node1 \
#         --filter-keys node1 \
#         -o $%s_UNKNOWN" % (prop, prop, prop)
#     print(command)
#     start = time.time()
#     code = os.system(command)
#     runtime = time.time() - start
#     property_runtime[prop].append(runtime)
#     print(code, runtime)

- Count rows still unknown:

In [None]:
# for prop, val in property_mapping.items():
#     output = subprocess.check_output("wc -l < $%s_UNKNOWN" % prop, shell=True)
#     output = output.decode("utf-8").strip()
#     output = int(output) - 1
#     print("%s: %s" % (prop, output))

- Count entities still unknown:

In [None]:
# for prop, val in property_mapping.items():
#     command = "kgtk query -i $%s_UNKNOWN \
#         --match 'n: (p)-[]->()' \
#         --return 'count(distinct p) as N'" % prop
#     output = subprocess.check_output(command, shell=True)
#     output = output.decode("utf-8").strip().split()[1]
#     print("%s: %s" % (prop, output))

# Step 6 Datatype Filtering

## 1. Filter Structured literals:

In [18]:
# Structured-literal pass. For every property that has a usable NEW_RESULTS
# file, keep the edges (q)-[p]->(s) whose value `s` is neither a
# language-qualified string nor a number and that has a
# "dbpedia:structured_value" edge in the WIKI_INFO graph. The result is
# written to <prop>_STRUCTURED_LITERALS.
for prop in property_mapping.keys():
    new_results_path = os.environ['%s_NEW_RESULTS' % prop]
    # A missing or zero-byte NEW_RESULTS file means there is nothing to filter.
    if not os.path.exists(new_results_path) or os.path.getsize(new_results_path) == 0:
        continue
    kgtk_call = "kgtk query -i $%s_NEW_RESULTS -i $WIKI_INFO \
        --match 'n: (q)-[p]->(s), w: (s)-[sv]->(v)' \
        --where 'NOT kgtk_lqstring(s) AND NOT kgtk_number(s) AND sv.label = \"dbpedia:structured_value\"' \
        --return 'q, p.label, s' \
        -o $%s_STRUCTURED_LITERALS" % (prop, prop)
    began = time.time()
    exit_status = os.system(kgtk_call)
    elapsed = time.time() - began
    # Plain assignment: this pass (re)starts the 'Datatype Filtering' timing
    # entry for the property.
    property_runtime[prop]['Datatype Filtering'] = elapsed
    # The shell exit status is only reported here, not acted upon.
    print(prop, exit_status, elapsed)

P1000 0 22.86501431465149
P1001 0 10.287123918533325
P1002 0 7.437270164489746
P1018 0 7.923629522323608
P102 0 10.173832178115845
P1028 0 8.670959234237671
P1029 0 7.318754196166992
P103 0 9.475188732147217
P1035 0 7.771365642547607
P1037 0 9.46187710762024
P1038 0 8.433539390563965
P1040 0 8.202587366104126
P1041 0 7.4213292598724365
P1049 0 7.645489692687988
P1050 0 7.406570911407471
P1056 0 7.291668891906738
P106 0 9.548047065734863
P1064 0 7.766049861907959
P1066 0 8.06187915802002
P1068 0 8.31909704208374
P1071 0 8.306559324264526
P1072 0 7.576040983200073
P1073 0 7.536308526992798
P1075 0 7.653122901916504
P1078 0 7.547594785690308
P1079 0 8.050579071044922
P108 0 9.232857465744019
P1080 0 8.785923719406128
P110 0 7.551856279373169
P111 0 7.780805349349976
P112 0 8.26680874824524
P113 0 7.473475456237793
P1136 0 7.7300169467926025
P114 0 7.872782468795776
P1142 0 7.666178464889526
P115 0 16.036295413970947
P1165 0 7.499665260314941
P1170 0 7.558686017990112
P118 0 8.709635257720

P4099 0 7.742267847061157
P410 0 7.752700090408325
P4100 0 11.154851198196411
P4101 0 10.462905168533325
P411 0 7.8429834842681885
P412 0 9.523077726364136
P413 0 8.117112636566162
P414 0 7.472893238067627
P415 0 7.540284872055054
P417 0 8.42399001121521
P418 0 8.821882009506226
P421 0 9.30151629447937
P4220 0 7.8311309814453125
P425 0 8.160682439804077
P427 0 8.866817951202393
P4290 0 7.486780166625977
P4322 0 7.78193211555481
P4345 0 7.41043758392334
P4353 0 7.633167028427124
P437 0 8.221201658248901
P4379 0 7.52195143699646
P4428 0 8.047229766845703
P449 0 8.111490726470947
P450 0 7.5754218101501465
P451 0 7.628708839416504
P452 0 7.72196102142334
P4552 0 7.65238618850708
P457 0 7.489722490310669
P4584 0 8.472846508026123
P4586 0 7.691105842590332
P4608 0 8.700510025024414
P4614 0 7.478754043579102
P462 0 7.649639129638672
P463 0 9.51804494857788
P4647 0 9.080449342727661
P466 0 8.026665449142456
P4661 0 7.319331169128418
P467 0 7.90833854675293
P468 0 18.506786823272705
P4688 0 7.7

## 2. Filter Qnodes

In [19]:
# Qnode pass, two KGTK calls per property:
#   1. collect the distinct edges of NEW_RESULTS whose value is neither a
#      language-qualified string nor a number          -> <prop>_NODES
#   2. keep only the NODES rows with no counterpart in
#      <prop>_STRUCTURED_LITERALS                       -> <prop>_QNODES
#      (no key columns are passed, so KGTK's defaults apply -- see the
#      `kgtk ifnotexists` documentation)
for prop in property_mapping.keys():
    new_results_path = os.environ['%s_NEW_RESULTS' % prop]
    # Same guard as the structured-literal pass: skip missing or empty input.
    if not os.path.exists(new_results_path) or os.path.getsize(new_results_path) == 0:
        continue
    collect_nodes = "kgtk query -i $%s_NEW_RESULTS \
        --match 'n: (q)-[p]->(v)' \
        --where 'NOT kgtk_lqstring(v) AND NOT kgtk_number(v)' \
        --return 'distinct q, p.label, v' \
        -o $%s_NODES" % (prop, prop)
    drop_structured = "kgtk ifnotexists -i $%s_NODES \
        --filter-on $%s_STRUCTURED_LITERALS \
        -o $%s_QNODES" % (prop, prop, prop)
    began = time.time()
    status_nodes = os.system(collect_nodes)
    status_qnodes = os.system(drop_structured)
    elapsed = time.time() - began
    # Added on top of the existing 'Datatype Filtering' entry, which therefore
    # must already be present for this property.
    property_runtime[prop]['Datatype Filtering'] += elapsed
    print(prop, status_nodes, status_qnodes, elapsed)

P1000 0 0 16.693738222122192
P1001 0 0 11.366452932357788
P1002 0 0 11.146271705627441
P1018 0 0 11.381734848022461
P102 0 0 11.869765043258667
P1028 0 0 11.142835140228271
P1029 0 0 11.228919982910156
P103 0 0 11.427511930465698
P1035 0 0 11.049703359603882
P1037 0 0 12.058863401412964
P1038 0 0 10.65898084640503
P1040 0 0 10.795746803283691
P1041 0 0 10.26636815071106
P1049 0 0 10.489820718765259
P1050 0 0 10.573704481124878
P1056 0 0 10.650339126586914
P106 0 0 11.280210971832275
P1064 0 0 10.640970706939697
P1066 0 0 11.700672149658203
P1068 0 0 11.572436571121216
P1071 0 0 10.938103914260864
P1072 0 0 10.52578353881836
P1073 0 0 10.3564293384552
P1075 0 0 10.647947311401367
P1078 0 0 10.332383394241333
P1079 0 0 10.47180724143982
P108 0 0 10.801570415496826
P1080 0 0 10.588059186935425
P110 0 0 10.57086443901062
P111 0 0 10.740450620651245
P112 0 0 10.936720132827759
P113 0 0 10.885836124420166
P1136 0 0 10.375614881515503
P114 0 0 11.027217388153076
P1142 0 0 10.962996006011963
P

P3679 0 0 11.360345363616943
P37 0 0 10.694716215133667
P371 0 0 10.748703956604004
P3716 0 0 12.591802597045898
P3719 0 0 10.296342849731445
P375 0 0 10.363839149475098
P376 0 0 10.822227001190186
P3776 0 0 10.224819421768188
P38 0 0 10.23629903793335
P3828 0 0 10.430608034133911
P3842 0 0 10.803788661956787
P3876 0 0 10.182240962982178
P39 0 0 15.699948787689209
P3912 0 0 10.796047687530518
P3919 0 0 11.023973941802979
P3931 0 0 12.19492220878601
P3966 0 0 11.588162183761597
P3967 0 0 10.566311597824097
P397 0 0 10.192039251327515
P3975 0 0 10.126015663146973
P398 0 0 10.357684850692749
P3985 0 0 10.283171653747559
P3989 0 0 11.720369577407837
P399 0 0 10.667452096939087
P40 0 0 10.445375680923462
P400 0 0 10.412548542022705
P403 0 0 10.546557903289795
P4032 0 0 11.903507709503174
P405 0 0 11.23061728477478
P406 0 0 17.694618463516235
P407 0 0 11.485129833221436
P408 0 0 10.494848728179932
P4099 0 0 10.935238361358643
P410 0 0 11.101004838943481
P4100 0 0 12.72679328918457
P4101 0 0 

P98 0 0 10.847271919250488
P991 0 0 11.896927833557129


In [20]:
# Record, per property, how many rows and how many distinct subject nodes the
# <prop>_QNODES file holds, under the 'Found Total (Qnodes)' key.
for prop in property_mapping.keys():
    qnodes_path = os.environ['%s_QNODES' % prop]
    # A missing or zero-byte file counts as zero rows and zero nodes.
    if not os.path.exists(qnodes_path) or os.path.getsize(qnodes_path) == 0:
        property_lines_count[prop]['Found Total (Qnodes)'] = 0
        property_qnodes_count[prop]['Found Total (Qnodes)'] = 0
        continue
    # Row count: `wc -l` minus one (presumably the TSV header row).
    wc_stdout = subprocess.check_output("wc -l < $%s_QNODES" % prop, shell=True)
    row_total = int(wc_stdout.decode("utf-8").strip()) - 1
    property_lines_count[prop]['Found Total (Qnodes)'] = row_total
    print("%s -> %d" % (prop, row_total))
    # Distinct-subject count: the value sits on the second line of the query
    # output (the first line is skipped).
    count_query = "kgtk query -i $%s_QNODES --match '(qnode)-[]->()' --return 'count(distinct qnode)'" % prop
    query_stdout = subprocess.check_output(count_query, shell=True)
    distinct_total = int(query_stdout.decode("utf-8").strip().split('\n')[1])
    property_qnodes_count[prop]['Found Total (Qnodes)'] = distinct_total

P1000 -> 405325
P1001 -> 5539
P1002 -> 894
P1018 -> 840
P102 -> 71336
P1028 -> 19398
P1029 -> 157
P103 -> 78835
P1035 -> 22074
P1037 -> 133425
P1038 -> 25895
P1040 -> 12212
P1041 -> 218
P1049 -> 5857
P1050 -> 3123
P1056 -> 20949
P106 -> 58601
P1064 -> 97
P1066 -> 10109
P1068 -> 259
P1071 -> 26924
P1072 -> 708
P1073 -> 490
P1075 -> 185
P1078 -> 360
P1079 -> 1362
P108 -> 16744
P1080 -> 22500
P110 -> 1140
P111 -> 44
P112 -> 9494
P113 -> 1397
P1136 -> 19
P114 -> 2107
P1142 -> 21211
P115 -> 486253
P1165 -> 319
P1170 -> 113
P118 -> 65947
P119 -> 6488
P1192 -> 18500
P1194 -> 113
P1201 -> 2485
P1202 -> 2120
P121 -> 1566
P1211 -> 648
P122 -> 17093
P123 -> 31693
P126 -> 2571
P127 -> 33865
P1283 -> 366
P129 -> 8
P1290 -> 176778
P1299 -> 9629
P1303 -> 14493
P1308 -> 11461
P131 -> 757858
P1313 -> 45322
P1318 -> 13
P1321 -> 933388
P1322 -> 10
P1327 -> 31
P1336 -> 1128993
P1343 -> 405254
P1344 -> 678136
P1346 -> 11730
P135 -> 7263
P136 -> 380620
P137 -> 21289
P1383 -> 19507
P1398 -> 223
P1399 -> 1120

# Step 7 Quality Checking

Agree:

In [21]:
# "Agree" set: edges (qnode, value) that occur both in <prop>_RESULTS and in
# <prop>_INFOBOX_RESULTS, written to <prop>_AGREE with node1/label/node2
# columns. The elapsed time is printed only; it is not stored in
# property_runtime.
for prop in property_mapping.keys():
    began = time.time()
    agree_query = "kgtk query -i $%s_RESULTS $%s_INFOBOX_RESULTS \
        --match 'r: (qnode)-[p]->(value), i: (qnode)-[]->(value)' \
        --return 'qnode as `node1`, p.label as `label`, value as `node2`' \
        -o $%s_AGREE" % (prop, prop, prop)
    exit_status = os.system(agree_query)
    elapsed = time.time() - began
    print(prop, exit_status, elapsed)

P1000 0 16.20970845222473
P1001 0 13.254762649536133
P1002 0 7.8236188888549805
P1018 0 8.06665849685669
P102 0 10.895379066467285
P1028 0 8.129587888717651
P1029 0 7.930304288864136
P103 0 9.48814606666565
P1035 0 8.682162284851074
P1037 0 8.459880590438843
P1038 0 8.644829511642456
P1040 0 9.135504007339478
P1041 0 8.004392862319946
P1049 0 8.050572156906128
P1050 0 8.476483345031738
P1056 0 9.12816834449768
P106 0 23.50189185142517
P1064 0 8.136825323104858
P1066 0 8.320223569869995
P1068 0 8.010871410369873
P1071 0 8.478798627853394
P1072 0 8.353845596313477
P1073 0 8.448908805847168
P1075 0 8.270445823669434
P1078 0 8.178609848022461
P1079 0 8.489248275756836
P108 0 10.499580144882202
P1080 0 8.078675746917725
P110 0 8.290950536727905
P111 0 8.473113059997559
P112 0 8.244663715362549
P113 0 8.113182783126831
P1136 0 8.058680534362793
P114 0 8.114022970199585
P1142 0 8.270549297332764
P115 0 9.689201354980469
P1165 0 8.17296576499939
P1170 0 8.480016469955444
P118 0 8.6741776466369

P4099 0 6.179319381713867
P410 0 6.391169786453247
P4100 0 7.306926250457764
P4101 0 8.721378803253174
P411 0 5.9598493576049805
P412 0 8.501741409301758
P413 0 8.102670669555664
P414 0 5.8886497020721436
P415 0 6.150975227355957
P417 0 5.904102563858032
P418 0 6.101287841796875
P421 0 9.431498765945435
P4220 0 8.403909683227539
P425 0 6.097572565078735
P427 0 7.63640284538269
P4290 0 6.172435283660889
P4322 0 6.026441335678101
P4345 0 5.787244081497192
P4353 0 5.718942880630493
P437 0 8.744945526123047
P4379 0 6.040111064910889
P4428 0 6.101772785186768
P449 0 6.284129619598389
P450 0 5.921475172042847
P451 0 6.215871572494507
P452 0 6.901746034622192
P4552 0 6.1364476680755615
P457 0 6.270791530609131
P4584 0 6.7470786571502686
P4586 0 6.019038915634155
P4608 0 6.6393163204193115
P4614 0 6.157989263534546
P462 0 6.0942394733428955
P463 0 9.803139925003052
P4647 0 6.396888732910156
P466 0 6.232813596725464
P4661 0 5.94290018081665
P467 0 5.995209693908691
P468 0 14.987680435180664
P46

In [22]:
# Record, per property, how many rows and how many distinct subject nodes the
# <prop>_AGREE file holds, under the 'Agree' key.
for prop in property_mapping.keys():
    agree_path = os.environ['%s_AGREE' % prop]
    # A missing or zero-byte file counts as zero rows and zero nodes.
    if not os.path.exists(agree_path) or os.path.getsize(agree_path) == 0:
        property_lines_count[prop]['Agree'] = 0
        property_qnodes_count[prop]['Agree'] = 0
        continue
    # Row count: `wc -l` minus one (presumably the TSV header row).
    wc_stdout = subprocess.check_output("wc -l < $%s_AGREE" % prop, shell=True)
    row_total = int(wc_stdout.decode("utf-8").strip()) - 1
    property_lines_count[prop]['Agree'] = row_total
    print("%s -> %d" % (prop, row_total))
    # Distinct-subject count: the value sits on the second line of the query
    # output (the first line is skipped).
    count_query = "kgtk query -i $%s_AGREE --match '(qnode)-[]->()' --return 'count(distinct qnode)'" % prop
    query_stdout = subprocess.check_output(count_query, shell=True)
    distinct_total = int(query_stdout.decode("utf-8").strip().split('\n')[1])
    property_qnodes_count[prop]['Agree'] = distinct_total

P1000 -> 72
P1001 -> 1112
P1002 -> 57
P1018 -> 69
P102 -> 98619
P1028 -> 4
P1029 -> 1496
P103 -> 288
P1035 -> 142
P1037 -> 198
P1038 -> 749
P1040 -> 15115
P1041 -> 20
P1049 -> 30
P1050 -> 56
P1056 -> 4456
P106 -> 46922
P1064 -> 46
P1066 -> 489
P1068 -> 47
P1071 -> 2397
P1072 -> 10
P1073 -> 9
P1075 -> 47
P1078 -> 3
P1079 -> 122
P108 -> 17020
P1080 -> 114
P110 -> 1435
P111 -> 107
P112 -> 9913
P113 -> 1491
P1136 -> 6
P114 -> 197
P1142 -> 6606
P115 -> 7633
P1165 -> 39
P1170 -> 1
P118 -> 10263
P119 -> 5697
P1192 -> 88
P1194 -> 1
P1201 -> 6
P1202 -> 6
P121 -> 114
P1211 -> 6
P122 -> 497
P123 -> 39042
P126 -> 3401
P127 -> 22714
P1283 -> 153
P129 -> 3
P1290 -> 1
P1299 -> 20
P1303 -> 7394
P1308 -> 996
P131 -> 372011
P1313 -> 404
P1318 -> 12
P1321 -> 40
P1322 -> 28
P1327 -> 1107
P1336 -> 10
P1343 -> 124
P1344 -> 777
P1346 -> 10855
P135 -> 3739
P136 -> 223646
P137 -> 24955
P1383 -> 55
P1398 -> 5
P1399 -> 481
P140 -> 6607
P1408 -> 14744
P1411 -> 491
P1412 -> 2151
P1414 -> 49
P1416 -> 1683
P1419 -> 

Semantic Checking:

In [23]:
# Semantic validation: split each property's <prop>_QNODES edges into
# <prop>_CORRECT and <prop>_INCORRECT using the class list in
# property_dict[prop] (presumably the allowed value types for the property --
# it is defined outside this section). The five KGTK commands run strictly in
# order because each consumes the previous command's output file.
for prop in property_mapping.keys():
    began = time.time()
    allowed_classes = property_dict[prop]
    validation_steps = [
        # 1. Edges whose value reaches an allowed class through one P31 edge
        #    followed by a P279star edge.
        "kgtk query -i $%s_QNODES $P31 $P279STAR \
        --match 'q: (node1)-[nodeProp]->(node2), P31: (node2)-[]->(nodex), P279star: (nodex)-[]->(par)' \
        --where 'par in %s' \
        --return 'distinct node1 as `node1`, nodeProp.label as `label`, node2 as `node2`' \
        -o $%s_CORRECT_TEMP_1" % (prop, allowed_classes, prop),
        # 2. Everything in QNODES that step 1 did not accept (matched on
        #    node1 + node2).
        "kgtk ifnotexists -i $%s_QNODES \
        --filter-on $%s_CORRECT_TEMP_1 \
        --input-keys node1 node2 \
        --filter-keys node1 node2 \
        -o $%s_INCORRECT_TEMP" % (prop, prop, prop),
        # 3. Second chance: the value itself reaches an allowed class through a
        #    P279star edge (no P31 hop).
        "kgtk query -i $%s_INCORRECT_TEMP $P279STAR \
        --match 'i: (node1)-[nodeProp]->(node2), P279star: (node2)-[]->(par)' \
        --where 'par in %s' \
        --return 'distinct node1 as `node1`, nodeProp.label as `label`, node2 as `node2`' \
        -o $%s_CORRECT_TEMP_2" % (prop, allowed_classes, prop),
        # 4. What is still unaccepted after step 3 is the final INCORRECT set.
        "kgtk ifnotexists -i $%s_INCORRECT_TEMP \
        --filter-on $%s_CORRECT_TEMP_2 \
        --input-keys node1 node2 \
        --filter-keys node1 node2 \
        -o $%s_INCORRECT" % (prop, prop, prop),
        # 5. CORRECT is the concatenation of both accepted sets.
        "kgtk cat -i $%s_CORRECT_TEMP_1 $%s_CORRECT_TEMP_2 \
        -o $%s_CORRECT" % (prop, prop, prop),
    ]
    # Exit statuses are collected for reporting only; a failing step does not
    # stop the remaining ones.
    exit_codes = [os.system(step) for step in validation_steps]
    elapsed = time.time() - began
    property_runtime[prop]['Semantic Validation'] = elapsed
    print(prop, *exit_codes, elapsed)

P1000 0 0 0 0 0 392.80291533470154
P1001 0 0 0 0 0 30.79641842842102
P1002 0 0 0 0 0 31.28206992149353
P1018 0 0 0 0 0 30.975862741470337
P102 0 0 0 0 0 32.6503963470459
P1028 0 0 0 0 0 31.028228998184204
P1029 0 0 0 0 0 31.37562918663025
P103 0 0 0 0 0 31.935678005218506
P1035 0 0 0 0 0 31.67425513267517
P1037 0 0 0 0 0 33.4055438041687
P1038 0 0 0 0 0 31.322067737579346
P1040 0 0 0 0 0 30.70585799217224
P1041 0 0 0 0 0 31.14078187942505
P1049 0 0 0 0 0 30.841254949569702
P1050 0 0 0 0 0 31.505194425582886
P1056 0 0 0 0 0 31.46564245223999
P106 0 0 0 0 0 32.67790222167969
P1064 0 0 0 0 0 30.535138607025146
P1066 0 0 0 0 0 31.034425497055054
P1068 0 0 0 0 0 31.01188039779663
P1071 0 0 0 0 0 31.241424083709717
P1072 0 0 0 0 0 31.03555727005005
P1073 0 0 0 0 0 29.731892585754395
P1075 0 0 0 0 0 30.737238883972168
P1078 0 0 0 0 0 30.627776622772217
P1079 0 0 0 0 0 30.493274450302124
P108 0 0 0 0 0 31.296816110610962
P1080 0 0 0 0 0 31.133455514907837
P110 0 0 0 0 0 30.660776615142822
P111

P291 0 0 0 0 0 52.336700439453125
P2935 0 0 0 0 0 47.867945432662964
P2936 0 0 0 0 0 43.03757166862488
P2937 0 0 0 0 0 32.96983885765076
P2962 0 0 0 0 0 39.205132722854614
P2974 0 0 0 0 0 30.205812692642212
P2978 0 0 0 0 0 29.278028964996338
P30 0 0 0 0 0 49.793874740600586
P3005 0 0 0 0 0 29.989274740219116
P3014 0 0 0 0 0 30.3783221244812
P3015 0 0 0 0 0 29.520907640457153
P3018 0 0 0 0 0 50.541430711746216
P3019 0 0 0 0 0 30.350632429122925
P3022 0 0 0 0 0 30.273959636688232
P3033 0 0 0 0 0 29.026657342910767
P306 0 0 0 0 0 30.906975507736206
P3075 0 0 0 0 0 30.338740825653076
P3080 0 0 0 0 0 34.61870765686035
P3085 0 0 0 0 0 28.49676537513733
P3092 0 0 0 0 0 33.97599649429321
P3137 0 0 0 0 0 30.685088634490967
P3173 0 0 0 0 0 30.76253628730774
P3174 0 0 0 0 0 29.139991521835327
P3179 0 0 0 0 0 77.90223217010498
P3189 0 0 0 0 0 38.87232255935669
P3190 0 0 0 0 0 41.267027378082275
P3261 0 0 0 0 0 38.93437838554382
P3262 0 0 0 0 0 41.78819417953491
P3300 0 0 0 0 0 42.64490246772766
P3

P7153 0 0 0 0 0 43.53784489631653
P7169 0 0 0 0 0 44.74743938446045
P720 0 0 0 0 0 51.535483598709106
P725 0 0 0 0 0 65.92175817489624
P726 0 0 0 0 0 53.11798548698425
P7309 0 0 0 0 0 48.95102858543396
P734 0 0 0 0 0 50.96526026725769
P736 0 0 0 0 0 52.26296901702881
P7376 0 0 0 0 0 48.076465129852295
P739 0 0 0 0 0 51.55290246009827
P744 0 0 0 0 0 32.52221131324768
P747 0 0 0 0 0 31.930588722229004
P7479 0 0 0 0 0 29.944363355636597
P748 0 0 0 0 0 31.223350048065186
P749 0 0 0 0 0 31.694908618927002
P750 0 0 0 0 0 35.43548893928528
P751 0 0 0 0 0 30.307313442230225
P7514 0 0 0 0 0 31.30108332633972
P767 0 0 0 0 0 35.901617765426636
P768 0 0 0 0 0 31.175861358642578
P770 0 0 0 0 0 31.141974449157715
P7719 0 0 0 0 0 30.6483473777771
P7727 0 0 0 0 0 30.835085153579712
P7779 0 0 0 0 0 31.099987745285034
P7781 0 0 0 0 0 31.75837755203247
P7782 0 0 0 0 0 31.250612497329712
P78 0 0 0 0 0 33.96938753128052
P780 0 0 0 0 0 31.76274347305298
P7888 0 0 0 0 0 32.923532247543335
P790 0 0 0 0 0 33.3

In [24]:
# Record, per property, how many rows and how many distinct subject nodes the
# <prop>_CORRECT file holds, under the 'Correct' key.
for prop in property_mapping.keys():
    correct_path = os.environ['%s_CORRECT' % prop]
    # A missing or zero-byte file counts as zero rows and zero nodes.
    if not os.path.exists(correct_path) or os.path.getsize(correct_path) == 0:
        property_lines_count[prop]['Correct'] = 0
        property_qnodes_count[prop]['Correct'] = 0
        continue
    # count in lines: `wc -l` minus one (presumably the TSV header row).
    # Fixed: the call was misspelled `subprocess.checka_output`, which raises
    # AttributeError as soon as a non-empty CORRECT file is encountered.
    lines = subprocess.check_output("wc -l < $%s_CORRECT" % prop, shell=True)
    lines = lines.decode("utf-8").strip()
    lines = int(lines) - 1
    property_lines_count[prop]['Correct'] = lines
    print("%s -> %d" % (prop, lines))
    # count in qnodes: the value sits on the second line of the query output
    # (the first line is skipped).
    command = "kgtk query -i $%s_CORRECT --match '(qnode)-[]->()' --return 'count(distinct qnode)'" % prop
    nodes = subprocess.check_output(command, shell=True)
    nodes = nodes.decode("utf-8").strip().split('\n')[1]
    nodes = int(nodes)
    property_qnodes_count[prop]['Correct'] = nodes

P1000 -> 68
P1001 -> 4778
P1002 -> 755
P1018 -> 745
P102 -> 66353
P1028 -> 18659
P1029 -> 146
P103 -> 75949
P1035 -> 10999
P1037 -> 131548
P1038 -> 24102
P1040 -> 11646
P1041 -> 124
P1049 -> 5308
P1050 -> 1618
P1056 -> 15695
P106 -> 29971
P1064 -> 72
P1066 -> 10032
P1068 -> 176
P1071 -> 19133
P1072 -> 15
P1073 -> 35
P1075 -> 147
P1078 -> 189
P1079 -> 612
P108 -> 16352
P1080 -> 444
P110 -> 1083
P111 -> 26
P112 -> 8972
P113 -> 864
P1136 -> 19
P114 -> 55
P1142 -> 18203
P115 -> 350386
P1165 -> 156
P1170 -> 52
P118 -> 49919
P119 -> 6274
P1192 -> 2252
P1194 -> 52
P1201 -> 279
P1202 -> 25
P121 -> 1550
P1211 -> 98
P122 -> 4040
P123 -> 30368
P126 -> 2181
P127 -> 29935
P1283 -> 1
P129 -> 6
P1290 -> 158516
P1299 -> 33
P1303 -> 12743
P1308 -> 10770
P131 -> 712951
P1313 -> 28196
P1318 -> 13
P1321 -> 2362
P1322 -> 6
P1327 -> 31
P1336 -> 377267
P1343 -> 197051
P1344 -> 585545
P1346 -> 11305
P135 -> 4441
P136 -> 345001
P137 -> 19214
P1383 -> 16397
P1398 -> 139
P1399 -> 1048
P140 -> 3952
P1408 -> 11999

In [25]:
# Record, per property, how many rows and how many distinct subject nodes the
# <prop>_INCORRECT file holds, under the 'Incorrect' key.
for prop in property_mapping.keys():
    incorrect_path = os.environ['%s_INCORRECT' % prop]
    # A missing or zero-byte file counts as zero rows and zero nodes.
    if not os.path.exists(incorrect_path) or os.path.getsize(incorrect_path) == 0:
        property_lines_count[prop]['Incorrect'] = 0
        property_qnodes_count[prop]['Incorrect'] = 0
        continue
    # Row count: `wc -l` minus one (presumably the TSV header row).
    wc_stdout = subprocess.check_output("wc -l < $%s_INCORRECT" % prop, shell=True)
    row_total = int(wc_stdout.decode("utf-8").strip()) - 1
    property_lines_count[prop]['Incorrect'] = row_total
    print("%s -> %d" % (prop, row_total))
    # Distinct-subject count: the value sits on the second line of the query
    # output (the first line is skipped).
    count_query = "kgtk query -i $%s_INCORRECT --match '(qnode)-[]->()' --return 'count(distinct qnode)'" % prop
    query_stdout = subprocess.check_output(count_query, shell=True)
    distinct_total = int(query_stdout.decode("utf-8").strip().split('\n')[1])
    property_qnodes_count[prop]['Incorrect'] = distinct_total

P1000 -> 405257
P1001 -> 761
P1002 -> 139
P1018 -> 95
P102 -> 4983
P1028 -> 739
P1029 -> 11
P103 -> 2886
P1035 -> 11075
P1037 -> 1877
P1038 -> 1793
P1040 -> 566
P1041 -> 94
P1049 -> 549
P1050 -> 1505
P1056 -> 5254
P106 -> 28630
P1064 -> 25
P1066 -> 77
P1068 -> 83
P1071 -> 7791
P1072 -> 693
P1073 -> 455
P1075 -> 38
P1078 -> 171
P1079 -> 750
P108 -> 392
P1080 -> 22056
P110 -> 57
P111 -> 18
P112 -> 522
P113 -> 533
P1136 -> 0
P114 -> 2052
P1142 -> 3008
P115 -> 135867
P1165 -> 163
P1170 -> 61
P118 -> 16028
P119 -> 214
P1192 -> 16248
P1194 -> 61
P1201 -> 2206
P1202 -> 2095
P121 -> 16
P1211 -> 550
P122 -> 13053
P123 -> 1325
P126 -> 390
P127 -> 3930
P1283 -> 365
P129 -> 2
P1290 -> 18262
P1299 -> 9596
P1303 -> 1750
P1308 -> 691
P131 -> 44907
P1313 -> 17126
P1318 -> 0
P1321 -> 931026
P1322 -> 4
P1327 -> 0
P1336 -> 751726
P1343 -> 208203
P1344 -> 92591
P1346 -> 425
P135 -> 2822
P136 -> 35619
P137 -> 2075
P1383 -> 3110
P1398 -> 84
P1399 -> 72
P140 -> 323
P1408 -> 1157
P1411 -> 405
P1412 -> 2886
P1

In [26]:
# Sanity check: for each property the CORRECT and INCORRECT files together
# must contain exactly as many rows as the QNODES file they were split from.
for prop in property_mapping.keys():
    # One `wc -l` per file, run in this order: CORRECT, INCORRECT, QNODES.
    line_count_commands = (
        "wc -l < $%s_CORRECT" % prop,
        "wc -l < $%s_INCORRECT" % prop,
        "wc -l < $%s_QNODES" % prop,
    )
    # Each count is reduced by one (presumably to discount the TSV header row).
    correct_lines, incorrect_lines, qnode_lines = (
        int(subprocess.check_output(shell_cmd, shell=True).decode("utf-8").strip()) - 1
        for shell_cmd in line_count_commands
    )

    print("%s: correct %d; incorrect %d; total %d" % (prop, correct_lines, incorrect_lines, qnode_lines))

    # Stops the cell at the first property whose split lost or duplicated rows.
    assert correct_lines + incorrect_lines == qnode_lines, "The sum is not correct!"

P1000: correct 68; incorrect 405257; total 405325
P1001: correct 4778; incorrect 761; total 5539
P1002: correct 755; incorrect 139; total 894
P1018: correct 745; incorrect 95; total 840
P102: correct 66353; incorrect 4983; total 71336
P1028: correct 18659; incorrect 739; total 19398
P1029: correct 146; incorrect 11; total 157
P103: correct 75949; incorrect 2886; total 78835
P1035: correct 10999; incorrect 11075; total 22074
P1037: correct 131548; incorrect 1877; total 133425
P1038: correct 24102; incorrect 1793; total 25895
P1040: correct 11646; incorrect 566; total 12212
P1041: correct 124; incorrect 94; total 218
P1049: correct 5308; incorrect 549; total 5857
P1050: correct 1618; incorrect 1505; total 3123
P1056: correct 15695; incorrect 5254; total 20949
P106: correct 29971; incorrect 28630; total 58601
P1064: correct 72; incorrect 25; total 97
P1066: correct 10032; incorrect 77; total 10109
P1068: correct 176; incorrect 83; total 259
P1071: correct 19133; incorrect 7791; total 2692

P2098: correct 164; incorrect 5370; total 5534
P210: correct 2016; incorrect 47; total 2063
P2152: correct 0; incorrect 0; total 0
P2159: correct 3; incorrect 10304; total 10307
P2175: correct 60; incorrect 819; total 879
P2176: correct 321; incorrect 84; total 405
P2184: correct 9; incorrect 96; total 105
P22: correct 3251; incorrect 845; total 4096
P2239: correct 10; incorrect 600; total 610
P2286: correct 262; incorrect 3; total 265
P2288: correct 28; incorrect 0; total 28
P2289: correct 160; incorrect 11; total 171
P2291: correct 0; incorrect 33577; total 33577
P2318: correct 866; incorrect 212; total 1078
P2319: correct 3; incorrect 1450; total 1453
P2321: correct 4310; incorrect 721; total 5031
P2341: correct 40558; incorrect 5497; total 46055
P2354: correct 0; incorrect 405344; total 405344
P2360: correct 408; incorrect 9; total 417
P2361: correct 114; incorrect 1008; total 1122
P237: correct 125; incorrect 52; total 177
P2378: correct 1769; incorrect 78; total 1847
P2388: corre

P4614: correct 434; incorrect 2763; total 3197
P462: correct 1376; incorrect 225; total 1601
P463: correct 46704; incorrect 45657; total 92361
P4647: correct 42461; incorrect 744; total 43205
P466: correct 5249; incorrect 836; total 6085
P4661: correct 28; incorrect 68; total 96
P467: correct 1430; incorrect 361; total 1791
P468: correct 0; incorrect 405377; total 405377
P4688: correct 1058; incorrect 18049; total 19107
P469: correct 949; incorrect 19728; total 20677
P47: correct 11919; incorrect 698; total 12617
P4743: correct 434; incorrect 48; total 482
P4788: correct 78; incorrect 5; total 83
P479: correct 3; incorrect 22602; total 22605
P4791: correct 6406; incorrect 152; total 6558
P4792: correct 10; incorrect 123; total 133
P483: correct 4604; incorrect 44237; total 48841
P485: correct 178529; incorrect 1319; total 179848
P488: correct 2048; incorrect 601; total 2649
P4884: correct 2052; incorrect 247; total 2299
P489: correct 13; incorrect 833; total 846
P4908: correct 105; inc

P8047: correct 2946; incorrect 220; total 3166
P807: correct 17303; incorrect 1553; total 18856
P81: correct 4385; incorrect 2944; total 7329
P8111: correct 9; incorrect 9945; total 9954
P812: correct 18989; incorrect 86381; total 105370
P8127: correct 774; incorrect 244; total 1018
P8131: correct 99; incorrect 10816; total 10915
P8138: correct 53848; incorrect 1075121; total 1128969
P816: correct 10; incorrect 164808; total 164818
P817: correct 0; incorrect 39; total 39
P823: correct 770; incorrect 564; total 1334
P826: correct 412; incorrect 9; total 421
P831: correct 1364; incorrect 947; total 2311
P832: correct 11; incorrect 843; total 854
P8324: correct 0; incorrect 51226; total 51226
P833: correct 99; incorrect 7953; total 8052
P8345: correct 1530; incorrect 20693; total 22223
P837: correct 0; incorrect 201; total 201
P84: correct 7438; incorrect 603; total 8041
P840: correct 142497; incorrect 414; total 142911
P8402: correct 24; incorrect 25357; total 25381
P8413: correct 32986;

# Step (8). Output statistics

## Runtime

In [27]:
# df = pd.DataFrame.from_dict(runtime_copy, orient='index', 
#                             columns=['Entity Resolution', 'Property Mapping', 'Query Wikidata Infobox', 'Filter new results', 'Datatype Filtering', 'Quality Checking'])
# df.to_csv('/nas/home/bohuizha/KG/hunger-for-knowledge/batch_output/runtime.csv', sep=',')
# Write the property_runtime mapping as indented, non-ASCII-preserving JSON to
# the file named by the RUNTIME environment variable.
runtime_json_path = os.environ['RUNTIME']
with open(runtime_json_path, 'w', encoding='utf-8') as runtime_out:
    json.dump(property_runtime, runtime_out, ensure_ascii=False, indent=4)

## Results count

In [28]:
# Write the per-property row counts (property_lines_count) as indented JSON to
# the file named by the LINE_STATISTICS environment variable.
line_stats_path = os.environ['LINE_STATISTICS']
with open(line_stats_path, 'w', encoding='utf-8') as line_stats_out:
    json.dump(property_lines_count, line_stats_out, ensure_ascii=False, indent=4)

In [29]:
# Write the per-property distinct-node counts (property_qnodes_count) as
# indented JSON to the file named by the QNODE_STATISTICS environment variable.
qnode_stats_path = os.environ['QNODE_STATISTICS']
with open(qnode_stats_path, 'w', encoding='utf-8') as qnode_stats_out:
    json.dump(property_qnodes_count, qnode_stats_out, ensure_ascii=False, indent=4)