# Step 1 User Query

In [1]:
import os
import re
import time
import json
import subprocess
import pandas as pd

from collections import defaultdict

## Define aliases and variables

In [2]:
# Parameters

# Directory that holds the database files.
data_path = "/nas/home/bohuizha/KG/hunger-for-knowledge/data/"

# Files of the KGTK Wikidata distribution used in this notebook, keyed by a short alias.
data_file_names = dict(
    claims="claims.tsv",
    wiki_info="wikidata_infobox.tsv",
    p31="P31.tsv",
    p279star="P279star.tsv",
    labels="labels.en.tsv",
    constraints="value_type_constraint.json",
)

# Every full path is exported as an environment variable (upper-cased alias) so
# that the shell commands issued later can refer to it as $ALIAS. The list below
# records the names of all variables exported this way.
os.environ['DATABASE'] = data_path
kgtk_environment_variables = ['DATABASE']

for alias in data_file_names:
    env_name = alias.upper()
    os.environ[env_name] = data_path + data_file_names[alias]
    kgtk_environment_variables.append(env_name)

# Echo every exported variable together with its value.
print("\n".join('{}: "{}"'.format(env_name, os.environ[env_name])
                for env_name in kgtk_environment_variables))

DATABASE: "/nas/home/bohuizha/KG/hunger-for-knowledge/data/"
CLAIMS: "/nas/home/bohuizha/KG/hunger-for-knowledge/data/claims.tsv"
WIKI_INFO: "/nas/home/bohuizha/KG/hunger-for-knowledge/data/wikidata_infobox.tsv"
P31: "/nas/home/bohuizha/KG/hunger-for-knowledge/data/P31.tsv"
P279STAR: "/nas/home/bohuizha/KG/hunger-for-knowledge/data/P279star.tsv"
LABELS: "/nas/home/bohuizha/KG/hunger-for-knowledge/data/labels.en.tsv"
PROPERTIES: "/nas/home/bohuizha/KG/hunger-for-knowledge/data/valueTypeConstraintValidator.sh"
CONSTRAINTS: "/nas/home/bohuizha/KG/hunger-for-knowledge/data/value_type_constraint.json"


**Method 1: query from Wikidata**
- property constraint (P2302)
- value-type constraint (Q21510865)

In [3]:
# property_list = !kgtk query -i $CLAIMS --match 'c: (q)-[p:P2302]->(v:Q21510865)' --return 'q' --limit 100
# property_list = property_list[1:5]

**Method 2: preprocess Kartik's work**

In [None]:
# Read the JSON file referenced by $CONSTRAINTS; the rest of the notebook
# iterates over the keys of the resulting dictionary as property ids.
constraints_file = os.environ['CONSTRAINTS']
with open(constraints_file) as handle:
    property_dict = json.load(handle)

In [5]:
# Timing table: property id -> {stage name -> seconds}.
# defaultdict(float) makes a stage that was never timed read as 0.0.
property_runtime = dict((prop_id, defaultdict(float)) for prop_id in property_dict)

In [6]:
# Counting tables, both keyed by property id and then by a category label:
#   property_lines_count  -> number of rows
#   property_qnodes_count -> number of distinct qnodes
# defaultdict(int) makes a category that was never counted read as 0.
property_lines_count = dict((prop_id, defaultdict(int)) for prop_id in property_dict)
property_qnodes_count = dict((prop_id, defaultdict(int)) for prop_id in property_dict)

In [16]:
# Parameters

# Folder on the local machine where the output and temporary folders are created.
output_path = "/nas/home/bohuizha/KG/hunger-for-knowledge/batch_output/"
os.makedirs(output_path, exist_ok=True)

# One sub-folder per kind of intermediate / final result. Each folder is
# exported as an environment variable named after the upper-cased folder name
# (e.g. $RESULTS, $NEW_RESULTS).
output_list = ['results', 'infobox_results', 'new_results', 'samples',
               'unknown', 'entity', 'class', 'query', 'agree',
               'direct_infer', 'indirect_infer', 'infer',
               'structured_literals', 'nodes', 'qnodes',
               'correct_temp_1', 'correct_temp_2', 'incorrect_temp',
               'correct', 'incorrect']
for folder_name in output_list:
    folder_path = os.path.join(output_path, folder_name)
    os.makedirs(folder_path, exist_ok=True)
    variable = folder_name.upper()
    os.environ[variable] = folder_path
    kgtk_environment_variables.append(variable)

os.environ['OUTPUT'] = output_path
kgtk_environment_variables.append('OUTPUT')

# Summary files written directly into the output folder. The paths are derived
# from output_path instead of repeating the absolute prefix, and every name
# registered in kgtk_environment_variables is the name that is actually set
# (the line-statistics file used to be registered as 'STATISTICS', a variable
# that never existed).
summary_files = {
    'PROPERTY_MAPPING': "property_mapping_full.json",
    'RUNTIME': "runtime.json",
    'LINE_STATISTICS': "line.statistics.json",
    'QNODE_STATISTICS': "qnode.statistics.json",
}
for variable, file_name in summary_files.items():
    os.environ[variable] = os.path.join(output_path, file_name)
    kgtk_environment_variables.append(variable)

# Per-property output files: variable suffix -> path template relative to
# output_path (%s is replaced by the property id). For property P and suffix
# "results" the exported variable is $P_RESULTS.
output_file_templates = {
    "results": "results/results.%s.tsv",
    "infobox_results": "infobox_results/infobox.results.%s.tsv",
    "new_results": "new_results/new.results.%s.tsv",
    "samples": "samples/samples.%s.tsv",
    "unknown": "unknown/unknown.%s.tsv",
    "class": "class/class.%s.tsv",
    "entity": "entity/entity.%s.tsv",
    "agree": "agree/agree.%s.tsv",
    "query_file": "query/query.file.%s.tsv",
    "direct_infer": "direct_infer/direct.infer.%s.tsv",
    "indirect_infer": "indirect_infer/indirect.infer.%s.tsv",
    "infers": "infers/infers.%s.tsv",
    "structured_literals": "structured_literals/structured.literals.%s.tsv",
    "nodes": "nodes/nodes.%s.tsv",
    "qnodes": "qnodes/qnodes.%s.tsv",
    "correct_temp_1": "correct_temp_1/correct_temp_1.type-constraints.instanceOf.%s.tsv",
    "correct_temp_2": "correct_temp_2/correct_temp_2.type-constraints.instanceOf.%s.tsv",
    "incorrect_temp": "incorrect_temp/incorrect_temp.type-constraints.instanceOf.%s.tsv",
    "correct": "correct/correct.type-constraints.instanceOf.%s.tsv",
    "incorrect": "incorrect/incorrect.type-constraints.instanceOf.%s.tsv",
}

# Make sure the directory of every template exists. This matters for
# "infers/", which is not part of output_list (that list only has "infer"),
# so a command writing to $<prop>_INFERS would otherwise hit a missing folder.
for template in output_file_templates.values():
    os.makedirs(os.path.join(output_path, os.path.dirname(template)), exist_ok=True)

for prop in property_dict.keys():
    # Same mapping the notebook always built: "<prop>_<suffix>" -> relative path.
    output_file_names = {
        "%s_%s" % (prop, suffix): template % prop
        for suffix, template in output_file_templates.items()
    }
    # print(output_file_names)
    for key, value in output_file_names.items():
        variable = key.upper()
        os.environ[variable] = os.path.join(output_path, value)
        kgtk_environment_variables.append(variable)

# kgtk_environment_variables.sort()
# for variable in kgtk_environment_variables:
#     print("{}: \"{}\"".format(variable, os.environ[variable]))

# Step 2: Wikidata Results

Generate a property file for each property using `kgtk filter` (filtering on the property). **Note:** This step has already been run in the background; because it is relatively time-consuming, it is better not to run it again.

In [9]:
# for prop, val in property_dict.items()[100:]:
#     command = "kgtk filter -i $CLAIMS -p \" ; %s ; \" -o $%s_RESULTS" % (prop, prop) 
#     # print(command)
#     code = os.system(command)
#     print(code, prop) 

In [10]:
# # code from Kartik
# with open(os.environ['CLAIMS'], 'r') as f: 
#     headerLine = next(f).decode("utf-8")
#     tstCount = 0
#     for line in tqdm(f):
#         line = line.decode("utf-8")
#         lineP = line.rstrip().split("\t")
#         if "external-id" in lineP[-1]:
#             continue
#         prop = lineP[2]
#         if prop not in propFileDict:
#             propFileDict[prop] = open("../../propertiesSplit_Final/claims."+str(prop)+".tsv","w")
#             propFileDict[prop].write(headerLine)
#         propFileDict[prop].write(line)
#         tstCount += 1
#         if tstCount == 10:
#             break
#     for file1 in propFileDict.values():
#         file1.close()

## Count known results in Wikidata database:

In [12]:
for prop in property_dict:
    # Row count of the per-property results file; one line is subtracted
    # (presumably the TSV header line).
    wc_output = subprocess.check_output("wc -l < $%s_RESULTS" % prop, shell=True)
    property_lines_count[prop]['Known'] = int(wc_output.decode("utf-8").strip()) - 1

    # Number of distinct subject qnodes in the same file. The count is read
    # from the second line of the kgtk output.
    count_query = "kgtk query -i $%s_RESULTS --match '(qnode)-[]->()' --return 'count(distinct qnode)'" % prop
    query_output = subprocess.check_output(count_query, shell=True).decode("utf-8")
    known_qnodes = int(query_output.strip().split('\n')[1])
    property_qnodes_count[prop]['Known'] = known_qnodes
    print("%s -> %d" % (prop, known_qnodes))

P1000 -> 174
P1001 -> 715765
P1002 -> 800
P1018 -> 154
P102 -> 392278
P1026 -> 7327
P1028 -> 4183
P1029 -> 795
P103 -> 106113
P1035 -> 506
P1037 -> 9215
P1038 -> 23053
P1039 -> 35
P1040 -> 45480
P1041 -> 65
P1049 -> 1643
P105 -> 2892666
P1050 -> 176482
P1057 -> 123
P106 -> 6339031
P1060 -> 186
P1064 -> 22745
P1066 -> 40075
P1068 -> 187
P1071 -> 49679
P1072 -> 1092
P1073 -> 658
P1075 -> 577
P1078 -> 74
P1079 -> 170
P1080 -> 25336
P110 -> 8499
P111 -> 2625
P112 -> 50161
P113 -> 2627
P1136 -> 88
P114 -> 275
P1142 -> 13170
P1145 -> 5775
P115 -> 30528
P1151 -> 1626
P1165 -> 1293
P1170 -> 3
P1171 -> 1
P118 -> 160085
P119 -> 180870
P1192 -> 12181
P1194 -> 1
P1199 -> 73
P1200 -> 2659
P1201 -> 400
P1202 -> 40
P121 -> 4709
P1210 -> 6
P1211 -> 25
P122 -> 1510
P1221 -> 2
P1227 -> 11
P123 -> 232053
P126 -> 302852
P127 -> 399788
P128 -> 9942
P1283 -> 1379
P129 -> 73
P1290 -> 547
P1299 -> 5953
P1303 -> 170552
P1304 -> 160
P1308 -> 5833
P131 -> 9788187
P1312 -> 635
P1313 -> 60741
P1318 -> 86
P1321 -> 

P5970 -> 10542
P5995 -> 231
P6 -> 23243
P6022 -> 1
P608 -> 11736
P6087 -> 6730
P609 -> 19140
P610 -> 5104
P6104 -> 10290
P611 -> 27481
P6112 -> 2884
P6116 -> 5
P6118 -> 9
P612 -> 748
P6149 -> 2
P6153 -> 107745
P6166 -> 102
P618 -> 1100
P6185 -> 2
P6186 -> 254
P6191 -> 3
P6193 -> 60
P6237 -> 15
P624 -> 191
P6241 -> 5034
P6243 -> 1
P6275 -> 77591
P629 -> 71496
P6338 -> 1195
P634 -> 332
P636 -> 740
P6364 -> 1355
P6365 -> 3303
P6379 -> 72650
P641 -> 1735485
P6426 -> 148
P6437 -> 33
P6440 -> 19
P647 -> 11593
P65 -> 53623
P6533 -> 5
P6534 -> 5
P654 -> 13
P655 -> 11305
P6569 -> 846
P658 -> 8959
P66 -> 1291
P660 -> 0
P6606 -> 195
P6609 -> 20
P664 -> 65064
P6684 -> 1
P669 -> 120796
P6718 -> 71
P6758 -> 341
P676 -> 26693
P6803 -> 25
P6819 -> 225
P6872 -> 1246
P6884 -> 2
P6885 -> 14654
P6886 -> 81545
P6889 -> 147
P69 -> 1432340
P690 -> 1868
P6902 -> 116
P694 -> 1047
P6942 -> 586
P6948 -> 7
P6977 -> 1302
P6978 -> 384
P7010 -> 4
P702 -> 0
P703 -> 271141
P7047 -> 1881
P706 -> 60981
P707 -> 57
P7078 

## Find unknown results in the Wikidata database:

Find the most frequent class and substitute the entities of the class as the whole entity set we're going to query. **Note:** Currently we don't apply this step.

In [None]:
# for prop, val in property_dict.items():
#     command = "kgtk query -i $%s_RESULTS $P31 \
#         --match 'r: (entity)-[]->(), P31: (entity)-[]->(class)' \
#         --return 'distinct entity as node1, \"P31\" as label, class as node2' \
#         -o $%s_CLASS" % (prop, prop)
#     print(command)
#     start = time.time()
#     code = os.system(command)
#     runtime = time.time() - start
#     print(code, runtime)
#     property_runtime[prop].append(runtime)

In [None]:
# entity_class_map = dict()
# for prop, _ in property_dict.items():
#     command = "kgtk query -i $%s_CLASS \
#         --match 'c: ()-[]->(class)' \
#         --return 'class, count(class) as N' \
#         --order-by 'N desc' \
#         --limit 1" % prop
#     start = time.time()
#     output = subprocess.check_output(command, shell=True)
#     runtime = time.time() - start
#     property_runtime[prop].append(runtime)
#     output = output.decode("utf-8").strip().split()
#     print("%s" % prop, output, runtime)
#     if len(output) == 4:
#         entity_class_map[prop] = output[2]

In [None]:
# class_entity_map = dict()
# for key, value in entity_class_map.items():
#     class_entity_map[value] = []
# for key, value in entity_class_map.items():
#     class_entity_map[value].append(key)
# class_entity_map

- Find all entities

In [None]:
# for cls, prop in class_entity_map.items():
#     command = "kgtk filter -i $P31 -p \";; %s\" -o $%s_ENTITY" % (cls, prop[0])
#     print(command)
#     # If ${val} already in folder, no need to query
#     # if os.path.exists(os.environ['%s_ENTITY' % prop[0]]):
#     #     print(0)
#     #     runtime = 0
#     #     propperty_runtime[prop].append(runtime)
#     # else:
#     start = time.time()
#     code = os.system(command)
#     runtime = time.time() - start
#     property_runtime[prop[0]].append(runtime)
#     print(code, runtime)
#     if len(prop) > 1:
#         for p in prop[1:]:
#             os.system('cp $%s_ENTITY $%s_ENTITY' % (prop[0], p))
#             property_runtime[p].append(runtime)

In [None]:
# directly filter
# !kgtk query -i $%s_CLASS $P31 \
#         --match 'c: (n)-[]->(class), P31: (entity)-[]->(class)' \
#         --where 'n != entity' \
#         --return 'distinct entity as node1, \"P31\" as label, class as node2' \
#         -o $%s_ENTITY" % (prop, prop)

- Eliminate entities that already have a known value for the property

In [None]:
# for prop, val in property_dict.items():
#     command = "kgtk ifnotexists -i $%s_ENTITY \
#         --filter-on $%s_RESULTS \
#         --input-keys node1 \
#         --filter-keys node1 \
#         -o $%s_QUERY_FILE" % (prop, prop, prop)
#     print(command)
#     # if os.path.exists(os.environ['%s_QUERY_FILE' % prop[0]]):
#     #     print(0)
#     # else:
#     #     code = os.system(command)
#     #     print(code)
#     start = time.time()
#     code = os.system(command)
#     runtime = time.time() - start
#     property_runtime[prop].append(runtime)
#     print(code, runtime)

## Count unknown results

- rows

In [None]:
# for prop, val in property_dict.items():
#     output = subprocess.check_output("wc -l < $%s_QUERY_FILE" % prop, shell=True)
#     output = output.decode("utf-8").strip()
#     output = int(output) - 1
#     print("%s: %s" % (prop, output))

- entities

In [None]:
# for prop, val in property_dict.items():
#     command = "kgtk query -i $%s_QUERY_FILE \
#         --match '(p)-[]->()' \
#         --return 'count(distinct p) as N'" % prop
#     output = subprocess.check_output(command, shell=True)
#     output = output.decode("utf-8").strip().split()[-1]
#     print("%s: %s" % (prop, output))

# Step 3 Selection of Additional KG(s)

Currently we use Wikidata Infobox generated from DBpedia.

# Step 4 Schema Alignment

## Entity resolution

Use query results from Wikidata database to infer properties in Wikidata infobox and return the most frequent property.

**Direct Infer:** Query for qnode;

In [13]:
# Copy at most 100000 edges of every per-property results file into the
# matching samples file.
sample_query_template = "kgtk query -i $%s_RESULTS --match '()-[]->()' --limit 100000 -o $%s_SAMPLES"

for prop in property_dict:
    exit_status = os.system(sample_query_template % (prop, prop))

In [14]:
# Join the sampled Wikidata edges with the infobox graph on (entity, value) and
# write the matching infobox edges to the per-property direct-infer file.
# The continuation lines are part of the string literal, so their leading
# blanks end up inside the shell command.
direct_infer_template = "kgtk query -i $%s_SAMPLES -i $WIKI_INFO \
        --match 's: (entity)-[]->(v), w: (entity)-[p]->(v)' \
        --return 'entity, p.label, v as node2' \
        -o $%s_DIRECT_INFER"

for prop in property_dict:
    shell_command = direct_infer_template % (prop, prop)
    started = time.time()
    exit_status = os.system(shell_command)
    elapsed = time.time() - started
    # Wall-clock seconds of this stage, stored per property.
    property_runtime[prop]['Entity Resolution'] = elapsed
    print(prop, exit_status, elapsed)

P1000 0 5.713055849075317
P1001 0 396.0106248855591
P1002 0 6.0309717655181885
P1018 0 5.898914098739624
P102 0 140.07747769355774
P1026 0 6.308619737625122
P1028 0 6.416730642318726
P1029 0 5.849759817123413
P103 0 251.4438920021057
P1035 0 6.09217643737793
P1037 0 6.384928464889526
P1038 0 7.352807283401489
P1039 0 5.534467458724976
P1040 0 7.085474729537964
P1041 0 5.051496267318726
P1049 0 6.670876979827881
P105 0 225.35661697387695
P1050 0 7.6225762367248535
P1057 0 5.447472810745239
P106 0 23.04977250099182
P1060 0 5.343763828277588
P1064 0 6.558681488037109
P1066 0 6.639030694961548
P1068 0 5.867770195007324
P1071 0 273.28448033332825
P1072 0 6.569829225540161
P1073 0 5.6456403732299805
P1075 0 5.419899940490723
P1078 0 5.879947900772095
P1079 0 5.8536787033081055
P1080 0 5.938830137252808
P110 0 6.146820068359375
P111 0 5.863587856292725
P112 0 11.773656368255615
P113 0 5.796530723571777
P1136 0 6.068910837173462
P114 0 5.616992235183716
P1142 0 11.534939289093018
P1145 0 5.558

P2882 0 6.748704195022583
P289 0 6.099083185195923
P2922 0 5.774558782577515
P2935 0 5.771280527114868
P2936 0 51.24226450920105
P2937 0 6.022082090377808
P2962 0 6.669083118438721
P2974 0 5.636828899383545
P2975 0 5.504502773284912
P2976 0 5.587708950042725
P2978 0 6.138958215713501
P2989 0 5.698786735534668
P2992 0 5.658933401107788
P30 0 43.57773494720459
P3005 0 6.919565200805664
P3014 0 5.704907655715942
P3015 0 5.727987766265869
P3018 0 5.869591951370239
P3019 0 5.795964241027832
P3022 0 5.947368860244751
P3025 0 5.512752294540405
P3026 0 5.895929574966431
P3027 0 6.2226011753082275
P3028 0 5.895323753356934
P3033 0 5.609557151794434
P3037 0 5.732429265975952
P306 0 60.75541114807129
P3075 0 7.231246709823608
P3080 0 5.720891952514648
P3085 0 5.968597412109375
P3092 0 5.618879079818726
P3096 0 5.924523830413818
P3103 0 5.463284969329834
P3137 0 5.881840705871582
P3150 0 5.773504972457886
P3156 0 5.3900017738342285
P3161 0 5.7927186489105225
P3173 0 6.147919416427612
P3174 0 6.271

P618 0 6.000166893005371
P6185 0 6.007590293884277
P6186 0 5.698055982589722
P6191 0 5.752790212631226
P6193 0 17.85539960861206
P6237 0 5.900480270385742
P624 0 6.054769039154053
P6241 0 6.213551998138428
P6243 0 6.13910436630249
P6275 0 6.514391899108887
P629 0 6.5283660888671875
P6338 0 5.825833082199097
P634 0 6.1293652057647705
P636 0 5.469071865081787
P6364 0 6.6104042530059814
P6365 0 5.9963908195495605
P6379 0 9.16944932937622
P641 0 145.15733289718628
P6426 0 6.6123878955841064
P6437 0 5.472076654434204
P6440 0 5.603970050811768
P647 0 11.20615029335022
P65 0 10.87767744064331
P6533 0 5.550556182861328
P6534 0 5.836568593978882
P654 0 5.542920827865601
P655 0 6.223320960998535
P6569 0 6.1361610889434814
P658 0 5.7568519115448
P66 0 9.756242990493774
P660 0 5.963173866271973
P6606 0 5.955073356628418
P6609 0 5.173492431640625
P664 0 10.665799379348755
P6684 0 6.04532527923584
P669 0 6.373992681503296
P6718 0 6.136835336685181
P6758 0 6.609835147857666
P676 0 8.207503318786621
P

## Property mapping

In [None]:
# For every Wikidata property, pick the label that occurs most often in its
# direct-infer file and record it in property_mapping (property id -> label).
property_mapping = dict()

# The query is assembled as ONE shell line: with shell=True a newline inside
# the command string ends the command, so options placed on following lines
# would be executed as separate (failing) commands.
mapping_query_template = (
    "kgtk query -i $%s_DIRECT_INFER"
    " --match '(q)-[p]->(v)'"
    " --return 'p.label, count(v) as N'"
    " --order-by 'N desc'"
    " --limit 1"
)

for prop in property_dict:
    command = mapping_query_template % prop
    print(command)
    start = time.time()
    # Raises subprocess.CalledProcessError when kgtk exits with a non-zero status.
    output = subprocess.check_output(command, shell=True)
    runtime = time.time() - start
    property_runtime[prop]['Property Mapping'] = runtime

    # Expected output: a header line followed by at most one data line of the
    # form "<label>\t<count>" (assumes kgtk emits tab-separated values).
    # Splitting the data line on tabs keeps labels that contain blanks intact.
    rows = output.decode("utf-8").strip().split('\n')
    print(rows, runtime)
    if len(rows) >= 2:
        fields = rows[1].split('\t')
        if len(fields) == 2 and fields[0]:
            property_mapping[prop] = fields[0]
            print("%s -> %s" % (prop, fields[0]))

In [None]:
# save property_mapping
with open(os.environ['PROPERTY_MAPPING'], 'w+') as f:
    json.dump(property_mapping, f, indent=4)

# Step 5 Results from other KG(s)

For entities that do not have a value for the property, query the Wikidata infobox:

In [16]:
# Extract from the infobox graph all edges whose label is the infobox property
# mapped to each Wikidata property, mirroring the way the Wikidata results
# files were produced with `kgtk filter`.
# (An earlier variant, for the case where an unknown-entity set exists, joined
# $<prop>_QUERY_FILE with $WIKI_INFO via `kgtk query` and wrote the distinct
# matches to $<prop>_NEW_RESULTS.)
infobox_filter_template = "kgtk filter -i $WIKI_INFO -p \" ; %s ; \" -o $%s_INFOBOX_RESULTS"

for prop, infobox_label in property_mapping.items():
    shell_command = infobox_filter_template % (infobox_label, prop)
    started = time.time()
    exit_status = os.system(shell_command)
    elapsed = time.time() - started
    # First contribution to this stage's time; a later step adds to it.
    property_runtime[prop]['Knowledge Retrieval'] = elapsed
    print(prop, exit_status, elapsed)

P1000 0 160.27169823646545
P1001 0 162.09388184547424
P1002 0 155.2455952167511
P1018 0 153.19074320793152
P102 0 159.53155636787415
P1028 0 151.18282532691956
P1029 0 156.90697622299194
P103 0 153.6838800907135
P1035 0 165.16186356544495
P1037 0 151.53013062477112
P1038 0 152.48442721366882
P1040 0 166.8406298160553
P1041 0 153.51759099960327
P1049 0 154.25958681106567
P1050 0 157.53289937973022
P106 0 153.39906311035156
P1064 0 156.83636045455933
P1066 0 154.01483631134033
P1068 0 154.01302909851074
P1071 0 156.0089054107666
P1072 0 158.99135279655457
P1073 0 150.5013234615326
P1075 0 155.17958974838257
P1078 0 152.87814497947693
P1079 0 163.33334517478943
P1080 0 161.23699021339417
P110 0 161.80069088935852
P111 0 155.30673956871033
P112 0 154.55348229408264
P113 0 157.86383366584778
P1136 0 151.2306432723999
P114 0 162.99566912651062
P1142 0 153.91269659996033
P115 0 156.89712691307068
P1165 0 157.9520571231842
P1170 0 161.97993540763855
P118 0 149.894198179245
P119 0 155.419747352

P457 0 156.1289839744568
P4584 0 156.89897441864014
P4586 0 158.48867273330688
P4608 0 155.82427859306335
P4614 0 157.64231204986572
P462 0 158.398540019989
P4647 0 156.93584418296814
P466 0 154.1048183441162
P4661 0 151.52267861366272
P467 0 153.6100947856903
P4688 0 156.94634366035461
P469 0 171.90177941322327
P47 0 150.92109560966492
P4743 0 154.37834429740906
P4788 0 150.92609810829163
P479 0 154.9432818889618
P4791 0 158.2100625038147
P4792 0 158.8613896369934
P485 0 161.580326795578
P488 0 162.14677929878235
P4884 0 152.59357929229736
P489 0 155.23743057250977
P4908 0 157.828280210495
P4913 0 150.9337694644928
P495 0 152.32429218292236
P50 0 151.1416494846344
P500 0 152.03899383544922
P5004 0 154.10592579841614
P5009 0 184.87013220787048
P501 0 155.6138083934784
P5028 0 152.78263330459595
P504 0 158.93728947639465
P505 0 153.06872487068176
P5051 0 161.8462724685669
P5052 0 154.65448808670044
P5053 0 158.39170217514038
P5054 0 153.61968445777893
P5059 0 165.87007474899292
P509 0 1

## Filter new results found from additional KG

In [17]:
# Keep only the infobox edges whose (node1, node2) pair is absent from the
# Wikidata results of the same property; the survivors are the new findings.
# The continuation lines are part of the string literal, so their leading
# blanks end up inside the shell command.
new_results_template = "kgtk ifnotexists -i $%s_INFOBOX_RESULTS \
        --filter-on $%s_RESULTS \
        --input-keys node1 node2 \
        --filter-keys node1 node2 \
        -o $%s_NEW_RESULTS"

for prop in property_mapping:
    shell_command = new_results_template % (prop, prop, prop)
    started = time.time()
    exit_status = os.system(shell_command)
    elapsed = time.time() - started
    # Added on top of whatever is already recorded for this stage.
    property_runtime[prop]['Knowledge Retrieval'] += elapsed
    print(prop, exit_status, elapsed)

P1000 0 12.49911117553711
P1001 0 18.52511239051819
P1002 0 5.808444023132324
P1018 0 5.485958814620972
P102 0 7.649545669555664
P1028 0 5.392573595046997
P1029 0 5.155886650085449
P103 0 7.581751585006714
P1035 0 5.757181167602539
P1037 0 5.856914043426514
P1038 0 6.163699150085449
P1040 0 6.340071439743042
P1041 0 5.695775508880615
P1049 0 6.077310562133789
P1050 0 5.831122875213623
P106 0 25.632703065872192
P1064 0 6.086587429046631
P1066 0 6.001864433288574
P1068 0 5.284717082977295
P1071 0 5.230486631393433
P1072 0 5.709000587463379
P1073 0 5.274458885192871
P1075 0 5.619153261184692
P1078 0 5.139584064483643
P1079 0 5.583566188812256
P1080 0 5.260922193527222
P110 0 5.431077480316162
P111 0 5.772254467010498
P112 0 6.097115993499756
P113 0 6.231533527374268
P1136 0 6.359338045120239
P114 0 5.561196804046631
P1142 0 6.042811393737793
P115 0 7.458459138870239
P1165 0 5.6483118534088135
P1170 0 5.099308013916016
P118 0 6.5024755001068115
P119 0 6.756683349609375
P1192 0 5.7781500816

P4661 0 5.723924398422241
P467 0 5.776277303695679
P4688 0 6.014869689941406
P469 0 6.406317234039307
P47 0 7.253904104232788
P4743 0 5.564932584762573
P4788 0 5.6683244705200195
P479 0 6.715718030929565
P4791 0 5.527331352233887
P4792 0 5.75880241394043
P485 0 6.90190315246582
P488 0 5.857424974441528
P4884 0 5.589831352233887
P489 0 5.719823122024536
P4908 0 5.841756343841553
P4913 0 6.095219135284424
P495 0 10.184398412704468
P50 0 9.526443481445312
P500 0 9.56888484954834
P5004 0 5.367166042327881
P5009 0 5.90079402923584
P501 0 5.500075578689575
P5028 0 5.635959625244141
P504 0 5.509397745132446
P505 0 5.901917934417725
P5051 0 5.295740842819214
P5052 0 6.101610898971558
P5053 0 6.187780141830444
P5054 0 5.565891265869141
P5059 0 5.759101152420044
P509 0 5.7514612674713135
P5095 0 5.856971502304077
P511 0 5.939015626907349
P512 0 6.453514337539673
P5125 0 11.309330463409424
P5132 0 5.730570316314697
P5138 0 6.26284384727478
P516 0 5.950165510177612
P517 0 5.210730075836182
P520 0 

- Count rows of new findings:

In [None]:
# for prop, val in property_mapping.items():
#     output = subprocess.check_output("wc -l < $%s_NEW_RESULTS" % prop, shell=True)
#     output = output.decode("utf-8").strip()
#     output = int(output) - 1
#     print("%s: %s" % (prop, output))

- Count unique entities of new findings:

In [None]:
# for prop, val in property_mapping.items():
#     command = "kgtk query -i $%s_NEW_RESULTS \
#         --match 'n: (p)-[]->()' \
#         --return 'count(distinct p) as N'" % prop
#     output = subprocess.check_output(command, shell=True)
#     output = output.decode("utf-8").strip().split()[1]
#     # print("%s: %s" % (prop, output))

## Filter out entities (rows) we still don't know

In [None]:
# for prop, val in property_mapping.items():
#     command = "kgtk ifnotexists -i $%s_QUERY_FILE \
#         --filter-on $%s_NEW_RESULTS \
#         --input-keys node1 \
#         --filter-keys node1 \
#         -o $%s_UNKNOWN" % (prop, prop, prop)
#     print(command)
#     start = time.time()
#     code = os.system(command)
#     runtime = time.time() - start
#     property_runtime[prop].append(runtime)
#     print(code, runtime)

- Count rows that are still unknown

In [None]:
# for prop, val in property_mapping.items():
#     output = subprocess.check_output("wc -l < $%s_UNKNOWN" % prop, shell=True)
#     output = output.decode("utf-8").strip()
#     output = int(output) - 1
#     print("%s: %s" % (prop, output))

- Count entities that are still unknown

In [None]:
# for prop, val in property_mapping.items():
#     command = "kgtk query -i $%s_UNKNOWN \
#         --match 'n: (p)-[]->()' \
#         --return 'count(distinct p) as N'" % prop
#     output = subprocess.check_output(command, shell=True)
#     output = output.decode("utf-8").strip().split()[1]
#     print("%s: %s" % (prop, output))

# Step 6 Datatype Filtering

## 1. Filter Structured literals:

In [18]:
# From the new findings, select the edges whose value is neither a
# language-qualified string nor a number and that carries a
# "dbpedia:structured_value" edge in the infobox graph.
# The continuation lines are part of the string literal, so their leading
# blanks end up inside the shell command.
structured_literals_template = "kgtk query -i $%s_NEW_RESULTS -i $WIKI_INFO \
        --match 'n: (q)-[p]->(s), w: (s)-[sv]->(v)' \
        --where 'NOT kgtk_lqstring(s) AND NOT kgtk_number(s) AND sv.label = \"dbpedia:structured_value\"' \
        --return 'q, p.label, s' \
        -o $%s_STRUCTURED_LITERALS"

for prop in property_mapping:
    shell_command = structured_literals_template % (prop, prop)
    started = time.time()
    exit_status = os.system(shell_command)
    elapsed = time.time() - started
    # First contribution to this stage's time; a later step adds to it.
    property_runtime[prop]['Datatype Filtering'] = elapsed
    print(prop, exit_status, elapsed)

P1000 0 17.083594799041748
P1001 0 40.15961480140686
P1002 0 6.4988439083099365
P1018 0 6.036976099014282
P102 0 7.322449207305908
P1028 0 8.120703935623169
P1029 0 6.4378650188446045
P103 0 7.83821964263916
P1035 0 6.352614164352417
P1037 0 6.986302137374878
P1038 0 7.848837375640869
P1040 0 7.182326793670654
P1041 0 6.356652021408081
P1049 0 6.201275587081909
P1050 0 5.995474815368652
P106 0 8.365049839019775
P1064 0 6.139243125915527
P1066 0 6.705004930496216
P1068 0 6.387531995773315
P1071 0 6.466960191726685
P1072 0 6.42833685874939
P1073 0 6.660416603088379
P1075 0 6.144593715667725
P1078 0 5.8477418422698975
P1079 0 6.1684088706970215
P1080 0 6.621407508850098
P110 0 6.012504577636719
P111 0 6.303067684173584
P112 0 6.152547836303711
P113 0 6.0387067794799805
P1136 0 6.333868503570557
P114 0 6.616801738739014
P1142 0 6.41654109954834
P115 0 13.277951955795288
P1165 0 6.1572041511535645
P1170 0 6.29489278793335
P118 0 6.960706949234009
P119 0 6.282413721084595
P1192 0 7.186585664

P4661 0 6.434786558151245
P467 0 6.2191832065582275
P4688 0 6.45853328704834
P469 0 7.358311653137207
P47 0 6.703794002532959
P4743 0 6.290258407592773
P4788 0 6.297308921813965
P479 0 6.34201717376709
P4791 0 6.3884289264678955
P4792 0 6.29217791557312
P485 0 8.378056049346924
P488 0 6.999622583389282
P4884 0 6.304764270782471
P489 0 6.209745168685913
P4908 0 7.027121543884277
P4913 0 6.653720140457153
P495 0 8.817869186401367
P50 0 6.789129972457886
P500 0 20.84809112548828
P5004 0 7.069577693939209
P5009 0 6.249457597732544
P501 0 6.5283684730529785
P5028 0 6.611844539642334
P504 0 6.199716567993164
P505 0 6.912050247192383
P5051 0 6.154475927352905
P5052 0 6.434097051620483
P5053 0 6.043999195098877
P5054 0 6.27667236328125
P5059 0 7.239223480224609
P509 0 6.062345743179321
P5095 0 6.304091453552246
P511 0 6.618006467819214
P512 0 7.240233898162842
P5125 0 14.808088541030884
P5132 0 6.2372870445251465
P5138 0 7.356372833251953
P516 0 6.923111438751221
P517 0 6.164275169372559
P520 

## 2. Filter Qnodes

In [19]:
# Two shell steps per property:
#   1. keep the distinct new findings whose value is neither a
#      language-qualified string nor a number        -> $<prop>_NODES
#   2. drop the rows that also occur in the structured-literals file
#                                                    -> $<prop>_QNODES
# The continuation lines are part of the string literals, so their leading
# blanks end up inside the shell commands.
nodes_template = "kgtk query -i $%s_NEW_RESULTS \
        --match 'n: (q)-[p]->(v)' \
        --where 'NOT kgtk_lqstring(v) AND NOT kgtk_number(v)' \
        --return 'distinct q, p.label, v' \
        -o $%s_NODES"
qnodes_template = "kgtk ifnotexists -i $%s_NODES \
        --filter-on $%s_STRUCTURED_LITERALS \
        -o $%s_QNODES"

for prop in property_mapping:
    nodes_command = nodes_template % (prop, prop)
    qnodes_command = qnodes_template % (prop, prop, prop)
    started = time.time()
    nodes_status = os.system(nodes_command)
    qnodes_status = os.system(qnodes_command)
    elapsed = time.time() - started
    # Both steps together are added to the time already recorded for the stage.
    property_runtime[prop]['Datatype Filtering'] += elapsed
    print(prop, nodes_status, qnodes_status, elapsed)

P1000 0 0 17.4754056930542
P1001 0 0 15.247225284576416
P1002 0 0 11.08524227142334
P1018 0 0 10.916612386703491
P102 0 0 11.683358430862427
P1028 0 0 10.42996096611023
P1029 0 0 10.306978702545166
P103 0 0 11.442277431488037
P1035 0 0 13.814148664474487
P1037 0 0 11.09455680847168
P1038 0 0 13.014771699905396
P1040 0 0 10.932515859603882
P1041 0 0 11.43230128288269
P1049 0 0 10.996284008026123
P1050 0 0 11.798854351043701
P106 0 0 11.954849481582642
P1064 0 0 10.55661916732788
P1066 0 0 10.85793423652649
P1068 0 0 10.231197834014893
P1071 0 0 10.368257999420166
P1072 0 0 10.660126447677612
P1073 0 0 10.429637908935547
P1075 0 0 10.28878903388977
P1078 0 0 10.412736177444458
P1079 0 0 10.110423803329468
P1080 0 0 10.335355520248413
P110 0 0 10.385573387145996
P111 0 0 10.861268520355225
P112 0 0 10.703025817871094
P113 0 0 10.519324779510498
P1136 0 0 10.350848913192749
P114 0 0 10.483699321746826
P1142 0 0 10.31415867805481
P115 0 0 13.844179630279541
P1165 0 0 10.170220375061035
P117

P410 0 0 11.082851648330688
P4100 0 0 11.892560243606567
P411 0 0 11.171097755432129
P412 0 0 12.244014024734497
P413 0 0 11.540083408355713
P414 0 0 11.339623212814331
P415 0 0 10.497602939605713
P417 0 0 10.849666118621826
P418 0 0 10.535200595855713
P421 0 0 13.458669900894165
P425 0 0 11.459404230117798
P427 0 0 10.98886775970459
P4290 0 0 11.03586721420288
P4322 0 0 10.447067499160767
P4345 0 0 10.24266505241394
P4353 0 0 10.521888256072998
P4379 0 0 10.983793020248413
P4428 0 0 11.461579084396362
P449 0 0 11.169199228286743
P450 0 0 10.820465326309204
P451 0 0 11.252639532089233
P452 0 0 11.791334390640259
P4552 0 0 11.387161493301392
P457 0 0 12.042363405227661
P4584 0 0 14.13325047492981
P4586 0 0 15.72756290435791
P4608 0 0 12.19303297996521
P4614 0 0 11.227904558181763
P462 0 0 11.432413816452026
P4647 0 0 11.29461145401001
P466 0 0 11.56182312965393
P4661 0 0 11.200274467468262
P467 0 0 10.976743221282959
P4688 0 0 10.850461721420288
P469 0 0 10.762062311172485
P47 0 0 11.04

In [20]:
for prop in property_mapping:
    # Row count of the per-property qnodes file; one line is subtracted
    # (presumably the TSV header line).
    wc_output = subprocess.check_output("wc -l < $%s_QNODES" % prop, shell=True)
    found_rows = int(wc_output.decode("utf-8").strip()) - 1
    property_lines_count[prop]['Found Total (Qnodes)'] = found_rows
    print("%s -> %d" % (prop, found_rows))

    # Number of distinct subject qnodes in the same file. The count is read
    # from the second line of the kgtk output.
    count_query = "kgtk query -i $%s_QNODES --match '(qnode)-[]->()' --return 'count(distinct qnode)'" % prop
    query_output = subprocess.check_output(count_query, shell=True).decode("utf-8")
    property_qnodes_count[prop]['Found Total (Qnodes)'] = int(query_output.strip().split('\n')[1])

P1000 -> 287123
P1001 -> 294142
P1002 -> 544
P1018 -> 753
P102 -> 37673
P1028 -> 15733
P1029 -> 82
P103 -> 68189
P1035 -> 17254
P1037 -> 4777
P1038 -> 18203
P1040 -> 7789
P1041 -> 178
P1049 -> 5123
P1050 -> 2656
P106 -> 49557
P1064 -> 95
P1066 -> 8018
P1068 -> 234
P1071 -> 21447
P1072 -> 619
P1073 -> 451
P1075 -> 114
P1078 -> 154
P1079 -> 1015
P1080 -> 17447
P110 -> 903
P111 -> 32
P112 -> 6727
P113 -> 1136
P1136 -> 21
P114 -> 1144
P1142 -> 16973
P115 -> 352829
P1165 -> 269
P1170 -> 78
P118 -> 50137
P119 -> 4929
P1192 -> 26928
P1194 -> 78
P1201 -> 2023
P1202 -> 1867
P121 -> 1368
P1211 -> 533
P122 -> 11074
P123 -> 23737
P126 -> 1979
P127 -> 24751
P1283 -> 264
P1290 -> 18780
P1299 -> 6176
P1303 -> 13781
P1308 -> 5176
P131 -> 636562
P1313 -> 38066
P1318 -> 15
P1321 -> 809309
P1322 -> 12
P1327 -> 19
P1336 -> 986457
P1343 -> 287003
P1344 -> 1946
P1346 -> 7727
P137 -> 14219
P1383 -> 17133
P1398 -> 174
P1399 -> 812
P1408 -> 86766
P1411 -> 2714
P1412 -> 66573
P1414 -> 361
P1419 -> 22212
P1427 -

# Step 7 Quality Checking

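Agree: for each property, keep the statements whose (node, value) pair appears in both the `_RESULTS` file and the `_INFOBOX_RESULTS` file.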

In [21]:
# Loop-invariant query: an edge (qnode, value) is kept when it is present in
# both input graphs; the edge label is taken from the first graph. The three
# placeholders are all filled with the property id.
agree_template = "kgtk query -i $%s_RESULTS $%s_INFOBOX_RESULTS \
        --match 'r: (qnode)-[p]->(value), i: (qnode)-[]->(value)' \
        --return 'qnode as `node1`, p.label as `label`, value as `node2`' \
        -o $%s_AGREE"

for prop, val in property_mapping.items():
    start = time.time()
    command = agree_template % ((prop,) * 3)
    # os.system returns the shell's exit status; it is only reported, not checked.
    code = os.system(command)
    runtime = time.time() - start
    print(prop, code, runtime)

P1000 0 13.245733499526978
P1001 0 19.286831617355347
P1002 0 6.13214898109436
P1018 0 6.40873122215271
P102 0 8.592373132705688
P1028 0 6.364440202713013
P1029 0 5.927130937576294
P103 0 7.391772508621216
P1035 0 6.685913324356079
P1037 0 6.574083566665649
P1038 0 6.808195114135742
P1040 0 7.120340347290039
P1041 0 6.6596269607543945
P1049 0 6.442839860916138
P1050 0 6.986105918884277
P106 0 46.8838574886322
P1064 0 6.844204425811768
P1066 0 6.810074090957642
P1068 0 6.0859363079071045
P1071 0 6.66681432723999
P1072 0 6.373327255249023
P1073 0 6.1090404987335205
P1075 0 6.335498094558716
P1078 0 5.8501808643341064
P1079 0 6.043724298477173
P1080 0 6.456425189971924
P110 0 6.9972429275512695
P111 0 6.231842756271362
P112 0 6.64064621925354
P113 0 6.503586053848267
P1136 0 6.178228855133057
P114 0 6.062530040740967
P1142 0 6.154005289077759
P115 0 6.983297824859619
P1165 0 5.907005310058594
P1170 0 5.669863700866699
P118 0 7.860069751739502
P119 0 7.219202756881714
P1192 0 6.05976104736

P467 0 6.145544528961182
P4688 0 6.402982473373413
P469 0 6.140318393707275
P47 0 7.283470869064331
P4743 0 6.480538845062256
P4788 0 6.565171718597412
P479 0 6.49233603477478
P4791 0 6.416461229324341
P4792 0 6.420281648635864
P485 0 6.650566339492798
P488 0 5.877025604248047
P4884 0 6.309738636016846
P489 0 6.209417819976807
P4908 0 6.838604927062988
P4913 0 6.649653673171997
P495 0 9.747446060180664
P50 0 8.546287298202515
P500 0 9.612594604492188
P5004 0 6.410115480422974
P5009 0 6.438868522644043
P501 0 6.3754448890686035
P5028 0 6.934117317199707
P504 0 6.4257166385650635
P505 0 6.261078834533691
P5051 0 6.472326040267944
P5052 0 6.7456114292144775
P5053 0 6.699941635131836
P5054 0 6.465460538864136
P5059 0 6.088850021362305
P509 0 6.621518135070801
P5095 0 6.403053045272827
P511 0 6.392791032791138
P512 0 6.891373872756958
P5125 0 12.162229537963867
P5132 0 6.459616184234619
P5138 0 7.202622652053833
P516 0 6.3544628620147705
P517 0 6.393319606781006
P520 0 6.534297227859497
P52

In [22]:
# For every property, record the row count and the distinct-subject count of
# the file behind $<prop>_AGREE.
for prop in property_mapping:
    # NOTE(review): a guard for a missing AGREE file was drafted here but is disabled:
    # if not os.path.exists(os.environ["$%s_AGREE" % prop]):
    #     property_lines_count[prop]['Agree'] = 0
    #     property_qnodes_count[prop]['Agree'] = 0
    #     break
    # Before re-enabling it: the environment key must not carry the leading "$"
    # (the shell commands below expand the variable named <prop>_AGREE), and
    # `break` would end the whole loop at the first missing file instead of
    # skipping just that property.

    # Row count: `wc -l` minus one (presumably the header row).
    wc_output = subprocess.check_output("wc -l < $%s_AGREE" % prop, shell=True)
    lines = int(wc_output.decode("utf-8").strip()) - 1
    property_lines_count[prop]['Agree'] = lines
    print("%s -> %d" % (prop, lines))

    # Subject count: read from the second line of the query output.
    command = "kgtk query -i $%s_AGREE --match '(qnode)-[]->()' --return 'count(distinct qnode)'" % prop
    query_output = subprocess.check_output(command, shell=True).decode("utf-8")
    nodes = int(query_output.strip().split('\n')[1])
    property_qnodes_count[prop]['Agree'] = nodes

P1000 -> 2
P1001 -> 59
P1002 -> 26
P1018 -> 59
P102 -> 82983
P1028 -> 3
P1029 -> 1409
P103 -> 255
P1035 -> 144
P1037 -> 167
P1038 -> 579
P1040 -> 12853
P1041 -> 20
P1049 -> 20
P1050 -> 29
P106 -> 43815
P1064 -> 37
P1066 -> 453
P1068 -> 43
P1071 -> 2329
P1072 -> 5
P1073 -> 9
P1075 -> 37
P1078 -> 2
P1079 -> 108
P1080 -> 107
P110 -> 1354
P111 -> 100
P112 -> 9009
P113 -> 1344
P1136 -> 6
P114 -> 189
P1142 -> 6183
P115 -> 6201
P1165 -> 36
P1170 -> 1
P118 -> 8873
P119 -> 4974
P1192 -> 86
P1194 -> 1
P1201 -> 7
P1202 -> 5
P121 -> 104
P1211 -> 6
P122 -> 468
P123 -> 36170
P126 -> 3330
P127 -> 20202
P1283 -> 102
P1290 -> 1
P1299 -> 19
P1303 -> 7516
P1308 -> 901
P131 -> 350328
P1313 -> 315
P1318 -> 12
P1321 -> 30
P1322 -> 28
P1327 -> 1017
P1336 -> 10
P1343 -> 122
P1344 -> 551
P1346 -> 9219
P137 -> 22285
P1383 -> 55
P1398 -> 3
P1399 -> 403
P1408 -> 14112
P1411 -> 485
P1412 -> 1871
P1414 -> 48
P1419 -> 1
P1427 -> 1239
P1429 -> 4
P1431 -> 691
P1433 -> 287
P1434 -> 21
P1435 -> 34
P1441 -> 2898
P1444 ->

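Semantic Checking: a statement is counted as correct when its value (`node2`) reaches one of the property's allowed classes, either through P31 followed by P279star or directly through P279star; every remaining statement is written to the incorrect file.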

In [23]:
# Split each property's $<prop>_QNODES edges into $<prop>_CORRECT and
# $<prop>_INCORRECT in five shell steps. Exit codes are printed, not checked.
for prop, val in property_mapping.items():
    start = time.time()
    allowed_classes = property_dict[prop]   # interpolated verbatim into the --where clauses
    prop_x3 = (prop,) * 3

    # Step 1: edges whose node2 has a P31 edge to some nodex, and nodex has a
    # P279star edge to one of the allowed classes.
    command_1 = "kgtk query -i $%s_QNODES $P31 $P279STAR \
        --match 'q: (node1)-[nodeProp]->(node2), P31: (node2)-[]->(nodex), P279star: (nodex)-[]->(par)' \
        --where 'par in %s' \
        --return 'distinct node1 as `node1`, nodeProp.label as `label`, node2 as `node2`' \
        -o $%s_CORRECT_TEMP_1" % (prop, allowed_classes, prop)
    code_1 = os.system(command_1)

    # Step 2: everything in QNODES whose (node1, node2) pair was not accepted by step 1.
    command_2 = "kgtk ifnotexists -i $%s_QNODES \
        --filter-on $%s_CORRECT_TEMP_1 \
        --input-keys node1 node2 \
        --filter-keys node1 node2 \
        -o $%s_INCORRECT_TEMP" % prop_x3
    code_2 = os.system(command_2)

    # Step 3: second chance for the leftovers - node2 itself has a P279star
    # edge to one of the allowed classes.
    command_3 = "kgtk query -i $%s_INCORRECT_TEMP $P279STAR \
        --match 'i: (node1)-[nodeProp]->(node2), P279star: (node2)-[]->(par)' \
        --where 'par in %s' \
        --return 'distinct node1 as `node1`, nodeProp.label as `label`, node2 as `node2`' \
        -o $%s_CORRECT_TEMP_2" % (prop, allowed_classes, prop)
    code_3 = os.system(command_3)

    # Step 4: leftovers that step 3 did not accept either are the final incorrect set.
    command_4 = "kgtk ifnotexists -i $%s_INCORRECT_TEMP \
        --filter-on $%s_CORRECT_TEMP_2 \
        --input-keys node1 node2 \
        --filter-keys node1 node2 \
        -o $%s_INCORRECT" % prop_x3
    code_4 = os.system(command_4)

    # Step 5: the final correct set is the concatenation of both accepted files.
    command_5 = "kgtk cat -i $%s_CORRECT_TEMP_1 $%s_CORRECT_TEMP_2 \
        -o $%s_CORRECT" % prop_x3
    code_5 = os.system(command_5)

    # Wall-clock time of all five steps together.
    runtime = time.time() - start
    property_runtime[prop]['Semantic Validation'] = runtime
    print(prop, code_1, code_2, code_3, code_4, code_5, runtime)

P1000 0 0 0 0 0 56.833958864212036
P1001 0 0 0 0 0 49.48102951049805
P1002 0 0 0 0 0 28.69264268875122
P1018 0 0 0 0 0 28.302552461624146
P102 0 0 0 0 0 30.504727363586426
P1028 0 0 0 0 0 30.38375186920166
P1029 0 0 0 0 0 29.00627851486206
P103 0 0 0 0 0 47.099982500076294
P1035 0 0 0 0 0 29.491548538208008
P1037 0 0 0 0 0 29.474849462509155
P1038 0 0 0 0 0 30.8555850982666
P1040 0 0 0 0 0 28.766189098358154
P1041 0 0 0 0 0 29.326995849609375
P1049 0 0 0 0 0 29.4282865524292
P1050 0 0 0 0 0 28.820223569869995
P106 0 0 0 0 0 31.7416889667511
P1064 0 0 0 0 0 28.858437538146973
P1066 0 0 0 0 0 29.442007780075073
P1068 0 0 0 0 0 28.936874628067017
P1071 0 0 0 0 0 29.568490505218506
P1072 0 0 0 0 0 29.143216609954834
P1073 0 0 0 0 0 28.95482611656189
P1075 0 0 0 0 0 28.884921550750732
P1078 0 0 0 0 0 29.00248670578003
P1079 0 0 0 0 0 28.46103072166443
P1080 0 0 0 0 0 30.56196165084839
P110 0 0 0 0 0 34.503556966781616
P111 0 0 0 0 0 36.83008599281311
P112 0 0 0 0 0 28.90373921394348
P113 0 

P3262 0 0 0 0 0 29.03734517097473
P3300 0 0 0 0 0 28.389108180999756
P3320 0 0 0 0 0 28.68528175354004
P3342 0 0 0 0 0 29.09327268600464
P3349 0 0 0 0 0 28.39120841026306
P3373 0 0 0 0 0 29.278180599212646
P3438 0 0 0 0 0 50.792914152145386
P344 0 0 0 0 0 42.09172463417053
P3448 0 0 0 0 0 40.64869976043701
P3460 0 0 0 0 0 38.681941986083984
P3490 0 0 0 0 0 41.375550508499146
P3491 0 0 0 0 0 40.541178703308105
P35 0 0 0 0 0 39.65641188621521
P3501 0 0 0 0 0 37.3771026134491
P355 0 0 0 0 0 39.2958927154541
P358 0 0 0 0 0 39.768956899642944
P36 0 0 0 0 0 41.90206694602966
P3602 0 0 0 0 0 40.92354726791382
P364 0 0 0 0 0 37.93135118484497
P3679 0 0 0 0 0 39.954503297805786
P37 0 0 0 0 0 38.88538074493408
P371 0 0 0 0 0 40.12200212478638
P3716 0 0 0 0 0 43.468021631240845
P3719 0 0 0 0 0 39.040775775909424
P375 0 0 0 0 0 38.78631138801575
P376 0 0 0 0 0 40.112327098846436
P38 0 0 0 0 0 39.52184224128723
P3828 0 0 0 0 0 37.14048194885254
P3842 0 0 0 0 0 39.72037124633789
P39 0 0 0 0 0 47.978

P8345 0 0 0 0 0 37.11795234680176
P837 0 0 0 0 0 39.09418988227844
P84 0 0 0 0 0 38.12434148788452
P8450 0 0 0 0 0 39.76557374000549
P8453 0 0 0 0 0 37.733155727386475
P85 0 0 0 0 0 38.062098026275635
P859 0 0 0 0 0 39.346091985702515
P86 0 0 0 0 0 40.79762816429138
P8646 0 0 0 0 0 41.809457778930664
P8670 0 0 0 0 0 45.61902737617493
P87 0 0 0 0 0 40.40358376502991
P870 0 0 0 0 0 42.74241065979004
P8702 0 0 0 0 0 42.760618925094604
P872 0 0 0 0 0 40.105467796325684
P88 0 0 0 0 0 40.03835201263428
P880 0 0 0 0 0 40.97867774963379
P885 0 0 0 0 0 37.50162672996521
P91 0 0 0 0 0 40.664841413497925
P915 0 0 0 0 0 58.33741998672485
P92 0 0 0 0 0 38.663697242736816
P923 0 0 0 0 0 39.09472942352295
P930 0 0 0 0 0 38.905149698257446
P931 0 0 0 0 0 40.893287658691406
P937 0 0 0 0 0 51.29282593727112
P942 0 0 0 0 0 40.38119959831238
P945 0 0 0 0 0 40.10890054702759
P97 0 0 0 0 0 50.67022156715393
P974 0 0 0 0 0 41.423251152038574
P98 0 0 0 0 0 41.36071157455444
P991 0 0 0 0 0 42.79798746109009


In [24]:
# For every property, record the row count and the distinct-subject count of
# the file behind $<prop>_CORRECT.
for prop in property_mapping:
    # Row count: `wc -l` minus one (presumably the header row).
    wc_output = subprocess.check_output("wc -l < $%s_CORRECT" % prop, shell=True)
    lines = int(wc_output.decode("utf-8").strip()) - 1
    property_lines_count[prop]['Correct'] = lines
    print("%s -> %d" % (prop, lines))

    # Subject count: read from the second line of the query output.
    command = "kgtk query -i $%s_CORRECT --match '(qnode)-[]->()' --return 'count(distinct qnode)'" % prop
    query_output = subprocess.check_output(command, shell=True).decode("utf-8")
    nodes = int(query_output.strip().split('\n')[1])
    property_qnodes_count[prop]['Correct'] = nodes

P1000 -> 68
P1001 -> 1519
P1002 -> 439
P1018 -> 667
P102 -> 35310
P1028 -> 15277
P1029 -> 82
P103 -> 65552
P1035 -> 8912
P1037 -> 4764
P1038 -> 17704
P1040 -> 7478
P1041 -> 105
P1049 -> 4754
P1050 -> 1365
P106 -> 25482
P1064 -> 68
P1066 -> 7984
P1068 -> 181
P1071 -> 15424
P1072 -> 12
P1073 -> 34
P1075 -> 94
P1078 -> 3
P1079 -> 481
P1080 -> 394
P110 -> 869
P111 -> 21
P112 -> 6454
P113 -> 655
P1136 -> 21
P114 -> 45
P1142 -> 14485
P115 -> 242695
P1165 -> 130
P1170 -> 45
P118 -> 38968
P119 -> 4822
P1192 -> 3
P1194 -> 45
P1201 -> 228
P1202 -> 12
P121 -> 1353
P1211 -> 91
P122 -> 3617
P123 -> 22774
P126 -> 1671
P127 -> 22220
P1283 -> 0
P1290 -> 18166
P1299 -> 22
P1303 -> 12128
P1308 -> 5011
P131 -> 599002
P1313 -> 26133
P1318 -> 15
P1321 -> 2044
P1322 -> 7
P1327 -> 19
P1336 -> 319741
P1343 -> 211004
P1344 -> 1597
P1346 -> 7699
P137 -> 11564
P1383 -> 14664
P1398 -> 112
P1399 -> 750
P1408 -> 86143
P1411 -> 2393
P1412 -> 63936
P1414 -> 2
P1419 -> 1
P1427 -> 2021
P1429 -> 524
P1431 -> 6941
P1433 

In [25]:
# For every property, record the row count and the distinct-subject count of
# the file behind $<prop>_INCORRECT.
for prop in property_mapping:
    # Row count: `wc -l` minus one (presumably the header row).
    wc_output = subprocess.check_output("wc -l < $%s_INCORRECT" % prop, shell=True)
    lines = int(wc_output.decode("utf-8").strip()) - 1
    property_lines_count[prop]['Incorrect'] = lines
    print("%s -> %d" % (prop, lines))

    # Subject count: read from the second line of the query output.
    command = "kgtk query -i $%s_INCORRECT --match '(qnode)-[]->()' --return 'count(distinct qnode)'" % prop
    query_output = subprocess.check_output(command, shell=True).decode("utf-8")
    nodes = int(query_output.strip().split('\n')[1])
    property_qnodes_count[prop]['Incorrect'] = nodes

P1000 -> 287055
P1001 -> 292623
P1002 -> 105
P1018 -> 86
P102 -> 2363
P1028 -> 456
P1029 -> 0
P103 -> 2637
P1035 -> 8342
P1037 -> 13
P1038 -> 499
P1040 -> 311
P1041 -> 73
P1049 -> 369
P1050 -> 1291
P106 -> 24075
P1064 -> 27
P1066 -> 34
P1068 -> 53
P1071 -> 6023
P1072 -> 607
P1073 -> 417
P1075 -> 20
P1078 -> 151
P1079 -> 534
P1080 -> 17053
P110 -> 34
P111 -> 11
P112 -> 273
P113 -> 481
P1136 -> 0
P114 -> 1099
P1142 -> 2488
P115 -> 110134
P1165 -> 139
P1170 -> 33
P118 -> 11169
P119 -> 107
P1192 -> 26925
P1194 -> 33
P1201 -> 1795
P1202 -> 1855
P121 -> 15
P1211 -> 442
P122 -> 7457
P123 -> 963
P126 -> 308
P127 -> 2531
P1283 -> 264
P1290 -> 614
P1299 -> 6154
P1303 -> 1653
P1308 -> 165
P131 -> 37560
P1313 -> 11933
P1318 -> 0
P1321 -> 807265
P1322 -> 5
P1327 -> 0
P1336 -> 666716
P1343 -> 75999
P1344 -> 349
P1346 -> 28
P137 -> 2655
P1383 -> 2469
P1398 -> 62
P1399 -> 62
P1408 -> 623
P1411 -> 321
P1412 -> 2637
P1414 -> 359
P1419 -> 22211
P1427 -> 134
P1429 -> 7380
P1431 -> 269
P1433 -> 20
P1434 ->

In [26]:
# Sanity check: for every property the CORRECT and INCORRECT files together
# must hold exactly as many rows as the QNODES file. Each count is re-read
# from disk with `wc -l`, minus one (presumably the header row).
for prop, val in property_mapping.items():
    raw_correct = subprocess.check_output("wc -l < $%s_CORRECT" % prop, shell=True)
    correct_lines = int(raw_correct.decode("utf-8").strip()) - 1

    raw_incorrect = subprocess.check_output("wc -l < $%s_INCORRECT" % prop, shell=True)
    incorrect_lines = int(raw_incorrect.decode("utf-8").strip()) - 1

    raw_total = subprocess.check_output("wc -l < $%s_QNODES" % prop, shell=True)
    qnode_lines = int(raw_total.decode("utf-8").strip()) - 1

    # Printed before the assertion so the failing property is visible in the output.
    print("%s: correct %d; incorrect %d; total %d" % (prop, correct_lines, incorrect_lines, qnode_lines))

    assert qnode_lines == correct_lines + incorrect_lines, "The sum is not correct!"

P1000: correct 68; incorrect 287055; total 287123
P1001: correct 1519; incorrect 292623; total 294142
P1002: correct 439; incorrect 105; total 544
P1018: correct 667; incorrect 86; total 753
P102: correct 35310; incorrect 2363; total 37673
P1028: correct 15277; incorrect 456; total 15733
P1029: correct 82; incorrect 0; total 82
P103: correct 65552; incorrect 2637; total 68189
P1035: correct 8912; incorrect 8342; total 17254
P1037: correct 4764; incorrect 13; total 4777
P1038: correct 17704; incorrect 499; total 18203
P1040: correct 7478; incorrect 311; total 7789
P1041: correct 105; incorrect 73; total 178
P1049: correct 4754; incorrect 369; total 5123
P1050: correct 1365; incorrect 1291; total 2656
P106: correct 25482; incorrect 24075; total 49557
P1064: correct 68; incorrect 27; total 95
P1066: correct 7984; incorrect 34; total 8018
P1068: correct 181; incorrect 53; total 234
P1071: correct 15424; incorrect 6023; total 21447
P1072: correct 12; incorrect 607; total 619
P1073: correct 

P2408: correct 534868; incorrect 7030; total 541898
P241: correct 12105; incorrect 2958; total 15063
P2416: correct 4376; incorrect 4997; total 9373
P25: correct 677; incorrect 258; total 935
P2505: correct 77; incorrect 306; total 383
P2522: correct 9560; incorrect 276011; total 285571
P2541: correct 996; incorrect 25; total 1021
P2545: correct 0; incorrect 801; total 801
P2546: correct 438; incorrect 9831; total 10269
P2550: correct 128566; incorrect 158558; total 287124
P2560: correct 20; incorrect 38; total 58
P2563: correct 93; incorrect 108; total 201
P2564: correct 3193; incorrect 2090; total 5283
P2579: correct 16896; incorrect 10283; total 27179
P2596: correct 732; incorrect 284; total 1016
P263: correct 688; incorrect 30854; total 31542
P2634: correct 2810; incorrect 19; total 2829
P264: correct 63233; incorrect 19228; total 82461
P2643: correct 0; incorrect 184230; total 184230
P2647: correct 796; incorrect 59; total 855
P2652: correct 14; incorrect 2870; total 2884
P2673: c

P5138: correct 77192; incorrect 3482; total 80674
P516: correct 3810; incorrect 1889; total 5699
P517: correct 28; incorrect 24; total 52
P520: correct 468; incorrect 140; total 608
P5202: correct 226809; incorrect 715; total 227524
P521: correct 22; incorrect 1891; total 1913
P522: correct 1459; incorrect 0; total 1459
P523: correct 826; incorrect 5330; total 6156
P524: correct 826; incorrect 5329; total 6155
P5249: correct 23311; incorrect 35596; total 58907
P53: correct 1741; incorrect 1749; total 3490
P530: correct 271; incorrect 7173; total 7444
P532: correct 525; incorrect 336; total 861
P5326: correct 98; incorrect 2586; total 2684
P534: correct 1; incorrect 0; total 1
P5353: correct 83; incorrect 11941; total 12024
P538: correct 1; incorrect 18; total 19
P5389: correct 710566; incorrect 98691; total 809257
P54: correct 142451; incorrect 2816; total 145267
P541: correct 55435; incorrect 230574; total 286009
P542: correct 63; incorrect 49; total 112
P543: correct 44; incorrect 0;

# Step (8). Output statistics

## Runtime

In [None]:
# NOTE(review): disabled code. It treats every property_runtime entry as a
# list (padding it to 7 slots, then summing slots 4 and 5), whereas the active
# cells index property_runtime[prop] by stage name. Presumably a leftover from
# an earlier data layout - confirm before deleting.
# runtime_copy = property_runtime.copy()
# for k, v in runtime_copy.items():
#     if len(runtime_copy[k]) < 7:
#         new_list = [0] * 7
#         for i, t in enumerate(runtime_copy[k]):
#             new_list[i] = runtime_copy[k][i]
#         runtime_copy[k] = new_list
#     combined_list = runtime_copy[k][:4] + [runtime_copy[k][4] + runtime_copy[k][5]] + runtime_copy[k][6:]
#     runtime_copy[k] = combined_list
# runtime_copy

In [None]:
# property_runtime

In [27]:
# Disabled CSV export, kept for reference:
# df = pd.DataFrame.from_dict(runtime_copy, orient='index', 
#                             columns=['Entity Resolution', 'Property Mapping', 'Query Wikidata Infobox', 'Filter new results', 'Datatype Filtering', 'Quality Checking'])
# df.to_csv('/nas/home/bohuizha/KG/hunger-for-knowledge/batch_output/runtime.csv', sep=',')

# Write the collected runtimes as indented, non-ASCII-escaped JSON to the
# path held in the RUNTIME environment variable.
runtime_json_path = os.environ['RUNTIME']
with open(runtime_json_path, mode='w', encoding='utf-8') as f:
    json.dump(property_runtime, fp=f, indent=4, ensure_ascii=False)

## Results count

In [None]:
# NOTE(review): disabled code. It pads list-valued property_results_count
# entries to 4 slots; the active cells write to property_lines_count and
# property_qnodes_count instead. Presumably a leftover - confirm before deleting.
# count_copy = property_results_count.copy()
# for k, v in count_copy.items():
#     if len(count_copy[k]) < 4:
#         new_list = [0] * 4
#         for i, t in enumerate(count_copy[k]):
#             new_list[i] = count_copy[k][i]
#         count_copy[k] = new_list
# count_copy

In [28]:
# Write the per-property row counts as indented, non-ASCII-escaped JSON to the
# path held in the LINE_STATISTICS environment variable.
line_stats_path = os.environ['LINE_STATISTICS']
with open(line_stats_path, mode='w', encoding='utf-8') as f:
    json.dump(property_lines_count, fp=f, indent=4, ensure_ascii=False)

In [29]:
# Write the per-property distinct-subject counts as indented,
# non-ASCII-escaped JSON to the path held in the QNODE_STATISTICS environment variable.
qnode_stats_path = os.environ['QNODE_STATISTICS']
with open(qnode_stats_path, mode='w', encoding='utf-8') as f:
    json.dump(property_qnodes_count, fp=f, indent=4, ensure_ascii=False)

In [None]:
# NOTE(review): disabled CSV export of count_copy, a name that is only
# assigned in commented-out code. It also hard-codes an absolute output path.
# Presumably superseded by the JSON dumps - confirm before deleting.
# df = pd.DataFrame.from_dict(count_copy, orient='index', 
#                             columns=['Wikidata results', 'Found correct', 'Found incorrect', 'Found total (Qnodes)'])
# df.to_csv('/nas/home/bohuizha/KG/hunger-for-knowledge/batch_output/results_count.csv', sep=',')


# Step (9). Extra Step: Disagreement statements

In [None]:
# Loop-invariant command templates; all three placeholders of each template
# are filled with the property id.

# Keep the INFOBOX_RESULTS rows whose node1 also occurs as node1 in RESULTS.
known_template = """
        kgtk ifexists -i $%s_INFOBOX_RESULTS \
            --filter-on $%s_RESULTS \
            --input-keys node1 \
            --filter-keys node1 \
            -o $%s_KNOWN
        """

# Of those, keep the rows whose (node1, node2) pair does not occur in RESULTS.
disagree_template = """
        kgtk ifnotexists -i $%s_KNOWN \
            --filter-on $%s_RESULTS \
            --input-keys node1 node2 \
            --filter-keys node1 node2 \
            -o $%s_DISAGREE
        """

for prop in property_mapping:
    prop_x3 = (prop,) * 3
    command_1 = known_template % prop_x3
    code_1 = os.system(command_1)
    command_2 = disagree_template % prop_x3
    code_2 = os.system(command_2)
    # Exit statuses are reported only; a failed first step does not stop the second.
    print(prop, code_1, code_2)