# Step 1 User Query

In [1]:
import os
import re
import time
import json
import subprocess

## Define alias and variables

In [2]:
# Parameters

# Folder where the database (input) files are stored.
data_path = "/nas/home/bohuizha/KG/hunger-for-knowledge/data/"

# Names of the files from the KGTK Wikidata distribution used in this notebook,
# keyed by the alias that becomes (upper-cased) the environment variable name.
data_file_names = dict(
    claims="claims.tsv",
    wiki_info="wikidata_infobox.tsv",
    p31="P31.tsv",
    p279star="P279star.tsv",
    labels="labels.en.tsv",
    properties="valueTypeConstraintValidator1.sh",
)

# Full paths are exported as environment variables so the shell commands can
# refer to them as $NAME; this list records every exported name in order.
os.environ['DATABASE'] = data_path
kgtk_environment_variables = ['DATABASE']

for alias, file_name in data_file_names.items():
    exported_name = alias.upper()
    os.environ[exported_name] = data_path + file_name
    kgtk_environment_variables.append(exported_name)

# Echo every exported variable with its value.
for exported_name in kgtk_environment_variables:
    print("{}: \"{}\"".format(exported_name, os.environ[exported_name]))

DATABASE: "/nas/home/bohuizha/KG/hunger-for-knowledge/data/"
CLAIMS: "/nas/home/bohuizha/KG/hunger-for-knowledge/data/claims.tsv"
WIKI_INFO: "/nas/home/bohuizha/KG/hunger-for-knowledge/data/wikidata_infobox.tsv"
P31: "/nas/home/bohuizha/KG/hunger-for-knowledge/data/P31.tsv"
P279STAR: "/nas/home/bohuizha/KG/hunger-for-knowledge/data/P279star.tsv"
LABELS: "/nas/home/bohuizha/KG/hunger-for-knowledge/data/labels.en.tsv"
PROPERTIES: "/nas/home/bohuizha/KG/hunger-for-knowledge/data/valueTypeConstraintValidator1.sh"


**Method 1: query from Wikidata**
- property constraint (P2302)
- value-type constraint (Q21510865)

In [33]:
# property_list = !kgtk query -i $CLAIMS --match 'c: (q)-[p:P2302]->(v:Q21510865)' --return 'q' --limit 100
# property_list = property_list[1:5]

**Method 2: preprocess Kartik's work**

In [3]:
# Parse the validator shell script ($PROPERTIES) into
#   property id (e.g. "P1000") -> value-type constraint,
# where the constraint is kept as the textual Qnode list (e.g. '["Q1241356"]')
# so it can later be substituted into a `par in ...` query clause.
#
# Capture groups extract exactly the wanted text. The previous
# `.strip('claims.')` / `.strip('.tsv')` / `.strip('par in ')` calls strip
# *character sets* from both ends rather than a prefix/suffix, which only
# worked by accident. Raw strings keep `\.` and `\]` as literal regex escapes.
property_pattern = re.compile(r'claims\.(P[0-9]+)\.tsv')
# The constraint list is limited to 100 characters before the closing "] ";
# lines with a longer list do not match and are skipped.
class_pattern = re.compile(r'par in (.{1,100}\]) ')

property_dict = dict()  # key: property, value: value-type constraint
count = 0
with open(os.environ['PROPERTIES']) as f:
    # Iterate lazily instead of loading every line with readlines().
    for line in f:
        property_matched = property_pattern.search(line)
        class_matched = class_pattern.search(line)
        if property_matched and class_matched:
            proper = property_matched.group(1)
            qnodes = class_matched.group(1).strip()
            # print('%s --- %s--' % (proper, qnodes))
            property_dict[proper] = qnodes
            count += 1
            # Only the first 100 matching lines are processed.
            if count == 100:
                break

In [10]:
# Display the parsed mapping: property id -> value-type constraint (Qnode list kept as text).
property_dict

{'P1000': '["Q1241356"]',
 'P1001': '["Q20926517", "Q2881272", "Q2882257", "Q3624078", "Q3895768", "Q3918", "Q56061", "Q82794"]',
 'P1002': '["Q2576663"]',
 'P1018': '["Q43229", "Q5"]',
 'P102': '["Q1393724", "Q2738074", "Q7210356", "Q7278"]',
 'P1026': '["Q1266946", "Q187685"]',
 'P1028': '["Q43229", "Q5"]',
 'P1029': '["Q5", "Q95074"]',
 'P103': '["Q17376908"]',
 'P1035': '["Q618779"]',
 'P1037': '["Q5", "Q95074"]',
 'P1038': '["Q1569167", "Q178885", "Q21070598", "Q5", "Q95074"]',
 'P1039': '["Q11666901", "Q12758374", "Q171318"]',
 'P1040': '["Q14073567", "Q5"]',
 'P1041': '["Q727009"]',
 'P1049': '["Q49447"]',
 'P105': '["Q427626"]',
 'P1050': '["Q12136", "Q1441305", "Q175854", "Q2057971", "Q796194", "Q808"]',
 'P1057': '["Q37748"]',
 'P106': '["Q12737077", "Q17305127", "Q192581", "Q2207288", "Q28640"]',
 'P1060': '["Q525512"]',
 'P1064': '["Q214519"]',
 'P1066': '["Q16334295", "Q5", "Q95074"]',
 'P1068': '["Q272683"]',
 'P1071': '["Q618123"]',
 'P1072': '["Q235557", "Q26085352"]',


In [5]:
# start_time = round(time.time())
# One fresh list per property; later cells append the elapsed wall-clock time
# (from time.time()) of each step they run for that property.
property_runtime = {prop: [] for prop in property_dict}

In [6]:
# One fresh list per property; the counting cell below appends row counts to it.
property_results_count = {prop: [] for prop in property_dict}

In [4]:
# Parameters

# Folder on local machine where to create the output and temporary folders
output_path = "/nas/home/bohuizha/KG/hunger-for-knowledge/batch_output/"
os.makedirs(output_path, exist_ok=True)

# One sub-folder per pipeline stage. Each folder is created if missing and its
# path exported as an environment variable named after the folder (upper-cased).
output_list = ['results', 'infobox_results', 'new_results', 'unknown', 'entity', 'class', 'query', 
               'direct_infer', 'indirect_infer', 'infer', 
               'structured_literals', 'nodes', 'qnodes', 
               'correct_temp_1', 'correct_temp_2', 'incorrect_temp', 
               'correct', 'incorrect']
for folder_name in output_list:
    folder_path = os.path.join(output_path, folder_name)
    os.makedirs(folder_path, exist_ok=True)
    variable = folder_name.upper()
    os.environ[variable] = folder_path
    kgtk_environment_variables.append(variable)

os.environ['OUTPUT'] = output_path
kgtk_environment_variables.append('OUTPUT')

# Derived from output_path so the two cannot drift apart (same resulting path).
os.environ['PROPERTY_MAPPING'] = os.path.join(output_path, "property_mapping.json")
kgtk_environment_variables.append('PROPERTY_MAPPING')

# Per-property output files: env-var alias template -> path template relative
# to output_path. Both templates take the property id.
output_file_templates = {
    "%s_results": "results/results.%s.tsv",
    "%s_infobox_results": "infobox_results/infobox.results.%s.tsv",
    "%s_new_results": "new_results/new.results.%s.tsv",
    "%s_unknown": "unknown/unknown.%s.tsv",
    "%s_class": "class/class.%s.tsv",
    "%s_entity": "entity/entity.%s.tsv",
    "%s_query_file": "query/query.file.%s.tsv",
    "%s_direct_infer": "direct_infer/direct.infer.%s.tsv",
    "%s_indirect_infer": "indirect_infer/indirect.infer.%s.tsv",
    "%s_infers": "infers/infers.%s.tsv",
    "%s_structured_literals": "structured_literals/structured.literals.%s.tsv",
    "%s_nodes": "nodes/nodes.%s.tsv",
    "%s_qnodes": "qnodes/qnodes.%s.tsv",
    "%s_correct_temp_1": "correct_temp_1/correct_temp_1.type-constraints.instanceOf.%s.tsv",
    "%s_correct_temp_2": "correct_temp_2/correct_temp_2.type-constraints.instanceOf.%s.tsv",
    "%s_incorrect_temp": "incorrect_temp/incorrect_temp.type-constraints.instanceOf.%s.tsv",
    "%s_correct": "correct/correct.type-constraints.instanceOf.%s.tsv",
    "%s_incorrect": "incorrect/incorrect.type-constraints.instanceOf.%s.tsv"
}

# Make sure the parent folder of every per-property file exists. output_list
# creates "infer/", but the *_INFERS files live under "infers/", which nothing
# else creates; without this, writing to $<prop>_INFERS would fail.
for path_template in output_file_templates.values():
    os.makedirs(os.path.join(output_path, os.path.dirname(path_template)), exist_ok=True)

# Export one environment variable per (property, output file) pair,
# e.g. P1000_RESULTS -> <output_path>/results/results.P1000.tsv
for prop in property_dict.keys():
    output_file_names = {
        key_template % prop: path_template % prop
        for key_template, path_template in output_file_templates.items()
    }
    # print(output_file_names)
    for key, value in output_file_names.items():
        variable = key.upper()
        os.environ[variable] = os.path.join(output_path, value)
        kgtk_environment_variables.append(variable)

# kgtk_environment_variables.sort()
# for variable in kgtk_environment_variables:
#     print("{}: \"{}\"".format(variable, os.environ[variable]))

# Step 2: Wikidata Results

Generate a property file for each property using `kgtk filter` (filtering on the property). <br>
This has already been run in the background; since it is relatively time-consuming, it is better not to run it again.

In [13]:
# for prop, val in property_dict.items():
#     command = "kgtk filter -i $CLAIMS -p \" ; %s ; \" -o $%s_RESULTS" % (prop, prop) 
#     print(command)
#     code = os.system(command)
#     print(code)

In [14]:
# # code from Kartik
# with open(os.environ['CLAIMS'],'r') as fin: 
#     headerLine = next(fin).decode("utf-8")
# #     tstCount = 0
#     for line in tqdm(fin):
#         line = line.decode("utf-8")
#         lineP = line.rstrip().split("\t")
#         if "external-id" in lineP[-1]:
#             continue
#         prop = lineP[2]
#         if prop not in propFileDict:
#             propFileDict[prop] = open("../../propertiesSplit_Final/claims."+str(prop)+".tsv","w")
#             propFileDict[prop].write(headerLine)
#         propFileDict[prop].write(line)
# #         tstCount += 1
# #         if tstCount == 10:
# #             break
#     for file1 in propFileDict.values():
#         file1.close()

## Count known results in Wikidata database:

In [69]:
# For every property, count the lines of its results file with `wc -l` and
# subtract one (presumably the TSV header row - confirm), then record and
# print the count.
for prop in property_dict:
    wc_output = subprocess.check_output("wc -l < $%s_RESULTS" % prop, shell=True)
    known_rows = int(wc_output.decode("utf-8").strip()) - 1
    property_results_count[prop].append(known_rows)
    print("%s: %d" % (prop, known_rows))

P1000: 290
P1001: 760787
P1002: 825
P1018: 189
P102: 432427
P1026: 7346
P1028: 4240
P1029: 2769
P103: 107210
P1035: 522
P1037: 13046
P1038: 33957
P1039: 45
P1040: 47592
P1041: 85
P1049: 1896
P105: 2892792
P1050: 236675
P1057: 124
P106: 8273095
P1060: 287
P1064: 23191
P1066: 56327
P1068: 406
P1071: 52640
P1072: 5543
P1073: 2322
P1075: 1496
P1078: 75
P1079: 170
P1080: 25590
P110: 9453
P111: 2866
P112: 59728
P113: 3037
P1136: 110
P114: 277
P1142: 20699
P1145: 5775
P115: 31464
P1151: 1626
P1165: 1297
P1170: 3
P1171: 3
P118: 178146
P119: 182489
P1192: 17715
P1194: 1
P1199: 74
P1200: 2660
P1201: 400
P1202: 106
P121: 11081
P1210: 6
P1211: 27
P122: 1812
P1221: 2
P1227: 19
P123: 241437
P126: 308338
P127: 424347
P128: 9953
P1283: 1387
P129: 98
P1290: 1108
P1299: 6926
P1303: 200830
P1304: 165
P1308: 14470
P131: 10426043
P1312: 1065
P1313: 60848
P1318: 108
P1321: 2415
P1322: 337
P1327: 8731
P1336: 1083
P1340: 7392
P1343: 621332
P1344: 625676
P1346: 217868
P1349: 6
P137: 486110
P1383: 31852
P1398: 

## Find unknown results in Wikidata database:

Find the most frequent class and substitute the entities of that class as the whole entity set we are going to query. <br>
Currently we don't apply this step.

In [16]:
# for prop, val in property_dict.items():
#     command = "kgtk query -i $%s_RESULTS $P31 \
#         --match 'r: (entity)-[]->(), P31: (entity)-[]->(class)' \
#         --return 'distinct entity as node1, \"P31\" as label, class as node2' \
#         -o $%s_CLASS" % (prop, prop)
#     print(command)
#     start = time.time()
#     code = os.system(command)
#     runtime = time.time() - start
#     print(code, runtime)
#     property_runtime[prop].append(runtime)

In [17]:
# entity_class_map = dict()
# for prop, _ in property_dict.items():
#     command = "kgtk query -i $%s_CLASS \
#         --match 'c: ()-[]->(class)' \
#         --return 'class, count(class) as N' \
#         --order-by 'N desc' \
#         --limit 1" % prop
#     start = time.time()
#     output = subprocess.check_output(command, shell=True)
#     runtime = time.time() - start
#     property_runtime[prop].append(runtime)
#     output = output.decode("utf-8").strip().split()
#     print("%s" % prop, output, runtime)
#     if len(output) == 4:
#         entity_class_map[prop] = output[2]

In [18]:
# class_entity_map = dict()
# for key, value in entity_class_map.items():
#     class_entity_map[value] = []
# for key, value in entity_class_map.items():
#     class_entity_map[value].append(key)
# class_entity_map

- Find all entities

In [19]:
# for cls, prop in class_entity_map.items():
#     command = "kgtk filter -i $P31 -p \";; %s\" -o $%s_ENTITY" % (cls, prop[0])
#     print(command)
#     # If ${val} already in folder, no need to query
#     # if os.path.exists(os.environ['%s_ENTITY' % prop[0]]):
#     #     print(0)
#     #     runtime = 0
#     #     propperty_runtime[prop].append(runtime)
#     # else:
#     start = time.time()
#     code = os.system(command)
#     runtime = time.time() - start
#     property_runtime[prop[0]].append(runtime)
#     print(code, runtime)
#     if len(prop) > 1:
#         for p in prop[1:]:
#             os.system('cp $%s_ENTITY $%s_ENTITY' % (prop[0], p))
#             property_runtime[p].append(runtime)

In [20]:
# directly filter
# !kgtk query -i $%s_CLASS $P31 \
#         --match 'c: (n)-[]->(class), P31: (entity)-[]->(class)' \
#         --where 'n != entity' \
#         --return 'distinct entity as node1, \"P31\" as label, class as node2' \
#         -o $%s_ENTITY" % (prop, prop)

- Eliminate entities that already have known values for the property

In [21]:
# for prop, val in property_dict.items():
#     command = "kgtk ifnotexists -i $%s_ENTITY \
#         --filter-on $%s_RESULTS \
#         --input-keys node1 \
#         --filter-keys node1 \
#         -o $%s_QUERY_FILE" % (prop, prop, prop)
#     print(command)
#     # if os.path.exists(os.environ['%s_QUERY_FILE' % prop[0]]):
#     #     print(0)
#     # else:
#     #     code = os.system(command)
#     #     print(code)
#     start = time.time()
#     code = os.system(command)
#     runtime = time.time() - start
#     property_runtime[prop].append(runtime)
#     print(code, runtime)

## Count unknown results

- rows

In [22]:
# for prop, val in property_dict.items():
#     output = subprocess.check_output("wc -l < $%s_QUERY_FILE" % prop, shell=True)
#     output = output.decode("utf-8").strip()
#     output = int(output) - 1
#     print("%s: %s" % (prop, output))

- entities

In [23]:
# for prop, val in property_dict.items():
#     command = "kgtk query -i $%s_QUERY_FILE \
#         --match '(p)-[]->()' \
#         --return 'count(distinct p) as N'" % prop
#     output = subprocess.check_output(command, shell=True)
#     output = output.decode("utf-8").strip().split()[-1]
#     print("%s: %s" % (prop, output))

# Step 3 Selection of Additional KG(s)

Currently we use Wikidata Infobox generated from DBpedia.

# Step 4 Schema Alignment

## Entity resolution

Use query results from Wikidata database to infer properties in Wikidata infobox and return the most frequent property.

**Direct Infer:** Query for qnode;

In [None]:
# Command template (filled with the property id twice): match edges of the
# property's results file against $WIKI_INFO edges that share the same
# entity and value, return entity / infobox edge label / value for at most
# 10000 rows, and write them to the property's DIRECT_INFER file.
direct_infer_template = "kgtk query -i $%s_RESULTS -i $WIKI_INFO \
        --match 'r: (entity)-[]->(v), w: (entity)-[p]->(v)' \
        --return 'entity, p.label, v as node2' \
        --limit 10000 \
        -o $%s_DIRECT_INFER"

for prop in property_dict:
    command = direct_infer_template % (prop, prop)
    print(command)
    started = time.time()
    exit_status = os.system(command)
    elapsed = time.time() - started
    # Record how long this step took for the property, then show status and time.
    property_runtime[prop].append(elapsed)
    print(exit_status, elapsed)

## Property mapping

In [27]:
# Command template (filled with the property id): from the property's
# DIRECT_INFER file, return the single edge label with the highest count.
top_label_template = "kgtk query -i $%s_DIRECT_INFER \
        --match '(q)-[p]->(v)' \
        --return 'p.label, count(v) as N' \
        --order-by 'N desc' \
        --limit 1"

# Wikidata property id -> most frequent infobox label found for it.
property_mapping = {}
for prop in property_dict:
    started = time.time()
    raw_output = subprocess.check_output(top_label_template % prop, shell=True)
    elapsed = time.time() - started
    property_runtime[prop].append(elapsed)

    # Whitespace-split output: two header tokens, then (if any row came back)
    # the label and its count.
    tokens = raw_output.decode("utf-8").strip().split()
    print(tokens, elapsed)
    if len(tokens) != 4:
        # No data row (or an unexpected shape): leave this property unmapped.
        continue
    property_mapping[prop] = tokens[2]
    print("%s: %s" % (prop, tokens[2]))

['label', 'N', 'property:title', '2'] 5.2808003425598145
P1000: property:title
['label', 'N', 'property:jurisdiction', '941'] 3.8210620880126953
P1001: property:jurisdiction
['label', 'N', 'property:configuration', '26'] 3.3586645126342773
P1002: property:configuration
['label', 'N', 'property:agency', '59'] 3.3858444690704346
P1018: property:agency
['label', 'N', 'property:party', '82983'] 3.7279090881347656
P102: property:party
['label', 'N'] 4.045220851898193
['label', 'N', 'property:founder', '3'] 4.011143684387207
P1028: property:founder
['label', 'N', 'property:crewMembers', '1409'] 3.9420437812805176
P1029: property:crewMembers
['label', 'N', 'property:language', '255'] 3.476477861404419
P103: property:language
['label', 'N', 'property:honorificSuffix', '144'] 3.9311208724975586
P1035: property:honorificSuffix
['label', 'N', 'property:officeholder', '167'] 3.777768850326538
P1037: property:officeholder
['label', 'N', 'property:relatives', '579'] 3.910413980484009
P1038: property

In [32]:
# Persist property_mapping as indented JSON at the path held in $PROPERTY_MAPPING.
with open(os.environ['PROPERTY_MAPPING'], 'w+') as mapping_file:
    mapping_file.write(json.dumps(property_mapping, indent=4))

# Step 5 Results from other KG(s)

For those entities that don't have a property value, query the Wikidata infobox:

In [37]:
# Original command, for use if the unknown-entity set ($<prop>_QUERY_FILE) exists:
# command = "kgtk query -i $%s_QUERY_FILE -i $WIKI_INFO \
#     --match 'q: (entity)-[]->(), w: (entity)-[property]->(value)' \
#     --where 'property.label = \"%s\"' \
#     --return 'distinct entity, property.label, value' \
#     -o $%s_NEW_RESULTS" % (prop, val, prop)

# Command actually used, mirroring the Wikidata filter step: filter $WIKI_INFO
# with the mapped infobox label as the pattern and write the matching edges to
# the property's INFOBOX_RESULTS file. Filled with (infobox label, property id).
infobox_filter_template = "kgtk filter -i $WIKI_INFO -p \" ; %s ; \" -o $%s_INFOBOX_RESULTS"

for prop, infobox_label in property_mapping.items():
    command = infobox_filter_template % (infobox_label, prop)
    print(command)
    started = time.time()
    exit_status = os.system(command)
    elapsed = time.time() - started
    property_runtime[prop].append(elapsed)
    print(exit_status, elapsed)

kgtk filter -i $WIKI_INFO -p " ; property:title ; " -o $P1000_INFOBOX_RESULTS
0 226.55320620536804
kgtk filter -i $WIKI_INFO -p " ; property:jurisdiction ; " -o $P1001_INFOBOX_RESULTS
0 233.53115892410278
kgtk filter -i $WIKI_INFO -p " ; property:configuration ; " -o $P1002_INFOBOX_RESULTS
0 222.08623671531677
kgtk filter -i $WIKI_INFO -p " ; property:agency ; " -o $P1018_INFOBOX_RESULTS
0 223.88510823249817
kgtk filter -i $WIKI_INFO -p " ; property:party ; " -o $P102_INFOBOX_RESULTS
0 228.79758763313293
kgtk filter -i $WIKI_INFO -p " ; property:founder ; " -o $P1028_INFOBOX_RESULTS
0 221.43906807899475
kgtk filter -i $WIKI_INFO -p " ; property:crewMembers ; " -o $P1029_INFOBOX_RESULTS
0 226.9954059123993
kgtk filter -i $WIKI_INFO -p " ; property:language ; " -o $P103_INFOBOX_RESULTS
0 234.76417875289917
kgtk filter -i $WIKI_INFO -p " ; property:honorificSuffix ; " -o $P1035_INFOBOX_RESULTS
0 230.47625303268433
kgtk filter -i $WIKI_INFO -p " ; property:officeholder ; " -o $P1037_INFOBO

## Filter new results found from additional KG

In [47]:
# Command template (filled with the property id three times): run
# `kgtk ifnotexists` on the property's INFOBOX_RESULTS file, filtering on its
# RESULTS file with (node1, node2) as the key on both sides, and write the
# output to the property's NEW_RESULTS file.
new_results_template = "kgtk ifnotexists -i $%s_INFOBOX_RESULTS \
        --filter-on $%s_RESULTS \
        --input-keys node1 node2 \
        --filter-keys node1 node2 \
        -o $%s_NEW_RESULTS"

for prop in property_mapping:
    command = new_results_template % (prop, prop, prop)
    print(command)
    started = time.time()
    exit_status = os.system(command)
    elapsed = time.time() - started
    property_runtime[prop].append(elapsed)
    print(exit_status, elapsed)

kgtk ifnotexists -i $P1000_INFOBOX_RESULTS         --filter-on $P1000_RESULTS         --input-keys node1 node2         --filter-keys node1 node2         -o $P1000_NEW_RESULTS
0 11.906784534454346
kgtk ifnotexists -i $P1001_INFOBOX_RESULTS         --filter-on $P1001_RESULTS         --input-keys node1 node2         --filter-keys node1 node2         -o $P1001_NEW_RESULTS
0 4.712164878845215
kgtk ifnotexists -i $P1002_INFOBOX_RESULTS         --filter-on $P1002_RESULTS         --input-keys node1 node2         --filter-keys node1 node2         -o $P1002_NEW_RESULTS
0 1.9388160705566406
kgtk ifnotexists -i $P1018_INFOBOX_RESULTS         --filter-on $P1018_RESULTS         --input-keys node1 node2         --filter-keys node1 node2         -o $P1018_NEW_RESULTS
0 1.6232929229736328
kgtk ifnotexists -i $P102_INFOBOX_RESULTS         --filter-on $P102_RESULTS         --input-keys node1 node2         --filter-keys node1 node2         -o $P102_NEW_RESULTS
0 5.048824787139893
kgtk ifnotexists -i $P102

0 1.8437440395355225
kgtk ifnotexists -i $P121_INFOBOX_RESULTS         --filter-on $P121_RESULTS         --input-keys node1 node2         --filter-keys node1 node2         -o $P121_NEW_RESULTS
0 2.0639090538024902
kgtk ifnotexists -i $P1211_INFOBOX_RESULTS         --filter-on $P1211_RESULTS         --input-keys node1 node2         --filter-keys node1 node2         -o $P1211_NEW_RESULTS
0 1.8533682823181152
kgtk ifnotexists -i $P122_INFOBOX_RESULTS         --filter-on $P122_RESULTS         --input-keys node1 node2         --filter-keys node1 node2         -o $P122_NEW_RESULTS
0 2.203983783721924
kgtk ifnotexists -i $P123_INFOBOX_RESULTS         --filter-on $P123_RESULTS         --input-keys node1 node2         --filter-keys node1 node2         -o $P123_NEW_RESULTS
0 2.8735010623931885
kgtk ifnotexists -i $P126_INFOBOX_RESULTS         --filter-on $P126_RESULTS         --input-keys node1 node2         --filter-keys node1 node2         -o $P126_NEW_RESULTS
0 2.873234510421753
kgtk ifnotexi

- Count rows of new findings:

In [48]:
# Print, per mapped property, the line count of its NEW_RESULTS file minus one
# (presumably the TSV header row - confirm).
for prop in property_mapping:
    wc_output = subprocess.check_output("wc -l < $%s_NEW_RESULTS" % prop, shell=True)
    new_rows = int(wc_output.decode("utf-8").strip()) - 1
    print("%s: %s" % (prop, new_rows))

P1000: 1765149
P1001: 9541
P1002: 1251
P1018: 3936
P102: 243582
P1028: 47846
P1029: 348
P103: 261465
P1035: 21134
P1037: 5934
P1038: 36284
P1040: 67007
P1041: 226
P1049: 5800
P105: 41049
P1050: 6329
P106: 377947
P1064: 1190
P1066: 10858
P1068: 341
P1071: 31398
P1072: 1163
P1073: 734
P1075: 3032
P1078: 1059
P1079: 1078
P1080: 47843
P110: 4506
P111: 135
P112: 38840
P113: 2349
P1136: 24
P114: 2775
P1142: 20662
P115: 401614
P1165: 585
P1170: 632
P118: 61795
P119: 15165
P1192: 46902
P1194: 632
P1201: 3065
P1202: 4468
P121: 1496
P1211: 1202
P122: 19215
P123: 52993
P126: 4645
P127: 74215
P1283: 3400
P1290: 36861
P1299: 10114
P1303: 50228
P1308: 6785
P131: 797761
P1313: 107095
P1318: 18
P1321: 1214384
P1322: 128
P1327: 996
P1336: 1147656
P1343: 1765029
P1344: 808855
P1346: 22062
P137: 39316
P1383: 20629
P1398: 415
P1399: 2234
P1408: 113601
P1411: 10479
P1412: 259849
P1414: 522
P1419: 55293
P1427: 2256
P1429: 16227
P1431: 43561
P1433: 3855
P1434: 47929
P1435: 3285


- Count unique entities of new findings:

In [49]:
# Command template (filled with the property id): count the distinct node1
# values in the property's NEW_RESULTS file.
distinct_entities_template = "kgtk query -i $%s_NEW_RESULTS \
        --match 'n: (p)-[]->()' \
        --return 'count(distinct p) as N'"

for prop in property_mapping:
    raw_output = subprocess.check_output(distinct_entities_template % prop, shell=True)
    # Whitespace-split output: token 0 is the "N" header, token 1 the count.
    tokens = raw_output.decode("utf-8").strip().split()
    print("%s: %s" % (prop, tokens[1]))

P1000: 507288
P1001: 9322
P1002: 1136
P1018: 1685
P102: 64725
P1028: 40522
P1029: 243
P103: 236456
P1035: 16990
P1037: 524
P1038: 18938
P1040: 55920
P1041: 91
P1049: 3771
P105: 41019
P1050: 6242
P106: 285737
P1064: 1052
P1066: 9923
P1068: 209
P1071: 23710
P1072: 770
P1073: 604
P1075: 2986
P1078: 956
P1079: 1053
P1080: 40511
P110: 3482
P111: 106
P112: 32901
P113: 1569
P1136: 18
P114: 1368
P1142: 8208
P115: 61921
P1165: 528
P1170: 385
P118: 47219
P119: 13018
P1192: 31008
P1194: 385
P1201: 2480
P1202: 1515
P121: 1452
P1211: 907
P122: 17959
P123: 44045
P126: 4238
P127: 66359
P1283: 2034
P1290: 19183
P1299: 5264
P1303: 31439
P1308: 2084
P131: 375817
P1313: 73601
P1318: 14
P1321: 863730
P1322: 127
P1327: 747
P1336: 410799
P1343: 507187
P1344: 392705
P1346: 15350
P137: 34343
P1383: 19230
P1398: 312
P1399: 1618
P1408: 94550
P1411: 4289
P1412: 235122
P1414: 412
P1419: 32113
P1427: 1972
P1429: 9368
P1431: 16204
P1433: 2838
P1434: 40594
P1435: 3258


## Filter out entities (rows) we still don't know

In [50]:
# for prop, val in property_mapping.items():
#     command = "kgtk ifnotexists -i $%s_QUERY_FILE \
#         --filter-on $%s_NEW_RESULTS \
#         --input-keys node1 \
#         --filter-keys node1 \
#         -o $%s_UNKNOWN" % (prop, prop, prop)
#     print(command)
#     start = time.time()
#     code = os.system(command)
#     runtime = time.time() - start
#     property_runtime[prop].append(runtime)
#     print(code, runtime)

- Count rows still unknown

In [51]:
# for prop, val in property_mapping.items():
#     output = subprocess.check_output("wc -l < $%s_UNKNOWN" % prop, shell=True)
#     output = output.decode("utf-8").strip()
#     output = int(output) - 1
#     print("%s: %s" % (prop, output))

- Count entities still unknown

In [52]:
# for prop, val in property_mapping.items():
#     command = "kgtk query -i $%s_UNKNOWN \
#         --match 'n: (p)-[]->()' \
#         --return 'count(distinct p) as N'" % prop
#     output = subprocess.check_output(command, shell=True)
#     output = output.decode("utf-8").strip().split()[1]
#     print("%s: %s" % (prop, output))

# Step 6 Datatype Filtering

## 1. Filter Structured literals:

In [53]:
# Command template (filled with the property id twice): from the property's
# NEW_RESULTS file, select edges whose value `s` is neither a language-qualified
# string nor a number and which has a "dbpedia:structured_value" edge in
# $WIKI_INFO; write them to the property's STRUCTURED_LITERALS file.
structured_literals_template = "kgtk query -i $%s_NEW_RESULTS -i $WIKI_INFO \
        --match 'n: (q)-[p]->(s), w: (s)-[sv]->(v)' \
        --where 'NOT kgtk_lqstring(s) AND NOT kgtk_number(s) AND sv.label = \"dbpedia:structured_value\"' \
        --return 'q, p.label, s' \
        -o $%s_STRUCTURED_LITERALS"

for prop in property_mapping:
    command = structured_literals_template % (prop, prop)
    print(command)
    started = time.time()
    exit_status = os.system(command)
    elapsed = time.time() - started
    property_runtime[prop].append(elapsed)
    print(exit_status, elapsed)

kgtk query -i $P1000_NEW_RESULTS -i $WIKI_INFO         --match 'n: (q)-[p]->(s), w: (s)-[sv]->(v)'         --where 'NOT kgtk_lqstring(s) AND NOT kgtk_number(s) AND sv.label = "dbpedia:structured_value"'         --return 'q, p.label, s'         -o $P1000_STRUCTURED_LITERALS
0 36.119502782821655
kgtk query -i $P1001_NEW_RESULTS -i $WIKI_INFO         --match 'n: (q)-[p]->(s), w: (s)-[sv]->(v)'         --where 'NOT kgtk_lqstring(s) AND NOT kgtk_number(s) AND sv.label = "dbpedia:structured_value"'         --return 'q, p.label, s'         -o $P1001_STRUCTURED_LITERALS
0 1.9462220668792725
kgtk query -i $P1002_NEW_RESULTS -i $WIKI_INFO         --match 'n: (q)-[p]->(s), w: (s)-[sv]->(v)'         --where 'NOT kgtk_lqstring(s) AND NOT kgtk_number(s) AND sv.label = "dbpedia:structured_value"'         --return 'q, p.label, s'         -o $P1002_STRUCTURED_LITERALS
0 1.7612698078155518
kgtk query -i $P1018_NEW_RESULTS -i $WIKI_INFO         --match 'n: (q)-[p]->(s), w: (s)-[sv]->(v)'         --where 

0 2.154759645462036
kgtk query -i $P111_NEW_RESULTS -i $WIKI_INFO         --match 'n: (q)-[p]->(s), w: (s)-[sv]->(v)'         --where 'NOT kgtk_lqstring(s) AND NOT kgtk_number(s) AND sv.label = "dbpedia:structured_value"'         --return 'q, p.label, s'         -o $P111_STRUCTURED_LITERALS
0 1.863593578338623
kgtk query -i $P112_NEW_RESULTS -i $WIKI_INFO         --match 'n: (q)-[p]->(s), w: (s)-[sv]->(v)'         --where 'NOT kgtk_lqstring(s) AND NOT kgtk_number(s) AND sv.label = "dbpedia:structured_value"'         --return 'q, p.label, s'         -o $P112_STRUCTURED_LITERALS
0 2.0700724124908447
kgtk query -i $P113_NEW_RESULTS -i $WIKI_INFO         --match 'n: (q)-[p]->(s), w: (s)-[sv]->(v)'         --where 'NOT kgtk_lqstring(s) AND NOT kgtk_number(s) AND sv.label = "dbpedia:structured_value"'         --return 'q, p.label, s'         -o $P113_STRUCTURED_LITERALS
0 2.1837234497070312
kgtk query -i $P1136_NEW_RESULTS -i $WIKI_INFO         --match 'n: (q)-[p]->(s), w: (s)-[sv]->(v)'    

0 2.2918779850006104
kgtk query -i $P1318_NEW_RESULTS -i $WIKI_INFO         --match 'n: (q)-[p]->(s), w: (s)-[sv]->(v)'         --where 'NOT kgtk_lqstring(s) AND NOT kgtk_number(s) AND sv.label = "dbpedia:structured_value"'         --return 'q, p.label, s'         -o $P1318_STRUCTURED_LITERALS
0 1.9954495429992676
kgtk query -i $P1321_NEW_RESULTS -i $WIKI_INFO         --match 'n: (q)-[p]->(s), w: (s)-[sv]->(v)'         --where 'NOT kgtk_lqstring(s) AND NOT kgtk_number(s) AND sv.label = "dbpedia:structured_value"'         --return 'q, p.label, s'         -o $P1321_STRUCTURED_LITERALS
0 23.31630849838257
kgtk query -i $P1322_NEW_RESULTS -i $WIKI_INFO         --match 'n: (q)-[p]->(s), w: (s)-[sv]->(v)'         --where 'NOT kgtk_lqstring(s) AND NOT kgtk_number(s) AND sv.label = "dbpedia:structured_value"'         --return 'q, p.label, s'         -o $P1322_STRUCTURED_LITERALS
0 1.9915859699249268
kgtk query -i $P1327_NEW_RESULTS -i $WIKI_INFO         --match 'n: (q)-[p]->(s), w: (s)-[sv]->(

## 2. Filter Qnodes

In [54]:
# Step 1 template (property id twice): keep the distinct NEW_RESULTS edges
# whose value is neither a language-qualified string nor a number -> NODES file.
nodes_template = "kgtk query -i $%s_NEW_RESULTS \
        --match 'n: (q)-[p]->(v)' \
        --where 'NOT kgtk_lqstring(v) AND NOT kgtk_number(v)' \
        --return 'distinct q, p.label, v' \
        -o $%s_NODES"

# Step 2 template (property id three times): run `kgtk ifnotexists` on the
# NODES file, filtering on the STRUCTURED_LITERALS file -> QNODES file.
qnodes_template = "kgtk ifnotexists -i $%s_NODES \
        --filter-on $%s_STRUCTURED_LITERALS \
        -o $%s_QNODES"

for prop in property_mapping:
    nodes_command = nodes_template % (prop, prop)
    print(nodes_command)
    qnodes_command = qnodes_template % (prop, prop, prop)
    print(qnodes_command)

    # Both steps are timed together and recorded as one runtime entry.
    started = time.time()
    nodes_status = os.system(nodes_command)
    qnodes_status = os.system(qnodes_command)
    elapsed = time.time() - started
    property_runtime[prop].append(elapsed)
    print(nodes_status, qnodes_status, elapsed)

kgtk query -i $P1000_NEW_RESULTS         --match 'n: (q)-[p]->(v)'         --where 'NOT kgtk_lqstring(v) AND NOT kgtk_number(v)'         --return 'distinct q, p.label, v'         -o $P1000_NODES
kgtk ifnotexists -i $P1000_NODES         --filter-on $P1000_STRUCTURED_LITERALS         -o $P1000_QNODES
0 0 8.748481035232544
kgtk query -i $P1001_NEW_RESULTS         --match 'n: (q)-[p]->(v)'         --where 'NOT kgtk_lqstring(v) AND NOT kgtk_number(v)'         --return 'distinct q, p.label, v'         -o $P1001_NODES
kgtk ifnotexists -i $P1001_NODES         --filter-on $P1001_STRUCTURED_LITERALS         -o $P1001_QNODES
0 0 3.8231935501098633
kgtk query -i $P1002_NEW_RESULTS         --match 'n: (q)-[p]->(v)'         --where 'NOT kgtk_lqstring(v) AND NOT kgtk_number(v)'         --return 'distinct q, p.label, v'         -o $P1002_NODES
kgtk ifnotexists -i $P1002_NODES         --filter-on $P1002_STRUCTURED_LITERALS         -o $P1002_QNODES
0 0 3.3124077320098877
kgtk query -i $P1018_NEW_RESULTS

0 0 3.8804330825805664
kgtk query -i $P1080_NEW_RESULTS         --match 'n: (q)-[p]->(v)'         --where 'NOT kgtk_lqstring(v) AND NOT kgtk_number(v)'         --return 'distinct q, p.label, v'         -o $P1080_NODES
kgtk ifnotexists -i $P1080_NODES         --filter-on $P1080_STRUCTURED_LITERALS         -o $P1080_QNODES
0 0 4.3350982666015625
kgtk query -i $P110_NEW_RESULTS         --match 'n: (q)-[p]->(v)'         --where 'NOT kgtk_lqstring(v) AND NOT kgtk_number(v)'         --return 'distinct q, p.label, v'         -o $P110_NODES
kgtk ifnotexists -i $P110_NODES         --filter-on $P110_STRUCTURED_LITERALS         -o $P110_QNODES
0 0 3.8451974391937256
kgtk query -i $P111_NEW_RESULTS         --match 'n: (q)-[p]->(v)'         --where 'NOT kgtk_lqstring(v) AND NOT kgtk_number(v)'         --return 'distinct q, p.label, v'         -o $P111_NODES
kgtk ifnotexists -i $P111_NODES         --filter-on $P111_STRUCTURED_LITERALS         -o $P111_QNODES
0 0 3.810898780822754
kgtk query -i $P112

0 0 3.91011643409729
kgtk query -i $P1303_NEW_RESULTS         --match 'n: (q)-[p]->(v)'         --where 'NOT kgtk_lqstring(v) AND NOT kgtk_number(v)'         --return 'distinct q, p.label, v'         -o $P1303_NODES
kgtk ifnotexists -i $P1303_NODES         --filter-on $P1303_STRUCTURED_LITERALS         -o $P1303_QNODES
0 0 4.060178756713867
kgtk query -i $P1308_NEW_RESULTS         --match 'n: (q)-[p]->(v)'         --where 'NOT kgtk_lqstring(v) AND NOT kgtk_number(v)'         --return 'distinct q, p.label, v'         -o $P1308_NODES
kgtk ifnotexists -i $P1308_NODES         --filter-on $P1308_STRUCTURED_LITERALS         -o $P1308_QNODES
0 0 3.6605992317199707
kgtk query -i $P131_NEW_RESULTS         --match 'n: (q)-[p]->(v)'         --where 'NOT kgtk_lqstring(v) AND NOT kgtk_number(v)'         --return 'distinct q, p.label, v'         -o $P131_NODES
kgtk ifnotexists -i $P131_NODES         --filter-on $P131_STRUCTURED_LITERALS         -o $P131_QNODES
0 0 10.952211380004883
kgtk query -i $P

0 0 4.2917022705078125
kgtk query -i $P1435_NEW_RESULTS         --match 'n: (q)-[p]->(v)'         --where 'NOT kgtk_lqstring(v) AND NOT kgtk_number(v)'         --return 'distinct q, p.label, v'         -o $P1435_NODES
kgtk ifnotexists -i $P1435_NODES         --filter-on $P1435_STRUCTURED_LITERALS         -o $P1435_QNODES
0 0 4.063961029052734


In [55]:
# Print, per mapped property, the line count of its QNODES file minus one
# (presumably the TSV header row - confirm).
for prop in property_mapping:
    wc_output = subprocess.check_output("wc -l < $%s_QNODES" % prop, shell=True)
    qnode_rows = int(wc_output.decode("utf-8").strip()) - 1
    # property_results_count[prop].append(qnode_rows)
    print("%s: %d" % (prop, qnode_rows))

P1000: 287123
P1001: 3927
P1002: 544
P1018: 753
P102: 37673
P1028: 15733
P1029: 82
P103: 68189
P1035: 17254
P1037: 4777
P1038: 18203
P1040: 7789
P1041: 178
P1049: 5123
P105: 7001
P1050: 2656
P106: 49557
P1064: 95
P1066: 8018
P1068: 234
P1071: 21447
P1072: 619
P1073: 451
P1075: 114
P1078: 154
P1079: 1015
P1080: 17447
P110: 903
P111: 32
P112: 6727
P113: 1136
P1136: 21
P114: 1144
P1142: 16973
P115: 352829
P1165: 269
P1170: 78
P118: 50137
P119: 4929
P1192: 26928
P1194: 78
P1201: 2023
P1202: 1867
P121: 1368
P1211: 533
P122: 11074
P123: 23737
P126: 1979
P127: 24751
P1283: 264
P1290: 18780
P1299: 6176
P1303: 13781
P1308: 5176
P131: 636562
P1313: 38066
P1318: 15
P1321: 809309
P1322: 12
P1327: 19
P1336: 986457
P1343: 287003
P1344: 512200
P1346: 7727
P137: 14219
P1383: 17133
P1398: 174
P1399: 812
P1408: 86766
P1411: 2714
P1412: 66573
P1414: 361
P1419: 22212
P1427: 2155
P1429: 7904
P1431: 7210
P1433: 2667
P1434: 17533
P1435: 1215


# Step 7 Quality Checking

In [56]:
# Quality checking: for every mapped property, split its *_QNODES edges into
# *_CORRECT and *_INCORRECT files by running five kgtk shell commands in order.
# Each command is printed before it runs; the five exit codes and the elapsed
# wall-clock time are printed afterwards, and the time is appended to
# property_runtime[prop].
#
# NOTE: the command literals use backslash-newline continuations *inside* the
# string, so the leading spaces of every continuation line are part of the
# command text. Keep them exactly as they are.
for prop in property_mapping:
    started_at = time.time()
    allowed_values = property_dict[prop]
    pipeline = [
        # 1. Edges whose node2 is linked through the P31 file to a node that the
        #    P279star file links to one of the allowed values.
        "kgtk query -i $%s_QNODES $P31 $P279STAR \
        --match 'q: (node1)-[nodeProp]->(node2), P31: (node2)-[]->(nodex), P279star: (nodex)-[]->(par)' \
        --where 'par in %s' \
        --return 'distinct node1 as `node1`, nodeProp.label as `label`, node2 as `node2`' \
        -o $%s_CORRECT_TEMP_1" % (prop, allowed_values, prop),
        # 2. Filter *_QNODES against the step-1 output on the (node1, node2) key
        #    pair; the result is the provisional "incorrect" set.
        "kgtk ifnotexists -i $%s_QNODES \
        --filter-on $%s_CORRECT_TEMP_1 \
        --input-keys node1 node2 \
        --filter-keys node1 node2 \
        -o $%s_INCORRECT_TEMP" % (prop, prop, prop),
        # 3. Second pass over the provisional incorrect set: edges whose node2 is
        #    itself linked by the P279star file to one of the allowed values.
        "kgtk query -i $%s_INCORRECT_TEMP $P279STAR \
        --match 'i: (node1)-[nodeProp]->(node2), P279star: (node2)-[]->(par)' \
        --where 'par in %s' \
        --return 'distinct node1 as `node1`, nodeProp.label as `label`, node2 as `node2`' \
        -o $%s_CORRECT_TEMP_2" % (prop, allowed_values, prop),
        # 4. Filter the provisional incorrect set against the step-3 output to
        #    obtain the final *_INCORRECT file.
        "kgtk ifnotexists -i $%s_INCORRECT_TEMP \
        --filter-on $%s_CORRECT_TEMP_2 \
        --input-keys node1 node2 \
        --filter-keys node1 node2 \
        -o $%s_INCORRECT" % (prop, prop, prop),
        # 5. Concatenate both "correct" intermediates into the final *_CORRECT file.
        "kgtk cat -i $%s_CORRECT_TEMP_1 $%s_CORRECT_TEMP_2 \
        -o $%s_CORRECT" % (prop, prop, prop),
    ]

    exit_codes = []
    for shell_command in pipeline:
        print(shell_command)
        exit_codes.append(os.system(shell_command))

    elapsed = time.time() - started_at
    property_runtime[prop].append(elapsed)
    print(*exit_codes, elapsed)

kgtk query -i $P1000_QNODES $P31 $P279STAR         --match 'q: (node1)-[nodeProp]->(node2), P31: (node2)-[]->(nodex), P279star: (nodex)-[]->(par)'         --where 'par in ["Q1241356"]'         --return 'distinct node1 as `node1`, nodeProp.label as `label`, node2 as `node2`'         -o $P1000_CORRECT_TEMP_1
kgtk ifnotexists -i $P1000_QNODES         --filter-on $P1000_CORRECT_TEMP_1         --input-keys node1 node2         --filter-keys node1 node2         -o $P1000_INCORRECT_TEMP
kgtk query -i $P1000_INCORRECT_TEMP $P279STAR         --match 'i: (node1)-[nodeProp]->(node2), P279star: (node2)-[]->(par)'         --where 'par in ["Q1241356"]'         --return 'distinct node1 as `node1`, nodeProp.label as `label`, node2 as `node2`'         -o $P1000_CORRECT_TEMP_2
kgtk ifnotexists -i $P1000_INCORRECT_TEMP         --filter-on $P1000_CORRECT_TEMP_2         --input-keys node1 node2         --filter-keys node1 node2         -o $P1000_INCORRECT
kgtk cat -i $P1000_CORRECT_TEMP_1 $P1000_CORRECT_TEM

kgtk ifnotexists -i $P103_INCORRECT_TEMP         --filter-on $P103_CORRECT_TEMP_2         --input-keys node1 node2         --filter-keys node1 node2         -o $P103_INCORRECT
kgtk cat -i $P103_CORRECT_TEMP_1 $P103_CORRECT_TEMP_2         -o $P103_CORRECT
0 0 0 0 0 12.213000535964966
kgtk query -i $P1035_QNODES $P31 $P279STAR         --match 'q: (node1)-[nodeProp]->(node2), P31: (node2)-[]->(nodex), P279star: (nodex)-[]->(par)'         --where 'par in ["Q618779"]'         --return 'distinct node1 as `node1`, nodeProp.label as `label`, node2 as `node2`'         -o $P1035_CORRECT_TEMP_1
kgtk ifnotexists -i $P1035_QNODES         --filter-on $P1035_CORRECT_TEMP_1         --input-keys node1 node2         --filter-keys node1 node2         -o $P1035_INCORRECT_TEMP
kgtk query -i $P1035_INCORRECT_TEMP $P279STAR         --match 'i: (node1)-[nodeProp]->(node2), P279star: (node2)-[]->(par)'         --where 'par in ["Q618779"]'         --return 'distinct node1 as `node1`, nodeProp.label as `label`, 

kgtk query -i $P1050_INCORRECT_TEMP $P279STAR         --match 'i: (node1)-[nodeProp]->(node2), P279star: (node2)-[]->(par)'         --where 'par in ["Q12136", "Q1441305", "Q175854", "Q2057971", "Q796194", "Q808"]'         --return 'distinct node1 as `node1`, nodeProp.label as `label`, node2 as `node2`'         -o $P1050_CORRECT_TEMP_2
kgtk ifnotexists -i $P1050_INCORRECT_TEMP         --filter-on $P1050_CORRECT_TEMP_2         --input-keys node1 node2         --filter-keys node1 node2         -o $P1050_INCORRECT
kgtk cat -i $P1050_CORRECT_TEMP_1 $P1050_CORRECT_TEMP_2         -o $P1050_CORRECT
0 0 0 0 0 10.509939193725586
kgtk query -i $P106_QNODES $P31 $P279STAR         --match 'q: (node1)-[nodeProp]->(node2), P31: (node2)-[]->(nodex), P279star: (nodex)-[]->(par)'         --where 'par in ["Q12737077", "Q17305127", "Q192581", "Q2207288", "Q28640"]'         --return 'distinct node1 as `node1`, nodeProp.label as `label`, node2 as `node2`'         -o $P106_CORRECT_TEMP_1
kgtk ifnotexists -i 

kgtk ifnotexists -i $P1075_QNODES         --filter-on $P1075_CORRECT_TEMP_1         --input-keys node1 node2         --filter-keys node1 node2         -o $P1075_INCORRECT_TEMP
kgtk query -i $P1075_INCORRECT_TEMP $P279STAR         --match 'i: (node1)-[nodeProp]->(node2), P279star: (node2)-[]->(par)'         --where 'par in ["Q15632617", "Q215627", "Q28020127", "Q95074"]'         --return 'distinct node1 as `node1`, nodeProp.label as `label`, node2 as `node2`'         -o $P1075_CORRECT_TEMP_2
kgtk ifnotexists -i $P1075_INCORRECT_TEMP         --filter-on $P1075_CORRECT_TEMP_2         --input-keys node1 node2         --filter-keys node1 node2         -o $P1075_INCORRECT
kgtk cat -i $P1075_CORRECT_TEMP_1 $P1075_CORRECT_TEMP_2         -o $P1075_CORRECT
0 0 0 0 0 9.621049165725708
kgtk query -i $P1078_QNODES $P31 $P279STAR         --match 'q: (node1)-[nodeProp]->(node2), P31: (node2)-[]->(nodex), P279star: (nodex)-[]->(par)'         --where 'par in ["Q1127794"]'         --return 'distinct nod

kgtk cat -i $P113_CORRECT_TEMP_1 $P113_CORRECT_TEMP_2         -o $P113_CORRECT
0 0 0 0 0 9.4671311378479
kgtk query -i $P1136_QNODES $P31 $P279STAR         --match 'q: (node1)-[nodeProp]->(node2), P31: (node2)-[]->(nodex), P279star: (nodex)-[]->(par)'         --where 'par in ["Q5"]'         --return 'distinct node1 as `node1`, nodeProp.label as `label`, node2 as `node2`'         -o $P1136_CORRECT_TEMP_1
kgtk ifnotexists -i $P1136_QNODES         --filter-on $P1136_CORRECT_TEMP_1         --input-keys node1 node2         --filter-keys node1 node2         -o $P1136_INCORRECT_TEMP
kgtk query -i $P1136_INCORRECT_TEMP $P279STAR         --match 'i: (node1)-[nodeProp]->(node2), P279star: (node2)-[]->(par)'         --where 'par in ["Q5"]'         --return 'distinct node1 as `node1`, nodeProp.label as `label`, node2 as `node2`'         -o $P1136_CORRECT_TEMP_2
kgtk ifnotexists -i $P1136_INCORRECT_TEMP         --filter-on $P1136_CORRECT_TEMP_2         --input-keys node1 node2         --filter-keys

kgtk ifnotexists -i $P119_INCORRECT_TEMP         --filter-on $P119_CORRECT_TEMP_2         --input-keys node1 node2         --filter-keys node1 node2         -o $P119_INCORRECT
kgtk cat -i $P119_CORRECT_TEMP_1 $P119_CORRECT_TEMP_2         -o $P119_CORRECT
0 0 0 0 0 10.055366039276123
kgtk query -i $P1192_QNODES $P31 $P279STAR         --match 'q: (node1)-[nodeProp]->(node2), P31: (node2)-[]->(nodex), P279star: (nodex)-[]->(par)'         --where 'par in ["Q1067164", "Q16858238"]'         --return 'distinct node1 as `node1`, nodeProp.label as `label`, node2 as `node2`'         -o $P1192_CORRECT_TEMP_1
kgtk ifnotexists -i $P1192_QNODES         --filter-on $P1192_CORRECT_TEMP_1         --input-keys node1 node2         --filter-keys node1 node2         -o $P1192_INCORRECT_TEMP
kgtk query -i $P1192_INCORRECT_TEMP $P279STAR         --match 'i: (node1)-[nodeProp]->(node2), P279star: (node2)-[]->(par)'         --where 'par in ["Q1067164", "Q16858238"]'         --return 'distinct node1 as `node1`,

kgtk query -i $P123_INCORRECT_TEMP $P279STAR         --match 'i: (node1)-[nodeProp]->(node2), P279star: (node2)-[]->(par)'         --where 'par in ["Q1002697", "Q11032", "Q152416", "Q183888", "Q2085381", "Q2516866", "Q35127", "Q43229", "Q5"]'         --return 'distinct node1 as `node1`, nodeProp.label as `label`, node2 as `node2`'         -o $P123_CORRECT_TEMP_2
kgtk ifnotexists -i $P123_INCORRECT_TEMP         --filter-on $P123_CORRECT_TEMP_2         --input-keys node1 node2         --filter-keys node1 node2         -o $P123_INCORRECT
kgtk cat -i $P123_CORRECT_TEMP_1 $P123_CORRECT_TEMP_2         -o $P123_CORRECT
0 0 0 0 0 11.45438289642334
kgtk query -i $P126_QNODES $P31 $P279STAR         --match 'q: (node1)-[nodeProp]->(node2), P31: (node2)-[]->(nodex), P279star: (nodex)-[]->(par)'         --where 'par in ["Q14623646", "Q4164871", "Q43229", "Q5"]'         --return 'distinct node1 as `node1`, nodeProp.label as `label`, node2 as `node2`'         -o $P126_CORRECT_TEMP_1
kgtk ifnotexists 

kgtk cat -i $P1308_CORRECT_TEMP_1 $P1308_CORRECT_TEMP_2         -o $P1308_CORRECT
0 0 0 0 0 10.238984107971191
kgtk query -i $P131_QNODES $P31 $P279STAR         --match 'q: (node1)-[nodeProp]->(node2), P31: (node2)-[]->(nodex), P279star: (nodex)-[]->(par)'         --where 'par in ["Q3895768", "Q56061"]'         --return 'distinct node1 as `node1`, nodeProp.label as `label`, node2 as `node2`'         -o $P131_CORRECT_TEMP_1
kgtk ifnotexists -i $P131_QNODES         --filter-on $P131_CORRECT_TEMP_1         --input-keys node1 node2         --filter-keys node1 node2         -o $P131_INCORRECT_TEMP
kgtk query -i $P131_INCORRECT_TEMP $P279STAR         --match 'i: (node1)-[nodeProp]->(node2), P279star: (node2)-[]->(par)'         --where 'par in ["Q3895768", "Q56061"]'         --return 'distinct node1 as `node1`, nodeProp.label as `label`, node2 as `node2`'         -o $P131_CORRECT_TEMP_2
kgtk ifnotexists -i $P131_INCORRECT_TEMP         --filter-on $P131_CORRECT_TEMP_2         --input-keys node

kgtk ifnotexists -i $P1343_INCORRECT_TEMP         --filter-on $P1343_CORRECT_TEMP_2         --input-keys node1 node2         --filter-keys node1 node2         -o $P1343_INCORRECT
kgtk cat -i $P1343_CORRECT_TEMP_1 $P1343_CORRECT_TEMP_2         -o $P1343_CORRECT
0 0 0 0 0 20.19474172592163
kgtk query -i $P1344_QNODES $P31 $P279STAR         --match 'q: (node1)-[nodeProp]->(node2), P31: (node2)-[]->(nodex), P279star: (nodex)-[]->(par)'         --where 'par in ["Q1190554", "Q386724", "Q43229"]'         --return 'distinct node1 as `node1`, nodeProp.label as `label`, node2 as `node2`'         -o $P1344_CORRECT_TEMP_1
kgtk ifnotexists -i $P1344_QNODES         --filter-on $P1344_CORRECT_TEMP_1         --input-keys node1 node2         --filter-keys node1 node2         -o $P1344_INCORRECT_TEMP
kgtk query -i $P1344_INCORRECT_TEMP $P279STAR         --match 'i: (node1)-[nodeProp]->(node2), P279star: (node2)-[]->(par)'         --where 'par in ["Q1190554", "Q386724", "Q43229"]'         --return 'disti

kgtk ifnotexists -i $P1411_QNODES         --filter-on $P1411_CORRECT_TEMP_1         --input-keys node1 node2         --filter-keys node1 node2         -o $P1411_INCORRECT_TEMP
kgtk query -i $P1411_INCORRECT_TEMP $P279STAR         --match 'i: (node1)-[nodeProp]->(node2), P279star: (node2)-[]->(par)'         --where 'par in ["Q378427", "Q38033430", "Q4220917", "Q618779"]'         --return 'distinct node1 as `node1`, nodeProp.label as `label`, node2 as `node2`'         -o $P1411_CORRECT_TEMP_2
kgtk ifnotexists -i $P1411_INCORRECT_TEMP         --filter-on $P1411_CORRECT_TEMP_2         --input-keys node1 node2         --filter-keys node1 node2         -o $P1411_INCORRECT
kgtk cat -i $P1411_CORRECT_TEMP_1 $P1411_CORRECT_TEMP_2         -o $P1411_CORRECT
0 0 0 0 0 10.374099731445312
kgtk query -i $P1412_QNODES $P31 $P279STAR         --match 'q: (node1)-[nodeProp]->(node2), P31: (node2)-[]->(nodex), P279star: (nodex)-[]->(par)'         --where 'par in ["Q17376908"]'         --return 'distinct n

kgtk cat -i $P1433_CORRECT_TEMP_1 $P1433_CORRECT_TEMP_2         -o $P1433_CORRECT
0 0 0 0 0 9.973886728286743
kgtk query -i $P1434_QNODES $P31 $P279STAR         --match 'q: (node1)-[nodeProp]->(node2), P31: (node2)-[]->(nodex), P279star: (nodex)-[]->(par)'         --where 'par in ["Q3895768", "Q559618", "Q867335", "Q9134"]'         --return 'distinct node1 as `node1`, nodeProp.label as `label`, node2 as `node2`'         -o $P1434_CORRECT_TEMP_1
kgtk ifnotexists -i $P1434_QNODES         --filter-on $P1434_CORRECT_TEMP_1         --input-keys node1 node2         --filter-keys node1 node2         -o $P1434_INCORRECT_TEMP
kgtk query -i $P1434_INCORRECT_TEMP $P279STAR         --match 'i: (node1)-[nodeProp]->(node2), P279star: (node2)-[]->(par)'         --where 'par in ["Q3895768", "Q559618", "Q867335", "Q9134"]'         --return 'distinct node1 as `node1`, nodeProp.label as `label`, node2 as `node2`'         -o $P1434_CORRECT_TEMP_2
kgtk ifnotexists -i $P1434_INCORRECT_TEMP         --filter-

In [71]:
def _count_data_rows(env_var):
    """Return ``wc -l`` of the file named by shell variable ``env_var``, minus one.

    The path is expanded by the shell, so ``env_var`` must be set in the
    process environment. One is subtracted from the raw line count, presumably
    to exclude the TSV header row (confirm every input file has one).

    Raises:
        subprocess.CalledProcessError: if the shell command exits non-zero.
        ValueError: if the command output is not an integer.
    """
    wc_stdout = subprocess.check_output("wc -l < $%s" % env_var, shell=True)
    return int(wc_stdout.decode("utf-8").strip()) - 1


# For every mapped property, count the rows of its *_CORRECT, *_INCORRECT and
# *_QNODES files, check that correct + incorrect == total, and record the three
# counts in property_results_count[prop].
for prop in property_mapping:
    correct_lines = _count_data_rows("%s_CORRECT" % prop)
    incorrect_lines = _count_data_rows("%s_INCORRECT" % prop)
    qnode_lines = _count_data_rows("%s_QNODES" % prop)

    print("%s: correct %d; incorrect %d; total %d" % (prop, correct_lines, incorrect_lines, qnode_lines))

    # Validate before recording, so a failed check never leaves a property
    # with a partially updated count list.
    assert correct_lines + incorrect_lines == qnode_lines, "The sum is not correct!"

    # Write positions 1..3 in place instead of appending. On a first run this is
    # identical to three appends after the single leading entry; on a re-run it
    # overwrites the previous values, so the list keeps the four entries the
    # results-count DataFrame expects rather than growing by three each time.
    # NOTE(review): assumes at most one entry (the Wikidata result count) was
    # stored for the property before this cell -- confirm against earlier steps.
    property_results_count[prop][1:] = [correct_lines, incorrect_lines, qnode_lines]

P1000: correct 68; incorrect 287055; total 287123
P1001: correct 1898; incorrect 2029; total 3927
P1002: correct 439; incorrect 105; total 544
P1018: correct 667; incorrect 86; total 753
P102: correct 35310; incorrect 2363; total 37673
P1028: correct 15277; incorrect 456; total 15733
P1029: correct 82; incorrect 0; total 82
P103: correct 65552; incorrect 2637; total 68189
P1035: correct 8912; incorrect 8342; total 17254
P1037: correct 4764; incorrect 13; total 4777
P1038: correct 17704; incorrect 499; total 18203
P1040: correct 7478; incorrect 311; total 7789
P1041: correct 105; incorrect 73; total 178
P1049: correct 4754; incorrect 369; total 5123
P105: correct 6937; incorrect 64; total 7001
P1050: correct 1365; incorrect 1291; total 2656
P106: correct 25482; incorrect 24075; total 49557
P1064: correct 68; incorrect 27; total 95
P1066: correct 7984; incorrect 34; total 8018
P1068: correct 181; incorrect 53; total 234
P1071: correct 15424; incorrect 6023; total 21447
P1072: correct 12;

# Step 8 Output Statistics

In [62]:
import pandas as pd

## Runtime

In [60]:
# Build a per-property runtime table from property_runtime:
#   * lists shorter than seven entries are right-padded with zeros,
#   * entries 4 and 5 are then summed into a single value,
# which leaves six values per property when the list had seven slots.
# Only the (shallow) copy is rebound; property_runtime keeps its own lists.
runtime_copy = property_runtime.copy()
for prop in runtime_copy:
    timings = runtime_copy[prop]
    if len(timings) < 7:
        timings = list(timings) + [0] * (7 - len(timings))
    runtime_copy[prop] = timings[:4] + [timings[4] + timings[5]] + timings[6:]
runtime_copy

{'P1000': [5.1377081871032715,
  5.2808003425598145,
  226.55320620536804,
  11.906784534454346,
  44.8679838180542,
  37.007588624954224],
 'P1001': [4873.026188135147,
  3.8210620880126953,
  233.53115892410278,
  4.712164878845215,
  5.769415616989136,
  10.097424507141113],
 'P1002': [2.7174062728881836,
  3.3586645126342773,
  222.08623671531677,
  1.9388160705566406,
  5.0736775398254395,
  9.878424406051636],
 'P1018': [2.0767264366149902,
  3.3858444690704346,
  223.88510823249817,
  1.6232929229736328,
  5.868239164352417,
  9.787675142288208],
 'P102': [728.1263544559479,
  3.7279090881347656,
  228.79758763313293,
  5.048824787139893,
  7.862472295761108,
  11.096062183380127],
 'P1026': [2.774015426635742, 4.045220851898193, 0, 0, 0, 0],
 'P1028': [2.9343748092651367,
  4.011143684387207,
  221.43906807899475,
  2.2541825771331787,
  9.361197233200073,
  13.302943468093872],
 'P1029': [2.1095290184020996,
  3.9420437812805176,
  226.9954059123993,
  2.038606643676758,
  5.6

In [65]:
# One row per property, one column per pipeline stage; written out as CSV.
df = pd.DataFrame.from_dict(
    runtime_copy,
    orient='index',
    columns=[
        'Entity Resolution',
        'Property Mapping',
        'Query Wikidata Infobox',
        'Filter new results',
        'Datatype Filtering',
        'Quality Checking',
    ],
)
df.to_csv(
    '/nas/home/bohuizha/KG/hunger-for-knowledge/batch_output/runtime.csv',
    sep=',',
)

## Results count

In [74]:
# Build a per-property result-count table from property_results_count.
# Lists shorter than four entries are right-padded with zeros in the copy;
# lists that already have four or more entries are kept as they are.
count_copy = property_results_count.copy()
for prop in count_copy:
    counts = count_copy[prop]
    if len(counts) < 4:
        count_copy[prop] = list(counts) + [0] * (4 - len(counts))
count_copy

{'P1000': [290, 68, 287055, 287123],
 'P1001': [760787, 1898, 2029, 3927],
 'P1002': [825, 439, 105, 544],
 'P1018': [189, 667, 86, 753],
 'P102': [432427, 35310, 2363, 37673],
 'P1026': [7346, 0, 0, 0],
 'P1028': [4240, 15277, 456, 15733],
 'P1029': [2769, 82, 0, 82],
 'P103': [107210, 65552, 2637, 68189],
 'P1035': [522, 8912, 8342, 17254],
 'P1037': [13046, 4764, 13, 4777],
 'P1038': [33957, 17704, 499, 18203],
 'P1039': [45, 0, 0, 0],
 'P1040': [47592, 7478, 311, 7789],
 'P1041': [85, 105, 73, 178],
 'P1049': [1896, 4754, 369, 5123],
 'P105': [2892792, 6937, 64, 7001],
 'P1050': [236675, 1365, 1291, 2656],
 'P1057': [124, 0, 0, 0],
 'P106': [8273095, 25482, 24075, 49557],
 'P1060': [287, 0, 0, 0],
 'P1064': [23191, 68, 27, 95],
 'P1066': [56327, 7984, 34, 8018],
 'P1068': [406, 181, 53, 234],
 'P1071': [52640, 15424, 6023, 21447],
 'P1072': [5543, 12, 607, 619],
 'P1073': [2322, 34, 417, 451],
 'P1075': [1496, 94, 20, 114],
 'P1078': [75, 3, 151, 154],
 'P1079': [170, 481, 534, 101

In [75]:
# One row per property with its four result counts; written out as CSV.
df = pd.DataFrame.from_dict(
    count_copy,
    orient='index',
    columns=[
        'Wikidata results',
        'Found correct',
        'Found incorrect',
        'Found total (Qnodes)',
    ],
)
df.to_csv(
    '/nas/home/bohuizha/KG/hunger-for-knowledge/batch_output/results_count.csv',
    sep=',',
)