# Step 1 User Query

In [None]:
import os
import re
import time
import json
import subprocess
import pandas as pd

from tqdm import tqdm
from collections import defaultdict

## Define alias and variables

In [None]:
# Parameters

# Directory that holds the database files (the value ends with a slash)
data_path = "/nas/home/bohuizha/KG/hunger-for-knowledge/data/"

# File names from the KGTK Wikidata distribution that this notebook reads.
data_file_names = dict(
    claims="claims.tsv",
    wiki_info="wikidata_infobox.tsv",
    p31="P31.tsv",
    p279star="P279star.tsv",
    labels="labels.en.tsv",
    constraints="value_type_constraint.json",
)

# Full paths are exported as environment variables so that the shell commands
# issued later can refer to them as $CLAIMS, $WIKI_INFO, ...
os.environ['DATABASE'] = data_path
kgtk_environment_variables = ['DATABASE']

for alias, file_name in data_file_names.items():
    env_name = alias.upper()
    # data_path ends with "/", so join() yields exactly data_path + file_name
    os.environ[env_name] = os.path.join(data_path, file_name)
    kgtk_environment_variables.append(env_name)

# Echo every exported variable together with its value
for env_name in kgtk_environment_variables:
    print('{}: "{}"'.format(env_name, os.environ[env_name]))

**Method 1: query from Wikidata**
- property constraint (P2302)
- value-type constraint (Q21510865)

In [None]:
# property_list = !kgtk query -i $CLAIMS --match 'c: (q)-[p:P2302]->(v:Q21510865)' --return 'q' --limit 100
# property_list = property_list[1:5]

**Method 2: preprocess Kartik's work**

In [None]:
# Read the JSON file referenced by $CONSTRAINTS into property_dict
with open(os.environ['CONSTRAINTS']) as constraints_file:
    property_dict = json.loads(constraints_file.read())

In [None]:
# Per-property timing table: {property: {stage name: seconds}}; stages that were
# never recorded read as 0.0 because the inner mapping is a defaultdict(float).
property_runtime = {prop: defaultdict(float) for prop in property_dict}

In [None]:
# Per-property counters keyed by statistic name; missing statistics read as 0.
# One table counts result rows, the other counts distinct qnodes.
property_lines_count = {prop: defaultdict(int) for prop in property_dict}
property_qnodes_count = {prop: defaultdict(int) for prop in property_dict}

In [None]:
# Parameters

# Folder on local machine where to create the output and temporary folders
output_path = "/nas/home/bohuizha/KG/hunger-for-knowledge/batch_output/"
# exist_ok=True replaces the racy "check, then create" sequence
os.makedirs(output_path, exist_ok=True)

# One sub-folder per pipeline stage. Each folder path is exported as an
# environment variable named after the upper-cased folder name.
output_list = ['results', 'infobox_results', 'new_results', 'samples',
               'unknown', 'entity', 'class', 'query', 'agree',
               'direct_infer', 'indirect_infer', 'infer',
               'structured_literals', 'nodes', 'qnodes',
               'correct_temp_1', 'correct_temp_2', 'incorrect_temp',
               'correct', 'incorrect']
for folder_name in output_list:
    folder_path = os.path.join(output_path, folder_name)
    os.makedirs(folder_path, exist_ok=True)
    variable = folder_name.upper()
    os.environ[variable] = folder_path
    kgtk_environment_variables.append(variable)

os.environ['OUTPUT'] = output_path
kgtk_environment_variables.append('OUTPUT')

# Notebook-wide summary files, all stored directly under output_path.
# Deriving them from output_path keeps them in sync if the root is changed.
summary_file_names = {
    'PROPERTY_MAPPING': "property_mapping_full.json",
    'RUNTIME': "runtime.json",
    'LINE_STATISTICS': "line.statistics.json",
    'QNODE_STATISTICS': "qnode.statistics.json",
}
for variable, file_name in summary_file_names.items():
    os.environ[variable] = os.path.join(output_path, file_name)
    kgtk_environment_variables.append(variable)

# Per-property output files, exported as $<PROPERTY>_<STAGE>
for prop in property_dict.keys():
    output_file_names = {
        "%s_results" % prop: "results/results.%s.tsv" % prop,
        "%s_infobox_results" % prop: "infobox_results/infobox.results.%s.tsv" % prop,
        "%s_new_results" % prop: "new_results/new.results.%s.tsv" % prop,
        "%s_samples" % prop: "samples/samples.%s.tsv" % prop,
        "%s_agree" % prop: "agree/agree.%s.tsv" % prop,
        "%s_query_file" % prop: "query/query.file.%s.tsv" % prop,
        "%s_direct_infer" % prop: "direct_infer/direct.infer.%s.tsv" % prop,
        "%s_indirect_infer" % prop: "indirect_infer/indirect.infer.%s.tsv" % prop,
        "%s_infers" % prop: "infers/infers.%s.tsv" % prop,
        "%s_structured_literals" % prop: "structured_literals/structured.literals.%s.tsv" % prop,
        "%s_nodes" % prop: "nodes/nodes.%s.tsv" % prop,
        "%s_qnodes" % prop: "qnodes/qnodes.%s.tsv" % prop,
        "%s_correct_temp_1" % prop: "correct_temp_1/correct_temp_1.%s.tsv" % prop,
        "%s_correct_temp_2" % prop: "correct_temp_2/correct_temp_2.%s.tsv" % prop,
        "%s_incorrect_temp" % prop: "incorrect_temp/incorrect_temp.%s.tsv" % prop,
        "%s_correct" % prop: "correct/correct.%s.tsv" % prop,
        "%s_incorrect" % prop: "incorrect/incorrect.%s.tsv" % prop
    }
    # Currently unused outputs:
    # "%s_unknown" % prop: "unknown/unknown.%s.tsv" % prop,
    # "%s_class" % prop: "class/class.%s.tsv" % prop,
    # "%s_entity" % prop: "entity/entity.%s.tsv" % prop,
    for key, value in output_file_names.items():
        variable = key.upper()
        file_path = os.path.join(output_path, value)
        # Guarantee the parent folder exists: "infers/" is referenced above but
        # output_list only creates "infer", so it would otherwise be missing.
        os.makedirs(os.path.dirname(file_path), exist_ok=True)
        os.environ[variable] = file_path
        kgtk_environment_variables.append(variable)

# kgtk_environment_variables.sort()
# for variable in kgtk_environment_variables:
#     print("{}: \"{}\"".format(variable, os.environ[variable]))

# Step 2: Wikidata Results

Generate a property file for each property using `kgtk filter` (filtering on the property). **Note:** This has already been run in the background; because it is relatively time-consuming, it is better not to run it again.

In [None]:
# for prop, val in property_dict.items():
#     command = "kgtk filter -i $CLAIMS -p \" ; %s ; \" -o $%s_RESULTS" % (prop, prop) 
#     # print(command)
#     code = os.system(command)
#     print(code, prop) 

In [None]:
# code from Kartik
# Stream the claims file once and copy every row whose property is a key of
# property_dict into that property's own $<PROPERTY>_RESULTS file. Each output
# file is created lazily on the first matching row and starts with the header.
propFileDict = {}
try:
    with open(os.environ['CLAIMS'], 'r') as f:
        headerLine = next(f)
        for line in tqdm(f):
            lineP = line.rstrip().split("\t")
            # Skip blank or truncated rows instead of failing with IndexError
            if len(lineP) < 3:
                continue
            # Third column is treated as the property
            # (presumably the KGTK "label" column -- TODO confirm against the file header)
            prop = lineP[2]
            if prop in property_dict:
                if prop not in propFileDict:
                    propFileDict[prop] = open(os.environ["%s_RESULTS" % prop], "w")
                    propFileDict[prop].write(headerLine)
                propFileDict[prop].write(line)
finally:
    # Close every output handle even if reading or writing failed part-way,
    # so buffered rows are flushed and no file descriptors leak.
    for file1 in propFileDict.values():
        file1.close()

## Count known results in Wikidata database:

In [None]:
# Record, per property, how many rows and how many distinct qnodes the
# $<PROPERTY>_RESULTS file contains (statistic name: 'Known').
for prop in property_dict:
    results_file = os.environ['%s_RESULTS' % prop]
    # A missing or empty file means nothing is known for this property
    if not os.path.exists(results_file) or os.path.getsize(results_file) == 0:
        property_lines_count[prop]['Known'] = 0
        property_qnodes_count[prop]['Known'] = 0
        continue

    # Rows: line count minus the header line
    wc_output = subprocess.check_output("wc -l < $%s_RESULTS" % prop, shell=True)
    row_total = int(wc_output.decode("utf-8").strip()) - 1
    property_lines_count[prop]['Known'] = row_total

    # Distinct qnodes: second line of the query output (the first is the header)
    count_command = "kgtk query -i $%s_RESULTS --match '(qnode)-[]->()' --return 'count(distinct qnode)'" % prop
    query_output = subprocess.check_output(count_command, shell=True)
    qnode_total = int(query_output.decode("utf-8").strip().split('\n')[1])
    property_qnodes_count[prop]['Known'] = qnode_total
    print("%s -> %d" % (prop, qnode_total))

## Find unknown results in Wikidata database:

Find the most frequent class and substitute the entities of the class as the whole entity set we're going to query. **Note:** Currently we don't apply this step.

In [None]:
# for prop, val in property_dict.items():
#     command = "kgtk query -i $%s_RESULTS $P31 \
#         --match 'r: (entity)-[]->(), P31: (entity)-[]->(class)' \
#         --return 'distinct entity as node1, \"P31\" as label, class as node2' \
#         -o $%s_CLASS" % (prop, prop)
#     print(command)
#     start = time.time()
#     code = os.system(command)
#     runtime = time.time() - start
#     print(code, runtime)
#     property_runtime[prop].append(runtime)

In [None]:
# entity_class_map = dict()
# for prop, _ in property_dict.items():
#     command = "kgtk query -i $%s_CLASS \
#         --match 'c: ()-[]->(class)' \
#         --return 'class, count(class) as N' \
#         --order-by 'N desc' \
#         --limit 1" % prop
#     start = time.time()
#     output = subprocess.check_output(command, shell=True)
#     runtime = time.time() - start
#     property_runtime[prop].append(runtime)
#     output = output.decode("utf-8").strip().split()
#     print("%s" % prop, output, runtime)
#     if len(output) == 4:
#         entity_class_map[prop] = output[2]

In [None]:
# class_entity_map = dict()
# for key, value in entity_class_map.items():
#     class_entity_map[value] = []
# for key, value in entity_class_map.items():
#     class_entity_map[value].append(key)
# class_entity_map

- Find all entities

In [None]:
# for cls, prop in class_entity_map.items():
#     command = "kgtk filter -i $P31 -p \";; %s\" -o $%s_ENTITY" % (cls, prop[0])
#     print(command)
#     # If ${val} already in folder, no need to query
#     # if os.path.exists(os.environ['%s_ENTITY' % prop[0]]):
#     #     print(0)
#     #     runtime = 0
#     #     propperty_runtime[prop].append(runtime)
#     # else:
#     start = time.time()
#     code = os.system(command)
#     runtime = time.time() - start
#     property_runtime[prop[0]].append(runtime)
#     print(code, runtime)
#     if len(prop) > 1:
#         for p in prop[1:]:
#             os.system('cp $%s_ENTITY $%s_ENTITY' % (prop[0], p))
#             property_runtime[p].append(runtime)

In [None]:
# directly filter
# !kgtk query -i $%s_CLASS $P31 \
#         --match 'c: (n)-[]->(class), P31: (entity)-[]->(class)' \
#         --where 'n != entity' \
#         --return 'distinct entity as node1, \"P31\" as label, class as node2' \
#         -o $%s_ENTITY" % (prop, prop)

- Eliminate entities who have known property / properties values

In [None]:
# for prop, val in property_dict.items():
#     command = "kgtk ifnotexists -i $%s_ENTITY \
#         --filter-on $%s_RESULTS \
#         --input-keys node1 \
#         --filter-keys node1 \
#         -o $%s_QUERY_FILE" % (prop, prop, prop)
#     print(command)
#     # if os.path.exists(os.environ['%s_QUERY_FILE' % prop[0]]):
#     #     print(0)
#     # else:
#     #     code = os.system(command)
#     #     print(code)
#     start = time.time()
#     code = os.system(command)
#     runtime = time.time() - start
#     property_runtime[prop].append(runtime)
#     print(code, runtime)

## Count unknown results

- rows

In [None]:
# for prop, val in property_dict.items():
#     output = subprocess.check_output("wc -l < $%s_QUERY_FILE" % prop, shell=True)
#     output = output.decode("utf-8").strip()
#     output = int(output) - 1
#     print("%s: %s" % (prop, output))

- entities

In [None]:
# for prop, val in property_dict.items():
#     command = "kgtk query -i $%s_QUERY_FILE \
#         --match '(p)-[]->()' \
#         --return 'count(distinct p) as N'" % prop
#     output = subprocess.check_output(command, shell=True)
#     output = output.decode("utf-8").strip().split()[-1]
#     print("%s: %s" % (prop, output))

# Step 3 Selection of Additional KG(s)

Currently we use Wikidata Infobox generated from DBpedia.

# Step 4 Schema Alignment

## Entity resolution

Use query results from Wikidata database to infer properties in Wikidata infobox and return the most frequent property.

**Direct Infer:** Query for qnode;

In [None]:
# Copy at most 200000 edges of every property's results file into its samples file
for prop in property_dict:
    sample_command = "kgtk query -i $%s_RESULTS --match '()-[]->()' --limit 200000 -o $%s_SAMPLES" % (prop, prop)
    code = os.system(sample_command)

In [None]:
# For every property, join its sampled edges with the infobox edges on
# (entity, value) and write the matching infobox edges to $<PROPERTY>_DIRECT_INFER.
# The elapsed wall-clock time is stored under 'Entity Resolution'.
for prop in property_dict:
    infer_command = "kgtk query -i $%s_SAMPLES -i $WIKI_INFO \
        --match 's: (entity)-[]->(v), w: (entity)-[p]->(v)' \
        --return 'entity, p.label, v as node2' \
        -o $%s_DIRECT_INFER" % (prop, prop)
    started = time.time()
    exit_code = os.system(infer_command)
    elapsed = time.time() - started
    property_runtime[prop]['Entity Resolution'] = elapsed
    print(prop, exit_code, elapsed)

## Property mapping

In [None]:
# {Wikidata property: DBpedia property}, e.g. {'P452': 'property:industry'}
# For each property, pick the infobox label that occurs most often in its
# direct-infer file; the query time is stored under 'Property Mapping'.
property_mapping = {}
for prop in property_dict:
    infer_file = os.environ['%s_DIRECT_INFER' % prop]
    # Nothing to map when the direct-infer file is missing or empty
    if not os.path.exists(infer_file) or os.path.getsize(infer_file) == 0:
        continue
    top_label_command = "kgtk query -i $%s_DIRECT_INFER \
        --match '(q)-[p]->(v)' \
        --return 'p.label, count(v) as N' \
        --order-by 'N desc' \
        --limit 1" % prop
    started = time.time()
    raw_output = subprocess.check_output(top_label_command, shell=True)
    elapsed = time.time() - started
    property_runtime[prop]['Property Mapping'] = elapsed
    tokens = raw_output.decode("utf-8").strip().split()
    print(tokens, elapsed)
    # Exactly four whitespace-separated tokens are expected:
    # two header cells followed by the label and its count.
    if len(tokens) == 4:
        best_label = tokens[2]
        property_mapping[prop] = best_label
        print("%s -> %s" % (prop, best_label))

In [None]:
# Persist property_mapping as indented JSON at $PROPERTY_MAPPING
with open(os.environ['PROPERTY_MAPPING'], 'w+') as mapping_file:
    json.dump(property_mapping, mapping_file, indent=4)

In [None]:
# Build the reverse mapping {DBpedia property: [Wikidata properties]}.
# Several Wikidata properties may share one DBpedia property, hence the lists.
# assert len(set(property_mapping.keys())) == len(set(property_mapping.values())), 'Not one-to-one mapping!'
wikidata_total = len(property_mapping)
dbpedia_total = len(set(property_mapping.values()))
print('Number of Wikidata properties: %d' % wikidata_total)
print('Number of DBpedia properties: %d' % dbpedia_total)

property_mapping_inverted = defaultdict(list)
for wikidata_prop, dbpedia_prop in property_mapping.items():
    property_mapping_inverted[dbpedia_prop].append(wikidata_prop)

# Step 5 Results from other KG(s)

For entities that do not have a value for the property, query the Wikidata infobox:

In [None]:
# Stream the infobox file once and copy every row whose property is mapped to
# one or more Wikidata properties into each of those properties'
# $<PROPERTY>_INFOBOX_RESULTS files. Output files are created lazily on the
# first matching row and start with the header.
DBpediaResultsFileDict = {}
try:
    with open(os.environ['WIKI_INFO'], 'r') as f:
        headerLine = next(f)
        for line in tqdm(f):
            lineP = line.rstrip().split("\t")
            # Skip blank or truncated rows instead of failing with IndexError
            if len(lineP) < 2:
                continue
            # Second column is treated as the infobox property
            # (presumably node1/label/node2 layout -- TODO confirm against the file header)
            dbpedia_prop = lineP[1]
            if dbpedia_prop in property_mapping_inverted:
                for wiki_prop in property_mapping_inverted[dbpedia_prop]:
                    if wiki_prop not in DBpediaResultsFileDict:
                        DBpediaResultsFileDict[wiki_prop] = open(os.environ["%s_INFOBOX_RESULTS" % wiki_prop], "w")
                        DBpediaResultsFileDict[wiki_prop].write(headerLine)
                    DBpediaResultsFileDict[wiki_prop].write(line)
finally:
    # Close every output handle even if reading or writing failed part-way,
    # so buffered rows are flushed and no file descriptors leak.
    for file1 in DBpediaResultsFileDict.values():
        file1.close()

## Filter new results found from additional KG

In [None]:
# Keep only the infobox rows whose (node1, node2) pair is absent from the
# property's Wikidata results, writing them to $<PROPERTY>_NEW_RESULTS.
# The elapsed time is stored under 'Knowledge Retrieval'.
for prop in property_mapping:
    infobox_file = os.environ['%s_INFOBOX_RESULTS' % prop]
    # Skip properties whose infobox results are missing or empty
    if not os.path.exists(infobox_file) or os.path.getsize(infobox_file) == 0:
        continue
    new_results_command = "kgtk ifnotexists -i $%s_INFOBOX_RESULTS \
        --filter-on $%s_RESULTS \
        --input-keys node1 node2 \
        --filter-keys node1 node2 \
        -o $%s_NEW_RESULTS" % (prop, prop, prop)
    started = time.time()
    exit_code = os.system(new_results_command)
    elapsed = time.time() - started
    property_runtime[prop]['Knowledge Retrieval'] = elapsed
    print(prop, exit_code, elapsed)

- Count rows of new findings:

In [None]:
# for prop, val in property_mapping.items():
#     output = subprocess.check_output("wc -l < $%s_NEW_RESULTS" % prop, shell=True)
#     output = output.decode("utf-8").strip()
#     output = int(output) - 1
#     print("%s: %s" % (prop, output))

- Count unique entities of new findings:

In [None]:
# for prop, val in property_mapping.items():
#     command = "kgtk query -i $%s_NEW_RESULTS \
#         --match 'n: (p)-[]->()' \
#         --return 'count(distinct p) as N'" % prop
#     output = subprocess.check_output(command, shell=True)
#     output = output.decode("utf-8").strip().split()[1]
#     # print("%s: %s" % (prop, output))

## Filter out entities (rows) we still don't know

In [None]:
# for prop, val in property_mapping.items():
#     command = "kgtk ifnotexists -i $%s_QUERY_FILE \
#         --filter-on $%s_NEW_RESULTS \
#         --input-keys node1 \
#         --filter-keys node1 \
#         -o $%s_UNKNOWN" % (prop, prop, prop)
#     print(command)
#     start = time.time()
#     code = os.system(command)
#     runtime = time.time() - start
#     property_runtime[prop].append(runtime)
#     print(code, runtime)

- Count rows still unknown

In [None]:
# for prop, val in property_mapping.items():
#     output = subprocess.check_output("wc -l < $%s_UNKNOWN" % prop, shell=True)
#     output = output.decode("utf-8").strip()
#     output = int(output) - 1
#     print("%s: %s" % (prop, output))

- Count entities still unknown

In [None]:
# for prop, val in property_mapping.items():
#     command = "kgtk query -i $%s_UNKNOWN \
#         --match 'n: (p)-[]->()' \
#         --return 'count(distinct p) as N'" % prop
#     output = subprocess.check_output(command, shell=True)
#     output = output.decode("utf-8").strip().split()[1]
#     print("%s: %s" % (prop, output))

# Step 6 Datatype Filtering

## 1. Filter Structured literals:

In [None]:
# From the new results, select rows whose value is neither a language-qualified
# string nor a number and which has a "dbpedia:structured_value" edge in the
# infobox file; write them to $<PROPERTY>_STRUCTURED_LITERALS.
# The elapsed time is stored under 'Datatype Filtering'.
for prop in property_mapping:
    new_results_file = os.environ['%s_NEW_RESULTS' % prop]
    # Skip properties without any new results
    if not os.path.exists(new_results_file) or os.path.getsize(new_results_file) == 0:
        continue
    literals_command = "kgtk query -i $%s_NEW_RESULTS -i $WIKI_INFO \
        --match 'n: (q)-[p]->(s), w: (s)-[sv]->(v)' \
        --where 'NOT kgtk_lqstring(s) AND NOT kgtk_number(s) AND sv.label = \"dbpedia:structured_value\"' \
        --return 'q, p.label, s' \
        -o $%s_STRUCTURED_LITERALS" % (prop, prop)
    started = time.time()
    exit_code = os.system(literals_command)
    elapsed = time.time() - started
    property_runtime[prop]['Datatype Filtering'] = elapsed
    print(prop, exit_code, elapsed)

## 2. Filter Qnodes

In [None]:
# Two steps per property:
#   1. select new results whose value is neither a language-qualified string
#      nor a number                                   -> $<PROPERTY>_NODES
#   2. drop the rows that also appear in the structured literals file
#                                                     -> $<PROPERTY>_QNODES
# The combined time is added to the 'Datatype Filtering' entry.
for prop in property_mapping:
    new_results_file = os.environ['%s_NEW_RESULTS' % prop]
    # Skip properties without any new results
    if not os.path.exists(new_results_file) or os.path.getsize(new_results_file) == 0:
        continue
    select_nodes_command = "kgtk query -i $%s_NEW_RESULTS \
        --match 'n: (q)-[p]->(v)' \
        --where 'NOT kgtk_lqstring(v) AND NOT kgtk_number(v)' \
        --return 'distinct q, p.label, v' \
        -o $%s_NODES" % (prop, prop)
    drop_literals_command = "kgtk ifnotexists -i $%s_NODES \
        --filter-on $%s_STRUCTURED_LITERALS \
        -o $%s_QNODES" % (prop, prop, prop)
    started = time.time()
    select_code = os.system(select_nodes_command)
    drop_code = os.system(drop_literals_command)
    elapsed = time.time() - started
    property_runtime[prop]['Datatype Filtering'] += elapsed
    print(prop, select_code, drop_code, elapsed)

In [None]:
# Record, per mapped property, how many rows and how many distinct qnodes the
# $<PROPERTY>_QNODES file contains.
for prop in property_mapping:
    stat_name = 'Found Total (Qnodes)'
    qnodes_file = os.environ['%s_QNODES' % prop]
    # A missing or empty file counts as zero findings
    if not os.path.exists(qnodes_file) or os.path.getsize(qnodes_file) == 0:
        property_lines_count[prop][stat_name] = 0
        property_qnodes_count[prop][stat_name] = 0
        continue

    # Rows: line count minus the header line
    wc_output = subprocess.check_output("wc -l < $%s_QNODES" % prop, shell=True)
    row_total = int(wc_output.decode("utf-8").strip()) - 1
    property_lines_count[prop][stat_name] = row_total
    print("%s -> %d" % (prop, row_total))

    # Distinct qnodes: second line of the query output (the first is the header)
    count_command = "kgtk query -i $%s_QNODES --match '(qnode)-[]->()' --return 'count(distinct qnode)'" % prop
    query_output = subprocess.check_output(count_command, shell=True)
    property_qnodes_count[prop][stat_name] = int(query_output.decode("utf-8").strip().split('\n')[1])

# Step 7 Quality Checking

Agree:

In [None]:
# For every mapped property, write the (qnode, value) pairs that occur in both
# the Wikidata results and the infobox results to $<PROPERTY>_AGREE.
for prop in property_mapping:
    started = time.time()
    agree_command = "kgtk query -i $%s_RESULTS $%s_INFOBOX_RESULTS \
        --match 'r: (qnode)-[p]->(value), i: (qnode)-[]->(value)' \
        --return 'qnode as `node1`, p.label as `label`, value as `node2`' \
        -o $%s_AGREE" % (prop, prop, prop)
    exit_code = os.system(agree_command)
    elapsed = time.time() - started
    print(prop, exit_code, elapsed)

In [None]:
# Record, per mapped property, how many rows and how many distinct qnodes the
# $<PROPERTY>_AGREE file contains (statistic name: 'Agree').
for prop in property_mapping:
    agree_file = os.environ['%s_AGREE' % prop]
    # A missing or empty file counts as zero agreements
    if not os.path.exists(agree_file) or os.path.getsize(agree_file) == 0:
        property_lines_count[prop]['Agree'] = 0
        property_qnodes_count[prop]['Agree'] = 0
        continue

    # Rows: line count minus the header line
    wc_output = subprocess.check_output("wc -l < $%s_AGREE" % prop, shell=True)
    row_total = int(wc_output.decode("utf-8").strip()) - 1
    property_lines_count[prop]['Agree'] = row_total
    print("%s -> %d" % (prop, row_total))

    # Distinct qnodes: second line of the query output (the first is the header)
    count_command = "kgtk query -i $%s_AGREE --match '(qnode)-[]->()' --return 'count(distinct qnode)'" % prop
    query_output = subprocess.check_output(count_command, shell=True)
    property_qnodes_count[prop]['Agree'] = int(query_output.decode("utf-8").strip().split('\n')[1])

Semantic Checking:

In [None]:
# Semantic validation of the candidate qnode rows of every mapped property.
# Five shell commands run in sequence; each one reads the file written by an
# earlier one, so their order must not change:
#   1. rows whose value is an instance (P31) of a class having an allowed
#      parent via P279star                     -> CORRECT_TEMP_1
#   2. candidate rows not in CORRECT_TEMP_1    -> INCORRECT_TEMP
#   3. of those, rows whose value itself has an allowed parent via P279star
#                                              -> CORRECT_TEMP_2
#   4. INCORRECT_TEMP rows not in CORRECT_TEMP_2 -> INCORRECT
#   5. CORRECT_TEMP_1 + CORRECT_TEMP_2         -> CORRECT
# The total time is stored under 'Semantic Validation'.
for prop in property_mapping:
    started = time.time()
    # NOTE(review): property_dict[prop] is interpolated verbatim into a
    # single-quoted shell argument; this assumes its string form is a valid
    # list expression for the --where clause -- confirm against the JSON file.
    allowed_parents = property_dict[prop]

    instance_check_command = "kgtk query -i $%s_QNODES $P31 $P279STAR \
        --match 'q: (node1)-[nodeProp]->(node2), P31: (node2)-[]->(nodex), P279star: (nodex)-[]->(par)' \
        --where 'par in %s' \
        --return 'distinct node1 as `node1`, nodeProp.label as `label`, node2 as `node2`' \
        -o $%s_CORRECT_TEMP_1" % (prop, allowed_parents, prop)
    instance_check_code = os.system(instance_check_command)

    remainder_command = "kgtk ifnotexists -i $%s_QNODES \
        --filter-on $%s_CORRECT_TEMP_1 \
        --input-keys node1 node2 \
        --filter-keys node1 node2 \
        -o $%s_INCORRECT_TEMP" % (prop, prop, prop)
    remainder_code = os.system(remainder_command)

    subclass_check_command = "kgtk query -i $%s_INCORRECT_TEMP $P279STAR \
        --match 'i: (node1)-[nodeProp]->(node2), P279star: (node2)-[]->(par)' \
        --where 'par in %s' \
        --return 'distinct node1 as `node1`, nodeProp.label as `label`, node2 as `node2`' \
        -o $%s_CORRECT_TEMP_2" % (prop, allowed_parents, prop)
    subclass_check_code = os.system(subclass_check_command)

    incorrect_command = "kgtk ifnotexists -i $%s_INCORRECT_TEMP \
        --filter-on $%s_CORRECT_TEMP_2 \
        --input-keys node1 node2 \
        --filter-keys node1 node2 \
        -o $%s_INCORRECT" % (prop, prop, prop)
    incorrect_code = os.system(incorrect_command)

    merge_command = "kgtk cat -i $%s_CORRECT_TEMP_1 $%s_CORRECT_TEMP_2 \
        -o $%s_CORRECT" % (prop, prop, prop)
    merge_code = os.system(merge_command)

    elapsed = time.time() - started
    property_runtime[prop]['Semantic Validation'] = elapsed
    print(prop, instance_check_code, remainder_code, subclass_check_code,
          incorrect_code, merge_code, elapsed)

In [None]:
# Record, per mapped property, how many rows and how many distinct qnodes the
# $<PROPERTY>_CORRECT file contains (statistic name: 'Correct').
for prop in property_mapping.keys():
    if not os.path.exists(os.environ['%s_CORRECT' % prop]):
        property_lines_count[prop]['Correct'] = 0
        property_qnodes_count[prop]['Correct'] = 0
        continue
    if os.path.getsize(os.environ['%s_CORRECT' % prop]) == 0:
        property_lines_count[prop]['Correct'] = 0
        property_qnodes_count[prop]['Correct'] = 0
        continue
    # count in lines (the header line is not a result row)
    # Fixed: the call was misspelled "checka_output", which raised
    # AttributeError for every property with a non-empty CORRECT file.
    lines = subprocess.check_output("wc -l < $%s_CORRECT" % prop, shell=True)
    lines = lines.decode("utf-8").strip()
    lines = int(lines) - 1
    property_lines_count[prop]['Correct'] = lines
    print("%s -> %d" % (prop, lines))
    # count in qnodes: second line of the query output (the first is the header)
    command = "kgtk query -i $%s_CORRECT --match '(qnode)-[]->()' --return 'count(distinct qnode)'" % prop
    nodes = subprocess.check_output(command, shell=True)
    nodes = nodes.decode("utf-8").strip().split('\n')[1]
    nodes = int(nodes)
    property_qnodes_count[prop]['Correct'] = nodes

In [None]:
# Record, per mapped property, how many rows and how many distinct qnodes the
# $<PROPERTY>_INCORRECT file contains (statistic name: 'Incorrect').
for prop in property_mapping:
    incorrect_file = os.environ['%s_INCORRECT' % prop]
    # A missing or empty file counts as zero incorrect rows
    if not os.path.exists(incorrect_file) or os.path.getsize(incorrect_file) == 0:
        property_lines_count[prop]['Incorrect'] = 0
        property_qnodes_count[prop]['Incorrect'] = 0
        continue

    # Rows: line count minus the header line
    wc_output = subprocess.check_output("wc -l < $%s_INCORRECT" % prop, shell=True)
    row_total = int(wc_output.decode("utf-8").strip()) - 1
    property_lines_count[prop]['Incorrect'] = row_total
    print("%s -> %d" % (prop, row_total))

    # Distinct qnodes: second line of the query output (the first is the header)
    count_command = "kgtk query -i $%s_INCORRECT --match '(qnode)-[]->()' --return 'count(distinct qnode)'" % prop
    query_output = subprocess.check_output(count_command, shell=True)
    property_qnodes_count[prop]['Incorrect'] = int(query_output.decode("utf-8").strip().split('\n')[1])

In [None]:
# Sanity check: for every mapped property the correct and incorrect rows must
# add up to the candidate qnode rows. Each count is the file's line count
# minus its header line.
for prop in property_mapping:
    row_totals = []
    for stage in ("CORRECT", "INCORRECT", "QNODES"):
        wc_output = subprocess.check_output("wc -l < $%s_%s" % (prop, stage), shell=True)
        row_totals.append(int(wc_output.decode("utf-8").strip()) - 1)
    correct_lines, incorrect_lines, qnode_lines = row_totals

    print("%s: correct %d; incorrect %d; total %d" % (prop, correct_lines, incorrect_lines, qnode_lines))

    assert correct_lines + incorrect_lines == qnode_lines, "The sum is not correct!"

# Step (8). Output statistics

## Runtime

In [None]:
# Write the per-property, per-stage runtimes to $RUNTIME as indented UTF-8 JSON.
# Earlier CSV export, kept for reference:
# df = pd.DataFrame.from_dict(runtime_copy, orient='index', 
#                             columns=['Entity Resolution', 'Property Mapping', 'Query Wikidata Infobox', 'Filter new results', 'Datatype Filtering', 'Quality Checking'])
# df.to_csv('/nas/home/bohuizha/KG/hunger-for-knowledge/batch_output/runtime.csv', sep=',')
runtime_path = os.environ['RUNTIME']
with open(runtime_path, 'w', encoding='utf-8') as runtime_file:
    json.dump(property_runtime, runtime_file, ensure_ascii=False, indent=4)

## Results count

In [None]:
# Write the per-property row counts to $LINE_STATISTICS as indented UTF-8 JSON
line_statistics_path = os.environ['LINE_STATISTICS']
with open(line_statistics_path, 'w', encoding='utf-8') as statistics_file:
    json.dump(property_lines_count, statistics_file, ensure_ascii=False, indent=4)

In [None]:
# Write the per-property distinct-qnode counts to $QNODE_STATISTICS as indented UTF-8 JSON
qnode_statistics_path = os.environ['QNODE_STATISTICS']
with open(qnode_statistics_path, 'w', encoding='utf-8') as statistics_file:
    json.dump(property_qnodes_count, statistics_file, ensure_ascii=False, indent=4)