# Step 0 Set up `kgtk`
Check out the `dev` branch of the `kgtk` GitHub repository and install it.

# Step 1 User Query

Example description: find the industry of companies.

In [61]:
import os

from kgtk.functions import kgtk

### Define alias and variables

In [62]:
# Parameters

# Folder where the database (input) files are stored
data_path = "/nas/home/bohuizha/KG/hunger-for-knowledge/data/"

# Folder on the local machine where the output and temporary folders are created
output_path = "/nas/home/bohuizha/KG/hunger-for-knowledge/output/P452/"

# Sub-folder of output_path that holds the kypher cache when cache_path is empty.
# NOTE(review): this name was referenced below but never defined, so the
# fallback branch could only raise NameError; "temp" is a placeholder — adjust.
temp_folder = "temp"

# Location of the cache database for kypher
cache_path = "/nas/home/bohuizha/KG/hunger-for-knowledge/kypher"
# Whether to delete the cache database (not read anywhere in this cell)
delete_database = False

# Names of the environment variables set by this cell, in definition order.
# The list must exist before the first append so that the cell runs on a
# fresh kernel (Restart Kernel -> Run All).
kgtk_environment_variables = []

# Label file of Wikidata
os.environ['KGTK_LABEL_FILE'] = data_path + "labels.en.tsv"
kgtk_environment_variables.append('KGTK_LABEL_FILE')

# Create the output folder, including missing parents; no error if it already exists
os.makedirs(output_path, exist_ok=True)

# The names of files in the KGTK Wikidata distribution that we will use in this notebook.
data_file_names = {
    "claims": "claims.tsv",
    "wiki_info": "wikidata_infobox.tsv",
    "p31": "P31.tsv",
    "p279star": "P279star.tsv",
    "labels": "labels.en.tsv"
}

# Environment variables hold the full paths to the files because the shell
# commands in the following cells refer to them as $CLAIMS, $P31, ...
os.environ['DATABASE'] = data_path
kgtk_environment_variables.append('DATABASE')

# One upper-cased variable per input file, e.g. "claims" -> $CLAIMS
for key, value in data_file_names.items():
    variable = key.upper()
    os.environ[variable] = data_path + value
    kgtk_environment_variables.append(variable)

# Files written by this notebook; all of them live under output_path
output_file_names = {
    "results": "industry_of_company.tsv",
    "new_results": "new_industry_of_company.tsv",
    "infobox_results": "all_industry_of_company.tsv",
    "entity": "company.tsv",
    "query_file": "company_wo_industry.tsv",
    "direct_infer": "direct_infer.tsv",
    "indirect_infer": "indirect_infer.tsv",
    "infers": "infers.tsv",
    "numbers": "numbers.tsv",
    "strings": "strings.tsv",
    "empty_strings": "empty_strings.tsv",
    "pure_empty": "pure_empty.tsv",
    "non_empty": "non_empty_strings.tsv",
    "structured_literals": "structured_literals.tsv",
    "nodes": "nodes.tsv",
    "qnodes": "qnodes.tsv",
    "correct_temp_1": "industry.type-constraints.instanceOf.correct_temp_1.tsv",
    "correct_temp_2": "industry.type-constraints.instanceOf.correct_temp_2.tsv",
    "incorrect_temp": "industry.type-constraints.instanceOf.incorrect_temp.tsv",
    "qnodes_correct": "industry.type-constraints.instanceOf.correct.tsv",
    "qnodes_incorrect": "industry.type-constraints.instanceOf.incorrect.tsv",
    "correct": "correct.tsv",
    "incorrect": "incorrect.tsv"
}

os.environ['OUTPUT'] = output_path
kgtk_environment_variables.append('OUTPUT')

# One upper-cased variable per output file, e.g. "query_file" -> $QUERY_FILE
for key, value in output_file_names.items():
    variable = key.upper()
    os.environ[variable] = output_path + value
    kgtk_environment_variables.append(variable)

# KGTK creates a SQLite database to index the knowledge graph.
if cache_path:
    os.environ['STORE'] = "{}/wikidata.sqlite3.db".format(cache_path)
else:
    # Fall back to a cache inside the output folder; make sure the folder exists
    # so that the database file can be created there.
    store_dir = "{}/{}".format(output_path, temp_folder)
    os.makedirs(store_dir, exist_ok=True)
    os.environ['STORE'] = "{}/wikidata.sqlite3.db".format(store_dir)
kgtk_environment_variables.append('STORE')

# Environment variables with shortcuts to the commands we use often.
# For debugging, the following variant can be used instead, but careful as it
# causes import to dataframes to break:
# os.environ['kypher'] = "time kgtk --debug query --graph-cache " + os.environ['STORE']
os.environ['kypher'] = "kgtk query --graph-cache " + os.environ['STORE']
kgtk_environment_variables.append('kypher')

# Echo every variable that was set so the configuration is visible in the output
for variable in kgtk_environment_variables:
    print("{}: \"{}\"".format(variable, os.environ[variable]))

DATABASE: "/nas/home/bohuizha/KG/hunger-for-knowledge/data/"
CLAIMS: "/nas/home/bohuizha/KG/hunger-for-knowledge/data/claims.tsv"
WIKI_INFO: "/nas/home/bohuizha/KG/hunger-for-knowledge/data/wikidata_infobox.tsv"
P31: "/nas/home/bohuizha/KG/hunger-for-knowledge/data/P31.tsv"
P279STAR: "/nas/home/bohuizha/KG/hunger-for-knowledge/data/P279star.tsv"
LABELS: "/nas/home/bohuizha/KG/hunger-for-knowledge/data/labels.en.tsv"
OUTPUT: "/nas/home/bohuizha/KG/hunger-for-knowledge/output/P452/"
RESULTS: "/nas/home/bohuizha/KG/hunger-for-knowledge/output/P452/industry_of_company.tsv"
NEW_RESULTS: "/nas/home/bohuizha/KG/hunger-for-knowledge/output/P452/new_industry_of_company.tsv"
INFOBOX_RESULTS: "/nas/home/bohuizha/KG/hunger-for-knowledge/output/P452/all_industry_of_company.tsv"
ENTITY: "/nas/home/bohuizha/KG/hunger-for-knowledge/output/P452/company.tsv"
QUERY_FILE: "/nas/home/bohuizha/KG/hunger-for-knowledge/output/P452/company_wo_industry.tsv"
DIRECT_INFER: "/nas/home/bohuizha/KG/hunger-for-knowle

# Step 2: Wikidata Results

In [3]:
# SPARQL query: 
# SELECT DISTINCT ?companyLabel ?industryLabel 
# WHERE
# { 
#   ?company wdt:P31/wdt:P279*  wd:Q783794 ;
#            wdt:P452  ?industry . 
#   SERVICE wikibase:label { bd:serviceParam wikibase:language "en". } 
# } 
# where `P31` means "instance of", `P279` means "subclass of", `Q783794` means "company", `P452` means "industry";

In [4]:
!kgtk query -i $CLAIMS -i $P31 -i $P279STAR -i $LABELS \
            --match 'P31: (company)-[]->(class), P279star: (class)-[]->(:Q783794), claims: (company)-[p:P452]->(industry), labels: (industry)-[]->(industry_label)' \
            --return 'company as node1, "P452" as label, industry_label as node2' \
            --limit 10

node1	label	node2
Q1718083	P452	'phonographic industry'@en
Q23777331	P452	'media industry'@en
Q11502827	P452	'public transport'@en
Q11584534	P452	'public transport'@en
Q11589421	P452	'public transport'@en
Q11636491	P452	'public transport'@en
Q11991861	P452	'public transport'@en
Q1433158	P452	'public transport'@en
Q16098651	P452	'intercity bus service'@en
Q16927508	P452	'transport'@en


In [5]:
!kgtk query -i $CLAIMS -i $P31 -i $P279STAR \
            --match 'P31: (company)-[]->(class), P279star: (class)-[]->(:Q783794), claims: (company)-[p:P452]->(industry)' \
            --return 'company as node1, "P452" as label, industry as node2' \
            -o $RESULTS

Check head of the results:

In [6]:
!head $RESULTS | column -ts $'\t'

node1      label  node2
Q1718083   P452   Q10302058
Q23777331  P452   Q56611639
Q11502827  P452   Q178512
Q11584534  P452   Q178512
Q11589421  P452   Q178512
Q11636491  P452   Q178512
Q11991861  P452   Q178512
Q1433158   P452   Q178512
Q16098651  P452   Q493016


Check if Wikidata results have strings:

In [7]:
!kgtk query -i $RESULTS \
    --match '(q)-[p]->(v)' \
    --where 'kgtk_lqstring(v)' \
    --return 'q, p.label, v'

node1	label	node2


### Count known results in Wikidata database:

Count entity-value pairs / **rows** (subtract 1 from the result to exclude the header line):

In [8]:
!wc -l $RESULTS

19342 /nas/home/bohuizha/KG/hunger-for-knowledge/output/P452/industry_of_company.tsv


Count how many **unique entities** have property in Wikidata:

In [9]:
!kgtk query -i $RESULTS \
            --match '(p)-[]->()' \
            --return 'count(distinct p) as N'

N
16088


### Find unknown results in Wikidata database:
- Find all entities

In [10]:
!kgtk query -i $P31 -i $P279STAR \
            --match 'P31: (entity)-[]->(class), P279star: (class)-[]->(:Q783794)' \
            --return 'entity as node1, "P31" as label, "Q783794" as node2' \
            -o $ENTITY

- Eliminate entities who have property / properties

In [11]:
!kgtk ifnotexists -i $ENTITY \
                  --filter-on $RESULTS \
                  --input-keys node1 \
                  --filter-keys node1 \
                  -o $QUERY_FILE

### Count unknown results in Wikidata database:

In [12]:
!kgtk query -i $QUERY_FILE \
            --match '(p)-[]->()' \
            --return 'count(distinct p) as N'

N
265773


# Step 3 Selection of Additional KG(s)
- DBpedia
- Getty
- Freebase
- Company KGs

# Step 4 Schema Alignment
## Entity resolution

Use query results from Wikidata database to infer properties in Wikidata infobox and return the most frequent property.

In [13]:
!kgtk query -i $RESULTS -i $WIKI_INFO \
            --match 'c: (entity)-[]->(v), w: (entity)-[p]->(v)' \
            --return 'entity, p.label, v as node2' \
            -o $DIRECT_INFER

Infer from structured literals (since there is no string structured value in Wikidata infobox).

In [14]:
# Indirect inference: for every (entity, v1) edge in $RESULTS, look up the label of v1
# in $LABELS, then follow the infobox edge (entity)-[p]->(s) and its outgoing
# `dbpedia:structured_value` edge (s)-[sv]->(v2). The edge is kept when the text of the
# label of v1 equals the text of v2; the infobox property p.label and v2 are written
# to $INDIRECT_INFER.
!kgtk query -i $RESULTS -i $WIKI_INFO -i $LABELS \
            --match 'c: (entity)-[]->(v1), l: (v1)-[]->(v1_label), w: (entity)-[p]->(s)-[sv]->(v2)' \
            --where 'sv.label = "dbpedia:structured_value" AND kgtk_lqstring_text(v1_label) = kgtk_lqstring_text(v2)' \
            --return 'entity, p.label, v2 as node2' \
            -o $INDIRECT_INFER

In [15]:
!kgtk cat -i $DIRECT_INFER $INDIRECT_INFER -o $INFERS

## Property mapping

In [16]:
!kgtk query -i $INFERS \
            --match '(q)-[p]->(v)' \
            --return 'p.label, count(v) as N' \
            --order-by 'N desc' \
            --limit 1

label	N
property:industry	4608


# Step 5 Results from other KG(s)

For those entities that do not have a property value in Wikidata, query the Wikidata infobox:

In [17]:
!kgtk query -i $QUERY_FILE -i $WIKI_INFO \
            --match 'p: (entity)-[]->(), w: (entity)-[property]->(value)' \
            --where 'property.label = "property:industry"' \
            --return 'entity, property.label, value' \
            -o $NEW_RESULTS

- Count rows of new findings:

In [18]:
new_results_line = !wc -l < $NEW_RESULTS
new_results_line = int(new_results_line[0]) - 1
new_results_line

4257

- Count unique companies of new findings:

In [19]:
new_results_distinct = !kgtk query \
    -i $NEW_RESULTS \
    --match 'n: (p)-[]->()' \
    --return 'count(distinct p) as N'
new_results_distinct = int(new_results_distinct[1])
new_results_distinct

3493

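Agreement: first collect the infobox `property:industry` values for all companies, then count the companies whose infobox value also appears in the Wikidata results: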

In [20]:
!kgtk query -i $ENTITY $WIKI_INFO \
            --match 'c: (entity)-[]->(), w: (entity)-[property]->(value)' \
            --where 'property.label = "property:industry"' \
            --return 'entity, property.label, value' \
            -o $INFOBOX_RESULTS

In [21]:
!kgtk query -i $INFOBOX_RESULTS $RESULTS \
    --match 'n: (q)-[]->(v), r: (q)-[]->(v)' \
    --return 'count(distinct q) as N'

N
4281


# Step 6 Datatype Filtering

### 1. Numbers:

In [22]:
!kgtk query -i $NEW_RESULTS \
            --match 'n: (p)-[]->(s)' \
            --where 'kgtk_number(s)' \
            -o $NUMBERS

In [23]:
!head $NUMBERS | column -ts $'\t'

node1      label              node2
Q25111888  property:industry  3
Q5512171   property:industry  5


In [24]:
!kgtk query -i $NUMBERS \
    --match 'n: (p)-[]->()' \
    --return 'count(distinct p) as N'

N
2


### 2. Strings:

In [25]:
!kgtk query -i $NEW_RESULTS \
    --match 'n: (p)-[]->(s)' \
    --where 'kgtk_lqstring(s)' \
    -o $STRINGS

In [26]:
!head $STRINGS | column -ts $'\t'

node1      label              node2
Q1024012   property:industry  'Aerospace components'@en
Q1024012   property:industry  'Telecommunications'@en
Q10261135  property:industry  'Publishing house'@en
Q10263702  property:industry  'Motorcycles and automobiles manufacturing and sales'@en
Q10304618  property:industry  'Real estate and recurring income'@en
Q10310717  property:industry  'Film'@en
Q10342900  property:industry  'Automotive'@en
Q10344803  property:industry  'Grupo PlayArte'@en
Q1046186   property:industry  'Banking, Financial services'@en


In [60]:
!wc -l $STRINGS

2390 /nas/home/bohuizha/KG/hunger-for-knowledge/output/P452/strings.tsv


In [27]:
!kgtk query -i $STRINGS \
    --match 'n: (p)-[]->()' \
    --return 'count(distinct p) as N'

N
2105


### 3. Filter Structured literals:

In [28]:
# Keep the edges of $NEW_RESULTS whose value s is neither a language-qualified string
# nor a number, and for which the infobox file has an outgoing
# `dbpedia:structured_value` edge from s. The original (q, p.label, s) edge is written
# to $STRUCTURED_LITERALS; the structured value v itself is not returned.
!kgtk query -i $NEW_RESULTS -i $WIKI_INFO \
            --match 'n: (q)-[p]->(s), w: (s)-[sv]->(v)' \
            --where 'NOT kgtk_lqstring(s) AND NOT kgtk_number(s) AND sv.label = "dbpedia:structured_value"' \
            --return 'q, p.label, s' \
            -o $STRUCTURED_LITERALS

In [29]:
!head $STRUCTURED_LITERALS | column -ts $'\t'

node1      label              node2
Q25348727  property:industry  nodemxZbyK2VRrGoaxfdLmyLxw-4089958


### 4. Filter Qnodes

In [30]:
# Step 1: keep the distinct edges of $NEW_RESULTS whose value is neither a
# language-qualified string nor a number, and write them to $NODES.
!kgtk query -i $NEW_RESULTS \
            --match 'n: (q)-[p]->(v)' \
            --where 'NOT kgtk_lqstring(v) AND NOT kgtk_number(v)' \
            --return 'distinct q, p.label, v' \
            -o $NODES

# Step 2: drop from $NODES the edges that also occur in $STRUCTURED_LITERALS; what
# remains is written to $QNODES.
# NOTE(review): no --input-keys/--filter-keys are given, so the comparison relies on
# kgtk's default key columns — confirm these are the intended ones.
!kgtk ifnotexists -i $NODES \
                  --filter-on $STRUCTURED_LITERALS \
                  -o $QNODES

- Check if the data type is useful:

In [31]:
!head $QNODES | column -ts $'\t'

node1      label              node2
Q1023161   property:industry  Q269415
Q1024380   property:industry  Q5501371
Q1044059   property:industry  Q815825
Q1044059   property:industry  Q49389
Q1060363   property:industry  Q899383
Q1060363   property:industry  Q831882
Q1060363   property:industry  Q1020768
Q1073564   property:industry  Q778575
Q10831597  property:industry  Q778575


In [63]:
kgtk("""
    query -i $QNODES
        --match '()-[]->()'
    / add-labels    
    """)

Unnamed: 0,node1,label,node2,node1;label,node2;label
0,Q1023161,property:industry,Q269415,'CD Baby'@en,'digital distribution'@en
1,Q1024380,property:industry,Q5501371,'Canada Steamship Lines'@en,'freight company'@en
2,Q1044059,property:industry,Q815825,'Carnival Corporation & plc'@en,'hospitality'@en
3,Q1044059,property:industry,Q49389,'Carnival Corporation & plc'@en,'tourism'@en
4,Q1060363,property:industry,Q899383,'Terrestrial Trunked Radio'@en,'European Telecommunications Standards Institu...
...,...,...,...,...,...
1802,Q906522,property:industry,Q291,'Reality Kings'@en,'pornography'@en
1803,Q910541,property:industry,Q2342494,'Topps'@en,'collectible'@en
1804,Q910541,property:industry,Q5200157,'Topps'@en,
1805,Q93856,property:industry,Q187916,'Jockey Club'@en,'horse racing'@en


In [32]:
!kgtk query -i $QNODES \
        --match 'q: (n)-[p]->(v)' \
        --where 'n = "Q318672"'

node1	label	node2
Q318672	property:industry	Q178512


In [33]:
!wc -l < $QNODES

1808


In [34]:
!kgtk query -i $QNODES \
    --match 'n: (p)-[]->()' \
    --return 'count(distinct p) as N'

N
1463


# Step 7 Quality Checking

Problem: no `nodeProp.id`;

In [35]:
# First pass of the type check: keep an edge of $QNODES when its value (node2) has an
# edge in the $P31 file to some class nodex, and nodex has an edge in the $P279STAR
# file to one of the five Q-nodes listed in --where.
# NOTE(review): the five Q-ids are presumably the classes accepted as values of the
# industry property — confirm against the property's constraints.
!kgtk query -i $QNODES $P31 $P279STAR \
--match 'q: (node1)-[nodeProp]->(node2), P31: (node2)-[]->(nodex), P279star: (nodex)-[]->(par)' \
--where 'par in ["Q8148", "Q268592", "Q8187769", "Q3958441", "Q121359"] ' \
--return 'distinct node1 as `node1`, nodeProp.label as `label`, node2 as `node2`' \
-o $CORRECT_TEMP_1

In [36]:
!head $CORRECT_TEMP_1 | column -ts $'\t'

node1      label              node2
Q16967954  property:industry  Q1004
Q29641882  property:industry  Q1061108
Q20708988  property:industry  Q1068473
Q4651269   property:industry  Q1068473
Q21036693  property:industry  Q11633
Q17103612  property:industry  Q11650
Q631792    property:industry  Q11650
Q17026016  property:industry  Q11661
Q5187684   property:industry  Q11661


In [37]:
!wc -l < $CORRECT_TEMP_1

294


In [38]:
# Collect the edges of $QNODES that did not pass the first type check: an edge is kept
# when its (node1, node2) pair does not occur in $CORRECT_TEMP_1. The result is the
# input of the second type check.
!kgtk ifnotexists -i $QNODES \
    --filter-on $CORRECT_TEMP_1 \
    --input-keys node1 node2 \
    --filter-keys node1 node2 \
    -o $INCORRECT_TEMP

In [39]:
!head $INCORRECT_TEMP | column -ts $'\t'

node1      label              node2
Q1023161   property:industry  Q269415
Q1024380   property:industry  Q5501371
Q1044059   property:industry  Q815825
Q1060363   property:industry  Q899383
Q1060363   property:industry  Q831882
Q1060363   property:industry  Q1020768
Q1073564   property:industry  Q778575
Q10831597  property:industry  Q778575
Q1085796   property:industry  Q291


In [40]:
!wc -l < $INCORRECT_TEMP

1515


In [41]:
# Second pass of the type check, on the edges rejected by the first pass: keep an edge
# when its value (node2) itself has an edge in the $P279STAR file to one of the same
# five Q-nodes, i.e. without going through the $P31 file first.
!kgtk query -i $INCORRECT_TEMP $P279STAR \
    --match 'i: (node1)-[nodeProp]->(node2), P279star: (node2)-[]->(par)' \
    --where 'par in ["Q8148", "Q268592", "Q8187769", "Q3958441", "Q121359"] ' \
    --return 'distinct node1 as `node1`, nodeProp.label as `label`, node2 as `node2`' \
    -o $CORRECT_TEMP_2

In [42]:
!head $CORRECT_TEMP_2 | column -ts $'\t'

node1      label              node2
Q17053415  property:industry  Q121359
Q1284698   property:industry  Q1740966
Q13512414  property:industry  Q3220391
Q21189701  property:industry  Q3220391
Q4831131   property:industry  Q3220391
Q30644769  property:industry  Q483639
Q5458947   property:industry  Q483639
Q6999952   property:industry  Q483639
Q12591271  property:industry  Q5503


In [43]:
!wc -l < $CORRECT_TEMP_2

220


In [44]:
!kgtk ifnotexists -i $INCORRECT_TEMP \
    --filter-on $CORRECT_TEMP_2 \
    --input-keys node1 node2 \
    --filter-keys node1 node2 \
    -o $QNODES_INCORRECT

In [45]:
!head $QNODES_INCORRECT | column -ts $'\t'

node1      label              node2
Q1024380   property:industry  Q5501371
Q1044059   property:industry  Q815825
Q1060363   property:industry  Q899383
Q1060363   property:industry  Q831882
Q1060363   property:industry  Q1020768
Q1073564   property:industry  Q778575
Q10831597  property:industry  Q778575
Q1085796   property:industry  Q291
Q10878258  property:industry  Q769493


In [46]:
!wc -l < $QNODES_INCORRECT

1296


In [47]:
# Concatenate the edges accepted by the two type-check passes into $QNODES_CORRECT.
!kgtk cat -i $CORRECT_TEMP_1 $CORRECT_TEMP_2 -o $QNODES_CORRECT

In [48]:
!head $QNODES_CORRECT | column -ts $'\t'

node1      label              node2
Q16967954  property:industry  Q1004
Q29641882  property:industry  Q1061108
Q20708988  property:industry  Q1068473
Q4651269   property:industry  Q1068473
Q21036693  property:industry  Q11633
Q17103612  property:industry  Q11650
Q631792    property:industry  Q11650
Q17026016  property:industry  Q11661
Q5187684   property:industry  Q11661


In [49]:
!kgtk query -i $QNODES_CORRECT \
    --match '(q)-[]->()' \
    --return 'count(distinct q) as N'

N
435


In [64]:
kgtk("""
    query -i $QNODES_CORRECT
        --match '(q)-[p]->(v)'
        --return 'q, p.label, v' 
        --limit 100
    / add-labels    
    """)

Unnamed: 0,node1,label,node2,node1;label,node2;label
0,Q16967954,property:industry,Q1004,'WOWIO'@en,'comics'@en
1,Q29641882,property:industry,Q1061108,'Kansas City Board of Public Utilities'@en,'water supply'@en
2,Q20708988,property:industry,Q1068473,'GreenCE'@en,'educational technology'@en
3,Q4651269,property:industry,Q1068473,'AEC Daily'@en,'educational technology'@en
4,Q21036693,property:industry,Q11633,'Looksery'@en,'photography'@en
...,...,...,...,...,...
95,Q16959592,property:industry,Q289,'LA Productions'@en,'television'@en
96,Q17150122,property:industry,Q289,'3AD'@en,'television'@en
97,Q17300077,property:industry,Q289,'Temple Hill Entertainment'@en,'television'@en
98,Q17335612,property:industry,Q289,'Vinyl Films'@en,'television'@en


In [50]:
correct_lines = !wc -l < $QNODES_CORRECT
correct_lines = int(correct_lines[0]) - 1
correct_lines

512

In [51]:
incorrect_lines = !wc -l < $QNODES_INCORRECT
incorrect_lines = int(incorrect_lines[0]) - 1
incorrect_lines

1295

In [52]:
qnode_lines = !wc -l < $QNODES
qnode_lines = int(qnode_lines[0]) - 1
qnode_lines

1807

In [53]:
assert correct_lines + incorrect_lines == qnode_lines, "The sum is not correct!"

In [54]:
!kgtk cat -i $NUMBERS $STRINGS $STRUCTURED_LITERALS $QNODES_INCORRECT -o $INCORRECT

In [55]:
!wc -l $INCORRECT

3688 /nas/home/bohuizha/KG/hunger-for-knowledge/output/P452/incorrect.tsv


In [56]:
# Count the distinct entities (node1) that have at least one edge in $INCORRECT.
# The `as N` alias gives the result column a readable header, as in the other count cells.
!kgtk query -i $INCORRECT \
    --match '(q)-[]->()' \
    --return 'count(distinct q) as N'

count(DISTINCT graph_7205_c1."node1")
3224
