# Step 0 Set up `kgtk`
Check in `spouse_of_politician.ipynb`

# Step 1 Send a SPARQL query use `kgtk`

Example 1: Find founding years of universities: 

In [1]:
import os

### Define alias and variables

In [15]:
# Parameters

# Folder where database files store
data_path = "/nas/home/bohuizha/KG/hunger-for-knowledge/data/"

# Folder on local machine where to create the output and temporary folders
output_path = "/nas/home/bohuizha/KG/hunger-for-knowledge/output/"

# Location of the cache database for kypher
cache_path = "/nas/home/bohuizha/KG/hunger-for-knowledge/kypher"
# Whether to delete the cache database
delete_database = False

# The names of files in the KGTK Wikidata distirbution that we will use in this notebook.
data_file_names = {
    "claims": "claims.tsv",
    "item": "claims.wikibase-item.tsv.gz",
    "wiki_info": "wikidata_infobox.tsv",
    "p31": "P31.tsv",
    "p279star": "P279star.tsv"
}

# We will define environment variables to hold the full paths to the files as we will use them in the shell commands
kgtk_environment_variables = []

os.environ['DATABASE'] = data_path
kgtk_environment_variables.append('DATABASE')

for key, value in data_file_names.items():
    variable = key.upper()
    os.environ[variable] = data_path + value
    kgtk_environment_variables.append(variable)

output_file_names = {
    "results": "founding_year_of_university.tsv",
    "new_results": "new_founding_year_of_university.tsv",
    "university": "university.tsv",
    "query_file": "university_wo_founding_year.tsv",
    "numbers": "numbers.tsv",
    "strings": "strings.tsv",
    "empty_strings": "empty_strings.tsv",
    "pure_empty": "pure_empty.tsv",
    "structured_literals": "structured_literals.tsv",
    "nodes": "nodes.tsv",
    "qnodes": "qnodes.tsv"
}

os.environ['OUTPUT'] = output_path
kgtk_environment_variables.append('OUTPUT')

for key, value in output_file_names.items():
    variable = key.upper()
    os.environ[variable] = output_path + value
    kgtk_environment_variables.append(variable)

# KGTK creates a SQLite database to index the knowledge graph.
if cache_path:
    os.environ['STORE'] = "{}/wikidata.sqlite3.db".format(cache_path)
else:
    os.environ['STORE'] = "{}/{}/wikidata.sqlite3.db".format(output_path, temp_folder)
kgtk_environment_variables.append('STORE')
    
# Envronment variables with shortcuts to the commands we use often
# os.environ['kgtk'] = "kgtk"
# Use for debugging, but careful as it causes import to dataframes to break
# os.environ['kypher'] = "time kgtk --debug query --graph-cache " + os.environ['STORE']
os.environ['kypher'] = "kgtk query --graph-cache " + os.environ['STORE']
# kgtk_environment_variables.append('kgtk')
kgtk_environment_variables.append('kypher')
    
# kgtk_environment_variables.sort()
for variable in kgtk_environment_variables:
    print("{}: \"{}\"".format(variable, os.environ[variable]))

DATABASE: "/nas/home/bohuizha/KG/hunger-for-knowledge/data/"
CLAIMS: "/nas/home/bohuizha/KG/hunger-for-knowledge/data/claims.tsv"
ITEM: "/nas/home/bohuizha/KG/hunger-for-knowledge/data/claims.wikibase-item.tsv.gz"
WIKI_INFO: "/nas/home/bohuizha/KG/hunger-for-knowledge/data/wikidata_infobox.tsv"
P31: "/nas/home/bohuizha/KG/hunger-for-knowledge/data/P31.tsv"
P279STAR: "/nas/home/bohuizha/KG/hunger-for-knowledge/data/P279star.tsv"
OUTPUT: "/nas/home/bohuizha/KG/hunger-for-knowledge/output/"
RESULTS: "/nas/home/bohuizha/KG/hunger-for-knowledge/output/founding_year_of_university.tsv"
NEW_RESULTS: "/nas/home/bohuizha/KG/hunger-for-knowledge/output/new_founding_year_of_university.tsv"
UNIVERSITY: "/nas/home/bohuizha/KG/hunger-for-knowledge/output/university.tsv"
QUERY_FILE: "/nas/home/bohuizha/KG/hunger-for-knowledge/output/university_wo_founding_year.tsv"
NUMBERS: "/nas/home/bohuizha/KG/hunger-for-knowledge/output/numbers.tsv"
STRINGS: "/nas/home/bohuizha/KG/hunger-for-knowledge/output/strin

In [13]:
# !$kypher \
# -i "$ITEM" --as items \
# -i "$WIKI_INFO" --as wiki \
# -i "$P31" --as p31 \
# -i "$P279STAR" --as p279star \
# --limit 10

id	node1	label	node2
P10-P1629-Q34508-bcc39400-0	P10	P1629	Q34508
P10-P1855-Q15075950-7eff6d65-0	P10	P1855	Q15075950
P10-P1855-Q69063653-c8cdb04c-0	P10	P1855	Q69063653
P10-P1855-Q7378-555592a4-0	P10	P1855	Q7378
P10-P2302-Q21502404-d012aef4-0	P10	P2302	Q21502404
P10-P2302-Q21510851-5224fe0b-0	P10	P2302	Q21510851
P10-P2302-Q21510852-dde2f0ce-0	P10	P2302	Q21510852
P10-P31-Q18610173-85ef4d24-0	P10	P31	Q18610173
P1000-P1629-Q1241356-d5c10f50-0	P1000	P1629	Q1241356
P1000-P1855-Q1742-2566356a-0	P1000	P1855	Q1742


### Main `kgtk` query:

In [1]:
# SPARQL query: 
# SELECT DISTINCT ?universityLabel (YEAR(?inception) AS ?foundingYear) 
# WHERE 
# { 
#   ?university wdt:P31/wdt:P279* wd:Q3918 ; 
#               wdt:P571 ?inception . 
#   SERVICE wikibase:label { bd:serviceParam wikibase:language "en". } 
# } 
# where `P31` means "instance of" and `P279` means "subclass of";

In [None]:
# all universities (show 10 of them)
!$kypher -i p31 -i p279star \
--match 'p31: (university)-[]->(class), p279star: (class)-[]->(:Q3918)' \
--return 'distinct university' \
--limit 10

**Type 1:** return full date information:

In [None]:
!kgtk query -i $CLAIMS -i $P31 -i $P279STAR \
--match 'P31: (university)-[]->(class), P279star: (class)-[]->(:Q3918), claims: (university)-[p:P571]->(d)' \
--return 'university, p.label, d' \
-o $RESULTS

**Type 2:** return only `kgtk_date_date`:

In [None]:
!kgtk query -i $CLAIMS -i $P31 -i $P279STAR \
--match 'P31: (university)-[]->(class), P279star: (class)-[]->(:Q3918), claims: (university)-[p:P571]->(d)' \
--return 'university, p.label, kgtk_date_date(d)'

**Type 3:** return only `kgtk_date_year`:

In [6]:
!kgtk query -i $CLAIMS -i $P31 -i $P279STAR \
--match 'P31: (university)-[]->(class), P279star: (class)-[]->(:Q3918), claims: (university)-[p:P571]->(d)' \
--return 'university, p.label, kgtk_date_year(d)'

For the convinient of property inference, we use **type 2**.

### Count known results in Wikidata database:

Count university-founding_year pairs / **rows** (result should -1 which is the header):

In [32]:
!wc -l $RESULTS

14150 /nas/home/bohuizha/KG/hunger-for-knowledge/output/founding_year_of_university.tsv


Count how many **unique politicians** have spouse in Wikidata:

In [33]:
!kgtk query -i $RESULTS \
--match '(p)-[]->()' \
--return 'count(distinct p) as N'

N
13385


Duplicates:

In [None]:
!kgtk query -i $RESULTS \
--match '(p)-[]->(s)' \
--return 'p, count(s) as N' \
--order-by 'N desc'

### Find unknow results in Wikidata database:
- Find all universities (already completed)

In [35]:
!kgtk query -i $P31 -i $P279STAR \
--match 'P31: (university)-[]->(class), P279star: (class)-[]->(:Q3918)' \
--return 'university as node1, "P31/P279*" as label, "Q3918" as node2' \
-o $UNIVERSITY

- Eliminate universities which have the founding year

In [36]:
!kgtk ifnotexists -i $UNIVERSITY \
                  --filter-on $RESULTS \
                  --input-keys node1 \
                  --filter-keys node1 \
                  -o $QUERY_FILE

In [37]:
!wc -l $QUERY_FILE

2328 /nas/home/bohuizha/KG/hunger-for-knowledge/output/university_wo_founding_year.tsv


### Count unknown results in Wikidata database:

In [38]:
!kgtk query -i $QUERY_FILE \
            --match '(p)-[]->()' \
            --return 'count(distinct p) as N'

N
2317


# Step 2 Infer properties

Use query results from Wikidata database to infer properties in Wikidata infobox and return the most frequent property.

In [39]:
!kgtk query -i $RESULTS -i $WIKI_INFO \
            --match 'f: (q)-[]->(y), w: (q)-[p]->(v)' \
            --where 'kgtk_date_year(y) = "^" + kgtk_unstringify(v)' \
            --return 'p.label, count(v) as N' \
            --order-by 'N desc' \
            --limit 1

label	N
property:established	5419


# Step 3 Run query in Wikidata infobox 

For those don't have founding year, query in Wikidata infobox:

In [40]:
!kgtk query -i $QUERY_FILE -i $WIKI_INFO \
            --match 'u: (q)-[]->(), w: (q)-[p]->(v)' \
            --where 'p.label = "property:established"' \
            --return 'distinct q, p.label, v' \
            -o $NEW_RESULTS

One can directly output results:

- Count rows of new findings:

In [41]:
!wc -l $NEW_RESULTS

54 /nas/home/bohuizha/KG/hunger-for-knowledge/output/new_founding_year_of_university.tsv


- Count unique politicians of new findings:

In [42]:
!kgtk query -i $NEW_RESULTS \
            --match 'n: (p)-[]->()' \
            --return 'count(distinct p) as N'

N
50


# Step 4 Datatype distribution of new findings

Collect lines' number and distinct entities' number:

In [3]:
lines = ["Lines"]
entities = ["Entities"]

### 1. Numbers:

- Rows:

In [4]:
!kgtk query -i $NEW_RESULTS \
            --match 'n: (p)-[]->(s)' \
            --where 'kgtk_number(s)' \
            -o $NUMBERS

numbers_line = !wc -l < $NUMBERS
numbers_line = int(numbers_line[0]) - 1
lines.append(numbers_line)
# numbers_line

18

- Unique universities:

In [5]:
numbers_distinct = !kgtk query -i $NUMBERS \
    --match '(p)-[]->()' \
    --return 'count(distinct p) as N'

numbers_distinct = int(numbers_distinct[1])
entities.append(numbers_distinct)
# numbers_distinct

18

- Check if the data type is useful:

In [6]:
!head $NUMBERS | column -ts $'\t'

node1      label                 node2
Q15402055  property:established  1968
Q16884982  property:established  2004
Q16939054  property:established  2007
Q18844652  property:established  1996
Q2974561   property:established  1985
Q3445987   property:established  1880
Q4783794   property:established  1988
Q4783795   property:established  1987
Q5508786   property:established  2009


Useful!

- Duplicates (universities that have multiple founding years):

In [None]:
!kgtk query -i $NUMBERS \
            --match '(p)-[]->(s)' \
            --return 'p, count(s) as N' \
            --order-by 'N desc'

### 2. Structured literals:

- Rows:

In [7]:
!kgtk query -i $NEW_RESULTS -i $WIKI_INFO \
            --match 'n: (q)-[p]->(s), w: (s)-[sv]->(v)' \
            --where 'NOT kgtk_lqstring(s) AND NOT kgtk_number(s) AND sv.label = "dbpedia:structured_value" AND "^" + kgtk_unstringify(v) != 0' \
            --return 'q, p.label, "^" + kgtk_unstringify(v) as node2' \
            -o $STRUCTURED_LITERALS

structured_literals_line = !wc -l < $STRUCTURED_LITERALS
structured_literals_line = int(structured_literals_line[0]) - 1
lines.append(structured_literals_line)
# structured_literals_line

25

- Unique universities:

In [8]:
structured_literals_distinct = !kgtk query -i $STRUCTURED_LITERALS \
    --match '(p)-[]->()' \
    --return 'count(distinct p) as N'

structured_literals_distinct = int(structured_literals_distinct[1])
entities.append(structured_literals_distinct)
# structured_literals_distinct

25

- Check if the data type is useful:

In [9]:
!head $STRUCTURED_LITERALS | column -ts $'\t'

node1      label                 node2
Q12002144  property:established  1956
Q14432001  property:established  1785
Q17006626  property:established  2013
Q17058724  property:established  1917
Q18354913  property:established  1979
Q19599762  property:established  2006
Q26910836  property:established  2009
Q28172529  property:established  2015
Q4736101   property:established  1985


- Duplicates:

In [11]:
!kgtk query -i $STRUCTURED_LITERALS \
            --match '(p)-[]->(s)' \
            --return 'p, count(s) as N' \
            --order-by 'N desc' \
            --limit 5

node1	N
Q8053355	1
Q7899980	1
Q7688746	1
Q7553485	1
Q7402054	1


### 3. Strings:

#### All:

- Rows:

In [12]:
!kgtk query -i $NEW_RESULTS \
            --match 'n: (p)-[]->(s)' \
            --where 'kgtk_lqstring(s)' \
            -o $STRINGS

strings_line = !wc -l < $STRINGS
strings_line = int(strings_line[0]) - 1
lines.append(strings_line)
# strings_line

7

- Unique universities:

In [13]:
strings_distinct = !kgtk query -i $STRINGS \
    --match '(p)-[]->()' \
    --return 'count(distinct p) as N'

strings_distinct = int(strings_distinct[1])
entities.append(strings_distinct)
# strings_distinct

7

- Check if the data type is useful:

In [14]:
!head $STRINGS | column -ts $'\t'

node1      label                 node2
Q29015031  property:established  'Announced'@en
Q3550203   property:established  ''@en
Q3579379   property:established  '*'@en
Q4997932   property:established  ''@en
Q5261745   property:established  'in Smyrna, Asia Minor'@en
Q7895186   property:established  'TBD'@en
Q7896577   property:established  'Future'@en


**Not** useful!

- Duplicates:

In [None]:
!kgtk query -i $STRINGS \
            --match '(p)-[]->(s)' \
            --return 'p, count(s) as N' \
            --order-by 'N desc'

### 4. Qnodes

- Rows:

In [17]:
!kgtk query -i $NEW_RESULTS \
            --match 'n:()-[]->(q)' \
            --where 'NOT kgtk_lqstring(q) AND NOT kgtk_number(q)' \
            -o $NODES

!kgtk ifnotexists -i $NODES \
                  --filter-on $STRUCTURED_LITERALS \
                  --input-keys node1 \
                  --filter-keys node1 \
                  -o $QNODES 

qnodes_line = !wc -l < $QNODES
qnodes_line = int(qnodes_line[0]) - 1
lines.append(qnodes_line)
# qnodes_line

3

- Unique universities:

In [19]:
qnodes_distinct = !kgtk query -i $QNODES \
    --match '(p)-[]->()' \
    --return 'count(distinct p) as N'

qnodes_distinct = int(qnodes_distinct[1])
entities.append(qnodes_distinct)
# qnodes_distinct

2

- Check if the data type is useful:

In [27]:
!head $QNODES | column -ts $'\t'

node1      label                 node2
Q3550203   property:established  nodemxZbyK2VRrGoaxfdLmyLxw-7539726
Q3550203   property:established  nodemxZbyK2VRrGoaxfdLmyLxw-7539727
Q55391746  property:established  nodemxZbyK2VRrGoaxfdLmyLxw-5561996


In [29]:
!kgtk query -i $WIKI_INFO \
            --match '(q)-[p]->(v)' \
            --where 'q = "nodemxZbyK2VRrGoaxfdLmyLxw-7539726"' \
            --return 'q, p.label, v'

node1	label	node2
nodemxZbyK2VRrGoaxfdLmyLxw-7539726	dbpedia:structured_value	"--02-20"
nodemxZbyK2VRrGoaxfdLmyLxw-7539726	dbpedia:structured_uri	xml-schema-type:gMonthDay


Actually these are the filtered structured literals, so **not** useful!

- Duplicates:

In [None]:
!kgtk query -i $QNODES \
            --match '(p)-[]->(s)' \
            --return 'p, count(s) as N'

### 5. Summary

In [20]:
from prettytable import PrettyTable

In [26]:
x = PrettyTable()

x.field_names = ["Data type", "Numbers", "Strings", "Structured literals", "Qnodes"]

x.add_row(lines)
x.add_row(entities)

print(x)

+-----------+---------+---------+---------------------+--------+
| Data type | Numbers | Strings | Structured literals | Qnodes |
+-----------+---------+---------+---------------------+--------+
|   Lines   |    18   |    25   |          7          |   3    |
|  Entities |    18   |    25   |          7          |   2    |
+-----------+---------+---------+---------------------+--------+
