# Step 0 Set up `kgtk`
Use GitHub branch `dev`:

In [None]:
%%bash
git clone -b dev https://github.com/usc-isi-i2/kgtk.git
cd kgtk
python setup.py install

# Step 1 Send a SPARQL query use `kgtk`

Example 1: Find spouse(s) of politicians:

In [1]:
import os

### Define alias and variables

In [2]:
# Parameters

# Folder where database files store
data_path = "/nas/home/bohuizha/KG/hunger-for-knowledge/data/"

# Folder on local machine where to create the output and temporary folders
output_path = "/nas/home/bohuizha/KG/hunger-for-knowledge/output/P26/"
if not os.path.exists(output_path):
    os.mkdir(output_path)

# Location of the cache database for kypher
cache_path = "/nas/home/bohuizha/KG/hunger-for-knowledge/kypher"
# Whether to delete the cache database
delete_database = False

# The names of files in the KGTK Wikidata distirbution that we will use in this notebook.
data_file_names = {
    "claims": "claims.tsv",
    "wiki_info": "wikidata_infobox.tsv",
    "p31": "P31.tsv",
    "p279star": "P279star.tsv"
}

# We will define environment variables to hold the full paths to the files as we will use them in the shell commands
kgtk_environment_variables = []

os.environ['DATABASE'] = data_path
kgtk_environment_variables.append('DATABASE')

for key, value in data_file_names.items():
    variable = key.upper()
    os.environ[variable] = data_path + value
    kgtk_environment_variables.append(variable)

output_file_names = {
    "results": "spouse_of_politician.tsv",
    "new_results": "new_spouse_of_politician.tsv",
    "infobox_results": "all_spouse_of_politician.tsv",
    "politician": "politician.tsv",
    "query_file": "politician_wo_spouse.tsv",
    "numbers": "numbers.tsv",
    "strings": "strings.tsv",
    "empty_strings": "empty_strings.tsv",
    "pure_empty": "pure_empty.tsv",
    "non_empty": "non_empty_strings.tsv",
    "structured_literals": "structured_literals.tsv",
    "nodes": "nodes.tsv",
    "qnodes": "qnodes.tsv",
    "correct": "correct.tsv",
    "incorrect": "incorrect.tsv"
}

os.environ['OUTPUT'] = output_path
kgtk_environment_variables.append('OUTPUT')

for key, value in output_file_names.items():
    variable = key.upper()
    os.environ[variable] = output_path + value
    kgtk_environment_variables.append(variable)

# KGTK creates a SQLite database to index the knowledge graph.
if cache_path:
    os.environ['STORE'] = "{}/wikidata.sqlite3.db".format(cache_path)
else:
    os.environ['STORE'] = "{}/{}/wikidata.sqlite3.db".format(output_path, temp_folder)
kgtk_environment_variables.append('STORE')
    
# Envronment variables with shortcuts to the commands we use often
# os.environ['kgtk'] = kgtk
# Use for debugging, but careful as it causes import to dataframes to break
# os.environ['kypher'] = "time kgtk --debug query --graph-cache " + os.environ['STORE']
os.environ['kypher'] = "kgtk query --graph-cache " + os.environ['STORE']
# kgtk_environment_variables.append('kgtk')
kgtk_environment_variables.append('kypher')
    
# kgtk_environment_variables.sort()
for variable in kgtk_environment_variables:
    print("{}: \"{}\"".format(variable, os.environ[variable]))

DATABASE: "/nas/home/bohuizha/KG/hunger-for-knowledge/data/"
CLAIMS: "/nas/home/bohuizha/KG/hunger-for-knowledge/data/claims.tsv"
WIKI_INFO: "/nas/home/bohuizha/KG/hunger-for-knowledge/data/wikidata_infobox.tsv"
P31: "/nas/home/bohuizha/KG/hunger-for-knowledge/data/P31.tsv"
P279STAR: "/nas/home/bohuizha/KG/hunger-for-knowledge/data/P279star.tsv"
OUTPUT: "/nas/home/bohuizha/KG/hunger-for-knowledge/output/P26/"
RESULTS: "/nas/home/bohuizha/KG/hunger-for-knowledge/output/P26/spouse_of_politician.tsv"
NEW_RESULTS: "/nas/home/bohuizha/KG/hunger-for-knowledge/output/P26/new_spouse_of_politician.tsv"
INFOBOX_RESULTS: "/nas/home/bohuizha/KG/hunger-for-knowledge/output/P26/all_spouse_of_politician.tsv"
POLITICIAN: "/nas/home/bohuizha/KG/hunger-for-knowledge/output/P26/politician.tsv"
QUERY_FILE: "/nas/home/bohuizha/KG/hunger-for-knowledge/output/P26/politician_wo_spouse.tsv"
NUMBERS: "/nas/home/bohuizha/KG/hunger-for-knowledge/output/P26/numbers.tsv"
STRINGS: "/nas/home/bohuizha/KG/hunger-for-k

### Main `kgtk` query:

In [1]:
# SPARQL query: 
# SELECT ?politicianLabel ?spouseLabel 
# WHERE
# { 
#   ?politician wdt:P31  wd:Q5 ; 
#               wdt:P106 wd:Q82955 ; 
#               wdt:P26  ?spouse . 
#   SERVICE wikibase:label { bd:serviceParam wikibase:language "en". } 
# } 
# where `P106` means "occupation", `Q82955` means "politician", `P26` means "spouse";

In [8]:
!kgtk query -i $CLAIMS -i $P31 \
            --match 'P31: (politician)-[]->(:Q5), claims: (:Q82955)<-[:P106]-(politician)-[:P26]->(spouse)' \
            --return 'politician, "P26" as label, spouse' \
            -o $RESULTS

### Count known results in Wikidata database:

Count politician-spouse pairs / **rows** (result should -1 which is the header):

In [9]:
!wc -l $RESULTS

36849 /nas/home/bohuizha/KG/hunger-for-knowledge/output/P26/spouse_of_politician.tsv


In [3]:
!head $RESULTS

node1	label	node2
Q1000937	P26	Q1510302
Q1001	P26	Q264908
Q1001705	P26	Q1242788
Q1001936	P26	Q867859
Q100205	P26	Q75287835
Q1002523	P26	Q1063295
Q1004670	P26	Q1292572
Q1006241	P26	Q6779773
Q100637	P26	Q75238697


Count how many **unique politicians** have spouse in Wikidata:

In [10]:
!kgtk query -i $RESULTS \
            --match '(p)-[]->()' \
            --return 'count(distinct p) as N'

N
30281


### Find unknow results in Wikidata database:
- Find all politicians

In [11]:
!kgtk query -i $CLAIMS -i $P31 \
            --match 'P31: (politician)-[]->(:Q5), claims: (politician)-[:P106]->(:Q82955)' \
            --return 'politician as node1, "P106" as label, "Q82955" as node2' \
            -o $POLITICIAN

- Eliminate politicians who have spouse(s)

In [12]:
!kgtk ifnotexists -i $POLITICIAN \
                  --filter-on $RESULTS \
                  --input-keys node1 \
                  --filter-keys node1 \
                  -o $QUERY_FILE

### Count unknown results in Wikidata database:

In [13]:
!kgtk query -i $QUERY_FILE \
            --match '(p)-[]->()' \
            --return 'count(distinct p) as N'

N
593029


# Step 2 Infer properties

Use query results from Wikidata database to infer properties in Wikidata infobox and return the most frequent property.

In [11]:
!kgtk query -i $RESULTS -i $WIKI_INFO \
            --match 's: (politician)-[]->(v), w: (politician)-[p]->(v)' \
            --return 'p.label, count(v) as N' \
            --order-by 'N desc' \
            --limit 1

label	N
property:spouse	5688


# Step 3 Run query in Wikidata infobox 

For those don't have spouse, query in Wikidata infobox:

In [14]:
!kgtk query -i $QUERY_FILE -i $WIKI_INFO \
            --match 'p: (politician)-[]->(), w: (politician)-[property]->(spouse)' \
            --where 'property.label = "property:spouse"' \
            --return 'politician, property.label, spouse' \
            -o $NEW_RESULTS

- Count rows of new findings:

In [15]:
new_results_line = !wc -l < $NEW_RESULTS
new_results_line = int(new_results_line[0]) - 1
new_results_line

58733

- Count unique politicians of new findings:

In [16]:
new_results_distinct = !kgtk query \
    -i $NEW_RESULTS \
    --match 'n: (p)-[]->()' \
    --return 'count(distinct p) as N'
new_results_distinct = int(new_results_distinct[1])
new_results_distinct

39438

Agree:

In [17]:
!kgtk query -i $POLITICIAN -i $WIKI_INFO \
    --match 'p: (politician)-[]->(), w: (politician)-[property]->(spouse)' \
    --where 'property.label = "property:spouse"' \
    --return 'politician, property.label, spouse' \
    -o $INFOBOX_RESULTS

In [18]:
!kgtk query -i $INFOBOX_RESULTS $RESULTS \
    --match 'a: (q)-[]->(v), s: (q)-[]->(v)' \
    --return 'count(distinct q) as N'

N
5165


# Step 4 Datatype distribution of new findings

Collect lines' number and distinct entities' number:

In [19]:
lines = ["Lines"]
entities = ["Entities"]

### 1. Numbers:

- Rows:

In [20]:
!kgtk query -i $NEW_RESULTS \
            --match 'n: (p)-[]->(s)' \
            --where 'kgtk_number(s)' \
            -o $NUMBERS

numbers_line = !wc -l < $NUMBERS
numbers_line = int(numbers_line[0]) - 1
lines.append(numbers_line)
numbers_line

7334

- Unique politicians:

In [21]:
numbers_distinct = !kgtk query -i $NUMBERS \
    --match '(p)-[]->()' \
    --return 'count(distinct p) as N'
numbers_distinct = int(numbers_distinct[1])
entities.append(numbers_distinct)
numbers_distinct

4586

- Check if data type is useful:

In [22]:
!head $NUMBERS | column -ts $'\t'

node1      label            node2
Q1000051   property:spouse  1913
Q1000502   property:spouse  1947
Q100942    property:spouse  1907
Q1027026   property:spouse  1988
Q1027380   property:spouse  1831
Q1027380   property:spouse  1837
Q1027380   property:spouse  1842
Q1029113   property:spouse  1872
Q10292736  property:spouse  1970


**Not** useful!

- Duplicates (politicians that have multiple numbers):

In [23]:
!kgtk query -i $NUMBERS \
            --match '(p)-[]->(s)' \
            --return 'p, count(s) as N' \
            --order-by 'N desc'

node1	N
Q16062641	20
Q635520	13
Q560707	11
Q1456676	10
Q982197	8
Q1573501	7
Q1335209	7
Q8298365	6
Q7814782	6
Q735191	6
Q7101537	6
Q6551176	6
Q6039259	6
Q5644938	6
Q54859909	6
Q525363	6
Q466862	6
Q373212	6
Q3460826	6
Q344157	6
Q3441473	6
Q25849505	6
Q18808619	6
Q178649	6
Q16106114	6
Q152474	6
Q1508394	6
Q1337736	6
Q983981	5
Q973774	5
Q888599	5
Q888421	5
Q7351226	5
Q7172014	5
Q699288	5
Q6847287	5
Q6536935	5
Q6239169	5
Q6207815	5
Q5830827	5
Q5585987	5
Q5200662	5
Q470748	5
Q461101	5
Q456527	5
Q450615	5
Q444691	5
Q430443	5
Q39246	5
Q37585475	5
Q356475	5
Q335759	5
Q333820	5
Q291169	5
Q27832572	5
Q26436159	5
Q2577927	5
Q202276	5
Q1973315	5
Q1941315	5
Q1142551	5
Q11259760	5
Q966261	4
Q960737	4
Q955376	4
Q947791	4
Q947190	4
Q943046	4
Q922916	4
Q908749	4
Q889512	4
Q889250	4
Q889119	4
Q887335	4
Q885601	4
Q883491	4
Q882612	4
Q881381	4
Q880457	4
Q880154	4
Q878798	4
Q878708	4
Q869844	4
Q863703	4
Q817304	4
Q8015063	

### 2. Structured literals:

- Rows:

In [19]:
!kgtk query -i $NEW_RESULTS -i $WIKI_INFO \
            --match 'n: (q)-[p]->(s), w: (s)-[sv]->(v)' \
            --where 'NOT kgtk_lqstring(s) AND NOT kgtk_number(s) AND sv.label = "dbpedia:structured_value"' \
            --return 'q, p.label, v' \
            -o $STRUCTURED_LITERALS

structured_literals_line = !wc -l < $STRUCTURED_LITERALS
structured_literals_line = int(structured_literals_line[0]) - 1
lines.append(structured_literals_line)
# structured_literals_line

- Unique politicians:

In [20]:
structured_literals_distinct = !kgtk query -i $STRUCTURED_LITERALS \
    --match '(p)-[]->()' \
    --return 'count(distinct p) as N'

structured_literals_distinct = int(structured_literals_distinct[1])
entities.append(structured_literals_distinct)
structured_literals_distinct

1460

- Check if data type is useful:

In [21]:
!head $STRUCTURED_LITERALS | column -ts $'\t'

node1     label            node2
Q1027380  property:spouse  "1839-07-23"
Q1027380  property:spouse  "1845-05-27"
Q1037067  property:spouse  "1868-07-14"
Q1064442  property:spouse  "1895-05-11"
Q1065552  property:spouse  "1914-01-31"
Q1066516  property:spouse  "1803-10-04"
Q1066645  property:spouse  "1939-09-03"
Q1066645  property:spouse  "1996-08-01"
Q1070523  property:spouse  "1981-04-16"


**Not** useful!

- Duplicates:

In [None]:
!kgtk query -i $STRUCTURED_LITERALS \
            --match '(p)-[]->(s)' \
            --return 'p, count(s) as N' \
            --order-by 'N desc'

### 3. Strings:

#### Ver 1: Filter all empty strings

- Rows:

In [22]:
!kgtk query -i $NEW_RESULTS \
            --match 'n: (p)-[]->(s)' \
            --where 'kgtk_lqstring(s) AND kgtk_lqstring_text(s) != ""' \
            -o $STRINGS

strings_line = !wc -l < $STRINGS
strings_line = int(strings_line[0]) - 1
lines.append(strings_line)
strings_line

42300

- Unique politicians:

In [23]:
strings_distinct = !kgtk query -i $STRINGS \
    --match '(p)-[]->()' \
    --return 'count(distinct p) as N'
strings_distinct = int(strings_distinct[1])
entities.append(strings_distinct)
strings_distinct

38018

- Check if the data type is useful:

In [24]:
!head $STRINGS | column -ts $'\t'

node1     label            node2
Q1000051  property:spouse  'Agnes Veronica O\'Leary'@en
Q1000061  property:spouse  'Yelena Taratuta'@en
Q1000314  property:spouse  'Helen O\'Connor'@en
Q1000401  property:spouse  'Klára Siklay'@en
Q1000502  property:spouse  'Marion Lucille McLachlan'@en
Q1000799  property:spouse  'Elizabeth Harris'@en
Q1000799  property:spouse  'Marjorie Greenwood Cormack'@en
Q1000957  property:spouse  'Countess Júlia Károlyi de Nagykároly'@en
Q1001178  property:spouse  'Judit Kovách'@en


Useful!

#### Ver 2: query all and analyze

- Rows:

In [25]:
!kgtk query -i $NEW_RESULTS \
            --match 'n: (p)-[]->(s)' \
            --where 'kgtk_lqstring(s)' \
            -o $STRINGS

strings_line = !wc -l < $STRINGS
strings_line = int(strings_line[0]) - 1
lines.append(strings_line)
strings_line

48167

- Unique politicians:

In [26]:
strings_distinct = !kgtk query -i $STRINGS \
    --match '(p)-[]->()' \
    --return 'count(distinct p) as N'

strings_distinct = int(strings_distinct[1])
entities.append(strings_distinct)
strings_distinct

38314

Check if the data type is useful:

In [27]:
!head $STRINGS | column -ts $'\t'

node1     label            node2
Q1000051  property:spouse  ''@en
Q1000051  property:spouse  'Agnes Veronica O\'Leary'@en
Q1000061  property:spouse  'Yelena Taratuta'@en
Q1000314  property:spouse  'Helen O\'Connor'@en
Q1000401  property:spouse  'Klára Siklay'@en
Q1000502  property:spouse  ''@en
Q1000502  property:spouse  'Marion Lucille McLachlan'@en
Q1000799  property:spouse  'Elizabeth Harris'@en
Q1000799  property:spouse  'Marjorie Greenwood Cormack'@en


Useful, but need cleaning.

- Duplicates:

In [None]:
!kgtk query -i $STRINGS \
            --match '(p)-[]->(s)' \
            --return 'p, count(s) as N' \
            --order-by 'N desc'

#### Empty:

- Rows:

In [29]:
!kgtk query -i $NEW_RESULTS \
            --match 'n: (p)-[]->(s)' \
            --where 'kgtk_lqstring_text(s) = ""' \
            -o $EMPTY_STRINGS

empty_lines = !wc -l < $EMPTY_STRINGS
int(empty_lines[0]) - 1

5867

- Unique politicians:

In [30]:
empty_distinct = !kgtk query -i $EMPTY_STRINGS \
    --match 'n: (p)-[]->()' \
    --return 'count(distinct p) as N'

int(empty_distinct[1])

5867

which means there is no duplicate for empty string politician results.

#### Further check empty strings:
Since some of the empty items not excatly mean only return empty for that politician, for example, `Q1133864` has three spouse values: `nodemxZbyK2VRrGoaxfdLmyLxw-7343552`, `'Ethel Arnold'@en`, and `''@en`, so it need further check.

First we filter out all non-empty strings:

In [31]:
!kgtk query -i $NEW_RESULTS \
            --match 'n: (p)-[]->(s)' \
            --where 'kgtk_lqstring(s) AND kgtk_lqstring_text(s) != ""' \
            -o $NON_EMPTY

Then we filter those in empty but also have spouse in non-empty ones, then we can count how many "pure" empty:

In [32]:
!kgtk ifnotexists -i $EMPTY_STRINGS \
                  --filter-on $NON_EMPTY \
                  --input-keys node1 \
                  --filter-keys node1 \
                  -o $PURE_EMPTY

pure_empty_line = !wc -l < $PURE_EMPTY
int(pure_empty_line[0]) - 1

296

In [17]:
pure_empty_distinct = !kgtk query -i $PURE_EMPTY \
    --match 'p: (politician)-[]->()' \
    --return 'count(distinct politician) as N'

int(pure_empty_distinct[1])

16

Check one of them by hand:

In [18]:
!head $PURE_EMPTY | column -ts $'\t'

node1      label            node2
Q1594657   property:spouse  ''@en
Q16727834  property:spouse  ''@en
Q1714588   property:spouse  ''@en
Q205993    property:spouse  ''@en
Q22443695  property:spouse  ''@en
Q250177    property:spouse  ''@en
Q3326309   property:spouse  ''@en
Q5441059   property:spouse  ''@en
Q5529328   property:spouse  ''@en


In [19]:
!kgtk query -i $NEW_RESULTS \
            --match '(q:Q1594657)-[p]->(v)' \
            --return 'q, p.label, v'

node1	label	node2
Q1594657	property:spouse	''@en


### 4. Qnodes

- Rows:

In [28]:
!kgtk query -i $NEW_RESULTS \
            --match 'n:()-[]->(q)' \
            --where 'NOT kgtk_lqstring(q) AND NOT kgtk_number(q)' \
            -o $NODES

!kgtk ifnotexists -i $NODES \
                  --filter-on $STRUCTURED_LITERALS \
                  --input-keys node1 \
                  --filter-keys node1 \
                  -o $QNODES

qnodes_line = !wc -l < $QNODES
qnodes_line = int(qnodes_line[0]) - 1
lines.append(qnodes_line)
qnodes_line

1250

- Unique politicians:

In [29]:
qnodes_distinct = !kgtk query -i $QNODES \
    --match '(p)-[]->()' \
    --return 'count(distinct p) as N'

qnodes_distinct = int(qnodes_distinct[1])
entities.append(qnodes_distinct)
qnodes_distinct

1238

- Check if the data type is useful:

In [30]:
!head $QNODES | column -ts $'\t'

node1      label            node2
Q10296812  property:spouse  Q24279259
Q10311321  property:spouse  Q21020057
Q10316641  property:spouse  Q3781041
Q10321616  property:spouse  Q30939176
Q103926    property:spouse  Q541118
Q1045405   property:spouse  Q3052569
Q104705    property:spouse  Q7853663
Q1053248   property:spouse  Q6041867
Q1064126   property:spouse  Q560127


Check one of them by hand on Internet and it is useful!

- Duplicates:

In [30]:
!kgtk query -i $QNODES \
            --match '(p)-[]->(s)' \
            --return 'p, count(s) as N' \
            --order-by 'N desc' \
            --limit 10

node1	N
Q8298365	2
Q8016992	2
Q6957644	2
Q6832113	2
Q6779683	2
Q6385397	2
Q6350598	2
Q434765	2
Q31865	2
Q315613	2


### 5. Summary

In [34]:
from prettytable import PrettyTable

In [35]:
x = PrettyTable()

x.field_names = ["Data type", "Numbers", "Strings", "Structured literals", "Qnodes"]

x.add_row(lines)
x.add_row(entities)

print(x)

+-----------+---------+---------+---------------------+--------+
| Data type | Numbers | Strings | Structured literals | Qnodes |
+-----------+---------+---------+---------------------+--------+
|   Lines   |   7334  |   1942  |        42300        |  1250  |
|  Entities |   4586  |   1460  |        38018        |  1238  |
+-----------+---------+---------+---------------------+--------+


In [34]:
!kgtk cat -i $NON_EMPTY $QNODES -o $CORRECT

In [35]:
!wc -l $CORRECT

43551 /nas/home/bohuizha/KG/hunger-for-knowledge/output/P26/correct.tsv


In [41]:
!kgtk query -i $CORRECT \
    --match '(q)-[]->()' \
    --return 'count(distinct q)'

count(DISTINCT graph_7194_c1."node1")
39073


In [45]:
!kgtk cat -i $NUMBERS $PURE_EMPTY $STRUCTURED_LITERALS -o $INCORRECT

In [46]:
!kgtk query -i $INCORRECT \
    --match '(q)-[]->()' \
    --return 'count(distinct q)'

count(DISTINCT graph_7202_c1."node1")
5790
