# Step 0 Set up `kgtk`
Check in `spouse_of_politician.ipynb`

# Step 1 Send a SPARQL query use `kgtk`

Example 1: Find cost of movies: 

In [2]:
import os

### Define alias and variables

In [3]:
# Parameters

# Folder where database files store
data_path = "/nas/home/bohuizha/KG/hunger-for-knowledge/data/"

# Folder on local machine where to create the output and temporary folders
output_path = "/nas/home/bohuizha/KG/hunger-for-knowledge/output/"

# The names of files in the KGTK Wikidata distirbution that we will use in this notebook.
data_file_names = {
    "claims": "claims.tsv",
    "wiki_info": "wikidata_infobox.tsv",
    "p31": "P31.tsv",
    "p279star": "P279star.tsv"
}

# We will define environment variables to hold the full paths to the files as we will use them in the shell commands
kgtk_environment_variables = []

os.environ['DATABASE'] = data_path
kgtk_environment_variables.append('DATABASE')

for key, value in data_file_names.items():
    variable = key.upper()
    os.environ[variable] = data_path + value
    kgtk_environment_variables.append(variable)

output_file_names = {
    "results": "movie_with_revenue.tsv",
    "new_results": "new_movie_with_revenue.tsv",
    "movie": "movie.tsv",
    "query_file": "movie_wo_revenue.tsv",
    "numbers": "numbers.tsv",
    "strings": "strings.tsv",
    "empty_strings": "empty_strings.tsv",
    "pure_empty": "pure_empty.tsv",
    "non_empty": "non_empty_strings.tsv",
    "structured_literals": "structured_literals.tsv",
    "nodes": "nodes.tsv",
    "qnodes": "qnodes.tsv"
}

os.environ['OUTPUT'] = output_path
kgtk_environment_variables.append('OUTPUT')

for key, value in output_file_names.items():
    variable = key.upper()
    os.environ[variable] = output_path + value
    kgtk_environment_variables.append(variable)
    
# kgtk_environment_variables.sort()
for variable in kgtk_environment_variables:
    print("{}: \"{}\"".format(variable, os.environ[variable]))

DATABASE: "/nas/home/bohuizha/KG/hunger-for-knowledge/data/"
CLAIMS: "/nas/home/bohuizha/KG/hunger-for-knowledge/data/claims.tsv"
WIKI_INFO: "/nas/home/bohuizha/KG/hunger-for-knowledge/data/wikidata_infobox.tsv"
P31: "/nas/home/bohuizha/KG/hunger-for-knowledge/data/P31.tsv"
P279STAR: "/nas/home/bohuizha/KG/hunger-for-knowledge/data/P279star.tsv"
OUTPUT: "/nas/home/bohuizha/KG/hunger-for-knowledge/output/"
RESULTS: "/nas/home/bohuizha/KG/hunger-for-knowledge/output/movie_with_revenue.tsv"
NEW_RESULTS: "/nas/home/bohuizha/KG/hunger-for-knowledge/output/new_movie_with_revenue.tsv"
MOVIE: "/nas/home/bohuizha/KG/hunger-for-knowledge/output/movie.tsv"
QUERY_FILE: "/nas/home/bohuizha/KG/hunger-for-knowledge/output/movie_wo_revenue.tsv"
NUMBERS: "/nas/home/bohuizha/KG/hunger-for-knowledge/output/numbers.tsv"
STRINGS: "/nas/home/bohuizha/KG/hunger-for-knowledge/output/strings.tsv"
EMPTY_STRINGS: "/nas/home/bohuizha/KG/hunger-for-knowledge/output/empty_strings.tsv"
PURE_EMPTY: "/nas/home/bohuizh

### Main `kgtk` query:

In [1]:
# SPARQL query: 
# SELECT DISTINCT ?movieLabel ?cost 
# WHERE 
# { 
#   ?movie wdt:P31 wd:Q11424 ; 
#          wdt:P577 ?publicationDate ; 
#          wdt:P2130 ?cost . 
#   FILTER(YEAR(?publicationDate) = 2020) . 
#   SERVICE wikibase:label { bd:serviceParam wikibase:language "en". } 
# } 
# where `P2130` is "cost" and `P2142` is "box office";

In [11]:
!kgtk query -i $CLAIMS -i $P31 -i $P279STAR \
            --match 'P31: (movie)-[]->(class), P279star: (class)-[]->(:Q11424), claims: (movie)-[:P2130]->(cost)' \
            --return 'distinct movie as node1, "P2130" as label, kgtk_quantity_number_int(cost) as node2' \
            -o $RESULTS

In [12]:
!head $RESULTS | column -ts $'\t'

node1      label  node2
Q1002251   P2130  4000000
Q1003845   P2130  31000000
Q102225    P2130  150000000
Q102235    P2130  150000000
Q102244    P2130  100000000
Q102438    P2130  125000000
Q102448    P2130  130000000
Q1027247   P2130  14000000
Q10298666  P2130  145000000


### Count known results in Wikidata database:

Count movie-revenue pairs / **rows** (result should -1 which is the header):

In [13]:
!wc -l $RESULTS

3337 /nas/home/bohuizha/KG/hunger-for-knowledge/output/movie_with_revenue.tsv


Count how many **unique movies** have spouse in Wikidata:

In [14]:
!kgtk query -i $RESULTS \
            --match '(p)-[]->()' \
            --return 'count(distinct p) as N'

N
3330


Duplicates:

In [16]:
!kgtk query -i $RESULTS \
            --match '(p)-[]->(s)' \
            --return 'p, count(s) as N' \
            --order-by 'N desc' \
            --limit 10

node1	N
Q6074	2
Q316555	2
Q255342	2
Q20926273	2
Q16795448	2
Q134430	2
Q998377	1
Q994209	1
Q990840	1
Q97930682	1


### Find unknow results in Wikidata database:

- Find all movies (already completed)

In [5]:
# !kgtk query -i $CLAIMS \
#             --match 'c: (movie)-[:P31]->(:Q11424)' \
#             --return 'movie as node1, "P31" as lael, "Q11424" as node2' \
#             -o $MOVIE

# !wc -l < $MOVIE

255070


In [16]:
!kgtk query -i $P31 -i $P279STAR \
            --match 'P31: (movie)-[]->(class), P279star: (class)-[]->(:Q11424)' \
            --return 'movie as node1, "P31" as label, "Q11424" as node2' \
            -o $MOVIE

!wc -l < $MOVIE

306195


- Eliminate movies which have revenue

In [17]:
!kgtk ifnotexists -i $MOVIE \
                  --filter-on $RESULTS \
                  --input-keys node1 \
                  --filter-keys node1 \
                  -o $QUERY_FILE

!wc -l < $QUERY_FILE

302669


### Count unknown results in Wikidata database:

In [20]:
!kgtk query -i $QUERY_FILE \
            --match '(p)-[]->()' \
            --return 'count(distinct p) as N'

N
300208


# Step 2 Infer properties

Use query results from Wikidata database to infer properties in Wikidata infobox and return the most frequent property.

In [3]:
!kgtk query -i $RESULTS -i $WIKI_INFO \
            --match 'm: (movie)-[]->(cost), w: (movie)-[p]->(v)' \
            --where 'kgtk_quantity_number_int(cost) = kgtk_quantity_number_int(v)' \
            --return 'p.label, count(v) as N' \
            --order-by 'N desc' \
            --limit 1

label	N
property:budget	17


In [4]:
!kgtk query -i $RESULTS -i $WIKI_INFO \
            --match 'm: (movie)-[]->(cost), w: (movie)-[p]->(s)-[sv]->(v)' \
            --where 'sv.label = "dbpedia:structured_value" AND kgtk_quantity_number_int(cost) = kgtk_quantity_number_int(kgtk_unstringify(v))' \
            --return 'p.label, count(v) as N' \
            --order-by 'N desc' \
            --limit 1

label	N
property:budget	1049


Try to combine the above two inference:

In [14]:
!kgtk query -i $RESULTS -i $WIKI_INFO --force \
            --match 'm: (movie)-[]->(cost), w: (movie)-[p]->(s)-[sv]->(v)' \
            --where '(kgtk_quantity_number_int(cost) = kgtk_quantity_number_int(kgtk_unstringify(s))) OR (sv.label = "dbpedia:structured_value" AND kgtk_quantity_number_int(cost) = kgtk_quantity_number_int(kgtk_unstringify(v)))' \
            --return 'p.label, count(movie) as N' \
            --order-by 'N desc' \
            --limit 1

label	N
property:budget	1049


In [15]:
!kgtk query -i $RESULTS -i $WIKI_INFO --force \
            --match 'm: (movie)-[]->(cost), w: (movie1)-[p]->(v1), w: (movie2)-[p]->(s)-[sv]->(v2)' \
            --where 'kgtk_quantity_number_int(cost) = kgtk_quantity_number_int(kgtk_unstringify(v1)) OR (sv.label = "dbpedia:structured_value" AND kgtk_quantity_number_int(cost) = kgtk_quantity_number_int(kgtk_unstringify(v2)))' \
            --return 'p.label, count(v1) + count(v2) as N' \
            --order-by 'N desc' \
            --limit 1

'id'



# Step 3 Run query in Wikidata infobox 

For those don't have revenue, query in Wikidata infobox:

In [18]:
!kgtk query -i $QUERY_FILE -i $WIKI_INFO \
            --match 'm: (movie)-[]->(), w: (movie)-[p]->(revenue)' \
            --where 'p.label = "property:budget"' \
            --return 'distinct movie, p.label, revenue' \
            -o $NEW_RESULTS

- Count rows of new findings:

In [19]:
!wc -l < $NEW_RESULTS

19702


- Count unique movies of new findings:

In [20]:
!kgtk query -i $NEW_RESULTS \
            --match 'n: (p)-[]->()' \
            --return 'count(distinct p) as N'

N
18336


# Step 4 Datatype distribution of new findings

Collect lines' number and distinct entities' number:

In [5]:
lines = ["Lines"]
entities = ["Entities"]

### 1. Numbers:

- Rows:

In [6]:
!kgtk query -i $NEW_RESULTS \
            --match 'n: (p)-[]->(s)' \
            --where 'kgtk_number(s)' \
            -o $NUMBERS

numbers_line = !wc -l < $NUMBERS
numbers_line = int(numbers_line[0]) - 1
lines.append(numbers_line)
# numbers_line

- Unique entities:

In [7]:
numbers_distinct = !kgtk query -i $NUMBERS \
    --match '(p)-[]->()' \
    --return 'count(distinct p) as N'

numbers_distinct = int(numbers_distinct[1])
entities.append(numbers_distinct)
# numbers_distinct

- Check if the data type is useful:

In [8]:
!head $NUMBERS | column -ts $'\t'

node1      label            node2
Q1015942   property:budget  25000
Q1033304   property:budget  7.2
Q1043614   property:budget  120000000
Q1050105   property:budget  280000000
Q10551717  property:budget  342
Q1060164   property:budget  40000000
Q106819    property:budget  180000000
Q10716445  property:budget  300
Q10863988  property:budget  300000000


- Duplicates (movies that have multiple numbers):

In [None]:
!kgtk query -i $NUMBERS \
            --match '(p)-[]->(s)' \
            --return 'p, count(s) as N' \
            --order-by 'N desc'

### 2. Structured literals:

- Rows:

In [9]:
!kgtk query -i $NEW_RESULTS -i $WIKI_INFO \
            --match 'n: (q)-[p]->(s), w: (s)-[sv]->(v)' \
            --where 'NOT kgtk_lqstring(s) AND NOT kgtk_number(s) AND sv.label = "dbpedia:structured_value"' \
            --return 'q, p.label, v' \
            -o $STRUCTURED_LITERALS

structured_literals_line = !wc -l < $STRUCTURED_LITERALS
structured_literals_line = int(structured_literals_line[0]) - 1
lines.append(structured_literals_line)
# structured_literals_line

- Unique entities:

In [10]:
structured_literals_distinct = !kgtk query -i $STRUCTURED_LITERALS \
    --match '(p)-[]->()' \
    --return 'count(distinct p) as N'

structured_literals_distinct = int(structured_literals_distinct[1])
entities.append(structured_literals_distinct)
# structured_literals_distinct

- Check if the data type is useful:

In [11]:
!head $STRUCTURED_LITERALS | column -ts $'\t'

node1     label            node2
Q1000394  property:budget  "361000.0"
Q1001943  property:budget  "7100000.0"
Q1002100  property:budget  "4.5E7"
Q1002142  property:budget  "1500000.0"
Q1002480  property:budget  "4500000.0"
Q1004392  property:budget  "1.5E7"
Q1004440  property:budget  "1.2E7"
Q1004531  property:budget  "2.0E7"
Q1004567  property:budget  "1.5E7"


Useful, but need reformat!

- Duplicates:

In [None]:
!kgtk query -i $STRUCTURED_LITERALS \
            --match '(p)-[]->(s)' \
            --return 'p, count(s) as N' \
            --order-by 'N desc'

### 3. Strings:

#### Ver 1: filter the empty strings

In [12]:
!kgtk query -i $NEW_RESULTS \
            --match 'n: (p)-[]->(s)' \
            --where 'kgtk_lqstring(s) AND kgtk_lqstring_text(s) != ""' \
            -o $STRINGS

strings_line = !wc -l < $STRINGS
strings_line = int(strings_line[0]) - 1
lines.append(strings_line)
# strings_line

In [13]:
strings_distinct = !kgtk query -i $STRINGS \
    --match '(p)-[]->()' \
    --return 'count(distinct p) as N'

strings_distinct = int(strings_distinct[1])
entities.append(strings_distinct)
# strings_distinct

- Check if the data type is useful:

In [14]:
!head $STRINGS | column -ts $'\t'

node1      label            node2
Q1004410   property:budget  'App. $3,500,000'@en
Q1008801   property:budget  'INR 26 Crore'@en
Q10264518  property:budget  'R$ 6 million'@en
Q10277912  property:budget  'R$4 million'@en
Q10287994  property:budget  'R$10 million'@en
Q10294054  property:budget  '$300,000 USD'@en
Q10296231  property:budget  'CAD$2,000,000'@en
Q10327416  property:budget  'R$6 million'@en
Q10328595  property:budget  'R$4.5–5 million'@en


Useful, but need more cleaning!

#### Ver 2: query all and analyze

- Rows:

In [26]:
!kgtk query -i $NEW_RESULTS \
            --match 'n: (p)-[]->(s)' \
            --where 'kgtk_lqstring(s)' \
            -o $STRINGS

!wc -l < $STRINGS

4041


- Unique movies:

In [27]:
!kgtk query -i $STRINGS \
            --match '(p)-[]->()' \
            --return 'count(distinct p) as N'

N
3823


- Duplicates:

In [None]:
!kgtk query -i $STRINGS \
            --match '(p)-[]->(s)' \
            --return 'p, count(s) as N' \
            --order-by 'N desc'

#### Empty:

- Rows:

In [29]:
!kgtk query -i $NEW_RESULTS \
            --match 'n: (p)-[]->(s)' \
            --where 'kgtk_lqstring_text(s) = ""' \
            -o $EMPTY_STRINGS

!wc -l < $EMPTY_STRINGS

440


- Unique movies:

In [30]:
!kgtk query -i $EMPTY_STRINGS \
            --match 'n: (p)-[]->()' \
            --return 'count(distinct p) as N'

N
439


#### Further check empty strings:

First we filter out all non-empty strings:

In [31]:
!kgtk query -i $NEW_RESULTS \
            --match 'n: (p)-[]->(s)' \
            --where 'NOT kgtk_lqstring(s) OR kgtk_lqstring_text(s) != ""' \
            -o $NON_EMPTY

Then we filter those in empty but also have spouse in non-empty ones, then we can count how many "pure" empty:

In [32]:
!kgtk ifnotexists -i $EMPTY_STRINGS \
                  --filter-on $NON_EMPTY \
                  --input-keys node1 \
                  --filter-keys node1 \
                  -o $PURE_EMPTY 

!wc -l < $PURE_EMPTY 

14


In [33]:
!kgtk query -i $PURE_EMPTY  \
            --match 'p: (q)-[]->()' \
            --return 'count(distinct q) as N'

N
13


Check one of them by hand:

In [34]:
!head $PURE_EMPTY | column -ts $'\t'

node1      label            node2
Q155653    property:budget  ''@en
Q16242907  property:budget  ''@en
Q186572    property:budget  ''@en
Q24807309  property:budget  ''@en
Q255328    property:budget  ''@en
Q26252639  property:budget  ''@en
Q27954163  property:budget  ''@en
Q27959196  property:budget  ''@en
Q28698824  property:budget  ''@en


In [35]:
!kgtk query -i $NEW_RESULTS \
            --match '(q:Q16242907)-[p]->(v)' \
            --return 'q, p.label, v'

node1	label	node2
Q16242907	property:budget	''@en


### 4. Qnodes

- Rows:

In [15]:
!kgtk query -i $NEW_RESULTS \
            --match 'n:()-[]->(q)' \
            --where 'NOT kgtk_lqstring(q) AND NOT kgtk_number(q)' \
            -o $NODES

!kgtk ifnotexists -i $NODES \
                  --filter-on $STRUCTURED_LITERALS \
                  --input-keys node1 \
                  --filter-keys node1 \
                  -o $QNODES

qnodes_line = !wc -l < $QNODES
qnodes_line = int(qnodes_line[0]) - 1
lines.append(qnodes_line)
# qnodes_line

11

- Unique movies:

In [16]:
qnodes_distinct = !kgtk query -i $QNODES \
    --match '(p)-[]->()' \
    --return 'count(distinct p) as N'

qnodes_distinct = int(qnodes_distinct[1])
entities.append(qnodes_distinct)
# qnodes_distinct

11

- Check if the data type is useful:

In [17]:
!head $QNODES | column -ts $'\t'

node1      label            node2
Q10851237  property:budget  Q208526
Q15637595  property:budget  Q1104069
Q21065428  property:budget  Q208526
Q2706071   property:budget  Q189097
Q4155584   property:budget  Q158478
Q4796595   property:budget  Q7346593
Q48671706  property:budget  Q178843
Q4927579   property:budget  Q17193
Q5284662   property:budget  Q178843


**Not** useful, mostly currecy, others are other film information items; 

- Duplicates:

In [19]:
!kgtk query -i $QNODES \
            --match '(p)-[]->(s)' \
            --return 'p, count(s) as N'

node1	N
Q10851237	1
Q15637595	1
Q21065428	1
Q2706071	1
Q4155584	1
Q4796595	1
Q48671706	1
Q4927579	1
Q5284662	1
Q7278388	1
Q7288987	1


### 5. Summary

In [20]:
from prettytable import PrettyTable

In [21]:
x = PrettyTable()

x.field_names = ["Data type", "Numbers", "Strings", "Structured literals", "Qnodes"]

x.add_row(lines)
x.add_row(entities)

print(x)

+-----------+---------+---------+---------------------+--------+
| Data type | Numbers | Strings | Structured literals | Qnodes |
+-----------+---------+---------+---------------------+--------+
|   Lines   |   1517  |  14133  |         3601        |   11   |
|  Entities |   1517  |  14109  |         3589        |   11   |
+-----------+---------+---------+---------------------+--------+
