# Step 0 Set up `kgtk`
Check in `spouse_of_politician.ipynb`

# Step 1 Send a SPARQL query use `kgtk`

Example 1: Find cost of movies: 

In [5]:
import os

### Define alias and variables

In [6]:
# Parameters

# Folder where database files store
data_path = "/nas/home/bohuizha/KG/hunger-for-knowledge/data/"

# Folder on local machine where to create the output and temporary folders
output_path = "/nas/home/bohuizha/KG/hunger-for-knowledge/output/P2130/"
if not os.path.exists(output_path):
    os.mkdir(output_path)

# The names of files in the KGTK Wikidata distirbution that we will use in this notebook.
data_file_names = {
    "claims": "claims.tsv",
    "wiki_info": "wikidata_infobox.tsv",
    "p31": "P31.tsv",
    "p279star": "P279star.tsv"
}

# We will define environment variables to hold the full paths to the files as we will use them in the shell commands
kgtk_environment_variables = []

os.environ['DATABASE'] = data_path
kgtk_environment_variables.append('DATABASE')

for key, value in data_file_names.items():
    variable = key.upper()
    os.environ[variable] = data_path + value
    kgtk_environment_variables.append(variable)

output_file_names = {
    "results": "movie_with_revenue.tsv",
    "new_results": "new_movie_with_revenue.tsv",
    "infobox_results": "all_movie_with_revenue.tsv",
    "movie": "movie.tsv",
    "query_file": "movie_wo_revenue.tsv",
    "numbers": "numbers.tsv",
    "strings": "strings.tsv",
    "empty_strings": "empty_strings.tsv",
    "pure_empty": "pure_empty.tsv",
    "non_empty": "non_empty_strings.tsv",
    "structured_literals": "structured_literals.tsv",
    "nodes": "nodes.tsv",
    "qnodes": "qnodes.tsv",
    "correct": "correct.tsv",
    "incorrect": "incorrect.tsv"
}

os.environ['OUTPUT'] = output_path
kgtk_environment_variables.append('OUTPUT')

for key, value in output_file_names.items():
    variable = key.upper()
    os.environ[variable] = output_path + value
    kgtk_environment_variables.append(variable)
    
# kgtk_environment_variables.sort()
for variable in kgtk_environment_variables:
    print("{}: \"{}\"".format(variable, os.environ[variable]))

DATABASE: "/nas/home/bohuizha/KG/hunger-for-knowledge/data/"
CLAIMS: "/nas/home/bohuizha/KG/hunger-for-knowledge/data/claims.tsv"
WIKI_INFO: "/nas/home/bohuizha/KG/hunger-for-knowledge/data/wikidata_infobox.tsv"
P31: "/nas/home/bohuizha/KG/hunger-for-knowledge/data/P31.tsv"
P279STAR: "/nas/home/bohuizha/KG/hunger-for-knowledge/data/P279star.tsv"
OUTPUT: "/nas/home/bohuizha/KG/hunger-for-knowledge/output/P2130/"
RESULTS: "/nas/home/bohuizha/KG/hunger-for-knowledge/output/P2130/movie_with_revenue.tsv"
NEW_RESULTS: "/nas/home/bohuizha/KG/hunger-for-knowledge/output/P2130/new_movie_with_revenue.tsv"
INFOBOX_RESULTS: "/nas/home/bohuizha/KG/hunger-for-knowledge/output/P2130/all_movie_with_revenue.tsv"
MOVIE: "/nas/home/bohuizha/KG/hunger-for-knowledge/output/P2130/movie.tsv"
QUERY_FILE: "/nas/home/bohuizha/KG/hunger-for-knowledge/output/P2130/movie_wo_revenue.tsv"
NUMBERS: "/nas/home/bohuizha/KG/hunger-for-knowledge/output/P2130/numbers.tsv"
STRINGS: "/nas/home/bohuizha/KG/hunger-for-knowled

### Main `kgtk` query:

In [1]:
# SPARQL query: 
# SELECT DISTINCT ?movieLabel ?cost 
# WHERE 
# { 
#   ?movie wdt:P31 wd:Q11424 ; 
#          wdt:P577 ?publicationDate ; 
#          wdt:P2130 ?cost . 
#   FILTER(YEAR(?publicationDate) = 2020) . 
#   SERVICE wikibase:label { bd:serviceParam wikibase:language "en". } 
# } 
# where `P2130` is "cost" and `P2142` is "box office";

In [7]:
!kgtk query -i $CLAIMS -i $P31 -i $P279STAR \
            --match 'P31: (movie)-[]->(class), P279star: (class)-[]->(:Q11424), claims: (movie)-[:P2130]->(cost)' \
            --return 'distinct movie as node1, "P2130" as label, kgtk_quantity_number_int(cost) as node2' \
            -o $RESULTS

In [8]:
!head $RESULTS | column -ts $'\t'

node1      label  node2
Q1002251   P2130  4000000
Q1003845   P2130  31000000
Q102225    P2130  150000000
Q102235    P2130  150000000
Q102244    P2130  100000000
Q102438    P2130  125000000
Q1027247   P2130  14000000
Q10307713  P2130  160000000
Q103474    P2130  10500000


### Count known results in Wikidata database:

Count movie-revenue pairs / **rows** (result should -1 which is the header):

In [9]:
!wc -l $RESULTS

3337 /nas/home/bohuizha/KG/hunger-for-knowledge/output/P2130/movie_with_revenue.tsv


Count how many **unique movies** have spouse in Wikidata:

In [10]:
!kgtk query -i $RESULTS \
            --match '(p)-[]->()' \
            --return 'count(distinct p) as N'

N
3330


Duplicates:

In [7]:
!kgtk query -i $RESULTS \
            --match '(p)-[]->(s)' \
            --return 'p, count(s) as N' \
            --order-by 'N desc' \
            --limit 10

node1	N
Q6074	2
Q316555	2
Q255342	2
Q20926273	2
Q16795448	2
Q134430	2
Q998377	1
Q994209	1
Q990840	1
Q97930682	1


### Find unknow results in Wikidata database:

- Find all movies (already completed)

In [8]:
# !kgtk query -i $CLAIMS \
#             --match 'c: (movie)-[:P31]->(:Q11424)' \
#             --return 'movie as node1, "P31" as lael, "Q11424" as node2' \
#             -o $MOVIE

# !wc -l < $MOVIE

In [11]:
!kgtk query -i $P31 -i $P279STAR \
            --match 'P31: (movie)-[]->(class), P279star: (class)-[]->(:Q11424)' \
            --return 'movie as node1, "P31" as label, "Q11424" as node2' \
            -o $MOVIE

!wc -l < $MOVIE

306195


In [12]:
!kgtk query -i $MOVIE \
    --match '(q)-[]->()' \
    --return 'count(distinct q) as N'

N
303538


- Eliminate movies which have revenue

In [13]:
!kgtk ifnotexists -i $MOVIE \
                  --filter-on $RESULTS \
                  --input-keys node1 \
                  --filter-keys node1 \
                  -o $QUERY_FILE

!wc -l < $QUERY_FILE

302669


### Count unknown results in Wikidata database:

In [14]:
!kgtk query -i $QUERY_FILE \
            --match '(p)-[]->()' \
            --return 'count(distinct p) as N'

N
300208


# Step 2 Infer properties

Use query results from Wikidata database to infer properties in Wikidata infobox and return the most frequent property.

In [40]:
!kgtk query -i $RESULTS -i $WIKI_INFO \
            --match 'm: (movie)-[]->(cost), w: (movie)-[p]->(v)' \
            --where 'kgtk_quantity_number_int(cost) = kgtk_quantity_number_int(v)' \
            --return 'p.label, count(v) as N' \
            --order-by 'N desc' \
            --limit 3

label	N
property:budget	17


In [41]:
!kgtk query -i $RESULTS -i $WIKI_INFO \
            --match 'm: (movie)-[]->(cost), w: (movie)-[p]->(s)-[sv]->(v)' \
            --where 'sv.label = "dbpedia:structured_value" AND kgtk_quantity_number_int(cost) = kgtk_quantity_number_int(kgtk_unstringify(v))' \
            --return 'p.label, count(v) as N' \
            --order-by 'N desc' \
            --limit 3

label	N
property:budget	1049
property:gross	4


Try to combine the above two inference:

In [14]:
!kgtk query -i $RESULTS -i $WIKI_INFO --force \
            --match 'm: (movie)-[]->(cost), w: (movie)-[p]->(s)-[sv]->(v)' \
            --where '(kgtk_quantity_number_int(cost) = kgtk_quantity_number_int(kgtk_unstringify(s))) OR (sv.label = "dbpedia:structured_value" AND kgtk_quantity_number_int(cost) = kgtk_quantity_number_int(kgtk_unstringify(v)))' \
            --return 'p.label, count(movie) as N' \
            --order-by 'N desc' \
            --limit 1

label	N
property:budget	1049


In [15]:
!kgtk query -i $RESULTS -i $WIKI_INFO --force \
            --match 'm: (movie)-[]->(cost), w: (movie1)-[p]->(v1), w: (movie2)-[p]->(s)-[sv]->(v2)' \
            --where 'kgtk_quantity_number_int(cost) = kgtk_quantity_number_int(kgtk_unstringify(v1)) OR (sv.label = "dbpedia:structured_value" AND kgtk_quantity_number_int(cost) = kgtk_quantity_number_int(kgtk_unstringify(v2)))' \
            --return 'p.label, count(v1) + count(v2) as N' \
            --order-by 'N desc' \
            --limit 1

'id'



# Step 3 Run query in Wikidata infobox 

For those don't have revenue, query in Wikidata infobox:

In [15]:
!kgtk query -i $QUERY_FILE -i $WIKI_INFO \
            --match 'm: (movie)-[]->(), w: (movie)-[p]->(revenue)' \
            --where 'p.label = "property:budget"' \
            --return 'distinct movie, p.label, revenue' \
            -o $NEW_RESULTS

- Count rows of new findings:

In [16]:
!wc -l < $NEW_RESULTS

19702


- Count unique movies of new findings:

In [17]:
!kgtk query -i $NEW_RESULTS \
            --match 'n: (p)-[]->()' \
            --return 'count(distinct p) as N'

N
18336


Agree:

In [18]:
!kgtk query -i $MOVIE $WIKI_INFO \
            --match 'm: (movie)-[]->(), w: (movie)-[p]->(revenue)' \
            --where 'p.label = "property:budget"' \
            --return 'distinct movie, p.label, revenue' \
            -o $INFOBOX_RESULTS

In [19]:
!kgtk query -i $INFOBOX_RESULTS $RESULTS \
    --match 'a: (q)-[]->(v), m: (q)-[]->(v)' \
    --return 'count(distinct q) as N'

N
16


In [None]:
!kgtk query -i $INFOBOX_RESULTS $WIKI_INFO $RESULTS \
    --match 'a: (q)-[]->(v), m: (q)-[]->(v)' \
    --return 'count(distinct q) as N'

# Step 4 Datatype distribution of new findings

Collect lines' number and distinct entities' number:

In [20]:
lines = ["Lines"]
entities = ["Entities"]

### 1. Numbers:

- Rows:

In [21]:
!kgtk query -i $NEW_RESULTS \
            --match 'n: (p)-[]->(s)' \
            --where 'kgtk_number(s)' \
            -o $NUMBERS

numbers_line = !wc -l < $NUMBERS
numbers_line = int(numbers_line[0]) - 1
lines.append(numbers_line)
numbers_line

1517

- Unique entities:

In [22]:
numbers_distinct = !kgtk query -i $NUMBERS \
    --match '(p)-[]->()' \
    --return 'count(distinct p) as N'

numbers_distinct = int(numbers_distinct[1])
entities.append(numbers_distinct)
numbers_distinct

1517

- Check if the data type is useful:

In [23]:
!head -n 20 $NUMBERS | column -ts $'\t'

node1      label            node2
Q1015942   property:budget  25000
Q1033304   property:budget  7.2
Q1043614   property:budget  120000000
Q1050105   property:budget  280000000
Q10551717  property:budget  342
Q1060164   property:budget  40000000
Q106819    property:budget  180000000
Q10716445  property:budget  300
Q10863988  property:budget  300000000
Q1094828   property:budget  180000000
Q1094839   property:budget  30
Q1095356   property:budget  6000000
Q10968665  property:budget  2200000
Q10983189  property:budget  10
Q11003667  property:budget  25000000
Q11048250  property:budget  1
Q1109255   property:budget  10
Q11268003  property:budget  3.5
Q1130343   property:budget  31


- Duplicates (movies that have multiple numbers):

In [None]:
!kgtk query -i $NUMBERS \
            --match '(p)-[]->(s)' \
            --return 'p, count(s) as N' \
            --order-by 'N desc'

### 2. Structured literals:

- Rows:

In [24]:
!kgtk query -i $NEW_RESULTS -i $WIKI_INFO \
            --match 'n: (q)-[p]->(s), w: (s)-[sv]->(v)' \
            --where 'NOT kgtk_lqstring(s) AND NOT kgtk_number(s) AND sv.label = "dbpedia:structured_value"' \
            --return 'q, p.label, v' \
            -o $STRUCTURED_LITERALS

structured_literals_line = !wc -l < $STRUCTURED_LITERALS
structured_literals_line = int(structured_literals_line[0]) - 1
lines.append(structured_literals_line)
structured_literals_line

14133

- Unique entities:

In [25]:
structured_literals_distinct = !kgtk query -i $STRUCTURED_LITERALS \
    --match '(p)-[]->()' \
    --return 'count(distinct p) as N'

structured_literals_distinct = int(structured_literals_distinct[1])
entities.append(structured_literals_distinct)
structured_literals_distinct

14109

- Check if the data type is useful:

In [26]:
!head -n 20 $STRUCTURED_LITERALS | column -ts $'\t'

node1     label            node2
Q1000394  property:budget  "361000.0"
Q1001943  property:budget  "7100000.0"
Q1002100  property:budget  "4.5E7"
Q1002142  property:budget  "1500000.0"
Q1002480  property:budget  "4500000.0"
Q1004392  property:budget  "1.5E7"
Q1004440  property:budget  "1.2E7"
Q1004531  property:budget  "2.0E7"
Q1004567  property:budget  "1.5E7"
Q1004657  property:budget  "2100000.0"
Q1004801  property:budget  "700000.0"
Q1008351  property:budget  "1214899.0"
Q1009788  property:budget  "1600000.0"
Q1010058  property:budget  "1.8E8"
Q1010099  property:budget  "6.36E7"
Q1012216  property:budget  "2.2E7"
Q1014049  property:budget  "600000.0"
Q1015807  property:budget  "1700000.0"
Q1016369  property:budget  "1.0E7"


Useful, but need reformat!

- Duplicates:

In [27]:
!kgtk query -i $STRUCTURED_LITERALS \
            --match '(p)-[]->(s)' \
            --return 'p, count(s) as N' \
            --order-by 'N desc'

node1	N
Q603433	4
Q979675	3
Q673850	3
Q714713	2
Q711516	2
Q6186046	2
Q5447214	2
Q42708010	2
Q3567754	2
Q3017103	2
Q29011	2
Q223278	2
Q2164258	2
Q20164609	2
Q19242484	2
Q17183934	2
Q17125180	2
Q15901539	2
Q14704171	2
Q1218054	2
Q999353	1
Q998219	1
Q997966	1
Q997850	1
Q997671	1
Q997423	1
Q997405	1
Q997206	1
Q996995	1
Q996159	1
Q994481	1
Q994248	1
Q992813	1
Q991440	1
Q990976	1
Q990564	1
Q989036	1
Q987305	1
Q984164	1
Q983912	1
Q982791	1
Q981670	1
Q981189	1
Q981032	1
Q980308	1
Q980257	1
Q980223	1
Q980041	1
Q979734	1
Q979726	1
Q979462	1
Q979279	1
Q979256	1
Q979196	1
Q978849	1
Q978770	1
Q978475	1
Q977996	1
Q977750	1
Q977372	1
Q977196	1
Q976728	1
Q976149	1
Q976139	1
Q975358	1
Q974771	1
Q974427	1
Q973722	1
Q973663	1
Q973623	1
Q973616	1
Q972922	1
Q972771	1
Q972154	1
Q972072	1
Q971985	1
Q971941	1
Q971636	1
Q971630	1
Q971468	1
Q971412	1
Q971165	1
Q971109	1
Q970746	1
Q970655	1
Q970543	1
Q969651	1
Q969646	1
Q969

Q48818131	1
Q4881779	1
Q488041	1
Q4880295	1
Q4880283	1
Q487978	1
Q4879432	1
Q4878491	1
Q48781975	1
Q4877864	1
Q4877364	1
Q4877213	1
Q4876991	1
Q4876862	1
Q4876343	1
Q4875755	1
Q4875682	1
Q4875675	1
Q487447	1
Q487438	1
Q487383	1
Q48734	1
Q4873361	1
Q4872863	1
Q487271	1
Q487181	1
Q487138	1
Q486954	1
Q486826	1
Q486822	1
Q4867896	1
Q48674621	1
Q48674494	1
Q48674419	1
Q48674322	1
Q48674258	1
Q48674240	1
Q48674131	1
Q48673969	1
Q48673967	1
Q48673898	1
Q48673882	1
Q48673862	1
Q48672826	1
Q48672822	1
Q48672812	1
Q48672735	1
Q48672553	1
Q48672480	1
Q48672452	1
Q48672376	1
Q48671691	1
Q48671600	1
Q48671585	1
Q486591	1
Q48637969	1
Q4860288	1
Q4859547	1
Q485803	1
Q485610	1
Q485527	1
Q4853901	1
Q484987	1
Q484951	1
Q484857	1
Q4844399	1
Q484048	1
Q4840366	1
Q4840351	1
Q4840341	1
Q4840281	1
Q4840270	1
Q4840214	1
Q4840170	1
Q4839322	1
Q4839304	1
Q4839171	1
Q483909	1
Q4839085	1
Q4838754	1
Q483852	1
Q4838336	1
Q4837101	1
Q

Q2140150	1
Q2140135	1
Q214013	1
Q2139961	1
Q2138296	1
Q213773	1
Q2137301	1
Q213683	1
Q2133691	1
Q213326	1
Q2132418	1
Q213215	1
Q213208	1
Q213138	1
Q21313531	1
Q2131160	1
Q213053	1
Q213012	1
Q2129738	1
Q212965	1
Q212864	1
Q2128630	1
Q2127904	1
Q212775	1
Q212689	1
Q21264925	1
Q2124822	1
Q212129	1
Q212098	1
Q2120775	1
Q21203805	1
Q21203305	1
Q2119500	1
Q2119348	1
Q2118806	1
Q21187932	1
Q21186592	1
Q2118509	1
Q2117963	1
Q211693	1
Q2116797	1
Q21161299	1
Q21161034	1
Q2115481	1
Q211545	1
Q211429	1
Q211373	1
Q211372	1
Q2113568	1
Q211321	1
Q21130181	1
Q2112932	1
Q211278	1
Q211206	1
Q2112023	1
Q2110876	1
Q21107374	1
Q2110480	1
Q21104090	1
Q21098710	1
Q21095446	1
Q21095400	1
Q2108316	1
Q21079862	1
Q210681	1
Q210590	1
Q2104413	1
Q2104178	1
Q210364	1
Q21034423	1
Q21032710	1
Q21016455	1
Q21015106	1
Q21010879	1
Q21010868	1
Q21010864	1
Q21010850	1
Q21010849	1
Q21010845	1
Q21009568	1
Q21009548	1
Q21009512	1
Q21009457	1
Q

Q1236069	1
Q123558	1
Q1235281	1
Q1234664	1
Q1234074	1
Q1233890	1
Q1233213	1
Q1233030	1
Q123166	1
Q1230682	1
Q1230516	1
Q1230318	1
Q12302227	1
Q1229857	1
Q1229424	1
Q1229070	1
Q12285060	1
Q1228304	1
Q1228275	1
Q122623	1
Q1225756	1
Q1225134	1
Q12216977	1
Q122113	1
Q1219965	1
Q1219922	1
Q1219899	1
Q1219828	1
Q1219825	1
Q1219814	1
Q1219785	1
Q1219775	1
Q1219738	1
Q1219727	1
Q1219711	1
Q1219665	1
Q1219584	1
Q1219559	1
Q1219363	1
Q1219353	1
Q1219330	1
Q1219280	1
Q1219171	1
Q1219056	1
Q1219026	1
Q1218974	1
Q1218963	1
Q1218959	1
Q1218897	1
Q1218888	1
Q1218876	1
Q1218856	1
Q1218847	1
Q1218827	1
Q1218719	1
Q1218665	1
Q1218601	1
Q1218571	1
Q1218542	1
Q1218526	1
Q1218520	1
Q1218481	1
Q1218462	1
Q1218403	1
Q1218313	1
Q1218263	1
Q1218226	1
Q1218172	1
Q1218111	1
Q121810	1
Q1218039	1
Q1218028	1
Q1218001	1
Q1217996	1
Q1217937	1
Q1217931	1
Q1217879	1
Q1217852	1
Q1217752	1
Q1217733	1
Q1217715	1
Q1217573	1
Q1217565	1
Q12175

### 3. Strings:

#### Ver 1: filter the empty strings

In [28]:
!kgtk query -i $NEW_RESULTS \
            --match 'n: (p)-[]->(s)' \
            --where 'kgtk_lqstring(s) AND kgtk_lqstring_text(s) != ""' \
            -o $STRINGS

strings_line = !wc -l < $STRINGS
strings_line = int(strings_line[0]) - 1
lines.append(strings_line)
strings_line

3601

In [29]:
strings_distinct = !kgtk query -i $STRINGS \
    --match '(p)-[]->()' \
    --return 'count(distinct p) as N'

strings_distinct = int(strings_distinct[1])
entities.append(strings_distinct)
strings_distinct

3589

- Check if the data type is useful:

In [30]:
!head -n 20 $STRINGS | column -ts $'\t'

node1      label            node2
Q1004410   property:budget  'App. $3,500,000'@en
Q1008801   property:budget  'INR 26 Crore'@en
Q10264518  property:budget  'R$ 6 million'@en
Q10277912  property:budget  'R$4 million'@en
Q10287994  property:budget  'R$10 million'@en
Q10294054  property:budget  '$300,000 USD'@en
Q10296231  property:budget  'CAD$2,000,000'@en
Q10327416  property:budget  'R$6 million'@en
Q10328595  property:budget  'R$4.5–5 million'@en
Q10329335  property:budget  'R$3,6 million'@en
Q1033304   property:budget  'INR'@en
Q10338807  property:budget  'RS3 million'@en
Q10338882  property:budget  'R$1,6 million'@en
Q10339344  property:budget  'R$7.5 million'@en
Q10339864  property:budget  'R$ 13,000,000'@en
Q10344758  property:budget  'R$ 10,500,000'@en
Q1034978   property:budget  '$400,000-$500,000'@en
Q1034984   property:budget  'over $1 million or $250,000'@en
Q1035443   property:budget  '£450,000 or £475,000'@en


Useful, but need more cleaning!

#### Ver 2: query all and analyze

- Rows:

In [26]:
!kgtk query -i $NEW_RESULTS \
            --match 'n: (p)-[]->(s)' \
            --where 'kgtk_lqstring(s)' \
            -o $STRINGS

!wc -l < $STRINGS

4041


- Unique movies:

In [27]:
!kgtk query -i $STRINGS \
            --match '(p)-[]->()' \
            --return 'count(distinct p) as N'

N
3823


- Duplicates:

In [None]:
!kgtk query -i $STRINGS \
            --match '(p)-[]->(s)' \
            --return 'p, count(s) as N' \
            --order-by 'N desc'

#### Empty:

- Rows:

In [31]:
!kgtk query -i $NEW_RESULTS \
            --match 'n: (p)-[]->(s)' \
            --where 'kgtk_lqstring_text(s) = ""' \
            -o $EMPTY_STRINGS

!wc -l < $EMPTY_STRINGS

440


- Unique movies:

In [32]:
!kgtk query -i $EMPTY_STRINGS \
            --match 'n: (p)-[]->()' \
            --return 'count(distinct p) as N'

N
439


#### Further check empty strings:

First we filter out all non-empty strings:

In [33]:
!kgtk query -i $NEW_RESULTS \
            --match 'n: (p)-[]->(s)' \
            --where 'kgtk_lqstring(s) AND kgtk_lqstring_text(s) != ""' \
            -o $NON_EMPTY

Then we filter those in empty but also have spouse in non-empty ones, then we can count how many "pure" empty:

In [34]:
!kgtk ifnotexists -i $EMPTY_STRINGS \
                  --filter-on $NON_EMPTY \
                  --input-keys node1 \
                  --filter-keys node1 \
                  -o $PURE_EMPTY 

!wc -l < $PURE_EMPTY 

235


In [35]:
!kgtk query -i $PURE_EMPTY  \
            --match 'p: (q)-[]->()' \
            --return 'count(distinct q) as N'

N
234


Check one of them by hand:

In [36]:
!head -n 20 $PURE_EMPTY | column -ts $'\t'

node1      label            node2
Q1010058   property:budget  ''@en
Q1027212   property:budget  ''@en
Q1048319   property:budget  ''@en
Q1058193   property:budget  ''@en
Q1061539   property:budget  ''@en
Q1066948   property:budget  ''@en
Q10863988  property:budget  ''@en
Q1087225   property:budget  ''@en
Q10923631  property:budget  ''@en
Q1093751   property:budget  ''@en
Q10938623  property:budget  ''@en
Q1112469   property:budget  ''@en
Q1138659   property:budget  ''@en
Q1141186   property:budget  ''@en
Q1142950   property:budget  ''@en
Q1182350   property:budget  ''@en
Q11837540  property:budget  ''@en
Q1211778   property:budget  ''@en
Q12127059  property:budget  ''@en


In [35]:
!kgtk query -i $NEW_RESULTS \
            --match '(q:Q16242907)-[p]->(v)' \
            --return 'q, p.label, v'

node1	label	node2
Q16242907	property:budget	''@en


### 4. Qnodes

- Rows:

In [15]:
!kgtk query -i $NEW_RESULTS \
            --match 'n:()-[]->(q)' \
            --where 'NOT kgtk_lqstring(q) AND NOT kgtk_number(q)' \
            -o $NODES

!kgtk ifnotexists -i $NODES \
                  --filter-on $STRUCTURED_LITERALS \
                  --input-keys node1 \
                  --filter-keys node1 \
                  -o $QNODES

qnodes_line = !wc -l < $QNODES
qnodes_line = int(qnodes_line[0]) - 1
lines.append(qnodes_line)
# qnodes_line

11

- Unique movies:

In [16]:
qnodes_distinct = !kgtk query -i $QNODES \
    --match '(p)-[]->()' \
    --return 'count(distinct p) as N'

qnodes_distinct = int(qnodes_distinct[1])
entities.append(qnodes_distinct)
# qnodes_distinct

11

- Check if the data type is useful:

In [17]:
!head $QNODES | column -ts $'\t'

node1      label            node2
Q10851237  property:budget  Q208526
Q15637595  property:budget  Q1104069
Q21065428  property:budget  Q208526
Q2706071   property:budget  Q189097
Q4155584   property:budget  Q158478
Q4796595   property:budget  Q7346593
Q48671706  property:budget  Q178843
Q4927579   property:budget  Q17193
Q5284662   property:budget  Q178843


**Not** useful, mostly currecy, others are other film information items; 

- Duplicates:

In [19]:
!kgtk query -i $QNODES \
            --match '(p)-[]->(s)' \
            --return 'p, count(s) as N'

node1	N
Q10851237	1
Q15637595	1
Q21065428	1
Q2706071	1
Q4155584	1
Q4796595	1
Q48671706	1
Q4927579	1
Q5284662	1
Q7278388	1
Q7288987	1


### 5. Summary

In [20]:
from prettytable import PrettyTable

In [21]:
x = PrettyTable()

x.field_names = ["Data type", "Numbers", "Strings", "Structured literals", "Qnodes"]

x.add_row(lines)
x.add_row(entities)

print(x)

+-----------+---------+---------+---------------------+--------+
| Data type | Numbers | Strings | Structured literals | Qnodes |
+-----------+---------+---------+---------------------+--------+
|   Lines   |   1517  |  14133  |         3601        |   11   |
|  Entities |   1517  |  14109  |         3589        |   11   |
+-----------+---------+---------+---------------------+--------+


In [37]:
!kgtk cat -i $NUMBERS $NON_EMPTY $STRUCTURED_LITERALS -o $CORRECT

In [38]:
!wc -l $CORRECT

19252 /nas/home/bohuizha/KG/hunger-for-knowledge/output/P2130/correct.tsv


In [39]:
!kgtk query -i $CORRECT \
    --match '(q)-[]->()' \
    --return 'count(distinct q)'

count(DISTINCT graph_7200_c1."node1")
18313
