# KGTK Browser Cache Setup

This notebook creates the SQLite DB cache and the required indices for the KGTK Browser.

The required input parameters are:
- input_path: Path where the following files should be present
    - labels.en.tsv.gz
    - aliases.en.tsv.gz
    - descriptions.en.tsv.gz
    - claims.tsv.gz
    - metadata.property.datatypes.tsv.gz
    - qualifiers.tsv.gz
    - metadata.pagerank.undirected.tsv.gz
- output_path: Path under which the project folder is created
- project_name: folder inside the `output_path` where the required files and cache will be created

**Cache file location:** `<output_path>/<project_name>/temp.<project_name>/wikidata.sqlite3.db`

In [1]:
import os
import pandas as pd
from kgtk.configure_kgtk_notebooks import ConfigureKGTK
import kgtk.kypher.api as kapi

In [3]:
# Directory that holds the input KGTK files listed at the top of the notebook.
input_path = "/Volumes/saggu-ssd/wikidata-dwd-v2"
# Directory under which the `<project_name>` folder (and the cache) is created.
output_path = "/Volumes/saggu-ssd/wikidata-dwd-v2"

# Folder name inside `output_path`; it also appears in the temp folder name
# (`temp.<project_name>`) that holds the SQLite cache file.
project_name = "kgtk-search-6"

# Comma-separated aliases of the input files to load into the cache.
files = 'label,pagerank_undirected,alias,description,claims,datatypes,qualifiers'
# Smaller selection for a quick trial run:
# files = 'label,pagerank_undirected'

In [4]:
# Turn the comma-separated selection into a list of file aliases.
# The isinstance guard keeps this cell idempotent: re-running it once `files`
# is already a list would otherwise raise AttributeError (lists have no split).
files = files.split(',') if isinstance(files, str) else files

In [5]:
# Configure the KGTK notebook environment for the selected file aliases.
# NOTE(review): configure_kgtk appears to export the environment variables
# (STORE, OUT, TEMP, one per file alias, ...) that the later `!kgtk` shell
# cells rely on -- confirm against kgtk.configure_kgtk_notebooks.
ck = ConfigureKGTK(files)
ck.configure_kgtk(input_graph_path=input_path,
                  output_path=output_path,
                  project_name=project_name)

User home: /Users/amandeep
Current dir: /Users/amandeep/Github/kgtk-browser
KGTK dir: /Users/amandeep/Github
Use-cases dir: /Users/amandeep/Github/use-cases


In [6]:
ck.print_env_variables()

EXAMPLES_DIR: /Users/amandeep/Github/examples
KGTK_GRAPH_CACHE: /Volumes/saggu-ssd/wikidata-dwd-v2/kgtk-search-6/temp.kgtk-search-6/wikidata.sqlite3.db
GRAPH: /Volumes/saggu-ssd/wikidata-dwd-v2
TEMP: /Volumes/saggu-ssd/wikidata-dwd-v2/kgtk-search-6/temp.kgtk-search-6
STORE: /Volumes/saggu-ssd/wikidata-dwd-v2/kgtk-search-6/temp.kgtk-search-6/wikidata.sqlite3.db
KGTK_LABEL_FILE: /Volumes/saggu-ssd/wikidata-dwd-v2/labels.en.tsv.gz
OUT: /Volumes/saggu-ssd/wikidata-dwd-v2/kgtk-search-6
USE_CASES_DIR: /Users/amandeep/Github/use-cases
kypher: kgtk query --graph-cache /Volumes/saggu-ssd/wikidata-dwd-v2/kgtk-search-6/temp.kgtk-search-6/wikidata.sqlite3.db
KGTK_OPTION_DEBUG: false
kgtk: kgtk
label: /Volumes/saggu-ssd/wikidata-dwd-v2/labels.en.tsv.gz
pagerank_undirected: /Volumes/saggu-ssd/wikidata-dwd-v2/metadata.pagerank.undirected.tsv.gz
alias: /Volumes/saggu-ssd/wikidata-dwd-v2/aliases.en.tsv.gz
description: /Volumes/saggu-ssd/wikidata-dwd-v2/descriptions.en.tsv.gz
claims: /Volumes/saggu-ssd/

## Load the files into cache

In [7]:
ck.load_files_into_cache()

kgtk query --graph-cache /Volumes/saggu-ssd/wikidata-dwd-v2/kgtk-search-6/temp.kgtk-search-6/wikidata.sqlite3.db -i "/Volumes/saggu-ssd/wikidata-dwd-v2/labels.en.tsv.gz" --as label  -i "/Volumes/saggu-ssd/wikidata-dwd-v2/metadata.pagerank.undirected.tsv.gz" --as pagerank_undirected  -i "/Volumes/saggu-ssd/wikidata-dwd-v2/aliases.en.tsv.gz" --as alias  -i "/Volumes/saggu-ssd/wikidata-dwd-v2/descriptions.en.tsv.gz" --as description  -i "/Volumes/saggu-ssd/wikidata-dwd-v2/claims.tsv.gz" --as claims  -i "/Volumes/saggu-ssd/wikidata-dwd-v2/metadata.property.datatypes.tsv.gz" --as datatypes  -i "/Volumes/saggu-ssd/wikidata-dwd-v2/qualifiers.tsv.gz" --as qualifiers  --limit 3
id	node1	label	node2
P10-label-en	P10	label	'video'@en
P1000-label-en	P1000	label	'record held'@en
P1001-label-en	P1001	label	'applies to jurisdiction'@en


## Define the Kypher API

In [8]:
# Kypher API handle bound to the graph cache referenced by the STORE
# environment variable. Queries that pass no explicit limit are capped at
# `maxresults` rows (visible as LIMIT 100 in the logged SQL).
# NOTE(review): loglevel=1 presumably enables the "SQL Translation" log output
# and maxcache=0 presumably disables result caching -- confirm in the kapi docs.
_kapi2 = kapi.KypherApi(graphcache=os.environ['STORE'], loglevel=1, index='auto',
                      maxresults=100, maxcache=0)

## Create a file with `label`, `undirected_pagerank` and `description`

In [9]:
!kgtk query --gc $STORE \
    -i label pagerank_undirected description\
    --match 'label: (qnode)-[l]->(y), pagerank: (qnode)-[:undirected_pagerank]->(pr)' \
    --opt 'description: (qnode)-[:description]->(d)' \
    --return 'qnode as node1, l.label as label, y as node2, upper(y) as `node2;upper`, pr as `node1;pagerank`, ifnull(d, "") as `node1;description`' \
    --order-by 'qnode' \
    -o $OUT/label_pagerank_undirected_description.tsv.gz

### Load this file into cache as well

In [10]:
!kgtk query --gc $STORE -i $OUT/label_pagerank_undirected_description.tsv.gz --as l_d_pgr_ud --limit 10

node1	label	node2	node2;upper	node1;pagerank	node1;description
P10	label	'video'@en	'VIDEO'@EN	1.7127847966708486e-08	'relevant video. For images, use the property P18. For film trailers, qualify with \"object has role\" (P3831)=\"trailer\" (Q622550)'@en
P1000	label	'record held'@en	'RECORD HELD'@EN	1.521396967388256e-08	'notable record achieved by a person or entity, include qualifiers for dates held'@en
P1001	label	'applies to jurisdiction'@en	'APPLIES TO JURISDICTION'@EN	4.327619932819293e-08	'the item (an institution, law, public office ...) or statement belongs to or has power over or applies to the value (a territorial jurisdiction: a country, state, municipality, ...)'@en
P1002	label	'engine configuration'@en	'ENGINE CONFIGURATION'@EN	1.782297153902441e-08	'configuration of an engine\'s cylinders'@en
P1003	label	'National Library of Romania ID'@en	'NATIONAL LIBRARY OF ROMANIA ID'@EN	2.804742345044519e-08	'identifier for authority control used at the National Library of Romania'@

## Create the required indices

In [11]:
%%time 
!kgtk --debug query -i l_d_pgr_ud --idx node1 "node2;upper" label text:node2//name=ldpgridx --gc $STORE --limit 5

[2021-12-02 20:55:50 query]: SQL Translation:
---------------------------------------------
  SELECT *
     FROM graph_8 AS graph_8_c1
     LIMIT ?
  PARAS: [5]
---------------------------------------------
[2021-12-02 20:55:50 sqlstore]: CREATE INDEX "graph_8_node1_idx" ON "graph_8" ("node1")
[2021-12-02 20:56:21 sqlstore]: ANALYZE "graph_8_node1_idx"
[2021-12-02 20:56:24 sqlstore]: CREATE INDEX "graph_8_node2;upper_idx" ON "graph_8" ("node2;upper")
[2021-12-02 20:57:17 sqlstore]: ANALYZE "graph_8_node2;upper_idx"
[2021-12-02 20:57:21 sqlstore]: CREATE INDEX "graph_8_label_idx" ON "graph_8" ("label")
[2021-12-02 20:57:38 sqlstore]: ANALYZE "graph_8_label_idx"
[2021-12-02 20:57:41 sqlstore]: CREATE VIRTUAL TABLE "graph_8_txtidx_ldpgridx" USING FTS5 ("node2", tokenize="trigram", content="graph_8")
[2021-12-02 20:57:41 sqlstore]: INSERT INTO "graph_8_txtidx_ldpgridx" ("node2") SELECT "node2" FROM graph_8
node1	label	node2	node2;upper	node1;pagerank	node1;description
P10	label	'video'@en	

In [12]:
%%time
!kgtk --debug query -i label --idx label --gc $STORE --limit 5

[2021-12-02 21:05:19 query]: SQL Translation:
---------------------------------------------
  SELECT *
     FROM graph_1 AS graph_1_c1
     LIMIT ?
  PARAS: [5]
---------------------------------------------
[2021-12-02 21:05:19 sqlstore]: CREATE INDEX "graph_1_label_idx" ON "graph_1" ("label")
[2021-12-02 21:05:44 sqlstore]: ANALYZE "graph_1_label_idx"
id	node1	label	node2
P10-label-en	P10	label	'video'@en
P1000-label-en	P1000	label	'record held'@en
P1001-label-en	P1001	label	'applies to jurisdiction'@en
P1002-label-en	P1002	label	'engine configuration'@en
P1003-label-en	P1003	label	'National Library of Romania ID'@en
CPU times: user 307 ms, sys: 96.6 ms, total: 404 ms
Wall time: 30.1 s


In [13]:
%%time
!kgtk --debug query -i alias --idx label --gc $STORE --limit 5

[2021-12-02 21:05:49 query]: SQL Translation:
---------------------------------------------
  SELECT *
     FROM graph_3 AS graph_3_c1
     LIMIT ?
  PARAS: [5]
---------------------------------------------
[2021-12-02 21:05:49 sqlstore]: CREATE INDEX "graph_3_label_idx" ON "graph_3" ("label")
[2021-12-02 21:05:53 sqlstore]: ANALYZE "graph_3_label_idx"
id	node1	label	node2
P10-alias-en-282226-0	P10	alias	'gif'@en
P10-alias-en-2f86d8-0	P10	alias	'animation'@en
P10-alias-en-c1427e-0	P10	alias	'media'@en
P10-alias-en-c61ab1-0	P10	alias	'trailer (Commons)'@en
P1001-alias-en-0dd7ce-0	P1001	alias	'belongs to jurisdiction'@en
CPU times: user 66.4 ms, sys: 28.3 ms, total: 94.6 ms
Wall time: 6.34 s


In [14]:
%%time
!kgtk --debug query -i description --idx id --gc $STORE --limit 5

[2021-12-02 21:05:55 query]: SQL Translation:
---------------------------------------------
  SELECT *
     FROM graph_4 AS graph_4_c1
     LIMIT ?
  PARAS: [5]
---------------------------------------------
[2021-12-02 21:05:55 sqlstore]: CREATE INDEX "graph_4_id_idx" ON "graph_4" ("id")
[2021-12-02 21:06:23 sqlstore]: ANALYZE "graph_4_id_idx"
id	node1	label	node2
P10-description-en	P10	description	'relevant video. For images, use the property P18. For film trailers, qualify with \"object has role\" (P3831)=\"trailer\" (Q622550)'@en
P1000-description-en	P1000	description	'notable record achieved by a person or entity, include qualifiers for dates held'@en
P1001-description-en	P1001	description	'the item (an institution, law, public office ...) or statement belongs to or has power over or applies to the value (a territorial jurisdiction: a country, state, municipality, ...)'@en
P1002-description-en	P1002	description	'configuration of an engine\'s cylinders'@en
P1003-description-en	P1003

In [15]:
%%time
!kgtk --debug query -i claims --idx label node1 node2 id --gc $STORE --limit 5

[2021-12-02 21:06:29 query]: SQL Translation:
---------------------------------------------
  SELECT *
     FROM graph_5 AS graph_5_c1
     LIMIT ?
  PARAS: [5]
---------------------------------------------
[2021-12-02 21:06:29 sqlstore]: CREATE INDEX "graph_5_label_idx" ON "graph_5" ("label")
[2021-12-02 21:15:41 sqlstore]: ANALYZE "graph_5_label_idx"
[2021-12-02 21:16:17 sqlstore]: CREATE INDEX "graph_5_node1_idx" ON "graph_5" ("node1")
[2021-12-02 21:22:16 sqlstore]: ANALYZE "graph_5_node1_idx"
[2021-12-02 21:22:58 sqlstore]: CREATE INDEX "graph_5_node2_idx" ON "graph_5" ("node2")
[2021-12-02 21:37:59 sqlstore]: ANALYZE "graph_5_node2_idx"
[2021-12-02 21:38:57 sqlstore]: CREATE INDEX "graph_5_id_idx" ON "graph_5" ("id")
[2021-12-02 21:47:09 sqlstore]: ANALYZE "graph_5_id_idx"
id	node1	label	node2	rank	node2;wikidatatype
P10-P1628-32b85d-7927ece6-0	P10	P1628	"http://www.w3.org/2006/vcard/ns#Video"	normal	url
P10-P1628-acf60d-b8950832-0	P10	P1628	"https://schema.org/video"	normal	url


In [19]:
%%time
!kgtk --debug query -i datatypes --idx label node1 --gc $STORE --limit 5

[2021-12-02 21:55:56 query]: SQL Translation:
---------------------------------------------
  SELECT *
     FROM graph_6 AS graph_6_c1
     LIMIT ?
  PARAS: [5]
---------------------------------------------
[2021-12-02 21:55:56 sqlstore]: CREATE INDEX "graph_6_label_idx" ON "graph_6" ("label")
[2021-12-02 21:55:56 sqlstore]: ANALYZE "graph_6_label_idx"
[2021-12-02 21:55:56 sqlstore]: CREATE INDEX "graph_6_node1_idx" ON "graph_6" ("node1")
[2021-12-02 21:55:56 sqlstore]: ANALYZE "graph_6_node1_idx"
id	node1	label	node2
P10-datatype	P10	datatype	commonsMedia
P1000-datatype	P1000	datatype	wikibase-item
P1001-datatype	P1001	datatype	wikibase-item
P1002-datatype	P1002	datatype	wikibase-item
P1003-datatype	P1003	datatype	external-id
CPU times: user 14 ms, sys: 10.9 ms, total: 24.9 ms
Wall time: 1.13 s


In [17]:
%%time
!kgtk --debug query -i qualifiers --idx node2 node1 label --gc $STORE --limit 5

[2021-12-02 21:49:01 query]: SQL Translation:
---------------------------------------------
  SELECT *
     FROM graph_7 AS graph_7_c1
     LIMIT ?
  PARAS: [5]
---------------------------------------------
[2021-12-02 21:49:01 sqlstore]: CREATE INDEX "graph_7_node2_idx" ON "graph_7" ("node2")
[2021-12-02 21:51:38 sqlstore]: ANALYZE "graph_7_node2_idx"
[2021-12-02 21:51:46 sqlstore]: CREATE INDEX "graph_7_node1_idx" ON "graph_7" ("node1")
[2021-12-02 21:53:23 sqlstore]: ANALYZE "graph_7_node1_idx"
[2021-12-02 21:53:34 sqlstore]: CREATE INDEX "graph_7_label_idx" ON "graph_7" ("label")
[2021-12-02 21:55:22 sqlstore]: ANALYZE "graph_7_label_idx"
id	node1	label	node2	node2;wikidatatype
P10-P1855-Q15075950-7eff6d65-0-P10-54b214-0	P10-P1855-Q15075950-7eff6d65-0	P10	"Smoorverliefd 12 september.webm"	commonsMedia
P10-P1855-Q15075950-7eff6d65-0-P3831-Q622550-0	P10-P1855-Q15075950-7eff6d65-0	P3831	Q622550	wikibase-item
P10-P1855-Q4504-a69d2c73-0-P10-bef003-0	P10-P1855-Q4504-a69d2c73-0	P10	"Komod

## Take a look at cache file content

In [20]:
!kgtk query --gc $STORE --show-cache 

Graph Cache:
DB file: /Volumes/saggu-ssd/wikidata-dwd-v2/kgtk-search-6/temp.kgtk-search-6/wikidata.sqlite3.db
  size:  152.30 GB   	free:  0 Bytes   	modified:  2021-12-02 21:55:56

KGTK File Information:
alias:
  size:  130.57 MB   	modified:  2021-11-17 14:57:10   	graph:  graph_3
claims:
  size:  9.59 GB   	modified:  2021-11-19 12:20:27   	graph:  graph_5
datatypes:
  size:  45.00 KB   	modified:  2020-12-11 10:51:06   	graph:  graph_6
description:
  size:  341.84 MB   	modified:  2021-11-19 12:11:44   	graph:  graph_4
l_d_pgr_ud:
  size:  1.18 GB   	modified:  2021-12-02 20:48:25   	graph:  graph_8
label:
  size:  590.02 MB   	modified:  2021-11-17 14:57:46   	graph:  graph_1
pagerank_undirected:
  size:  1.53 GB   	modified:  2021-11-17 14:58:47   	graph:  graph_2
qualifiers:
  size:  2.09 GB   	modified:  2021-11-19 12:14:21   	graph:  graph_7

Graph Table Information:
graph_1:
  size:  4.15 GB   	created:  2021-12-02 18:36:14
  header:  ['id', 'node1', 'label', 'node2']
graph_2

## Define a function to do a `textmatch` search

In [21]:
def text_search_labels(search_text, limit=20):
    """Run a `textmatch` search over the labels of the `l_d_pgr_ud` graph.

    The query returns, per matching label edge, the node, the label, ten times
    the match score, the node pagerank cast to float, and the node description,
    ordered by `score*prank`.

    Args:
        search_text: text inserted verbatim into the `textmatch` pattern.
        limit: maximum number of rows requested from the query.

    Returns:
        pandas.DataFrame with the columns
        `node1`, `label`, `score`, `pagerank`, `description`.
        The row count is printed as a side effect.
    """
    output_columns = ['node1', 'label', 'score', 'pagerank', 'description']

    # Collect the query definition first so the API call stays a one-liner.
    query_spec = dict(
        doc="Doc string here",
        name=f"text_search_labels_{search_text}",
        inputs='l_d_pgr_ud',
        match='l_d_pgr_ud: (qnode)-[l:label]->(y)',
        where=f'textmatch(y, "{search_text}")',
        ret='distinct qnode as node1, y as label, 10*matchscore(y) as score, cast(l.`node1;pagerank`, float) as prank, l.`node1;description` as description',
        order='score*prank',
        limit=limit,
    )
    query = _kapi2.get_query(**query_spec)

    # Materialise every result row as a plain list before building the frame.
    rows = [list(row) for row in query.execute()]
    df = pd.DataFrame(rows, columns=output_columns)
    print(len(df))
    return df
    

In [22]:
text_search_labels('turkey')

[2021-12-02 21:56:13 query]: SQL Translation:
---------------------------------------------
  SELECT DISTINCT graph_8_c1."node1" "_aLias.node1", graph_8_c1."node2" "_aLias.label", (? * BM25(txtidx_1.graph_8_txtidx_ldpgridx)) "_aLias.score", CAST(graph_8_c1."node1;pagerank" AS float) "_aLias.prank", graph_8_c1."node1;description" "_aLias.description"
     FROM graph_8 AS graph_8_c1, graph_8_txtidx_ldpgridx AS txtidx_1
     WHERE graph_8_c1."label" = ?
        AND txtidx_1."node2" MATCH ? and txtidx_1.rowid = graph_8_c1.rowid
     ORDER BY ("_aLias.score" * "_aLias.prank") ASC
     LIMIT ?
  PARAS: [10, 'label', 'turkey', 20]
---------------------------------------------


20


Unnamed: 0,node1,label,score,pagerank,description
0,Q43,'Turkey'@en,-114.674631,0.0002377611,'sovereign state straddling Southeastern Europ...
1,Q1529096,'village in Turkey'@en,-92.68678,4.277189e-05,'a type of low level administrative division i...
2,Q21030356,'Member of the Grand National Assembly of Turk...,-60.860806,6.437066e-06,'Turkish MP'@en
3,Q1147395,'district of Turkey'@en,-91.098835,2.547179e-06,'administrative division of Turkey'@en
4,Q815324,'town municipality of Turkey'@en,-78.928703,2.173517e-06,"'type of administrative region in Turkey, larg..."
5,Q483856,'Turkey national association football team'@en,-65.34857,7.770456e-07,'men\'s national association football team rep...
6,Q1624749,'Turkey national under-21 football team'@en,-67.850142,7.236489e-07,'national association football team'@en
7,Q3590479,'Turkey national under-17 football team'@en,-67.850142,3.909747e-07,'national association football team'@en
8,Q7855167,'Turkey national under-19 football team'@en,-67.850142,3.797119e-07,'national association football team'@en
9,Q48336,'province of Turkey'@en,-91.098835,1.809655e-07,'first-level administrative division of Turkey...


## Define a function to look up a Qnode by its exact ID

In [23]:
def exact_search_items(search_text, limit=20):
    """Look up a node by its exact identifier in the `l_d_pgr_ud` graph.

    The identifier is upper-cased before matching, so `'q30'` finds `Q30`.

    Args:
        search_text: node identifier to look up (case-insensitive).
        limit: maximum number of rows requested from the query. Previously
            this argument was accepted but never forwarded, so the API-wide
            `maxresults` cap applied instead.

    Returns:
        pandas.DataFrame with the columns `node1`, `label`, `description`.
        The row count is printed as a side effect.
    """
    search_text = search_text.upper()
    exact_items_query = _kapi2.get_query(
        doc="""
        Exact, case-insensitive lookup of a node by its identifier.
        Return distinct 'node1', 'node_label', 'description' rows for the
        label edges of the matching node.
        """,
        name=f'exact_search_items{search_text}',
        inputs='l_d_pgr_ud',
        match='l_d_pgr_ud: (n)-[r:label]->(l)',
        where=f'n="{search_text}"',
        ret='distinct n as node1, l as node_label, r.`node1;description` as description',
        # Forward the caller's limit; without it the argument had no effect.
        limit=limit,
    )
    results = [list(row) for row in exact_items_query.execute()]
    df = pd.DataFrame(results, columns=['node1', 'label', 'description'])
    print(len(df))
    return df
    

In [24]:
%%time
exact_search_items('q30')

[2021-12-02 22:27:54 query]: SQL Translation:
---------------------------------------------
  SELECT DISTINCT graph_8_c1."node1" "_aLias.node1", graph_8_c1."node2" "_aLias.node_label", graph_8_c1."node1;description" "_aLias.description"
     FROM graph_8 AS graph_8_c1
     WHERE graph_8_c1."label" = ?
        AND (graph_8_c1."node1" = ?)
     LIMIT ?
  PARAS: ['label', 'Q30', 100]
---------------------------------------------


1
CPU times: user 67.9 ms, sys: 98.3 ms, total: 166 ms
Wall time: 248 ms


Unnamed: 0,node1,label,description
0,Q30,'United States of America'@en,'sovereign state in North America'@en


In [25]:
%%time
exact_search_items('Q140')

[2021-12-02 22:27:55 query]: SQL Translation:
---------------------------------------------
  SELECT DISTINCT graph_8_c1."node1" "_aLias.node1", graph_8_c1."node2" "_aLias.node_label", graph_8_c1."node1;description" "_aLias.description"
     FROM graph_8 AS graph_8_c1
     WHERE graph_8_c1."label" = ?
        AND (graph_8_c1."node1" = ?)
     LIMIT ?
  PARAS: ['label', 'Q140', 100]
---------------------------------------------


1
CPU times: user 17.3 ms, sys: 4.09 ms, total: 21.4 ms
Wall time: 20.8 ms


Unnamed: 0,node1,label,description
0,Q140,'lion'@en,'species of big cat'@en


## Define a function to search labels Exactly

In [26]:
def exact_search_labels(search_text, limit=20):
    """Find nodes whose upper-cased English label equals the given text.

    The text is upper-cased and wrapped as `'<TEXT>'@EN`, then compared with
    the `node2;upper` column of the `l_d_pgr_ud` graph. Every hit gets the
    constant score -1.0 and rows are ordered by `score*prank`.

    Args:
        search_text: label text to match (case-insensitive).
        limit: maximum number of rows requested from the query.

    Returns:
        pandas.DataFrame with the columns
        `node1`, `label`, `score`, `prank`, `description`.
        The row count is printed as a side effect.
    """
    # Build the language-qualified, upper-cased literal to compare against.
    upper_label = "'" + search_text.upper() + "'@EN"

    query_spec = dict(
        doc="\n     Exact Match case insensitive query\n    ",
        name=f'exact_search_labels{upper_label}',
        inputs='l_d_pgr_ud',
        match='l_d_pgr_ud: (n)-[r:label]->(l)',
        where=f'r.`node2;upper`="{upper_label}"',
        ret='distinct n as node1, l as node_label, cast("-1.0", float) as score, cast(r.`node1;pagerank`, float) as prank, r.`node1;description` as description',
        order='score*prank',
        limit=limit,
    )
    query = _kapi2.get_query(**query_spec)

    rows = [list(row) for row in query.execute()]
    df = pd.DataFrame(rows, columns=['node1', 'label', 'score', 'prank', 'description'])
    print(len(df))
    return df

In [27]:
exact_search_labels('canada')

[2021-12-02 22:27:57 query]: SQL Translation:
---------------------------------------------
  SELECT DISTINCT graph_8_c1."node1" "_aLias.node1", graph_8_c1."node2" "_aLias.node_label", CAST(? AS float) "_aLias.score", CAST(graph_8_c1."node1;pagerank" AS float) "_aLias.prank", graph_8_c1."node1;description" "_aLias.description"
     FROM graph_8 AS graph_8_c1
     WHERE graph_8_c1."label" = ?
        AND (graph_8_c1."node2;upper" = ?)
     ORDER BY ("_aLias.score" * "_aLias.prank") ASC
     LIMIT ?
  PARAS: ['-1.0', 'label', "'CANADA'@EN", 20]
---------------------------------------------


20


Unnamed: 0,node1,label,score,prank,description
0,Q16,'Canada'@en,-1.0,0.001097894,'sovereign state in North America'@en
1,Q2569593,'Canada'@en,-1.0,8.178669e-08,'former French colony in New France between th...
2,Q13265725,'Canada'@en,-1.0,3.126547e-08,'family name'@en
3,Q13265795,'Canada'@en,-1.0,1.498003e-08,'2012 novel by American author Richard Ford'@en
4,Q2608363,'Canada'@en,-1.0,1.364988e-08,"'village in Appelscha, in the Netherlands'@en"
5,Q18612153,'Canada'@en,-1.0,1.364276e-08,'180th strip of the webcomic xkcd'@en
6,Q103921530,'Canada'@en,-1.0,1.33154e-08,"'Shipwreck off the Scottish Coast, imported fr..."
7,Q99292858,'Canada'@en,-1.0,1.269767e-08,'the country of Canada as depicted in Star Tre...
8,Q14624136,'Canada'@en,-1.0,1.255841e-08,'moth genus of Pteromalidae'@en
9,Q5029265,'Canada'@en,-1.0,1.234508e-08,"'unincorporated community in Kansas, United St..."


## Define a function to do a `textlike` search

In [28]:
def text_like_search_labels(search_text, limit=20):
    """Run a `textlike` (wildcard) search over the labels of `l_d_pgr_ud`.

    The search text is split on single spaces and the pieces are joined with
    `%`, with a leading and trailing `%`, e.g. `"fifa group b"` becomes
    `"%fifa%group%b%"`. Rows are ordered by `score*prank`.

    Args:
        search_text: space-separated words that must appear in this order.
        limit: maximum number of rows requested from the query.

    Returns:
        pandas.DataFrame with the columns
        `node1`, `label`, `score`, `pagerank`, `description`.
        The original search text and the row count are printed as side effects.
    """
    # Each space becomes a wildcard; the pattern is open at both ends.
    like_pattern = "%" + "%".join(search_text.split(' ')) + "%"
    print(search_text)

    query_spec = dict(
        doc="Doc string here",
        name=f"text_like_search_labels_{search_text}",
        inputs='l_d_pgr_ud',
        match='l_d_pgr_ud: (qnode)-[l:label]->(y)',
        where=f'textlike(y, "{like_pattern}")',
        ret='distinct qnode as node1, y as label, matchscore(y) as score, cast(l.`node1;pagerank`, float) as prank, l.`node1;description` as description',
        order='score*prank',
        limit=limit,
    )
    query = _kapi2.get_query(**query_spec)

    rows = [list(row) for row in query.execute()]
    df = pd.DataFrame(rows, columns=['node1', 'label', 'score', 'pagerank', 'description'])
    print(len(df))
    return df
    

In [29]:
text_like_search_labels("fifa group b")

[2021-12-02 22:28:01 query]: SQL Translation:
---------------------------------------------
  SELECT DISTINCT graph_8_c1."node1" "_aLias.node1", graph_8_c1."node2" "_aLias.label", BM25(txtidx_1.graph_8_txtidx_ldpgridx) "_aLias.score", CAST(graph_8_c1."node1;pagerank" AS float) "_aLias.prank", graph_8_c1."node1;description" "_aLias.description"
     FROM graph_8 AS graph_8_c1, graph_8_txtidx_ldpgridx AS txtidx_1
     WHERE graph_8_c1."label" = ?
        AND txtidx_1."node2" LIKE ? and txtidx_1.rowid = graph_8_c1.rowid
     ORDER BY ("_aLias.score" * "_aLias.prank") ASC
     LIMIT ?
  PARAS: ['label', '%fifa%group%b%', 20]
---------------------------------------------


fifa group b
20


Unnamed: 0,node1,label,score,pagerank,description
0,Q31189406,'2018 FIFA World Cup Group B'@en,-14.896818,4.302684e-08,
1,Q10260332,'2014 FIFA World Cup Group B'@en,-14.896818,2.85662e-08,
2,Q17115977,'Category:2014 FIFA World Cup group table temp...,-11.105398,2.377329e-08,'Wikimedia category'@en
3,Q18608402,'Category:2015 FIFA Women\'s World Cup group t...,-10.099565,2.085697e-08,'Wikimedia category'@en
4,Q20730977,'2018 FIFA World Cup qualification – UEFA Grou...,-11.356739,1.5939e-08,
5,Q187411,'2010 FIFA World Cup Group B'@en,-14.896818,1.178271e-08,'football tournament'@en
6,Q39134591,'Template:2018 FIFA World Cup Group B table'@en,-12.183998,1.340353e-08,'Wikimedia template'@en
7,Q39134633,'Template:2018 FIFA World Cup Group C table'@en,-12.183998,1.337304e-08,'Wikimedia template'@en
8,Q39395683,'Template:2018 FIFA World Cup Group D table'@en,-12.183998,1.336062e-08,'Wikimedia template'@en
9,Q39134549,'Template:2018 FIFA World Cup Group A table'@en,-12.183998,1.334831e-08,'Wikimedia template'@en
