Skip to content

Commit

Permalink
Wikidata Elastic Search - Backend Implementation (#14)
Browse files Browse the repository at this point in the history
* Added wikidata search extension, and search API routes

* Updated search config, added individual search options for classes, entities and props

* Added transforms to search payload, id2uri function for classes and properties

* Added function level documentation and typing to the search implementation

* Updated OntClass uri2id method comment

* Updated Search extension_interface, ISearch to IEntitySearch and IOntologySearch

* Fixed API read directly from local implementation

* Enhanced typing to search payload, introduced search payload dataclass, improved typing by creating search models

* Removed SearchPayload model and updated all the functions in search interface to return List[SearchItem], Enhanced typing covered in the PR reviews

* Updated and renamed SearchItem dataclass to SearchResult

* Updated SearchResult dataclass docs comment
  • Loading branch information
punith300i committed Jun 6, 2023
1 parent 4acc6b8 commit 55ebb0a
Show file tree
Hide file tree
Showing 8 changed files with 220 additions and 19 deletions.
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ loguru = "^0.6.0"
orjson = "^3.8.2"
drepr = "^2.10.0"
rsoup = "^2.5.1"
nh3 = "^0.2.13"

lat_lon_parser = "^1.3.0"

Expand Down
15 changes: 5 additions & 10 deletions sand/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
from sand.controllers.assistant import assistant_bp
from sand.controllers.project import project_bp
from sand.controllers.table import table_bp, table_row_bp
from sand.controllers.search import search_bp
from sand.controllers.settings import setting_bp
from sand.deserializer import deserialize_graph
from sand.models import EntityAR, SemanticModel
Expand All @@ -27,6 +28,7 @@
assistant_bp,
table_row_bp,
setting_bp,
search_bp,
generate_api(
SemanticModel,
deserializers={"data": deserialize_graph},
Expand All @@ -35,25 +37,18 @@
generate_readonly_api_4dict(
"entities",
serialize=serialize_entity,
id2ent=ChainedMapping(
EntityAR(), import_attr(SETTINGS["entity"]["default"])
),
id2ent=EntityAR(),
),
generate_readonly_api_4dict(
"classes",
serialize=serialize_class,
id2ent=ChainedMapping(
OntClassAR(), import_attr(SETTINGS["ont_classes"]["default"])
),
id2ent=OntClassAR(),
unique_field_funcs={"uri": OntClass.uri2id},
),
generate_readonly_api_4dict(
"properties",
serialize=serialize_property,
id2ent=ChainedMapping(
OntPropertyAR(),
import_attr(SETTINGS["ont_props"]["default"]),
),
id2ent=OntPropertyAR(),
unique_field_funcs={"uri": OntProperty.uri2id},
),
],
Expand Down
7 changes: 7 additions & 0 deletions sand/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@
"ont_classes": {
"constructor": "sand.extensions.wikidata.get_ontclass_db",
"uri2id": "sand.extensions.wikidata.uri2id",
"id2uri": "sm.namespaces.prelude.WikidataNamespace.get_entity_abs_uri",
"args": {
"dbfile": "/tmp/wdclasses.db",
"proxy": True,
Expand All @@ -41,6 +42,7 @@
"ont_props": {
"constructor": "sand.extensions.wikidata.get_ontprop_db",
"uri2id": "sand.extensions.wikidata.uri2id",
"id2uri": "sm.namespaces.prelude.WikidataNamespace.get_prop_abs_uri",
"args": {
"dbfile": "/tmp/wdprops.db",
"proxy": True,
Expand All @@ -62,6 +64,11 @@
"mtab": "sand.extensions.assistants.mtab.MTabAssistant",
# "default": "mtab",
},
"search": {
"entities": "sand.extensions.search.wikidata_search.WikidataSearch",
"classes": "sand.extensions.search.wikidata_search.WikidataSearch",
"props": "sand.extensions.search.wikidata_search.WikidataSearch"
},
"exports": {
"drepr": "sand.extensions.export.drepr.main.DreprExport",
"default": "sand.extensions.export.drepr.main.DreprExport"
Expand Down
61 changes: 61 additions & 0 deletions sand/controllers/search.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
import threading
from typing import Dict, List, Union, Literal
from flask.blueprints import Blueprint
from sm.misc.funcs import import_func
from sand.config import SETTINGS
from flask import request, jsonify

from sand.extension_interface.search import IEntitySearch, IOntologySearch
from sand.models.search import SearchResult
from gena.serializer import get_dataclass_serializer

# Blueprint that groups the search API routes; its name is reused to build
# the URL of every route registered below.
search_bp = Blueprint("search", "search")

# Thread-local holder for the search implementations created by get_search.
GetSearchCache = threading.local()
# Serializer applied to each SearchResult before the results are passed to jsonify.
serializer = get_dataclass_serializer(SearchResult)


def get_search(name: Literal['classes', 'entities', 'props']) -> Union[IEntitySearch, IOntologySearch]:
    """
    Return the search implementation configured for ``name``.

    The import path of the implementation is read from
    ``SETTINGS["search"][name]``. It is instantiated with no arguments the
    first time ``name`` is requested and then kept in the thread-local
    ``GetSearchCache``, so each thread creates a given search type at most once.

    Args:
        name: which search to fetch: 'classes', 'entities' or 'props'.

    Returns:
        The cached instance built from the configured constructor.

    Raises:
        KeyError: if ``name`` has no entry in ``SETTINGS["search"]``.
    """
    if not hasattr(GetSearchCache, "search"):
        GetSearchCache.search = {}

    cache = GetSearchCache.search
    # Check the cache per name. Testing only whether the dict exists would
    # populate it for the first requested name and fail for every other one.
    if name not in cache:
        constructor = SETTINGS["search"][name]
        cache[name] = import_func(constructor)()

    return cache[name]


@search_bp.route(f"/{search_bp.name}/classes", methods=["GET"])
def search_classes():
    """API route to search for classes by name.

    Reads the search text from the ``q`` query parameter and responds with
    ``{"items": [...]}``, where each item is a serialized ``SearchResult``.
    A missing or blank ``q`` yields an empty item list without calling the
    search backend.
    """
    search_text = request.args.get('q')
    if search_text is None or not search_text.strip():
        # Without this guard a missing `q` reaches the backend as None and
        # is interpolated into the class query as the literal text "None".
        return jsonify({'items': []})

    wikidata_search = get_search('classes')
    search_results = wikidata_search.find_class_by_name(search_text)
    return jsonify({'items': [serializer(item) for item in search_results]})


@search_bp.route(f"/{search_bp.name}/entities", methods=["GET"])
def search_entities():
    """API route to search for entities by name.

    Reads the search text from the ``q`` query parameter and responds with
    ``{"items": [...]}``, where each item is a serialized ``SearchResult``.
    A missing or blank ``q`` yields an empty item list without calling the
    search backend.
    """
    search_text = request.args.get('q')
    if search_text is None or not search_text.strip():
        # Without this guard a missing `q` reaches the backend as None
        # instead of a search string.
        return jsonify({'items': []})

    wikidata_search = get_search('entities')
    search_results = wikidata_search.find_entity_by_name(search_text)
    return jsonify({'items': [serializer(item) for item in search_results]})


@search_bp.route(f"/{search_bp.name}/props", methods=["GET"])
def search_props():
    """API route to search for properties by name.

    Reads the search text from the ``q`` query parameter and responds with
    ``{"items": [...]}``, where each item is a serialized ``SearchResult``.
    A missing or blank ``q`` yields an empty item list without calling the
    search backend.
    """
    search_text = request.args.get('q')
    if search_text is None or not search_text.strip():
        # Without this guard a missing `q` reaches the backend as None
        # instead of a search string.
        return jsonify({'items': []})

    wikidata_search = get_search('props')
    search_results = wikidata_search.find_props_by_name(search_text)
    return jsonify({'items': [serializer(item) for item in search_results]})
24 changes: 16 additions & 8 deletions sand/extension_interface/search.py
Original file line number Diff line number Diff line change
@@ -1,21 +1,29 @@
from abc import ABC, abstractmethod
from typing import List
from sand.models.search import SearchResult


class ISearch(ABC):
""" Search Interface to support searches from multiple
class IEntitySearch(ABC):
""" Entity Search Interface to support searches from multiple
KG datastores.
"""
@abstractmethod
def find_class_by_name(self):
"""Search Class using name"""
def find_entity_by_name(self, search_text: str) -> List[SearchResult]:
"""Search Entity using name"""
pass


class IOntologySearch(ABC):
""" Class and Property Ontology Search Interface to support searches from multiple
KG datastores.
"""

@abstractmethod
def find_entity_by_name(self):
"""Search Entity using name"""
def find_class_by_name(self, search_text: str) -> List[SearchResult]:
"""Search Class using name"""
pass

@abstractmethod
def find_props_by_name(self):
def find_props_by_name(self, search_text: str) -> List[SearchResult]:
"""Search properties using name"""
pass
pass
103 changes: 103 additions & 0 deletions sand/extensions/search/wikidata_search.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,103 @@
import requests
from typing import Dict, List
import nh3
from sand.extension_interface.search import IEntitySearch, IOntologySearch
from sand.models.entity import Entity
from sand.models.ontology import OntClass, OntProperty, OntClassAR
from sand.models.search import SearchResult


class WikidataSearch(IEntitySearch, IOntologySearch):
    """Entity, class and property search backed by the Wikidata search API.

    Every lookup sends a ``list=search`` query to
    ``https://www.wikidata.org/w/api.php`` and converts the returned hits
    into ``SearchResult`` objects.
    """

    def __init__(self, timeout: float = 10.0):
        """Set up the API endpoint and the query parameters shared by all searches.

        Args:
            timeout: seconds to wait for the Wikidata API before ``requests``
                gives up; prevents a stalled connection from blocking the
                caller indefinitely.
        """
        self.wikidata_url = "https://www.wikidata.org/w/api.php"
        self.timeout = timeout
        self.PARAMS = {
            "action": "query",
            "format": "json",
            "list": "search",
            "srsearch": "",
            "utf8": "",
            "srnamespace": 0,
            "srlimit": 10,
            "srprop": "snippet|titlesnippet",
        }
        # Created lazily by get_local_class_properties.
        self.ont_class_ar = None

    def get_class_search_params(self, search_text: str) -> Dict:
        """Build the API parameters for a class search.

        The query is restricted with ``haswbstatement:P279`` in namespace 0.
        """
        class_params = self.PARAMS.copy()
        class_params["srnamespace"] = 0
        class_params["srsearch"] = f"haswbstatement:P279 {search_text}"
        return class_params

    def get_local_class_properties(self, id: str) -> OntClass:
        """Look up a class by ID in the local class store.

        The store is obtained from ``OntClassAR()`` on first use and reused
        afterwards.
        """
        if self.ont_class_ar is None:
            self.ont_class_ar = OntClassAR()
        return self.ont_class_ar[id]

    def get_entity_search_params(self, search_text: str) -> Dict:
        """Build the API parameters for an entity search (namespace 0)."""
        entity_params = self.PARAMS.copy()
        entity_params["srnamespace"] = 0
        entity_params["srsearch"] = search_text
        return entity_params

    def get_props_search_params(self, search_text: str) -> Dict:
        """Build the API parameters for a property search (namespace 120)."""
        props_params = self.PARAMS.copy()
        props_params["srnamespace"] = 120
        props_params["srsearch"] = search_text
        return props_params

    def _search(self, request_params: Dict) -> List[Dict]:
        """Run one search request and return the raw hits.

        Raises:
            requests.RequestException: on connection failure, timeout or a
                non-success HTTP status.
            KeyError: if the response body has no ``query.search`` entry.
        """
        response = requests.get(
            self.wikidata_url, params=request_params, timeout=self.timeout
        )
        # Surface HTTP failures here rather than as a JSON/KeyError further down.
        response.raise_for_status()
        return response.json()["query"]["search"]

    @staticmethod
    def _strip_markup(snippet: str) -> str:
        """Remove all markup from an API snippet (no tag is allowed through)."""
        return nh3.clean(snippet, tags=set())

    def find_class_by_name(self, search_text: str) -> List[SearchResult]:
        """
        Uses Wikidata API to search for classes using their name/text.
        Uses local ID based class search to fetch label and description data.
        """
        payload_results = []
        for hit in self._search(self.get_class_search_params(search_text)):
            class_id = hit["title"]
            # The local lookup comes first: it initializes the class store,
            # which is also what installs OntClass.id2uri.
            local_class = self.get_local_class_properties(class_id)
            payload_results.append(
                SearchResult(
                    label=local_class.label,
                    id=class_id,
                    description=local_class.description,
                    uri=OntClass.id2uri(class_id),
                )
            )
        return payload_results

    def find_entity_by_name(self, search_text: str) -> List[SearchResult]:
        """Uses Wikidata API to search for entities using their name/text."""
        return [
            SearchResult(
                label=self._strip_markup(hit["titlesnippet"]),
                id=hit["title"],
                description=self._strip_markup(hit["snippet"]),
                uri=Entity.id2uri(hit["title"]),
            )
            for hit in self._search(self.get_entity_search_params(search_text))
        ]

    def find_props_by_name(self, search_text: str) -> List[SearchResult]:
        """Uses Wikidata API to search for properties using their name/text."""
        payload_results = []
        for hit in self._search(self.get_props_search_params(search_text)):
            # Keep what follows the first ":" of the title; a title without
            # a colon is used unchanged instead of raising IndexError.
            # NOTE(review): titles are presumably "Property:P31"-shaped — confirm.
            prop_id = hit["title"].split(":", 1)[-1]
            payload_results.append(
                SearchResult(
                    label=self._strip_markup(hit["titlesnippet"]),
                    id=prop_id,
                    description=self._strip_markup(hit["snippet"]),
                    uri=OntProperty.id2uri(prop_id),
                )
            )
        return payload_results
15 changes: 14 additions & 1 deletion sand/models/ontology.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,12 @@ def uri2id(uri: str) -> str:
"The method is set when its store is initialized. Check the call order to ensure `OntClassAR` is called first"
)

@staticmethod
def id2uri(id: str) -> str:
    """Placeholder for the class ID to class URI conversion.

    Always raises until the real converter is assigned, which happens when
    the class store is initialized through `OntClassAR`.
    """
    message = "The method is set when its store is initialized. Check the call order to ensure `OntClassAR` is called first"
    raise NotImplementedError(message)

OntPropertyDataType = Literal[
"monolingualtext",
Expand Down Expand Up @@ -62,6 +68,12 @@ def uri2id(uri: str) -> str:
"The method is set when its store is initialized. Check the call order to ensure `OntPropertyAR` is called first"
)

@staticmethod
def id2uri(id: str) -> str:
    """Placeholder for the property ID to property URI conversion.

    Always raises until the real converter is assigned, which happens when
    the property store is initialized through `OntPropertyAR`.
    """
    message = "The method is set when its store is initialized. Check the call order to ensure `OntPropertyAR` is called first"
    raise NotImplementedError(message)

PROP_AR = None
CLASS_AR = None
Expand Down Expand Up @@ -89,6 +101,7 @@ def OntPropertyAR() -> Mapping[str, OntProperty]:
func = import_func(cfg["constructor"])
PROP_AR = func(**cfg["args"])
OntProperty.uri2id = import_func(cfg["uri2id"])
OntProperty.id2uri = import_func(cfg["id2uri"])

return PROP_AR

Expand All @@ -102,5 +115,5 @@ def OntClassAR() -> Mapping[str, OntClass]:
func = import_func(cfg["constructor"])
CLASS_AR = func(**cfg["args"])
OntClass.uri2id = import_func(cfg["uri2id"])

OntClass.id2uri = import_func(cfg["id2uri"])
return CLASS_AR
13 changes: 13 additions & 0 deletions sand/models/search.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
from dataclasses import dataclass


@dataclass
class SearchResult:
    """
    One hit returned by a search: the values shown for a single matching
    entity, class or property.
    """
    # Display label of the hit.
    label: str
    # Identifier of the hit.
    id: str
    # Descriptive text for the hit.
    description: str
    # URI of the hit.
    uri: str

0 comments on commit 55ebb0a

Please sign in to comment.