Skip to content

Commit

Permalink
Merge pull request #57 from Lakshmi-bashyam/master
Browse files Browse the repository at this point in the history
Extend to new label types
  • Loading branch information
Lakshmi-bashyam committed Feb 27, 2024
2 parents 7b0d47b + e16e708 commit b28615a
Show file tree
Hide file tree
Showing 14 changed files with 97 additions and 70 deletions.
7 changes: 5 additions & 2 deletions CITATION.cff
Original file line number Diff line number Diff line change
Expand Up @@ -7,9 +7,12 @@ authors:
- family-names: Bartz
given-names: Christopher
affiliation: "ZBW - Leibniz Information Centre for Economics"
- family-names: Rajendram Bashyam
given-names: Lakshmi
affiliation: "ZBW - Leibniz Information Centre for Economics"
title: "stwfsapy (a library for matching labels of thesaurus concepts via finite-state automata)"
abstract: "This library provides functionality to find the labels of SKOS thesaurus concepts in text. A deterministic finite automaton is constructed from the labels of the thesaurus concepts to perform the matching. In addition, a classifier is trained to score the matched concept occurrences."
version: 0.3.0
version: 0.4.0
license: Apache-2.0
date-released: 2021-10-18
repository-code: "https://github.com/zbw/stwfsapy"
Expand All @@ -28,4 +31,4 @@ references:
given-names: Martin
title: "stwfsa"
type: software
repository-code: "https://github.com/zbw/stwfsa"
repository-code: "https://github.com/zbw/stwfsa"
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ In addition, a classifier is trained to score the matched concept occurrences.

## Data Requirements
The construction of the automaton requires a SKOS thesaurus represented as a `rdflib` `Graph`.
Concepts should be related to labels by `skos:prefLabel` or `skos:altLabel`.
Concepts should be related to labels by `skos:prefLabel`, `skos:altLabel`, `zbwext:altLabelNarrower`, `zbwext:altLabelRelated` or `skos:hiddenLabel`.
Concepts have to be identifiable by `rdf:type`.
The training of the predictor requires annotated text.
Each training sample should be annotated with one or more concepts from the thesaurus.
Expand Down
2 changes: 1 addition & 1 deletion stwfsapy/tests/predictor_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
# limitations under the License.

from stwfsapy.text_features import mk_text_features
from scipy.sparse.lil import lil_matrix
from scipy.sparse import lil_matrix
from stwfsapy import predictor as p
import stwfsapy.thesaurus as t
from stwfsapy.automata.dfa import Dfa
Expand Down
34 changes: 20 additions & 14 deletions stwfsapy/tests/thesaurus/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,29 +14,35 @@


from rdflib.term import URIRef, Literal
from rdflib.namespace import Namespace


test_ref_type = URIRef("http://zbw.eu/stw/descriptor")
ZBWEXT = Namespace("http://zbw.eu/namespaces/zbw-extensions/")

_thsys_uri_print = "http://zbw.eu/stw/thsys/70265"
_thsys_uri_media = "https://zbw.eu/stw/thsys/181994"
_thsys_uri_insurance = "http://zbw.eu/stw/thsys/70892"
_thsys_uri_it = "https://zbw.eu/stw/thsys/73341"

thsys_ref_print = URIRef(_thsys_uri_print)
thsys_ref_media = URIRef(_thsys_uri_media)
thsys_ref_insurance = URIRef(_thsys_uri_insurance)
thsys_ref_it = URIRef(_thsys_uri_it)

thsys_prefLabel_print_en = Literal("Printed matters", lang="en")
thsys_prefLabel_insurance_en = Literal("Insurance industry", lang="en")


_concept_uri_printed = "http://zbw.eu/stw/descriptor/14812-5"
_concept_uri_media = "http://zbw.eu/stw/descriptor/18211-4"
_concept_uri_insurance = "http://zbw.eu/stw/descriptor/13811-5"
_concept_uri_it = "http://zbw.eu/stw/descriptor/30373-6"

concept_ref_printed = URIRef(_concept_uri_printed)
concept_ref_media = URIRef(_concept_uri_media)
concept_ref_insurance = URIRef(_concept_uri_insurance)
concept_ref_it = URIRef(_concept_uri_it)

concept_prefLabel_printed_en = Literal("Printed Products", lang="en")
concept_prefLabel_printed_de = Literal("Druckerzeugnis", lang="de")
concept_prefLabel_printed_missing = Literal("Druckerzeugnis")
concept_prefLabel_insurance_en = Literal("Private insurance", lang="en")
concept_prefLabel_insurance_de = Literal("Privatversicherung", lang="de")
concept_prefLabel_insurance_missing = Literal("Privatversicherung")

concept_altLabel_printed_en = Literal("Print Media", lang="en")
concept_altLabel_insurance_en = Literal("Mutual insurance", lang="en")
concept_altLabelRelated_insurance_en = Literal(
"Insurance cooperative",
lang="en")

concept_prefLabel_media_en = Literal("Press media", lang="en")
concept_prefLabel_it_en = Literal("Electronic identification", lang="en")
concept_altLabelNarrower_it_en = Literal("Digital signature", lang="en")
40 changes: 24 additions & 16 deletions stwfsapy/tests/thesaurus/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,47 +22,55 @@
@pytest.fixture
def tuples():
return [
(c.concept_ref_printed, c.concept_prefLabel_printed_en),
(c.concept_ref_media, c.concept_prefLabel_media_en),
(c.concept_ref_printed, c.concept_prefLabel_printed_missing)
(c.concept_ref_insurance, c.concept_prefLabel_insurance_en),
(c.concept_ref_it, c.concept_prefLabel_it_en),
(c.concept_ref_insurance, c.concept_prefLabel_insurance_missing)
]


@pytest.fixture
def label_graph():
g = Graph()
g.add((
c.concept_ref_printed,
c.concept_ref_insurance,
SKOS.prefLabel,
c.concept_prefLabel_printed_en))
c.concept_prefLabel_insurance_en))
g.add((
c.concept_ref_printed,
c.concept_ref_insurance,
SKOS.altLabel,
c.concept_altLabel_printed_en))
c.concept_altLabel_insurance_en))
g.add((
c.concept_ref_media,
SKOS.altLabel,
c.concept_altLabel_printed_en))
c.concept_ref_insurance,
c.ZBWEXT.altLabelRelated,
c.concept_altLabelRelated_insurance_en))
g.add((
c.concept_ref_it,
SKOS.prefLabel,
c.concept_prefLabel_it_en))
g.add((
c.concept_ref_it,
c.ZBWEXT.altLabelNarrower,
c.concept_altLabelNarrower_it_en))
return g


@pytest.fixture
def typed_label_graph(label_graph):
g = label_graph
g.add((
c.concept_ref_printed,
c.concept_ref_insurance,
SKOS.prefLabel,
c.concept_prefLabel_printed_de))
c.concept_prefLabel_insurance_de))
g.add((
c.thsys_ref_print,
c.thsys_ref_insurance,
SKOS.prefLabel,
c.thsys_prefLabel_print_en))
c.thsys_prefLabel_insurance_en))
g.add((
c.concept_ref_media,
c.concept_ref_it,
RDF.type,
c.test_ref_type))
g.add((
c.concept_ref_printed,
c.concept_ref_insurance,
RDF.type,
c.test_ref_type))
return g
8 changes: 4 additions & 4 deletions stwfsapy/tests/thesaurus/extract_by_type_uri_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,13 +19,13 @@
def test_extract_by_type_uri(typed_label_graph):
res = list(t.extract_by_type_uri(typed_label_graph, c.test_ref_type))
assert len(res) == 2
assert c.concept_ref_printed in res
assert c.concept_ref_media in res
assert c.concept_ref_insurance in res
assert c.concept_ref_it in res


def test_extract_by_type_uri_with_remove(typed_label_graph):
res = list(t.extract_by_type_uri(
typed_label_graph,
c.test_ref_type,
{c.concept_ref_printed}))
assert res == [c.concept_ref_media]
{c.concept_ref_insurance}))
assert res == [c.concept_ref_it]
4 changes: 2 additions & 2 deletions stwfsapy/tests/thesaurus/extract_deprecated_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,6 @@


def test_extract_deprecated(label_graph):
label_graph.add((c.concept_ref_printed, OWL.deprecated, Literal(True)))
label_graph.add((c.concept_ref_insurance, OWL.deprecated, Literal(True)))
res = list(t.extract_deprecated(label_graph))
assert res == [c.concept_ref_printed]
assert res == [c.concept_ref_insurance]
8 changes: 6 additions & 2 deletions stwfsapy/tests/thesaurus/extract_labels_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,5 +19,9 @@

def test_extract_labels(label_graph):
extracted = list(t.extract_labels(label_graph))
assert (c.concept_ref_printed, c.concept_prefLabel_printed_en) in extracted
assert (c.concept_ref_printed, c.concept_altLabel_printed_en) in extracted
assert (c.concept_ref_insurance,
c.concept_prefLabel_insurance_en) in extracted
assert (c.concept_ref_insurance,
c.concept_altLabel_insurance_en) in extracted
assert (c.concept_ref_insurance,
c.concept_altLabelRelated_insurance_en) in extracted
6 changes: 3 additions & 3 deletions stwfsapy/tests/thesaurus/language_filter_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,9 +21,9 @@
@pytest.fixture
def multi_lang_tuples():
return [
(c.concept_ref_printed, c.concept_prefLabel_printed_en),
(c.concept_ref_printed, c.concept_prefLabel_printed_de),
(c.concept_ref_printed, c.concept_prefLabel_printed_missing)
(c.concept_ref_insurance, c.concept_prefLabel_insurance_en),
(c.concept_ref_insurance, c.concept_prefLabel_insurance_de),
(c.concept_ref_insurance, c.concept_prefLabel_insurance_missing)
]


Expand Down
18 changes: 9 additions & 9 deletions stwfsapy/tests/thesaurus/retrieve_concept_labels_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,8 +33,8 @@ def patch_module(mocker):
@pytest.fixture
def concept_set():
return {
c.concept_ref_printed,
c.concept_ref_media
c.concept_ref_insurance,
c.concept_ref_it
}


Expand Down Expand Up @@ -78,12 +78,12 @@ def test_integration(typed_label_graph, concept_set):
langs={"en"},
))
assert (
c.concept_ref_printed,
c.concept_prefLabel_printed_en.value) in result
c.concept_ref_insurance,
c.concept_prefLabel_insurance_en.value) in result
assert (
c.concept_ref_printed,
c.concept_altLabel_printed_en.value) in result
c.concept_ref_insurance,
c.concept_altLabel_insurance_en.value) in result
assert (
c.concept_ref_printed,
c.concept_prefLabel_printed_de.value) not in result
assert c.thsys_ref_print not in map(lambda t: t[0], result)
c.concept_ref_insurance,
c.concept_prefLabel_insurance_de.value) not in result
assert c.thsys_ref_insurance not in map(lambda t: t[0], result)
12 changes: 6 additions & 6 deletions stwfsapy/tests/thesaurus/set_filter_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,17 +18,17 @@

def test_filter_subject_tuples_from_set(tuples):
res = list(
t.filter_subject_tuples_from_set(tuples, {c.concept_ref_printed}))
t.filter_subject_tuples_from_set(tuples, {c.concept_ref_insurance}))
assert len(res) == 2
assert res[0][0] == c.concept_ref_printed
assert res[1][0] == c.concept_ref_printed
assert res[0][0] == c.concept_ref_insurance
assert res[1][0] == c.concept_ref_insurance


def test_filter_refs_from_set_complement():
res = list(
t._filter_refs_from_set_complement(
[c.concept_ref_printed, c.concept_ref_media],
{c.concept_ref_media}
[c.concept_ref_insurance, c.concept_ref_it],
{c.concept_ref_it}
)
)
assert res == [c.concept_ref_printed]
assert res == [c.concept_ref_insurance]
10 changes: 5 additions & 5 deletions stwfsapy/tests/thesaurus_features_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,14 +26,14 @@

def test_collect_po_from_tuples():
tuples = [
(tc.concept_ref_printed, tc.concept_ref_media),
(tc.concept_ref_media, tc.thsys_ref_media),
(tc.concept_ref_printed, tc.thsys_ref_print)
(tc.concept_ref_insurance, tc.concept_ref_it),
(tc.concept_ref_it, tc.thsys_ref_it),
(tc.concept_ref_insurance, tc.thsys_ref_insurance)
]
po = tf._collect_po_from_tuples(tuples)
assert po == {
tc.concept_ref_printed: {tc.concept_ref_media, tc.thsys_ref_print},
tc.concept_ref_media: {tc.thsys_ref_media}
tc.concept_ref_insurance: {tc.concept_ref_it, tc.thsys_ref_insurance},
tc.concept_ref_it: {tc.thsys_ref_it}
}


Expand Down
2 changes: 1 addition & 1 deletion stwfsapy/tests/util/passthrough_transformer_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ def test_array_input():
]
pt = PassthroughTransformer()
out_feat = pt.transform(in_feat)
assert type(out_feat) == np.ndarray
assert isinstance(out_feat, np.ndarray)
assert (sp.diags([[1, 7, -3]], [0]).toarray() == out_feat).all()


Expand Down
14 changes: 10 additions & 4 deletions stwfsapy/thesaurus.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,14 +16,19 @@
from typing import Tuple, FrozenSet, Iterable, Any, Optional
from rdflib import Graph
from rdflib.term import Literal, URIRef
from rdflib.namespace import SKOS, OWL, RDF
from rdflib.namespace import Namespace, SKOS, OWL, RDF

ZBWEXT = Namespace("http://zbw.eu/namespaces/zbw-extensions/")


def extract_labels(g: Graph) -> Iterable[Tuple[URIRef, Literal]]:
"""
Extracts SKOS.prefLabels and SKOS.altLabels from a rdflib.Graph
Extracts SKOS.prefLabels, SKOS.altLabels, SKOS.hiddenLabels,
ZBWEXT.altLabelRelated and ZBWEXT.altLabelNarrower from an rdflib.Graph
"""
return g[: SKOS.prefLabel | SKOS.altLabel]
# return g[: SKOS.prefLabel | SKOS.altLabel]
return g[: SKOS.prefLabel | SKOS.altLabel | SKOS.hiddenLabel |
ZBWEXT.altLabelRelated | ZBWEXT.altLabelNarrower]


def extract_by_type_uri(
Expand Down Expand Up @@ -106,7 +111,8 @@ def retrieve_concept_labels(
allowed: Optional[FrozenSet[URIRef]] = frozenset(),
langs: FrozenSet[str] = frozenset()
) -> Iterable[Tuple[URIRef, str]]:
"""Extracts altLabels and prefLabels from a SKOS graph.
"""Extracts altLabels, altLabelNarrower,
altLabelRelated, hiddenLabels and prefLabels from a SKOS graph.
Only the labels that are in one of the specified languages will be reported.
In addition the concept URIs can be limited by a set.
Expand Down

0 comments on commit b28615a

Please sign in to comment.