# --- tests/test_graph_audit.py ---------------------------------------------


def test_schema_compliance_offer_unit_price_specification_no_range_warning() -> None:
    """A ``UnitPriceSpecification`` price spec must not raise a range warning.

    Builds a Product -> Offer -> UnitPriceSpecification graph, runs
    ``SchemaComplianceKpi`` with the ``schemaorg-grammar`` built-in shapes, and
    checks that the ``priceSpecification`` range-check message is absent from
    both the main warnings and the ``google_merchant`` warnings of the first
    per-URL entry.
    """
    from wordlift_sdk.validation.shacl import resolve_shape_specs

    product_iri = URIRef("https://example.org/products/p1")
    offer_iri = URIRef("https://example.org/offers/o1")
    price_spec_iri = URIRef("https://example.org/price-specs/ps1")
    triples = [
        (product_iri, RDF.type, _schema("Product")),
        (product_iri, _schema("url"), Literal("https://example.org/product/p1")),
        (product_iri, _schema("offers"), offer_iri),
        (offer_iri, RDF.type, _schema("Offer")),
        (offer_iri, _schema("priceSpecification"), price_spec_iri),
        (price_spec_iri, RDF.type, _schema("UnitPriceSpecification")),
        (price_spec_iri, _schema("price"), Literal("10.00")),
        (price_spec_iri, _schema("priceCurrency"), Literal("USD")),
    ]
    graph = _g(*triples)

    shape_specs = resolve_shape_specs(builtin_shapes=["schemaorg-grammar"])
    result = SchemaComplianceKpi(shape_specs=shape_specs, depth=2).collect(graph)

    # Gather both message lists before asserting, so a failure in the first
    # check does not hide an attribute error in the second lookup.
    main_messages = [entry.message for entry in result.by_url[0].warnings]
    merchant_messages = [
        entry.message for entry in result.by_url[0].google_merchant.warnings
    ]
    assert "Schema.org range check: priceSpecification." not in main_messages
    assert "Schema.org range check: priceSpecification." not in merchant_messages


# --- tests/test_product_snippet_validation.py ------------------------------


def test_schemaorg_grammar_price_specification_accepts_unit_price_specification(
    tmp_path: Path,
) -> None:
    """An Offer whose price spec is a ``UnitPriceSpecification`` is accepted.

    Writes a two-node JSON-LD document to ``tmp_path``, validates it against
    the ``schemaorg-grammar`` shapes, and checks that the extracted issues do
    not contain the ``priceSpecification`` range-check message.
    """
    offer_node = {
        "@id": "https://example.com/offers/o1",
        "@type": "Offer",
        "priceSpecification": {"@id": "https://example.com/price-specs/ps1"},
    }
    price_spec_node = {
        "@id": "https://example.com/price-specs/ps1",
        "@type": "UnitPriceSpecification",
        "price": 10.0,
        "priceCurrency": "USD",
    }
    document = {
        "@context": {"@vocab": "http://schema.org/"},
        "@graph": [offer_node, price_spec_node],
    }
    jsonld_file = tmp_path / "schemaorg-price-spec-ok.jsonld"
    jsonld_file.write_text(json.dumps(document), encoding="utf-8")

    outcome = shacl.validate_file(
        jsonld_file.as_posix(), shape_specs=["schemaorg-grammar"]
    )
    reported = [issue.message for issue in shacl.extract_validation_issues(outcome)]

    assert "Schema.org range check: priceSpecification." not in reported


def test_schemaorg_grammar_price_specification_rejects_invalid_type(
    tmp_path: Path,
) -> None:
    """An Offer whose price spec is only typed ``Thing`` is still reported.

    Counterpart of the acceptance test above: the same document shape, but the
    referenced node is typed ``Thing``, so the ``priceSpecification``
    range-check message must appear among the extracted issues.
    """
    offer_node = {
        "@id": "https://example.com/offers/o1",
        "@type": "Offer",
        "priceSpecification": {"@id": "https://example.com/price-specs/ps1"},
    }
    wrongly_typed_node = {
        "@id": "https://example.com/price-specs/ps1",
        "@type": "Thing",
        "name": "invalid price specification type",
    }
    document = {
        "@context": {"@vocab": "http://schema.org/"},
        "@graph": [offer_node, wrongly_typed_node],
    }
    jsonld_file = tmp_path / "schemaorg-price-spec-bad.jsonld"
    jsonld_file.write_text(json.dumps(document), encoding="utf-8")

    outcome = shacl.validate_file(
        jsonld_file.as_posix(), shape_specs=["schemaorg-grammar"]
    )
    reported = [issue.message for issue in shacl.extract_validation_issues(outcome)]

    assert "Schema.org range check: priceSpecification." in reported
main_validation.report_graph, self._main_validator.prepared_shapes.shape_source_map, + data_graph=subgraph, + shapes_graph=self._main_validator.prepared_shapes.shapes_graph, issue_level=self._issue_level, include_issue_details=self._include_issue_details, ) merchant_issues = _extract_issues( merchant_validation.report_graph, self._merchant_validator.prepared_shapes.shape_source_map, + data_graph=subgraph, + shapes_graph=self._merchant_validator.prepared_shapes.shapes_graph, issue_level=self._issue_level, include_issue_details=self._include_issue_details, ) @@ -310,6 +315,9 @@ def _build_subgraph( def _extract_issues( report_graph: Graph, source_map: dict, + *, + data_graph: Graph | None = None, + shapes_graph: Graph | None = None, issue_level: str = "warning", include_issue_details: bool = True, ) -> ExtractedIssues: @@ -319,6 +327,14 @@ def _extract_issues( warning_count = 0 for node in report_graph.subjects(SH.resultSeverity, None): + if _should_skip_schemaorg_subclass_range_warning( + report_graph=report_graph, + node=node, + source_map=source_map, + data_graph=data_graph, + shapes_graph=shapes_graph, + ): + continue severity_iri = report_graph.value(node, SH.resultSeverity) if severity_iri == SH.Violation: error_count += 1 diff --git a/wordlift_sdk/validation/shacl.py b/wordlift_sdk/validation/shacl.py index 9f50291..b9b0773 100644 --- a/wordlift_sdk/validation/shacl.py +++ b/wordlift_sdk/validation/shacl.py @@ -13,14 +13,17 @@ from html.parser import HTMLParser import json +from rdflib.collection import Collection from rdflib import Graph, URIRef -from rdflib.namespace import SH +from rdflib.namespace import RDF, SH from rdflib.term import Identifier from requests import Response, get from wordlift_sdk.render import RenderOptions, render_html DEFAULT_OPT_IN_EXCLUDED_SHAPES = {"google-image-license-metadata.ttl"} +_SCHEMAORG_GRAMMAR_SHAPE = "schemaorg-grammar.ttl" +_SCHEMAORG_SUBCLASS_ONTOLOGY_RESOURCE = "schemaorg-subclass-ontology.nt" _VALIDATOR_OPTIONS = { 
"inference": "rdfs", "abort_on_first": False, @@ -37,6 +40,7 @@ class ValidationResult: data_graph: Graph shape_source_map: dict[Identifier, str] warning_count: int + shapes_graph: Graph | None = None @dataclass @@ -289,6 +293,129 @@ def _read_shape_resource(name: str) -> str | None: return resource.read_text(encoding="utf-8") +@lru_cache(maxsize=1) +def _load_schemaorg_subclass_ontology() -> Graph | None: + data = _read_shape_resource(_SCHEMAORG_SUBCLASS_ONTOLOGY_RESOURCE) + if data is None: + return None + ontology = Graph() + ontology.parse(data=data, format="nt") + return _normalize_schema_org_uris(ontology) + + +def _normalize_schema_uri_ref(term: Identifier | None) -> URIRef | None: + if not isinstance(term, URIRef): + return None + value = str(term) + if value.startswith("https://schema.org/"): + return URIRef("http://schema.org/" + value[len("https://schema.org/") :]) + return term + + +@lru_cache(maxsize=1) +def _schemaorg_subclass_map() -> dict[URIRef, set[URIRef]]: + ontology = _load_schemaorg_subclass_ontology() + if ontology is None: + return {} + + subclass_map: dict[URIRef, set[URIRef]] = {} + for child, parent in ontology.subject_objects( + URIRef("http://www.w3.org/2000/01/rdf-schema#subClassOf") + ): + child_ref = _normalize_schema_uri_ref(child) + parent_ref = _normalize_schema_uri_ref(parent) + if child_ref is None or parent_ref is None: + continue + subclass_map.setdefault(child_ref, set()).add(parent_ref) + return subclass_map + + +def _schemaorg_superclasses(class_iri: URIRef) -> set[URIRef]: + subclass_map = _schemaorg_subclass_map() + seen: set[URIRef] = set() + stack = [class_iri] + while stack: + current = stack.pop() + for parent in subclass_map.get(current, set()): + if parent in seen: + continue + seen.add(parent) + stack.append(parent) + return seen + + +def _shape_expected_classes( + shapes_graph: Graph, source_shape: Identifier +) -> set[URIRef]: + expected: set[URIRef] = set() + list_node = shapes_graph.value(source_shape, 
SH["or"]) + if list_node is None: + return expected + try: + members = Collection(shapes_graph, list_node) + except Exception: + return expected + + for member in members: + class_term = shapes_graph.value(member, SH["class"]) + class_ref = _normalize_schema_uri_ref(class_term) + if class_ref is not None: + expected.add(class_ref) + return expected + + +def _should_skip_schemaorg_subclass_range_warning( + *, + report_graph: Graph, + node: Identifier, + source_map: dict[Identifier, str], + data_graph: Graph | None, + shapes_graph: Graph | None, +) -> bool: + if data_graph is None or shapes_graph is None: + return False + + source_shape = report_graph.value(node, SH.sourceShape) + if source_shape is None: + return False + shape_source = source_map.get(source_shape) or source_map.get(str(source_shape)) + if shape_source != _SCHEMAORG_GRAMMAR_SHAPE.removesuffix(".ttl"): + return False + + message = report_graph.value(node, SH.resultMessage) + if not isinstance(message, Identifier) or not str(message).startswith( + "Schema.org range check:" + ): + return False + + value = report_graph.value(node, SH.value) + value_ref = _normalize_schema_uri_ref(value) + if value_ref is None: + return False + + expected_classes = _shape_expected_classes(shapes_graph, source_shape) + if not expected_classes: + return False + + value_types = { + type_ref + for type_ref in ( + _normalize_schema_uri_ref(term) + for term in data_graph.objects(value_ref, RDF.type) + ) + if type_ref is not None + } + if not value_types: + return False + + for value_type in value_types: + if value_type in expected_classes: + return True + if expected_classes.intersection(_schemaorg_superclasses(value_type)): + return True + return False + + def _resolve_shape_sources(shape_specs: Iterable[str] | None) -> list[str]: if not shape_specs: return _default_shape_resource_names() @@ -429,6 +556,7 @@ def validate_file( data_graph=result.data_graph, shape_source_map=validator.prepared_shapes.shape_source_map, 
warning_count=result.warning_count, + shapes_graph=validator.prepared_shapes.shapes_graph, ) @@ -458,6 +586,7 @@ def validate_jsonld_from_url( data_graph=result.data_graph, shape_source_map=validator.prepared_shapes.shape_source_map, warning_count=result.warning_count, + shapes_graph=validator.prepared_shapes.shapes_graph, ) @@ -470,6 +599,14 @@ def _severity_to_level(severity: Identifier | None) -> str: def extract_validation_issues(result: ValidationResult) -> list[ValidationIssue]: issues: list[ValidationIssue] = [] for node in result.report_graph.subjects(SH.resultSeverity, None): + if _should_skip_schemaorg_subclass_range_warning( + report_graph=result.report_graph, + node=node, + source_map=result.shape_source_map, + data_graph=result.data_graph, + shapes_graph=result.shapes_graph, + ): + continue severity = result.report_graph.value(node, SH.resultSeverity) source_shape = result.report_graph.value(node, SH.sourceShape) message = result.report_graph.value(node, SH.resultMessage) diff --git a/wordlift_sdk/validation/shacls/schemaorg-subclass-ontology.nt b/wordlift_sdk/validation/shacls/schemaorg-subclass-ontology.nt new file mode 100644 index 0000000..f0a9585 --- /dev/null +++ b/wordlift_sdk/validation/shacls/schemaorg-subclass-ontology.nt @@ -0,0 +1,996 @@ + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . 
+ . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . 
+ . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + .