In [None]:
!pip install shexer

sheXer can handle different types of inputs:

* Local/remote files.
* In-memory string content.
* SPARQL endpoints.
* Compressed files.
* RDFlib graphs.

In this notebook, you'll find examples of how to provide each of these inputs to sheXer.


In [3]:
from shexer.shaper import Shaper

def default_namespaces():
    """Build the namespace-URI -> prefix mapping reused by the examples.

    Returns:
        A new dict on every call, whose keys are namespace URIs and whose
        values are the prefixes associated with them. The examples pass it
        as ``namespaces_dict`` when creating a ``Shaper``.
    """
    uri_prefix_pairs = (
        ("http://example.org/", "ex"),
        ("http://www.w3.org/XML/1998/namespace/", "xml"),
        ("http://www.w3.org/1999/02/22-rdf-syntax-ns#", "rdf"),
        ("http://www.w3.org/2000/01/rdf-schema#", "rdfs"),
        ("http://www.w3.org/2001/XMLSchema#", "xsd"),
        ("http://xmlns.com/foaf/0.1/", "foaf"),
    )
    return dict(uri_prefix_pairs)

In [None]:
# Getting some shapes for a graph stored in a str object

raw_graph_turtle = """@prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> .
@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .
@prefix ex: <http://example.org/> .
@prefix foaf: <http://xmlns.com/foaf/0.1/> .
@prefix xsd: <http://www.w3.org/2001/XMLSchema#> .

ex:Jimmy a foaf:Person ;  # Complete
	foaf:age "23"^^xsd:integer ;
	foaf:name "Jimmy" ;
	foaf:familyName "Jones" .

ex:Sarah a foaf:Person ;  # Complete implicit type for age
	foaf:age 22 ;
	foaf:name "Sarah" ;
	foaf:familyName "Salem" .

ex:Bella a foaf:Person ;  # Missing familyName
	foaf:age "56"^^xsd:integer ;
	foaf:name "Isabella" .

ex:David a foaf:Person ;  # Missing age and use knows
	foaf:name "David" ;
	foaf:familyName "Doulofeau" ;
	foaf:knows ex:Sarah .

ex:HumanLike foaf:name "Person" ;  # foaf properties, but not explicit type.
	foaf:familyName "Maybe" ;
	foaf:age 99 ;
	foaf:knows ex:David .


ex:x1 rdf:type foaf:Document ;
	foaf:depiction "A thing that is nice" ;
	foaf:title "A nice thing" .


ex:x2 rdf:type foaf:Document ;
	foaf:title "Another thing" ."""

from shexer.consts import TURTLE_ITER, TURTLE

# Build a Shaper that reads the graph directly from the in-memory string.
shaper = Shaper(
            raw_graph=raw_graph_turtle,  # parameter used to pass the input graph as a str
            namespaces_dict=default_namespaces(),  # some namespaces to prettify the result
            all_classes_mode=True,  # get a shape for each class with instances
            input_format=TURTLE_ITER)  # the input format must be indicated. Available options are in shexer.consts
                                       # TURTLE_ITER is Turtle, but parsed with the parser implemented within sheXer.
                                       # The TURTLE input format is also available; that option uses rdflib's parser.

# Get the resulting shapes as a string and print them.
str_result = shaper.shex_graph(string_output=True)
print(str_result)


In [None]:
# Same case, same content. But now the input is in a remote file.

shaper = Shaper(
            graph_file_input="https://raw.githubusercontent.com/weso/shexer/refs/heads/master/test/t_files/t_graph_1.ttl",  # parameter used to pass the URL of a remote file
            namespaces_dict=default_namespaces(),
            all_classes_mode=True,
            input_format=TURTLE)  # using rdflib's parser, some more namespaces appear in the results (the ones preloaded in an rdflib.Graph() object)

# Get the resulting shapes as a string and print them.
str_result = shaper.shex_graph(string_output=True)
print(str_result)

In [None]:
# Same case, but now the file is local

import requests

def remote_to_local(url, local_path, timeout=30):
  """Download the text served at ``url`` and save it to ``local_path`` as UTF-8.

  Args:
    url: address of the remote file, fetched with an HTTP GET.
    local_path: path of the local file to create or overwrite.
    timeout: seconds to wait for the server before giving up.

  Raises:
    requests.HTTPError: if the server answers with an error status.
    requests.RequestException: for connection problems or timeouts.
  """
  response = requests.get(url, timeout=timeout)
  # Fail loudly here: silently skipping the write would only surface later
  # as a missing (or stale) local file when the graph is parsed.
  response.raise_for_status()
  with open(local_path, "w", encoding="utf-8") as out_stream:
    out_stream.write(response.text)

# Save a copy of the remote Turtle file as "local_file.ttl" so that it can be read from disk.
remote_to_local("https://raw.githubusercontent.com/weso/shexer/refs/heads/master/test/t_files/t_graph_1.ttl",
                "local_file.ttl")

shaper = Shaper(
            graph_file_input="./local_file.ttl",  # the parameter for local and remote files is the same
            namespaces_dict=default_namespaces(),
            all_classes_mode=True,
            input_format=TURTLE)

# Get the resulting shapes as a string and print them.
str_result = shaper.shex_graph(string_output=True)
print(str_result)

In [None]:
# Same case, but now the parsed content is N-Triples instead. This format uses an internal parser too
from shexer.consts import NT

# Save a copy of the remote N-Triples file as "local_file.nt".
remote_to_local("https://raw.githubusercontent.com/weso/shexer/refs/heads/master/test/t_files/t_graph_1.nt",
                "local_file.nt")

shaper = Shaper(
            graph_file_input="./local_file.nt",  # the parameter for local and remote files is the same
            namespaces_dict=default_namespaces(),
            all_classes_mode=True,
            input_format=NT)  # the only change with respect to the Turtle examples is the declared format

# Get the resulting shapes as a string and print them.
str_result = shaper.shex_graph(string_output=True)
print(str_result)

In [None]:
# Same case, but now the content is split into two different files that should be parsed as a single dataset

remote_to_local("https://raw.githubusercontent.com/weso/shexer/refs/heads/master/test/t_files/t_graph_1.nt",
                "local_file.nt")

# remote_to_local saves the download as UTF-8, so the file is read back and the
# two parts are written with the same explicit encoding rather than the
# platform default.
with open("local_file.nt", encoding="utf-8") as in_stream:
  lines = in_stream.readlines()
  with open("local_file_pt1.nt", "w", encoding="utf-8") as out_1:  # Writing the first 7 lines to "local_file_pt1.nt"
    out_1.write("".join(lines[:7]))
  with open("local_file_pt2.nt", "w", encoding="utf-8") as out_2:  # Writing the rest of the lines to "local_file_pt2.nt"
    out_2.write("".join(lines[7:]))

shaper = Shaper(
            graph_list_of_files_input=["./local_file_pt1.nt", "./local_file_pt2.nt"],  # use this parameter to parse a list of files instead of a single file
            namespaces_dict=default_namespaces(),
            all_classes_mode=True,
            input_format=NT)  # a single input format is declared for the whole list of files

# Get the resulting shapes as a string and print them.
str_result = shaper.shex_graph(string_output=True)
print(str_result)


In [None]:
# Same case, but now we process an rdflib graph object

from rdflib import Graph

# Save a copy of the remote Turtle file as "local_file.ttl".
remote_to_local("https://raw.githubusercontent.com/weso/shexer/refs/heads/master/test/t_files/t_graph_1.ttl",
                "local_file.ttl")

# Load the local file into an rdflib Graph object; that object is the input handed to sheXer.
g = Graph()
g.parse("./local_file.ttl")

shaper = Shaper(
            rdflib_graph=g,  # use this parameter for rdflib inputs
            namespaces_dict=default_namespaces(),
            all_classes_mode=True,
            input_format=TURTLE)

# Get the resulting shapes as a string and print them.
str_result = shaper.shex_graph(string_output=True)
print(str_result)

In [None]:
# Same target content, but now it is compressed in a ZIP file.
# In this case, the archive consists of a single file. But in case the archive
# contains more files, the setting would be the same and all files would be
# parsed as a single data source

from shexer.consts  import ZIP, TURTLE_ITER
import urllib.request

# Download the ZIP archive and store it as "local_file.zip".
urllib.request.urlretrieve("https://github.com/weso/shexer/raw/refs/heads/master/test/t_files/compression/t_graph_1.ttl.zip",
                           "local_file.zip")

shaper = Shaper(
            graph_file_input="./local_file.zip",  # as it is a file input, we still use this parameter to declare the path
            namespaces_dict=default_namespaces(),
            all_classes_mode=True,
            input_format=TURTLE_ITER,  # format of the compressed content
            compression_mode=ZIP)  # declares that the input file is a ZIP archive

# Get the resulting shapes as a string and print them.
str_result = shaper.shex_graph(string_output=True)
print(str_result)

In [None]:
# Example to generate a shape for some nodes exposed in the DBpedia endpoint.
# With this setting, only 4 nodes (the result of the SPARQL query) will be used
# as examples, exploring only the direct connections of those nodes. No shape
# other than the one specified in the shape map will be generated.



shape_map_raw = "SPARQL'select ?s where {?s a <http://dbpedia.org/ontology/Person>} LIMIT 4'@<ShapePerson>"
shaper = Shaper(shape_map_raw=shape_map_raw,  # indicates target shapes and nodes as a shape map
                url_endpoint="https://dbpedia.org/sparql",  # target url
                namespaces_dict=default_namespaces(),  # some namespaces to prettify results.
                depth_for_building_subgraph=1,  # distance of exploration from seed nodes
                track_classes_for_entities_at_last_depth_level=False, # no exception for the previous rule
                all_classes_mode=False)  # no class out of the content specified in the shape map will generate a shape
str_result = shaper.shex_graph(string_output=True,
                               acceptance_threshold=0.9)  # only accept very frequent observations
print(str_result)