# Exploring Schema.org JSON-LD data

In [22]:
import copy
import json
import requests
from pyld import jsonld
import rdflib
from rdflib import Graph, plugin, ConjunctiveGraph
from rdflib.serializer import Serializer

In [2]:
# Download the pinned 3.5 release of the Schema.org vocabulary as JSON-LD text.
url = 'https://schema.org/version/3.5/schema.jsonld'
# The timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() fails loudly on an HTTP error instead of handing an error
# page to json.loads in the next cell.
response = requests.get(url, timeout=30)
response.raise_for_status()
content = response.content.decode('utf8')

In [3]:
# Parse the downloaded JSON-LD text into plain Python objects.
content_js = json.loads(content)

In [4]:
# Overview of the top-level JSON-LD document: its keys, identifier and context.
# Each extractor runs only when its line is printed, one after the other.
overview = (
    ('Keys:', lambda doc: doc.keys()),
    ('@id:', lambda doc: doc['@id']),
    ('@context:', lambda doc: doc['@context']),
)
for label, extract in overview:
    print(label, extract(content_js))

Keys: dict_keys(['@context', '@graph', '@id'])
@id: http://schema.org/#3.5
@context: {'rdf': 'http://www.w3.org/1999/02/22-rdf-syntax-ns#', 'rdfs': 'http://www.w3.org/2000/01/rdf-schema#', 'xsd': 'http://www.w3.org/2001/XMLSchema#'}


In [5]:
# Container type and number of nodes of the `@graph` member, one per line.
graph_nodes = content_js['@graph']
print(type(graph_nodes), len(graph_nodes), sep='\n')

<class 'list'>
1646


In [6]:
# Union of every key used by any node in the graph.
keys = {key for node in content_js['@graph'] for key in node}

keys

{'@id',
 '@type',
 'http://purl.org/dc/terms/source',
 'http://schema.org/category',
 'http://schema.org/domainIncludes',
 'http://schema.org/inverseOf',
 'http://schema.org/rangeIncludes',
 'http://schema.org/sameAs',
 'http://schema.org/supersededBy',
 'http://www.w3.org/2002/07/owl#equivalentClass',
 'http://www.w3.org/2002/07/owl#equivalentProperty',
 'http://www.w3.org/2004/02/skos/core#closeMatch',
 'rdfs:comment',
 'rdfs:label',
 'rdfs:subClassOf',
 'rdfs:subPropertyOf'}

In [7]:
# Collect every distinct `@type` value in the graph; along the way, display the
# nodes that are instances of GenderType.
types = set()
for node in content_js['@graph']:
    raw_type = node['@type']
    # `@type` is either one IRI or a list of IRIs; work on a set either way.
    if isinstance(raw_type, list):
        node_types = set(raw_type)
    else:
        node_types = {raw_type}
    types.update(node_types)
    if 'http://schema.org/GenderType' in node_types:
        display(node)
types

{'@id': 'http://schema.org/Male',
 '@type': 'http://schema.org/GenderType',
 'rdfs:comment': 'The male gender.',
 'rdfs:label': 'Male'}

{'@id': 'http://schema.org/Female',
 '@type': 'http://schema.org/GenderType',
 'rdfs:comment': 'The female gender.',
 'rdfs:label': 'Female'}

{'http://schema.org/ActionStatusType',
 'http://schema.org/Audience',
 'http://schema.org/BoardingPolicyType',
 'http://schema.org/BookFormatType',
 'http://schema.org/Boolean',
 'http://schema.org/ContactPointOption',
 'http://schema.org/DataType',
 'http://schema.org/DayOfWeek',
 'http://schema.org/DeliveryMethod',
 'http://schema.org/DigitalDocumentPermissionType',
 'http://schema.org/DriveWheelConfigurationValue',
 'http://schema.org/EventStatusType',
 'http://schema.org/GamePlayMode',
 'http://schema.org/GameServerStatus',
 'http://schema.org/GenderType',
 'http://schema.org/ItemAvailability',
 'http://schema.org/ItemListOrderType',
 'http://schema.org/MapCategoryType',
 'http://schema.org/MusicAlbumProductionType',
 'http://schema.org/MusicAlbumReleaseType',
 'http://schema.org/MusicReleaseFormatType',
 'http://schema.org/OfferItemCondition',
 'http://schema.org/OrderStatus',
 'http://schema.org/PaymentStatusType',
 'http://schema.org/ReservationStatusType',
 'http://schema.org/R

In [8]:
# Index every node typed as rdfs:Class by its rdfs:label.
# `@type` may be a single value or a list, so it is wrapped in a list when needed.
classes = {
    node['rdfs:label']: node
    for node in content_js['@graph']
    if 'rdfs:Class' in (
        node['@type'] if isinstance(node['@type'], list) else [node['@type']]
    )
}
print('# Classes:', len(classes))

# Classes: 624


In [9]:
# Display the nodes labelled 'Dataset' or 'GenderType', plus any node whose
# label starts with 'About'.
for node in content_js['@graph']:
    label = node['rdfs:label']
    if label == 'Dataset' or label.startswith('About') or label == 'GenderType':
        display(node)

{'@id': 'http://schema.org/AboutPage',
 '@type': 'rdfs:Class',
 'rdfs:comment': 'Web page type: About page.',
 'rdfs:label': 'AboutPage',
 'rdfs:subClassOf': {'@id': 'http://schema.org/WebPage'}}

{'@id': 'http://schema.org/Dataset',
 '@type': 'rdfs:Class',
 'http://purl.org/dc/terms/source': {'@id': 'http://www.w3.org/wiki/WebSchemas/SchemaDotOrgSources#source_DatasetClass'},
 'http://www.w3.org/2002/07/owl#equivalentClass': [{'@id': 'http://rdfs.org/ns/void#Dataset'},
  {'@id': 'http://purl.org/dc/dcmitype/Dataset'},
  {'@id': 'http://www.w3.org/ns/dcat#Dataset'}],
 'rdfs:comment': 'A body of structured information describing some topic(s) of interest.',
 'rdfs:label': 'Dataset',
 'rdfs:subClassOf': {'@id': 'http://schema.org/CreativeWork'}}

{'@id': 'http://schema.org/GenderType',
 '@type': 'rdfs:Class',
 'rdfs:comment': 'An enumeration of genders.',
 'rdfs:label': 'GenderType',
 'rdfs:subClassOf': {'@id': 'http://schema.org/Enumeration'}}

In [10]:
# Index, by rdfs:label, every node that directly subclasses schema.org/Enumeration.
enumeration_id = 'http://schema.org/Enumeration'
enums = {}

for node in content_js['@graph']:
    if 'rdfs:subClassOf' not in node:
        continue
    parents = node['rdfs:subClassOf']
    # rdfs:subClassOf holds either one {'@id': ...} reference or a list of them.
    if isinstance(parents, dict):
        parents = [parents]
    elif not isinstance(parents, list):
        continue
    if enumeration_id in [parent['@id'] for parent in parents]:
        enums[node['rdfs:label']] = node
print('# Enums types:', len(enums))

# Enums types: 29


In [11]:
# Attach to each enumeration type, under the 'values' key, the graph nodes that
# are instances of it.
#
# `@type` is either a single IRI string or a list of IRIs. It is normalised to a
# list before the membership test: `iri in some_string` is a substring check, so
# an enumeration IRI that is a prefix of another type IRI would otherwise match
# nodes of that other type.
typed_nodes = []
for node in content_js['@graph']:
    node_type = node['@type']
    typed_nodes.append(
        (node, node_type if isinstance(node_type, list) else [node_type])
    )

# NOTE: the entries of `enums` are the graph nodes themselves, so this adds a
# 'values' key to those nodes inside content_js['@graph'] as well.
for enum_node in enums.values():
    enum_id = enum_node['@id']
    enum_node['values'] = [
        node for node, node_types in typed_nodes if enum_id in node_types
    ]
print('# Gender Enum values:')
enums['GenderType']['values']

# Gender Enum values:


[{'@id': 'http://schema.org/Male',
  '@type': 'http://schema.org/GenderType',
  'rdfs:comment': 'The male gender.',
  'rdfs:label': 'Male'},
 {'@id': 'http://schema.org/Female',
  '@type': 'http://schema.org/GenderType',
  'rdfs:comment': 'The female gender.',
  'rdfs:label': 'Female'}]

In [12]:
# Collect the rdfs:label of every node typed as rdf:Property.
# `@type` can be a single value or a list (the class cell above already handles
# both), so normalise it instead of comparing against a bare string, which
# would silently skip list-typed nodes.
properties = set()
for node in content_js['@graph']:
    node_type = node['@type']
    node_types = node_type if isinstance(node_type, list) else [node_type]
    if 'rdf:Property' in node_types:
        properties.add(node['rdfs:label'])
print('# Properties:', len(properties))

# Properties: 905


In [13]:
# Map each domain IRI to the list of property IRIs whose
# schema.org/domainIncludes references it.
schema_properties = {}
k_domain = 'http://schema.org/domainIncludes'
for node in content_js['@graph']:
    # `@type` can be a single value or a list; accept rdf:Property in either form.
    node_type = node['@type']
    node_types = node_type if isinstance(node_type, list) else [node_type]
    if 'rdf:Property' not in node_types or k_domain not in node:
        continue
    # domainIncludes is one {'@id': ...} reference or a list of them; treat both
    # uniformly rather than duplicating the bookkeeping per shape.
    domains = node[k_domain]
    if not isinstance(domains, list):
        domains = [domains]
    for domain in domains:
        schema_properties.setdefault(domain['@id'], []).append(node['@id'])

In [14]:
# Print the property IRIs recorded under the CreativeWork domain key.
print(schema_properties['http://schema.org/CreativeWork'])

['http://schema.org/about', 'http://schema.org/accessibilitySummary', 'http://schema.org/educationalAlignment', 'http://schema.org/associatedMedia', 'http://schema.org/funder', 'http://schema.org/audio', 'http://schema.org/provider', 'http://schema.org/encoding', 'http://schema.org/interactivityType', 'http://schema.org/character', 'http://schema.org/audience', 'http://schema.org/sourceOrganization', 'http://schema.org/isPartOf', 'http://schema.org/video', 'http://schema.org/publication', 'http://schema.org/text', 'http://schema.org/expires', 'http://schema.org/contributor', 'http://schema.org/publisher', 'http://schema.org/reviews', 'http://schema.org/typicalAgeRange', 'http://schema.org/position', 'http://schema.org/releasedEvent', 'http://schema.org/contentLocation', 'http://schema.org/schemaVersion', 'http://schema.org/accessibilityFeature', 'http://schema.org/aggregateRating', 'http://schema.org/locationCreated', 'http://schema.org/accessModeSufficient', 'http://schema.org/tempora

In [15]:
# Start the schema from a shallow copy of `classes`: the mapping itself is new,
# but each value is still the same node dict that `classes` holds.
schema = dict(classes)
schema['Dataset']

{'@id': 'http://schema.org/Dataset',
 '@type': 'rdfs:Class',
 'http://purl.org/dc/terms/source': {'@id': 'http://www.w3.org/wiki/WebSchemas/SchemaDotOrgSources#source_DatasetClass'},
 'http://www.w3.org/2002/07/owl#equivalentClass': [{'@id': 'http://rdfs.org/ns/void#Dataset'},
  {'@id': 'http://purl.org/dc/dcmitype/Dataset'},
  {'@id': 'http://www.w3.org/ns/dcat#Dataset'}],
 'rdfs:comment': 'A body of structured information describing some topic(s) of interest.',
 'rdfs:label': 'Dataset',
 'rdfs:subClassOf': {'@id': 'http://schema.org/CreativeWork'}}

## Using PyLD / RDFLib / RDFLib-jsonld

In [17]:
# Compact a document according to a particular context.
# See: https://json-ld.org/spec/latest/json-ld/#compacted-document-form
# Note: PyLD expects the document first and the context second.
# compacted = jsonld.compact(content_js, content_js['@context'])

In [25]:
# Load the vocabulary into RDFLib as JSON-LD.
# Parse the same pinned release (`url`) that the rest of the notebook analyses:
# the `.../latest/...` address changes as Schema.org publishes new versions, so
# the graph would stop matching `content_js` and a re-run would not be reproducible.
g = ConjunctiveGraph()
p = g.parse(url, format='json-ld')
# Show both the conjunctive graph and the value returned by parse().
g, p

(<Graph identifier=N09eaef1312fd4bf093848a3d7dfbe461 (<class 'rdflib.graph.ConjunctiveGraph'>)>,
 <Graph identifier=https://schema.org/version/3.5/schema.jsonld (<class 'rdflib.graph.Graph'>)>)

In [27]:
# Serialise the graph to JSON-LD with a 4-space indent; as the last expression
# of the cell, the complete result is displayed.
g.serialize(format='json-ld', indent=4)

b'[\n    {\n        "@graph": [\n            {\n                "@id": "http://schema.org/isbn",\n                "@type": [\n                    "http://www.w3.org/1999/02/22-rdf-syntax-ns#Property"\n                ],\n                "http://schema.org/domainIncludes": [\n                    {\n                        "@id": "http://schema.org/Book"\n                    }\n                ],\n                "http://schema.org/rangeIncludes": [\n                    {\n                        "@id": "http://schema.org/Text"\n                    }\n                ],\n                "http://www.w3.org/2000/01/rdf-schema#comment": [\n                    {\n                        "@value": "The ISBN of the book."\n                    }\n                ],\n                "http://www.w3.org/2000/01/rdf-schema#label": [\n                    {\n                        "@value": "isbn"\n                    }\n                ],\n                "http://www.w3.org/2000/01/rdf-schema#subPr