# Exploring Schema.org JSON-LD data

In [60]:
import copy
import json
import requests

In [2]:
# Download the Schema.org 3.5 vocabulary as JSON-LD text.
url = 'https://schema.org/version/3.5/schema.jsonld'
# Bound the wait so a stalled connection cannot hang the kernel, and fail
# loudly on an HTTP error instead of handing an error page to the JSON parser.
response = requests.get(url, timeout=30)
response.raise_for_status()
content = response.content.decode('utf8')

In [8]:
# Parse the downloaded JSON-LD text into Python objects.
content_js = json.loads(content)

In [21]:
# Summarise the top-level structure of the parsed document.
print('Keys:', content_js.keys())
for field in ('@id', '@context'):
    print(f'{field}:', content_js[field])

Keys: dict_keys(['@context', '@graph', '@id'])
@id: http://schema.org/#3.5
@context: {'rdf': 'http://www.w3.org/1999/02/22-rdf-syntax-ns#', 'rdfs': 'http://www.w3.org/2000/01/rdf-schema#', 'xsd': 'http://www.w3.org/2001/XMLSchema#'}


In [25]:
# Container type and number of entries of the '@graph' member.
graph = content_js['@graph']
print(type(graph))
print(len(graph))

<class 'list'>
1646


In [38]:
# Union of every key used by any entry of the graph.
keys = {key for node in content_js['@graph'] for key in node}

keys

{'@id',
 '@type',
 'http://purl.org/dc/terms/source',
 'http://schema.org/category',
 'http://schema.org/domainIncludes',
 'http://schema.org/inverseOf',
 'http://schema.org/rangeIncludes',
 'http://schema.org/sameAs',
 'http://schema.org/supersededBy',
 'http://www.w3.org/2002/07/owl#equivalentClass',
 'http://www.w3.org/2002/07/owl#equivalentProperty',
 'http://www.w3.org/2004/02/skos/core#closeMatch',
 'rdfs:comment',
 'rdfs:label',
 'rdfs:subClassOf',
 'rdfs:subPropertyOf'}

In [42]:
# Distinct '@type' values in the graph; an entry's '@type' is either a single
# value or a list of values, so both shapes are folded into one set.
types = set()
for node in content_js['@graph']:
    node_type = node['@type']
    if isinstance(node_type, list):
        types.update(node_type)
    else:
        types.add(node_type)
types

{'http://schema.org/ActionStatusType',
 'http://schema.org/Audience',
 'http://schema.org/BoardingPolicyType',
 'http://schema.org/BookFormatType',
 'http://schema.org/Boolean',
 'http://schema.org/ContactPointOption',
 'http://schema.org/DataType',
 'http://schema.org/DayOfWeek',
 'http://schema.org/DeliveryMethod',
 'http://schema.org/DigitalDocumentPermissionType',
 'http://schema.org/DriveWheelConfigurationValue',
 'http://schema.org/EventStatusType',
 'http://schema.org/GamePlayMode',
 'http://schema.org/GameServerStatus',
 'http://schema.org/GenderType',
 'http://schema.org/ItemAvailability',
 'http://schema.org/ItemListOrderType',
 'http://schema.org/MapCategoryType',
 'http://schema.org/MusicAlbumProductionType',
 'http://schema.org/MusicAlbumReleaseType',
 'http://schema.org/MusicReleaseFormatType',
 'http://schema.org/OfferItemCondition',
 'http://schema.org/OrderStatus',
 'http://schema.org/PaymentStatusType',
 'http://schema.org/ReservationStatusType',
 'http://schema.org/R

In [59]:
# Index every graph entry typed as rdfs:Class by its rdfs:label.
classes = {}
for node in content_js['@graph']:
    node_types = node['@type']
    # Wrap a lone value so the membership test below is an exact match,
    # whether '@type' was a single value or a list.
    if not isinstance(node_types, list):
        node_types = [node_types]
    if 'rdfs:Class' in node_types:
        classes[node['rdfs:label']] = node
print('# Classes:', len(classes))

# Classes: 624


In [37]:
# Display every graph entry whose rdfs:label is exactly 'Dataset'.
for node in content_js['@graph']:
    if node['rdfs:label'] != 'Dataset':
        continue
    display(node)

{'@id': 'http://schema.org/Dataset',
 '@type': 'rdfs:Class',
 'http://purl.org/dc/terms/source': {'@id': 'http://www.w3.org/wiki/WebSchemas/SchemaDotOrgSources#source_DatasetClass'},
 'http://www.w3.org/2002/07/owl#equivalentClass': [{'@id': 'http://rdfs.org/ns/void#Dataset'},
  {'@id': 'http://purl.org/dc/dcmitype/Dataset'},
  {'@id': 'http://www.w3.org/ns/dcat#Dataset'}],
 'rdfs:comment': 'A body of structured information describing some topic(s) of interest.',
 'rdfs:label': 'Dataset',
 'rdfs:subClassOf': {'@id': 'http://schema.org/CreativeWork'}}

In [52]:
# Collect the rdfs:label of every graph entry typed as rdf:Property.
fields = set()
for v in content_js['@graph']:
    # '@type' may be a single value or a list of values; normalise to a list
    # so a property that carries several types is not silently skipped.
    node_types = v['@type'] if isinstance(v['@type'], list) else [v['@type']]
    if 'rdf:Property' in node_types:
        fields.add(v['rdfs:label'])
print('# Properties:', len(fields))

# Properties: 905


In [54]:
# Display the rdf:Property whose label is 'about' (case-insensitive match).
for v in content_js['@graph']:
    # '@type' may be a single value or a list of values; normalise to a list
    # so a property that carries several types is still matched.
    node_types = v['@type'] if isinstance(v['@type'], list) else [v['@type']]
    if 'rdf:Property' in node_types and v['rdfs:label'].lower() == 'about':
        display(v)

{'@id': 'http://schema.org/about',
 '@type': 'rdf:Property',
 'http://purl.org/dc/terms/source': {'@id': 'https://github.com/schemaorg/schemaorg/issues/1670'},
 'http://schema.org/category': 'issue-1670',
 'http://schema.org/domainIncludes': [{'@id': 'http://schema.org/CreativeWork'},
  {'@id': 'http://schema.org/CommunicateAction'},
  {'@id': 'http://schema.org/Event'}],
 'http://schema.org/inverseOf': {'@id': 'http://schema.org/subjectOf'},
 'http://schema.org/rangeIncludes': {'@id': 'http://schema.org/Thing'},
 'rdfs:comment': 'The subject matter of the content.',
 'rdfs:label': 'about'}

In [63]:
# Create the working schema as an independent copy of the class index.
# A deep copy is used because a shallow copy would still share every per-class
# dict with `classes` (and with the parsed graph), so editing a schema entry
# would silently modify the source data as well.
schema = copy.deepcopy(classes)
schema['Dataset']

{'@id': 'http://schema.org/Dataset',
 '@type': 'rdfs:Class',
 'http://purl.org/dc/terms/source': {'@id': 'http://www.w3.org/wiki/WebSchemas/SchemaDotOrgSources#source_DatasetClass'},
 'http://www.w3.org/2002/07/owl#equivalentClass': [{'@id': 'http://rdfs.org/ns/void#Dataset'},
  {'@id': 'http://purl.org/dc/dcmitype/Dataset'},
  {'@id': 'http://www.w3.org/ns/dcat#Dataset'}],
 'rdfs:comment': 'A body of structured information describing some topic(s) of interest.',
 'rdfs:label': 'Dataset',
 'rdfs:subClassOf': {'@id': 'http://schema.org/CreativeWork'}}

In [64]:
# IPython shell escape: print the current working directory of the kernel.
# NOTE(review): the captured output is an absolute local path; consider
# clearing it before sharing the notebook.
!pwd

/mnt/sda1/dev/quansight/notebooks/calpoly
