# Exploring Schema.org JSON-LD data

In [43]:
import copy
import json
import requests
from jinja2 import Template

In [2]:
url = 'https://schema.org/version/3.5/schema.jsonld'
content = requests.get(url).content.decode('utf8')

In [3]:
content_js = json.loads(content)

In [4]:
print('Keys:', content_js.keys())
print('@id:', content_js['@id'])
print('@context:', content_js['@context'])

Keys: dict_keys(['@context', '@graph', '@id'])
@id: http://schema.org/#3.5
@context: {'rdf': 'http://www.w3.org/1999/02/22-rdf-syntax-ns#', 'rdfs': 'http://www.w3.org/2000/01/rdf-schema#', 'xsd': 'http://www.w3.org/2001/XMLSchema#'}


In [5]:
print(type(content_js['@graph']))
print(len(content_js['@graph']))

<class 'list'>
1646


In [6]:
# show graph keys
keys = set()

for v in content_js['@graph']:
    keys |= set(v.keys())

keys

{'@id',
 '@type',
 'http://purl.org/dc/terms/source',
 'http://schema.org/category',
 'http://schema.org/domainIncludes',
 'http://schema.org/inverseOf',
 'http://schema.org/rangeIncludes',
 'http://schema.org/sameAs',
 'http://schema.org/supersededBy',
 'http://www.w3.org/2002/07/owl#equivalentClass',
 'http://www.w3.org/2002/07/owl#equivalentProperty',
 'http://www.w3.org/2004/02/skos/core#closeMatch',
 'rdfs:comment',
 'rdfs:label',
 'rdfs:subClassOf',
 'rdfs:subPropertyOf'}

In [7]:
# show graph types
types = set()
for v in content_js['@graph']:
    t = v['@type'] if isinstance(v['@type'], list) else {v['@type']}
    types |= set(t) 
    if 'http://schema.org/GenderType' in t:
        display(v)
types

{'@id': 'http://schema.org/Male',
 '@type': 'http://schema.org/GenderType',
 'rdfs:comment': 'The male gender.',
 'rdfs:label': 'Male'}

{'@id': 'http://schema.org/Female',
 '@type': 'http://schema.org/GenderType',
 'rdfs:comment': 'The female gender.',
 'rdfs:label': 'Female'}

{'http://schema.org/ActionStatusType',
 'http://schema.org/Audience',
 'http://schema.org/BoardingPolicyType',
 'http://schema.org/BookFormatType',
 'http://schema.org/Boolean',
 'http://schema.org/ContactPointOption',
 'http://schema.org/DataType',
 'http://schema.org/DayOfWeek',
 'http://schema.org/DeliveryMethod',
 'http://schema.org/DigitalDocumentPermissionType',
 'http://schema.org/DriveWheelConfigurationValue',
 'http://schema.org/EventStatusType',
 'http://schema.org/GamePlayMode',
 'http://schema.org/GameServerStatus',
 'http://schema.org/GenderType',
 'http://schema.org/ItemAvailability',
 'http://schema.org/ItemListOrderType',
 'http://schema.org/MapCategoryType',
 'http://schema.org/MusicAlbumProductionType',
 'http://schema.org/MusicAlbumReleaseType',
 'http://schema.org/MusicReleaseFormatType',
 'http://schema.org/OfferItemCondition',
 'http://schema.org/OrderStatus',
 'http://schema.org/PaymentStatusType',
 'http://schema.org/ReservationStatusType',
 'http://schema.org/R

In [8]:
# show graph rdfs:Class
classes = {}
for v in content_js['@graph']:
    t = v['@type']
    if not isinstance(t, list):
        t = [t]
    if 'rdfs:Class' not in t:
        continue
    classes[v['@id']] = v
print('# Classes:', len(classes))

# Classes: 624


In [9]:
# get data about some types
for v in content_js['@graph']:
    if (
        v['rdfs:label'] == 'Dataset'
        or v['rdfs:label'].startswith('About')
        or v['rdfs:label'] == 'GenderType'
    ):
        display(v)

{'@id': 'http://schema.org/AboutPage',
 '@type': 'rdfs:Class',
 'rdfs:comment': 'Web page type: About page.',
 'rdfs:label': 'AboutPage',
 'rdfs:subClassOf': {'@id': 'http://schema.org/WebPage'}}

{'@id': 'http://schema.org/Dataset',
 '@type': 'rdfs:Class',
 'http://purl.org/dc/terms/source': {'@id': 'http://www.w3.org/wiki/WebSchemas/SchemaDotOrgSources#source_DatasetClass'},
 'http://www.w3.org/2002/07/owl#equivalentClass': [{'@id': 'http://rdfs.org/ns/void#Dataset'},
  {'@id': 'http://purl.org/dc/dcmitype/Dataset'},
  {'@id': 'http://www.w3.org/ns/dcat#Dataset'}],
 'rdfs:comment': 'A body of structured information describing some topic(s) of interest.',
 'rdfs:label': 'Dataset',
 'rdfs:subClassOf': {'@id': 'http://schema.org/CreativeWork'}}

{'@id': 'http://schema.org/GenderType',
 '@type': 'rdfs:Class',
 'rdfs:comment': 'An enumeration of genders.',
 'rdfs:label': 'GenderType',
 'rdfs:subClassOf': {'@id': 'http://schema.org/Enumeration'}}

In [10]:
enums = {}

for v in content_js['@graph']:
    if ('rdfs:subClassOf' in v):
        if (
            isinstance(v['rdfs:subClassOf'], dict) 
            and v['rdfs:subClassOf']['@id'] == 'http://schema.org/Enumeration'
        ) or (
            isinstance(v['rdfs:subClassOf'], list) 
            and len([True for item in v['rdfs:subClassOf'] if item['@id'] == 'http://schema.org/Enumeration'])
        ): 
            enums[v['rdfs:label']] = v
print('# Enums types:', len(enums))

# Enums types: 29


In [55]:
for k, e in enums.items():
    e['__values'] = []
    for v in content_js['@graph']:
        if e['@id'] in v['@type']:
            e['__values'].append(v)
print('# Gender Enum values:')
enums['GenderType']['__values']

# Gender Enum values:


[{'@id': 'http://schema.org/Male',
  '@type': 'http://schema.org/GenderType',
  'rdfs:comment': 'The male gender.',
  'rdfs:label': 'Male'},
 {'@id': 'http://schema.org/Female',
  '@type': 'http://schema.org/GenderType',
  'rdfs:comment': 'The female gender.',
  'rdfs:label': 'Female'}]

In [58]:
# get properties
properties = {}
for v in content_js['@graph']:
    if v['@type'] == 'rdf:Property':
        properties[v['@id']] = v['rdfs:label']
print('# Properties:', len(properties))
str(properties)[:1000]

# Properties: 905


"{'http://schema.org/downloadUrl': 'downloadUrl', 'http://schema.org/pagination': 'pagination', 'http://schema.org/gameItem': 'gameItem', 'http://schema.org/departureStation': 'departureStation', 'http://schema.org/musicCompositionForm': 'musicCompositionForm', 'http://schema.org/result': 'result', 'http://schema.org/isbn': 'isbn', 'http://schema.org/occupationLocation': 'occupationLocation', 'http://schema.org/about': 'about', 'http://schema.org/accessibilitySummary': 'accessibilitySummary', 'http://schema.org/serviceArea': 'serviceArea', 'http://schema.org/memberOf': 'memberOf', 'http://schema.org/nextItem': 'nextItem', 'http://schema.org/recordingOf': 'recordingOf', 'http://schema.org/geoWithin': 'geoWithin', 'http://schema.org/audienceType': 'audienceType', 'http://schema.org/broadcastFrequency': 'broadcastFrequency', 'http://schema.org/appliesToPaymentMethod': 'appliesToPaymentMethod', 'http://schema.org/interactionService': 'interactionService', 'http://schema.org/targetDescripti

In [28]:
# schema properties domain
def _prepare_types(types):
    if isinstance(types, dict):
        types = [types]
    result = []
    for t in types:
        result.append(t['@id'])
    return result


schema_properties = {}
k_domain = 'http://schema.org/domainIncludes'
for v in content_js['@graph']:
    if v['@type'] == 'rdf:Property' and k_domain in v:
        v_domain = v[k_domain]
        if isinstance(v_domain, list):
            for d in v_domain:
                id_domain = d['@id']
                if id_domain not in schema_properties:
                    schema_properties[id_domain] = {}
                _t = _prepare_types(v.get('http://schema.org/rangeIncludes', None))
                schema_properties[id_domain].update({v['@id']: _t})
        else:
            id_domain = v_domain['@id']
            if id_domain not in schema_properties:
                schema_properties[id_domain] = {}
            _t = _prepare_types(v.get('http://schema.org/rangeIncludes', None))
            schema_properties[id_domain].update({v['@id']: _t})

In [29]:
display(schema_properties['http://schema.org/CreativeWork'])

{'http://schema.org/about': ['http://schema.org/Thing'],
 'http://schema.org/accessibilitySummary': ['http://schema.org/Text'],
 'http://schema.org/educationalAlignment': ['http://schema.org/AlignmentObject'],
 'http://schema.org/associatedMedia': ['http://schema.org/MediaObject'],
 'http://schema.org/funder': ['http://schema.org/Organization',
  'http://schema.org/Person'],
 'http://schema.org/audio': ['http://schema.org/Clip',
  'http://schema.org/AudioObject'],
 'http://schema.org/provider': ['http://schema.org/Organization',
  'http://schema.org/Person'],
 'http://schema.org/encoding': ['http://schema.org/MediaObject'],
 'http://schema.org/interactivityType': ['http://schema.org/Text'],
 'http://schema.org/character': ['http://schema.org/Person'],
 'http://schema.org/audience': ['http://schema.org/Audience'],
 'http://schema.org/sourceOrganization': ['http://schema.org/Organization'],
 'http://schema.org/isPartOf': ['http://schema.org/CreativeWork'],
 'http://schema.org/video': ['h

In [59]:
# create schema js
def _get_properties(schema, properties):
    schema_id = schema['@id']
    properties.update(schema_properties.get(schema_id, {}))
    
    if not 'rdfs:subClassOf' in schema:
        return
    
    sub_class_of = schema['rdfs:subClassOf']

    if isinstance(sub_class_of, dict):
        sub_class_of = [sub_class_of]

    for c in sub_class_of:
        if c['@id'] not in schemas:
            print('EE', c['@id'])
            continue
        _get_properties(schemas[c['@id']], properties)

    schema['__properties'] = properties
        
        
def get_properties_recursively(schemas, schema=None):
    if schema is None:
        for k, schema in schemas.items():
            schema_id = schema['@id']
            properties = schema_properties.get(schema_id, {})
            _get_properties(schema, properties)
            
    else:
        schema_id = schema['@id']
        properties = schema_properties.get(schema_id, {})
        _get_properties(schema, properties)

schemas = copy.copy(classes)
get_properties_recursively(schemas)

EE http://schema.org/MedicalBusiness
EE http://schema.org/MedicalBusiness
EE http://schema.org/MedicalBusiness
EE rdfs:Class


In [60]:
display(schemas['http://schema.org/Dataset']['__properties'])

{'http://schema.org/dataFeedElement': ['http://schema.org/Thing',
  'http://schema.org/Text',
  'http://schema.org/DataFeedItem'],
 'http://schema.org/includedInDataCatalog': ['http://schema.org/DataCatalog'],
 'http://schema.org/datasetTimeInterval': ['http://schema.org/DateTime'],
 'http://schema.org/issn': ['http://schema.org/Text'],
 'http://schema.org/catalog': ['http://schema.org/DataCatalog'],
 'http://schema.org/includedDataCatalog': ['http://schema.org/DataCatalog'],
 'http://schema.org/distribution': ['http://schema.org/DataDownload'],
 'http://schema.org/about': ['http://schema.org/Thing'],
 'http://schema.org/accessibilitySummary': ['http://schema.org/Text'],
 'http://schema.org/educationalAlignment': ['http://schema.org/AlignmentObject'],
 'http://schema.org/associatedMedia': ['http://schema.org/MediaObject'],
 'http://schema.org/funder': ['http://schema.org/Organization',
  'http://schema.org/Person'],
 'http://schema.org/audio': ['http://schema.org/Clip',
  'http://schem

In [61]:
display(schemas['http://schema.org/Person']['__properties'])

{'http://schema.org/memberOf': ['http://schema.org/ProgramMembership',
  'http://schema.org/Organization'],
 'http://schema.org/address': ['http://schema.org/Text',
  'http://schema.org/PostalAddress'],
 'http://schema.org/spouse': ['http://schema.org/Person'],
 'http://schema.org/funder': ['http://schema.org/Organization',
  'http://schema.org/Person'],
 'http://schema.org/colleagues': ['http://schema.org/Person'],
 'http://schema.org/deathDate': ['http://schema.org/Date'],
 'http://schema.org/height': ['http://schema.org/Distance',
  'http://schema.org/QuantitativeValue'],
 'http://schema.org/workLocation': ['http://schema.org/Place',
  'http://schema.org/ContactPoint'],
 'http://schema.org/children': ['http://schema.org/Person'],
 'http://schema.org/hasOccupation': ['http://schema.org/Occupation'],
 'http://schema.org/jobTitle': ['http://schema.org/Text'],
 'http://schema.org/hasOfferCatalog': ['http://schema.org/OfferCatalog'],
 'http://schema.org/deathPlace': ['http://schema.org/P

## Converting to GraphQL types

In [131]:
# Schema Data Types -> GraphQL Types
schema_graphql_map = {
    'Text': 'GraphQLString',
    'URL': 'GraphQLString',
    'PropertyValue': 'GraphQLString',
    'Date': 'GraphQLString',
    'Number': 'GraphQLFloat',
    'Float': 'GraphQLFloat',
    'Integer': 'GraphQLInt',
    'Time': 'GraphQLString',
    'DateTime': 'GraphQLString',
    'Boolean': 'GraphQLBoolean',
    'CssSelectorType': 'GraphQLString',
    'XPathType': 'GraphQLString',
}

# GraphQL Types
graphql_primitive_types = (
    'GraphQLString',
    'GraphQLInt',
    'GraphQLFloat',
    'GraphQLBoolean',
)

def fix_expected_types(
    types,
    schema_graphql_map=schema_graphql_map,
    graphql_primitive_types=graphql_primitive_types
):
    """fix_expected_types"""
    result = {
        schema_graphql_map[t] if t in schema_graphql_map
        else t
        for t in types
    }
    
    n_total = len(result)
    n_primitive = len([
        None for t in result if t in graphql_primitive_types
    ])
    
    if n_total > 1:
        if n_total == n_primitive:
            result = {'GraphQLString'}
        else:
            result = {
                t for t in result 
                if t not in graphql_primitive_types
            }
    return result


def get_name_from_id(schema_id):
    return schema_id.split('/')[-1]


def get_graphql_type_names(schemas, types):
    result = []
    for t in types:
        if t in schemas:
            result.append(schemas[t]['rdfs:label'])
        else:
            _type_name = get_name_from_id(t)
            # print('EE', t, _type_name)
            result.append(_type_name)
    return fix_expected_types(result)


get_graphql_type_names(
    schemas, {
        'http://schema.org/DataFeedItem',
        'http://schema.org/Text',
        'http://schema.org/Thing'}
)

{'DataFeedItem', 'Thing'}

In [132]:
# TODO: convert to the new structure

def get_union_types(schemas, schema_properties):
    schema_properties = copy.copy(schema_properties)
    
    result = {}
    for schema_id, v in schema_properties.items():
        schema_name = get_name_from_id(schema_id) 
        if len(v) > 1:
            result[schema_name] = schema_name
        else:
            result[schema_name] = list(get_graphql_type_names(schemas, v))[0]
    return result


get_union_types(schemas, schemas['http://schema.org/Dataset']['__properties'])

{'dataFeedElement': 'dataFeedElement',
 'includedInDataCatalog': 'DataCatalog',
 'datasetTimeInterval': 'GraphQLString',
 'issn': 'GraphQLString',
 'catalog': 'DataCatalog',
 'includedDataCatalog': 'DataCatalog',
 'distribution': 'DataDownload',
 'about': 'Thing',
 'accessibilitySummary': 'GraphQLString',
 'educationalAlignment': 'AlignmentObject',
 'associatedMedia': 'MediaObject',
 'funder': 'funder',
 'audio': 'audio',
 'provider': 'provider',
 'encoding': 'MediaObject',
 'interactivityType': 'GraphQLString',
 'character': 'Person',
 'audience': 'Audience',
 'sourceOrganization': 'Organization',
 'isPartOf': 'CreativeWork',
 'video': 'video',
 'publication': 'PublicationEvent',
 'text': 'GraphQLString',
 'expires': 'GraphQLString',
 'contributor': 'contributor',
 'publisher': 'publisher',
 'reviews': 'Review',
 'typicalAgeRange': 'GraphQLString',
 'position': 'position',
 'releasedEvent': 'PublicationEvent',
 'contentLocation': 'Place',
 'schemaVersion': 'schemaVersion',
 'accessibi

In [133]:
def get_schemas_union_types(schemas):
    union_types = {}

    for schema_name, schema_detail in schemas.items():
        if '__properties' not in schema_detail:
            # Schema.org Primitive Data types don't have properties
            # print('EE', schema_name)
            continue
        union_types.update(get_union_types(schemas, schema_detail['__properties']))
    return union_types


str(get_schemas_union_types(schemas))[:1000]

"{'about': 'Thing', 'funder': 'funder', 'workFeatured': 'CreativeWork', 'audience': 'Audience', 'remainingAttendeeCapacity': 'GraphQLInt', 'actor': 'Person', 'performers': 'performers', 'endDate': 'endDate', 'doorTime': 'GraphQLString', 'contributor': 'contributor', 'maximumAttendeeCapacity': 'GraphQLInt', 'typicalAgeRange': 'GraphQLString', 'organizer': 'organizer', 'attendees': 'attendees', 'aggregateRating': 'AggregateRating', 'subEvent': 'Event', 'subEvents': 'Event', 'offers': 'Offer', 'inLanguage': 'inLanguage', 'attendee': 'attendee', 'workPerformed': 'CreativeWork', 'eventStatus': 'EventStatusType', 'startDate': 'startDate', 'director': 'Person', 'superEvent': 'Event', 'duration': 'Duration', 'translator': 'translator', 'previousStartDate': 'GraphQLString', 'review': 'Review', 'sponsor': 'sponsor', 'location': 'location', 'recordedIn': 'CreativeWork', 'composer': 'composer', 'isAccessibleForFree': 'GraphQLBoolean', 'performer': 'performer', 'sameAs': 'GraphQLString', 'alternate

In [134]:
def get_schemas_type_defs(schemas):
    type_defs = {}
    
    for schema_name, schema_detail in schemas.items():
        if '__properties' not in schema_detail:
            continue
        type_defs[schema_name] = {}
        
        for prop_id, prop_defs in schema_detail['__properties'].items():
            prop_name = properties[prop_id]
            type_defs[schema_name][prop_name] = get_graphql_type_names(schemas, prop_defs)
    return type_defs

str(get_schemas_type_defs(schemas))[:1000]

"{'http://schema.org/UserLikes': {'about': {'Thing'}, 'funder': {'Organization', 'Person'}, 'workFeatured': {'CreativeWork'}, 'audience': {'Audience'}, 'remainingAttendeeCapacity': {'GraphQLInt'}, 'actor': {'Person'}, 'performers': {'Organization', 'Person'}, 'endDate': {'GraphQLString'}, 'doorTime': {'GraphQLString'}, 'contributor': {'Organization', 'Person'}, 'maximumAttendeeCapacity': {'GraphQLInt'}, 'typicalAgeRange': {'GraphQLString'}, 'organizer': {'Organization', 'Person'}, 'attendees': {'Organization', 'Person'}, 'aggregateRating': {'AggregateRating'}, 'subEvent': {'Event'}, 'subEvents': {'Event'}, 'offers': {'Offer'}, 'inLanguage': {'Language'}, 'attendee': {'Organization', 'Person'}, 'workPerformed': {'CreativeWork'}, 'eventStatus': {'EventStatusType'}, 'startDate': {'GraphQLString'}, 'director': {'Person'}, 'superEvent': {'Event'}, 'duration': {'Duration'}, 'translator': {'Organization', 'Person'}, 'previousStartDate': {'GraphQLString'}, 'review': {'Review'}, 'sponsor': {'Or

In [135]:
def create_schema_type(schemas):
    with open('templates/template_type.txt', 'r') as f:
        template_type = Template(f.read())

    type_defs = get_schemas_type_defs(schemas)
    union_types = get_schemas_union_types(schemas)

    result = template_type.render(
        type_defs=type_defs.items(),
        union_types=union_types.items()
    )

    return result.replace('\n\n', '\n')

result = create_schema_type(schemas)

print(result[:500] + '\n...\n' + result[-250:])

ValueError: too many values to unpack (expected 2)

In [36]:
def create_union_input_types(schemas):
    type_defs = get_schemas_type_defs(schemas)
    union_types = get_schemas_union_types(schemas)
    
    result = template_union_input.render(
        input_types=union_types.items(),
        type_defs=type_defs.items()
    )
    return result


print(create_union_input_types(schemas)[:1500] + ' ...')


path = '/home/xmn/dev/quansight/tmp/schemas'
# path = '/tmp/jupyterlab-metadata-service/backend/jupyterlab_metadata_service_server/src/schemas'
os.makedirs(path, exist_ok=True)

with open(os.path.join(path, 'schemaorg-typedef.js'), 'w') as f:
    f.write(create_schema_type(schemas))
    
with open(os.path.join(path, 'schemaorg-input.js'), 'w') as f:
    f.write(create_union_input_types(schemas))

KeyError: 'Expected Type'