# Exploring Schema.org JSON-LD data

In [1]:
import copy
import json
import os
import requests
from jinja2 import Template

In [50]:
url = 'https://schema.org/version/3.5/schema.jsonld'
# url = 'https://schema.org/version/latest/all-layers.jsonld'
# KeyError 'rdfs:label' for some properties

content = requests.get(url).content.decode('utf8')

In [51]:
content_js = json.loads(content)

In [52]:
print('Keys:', content_js.keys())
print('@id:', content_js['@id'])
print('@context:', content_js['@context'])

Keys: dict_keys(['@context', '@graph', '@id'])
@id: http://schema.org/#3.5
@context: {'rdf': 'http://www.w3.org/1999/02/22-rdf-syntax-ns#', 'rdfs': 'http://www.w3.org/2000/01/rdf-schema#', 'xsd': 'http://www.w3.org/2001/XMLSchema#'}


In [53]:
print(type(content_js['@graph']))
print(len(content_js['@graph']))

<class 'list'>
1646


In [54]:
primitive_types = (
    'http://schema.org/Text',
    'http://schema.org/URL',
    'http://schema.org/PropertyValue',
    'http://schema.org/Date',
    'http://schema.org/Number',
    'http://schema.org/Float',
    'http://schema.org/Integer',
    'http://schema.org/Time',
    'http://schema.org/DateTime',
    'http://schema.org/Boolean',
    'http://schema.org/DataType',
)

In [55]:
# show graph keys
keys = set()

for v in content_js['@graph']:
    keys |= set(v.keys())

keys

{'@id',
 '@type',
 'http://purl.org/dc/terms/source',
 'http://schema.org/category',
 'http://schema.org/domainIncludes',
 'http://schema.org/inverseOf',
 'http://schema.org/rangeIncludes',
 'http://schema.org/sameAs',
 'http://schema.org/supersededBy',
 'http://www.w3.org/2002/07/owl#equivalentClass',
 'http://www.w3.org/2002/07/owl#equivalentProperty',
 'http://www.w3.org/2004/02/skos/core#closeMatch',
 'rdfs:comment',
 'rdfs:label',
 'rdfs:subClassOf',
 'rdfs:subPropertyOf'}

In [48]:
# show graph types
types = set()
for v in content_js['@graph']:
    t = v['@type'] if isinstance(v['@type'], list) else {v['@type']}
    types |= set(t)
types

{'http://schema.org/ActionStatusType',
 'http://schema.org/Audience',
 'http://schema.org/BoardingPolicyType',
 'http://schema.org/BookFormatType',
 'http://schema.org/Boolean',
 'http://schema.org/ContactPointOption',
 'http://schema.org/DataType',
 'http://schema.org/DayOfWeek',
 'http://schema.org/DeliveryMethod',
 'http://schema.org/DigitalDocumentPermissionType',
 'http://schema.org/DriveWheelConfigurationValue',
 'http://schema.org/EventStatusType',
 'http://schema.org/GamePlayMode',
 'http://schema.org/GameServerStatus',
 'http://schema.org/GenderType',
 'http://schema.org/ItemAvailability',
 'http://schema.org/ItemListOrderType',
 'http://schema.org/MapCategoryType',
 'http://schema.org/MusicAlbumProductionType',
 'http://schema.org/MusicAlbumReleaseType',
 'http://schema.org/MusicReleaseFormatType',
 'http://schema.org/OfferItemCondition',
 'http://schema.org/OrderStatus',
 'http://schema.org/PaymentStatusType',
 'http://schema.org/ReservationStatusType',
 'http://schema.org/R

In [83]:
# show graph rdfs:Class
def get_schemas(graph):
    classes = {}
    for v in graph:
        t = v['@type']
        if not isinstance(t, list):
            t = [t]
        if 'rdfs:Class' not in t:
            continue
        classes[v['@id']] = v
    return classes

graph = content_js['@graph']
classes = get_schemas(graph)
schemas = copy.copy(classes)
print('# Classes:', len(classes))

# Classes: 624


In [89]:
for v in graph:
    if 'supersededBy' in v['rdfs:label']:
        print(v)
        break
    

In [57]:
for k, v in schemas.items():
    if v['@id'].endswith('Enumeration'):
        display(v)

{'@id': 'http://schema.org/Enumeration',
 '@type': 'rdfs:Class',
 'rdfs:comment': 'Lists or enumerations—for example, a list of cuisines or music genres, etc.',
 'rdfs:label': 'Enumeration',
 'rdfs:subClassOf': {'@id': 'http://schema.org/Intangible'}}

In [82]:
# get data about some types
with_superseded_by = {}
for v in content_js['@graph']:
    _k = 'http://schema.org/supersededBy'
    if _k in v:
        with_superseded_by[v['@id']] = v
        
    if 'rdfs:label' not in v:
        continue

    if isinstance(v['rdfs:label'], dict):
        label = v['rdfs:label']['@value']
    else:
        label = v['rdfs:label']
    
    if (
        label == 'Dataset'
        or label.startswith('About')
        or label == 'GenderType'
        or label == 'Enumeration'
    ):
        display(v)
print('=' * 80)
with_superseded_by

{'@id': 'http://schema.org/AboutPage',
 '@type': 'rdfs:Class',
 'rdfs:comment': 'Web page type: About page.',
 'rdfs:label': 'AboutPage',
 'rdfs:subClassOf': {'@id': 'http://schema.org/WebPage'}}

{'@id': 'http://schema.org/Dataset',
 '@type': 'rdfs:Class',
 'http://purl.org/dc/terms/source': {'@id': 'http://www.w3.org/wiki/WebSchemas/SchemaDotOrgSources#source_DatasetClass'},
 'http://www.w3.org/2002/07/owl#equivalentClass': [{'@id': 'http://rdfs.org/ns/void#Dataset'},
  {'@id': 'http://purl.org/dc/dcmitype/Dataset'},
  {'@id': 'http://www.w3.org/ns/dcat#Dataset'}],
 'rdfs:comment': 'A body of structured information describing some topic(s) of interest.',
 'rdfs:label': 'Dataset',
 'rdfs:subClassOf': {'@id': 'http://schema.org/CreativeWork'}}

{'@id': 'http://schema.org/GenderType',
 '@type': 'rdfs:Class',
 'rdfs:comment': 'An enumeration of genders.',
 'rdfs:label': 'GenderType',
 'rdfs:subClassOf': {'@id': 'http://schema.org/Enumeration'},
 '__values': [{'@id': 'http://schema.org/Male',
   '@type': 'http://schema.org/GenderType',
   'rdfs:comment': 'The male gender.',
   'rdfs:label': 'Male'},
  {'@id': 'http://schema.org/Female',
   '@type': 'http://schema.org/GenderType',
   'rdfs:comment': 'The female gender.',
   'rdfs:label': 'Female'}]}

{'@id': 'http://schema.org/Enumeration',
 '@type': 'rdfs:Class',
 'rdfs:comment': 'Lists or enumerations—for example, a list of cuisines or music genres, etc.',
 'rdfs:label': 'Enumeration',
 'rdfs:subClassOf': {'@id': 'http://schema.org/Intangible'}}



{'http://schema.org/UserLikes': {'@id': 'http://schema.org/UserLikes',
  '@type': 'rdfs:Class',
  'http://schema.org/supersededBy': {'@id': 'http://schema.org/InteractionCounter'},
  'rdfs:comment': 'UserInteraction and its subtypes is an old way of talking about users interacting with pages. It is generally better to use <a class="localLink" href="http://schema.org/Action">Action</a>-based vocabulary, alongside types such as <a class="localLink" href="http://schema.org/Comment">Comment</a>.',
  'rdfs:label': 'UserLikes',
  'rdfs:subClassOf': {'@id': 'http://schema.org/UserInteraction'}},
 'http://schema.org/UserPlays': {'@id': 'http://schema.org/UserPlays',
  '@type': 'rdfs:Class',
  'http://schema.org/supersededBy': {'@id': 'http://schema.org/InteractionCounter'},
  'rdfs:comment': 'UserInteraction and its subtypes is an old way of talking about users interacting with pages. It is generally better to use <a class="localLink" href="http://schema.org/Action">Action</a>-based vocabulary

In [59]:
def get_schema_enums(schemas, graph):
    enums = {}

    for k, v in schemas.items():
        if ('rdfs:subClassOf' in v):
            if (
                isinstance(v['rdfs:subClassOf'], dict) 
                and v['rdfs:subClassOf']['@id'] == 'http://schema.org/Enumeration'
            ) or (
                isinstance(v['rdfs:subClassOf'], list) 
                and len([True for item in v['rdfs:subClassOf'] if item['@id'] == 'http://schema.org/Enumeration'])
            ): 
                eid = v['@id']
                enums[eid] = v
                enums[eid]['__values'] = []
                for v2 in graph:
                    if enums[eid]['@id'] in v2['@type']:
                        enums[eid]['__values'].append(v2)
    return enums
enums = get_schema_enums(schemas, graph)
print('# Enums types:', len(enums))
enums['http://schema.org/GenderType']

# Enums types: 29


{'@id': 'http://schema.org/GenderType',
 '@type': 'rdfs:Class',
 'rdfs:comment': 'An enumeration of genders.',
 'rdfs:label': 'GenderType',
 'rdfs:subClassOf': {'@id': 'http://schema.org/Enumeration'},
 '__values': [{'@id': 'http://schema.org/Male',
   '@type': 'http://schema.org/GenderType',
   'rdfs:comment': 'The male gender.',
   'rdfs:label': 'Male'},
  {'@id': 'http://schema.org/Female',
   '@type': 'http://schema.org/GenderType',
   'rdfs:comment': 'The female gender.',
   'rdfs:label': 'Female'}]}

In [60]:
print('# Gender Enum values:')
enums['http://schema.org/GenderType']['__values']

# Gender Enum values:


[{'@id': 'http://schema.org/Male',
  '@type': 'http://schema.org/GenderType',
  'rdfs:comment': 'The male gender.',
  'rdfs:label': 'Male'},
 {'@id': 'http://schema.org/Female',
  '@type': 'http://schema.org/GenderType',
  'rdfs:comment': 'The female gender.',
  'rdfs:label': 'Female'}]

In [68]:
# schema properties domain
def _prepare_types(schemas, types, err=set()):
    if isinstance(types, dict):
        types = [types]
    result = []
    for t in types:
        tid = t['@id']
        if tid not in schemas:
            err |= {tid}
            continue
        result.append(t['@id'])
    return result


def create_schema_properties(schemas, graph):
    schema_properties = {}
    k_domain = 'http://schema.org/domainIncludes'
    for v in graph:
        if v['@type'] == 'rdf:Property' and k_domain in v:
            v_domain = v[k_domain]
            if isinstance(v_domain, list):
                for d in v_domain:
                    id_domain = d['@id']
                    if id_domain not in schema_properties:
                        schema_properties[id_domain] = {}
                    _t = _prepare_types(
                        schemas, 
                        v.get('http://schema.org/rangeIncludes', None)
                    )
                    schema_properties[id_domain].update({v['@id']: _t})
            else:
                id_domain = v_domain['@id']
                if id_domain not in schema_properties:
                    schema_properties[id_domain] = {}
                _t = _prepare_types(
                    schemas, 
                    v.get('http://schema.org/rangeIncludes', None)
                )
                schema_properties[id_domain].update({v['@id']: _t})
    return schema_properties
schema_properties = create_schema_properties(schemas, graph)

In [69]:
display(schema_properties['http://schema.org/supersededBy'])

KeyError: 'http://schema.org/supersededBy'

In [17]:
# create schema js
def _get_properties(schema, properties):
    schema_id = schema['@id']
    properties.update(schema_properties.get(schema_id, {}))
    
    if not 'rdfs:subClassOf' in schema:
        schema['__properties'] = properties
        return
    
    sub_class_of = schema['rdfs:subClassOf']

    if isinstance(sub_class_of, dict):
        sub_class_of = [sub_class_of]

    for c in sub_class_of:
        if c['@id'] not in schemas:
            print('EE', c['@id'])
            continue
        _get_properties(schemas[c['@id']], properties)

    schema['__properties'] = properties
        
        
def get_properties_recursively(schemas, schema=None):
    if schema is None:
        for k, schema in schemas.items():
            schema_id = schema['@id']
            properties = schema_properties.get(schema_id, {})
            _get_properties(schema, properties)
            
    else:
        schema_id = schema['@id']
        properties = schema_properties.get(schema_id, {})
        _get_properties(schema, properties)


get_properties_recursively(schemas)

EE http://schema.org/MedicalBusiness
EE http://schema.org/MedicalBusiness
EE http://schema.org/MedicalBusiness
EE rdfs:Class


In [18]:
display(schemas['http://schema.org/Thing']['__properties'])

{'http://schema.org/sameAs': ['http://schema.org/URL'],
 'http://schema.org/alternateName': ['http://schema.org/Text'],
 'http://schema.org/image': ['http://schema.org/URL',
  'http://schema.org/ImageObject'],
 'http://schema.org/additionalType': ['http://schema.org/URL'],
 'http://schema.org/name': ['http://schema.org/Text'],
 'http://schema.org/identifier': ['http://schema.org/Text',
  'http://schema.org/URL',
  'http://schema.org/PropertyValue'],
 'http://schema.org/subjectOf': ['http://schema.org/CreativeWork',
  'http://schema.org/Event'],
 'http://schema.org/mainEntityOfPage': ['http://schema.org/CreativeWork',
  'http://schema.org/URL'],
 'http://schema.org/url': ['http://schema.org/URL'],
 'http://schema.org/potentialAction': ['http://schema.org/Action'],
 'http://schema.org/description': ['http://schema.org/Text'],
 'http://schema.org/disambiguatingDescription': ['http://schema.org/Text']}

In [19]:
display(schemas['http://schema.org/Place']['__properties'])

{'http://schema.org/priceRange': ['http://schema.org/Text'],
 'http://schema.org/branchOf': ['http://schema.org/Organization'],
 'http://schema.org/paymentAccepted': ['http://schema.org/Text'],
 'http://schema.org/openingHours': ['http://schema.org/Text'],
 'http://schema.org/currenciesAccepted': ['http://schema.org/Text'],
 'http://schema.org/serviceArea': ['http://schema.org/Place',
  'http://schema.org/AdministrativeArea',
  'http://schema.org/GeoShape'],
 'http://schema.org/memberOf': ['http://schema.org/ProgramMembership',
  'http://schema.org/Organization'],
 'http://schema.org/address': ['http://schema.org/Text',
  'http://schema.org/PostalAddress'],
 'http://schema.org/funder': ['http://schema.org/Organization',
  'http://schema.org/Person'],
 'http://schema.org/subOrganization': ['http://schema.org/Organization'],
 'http://schema.org/hasOfferCatalog': ['http://schema.org/OfferCatalog'],
 'http://schema.org/globalLocationNumber': ['http://schema.org/Text'],
 'http://schema.org/

## Converting to GraphQL types

In [20]:
# Schema Data Types -> GraphQL Types
schema_graphql_map = {
    'Text': 'GraphQLString',
    'URL': 'GraphQLString',
    'PropertyValue': 'GraphQLString',
    'Date': 'GraphQLString',
    'Number': 'GraphQLFloat',
    'Float': 'GraphQLFloat',
    'Integer': 'GraphQLInt',
    'Time': 'GraphQLString',
    'DateTime': 'GraphQLString',
    'Boolean': 'GraphQLBoolean',
}

# GraphQL Types
graphql_primitive_types = (
    'GraphQLString',
    'GraphQLInt',
    'GraphQLFloat',
    'GraphQLBoolean',
)

def fix_expected_types(
    types,
    schema_graphql_map=schema_graphql_map,
    graphql_primitive_types=graphql_primitive_types
):
    """fix_expected_types"""
    result = {
        schema_graphql_map[t] if t in schema_graphql_map
        else t
        for t in types
    }
    
    n_total = len(result)
    n_primitive = len([
        None for t in result if t in graphql_primitive_types
    ])
    
    if n_total > 1:
        if n_total == n_primitive:
            result = {'GraphQLString'}
        else:
            result = {
                t for t in result 
                if t not in graphql_primitive_types
            }
    return result


def get_name_from_id(schema_id):
    return schema_id.split('/')[-1]


def get_graphql_type_names(schemas, types):
    result = []
    for t in types:
        if t in schemas:
            result.append(schemas[t]['rdfs:label'])
        else:
            _type_name = get_name_from_id(t)
            # print('EE', t, _type_name)
            result.append(_type_name)
    return fix_expected_types(result)


get_graphql_type_names(
    schemas, {
        'http://schema.org/DataFeedItem',
        'http://schema.org/Text',
        'http://schema.org/Thing'}
)

{'DataFeedItem', 'Thing'}

In [21]:
# TODO: convert to the new structure

def get_union_types(schemas, schema_properties):
    schema_properties = copy.copy(schema_properties)
    
    result = {}
    for schema_id, v in schema_properties.items():
        schema_name = get_name_from_id(schema_id) 
        _types = get_graphql_type_names(schemas, v)
        if len(_types) <= 1:
            continue
        result[schema_name] = _types
    return result


get_union_types(schemas, schemas['http://schema.org/Dataset']['__properties'])

{'dataFeedElement': {'DataFeedItem', 'Thing'},
 'funder': {'Organization', 'Person'},
 'audio': {'AudioObject', 'Clip'},
 'provider': {'Organization', 'Person'},
 'video': {'Clip', 'VideoObject'},
 'contributor': {'Organization', 'Person'},
 'publisher': {'Organization', 'Person'},
 'copyrightHolder': {'Organization', 'Person'},
 'author': {'Organization', 'Person'},
 'isBasedOnUrl': {'CreativeWork', 'Product'},
 'translator': {'Organization', 'Person'},
 'isBasedOn': {'CreativeWork', 'Product'},
 'creator': {'Organization', 'Person'},
 'sponsor': {'Organization', 'Person'},
 'producer': {'Organization', 'Person'},
 'subjectOf': {'CreativeWork', 'Event'}}

In [22]:
def get_schemas_union_types(schemas):
    union_types = {}

    for schema_name, schema_detail in schemas.items():
        if '__properties' not in schema_detail:
            # Schema.org Primitive Data types don't have properties
            # print('EE', schema_name)
            continue
        union_types.update(get_union_types(schemas, schema_detail['__properties']))
    return union_types


str(get_schemas_union_types(schemas))[:1000]

"{'funder': {'Organization', 'Person'}, 'performers': {'Organization', 'Person'}, 'contributor': {'Organization', 'Person'}, 'organizer': {'Organization', 'Person'}, 'attendees': {'Organization', 'Person'}, 'attendee': {'Organization', 'Person'}, 'translator': {'Organization', 'Person'}, 'sponsor': {'Organization', 'Person'}, 'location': {'Place', 'PostalAddress'}, 'composer': {'Organization', 'Person'}, 'performer': {'Organization', 'Person'}, 'subjectOf': {'CreativeWork', 'Event'}, 'serviceArea': {'Place', 'GeoShape', 'AdministrativeArea'}, 'memberOf': {'ProgramMembership', 'Organization'}, 'members': {'Organization', 'Person'}, 'member': {'Organization', 'Person'}, 'owns': {'Product', 'OwnershipInfo'}, 'brand': {'Brand', 'Organization'}, 'areaServed': {'Place', 'GeoShape', 'AdministrativeArea'}, 'photo': {'ImageObject', 'Photograph'}, 'photos': {'ImageObject', 'Photograph'}, 'geo': {'GeoShape', 'GeoCoordinates'}, 'provider': {'Organization', 'Person'}, 'broker': {'Organization', 'Pe

In [23]:
def get_schemas_type_defs(schemas, err=set()):
    type_defs = {}
    
    for schema_id, schema_detail in schemas.items():
        if '__properties' not in schema_detail:
            continue
        # skip for primitive fields
        if schema_id in primitive_types:
            continue
        schema_name = get_name_from_id(schema_id)
        type_defs[schema_name] = {}
        
        for prop_id, prop_defs in schema_detail['__properties'].items():
            prop_name = properties[prop_id]
            _types = get_graphql_type_names(schemas, prop_defs)
            if len(_types) > 1:
                type_defs[schema_name][prop_name] = prop_name
            else:
                try:
                    type_defs[schema_name][prop_name] = list(_types)[0]
                except:
                    err |= {prop_name}
    return type_defs


err = set()
type_defs = get_schemas_type_defs(schemas, err)['Dataset']
print('EE', '=' * 77)
display(err)
print('Result', '=' * 73)
display(type_defs)



{'cssSelector', 'xpath'}



{'dataFeedElement': 'dataFeedElement',
 'includedInDataCatalog': 'DataCatalog',
 'datasetTimeInterval': 'GraphQLString',
 'issn': 'GraphQLString',
 'catalog': 'DataCatalog',
 'includedDataCatalog': 'DataCatalog',
 'distribution': 'DataDownload',
 'about': 'Thing',
 'accessibilitySummary': 'GraphQLString',
 'educationalAlignment': 'AlignmentObject',
 'associatedMedia': 'MediaObject',
 'funder': 'funder',
 'audio': 'audio',
 'provider': 'provider',
 'encoding': 'MediaObject',
 'interactivityType': 'GraphQLString',
 'character': 'Person',
 'audience': 'Audience',
 'sourceOrganization': 'Organization',
 'isPartOf': 'CreativeWork',
 'video': 'video',
 'publication': 'PublicationEvent',
 'text': 'GraphQLString',
 'expires': 'GraphQLString',
 'contributor': 'contributor',
 'publisher': 'publisher',
 'reviews': 'Review',
 'typicalAgeRange': 'GraphQLString',
 'position': 'GraphQLString',
 'releasedEvent': 'PublicationEvent',
 'contentLocation': 'Place',
 'schemaVersion': 'GraphQLString',
 'acce

In [24]:
def create_schema_type(schemas):
    with open('templates/template_type.txt', 'r') as f:
        template_type = Template(f.read())

    type_defs = get_schemas_type_defs(schemas)
    union_types = get_schemas_union_types(schemas)
    enum_defs = get_schema_enums(schemas)

    result = template_type.render(
        type_defs=type_defs,
        union_types=union_types,
        enum_defs=enum_defs
    )

    return result.replace('\n\n', '\n')

result = create_schema_type(schemas)

print(result[:500] + '\n...\n' + result[-20000:-19500], '\n...\n')

TypeError: get_schema_enums() missing 1 required positional argument: 'graph'

In [None]:
def create_union_input_types(schemas):
    with open('templates/template_union_input.txt', 'r') as f:
        template_union_input = Template(f.read())
    
    type_defs = get_schemas_type_defs(schemas)
    union_types = get_schemas_union_types(schemas)
    enum_defs = get_schema_enums(schemas)
    
    result = template_union_input.render(
        input_types=union_types,
        type_defs=type_defs,
        enum_defs=enum_defs
    )
    return result


result = create_union_input_types(schemas)
print(result[:500] + '\n...\n' + result[-250:])

In [None]:
def create_enum_types(schemas):
    with open('templates/template_enum.txt', 'r') as f:
        template_enum = Template(f.read())
    
    enum_defs = get_schema_enums(schemas)
    
    result = template_enum.render(
        enum_defs=enum_defs,
    )
    return result


result = create_enum_types(schemas)
print(result[:500] + '\n...\n' + result[-250:])

In [None]:
path = os.path.join(
    os.environ['HOME'],
    'dev/quansight/tmp/schemas'
)
os.makedirs(path, exist_ok=True)

with open(os.path.join(path, 'schemaorg-typedef.js'), 'w') as f:
    f.write(create_schema_type(schemas))
    
with open(os.path.join(path, 'schemaorg-input.js'), 'w') as f:
    f.write(create_union_input_types(schemas))
    
with open(os.path.join(path, 'schemaorg-enum.js'), 'w') as f:
    f.write(create_enum_types(schemas))

In [None]:
!cp {path}/schemaorg-*.js ~/dev/quansight/calpoly-project/jupyterlab-metadata-service/backend/jupyterlab_metadata_service_server/src/schemas