# Generate LinkML YAML from an OWL ontology

Using [OwlReady2](https://owlready2.readthedocs.io/en/latest/index.html) package. 

Optionally you can define the Notebook parameters for [papermill](https://papermill.readthedocs.io/en/latest/usage-parameterize.html)

In [1]:
# Papermill parameters. Do not delete this cell.
ontology_uri = 'https://raw.githubusercontent.com/MaastrichtU-IDS/semanticscience/master/ontology/sio/release/sio-subset-labels.owl'
base_uri = 'http://semanticscience.org/resource/'
# ontology_uri = 'https://raw.githubusercontent.com/MaastrichtU-IDS/semanticscience/master/ontology/sio/release/sio-subset-labels.owl'

Import the library and define the local `ontologies` folder. If an URL is given, first searches for a local copy of the OWL file and, if not found, tries to download it from the Internet.

In [2]:
from owlready2 import *
import types
global onto 
# Download the ontology locally:
import os
os.system('wget -Nq ' + ontology_uri)



0

### Analyze the ontology with `Owlready2`

In [3]:
onto = get_ontology('sio-subset-labels.owl').load()
# onto = get_ontology(ontology_uri).load()

In [4]:
# https://owlready2.readthedocs.io/en/latest/onto.html#accessing-the-content-of-an-ontology
print(len(list(onto.classes())), 'classes')
print(len(list(onto.properties())), 'properties')
print(len(list(onto.object_properties())), 'object properties')
print(len(list(onto.data_properties())), 'data properties', list(onto.data_properties()))
print(len(list(onto.annotation_properties())), 'annotation properties')
print(len(list(onto.individuals())), 'individuals')
print(onto.base_iri)
for key in onto._namespaces:
    print(key)
# print(onto.get_namespace())

1570 classes
237 properties
211 object properties
1 data properties [resource.hasValue]
25 annotation properties
0 individuals
http://semanticscience.org/ontology/sio.owl#
sio-subset-labels.owl#
http://semanticscience.org/ontology/sio.owl#
http://semanticscience.org/resource/
http://protege.stanford.edu/plugins/owl/protege#
http://purl.org/dc/elements/1.1/
http://purl.org/dc/terms/
http://purl.org/spar/cito/
http://purl.org/vocab/vann/
http://schema.org/
http://www.w3.org/2004/02/skos/core#
http://xmlns.com/foaf/0.1/


### Analyze the ontology with `RDFLib`

In [5]:
import rdflib
from rdflib import Graph, ConjunctiveGraph, Literal, RDF, URIRef, Namespace
from rdflib.namespace import RDFS, XSD, DC, DCTERMS, VOID, OWL, SKOS

g = rdflib.Graph()
result = g.parse('sio-subset-labels.owl', format='xml')
# result = g.parse(ontology_uri, format='xml')

In [6]:
print(len(list(g.subjects(RDF.type, OWL.Class))), 'classes')
print(len(list(g.subjects(RDF.type, OWL.ObjectProperty))), 'object properties')
print(len(list(g.subjects(RDF.type, OWL.AnnotationProperty))), 'annotation properties')
print(len(list(g.subjects(RDF.type, OWL.DataProperty))), 'data properties')
print(len(list(g.subjects(RDF.type, OWL.Individuals))), 'individuals')

1711 classes
211 object properties
25 annotation properties
0 data properties
0 individuals


In [7]:
# Checkout the prefixes/namespaces
linkml_prefixes = {
    'linkml': 'https://w3id.org/linkml/'
}
for ns_prefix, namespace in g.namespaces():
    if str(namespace) == 'http://schema.org/':
        ns_prefix = 'schema'
    if str(namespace) == 'http://www.w3.org/2004/02/skos/core#':
        ns_prefix = 'skosvocab'
    linkml_prefixes[ns_prefix] = str(namespace)

In [8]:
## CLASSES
linkml_classes = {}
for owl_class in g.subjects(RDF.type, OWL.Class):
    if not str(owl_class).startswith(base_uri):
        # Avoid blank nodes
        continue

    linkml_id = str(owl_class).replace(base_uri, '')

    linkml_classes[linkml_id] = {
        'description': str(g.value(owl_class, DCTERMS.description)),
    }

    # Get 1 parent class (if many)
    parent_class = None
    parents = g.objects(owl_class, RDFS.subClassOf)
    
    if parents:
        for parent in parents:
            if not str(parent).startswith(base_uri):
                # Avoid blank nodes
                continue
            if 'is_a' in linkml_classes[linkml_id].keys(): 
                if not 'see_also' in linkml_classes[linkml_id].keys(): 
                    linkml_classes[linkml_id]['see_also'] = []
                linkml_classes[linkml_id]['see_also'].append(str(parent).replace(base_uri, ''))
            else:
                linkml_classes[linkml_id]['is_a'] = str(parent).replace(base_uri, '')
            # TODO: currently we just take the first parent class we find
            # break

In [9]:
## SLOTS
linkml_slots = {}
for obj_prop in g.subjects(RDF.type, OWL.ObjectProperty):
    if not str(obj_prop).startswith(base_uri):
        # Avoid blank nodes
        continue

    prop_id = str(obj_prop).replace(base_uri, '')
    if prop_id == 'isTransitivelyRelatedTo':
        # Skip problematic properties
        continue
    linkml_slots[prop_id] = {}

    description = g.value(obj_prop, DCTERMS.description)
    if description:
        linkml_slots[prop_id]['description'] = str(description)
    domain = g.value(obj_prop, RDFS.domain)
    if domain and str(domain).startswith(base_uri):
        linkml_slots[prop_id]['domain'] = str(domain).replace(base_uri, '')
    range = g.value(obj_prop, RDFS.range)
    if range and str(range).startswith(base_uri):
        linkml_slots[prop_id]['range'] = str(range).replace(base_uri, '')
    comm = g.value(obj_prop, URIRef('http://schema.org/comment'))
    if comm:
        linkml_slots[prop_id]['notes'] = str(comm)
    # TODO: owl:inverseOf, TransitiveProperty, ReflexiveProperty, SymmetricProperty
    # example = str(g.value(obj_prop, SKOS.example))

    # Add the property as slot to its domain class
    domain = str(g.value(obj_prop, RDFS.domain))
    if domain and str(domain).startswith(base_uri):
        domain_cls = str(domain).replace(base_uri, '')
        if 'slots' not in linkml_classes[domain_cls]:
            linkml_classes[domain_cls]['slots'] = []
        linkml_classes[domain_cls]['slots'].append(prop_id)


    # Get only 1 parent class (if many)
    parent_class = None
    parents = g.objects(obj_prop, RDFS.subPropertyOf)
    
    if parents:
        for parent in parents:
            if not str(parent).startswith(base_uri):
                # Avoid blank nodes
                continue
            if 'is_a' in linkml_slots[prop_id].keys(): 
                if not 'see_also' in linkml_slots[prop_id].keys(): 
                    linkml_slots[prop_id]['see_also'] = []
                linkml_slots[prop_id]['see_also'].append(str(parent).replace(base_uri, ''))
            else:
                linkml_slots[prop_id]['is_a'] = str(parent).replace(base_uri, '')

In [10]:
from collections import OrderedDict
import yaml
# from ruamel import yaml

# Dump the generated model in a YAML file

sio_linkml = OrderedDict(
  id='https://semanticscience.org',
  name='semanticscience-model',
  description='The Semanticscience Integrated Ontology (SIO) provides a simple, integrated ontology of types and relations for rich description of objects, processes and their attributes.',
  license='https://creativecommons.org/publicdomain/zero/1.0/',
  # Version should be kept in sync with primary Git repository release tag
  version='0.1.0',
  default_prefix='sio',
  default_range='string',
  prefixes=linkml_prefixes,
  slots=linkml_slots,
  classes=linkml_classes,
)

# class quoted(str):
#     pass
# def quoted_presenter(dumper, data):
#     return dumper.represent_scalar('tag:yaml.org,2002:str', data, style='"')
# yaml.add_representer(quoted, quoted_presenter)

# class literal(str):
#     pass
# def literal_presenter(dumper, data):
#     return dumper.represent_scalar('tag:yaml.org,2002:str', data, style='|')
# yaml.add_representer(literal, literal_presenter)

def ordered_dict_presenter(dumper, data):
    return dumper.represent_dict(data.items())
yaml.add_representer(OrderedDict, ordered_dict_presenter)

def str_presenter(dumper, data):
  if len(data.splitlines()) > 1 or len(data) > 80:
    # check for multiline string
    return dumper.represent_scalar('tag:yaml.org,2002:str', data + '\n', style='|')
  return dumper.represent_scalar('tag:yaml.org,2002:str', data)
yaml.add_representer(str, str_presenter)
yaml.representer.SafeRepresenter.add_representer(str, str_presenter)

class MyDumper(yaml.Dumper):
    # Insert blank lines between top-level objects
    def write_line_break(self, data=None):
        super().write_line_break(data)
        if len(self.indents) == 1:
            super().write_line_break()
            super().write_line_break()
            super().write_line_break()
        if len(self.indents) == 2:
            super().write_line_break()


with open('sio-model.yaml', 'w') as yaml_file:
    yaml.dump(
        sio_linkml,
        yaml_file,
        Dumper=MyDumper,
        default_flow_style=False,
        sort_keys=False,
        allow_unicode = True, encoding = None
    )