A hierarchicy of elements must be structured as a type<br>
ele-[ele, ele] is not allowed<br>
ele-type([ele,ele]) is good<br>

DataModelDict shoould be pip installed

!pip install DataModelDict==0.9.9

In [1]:
import json
import DataModelDict as DMD

import numpy as np

import os, glob

In [2]:
# define namespace
#

xmlNameSpace = "http://www.w3.org/2001/XMLSchema#"
localNameSpace = "https://github.com/kaggour/AM-CDM#"

xmlElements = ['xs:dateTime','xs:double','xs:int','xs:string']

In [3]:
# ---------------------
# generate headers
#
def header():
    return("<rdf:RDF xmlns=\"{}\"\n\
  xml:base=\"https://github.com/kaggour/AM-CDM\"\n\
  xmlns:dc=\"http://purl.org/dc/elements/1.1/\"\n\
  xmlns:obo=\"http://purl.obolibrary.org/obo/\"\n\
  xmlns:owl=\"http://www.w3.org/2002/07/owl#\"\n\
  xmlns:rdf=\"http://www.w3.org/1999/02/22-rdf-syntax-ns#\"\n\
  xmlns:xml=\"http://www.w3.org/XML/1998/namespace\"\n\
  xmlns:xsd=\"http://www.w3.org/2001/XMLSchema#\"\n\
  xmlns:rdfs=\"http://www.w3.org/2000/01/rdf-schema#\"\n\
  xmlns:AM-CDM=\"{}\">\n\n\n".format(xmlNameSpace[:-1],localNameSpace))


def defOwl():
    return("  <owl:Ontology rdf:about=\"{}\">\n\
    <owl:versionIRI rdf:resource=\"{}/0.1\"/>\n\
  </owl:Ontology>\n\n\n".format(xmlNameSpace[:-1],xmlNameSpace[:-1]))
    

In [4]:
# ---------------------
# handle ObjectProperty: connecting classes
#
def objType(label,DataType,comment):

    description = "  <owl:ObjectProperty rdf:about={}>\n".format("\""+localNameSpace+label+"\"")
    
    if(comment):
        description += "    <rdfs:comment xml:lang=\"en\">{}</rdfs:comment>\n".format(comment)
    
    
    description += "    <rdfs:range rdf:resource={}/>\n\
  </owl:ObjectProperty>\n\n".format("\""+localNameSpace+DataType+"\"")

    return(description)



# ---------------------
# translate xml element to dataProperty: linking thing(?) to value
#
def basicType(label,DataType,comment,rdfsAttribute,nameSpace_):

    description = "  <owl:DatatypeProperty rdf:about={}>\n\
    <rdfs:{} rdf:resource={}/>\n".format("\""+localNameSpace+label+"\"",
                                       rdfsAttribute,
                                       "\""+nameSpace_+DataType[DataType.find(":")+1:]+"\"")
    
    if(comment):
        description += "    <rdfs:comment xml:lang=\"en\">{}</rdfs:comment>\n".format(comment)
        

    description += "    <rdfs:label xml:lang=\"en\">{}</rdfs:label>\n\
  </owl:DatatypeProperty>\n\n".format(label)

        
    return(description)

In [5]:
# to deal with the elements and objTypes
# this function will iterate through the data structure and 
# identify the elements in XML, then convert them to dataTypes or objectTypes
#
def iterLoopElements(owlString,elements):
    
    for element in elements:
        
        
        jsonData = json.loads(element.json())
        
        # ------------
        # processedTypes is a space saving the prepared classes and types
        # if the name is in the file already, the function will pass the process
        #
        if(jsonData["@name"] in processedTypes["elements"]): continue
        
        processedTypes["elements"].extend([jsonData["@name"]])
        # ------------

        
        try:
            comment = jsonData["xs:annotation"]["xs:documentation"]
        except:
            comment = ""
        
        
        if('@type' in list(jsonData.keys())):
            
            if(jsonData['@type'] in xmlElements):
                owlString += basicType(jsonData['@name'],
                                       jsonData['@type'],
                                       comment,
                                       "subPropertyOf",
                                       localNameSpace)
            
            else:
                owlString += objType(jsonData['@name'],
                                     jsonData['@type'],
                                     comment)
                
                
        else:
            elements_ = element.finds("xs:element")
            if(elements_):
                owlString = iterLoopElements(owlString,elements_)
            
                
    
    return(owlString)




In [6]:
#
# deal with simpleTypes in XML
#
def simpleTypePrep(jsonData):

    label = jsonData["@name"]
    
    
    tempString ="  <owl:ObjectProperty rdf:about={}>\n\
    <rdfs:label>{}</rdfs:label>\n\
  </owl:ObjectProperty>".format("\""+localNameSpace+label+"\"",
                                label)

    
    label = jsonData["@name"]+"Type"
    
    tempString += "  <owl:Class rdf:about={}>\n\
    <owl:equivalentClass>\n\
      <owl:Class>\n\
        <owl:oneOf rdf:parseType=\"Collection\">\n".format("\""+localNameSpace+label+"\"")


    #
    # 2 seniories: 1) get json from class 2) get simpleType from root
    #
    if('xs:simpleType' in list(jsonData.keys())):
        jsonData_ = jsonData['xs:simpleType']['xs:restriction']
    else:
        jsonData_ = jsonData['xs:restriction']
        
        
        
    baseType = jsonData_["@base"]
    enumeration = jsonData_['xs:enumeration']
    
    #print("simpleType: ",label,enumeration)
    
    for label in enumeration:
        tempString += "          <rdf:Description rdf:about={}/>\n".format("\""+localNameSpace+label['@value']+"\"")
    
    
    tempString += "        </owl:oneOf>\n\
      </owl:Class>\n\
    </owl:equivalentClass>\n"
    
    
    try:
        comment = jsonData["xs:annotation"]["xs:documentation"]
        tempString += "    <rdfs:comment xml:lang=\"en\">{}</rdfs:comment>\n".format(comment)

    except:
        pass
    
    tempString += "  </owl:Class>\n\n"

        
    return(tempString)

In [7]:
# ---------------------
# handle classes
#
def classType(description,classData):

    tempString = ""
    
    
    jsonData = json.loads(classData.json())
    
    label = jsonData["@name"]
    if(label == "Chemistry"):
        print(jsonData)
    
    
    description += "  <owl:Class rdf:about={}>\n".format("\""+localNameSpace+label+"\"")

    try:
        comment = jsonData["xs:annotation"]["xs:documentation"]
        description += "    <rdfs:comment xml:lang=\"en\">{}</rdfs:comment>\n".format(comment)

    except:
        pass
    

    
    elements = classData.finds("xs:element")
    for element in elements:
        
        jsonData = json.loads(element.json())
            
        label = jsonData["@name"]
        
            
        processedTypes["elements"].extend([label])
        
        
        #print(jsonData)
        if("@type" in list(jsonData)):
            dataType = jsonData["@type"]
        
        # -------------------------------------------
        elif('xs:simpleType' in list(jsonData)):
            
            dataType = jsonData["@name"]+"Type"
            
            label = jsonData["@name"]
            
            if(label in processedTypes["simpleTypes"]): break
                
            processedTypes["simpleTypes"].extend([label])

            # print("simple type is defined in class, which is processed here")
            tempString += simpleTypePrep(jsonData)
            
            
            
            
            
        nameSpace_ = localNameSpace
        rdfsAttribute = "onClass"
        if(dataType in xmlElements):
            nameSpace_ = xmlNameSpace
            rdfsAttribute = "onDataRange"

        description += "    <rdfs:subClassOf>\n\
      <owl:Restriction>\n\
        <owl:onProperty rdf:resource={}/>\n\
        <owl:{} rdf:resource={}/>\n".format("\""+localNameSpace+label+"\"",
                                            rdfsAttribute,
                                            "\""+nameSpace_+dataType[dataType.find(":")+1:]+"\"")


        if(("@maxOccurs" in list(jsonData)) and 
           ("@minOccurs" in list(jsonData)) and 
           (jsonData["@minOccurs"] == jsonData["@maxOccurs"])):

            quantityLimit = "qualifiedCardinality"
            occurs = jsonData["@minOccurs"]

            description += "        <owl:{} rdf:datatype={}>{}</owl:{}>\n".format(quantityLimit,
                                                                                "\""+xmlNameSpace+"nonNegativeInteger\"",
                                                                                occurs,
                                                                                quantityLimit)

        if(("@maxOccurs" in list(jsonData)) and ("@minOccurs" not in list(jsonData))):
            quantityLimit = "maxQualifiedCardinality"
            occurs = jsonData["@maxOccurs"]

            #
            # it can be unbounded in XML
            #
            try:
                float(occurs)
            except:
                occurs = 10

            description += "        <owl:{} rdf:datatype={}>{}</owl:{}>\n".format(quantityLimit,
                                                                                "\""+xmlNameSpace+"nonNegativeInteger\"",
                                                                                occurs,
                                                                                quantityLimit)


        if("@minOccurs" in list(jsonData) and ("@maxOccurs" not in list(jsonData))):
            quantityLimit = "minQualifiedCardinality"
            minOccurs = jsonData["@minOccurs"]

            description += "        <owl:{} rdf:datatype={}>{}</owl:{}>\n".format(quantityLimit,
                                                                                "\""+xmlNameSpace+"nonNegativeInteger\"",
                                                                                occurs,
                                                                                quantityLimit)

        if(("@maxOccurs" not in list(jsonData)) and 
           ("@minOccurs" not in list(jsonData))):

            quantityLimit = "qualifiedCardinality"
            occurs = 1

            description += "        <owl:{} rdf:datatype={}>{}</owl:{}>\n".format(quantityLimit,
                                                                                "\""+xmlNameSpace+"nonNegativeInteger\"",
                                                                                occurs,
                                                                                quantityLimit)

        try:
            comment = jsonData["xs:annotation"]["xs:documentation"]
            description += "      <rdfs:comment xml:lang=\"en\">{}</rdfs:comment>\n".format(comment)

        except:
            pass


        description += "      </owl:Restriction>\n\
    </rdfs:subClassOf>\n"


        
    description += "  </owl:Class>\n\n"
       
    if(tempString): description += tempString
        
        
    return(description)

In [8]:
#
# main function
#

# ------------
# processedTypes is a space saving the prepared classes and types
# if the name is in the file already, the function will pass the process
#
processedTypes = {"elements":[],"simpleTypes":[],"classes":[]}


#
# prepare headers of the files
#
owlString = header() + defOwl()

#
# prepare fundamental types in XML
#

for element in xmlElements:

    processedTypes["elements"].extend([element])

    owlString += basicType(element[element.find(":")+1:],
                           element,
                           "",
                           "range",
                           xmlNameSpace)

# --------------------------------------------
# this tool is able to combine several XMLs by "name" of the element
#
files = glob.glob(os.path.join("data","xmlSchemas","*.xsd"))

#
# processing must be in sequence
#
files = [os.path.join("data","xmlSchemas","base.xsd"),
         os.path.join("data","xmlSchemas","material.xsd"),
         os.path.join("data","xmlSchemas","system.xsd"),
         os.path.join("data","xmlSchemas","build.xsd"),
         os.path.join("data","xmlSchemas","process.xsd"),
         os.path.join("data","xmlSchemas","testInspectionCharacterization.xsd")]


#files = glob.glob(os.path.join("test1.xsd"))



for file in files:
    
    print(" ******* ")
    print(file)

    
    with open(file, 'r') as f:
        xmlRaw = f.read()


    XMLContent_ = DMD.DataModelDict(xmlRaw)
    #print(json.loads(XMLContent_.json()))
    

    elements = XMLContent_.finds("xs:element")
    owlString = iterLoopElements(owlString,elements)

    

    
    #jsonData = json.loads(XMLContent_.json())
    #print(list(jsonData['xs:schema'].keys()))
    
    simpleTypes = XMLContent_.finds('xs:simpleType')
    for simpleType_ in simpleTypes:
        
        jsonData = json.loads(simpleType_.json())

        #
        # @name is not in jsonData.keys meaning that the simple type is defined in class
        # that will be handled in class.  So the process is passed
        #
        if("@name" not in list(jsonData.keys())): continue
        
        label = jsonData["@name"]

        if(label in processedTypes["simpleTypes"]): continue
        processedTypes["simpleTypes"].extend([label])

        owlString += simpleTypePrep(jsonData)
    
        
        
        

    classes = XMLContent_.finds("xs:complexType")
    for classData in classes:

        jsonData = json.loads(classData.json())


        # ****************************
        # need to be reviewed
        #
        
        if("@name" not in list(jsonData)): 
            print(file)
            print(jsonData)
            
            #continue
            
        # ****************************
    
    
        label = jsonData["@name"]

        if(label in processedTypes["classes"]): continue

        processedTypes["classes"].extend([label])

        owlString = classType(owlString,classData)


    
owlString += "\n</rdf:RDF>"

text_file = open(os.path.join("data","owl","AM-CDM-test.owl"), "w")
n = text_file.write(owlString)
text_file.close()


 ******* 
data/xmlSchemas/base.xsd
 ******* 
data/xmlSchemas/material.xsd
{'@name': 'Chemistry', 'xs:sequence': {'xs:element': [{'@name': 'chemistryMeasurementMethod', '@type': 'xs:string', 'xs:annotation': {'xs:documentation': 'Measurement method for real chemistries, which may be different from nominal chemistries. Should not be included if this is a specified chemistry'}}, {'@name': 'elements', '@type': 'chemistryElement'}, {'@name': 'chemistryPercentUnit', '@type': 'ChemistryPercentUnit'}]}}
 ******* 
data/xmlSchemas/system.xsd
 ******* 
data/xmlSchemas/build.xsd
 ******* 
data/xmlSchemas/process.xsd
 ******* 
data/xmlSchemas/testInspectionCharacterization.xsd


In [9]:
print("#elements: ",len(processedTypes['elements']),)
print("#classes: ",len(processedTypes['classes']))

#elements:  947
#classes:  77


In [10]:
# todo
#
# 1. need to check the element "NonPedigreedTestResult" in TIC
# if("@name" not in list(jsonData)): continue
#
# 2.

In [11]:
"""
{'@name': 'Document', 
 'xs:annotation': {'xs:documentation': 'File of any type'}, 
 'xs:sequence': {'xs:element': 
                 [{'@name': 'documentName', '@type': 'xs:string'},
                  {'@name': 'documentLocation', '@type': 'xs:string'},
                  {'@name': 'documentClassificationLevel', '@type': 'xs:string'}, 
                  {'@name': 'documentClassificationIPClass', '@type': 'xs:string'}, 
                  {'@name': 'documentURI', '@type': 'xs:string'}]}}
"""

"\n{'@name': 'Document', \n 'xs:annotation': {'xs:documentation': 'File of any type'}, \n 'xs:sequence': {'xs:element': \n                 [{'@name': 'documentName', '@type': 'xs:string'},\n                  {'@name': 'documentLocation', '@type': 'xs:string'},\n                  {'@name': 'documentClassificationLevel', '@type': 'xs:string'}, \n                  {'@name': 'documentClassificationIPClass', '@type': 'xs:string'}, \n                  {'@name': 'documentURI', '@type': 'xs:string'}]}}\n"