In [6]:
from owlready2 import *
import pandas as pd
from rdflib import Graph, URIRef, BNode, Literal, Namespace
import AccessEntityLabels
import Levenshtein as lev
from stringcmp import isub
from rdflib.namespace import OWL
from CompareWithReference import compareWithReference

In [2]:
def getClasses(onto):        
    return onto.classes()
    
def getDataProperties(onto):        
    return onto.data_properties()
    
def getObjectProperties(onto):        
    return onto.object_properties()
    
def getIndividuals(onto):    
    return onto.individuals()


def getRDFSLabelsForEntity(entity):
    #if hasattr(entity, "label"):
    return entity.label
 

def extractEntities(urionto, entity_type = 'class'):
    
    """
    A function used to extract the information from a given ontology. The returned objects is an array of dictionaries, each having the 'iri', 'name' and 'labels' keys

    ...

    Attributes
    ----------
    urionto : str
        the owl file containing the ontology (e.g. 'cmt.owl')
    entity_type : str
        the type of entity to extract from the ontology. The valid values are {'class', 'objectProperty', 'dataProperty', 'individual'}. if not specified the default is 'class'

    """
    
    #Method from owlready
    onto = get_ontology(urionto).load()
    
    entities = list([])
    
    #load the classes
    if entity_type == 'class':
        print(f"Classes in {urionto} Ontology: {str(len(list(getClasses(onto))))}")
        entities = list(getClasses(onto))
    
    #...or the object properties
    elif entity_type == 'objectProperty':
        print(f"Object Properties in {urionto} Ontology: {str(len(list(getObjectProperties(onto))))}")
        entities = getObjectProperties(onto)
        
    #...or the data properties
    elif entity_type == 'dataProperty':
        print(f"Data Properties in {urionto} Ontology: {str(len(list(getDataProperties(onto))))}")
        entities = getDataProperties(onto)
    
    #...or the individuals from that ontology
    elif entity_type == 'individual':
        print(f"Individuals in {urionto} Ontology: {str(len(list(getIndividuals(onto))))}")
        entities = getIndividuals(onto)
    
    #else if the user input is not one of the valid entity types print an error message
    else:
        print("Incorrect entity type")

    #create a new array to hold all the extracted iris, their name and their label(s). Be it for classes, properties or individuals 
    entity_dict = []
    for entity in entities:
        temp = {}
        temp["iri"] = entity.iri
        temp["name"] = entity.name
        temp["labels"] = getRDFSLabelsForEntity(entity)
        entity_dict.append(temp)
        
        
    return entity_dict

def compare2Arrays(array_1, array_2, entity_type, entity_scores, annotation = 'name'):
    
    """
    A function used to compare 2 lists (of ontology entities) and return an third list with entity pairs and they score and their type based on the lexical comparison using the isub metric

    ...

    Attributes
    ----------
    array_1 : list
        the first list containing entities from the first ontology to compare
    array_2 : list
        the second list containing entities from the second ontology to compare
    entity_type : str
        the type of entity to extract from the ontology. The valid values are {'class', 'objectProperty', 'dataProperty', 'individual'}. if not specified the default is 'class'
    entity_scores : list
        a list containing a pair of IRIs, their entity type and the score based on the selected distance. This list will become the output as well after enriched with the new pairs
    annotation: string
        the "attribute" to be used for the lexical comparison. The valid values are {'name', 'labels'}. If not specified the default value is 'name'

    """
    iterator = 0
    for i in array_1:
        iterator += 1
        score = 0
        best_pair = {}
        for j in array_2:
            
            #this part checks if we are comparing the names or the labels. that's because names are strings but labels are arrays so we need to get one level deeper
            if annotation == 'name':
                string1 = i[annotation]
                string2 = j[annotation]
            if annotation == 'labels':
                if (len(i[annotation])>0) & (len(j[annotation])>0):
                    string1 = i[annotation][0]
                    string2 = j[annotation][0]
                else:
                    string1 = ''
                    string2 = ''
            
            #only to this if both strings are not empty
            if (len(string1)>0) & (len(string2)>0):
                new_score = isub(string1,string2)
                if new_score > score:
                    score = new_score
                    best_pair = {"entity1": i['iri'], "entity2": j['iri'], "entity_type": entity_type, "score": score}
                    
        entity_scores.append(best_pair)
        if (len(array_1)%(iterator*100)) == 0:
            print(len(array_1)%(iterator*100))


    #return the scores list
    return entity_scores

def ontologyMatcher(uri1, uri2, annotation = 'name'):
    
    """
    A function used to orchestrate the matching of two ontologies uri1 and uri2 by comparing the lexical similarity of all entities (i.e. classes and properties) based on the annotation (i.e. name or labels)

    ...

    Attributes
    ----------
    uri1 : string
        the name of the owl file of the first ontology to compare
    uri2 : string
        the name of the owl file of the second ontology to compare
    annotation: string
        the "attribute" to be used for the lexical comparison. The valid values are {'name', 'labels'}. If not specified the default value is 'name'

    """
    
    # load the classes and objects from the 2 uris in the respective arrays of objects
    dict_uri1_classes = extractEntities(uri1,"class")
    dict_uri2_classes = extractEntities(uri2,"class")
    dict_uri1_obj_properties = extractEntities(uri1,"objectProperty")
    dict_uri2_obj_properties = extractEntities(uri2,"objectProperty")
    dict_uri1_data_properties = extractEntities(uri1,"dataProperty")
    dict_uri2_data_properties = extractEntities(uri2,"dataProperty")
    
    # Create an empty array to hold the objects. each object is a dictionary with two uris and the score of the similarity of their names
    entity_scores = []
    
    # compare class names and add the scores to the dictionary
    entity_scores = compare2Arrays(dict_uri1_classes, dict_uri2_classes, 'class', entity_scores, annotation)
    
    # ...do the same with object properties
    entity_scores = compare2Arrays(dict_uri1_obj_properties, dict_uri2_obj_properties, 'objectProperty', entity_scores, annotation)
    
    # ...do the same with data properties
    entity_scores = compare2Arrays(dict_uri1_data_properties, dict_uri2_data_properties, 'dataProperty', entity_scores, annotation)
    
    # finally we convert the dictionary to a dataframe to be able to filter pairs with the score above a certain threshold
    return pd.DataFrame(entity_scores)

def createAlignmentTripples(enity_scores,threshold=0.0):
    
    """
    A function create a graph with the tripples as specified int the entity scores list

    ...

    Attributes
    ----------
    entity_scores : list
        a list containing a pair of IRIs, their entity type and the score based on the selected distance.
    threshold : float
        a number used to filter the pairs that have scored higher than the threshold and only consider them for the graph triples
    """
        
    #initialise a new graph
    g = Graph()

    g.bind("owl", OWL)
    
    matched_onto2_entities = []

    # iterate throw the rows where the score is above a certain thresholf and create the relevant triples. the score table is sorted based on scores so the pair with the highest score appears first
    for index, row in df_entity_scores[df_entity_scores['score']>threshold].sort_values(by='score',ascending = False).iterrows():
        
        # we check if the entity from the second ontology has already been matched with a higher score and if it has we do not add the new pair in the graph 
        if row['entity2'] not in matched_onto2_entities:
            
            #we append the new entity from onto2 to the array so as do ignore it if it shows up again in lower scores
            matched_onto2_entities.append(row['entity2'])
            if row['entity_type'] == 'class':
                g.add((URIRef(row['entity1']), OWL.equivalentClass, URIRef(row['entity2'])))
            elif row['entity_type'] == 'objectProperty':
                g.add((URIRef(row['entity1']), OWL.equivalentProperty, URIRef(row['entity2'])))
            elif row['entity_type'] == 'dataProperty':
                g.add((URIRef(row['entity1']), OWL.equivalentProperty, URIRef(row['entity2'])))
    return g

In [3]:
# Use the matcher function to compare the two ontologies and load the results in a dataframe
# df_entity_scores = ontologyMatcher('cmt.owl', 'ekaw.owl')
# filename = 'zdetor-cmt-ekaw.ttl'

# df_entity_scores = ontologyMatcher('cmt.owl', 'confOf.owl')
# filename = 'zdetor-cmt-confOf.ttl'

df_entity_scores = ontologyMatcher('confOf.owl', 'ekaw.owl', 'name')
filename = 'zdetor-confOf-ekaw.ttl'

threshold = 0.8


display(df_entity_scores[df_entity_scores['score']>threshold].sort_values(by='score',ascending = False))


# parse the dataframe with the scores and creates the triples for those pairs of entities that scored above the threshold. Add the tripples to the KG
g = createAlignmentTripples(df_entity_scores,threshold)
print("\n")
#print the resulting triples in a ttl file
print(g.serialize(format="turtle").decode("utf-8"))  
g.serialize(destination=filename, format='ttl')

Classes in confOf.owl Ontology: 38
Classes in ekaw.owl Ontology: 73
Object Properties in confOf.owl Ontology: 13
Object Properties in ekaw.owl Ontology: 33
Data Properties in confOf.owl Ontology: 23
Data Properties in ekaw.owl Ontology: 0


Unnamed: 0,entity1,entity2,entity_type,score
25,http://confOf#Conference,http://ekaw#Conference,class,1.0
24,http://confOf#Workshop,http://ekaw#Workshop,class,1.0
48,http://confOf#writtenBy,http://ekaw#writtenBy,objectProperty,1.0
32,http://confOf#Student,http://ekaw#Student,class,1.0
8,http://confOf#Person,http://ekaw#Person,class,1.0
12,http://confOf#University,http://ekaw#University,class,1.0
26,http://confOf#Tutorial,http://ekaw#Tutorial,class,1.0
16,http://confOf#Paper,http://ekaw#Paper,class,1.0
4,http://confOf#Event,http://ekaw#Event,class,1.0
3,http://confOf#Social_event,http://ekaw#Social_Event,class,0.96977




@prefix owl: <http://www.w3.org/2002/07/owl#> .

<http://confOf#Author> owl:equivalentClass <http://ekaw#Paper_Author> .

<http://confOf#Camera_Ready_event> owl:equivalentClass <http://ekaw#Camera_Ready_Paper> .

<http://confOf#Chair_PC> owl:equivalentClass <http://ekaw#PC_Chair> .

<http://confOf#Conference> owl:equivalentClass <http://ekaw#Conference> .

<http://confOf#Contribution> owl:equivalentClass <http://ekaw#Contributed_Talk> .

<http://confOf#Event> owl:equivalentClass <http://ekaw#Event> .

<http://confOf#Member_PC> owl:equivalentClass <http://ekaw#PC_Member> .

<http://confOf#Organization> owl:equivalentClass <http://ekaw#Organisation> .

<http://confOf#Paper> owl:equivalentClass <http://ekaw#Paper> .

<http://confOf#Participant> owl:equivalentClass <http://ekaw#Conference_Participant> .

<http://confOf#Person> owl:equivalentClass <http://ekaw#Person> .

<http://confOf#Poster> owl:equivalentClass <http://ekaw#Poster_Paper> .

<http://confOf#Regular> owl:equivalentClass <h

In [4]:
compareWithReference('cmt-ekaw-reference.ttl', "zdetor-cmt-ekaw.ttl")
compareWithReference('cmt-confOf-reference.ttl', "zdetor-cmt-confof.ttl")
compareWithReference('confOf-ekaw-reference.ttl', "zdetor-confOf-ekaw.ttl")

Comparing 'zdetor-cmt-ekaw.ttl' with 'cmt-ekaw-reference.ttl
	Precision: 0.5454545454545454
	Recall: 0.5454545454545454
	F-Score: 0.5454545454545454
Comparing 'zdetor-cmt-confof.ttl' with 'cmt-confOf-reference.ttl
	Precision: 0.5
	Recall: 0.3125
	F-Score: 0.38461538461538464
Comparing 'zdetor-confOf-ekaw.ttl' with 'confOf-ekaw-reference.ttl
	Precision: 0.6363636363636364
	Recall: 0.7
	F-Score: 0.6666666666666666


In [31]:
# df_entity_scores = ontologyMatcher('mouse.owl', 'human.owl', 'labels')
filename = 'zdetor-mouse-human.ttl'

threshold = 0.8


# display(df_entity_scores[df_entity_scores['score']>threshold].sort_values(by='score',ascending = False))


# parse the dataframe with the scores and creates the triples for those pairs of entities that scored above the threshold. Add the tripples to the KG
g = createAlignmentTripples(df_entity_scores,threshold)
print("\n")
#print the resulting triples in a ttl file
# print(g.serialize(format="turtle").decode("utf-8"))  
g.serialize(destination=filename, format='ttl')

compareWithReference('anatomy-reference.ttl', 'zdetor-mouse-human.ttl')



Comparing 'zdetor-mouse-human.ttl' with 'anatomy-reference.ttl
	Precision: 0.9525816649104321
	Recall: 0.5963060686015831
	F-Score: 0.7334685598377282


In [9]:
g = Graph()
g.parse("zdetor-mouse-human.ttl", format="ttl")
    
    
print("The graph contains '" + str(len(g)) + "' triples.")

The graph contains '949' triples.
