In [4]:
import os
import jellyfish
import rdflib
import owlready2 as owl2

# Setup

In [6]:
# Root folder for every input/output of this notebook (relative to the notebook's working directory).
DATAPATH = '../data'
# Output folder for the serialised ontology files; created on first run, reused afterwards.
KG_DP = f'{DATAPATH}/knowledge-graph'
os.makedirs(KG_DP, exist_ok=True)


# Input folder read by the A-Box cells (its `details`, `derivative` and `prior` sub-folders).
# NOTE(review): the directory is spelled 'scrapped' on disk — keep in sync with whatever produces it.
SCRAPED_DP = f'{DATAPATH}/scrapped'

# T-Box

## Create ontology

In [7]:
# OWL vocabulary namespace, only used to bind the `owl:` prefix on the rdflib view below.
ns_owl = rdflib.Namespace('http://www.w3.org/2002/07/owl#')

# The T-Box (schema) ontology, identified by its base IRI.
exaignn_ontology = owl2.get_ontology('http://kb.melll.ic.uff.br/exaignn')

# Entities declared inside this `with` block are attached to `exaignn_ontology`.
with exaignn_ontology:
    # --- Classes -----------------------------------------------------------
    class Paper(owl2.Thing):
        label   = 'Paper'

    # NOTE(review): no Author individuals are created by the A-Box cells visible in this
    # notebook; authors are stored as a plain string through `PaperAuthors` instead.
    class Author(owl2.Thing):
        label   = 'Author'

    # --- Object properties (owlready2 `Domain >> Range` shorthand) ----------
    # Paper -> Paper: the subject cites the object.
    class reference(Paper >> Paper): pass
    
    # Paper -> Paper: declared as the inverse of `reference`.
    class isReferredBy(Paper >> Paper):
        inverse_property    = reference

    # Paper -> Author.
    class hasAuthor(Paper >> Author): pass

    # Author -> Paper: declared as the inverse of `hasAuthor`.
    class isAuthorOf(Author >> Paper):
        inverse_property    = hasAuthor

    # --- Datatype properties (string-valued, attached to Paper) -------------
    # Raw authors line of a paper, kept as a single string.
    class PaperAuthors(owl2.DatatypeProperty):
        label   = 'PaperAuthors'
        domain  = [Paper]
        range   = [str]
            
    # Abstract text of a paper.
    class PaperAbstract(owl2.DatatypeProperty):
        label   = 'PaperAbstract'
        domain  = [Paper]
        range   = [str]

# rdflib view over owlready2's default world (every ontology loaded in this kernel,
# not only `exaignn_ontology`); used later for the Turtle export.
rdflib_graph = owl2.default_world.as_rdflib_graph()
rdflib_graph.bind('owl', ns_owl)

<Graph identifier=Nf02e3105afcc4e21bfd158a9c9eb45a3 (<class 'owlready2.rdflib_store.TripleLiteRDFlibGraph'>)>

## Save ontology

In [11]:
# Write the T-Box with owlready2's own serialisers: RDF/XML and N-Triples.
for tbox_file, tbox_format in (
    (f'{KG_DP}/exaignn_tbox.owl', 'rdfxml'),
    (f'{KG_DP}/exaignn_tbox.triples', 'ntriples'),
):
    exaignn_ontology.save(file=tbox_file, format=tbox_format)

# Turtle is produced through the rdflib view of the world instead.
rdflib_graph.serialize(f'{KG_DP}/exaignn_tbox.ttl', format='turtle')

<Graph identifier=Nf02e3105afcc4e21bfd158a9c9eb45a3 (<class 'owlready2.rdflib_store.TripleLiteRDFlibGraph'>)>

# Create ontology A-Box

## Data transformations

In [8]:
def find_paper_id(paper_name, paper_name_id_map, threshold=0.8):
    """Return the id of the known paper whose title best matches ``paper_name``.

    Titles are compared with the Jaro similarity from ``jellyfish``. Only
    candidates scoring at least ``threshold`` are considered, and among those
    the highest-scoring one wins (ties keep the earliest entry of the map), so
    the result no longer depends on which near-match happens to be iterated
    first.

    Args:
        paper_name: Title to look up.
        paper_name_id_map: Mapping ``{known title: paper id}``.
        threshold: Minimum similarity (0..1) for a title to count as a match.

    Returns:
        The matching paper id, or ``None`` when no title reaches ``threshold``.
    """
    # Newer jellyfish releases expose the metric as `jaro_similarity`; older
    # ones only provide it under the previous name `jaro_distance`.
    similarity = getattr(jellyfish, 'jaro_similarity', None) or jellyfish.jaro_distance

    best_id = None
    best_score = None
    for target_name, target_id in paper_name_id_map.items():
        score = similarity(paper_name, target_name)
        # Strict `>` on later candidates so that equal scores keep the first one seen.
        if score >= threshold and (best_score is None or score > best_score):
            best_id, best_score = target_id, score

    return best_id

def decode_details(path):
    """Parse one scraped "details" file into the fields used by the A-Box.

    The file is read line by line: line 0 is the title, line 1 the authors and
    line 5 the abstract (lines 2-4 — venue, numbers, url — are ignored). Every
    value is returned without its trailing newline / surrounding whitespace.

    Args:
        path: '/'-separated path of the details file. The paper id is the file
            name up to its first '.'.

    Returns:
        Tuple ``(paper_id, paper_name, paper_authors, paper_abstract)``.
        ``paper_authors`` is ``None`` when the authors line is missing or blank;
        ``paper_abstract`` is ``None`` unless the file has exactly six lines
        and the sixth is not blank.

    Raises:
        ValueError: If the file is empty (there is no title line).
    """
    with open(path, 'r') as f:
        # Strip here once: the raw lines end with '\n', which previously leaked
        # into the returned values and made blank lines look non-empty.
        lines = [line.strip() for line in f]

    if not lines:
        raise ValueError(f'Details file is empty: {path}')

    paper_id = path.split('/')[-1].split('.')[0]
    paper_name = lines[0]
    paper_authors = (lines[1] if len(lines) > 1 else '') or None
    # paper_venue = lines[2]
    # paper_numbers = lines[3]
    # paper_url = lines[4]
    # NOTE(review): the abstract is only taken from files with exactly six
    # lines, as before — confirm whether longer files should keep it too.
    paper_abstract = (lines[5] or None) if len(lines) == 6 else None

    return paper_id, paper_name, paper_authors, paper_abstract

def decode_related(path, paper_name_id_map):
    """Resolve the titles listed in a scraped "related papers" file to paper ids.

    Each non-blank line is expected to start with a paper title, optionally
    followed by tab-separated columns that are ignored. The title is resolved
    with ``find_paper_id``.

    Args:
        path: '/'-separated path of the file. The paper id is the file name up
            to its first '.'.
        paper_name_id_map: Mapping ``{known title: paper id}`` handed to
            ``find_paper_id``.

    Returns:
        Tuple ``(paper_id, related)`` where ``related`` holds one entry per
        non-blank line: the matched paper id, or ``None`` when
        ``find_paper_id`` found no match.
    """
    paper_id = path.split('/')[-1].split('.')[0]

    related = []
    with open(path, 'r') as f:
        for line in f:
            # Lines keep their '\n', so a blank line is never equal to '';
            # test the stripped text instead.
            if not line.strip():
                continue

            # Strip so a title on a line without tabs does not carry the newline.
            paper_name = line.split('\t')[0].strip()
            related.append(find_paper_id(paper_name, paper_name_id_map))

    return paper_id, related

In [13]:
# Load the T-Box written by the "Save ontology" cell; individuals are created against it.
ontology = owl2.get_ontology(f'{KG_DP}/exaignn_tbox.owl').load()

paper_id_name_map = {}   # paper id -> title
paper_name_id_map = {}   # title    -> paper id (lookup table for the fuzzy title matching)
paper_id_obj_map = {}    # paper id -> Paper individual

# --- 1. One Paper individual per details file --------------------------------
details_path = f'{SCRAPED_DP}/details'
# Sorted so the insertion order of the maps (and therefore the title matching,
# which iterates `paper_name_id_map`) is the same on every run and machine.
for details_file in sorted(os.listdir(details_path)):

    paper_details_path = f'{details_path}/{details_file}'
    paper_id, paper_name, paper_authors, paper_abstract = decode_details(paper_details_path)

    paper_id_name_map[paper_id] = paper_name
    paper_name_id_map[paper_name] = paper_id

    paper = ontology.Paper(paper_id)
    paper.label = [paper_name]

    if paper_authors is not None:
        paper.PaperAuthors = [paper_authors]

    if paper_abstract is not None:
        paper.PaperAbstract = [paper_abstract]

    paper_id_obj_map[paper_id] = paper


# --- 2. Link papers to each other --------------------------------------------
def _link_related_papers(related_dir, property_name):
    """Attach the papers listed in each file of ``related_dir`` via ``property_name``.

    For every file, the titles are resolved with ``decode_related``; unmatched
    titles (``None``) are dropped and the remaining individuals are added to
    the ``property_name`` values of the paper the file belongs to.

    Raises:
        KeyError: If a file belongs to a paper that has no details file.
    """
    for related_file in sorted(os.listdir(related_dir)):
        paper_id, related = decode_related(f'{related_dir}/{related_file}', paper_name_id_map)

        # Extend the existing values instead of assigning a new list.
        # NOTE(review): `reference` and `isReferredBy` are declared inverse of
        # each other; per the owlready2 docs inverse values are maintained
        # automatically, so a plain assignment here would presumably discard
        # links already derived from the other direction — confirm.
        linked = getattr(paper_id_obj_map[paper_id], property_name)
        for rp_id in related:
            if rp_id is None:
                continue
            related_obj = paper_id_obj_map[rp_id]
            if related_obj not in linked:
                linked.append(related_obj)


# Papers listed under `derivative` are recorded as referring to the paper.
derivative_path = f'{SCRAPED_DP}/derivative'
_link_related_papers(derivative_path, 'isReferredBy')

# Papers listed under `prior` are recorded as references of the paper.
prior_path = f'{SCRAPED_DP}/prior'
_link_related_papers(prior_path, 'reference')

## Save ontology

In [15]:
# Prefixes used by the Turtle export.
ns1 = rdflib.Namespace('http://www.w3.org/2002/07/owl#')
ns2 = rdflib.Namespace('http://kb.melll.ic.uff.br/exaignn#')

# rdflib view over owlready2's default world, with readable prefixes bound.
rdflib_graph = owl2.default_world.as_rdflib_graph()
for prefix, namespace in (('owl', ns1), ('exaignn', ns2)):
    rdflib_graph.bind(prefix, namespace)

# RDF/XML and N-Triples come from owlready2's own serialisers.
for abox_file, abox_format in (
    (f'{KG_DP}/exaignn_abox.owl', 'rdfxml'),
    (f'{KG_DP}/exaignn_abox.triples', 'ntriples'),
):
    ontology.save(file=abox_file, format=abox_format)

# Turtle goes through rdflib.
rdflib_graph.serialize(f'{KG_DP}/exaignn_abox.ttl', format='turtle')

<Graph identifier=Nf02e3105afcc4e21bfd158a9c9eb45a3 (<class 'owlready2.rdflib_store.TripleLiteRDFlibGraph'>)>