In [11]:
import xml.etree.ElementTree as ET
import re
import os
import spacy
import scispacy
from datetime import datetime

### XML Structure
- teiHeader
    - fileDesc
        - titleStmt
            - title
        - publicationStmt
            - date
        - sourceDesc
            - biblStruct
                - idno
    - encodingDesc
    - profileDesc

### XRI (XML-to-RDF-Intermediate) Data Structure
- [
    - [
        - section_number,
        - section_title,
        - {
            - paragraph_number: paragraph_text,
        - }
    - ],
- ]

# RDF Generator
### Reading XML

In [12]:
def xml_reader():

    # LOADING XML AND CREATING ROOT

    xml_file = 'Robles_2015_modified.xml'
    tree = ET.parse(xml_file)
    root = tree.getroot()

    # EXTRACTING METADATA (available data: title, publication_date and doi)

    metadata = []

    for elem in root:
        if elem.tag[29:] == 'teiHeader':
            for sub_elem1 in elem:
                if sub_elem1.tag[29:] == 'fileDesc':
                    publication_info = {}
                    for sub_elem2 in sub_elem1:
                        if sub_elem2.tag[29:] == 'titleStmt':
                            for sub_elem3 in sub_elem2:
                                if sub_elem3.tag[29:] == 'title':
                                    publication_info['Title'] = sub_elem3.text
                        if sub_elem2.tag[29:] == 'publicationStmt':
                            for sub_elem3 in sub_elem2:
                                if sub_elem3.tag[29:] == 'date':
                                    publication_info['Publication Date'] = sub_elem3.attrib['when']
                        if sub_elem2.tag[29:] == 'sourceDesc':
                            for sub_elem3 in sub_elem2:
                                if sub_elem3.tag[29:] == 'biblStruct':
                                    for sub_elem4 in sub_elem3:
                                        if sub_elem4.tag[29:] == 'idno':
                                            publication_info['DOI'] = sub_elem4.text
                    metadata.append(['0', 'Metadata', publication_info])

#     print(metadata)
    #==================================================================
    
    # EXTRACTING ABSTRACT AND NUMBERING PARAGRAPHS

    abstract = []

    for elem in root:
        if elem.tag[29:] == 'teiHeader':
            for sub_elem1 in elem:
                if sub_elem1.tag[29:] == 'profileDesc':
                    for sub_elem2 in sub_elem1:
                        if sub_elem2.tag[29:] == 'abstract':
                            for sub_elem3 in sub_elem2:
                                if sub_elem3.tag[29:] == 'div':
                                    list_of_paragraphs = {}
                                    paragraph_number = 0
                                    for sub_elem4 in sub_elem3:
                                        if sub_elem4.tag[29:] == 'p':
                                            paragraph_number += 1
                                            list_of_paragraphs[paragraph_number] = ET.tostring(sub_elem4, encoding='unicode')
                                    abstract.append(['0', 'Abstract', list_of_paragraphs])

#     print(abstract)
    #==================================================================

    # EXTRACTING OTHER SECTIONS

    # need to normalize the section number (line 17)
    # compare Rojas and Wolf section number
    # \ issue in RDF, see Koshkava 2014 paper

    list_of_sections = []

    for elem in root:
        if (elem.tag[29:] == 'text'):
            for sub_elem1 in elem:
                if (sub_elem1.tag[29:] == 'body'):
                    for sub_elem2 in sub_elem1:
                        if sub_elem2.tag[29:] == 'div':
                            section_number = ''
                            section_name = ''
                            list_of_paragraphs = []
                            for sub_elem3 in sub_elem2:
                                if sub_elem3.tag[29:] == 'head':
                                    if bool(sub_elem3.attrib):
                                        section_number = str(sub_elem3.attrib)
                                        if section_number[-3] == '.':
                                            section_number = section_number[7:-3]
                                        else:
                                            section_number = section_number[7:-2]
                                    else:
                                        section_number = 'NO_SECTION_NUMBER'
                                    section_name = sub_elem3.text
                                if sub_elem3.tag[29:] == 'p':
                                    list_of_paragraphs.append(ET.tostring(sub_elem3, encoding='unicode'))
                            # commented logic skips NO_SECTION_NUMBER with no paragraphs, i.e., Table 1, Table 2, .....
                            # if section_number == 'NO_SECTION_NUMBER' and not bool(list_of_paragraphs):
                            if section_number == 'NO_SECTION_NUMBER':
                                pass
                            else:
                                list_of_sections.append([section_number, section_name, list_of_paragraphs])

#     print(list_of_sections)
    #==================================================================

    # NUMBERING PARAGRAPHS OF SECTIONS 
    # paragraphs are sequentially added in a dictionary
    # dictionary is added with each respective record as a 4th element

    # for section in list_of_sections:
    #     list_of_paragraphs = {}
    #     for paragraph_number, paragraph_text in enumerate(section[2], start=1):
    #         list_of_paragraphs[paragraph_number] = paragraph_text
    #     section.append(list_of_paragraphs)


    # # deleting the 3rd element from each record

    # for section in list_of_sections:
    #     section.remove(section[2])


    # OPTIMIZED VERSION
    for section in list_of_sections:
        dict_ = {i + 1: section[2][i] for i in range(len(section[2]))}
        section.append(dict_)
        section.remove(section[2])

#     print(list_of_sections)
    #==================================================================
    
    # MERGING ABSTRACT WITH OTHER SECTIONS

    document = abstract + list_of_sections
#     print(document)
    #==================================================================

    # PREPROCESSING TEXT

    starting_p_tag_pattern = r'<ns0:p[^>]+>'
    ending_p_tag_pattern = '</ns0:p>'
    starting_ref_tag_pattern = r'<ns0:ref[^>]+>'
    ending_ref_tag_pattern = '</ns0:ref>'
    ref_pattern = r'<ref>.*?</ref>'          # temporary for removing ref tag

    for record in document:
        for paragraph_number, paragraph_text in record[2].items():
            text = paragraph_text
            text = re.sub(starting_p_tag_pattern, '', text)
            text = re.sub(ending_p_tag_pattern, '', text)
            text = re.sub(starting_ref_tag_pattern, '<ref>', text)
            text = re.sub(ending_ref_tag_pattern, '</ref>', text)
            text = re.sub(ref_pattern, '', text)
            record[2][paragraph_number] = text

#     print(document)
    #==================================================================
    
    # MERGING METADATA WITH OTHER DOCUMENT PARTS

    document = metadata + document
#     print(document)
    
    return document

### Necessary Functions for:
***document_representer()*** \
***entity_representer()***

In [13]:
import spacy


def prefix():

    PREFIX_ONNER = "PREFIX onner: <http://purl.org/spatialai/onner/onner-full#>\n"
    PREFIX_DATA = "PREFIX data: <http://purl.org/spatialai/onner/onner-full/data#>\n"
    PREFIX_RDF = "PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>\n"
    PREFIX_RDFS = "PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>\n"
    PREFIX_XSD = "PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>\n"
    PREFIX_OWL = "PREFIX owl: <http://www.w3.org/2002/07/owl#>\n\n"
    
    return PREFIX_ONNER + PREFIX_DATA + PREFIX_RDF + PREFIX_RDFS + PREFIX_XSD + PREFIX_OWL


def nested_section_check(document, doi, section_number):

    section_ids = []

    for record in document:
        pattern = rf'^{section_number}\.[^.]+$'
        
        if re.search(pattern, record[0]):
            section_id = doi + '_' + str(record[0])
            section_ids.append('data:'+section_id)

    section_ids_joined = ', '.join(section_ids)

    return section_ids_joined


### RDF - Document Representation

In [14]:
def document_representer(document):
    # WRITING RDF TRIPLES FOR DOCUMENT

    # GETTING DOI
    try:
        doi = document[0][2]['DOI']
        doi = doi.replace('/', '_')
    except NameError:
        print('ERROR: Document object is not defined!')

    # PRINTING PREFIXES
    rdf_triple = ''

    # try:
    for record in document:

        # METADATA
        if record[1] == 'Metadata':

            abstract_id = 'data:' + doi + '_A'
            section_ids = []

            for section_number in document:
                if section_number[0] != '0' and '.' not in section_number[0]:
                    section_id = doi + '_' + section_number[0]
                    section_ids.append('data:'+section_id)

            directly_contained_sections = ', '.join(section_ids)

            # PRINTING METADATA
            rdf_triple += f"data:Publication_{doi} rdf:type onner:ScholarlyPublication ;\n"
            rdf_triple += f"onner:publicationTitle '{record[2]['Title']}'^^xsd:string ;\n"
            rdf_triple += f"onner:publicationDate '{record[2]['Publication Date']}'^^xsd:date ;\n"
            rdf_triple += f"onner:doi '{record[2]['DOI']}'^^xsd:string ;\n"
            rdf_triple += f"onner:directlyContainsDocumentPart {abstract_id}, {directly_contained_sections} .\n\n"

        # ABSTRACT AND IT'S DOCUMENT PARTS
        elif record[1] == 'Abstract':

            next_index = document.index(record) + 1
            next_section = document[next_index][0]
            paragraph_ids = []

            # CREATING PARAGRAPH ID FOR ABSTRACT'S PARAGRAPH 
            for paragraph_number, _ in record[2].items():
                paragraph_id = doi + '_A' + '-' + str(paragraph_number)
                paragraph_ids.append('data:'+paragraph_id)

            paragraph_ids_joined = ', '.join(paragraph_ids)

            # PRINTING ABSTRACT AND IT'S PARAGRAPH IDs 
            rdf_triple += f"data:{doi}_A rdf:type onner:Abstract ;\n"
            rdf_triple += f"onner:nextDocumentPart {paragraph_ids[0]} ;\n"    # NEXT DOC PART AFTER ABSTRACT
            rdf_triple += f"onner:directlyContainsDocumentPart {paragraph_ids_joined} .\n\n"

            # PARAGRAPHS AND IT'S DOCUMENT PARTS
            for paragraph_number, paragraph_text in record[2].items():

                # replacing ' with \' in text
                if "'" in paragraph_text:
                    paragraph_text = paragraph_text.replace("'", r"\'")

                # PRINTING PARAGRAPHS AND IT'S LABELED TERM IDs 
                rdf_triple += f"data:{doi}_A-{paragraph_number} rdf:type onner:Paragraph ;\n"
                rdf_triple += f"onner:positionInParentDocumentPart '{paragraph_number}'^^xsd:nonNegativeInteger ;\n"

                if paragraph_number == len(paragraph_ids):
                    rdf_triple += f"onner:nextDocumentPart data:{doi}_{next_section} ;\n"
                else:
                    rdf_triple += f"onner:nextDocumentPart data:{doi}_A-{paragraph_number+1} ;\n"

                rdf_triple += f"onner:paragraphText '{paragraph_text}'^^xsd:string .\n\n"

        # SECTION AND IT'S DOCUMENT PARTS
        else:
            section_number = record[0]
            section_name = record[1]
            next_index = document.index(record) + 1
            paragraph_ids = []

            if next_index == len(document):
                next_section = 'EndOfDocument'
            else:
                next_section = document[next_index][0]

            # IF N0 PARAGRAPHS EXIST BETWEEN A SECTION AND ITS IMMEDIATE SUBSECTION
            if not bool(record[2]):

                directly_contained_sections = nested_section_check(document, doi, section_number)

                rdf_triple += f"data:{doi}_{section_number} rdf:type onner:Section ;\n"
                rdf_triple += f"onner:sectionTitle '{section_name}'^^xsd:string ;\n"
                rdf_triple += f"onner:sectionNumber '{section_number}'^^xsd:string ;\n"
                rdf_triple += f"onner:nextDocumentPart data:{doi}_{next_section} ;\n"
                rdf_triple += f"onner:directlyContainsDocumentPart {directly_contained_sections} .\n\n"

            # IF PARAGRAPHS EXIST BETWEEN A SECTION AND ITS IMMEDIATE SUBSECTION
            else:

                directly_contained_sections = nested_section_check(document, doi, section_number)

                # CREATING PARAGRAPH ID FOR SECTION'S PARAGRAPH 
                for paragraph_number, _ in record[2].items():
                    paragraph_id = doi + '_' + str(section_number) + '-' + str(paragraph_number)
                    paragraph_ids.append('data:'+paragraph_id)

                paragraph_ids_joined = ', '.join(paragraph_ids)

                # PRINTING SECTION AND IT'S PARAGRAPH IDs 
                rdf_triple += f"data:{doi}_{section_number} rdf:type onner:Section ;\n"
                rdf_triple += f"onner:sectionTitle '{section_name}'^^xsd:string ;\n"
                rdf_triple += f"onner:sectionNumber '{section_number}'^^xsd:string ;\n"
                rdf_triple += f"onner:nextDocumentPart {paragraph_ids[0]} ;\n"    # NEXT DOC PART AFTER SECTION

                if bool(directly_contained_sections):
                    rdf_triple += f"onner:directlyContainsDocumentPart {paragraph_ids_joined}, {directly_contained_sections} .\n\n"
                else:
                    rdf_triple += f"onner:directlyContainsDocumentPart {paragraph_ids_joined} .\n\n"

                # PARAGRAPHS AND IT'S DOCUMENT PARTS
                for paragraph_number, paragraph_text in record[2].items():

                    # replacing ' with \' in text
                    if "'" in paragraph_text:
                        paragraph_text = paragraph_text.replace("'", r"\'")

                    # PRINTING PARAGRAPHS AND IT'S LABELED TERM IDs 
                    rdf_triple += f"data:{doi}_{section_number}-{paragraph_number} rdf:type onner:Paragraph ;\n"
                    rdf_triple += f"onner:positionInParentDocumentPart '{paragraph_number}'^^xsd:nonNegativeInteger ;\n"

                    if paragraph_number == len(paragraph_ids):
                        rdf_triple += f"onner:nextDocumentPart data:{doi}_{next_section} ;\n"
                    else:
                        rdf_triple += f"onner:nextDocumentPart data:{doi}_{section_number}-{paragraph_number+1} ;\n"

                    rdf_triple += f"onner:paragraphText '{paragraph_text}'^^xsd:string .\n\n"

    # except NameError:
    #     print('ERROR: Document object is not defined!')

    rdf_triple += f"data:{doi}_EndOfDocument rdf:type onner:EndOfDocument .\n"

    return doi, rdf_triple

### RDF - Named Entity Representation

In [15]:
def entity_representer(entities):
    rdf_triple = ''
    labels_in_doc = []
    labeling_schema = ['CHEMICAL', 
                       'MATERIAL', 
                       'STRUCTURE',
                       'PROPERTY',
                       'APPLICATION',
                       'PROCESS',
                       'EQUIPMENT',
                       'MEASUREMENT',
                       'ABBREVIATION']

    for _ in entities:
        paragraph_id = _[0]
        labeled_term_info_list = _[1]
        labeled_term_ids_joined = _[2]

        if bool(labeled_term_ids_joined):    # <= added logic for no labeled terms
            rdf_triple += f"data:{paragraph_id} onner:directlyContainsLabeledTerm {labeled_term_ids_joined} .\n\n"
        else:
            rdf_triple += f"data:{paragraph_id} onner:directlyContainsLabeledTerm data:NoLabeledTerm .\n\n"

        # LABELED TERMS
        for info in labeled_term_info_list:
            # PRINTING LABELED TERMS
            rdf_triple += f"data:{info[0]} rdf:type onner:LabeledTerm ;\n"    # DEAL WITH ATOMIC / COMPOUND
            rdf_triple += f"onner:labeledTermText '{info[1]}'^^xsd:string ;\n"
            rdf_triple += f"onner:offset '{info[3]}'^^xsd:nonNegativeInteger ;\n"
            rdf_triple += f"onner:length '{info[4]}'^^xsd:nonNegativeInteger ;\n"
            rdf_triple += f"onner:labeledTermDirectlyContainedBy data:{paragraph_id} ;\n"
            rdf_triple += f"onner:hasLabeledTermStatus data:Candidate_{info[0]} .\n\n"

            # PRINTING LABELED TERMS STATUS
            rdf_triple += f"data:Candidate_{info[0]} rdf:type onner:CandidateStatus ;\n"
            rdf_triple += f"onner:statusAssignmentDate '{info[5]}'^^xsd:dateTime ;\n"
            rdf_triple += f"onner:statusAssignedBy data:Cellulosic_NER_Model ;\n"

            try:
                if not bool(labeling_schema):
                    raise Exception('Labeling schema is empty!')

                label_number_in_schema = labeling_schema.index(info[2]) + 1
            except Exception as e:
                print(f'ERROR: {e}')
            else:
                rdf_triple += f"onner:hasLabeledTermLabel data:Label_{label_number_in_schema} .\n\n"

            # adding lebels and their position in the schema
            if [label_number_in_schema, info[2]] not in labels_in_doc:
                labels_in_doc.append([label_number_in_schema, info[2]])

    try:
        if not bool(labels_in_doc):
            raise Exception('List of labels found in document is empty!\n')     
        else:
            for label in labels_in_doc:
                rdf_triple += f"data:Label_{label[0]} rdf:type onner:Label ;\n"
                rdf_triple += f"onner:fromLabelingSchema data:Labeling_Schema ;\n"
                rdf_triple += f"onner:labelText '{label[1]}'^^xsd:string .\n\n"

    except Exception as e:
        print(f'ERROR: {e}')

    rdf_triple += f"data:Labeling_Schema rdf:type onner:LabelingSchema ;\n"
    rdf_triple += f"onner:schemaName 'CelloGraph'^^xsd:string .\n\n"

    rdf_triple += f"data:Cellulosic_NER_Model rdf:type onner:NER_System ;\n"    # if/else required to identify system and human
    rdf_triple += f"onner:systemVersion '1.0'^^xsd:string .\n\n"

    return rdf_triple

# GraphDB Im- and Exporter
### Insert Graph into Database

In [16]:
from SPARQLWrapper import SPARQLWrapper, POST

def import_graph(document_id, rdf_triple, rdf_type):

    repository_name = 'test-repo'
    database_url = f'http://localhost:7200/repositories/{repository_name}/statements'
    
    if rdf_type == 'document':
        named_graph = f'http://purl.org/spatialai/onner/data/{document_id}/document'
    elif rdf_type == 'entities':
        named_graph = f'http://purl.org/spatialai/onner/data/{document_id}/terms'

    # Construct SPARQL INSERT query with explicit prefixes and a named graph
    query = f'''
        {prefix()}

        INSERT DATA {{
            GRAPH <{named_graph}> {{
                {rdf_triple}
            }}
        }}
    '''

    # Execute via SPARQLWrapper
    sparql = SPARQLWrapper(database_url)
    sparql.setMethod(POST)
    sparql.setQuery(query)

    try:
        sparql.query()
        print(f'✅ RDF ({rdf_type}) successfully inserted into - {named_graph}')
    except Exception as e:
        print(f'❌ Error: {e}')
        
#     return named_graph


### Retrieve Paragraphs from Graph

In [17]:
from SPARQLWrapper import SPARQLWrapper, JSON
import pandas as pd
import json

def retrieve_paragraphs(document_id):
    # connect and query graph database
    # ADD TRY CATCH TO AVOID DATABASE CONNECTIVITY ERROR

    # specify the repository
    sparql = SPARQLWrapper("http://localhost:7200/repositories/test-repo")

    # SPARQL query
    # SPARQL query with named graph and error handling
    try:
        sparql.setQuery(f'''
            {prefix()}

            SELECT ?paragraphId ?paragraphText 
            WHERE {{
                    data:Publication_{document_id} rdf:type onner:ScholarlyPublication ;
                                   onner:containsDocumentPart ?paragraphId .

                    ?paragraphId rdf:type onner:Paragraph ;
                                 onner:paragraphText ?paragraphText .
            }}
        ''')

        # convert results to JSON
        sparql.setReturnFormat(JSON)
        fetched_data = sparql.query().convert()
    except Exception as e:
        print(f'Error querying the SPARQL endpoint: {e}')

    return fetched_data

# NER Model

In [18]:
def ner(paragraph_text):
    
    try:
        model_path = '/home/umayer/Work/_dev'
        model_name = 'output_sci_md_vector/model-best'
        nlp = spacy.load(f'{model_path}/{model_name}')
    except OSError:
        print('ERROR: Model Not Found!')
    else:
        print('Model loaded successfully.')

    doc = nlp(paragraph_text)
    
    print('Object created successfully.')
    
    return doc

# NER Tool Integrator

In [19]:
import json
from datetime import datetime


# Extract paragraphs from Graph DB produced data object
def get_paragraphs(fetched_data):
    
    paragraphs = []

    for paragraph in fetched_data['results']['bindings']:
        paragraph_id = paragraph['paragraphId']['value'].split('#')[1]
        paragraph_text = paragraph['paragraphText']['value']
        paragraphs.append([paragraph_id, paragraph_text, {'entities': []}])
        
    return paragraphs


# Create JSON with empty annotation for the annotation tool 
def create_empty_annotation():
    
    labels = [
        {'id': 1, 'name': 'CHEMICAL', 'color': 'red-11'},
        {'id': 2, 'name': 'MATERIAL', 'color': 'red-11'},
        {'id': 3, 'name': 'STRUCTURE', 'color': 'red-11'},
        {'id': 4, 'name': 'PROPERTY', 'color': 'red-11'},
        {'id': 5, 'name': 'APPLICATION', 'color': 'red-11'},
        {'id': 6, 'name': 'PROCESS', 'color': 'red-11'},
        {'id': 7, 'name': 'EQUIPMENT', 'color': 'red-11'},
        {'id': 8, 'name': 'MEASUREMENT', 'color': 'red-11'},
        {'id': 9, 'name': 'ABBREVIATION', 'color': 'red-11'}
    ]
    
    annotations = {'classes': labels, 'annotations': get_paragraphs()}
    json_data = json.dumps(annotations)
    
    return json_data


# 
def entity_generator(paragraphs):
    
    entities = list()
    
    for paragraph in paragraphs:
        paragraph_id = paragraph[0]
        paragraph_text = paragraph[1]
    
        print(f'Processing paragraph: {paragraph_id}')
        doc = ner(paragraph_text)
        
        all_entities = []
        entity_number = 1
        
        for ent in doc.ents:
            date_time = str(datetime.now())[:-7]    # RECHECK THE APPROPRIATE PLACEMENT
            entity_id = paragraph_id + '-' + str(entity_number)
            entity = ent.text
            label = ent.label_
            offset = ent.start_char
            length = ent.end_char - ent.start_char
            entity_info = [entity_id, entity, label, offset, length, date_time]
            all_entities.append(entity_info)
            entity_number += 1

        entity_ids = ['data:'+i[0] for i in all_entities]
        entity_ids_str = ', '.join(entity_ids) 
        entities.append([paragraph_id, all_entities, entity_ids_str])
        
        print('Entities processed successfully.')
        
    return entities


# Main File

In [20]:
import time
start_time = time.perf_counter()

document = xml_reader()
document_id, rdf_triple = document_representer(document)
import_graph(document_id, rdf_triple, 'document')
fetched_data = retrieve_paragraphs(document_id)
para = get_paragraphs(fetched_data)
ent = entity_generator(para)
rdf_triple_terms = entity_representer(ent)
import_graph(document_id, rdf_triple_terms, 'entities')

end_time = time.perf_counter()
execution_time = end_time - start_time
print(f'Execution time: {execution_time:.4f} seconds')

✅ RDF (document) successfully inserted into - http://purl.org/spatialai/onner/data/10.1016_j.indcrop.2015.03.075/document
Processing paragraph: 10.1016_j.indcrop.2015.03.075_A-1
Model loaded successfully.
Object created successfully.
Entities processed successfully.
Processing paragraph: 10.1016_j.indcrop.2015.03.075_1-1
Model loaded successfully.
Object created successfully.
Entities processed successfully.
Processing paragraph: 10.1016_j.indcrop.2015.03.075_1-2
Model loaded successfully.
Object created successfully.
Entities processed successfully.
Processing paragraph: 10.1016_j.indcrop.2015.03.075_1-3
Model loaded successfully.
Object created successfully.
Entities processed successfully.
Processing paragraph: 10.1016_j.indcrop.2015.03.075_1-4
Model loaded successfully.
Object created successfully.
Entities processed successfully.
Processing paragraph: 10.1016_j.indcrop.2015.03.075_1-5
Model loaded successfully.
Object created successfully.
Entities processed successfully.
Processi