In [1]:
import xml.etree.ElementTree as ET
import re
import os
import spacy
import scispacy
from datetime import datetime

### XML Structure
- teiHeader
    - fileDesc
        - titleStmt
            - title
        - publicationStmt
            - date
        - sourceDesc
            - biblStruct
                - idno
    - encodingDesc
    - profileDesc

### XRI (XML-to-RDF-Intermediate) Data Structure
- [
    - [
        - section_number,
        - section_title,
        - {
            - paragraph_number: paragraph_text,
        - }
    - ],
- ]

# RDF Generator
### Reading XML

In [2]:
def read_xml(file_name):
    """Parse a TEI XML publication into the XRI (XML-to-RDF-Intermediate) list.

    Parameters
    ----------
    file_name : str or path-like
        Path of the TEI XML file to read.

    Returns
    -------
    list
        ``[section_number, section_title, content]`` records in document
        order: one ``['0', 'Metadata', {...}]`` record per ``fileDesc`` (keys
        ``'Title'``, ``'Publication Date'``, ``'DOI'`` when present), one
        ``['0', 'Abstract', {n: text}]`` record per abstract ``div``, then one
        record per numbered body ``div`` whose content maps 1-based paragraph
        numbers to cleaned paragraph text.

    Raises
    ------
    KeyError
        If a ``publicationStmt/date`` element has no ``when`` attribute.

    Notes
    -----
    * Body sections whose ``<head>`` carries no ``n`` attribute (e.g. table or
      figure captions) are skipped.
    * Tag names are compared by local name, so the result does not depend on
      the length of the namespace URI.
    """

    def local_name(tag):
        # ElementTree spells qualified tags as '{namespace}name'.
        return tag.rsplit('}', 1)[-1]

    def children(parent, name):
        # Direct children only, in document order.
        return [child for child in parent if local_name(child.tag) == name]

    # LOADING XML AND CREATING ROOT
    root = ET.parse(f'{file_name}').getroot()

    # EXTRACTING METADATA (TITLE, PUBLICATION_DATE and DOI)
    metadata = []

    for header in children(root, 'teiHeader'):
        for file_desc in children(header, 'fileDesc'):
            publication_info = {}

            for title_stmt in children(file_desc, 'titleStmt'):
                for title in children(title_stmt, 'title'):
                    publication_info['Title'] = title.text

            for publication_stmt in children(file_desc, 'publicationStmt'):
                for date in children(publication_stmt, 'date'):
                    publication_info['Publication Date'] = date.attrib['when']

            # Prefer an <idno type="DOI">; without typed identifiers the last
            # <idno> wins.
            doi_found = False
            for source_desc in children(file_desc, 'sourceDesc'):
                for bibl_struct in children(source_desc, 'biblStruct'):
                    for idno in children(bibl_struct, 'idno'):
                        is_doi = (idno.get('type') or '').upper() == 'DOI'
                        if is_doi or not doi_found:
                            publication_info['DOI'] = idno.text
                        doi_found = doi_found or is_doi

            metadata.append(['0', 'Metadata', publication_info])

    # EXTRACTING ABSTRACT AND NUMBERING THE PARAGRAPHS
    abstract = []

    for header in children(root, 'teiHeader'):
        for profile_desc in children(header, 'profileDesc'):
            for abstract_elem in children(profile_desc, 'abstract'):
                for div in children(abstract_elem, 'div'):
                    numbered = {
                        number: ET.tostring(paragraph, encoding='unicode')
                        for number, paragraph in enumerate(children(div, 'p'), start=1)
                    }
                    abstract.append(['0', 'Abstract', numbered])

    # EXTRACTING SECTIONS
    # TODO: compare Rojas and Wolf section number
    # TODO: \ issue in RDF, see Koshkava 2014 paper
    list_of_sections = []

    for text_elem in children(root, 'text'):
        for body in children(text_elem, 'body'):
            for div in children(body, 'div'):
                section_number = ''
                section_name = ''
                paragraphs = []

                for child in div:
                    name = local_name(child.tag)
                    if name == 'head':
                        # Read the number from the `n` attribute directly, so
                        # extra attributes on <head> cannot corrupt it.
                        raw_number = child.attrib.get('n')
                        if raw_number is None:
                            section_number = 'NO_SECTION_NUMBER'
                        elif raw_number.endswith('.'):
                            section_number = raw_number[:-1]    # '2.1.' -> '2.1'
                        else:
                            section_number = raw_number
                        section_name = child.text
                    elif name == 'p':
                        paragraphs.append(ET.tostring(child, encoding='unicode'))

                # Unnumbered divs (Table 1, Table 2, ...) are not sections.
                if section_number == 'NO_SECTION_NUMBER':
                    continue

                # NUMBERING THE PARAGRAPHS OF SECTIONS
                numbered = {number: text for number, text in enumerate(paragraphs, start=1)}
                list_of_sections.append([section_number, section_name, numbered])

    # MERGING ABSTRACT WITH OTHER DOCUMENT PARTS
    document = abstract + list_of_sections

    # PREPROCESSING TEXT
    # Strip the serialized <ns0:p> wrapper and drop every <ns0:ref>...</ns0:ref>.
    starting_p_tag_pattern = re.compile(r'<ns0:p[^>]+>')
    ending_p_tag_pattern = re.compile('</ns0:p>')
    starting_ref_tag_pattern = re.compile(r'<ns0:ref[^>]+>')
    ending_ref_tag_pattern = re.compile('</ns0:ref>')
    ref_pattern = re.compile(r'<ref>.*?</ref>')          # temporary for removing ref tag

    for record in document:
        cleaned = {}
        for paragraph_number, paragraph_text in record[2].items():
            text = starting_p_tag_pattern.sub('', paragraph_text)
            text = ending_p_tag_pattern.sub('', text)
            text = starting_ref_tag_pattern.sub('<ref>', text)
            text = ending_ref_tag_pattern.sub('</ref>', text)
            text = ref_pattern.sub('', text)
            cleaned[paragraph_number] = text
        record[2] = cleaned

    # MERGING METADATA WITH OTHER DOCUMENT PARTS
    return metadata + document


### Necessary Functions for:
***represent_document()*** \
***represent_entity()***

In [3]:
# DEFINING PREFIXES
def define_prefix():
    """Return the SPARQL ``PREFIX`` header shared by every query.

    The declarations are emitted one per line (onner, data, rdf, rdfs, xsd,
    owl) and the header ends with a blank line.
    """
    declarations = (
        "PREFIX onner: <http://purl.org/spatialai/onner/onner-full#>\n",
        "PREFIX data: <http://purl.org/spatialai/onner/onner-full/data#>\n",
        "PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>\n",
        "PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>\n",
        "PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>\n",
        "PREFIX owl: <http://www.w3.org/2002/07/owl#>\n\n",
    )

    return "".join(declarations)


# SUBSECTIONS CHECKER FOR A SECTION
def check_nested_section(document, doi, section_number):
    """List the ids of the direct subsections of a section.

    A record is a direct subsection when its number is ``section_number``
    followed by exactly one more dot-separated component (``'2'`` -> ``'2.1'``
    but not ``'2.1.1'``).

    Parameters
    ----------
    document : list
        XRI records; only ``record[0]`` (the section number) is inspected.
    doi : str
        Document identifier used as the prefix of every generated id.
    section_number : str
        Number of the parent section.

    Returns
    -------
    str
        Comma-separated ``data:<doi>_<number>`` ids, or ``''`` when the
        section has no direct subsection.
    """
    # Escape the number: its dots must match literally, otherwise '1.2' would
    # also accept numbers such as '1x2.1'.
    direct_child = re.compile(rf'^{re.escape(str(section_number))}\.[^.]+$')

    section_ids = [
        'data:' + doi + '_' + str(record[0])
        for record in document
        if direct_child.search(record[0])
    ]

    return ', '.join(section_ids)


### RDF - Document Representation

In [4]:
# RDF WRITER FOR DOCUMENT
def represent_document(document):
    """Serialize an XRI document into the triples of its document graph.

    Parameters
    ----------
    document : list
        ``[section_number, section_title, content]`` records. The first record
        must be the metadata record whose content dict holds ``'DOI'``,
        ``'Title'`` and ``'Publication Date'``.

    Returns
    -------
    tuple
        ``(document_id, rdf_triple)``: the DOI with ``/`` replaced by ``_``,
        and the triples as text (without ``PREFIX`` declarations).

    Raises
    ------
    IndexError, KeyError, TypeError, AttributeError
        If the DOI cannot be read from the first record; a message is printed
        before the error propagates.
    """

    def literal(value):
        # Escape the characters that may not appear raw inside a
        # single-quoted SPARQL string literal.
        return (str(value)
                .replace('\\', '\\\\')
                .replace("'", "\\'")
                .replace('\n', '\\n')
                .replace('\r', '\\r'))

    def paragraph_triples(parent_id, paragraphs, following_part):
        # One onner:Paragraph node per paragraph, chained via nextDocumentPart;
        # the last paragraph points at `following_part`.
        triples = ''
        last_number = len(paragraphs)
        for paragraph_number, paragraph_text in paragraphs.items():
            if paragraph_number == last_number:
                next_part = following_part
            else:
                next_part = f"data:{parent_id}-{paragraph_number+1}"

            triples += f"data:{parent_id}-{paragraph_number} rdf:type onner:Paragraph ;\n"
            triples += f"onner:positionInParentDocumentPart '{paragraph_number}'^^xsd:nonNegativeInteger ;\n"
            triples += f"onner:nextDocumentPart {next_part} ;\n"
            triples += f"onner:paragraphText '{literal(paragraph_text)}'^^xsd:string .\n\n"
        return triples

    # GETTING DOI
    try:
        doi = document[0][2]['DOI']
        document_id = doi.replace('/', '_')
    except (IndexError, KeyError, TypeError, AttributeError):
        print('ERROR: Document has no metadata record with a DOI!')
        raise

    rdf_triple = ''

    # WRITING RDF
    # enumerate() gives the true position even when two records compare equal.
    for index, record in enumerate(document):
        if index + 1 < len(document):
            next_section = document[index + 1][0]
        else:
            next_section = 'EndOfDocument'
        following_part = f"data:{document_id}_{next_section}"

        # metadata
        if record[0] == '0' and record[1] == 'Metadata':
            # The abstract plus every top-level section (number without a dot).
            contained_parts = ['data:' + document_id + '_A']
            contained_parts += [
                'data:' + document_id + '_' + other[0]
                for other in document
                if other[0] != '0' and '.' not in other[0]
            ]

            rdf_triple += f"data:Publication_{document_id} rdf:type onner:ScholarlyPublication ;\n"
            rdf_triple += f"onner:publicationTitle '{literal(record[2]['Title'])}'^^xsd:string ;\n"
            rdf_triple += f"onner:publicationDate '{record[2]['Publication Date']}'^^xsd:date ;\n"
            rdf_triple += f"onner:doi '{literal(doi)}'^^xsd:string ;\n"
            rdf_triple += f"onner:directlyContainsDocumentPart {', '.join(contained_parts)} .\n\n"

        # abstract and its document parts
        elif record[0] == '0' and record[1] == 'Abstract':
            abstract_id = f"{document_id}_A"
            paragraph_ids = [f"data:{abstract_id}-{number}" for number in record[2]]

            rdf_triple += f"data:{abstract_id} rdf:type onner:Abstract ;\n"
            if paragraph_ids:
                rdf_triple += f"onner:nextDocumentPart {paragraph_ids[0]} ;\n"    # NEXT DOC PART AFTER ABSTRACT
                rdf_triple += f"onner:directlyContainsDocumentPart {', '.join(paragraph_ids)} .\n\n"
            else:
                # Empty abstract: nothing is contained, continue with the next part.
                rdf_triple += f"onner:nextDocumentPart {following_part} .\n\n"

            rdf_triple += paragraph_triples(abstract_id, record[2], following_part)

        # section and its document parts
        else:
            section_number = record[0]
            section_name = record[1]
            section_id = f"{document_id}_{section_number}"

            paragraph_ids = [f"data:{section_id}-{number}" for number in record[2]]
            directly_contained_sections = check_nested_section(document, document_id, section_number)

            contained_parts = list(paragraph_ids)
            if directly_contained_sections:
                contained_parts.append(directly_contained_sections)

            # The section is followed by its first paragraph, or by the next
            # record when it has no paragraphs of its own.
            first_part = paragraph_ids[0] if paragraph_ids else following_part

            rdf_triple += f"data:{section_id} rdf:type onner:Section ;\n"
            rdf_triple += f"onner:sectionTitle '{literal(section_name)}'^^xsd:string ;\n"
            rdf_triple += f"onner:sectionNumber '{literal(section_number)}'^^xsd:string ;\n"

            if contained_parts:
                rdf_triple += f"onner:nextDocumentPart {first_part} ;\n"    # NEXT DOC PART AFTER SECTION
                rdf_triple += f"onner:directlyContainsDocumentPart {', '.join(contained_parts)} .\n\n"
            else:
                # Neither paragraphs nor subsections: omit the containment
                # property rather than writing it with an empty object.
                rdf_triple += f"onner:nextDocumentPart {first_part} .\n\n"

            rdf_triple += paragraph_triples(section_id, record[2], following_part)

    rdf_triple += f"data:{document_id}_EndOfDocument rdf:type onner:EndOfDocument .\n"

    return document_id, rdf_triple


### RDF - Named Entity Representation

In [5]:
# RDF WRITER FOR NAMED ENTITIES
def represent_entity(entities):
    """Serialize the labeled terms of a document into the triples of its term graph.

    Parameters
    ----------
    entities : list
        One entry per paragraph: ``[paragraph_id, labeled_term_info_list]``.
        Each term info is ``[term_id, text, label, offset, length, date_time]``.

    Returns
    -------
    str
        The triples as text (without ``PREFIX`` declarations). Paragraphs
        without terms are linked to ``data:NoLabeledTerm``.

    Notes
    -----
    A term whose label is not part of the labeling schema is reported with a
    printed error; its status node is still written, without a label link.
    """

    def literal(value):
        # Escape the characters that may not appear raw inside a
        # single-quoted SPARQL string literal.
        return (str(value)
                .replace('\\', '\\\\')
                .replace("'", "\\'")
                .replace('\n', '\\n')
                .replace('\r', '\\r'))

    rdf_triple = ''
    labels_in_doc = []
    labeling_schema = ['CHEMICAL',
                       'MATERIAL',
                       'STRUCTURE',
                       'PROPERTY',
                       'APPLICATION',
                       'PROCESS',
                       'EQUIPMENT',
                       'MEASUREMENT',
                       'ABBREVIATION']

    # WRITING RDF
    for entry in entities:
        paragraph_id = entry[0]
        labeled_term_info_list = entry[1]

        if not labeled_term_info_list:    # no term found in paragraph
            rdf_triple += f"data:{paragraph_id} onner:directlyContainsLabeledTerm data:NoLabeledTerm .\n\n"
            continue

        labeled_term_ids_joined = ', '.join('data:' + info[0] for info in labeled_term_info_list)
        rdf_triple += f"data:{paragraph_id} onner:directlyContainsLabeledTerm {labeled_term_ids_joined} .\n\n"

        for info in labeled_term_info_list:
            term_id = info[0]
            label = info[2]

            rdf_triple += f"data:{term_id} rdf:type onner:LabeledTerm ;\n"    # deal with atomic and compound terms
            rdf_triple += f"onner:labeledTermText '{literal(info[1])}'^^xsd:string ;\n"
            rdf_triple += f"onner:offset '{info[3]}'^^xsd:nonNegativeInteger ;\n"
            rdf_triple += f"onner:length '{info[4]}'^^xsd:nonNegativeInteger ;\n"
            rdf_triple += f"onner:labeledTermDirectlyContainedBy data:{paragraph_id} ;\n"
            rdf_triple += f"onner:hasLabeledTermStatus data:Candidate_{term_id} .\n\n"

            rdf_triple += f"data:Candidate_{term_id} rdf:type onner:CandidateStatus ;\n"
            rdf_triple += f"onner:statusAssignmentDate '{info[5]}'^^xsd:dateTime ;\n"

            if label in labeling_schema:
                # Labels are identified by their 1-based position in the schema.
                label_number_in_schema = labeling_schema.index(label) + 1

                rdf_triple += f"onner:statusAssignedBy data:Cellulosic_NER_Model ;\n"
                rdf_triple += f"onner:hasLabeledTermLabel data:Label_{label_number_in_schema} .\n\n"

                # adding labels and their positions in the schema
                if [label_number_in_schema, label] not in labels_in_doc:
                    labels_in_doc.append([label_number_in_schema, label])
            else:
                print(f'ERROR: Label {label!r} is not in the labeling schema!')
                # Close the status node so the following statements stay valid.
                rdf_triple += f"onner:statusAssignedBy data:Cellulosic_NER_Model .\n\n"

    if labels_in_doc:
        for label_number, label_text in labels_in_doc:
            rdf_triple += f"data:Label_{label_number} rdf:type onner:Label ;\n"
            rdf_triple += f"onner:fromLabelingSchema data:Labeling_Schema ;\n"
            rdf_triple += f"onner:labelText '{label_text}'^^xsd:string .\n\n"
    else:
        print('ERROR: List of labels found in document is empty!\n')

    rdf_triple += f"data:Labeling_Schema rdf:type onner:LabelingSchema ;\n"
    rdf_triple += f"onner:schemaName 'CelloGraph'^^xsd:string .\n\n"

    rdf_triple += f"data:Cellulosic_NER_Model rdf:type onner:NER_System ;\n"    # if/else required to identify system and human
    rdf_triple += f"onner:systemVersion '1.0'^^xsd:string .\n\n"

    return rdf_triple


### BACKUP CODE >> RDF - Named Entity Representation

In [6]:
# # RDF WRITER FOR NAMED ENTITIES
# def represent_entity(entities):
    
#     rdf_triple = ''
#     labels_in_doc = []
#     labeling_schema = ['CHEMICAL', 
#                        'MATERIAL', 
#                        'STRUCTURE',
#                        'PROPERTY',
#                        'APPLICATION',
#                        'PROCESS',
#                        'EQUIPMENT',
#                        'MEASUREMENT',
#                        'ABBREVIATION']

#     # WRITING RDF
#     for i in entities:
#         paragraph_id = i[0]
#         labeled_term_info_list = i[1]
#         labeled_term_ids_joined = i[2]

#         if bool(labeled_term_ids_joined):    # if no term found in paragraph
#             rdf_triple += f"data:{paragraph_id} onner:directlyContainsLabeledTerm {labeled_term_ids_joined} .\n\n"
#         else:
#             rdf_triple += f"data:{paragraph_id} onner:directlyContainsLabeledTerm data:NoLabeledTerm .\n\n"

#         for info in labeled_term_info_list:
#             rdf_triple += f"data:{info[0]} rdf:type onner:LabeledTerm ;\n"    # deal with atomic and compound terms
#             rdf_triple += f"onner:labeledTermText '{info[1]}'^^xsd:string ;\n"
#             rdf_triple += f"onner:offset '{info[3]}'^^xsd:nonNegativeInteger ;\n"
#             rdf_triple += f"onner:length '{info[4]}'^^xsd:nonNegativeInteger ;\n"
#             rdf_triple += f"onner:labeledTermDirectlyContainedBy data:{paragraph_id} ;\n"
#             rdf_triple += f"onner:hasLabeledTermStatus data:Candidate_{info[0]} .\n\n"

#             rdf_triple += f"data:Candidate_{info[0]} rdf:type onner:CandidateStatus ;\n"
#             rdf_triple += f"onner:statusAssignmentDate '{info[5]}'^^xsd:dateTime ;\n"
#             rdf_triple += f"onner:statusAssignedBy data:Cellulosic_NER_Model ;\n"

#             try:
#                 if not bool(labeling_schema):
#                     raise Exception('Labeling schema is empty!')
#                 label_number_in_schema = labeling_schema.index(info[2]) + 1
#             except Exception as e:
#                 print(f'ERROR: {e}')
#             else:
#                 rdf_triple += f"onner:hasLabeledTermLabel data:Label_{label_number_in_schema} .\n\n"

#             # adding lebels and their positions in the schema
#             if [label_number_in_schema, info[2]] not in labels_in_doc:
#                 labels_in_doc.append([label_number_in_schema, info[2]])

#     try:
#         if not bool(labels_in_doc):
#             raise Exception('List of labels found in document is empty!\n')     
#         else:
#             for label in labels_in_doc:
#                 rdf_triple += f"data:Label_{label[0]} rdf:type onner:Label ;\n"
#                 rdf_triple += f"onner:fromLabelingSchema data:Labeling_Schema ;\n"
#                 rdf_triple += f"onner:labelText '{label[1]}'^^xsd:string .\n\n"
#     except Exception as e:
#         print(f'ERROR: {e}')

#     rdf_triple += f"data:Labeling_Schema rdf:type onner:LabelingSchema ;\n"
#     rdf_triple += f"onner:schemaName 'CelloGraph'^^xsd:string .\n\n"

#     rdf_triple += f"data:Cellulosic_NER_Model rdf:type onner:NER_System ;\n"    # if/else required to identify system and human
#     rdf_triple += f"onner:systemVersion '1.0'^^xsd:string .\n\n"

#     return rdf_triple


# GraphDB Importer and Exporter
### Insert Graph into Database

In [7]:
from SPARQLWrapper import SPARQLWrapper, POST

# DATA INSERTION INTO GRAPH DB
def insert_graph(document_id, rdf_triple, rdf_type,
                 repository_name='test-repo', base_url='http://localhost:7200'):
    """Insert triples into a named graph of the graph database.

    Parameters
    ----------
    document_id : str
        Identifier used to build the named-graph IRI.
    rdf_triple : str
        Triples to insert (without ``PREFIX`` declarations).
    rdf_type : str
        ``'document'`` (graph ``.../<document_id>/document``) or
        ``'entities'`` (graph ``.../<document_id>/terms``).
    repository_name : str, optional
        Repository to write to.
    base_url : str, optional
        Base URL of the database server.

    Raises
    ------
    ValueError
        If ``rdf_type`` is not one of the supported values.

    Notes
    -----
    A failed update is reported with a printed message and not re-raised.
    """
    graph_suffixes = {'document': 'document', 'entities': 'terms'}

    # Validate before doing any work: an unknown type has no target graph.
    if rdf_type not in graph_suffixes:
        raise ValueError(
            f"Unknown rdf_type {rdf_type!r}; expected one of {sorted(graph_suffixes)}"
        )

    database_url = f'{base_url}/repositories/{repository_name}/statements'
    named_graph = f'http://purl.org/spatialai/onner/data/{document_id}/{graph_suffixes[rdf_type]}'

    # query => insert data using named graph
    query = f'''
        {define_prefix()}

        INSERT DATA {{
            GRAPH <{named_graph}> {{
                {rdf_triple}
            }}
        }}
    '''

    # execute query
    sparql = SPARQLWrapper(database_url)
    sparql.setMethod(POST)
    sparql.setQuery(query)

    try:
        sparql.query()
        print(f'✔️ RDF ({rdf_type}) successfully inserted into - {named_graph}')
    except Exception as e:
        print(f'❌ Error: {e}')


### Retrieve Paragraphs from Graph

In [8]:
from SPARQLWrapper import SPARQLWrapper, JSON
import pandas as pd
import json

# DATA RETRIEVAL FROM GRAPH DB
def get_document_content(document_id,
                         repository_name='test-repo', base_url='http://localhost:7200'):
    """Fetch the id and text of every paragraph of a publication.

    Parameters
    ----------
    document_id : str
        Identifier of the publication (``data:Publication_<document_id>``).
    repository_name : str, optional
        Repository to query.
    base_url : str, optional
        Base URL of the database server.

    Returns
    -------
    dict
        The query result converted from JSON, with ``?paragraphId`` and
        ``?paragraphText`` bindings.

    Raises
    ------
    Exception
        Any error raised while querying is printed and then re-raised, so a
        failed query is not mistaken for an empty result.
    """
    # specify the repository
    sparql = SPARQLWrapper(f"{base_url}/repositories/{repository_name}")

    # query => retrieving data
    # NOTE(review): the query matches onner:containsDocumentPart while the
    # document graph is written with onner:directlyContainsDocumentPart;
    # presumably the repository infers the former - confirm.
    try:
        sparql.setQuery(f'''
            {define_prefix()}

            SELECT ?paragraphId ?paragraphText 
            WHERE {{
                    data:Publication_{document_id} rdf:type onner:ScholarlyPublication ;
                                   onner:containsDocumentPart ?paragraphId .

                    ?paragraphId rdf:type onner:Paragraph ;
                                 onner:paragraphText ?paragraphText .
            }}
        ''')
        # convert results to JSON
        sparql.setReturnFormat(JSON)
        exported_data = sparql.query().convert()
    except Exception as e:
        print(f'Error querying the SPARQL endpoint: {e}')
        raise

    return exported_data


# NER Model

In [9]:
import spacy

# NLP MODEL LOADER
def ner(paragraph_text):
    """Run the NER model over one paragraph.

    The spaCy pipeline is loaded on the first call and cached on the function
    object (``ner.nlp``), so later calls reuse it.

    Parameters
    ----------
    paragraph_text : str
        Text to analyse.

    Returns
    -------
    The object returned by the loaded pipeline for ``paragraph_text``.

    Raises
    ------
    OSError
        If the model cannot be loaded; a message is printed first.
    """
    if not hasattr(ner, 'nlp'):
        model_name = '/home/umayer/Work/_dev/output_sci_md_vector/model-best'
        try:
            ner.nlp = spacy.load(f'{model_name}')
        except OSError:
            print('ERROR: Model Not Found!')
            # Without a model there is nothing to run; surface the real cause.
            raise
        print('✔️ Model loaded successfully.')

    return ner.nlp(paragraph_text)


# NER Tool Integrator

In [10]:
import json
from datetime import datetime

# PARAGRAPHS EXTRACTOR FROM GRAPH DB DATA
def get_paragraph(exported_data):
    """Turn a SPARQL JSON result into a list of paragraph records.

    Each binding yields ``[paragraph_id, paragraph_text, {'entities': []}]``,
    where ``paragraph_id`` is the part of the ``paragraphId`` IRI after the
    first ``#``.
    """
    return [
        [
            binding['paragraphId']['value'].split('#')[1],
            binding['paragraphText']['value'],
            {'entities': []},
        ]
        for binding in exported_data['results']['bindings']
    ]


# JSON PRODUCER FOR ANNOTATION TOOL 
def create_empty_annotation(exported_data=None):
    """Build the JSON document consumed by the annotation tool.

    Parameters
    ----------
    exported_data : dict, optional
        SPARQL JSON result to take the paragraphs from (passed to
        ``get_paragraph``). When omitted, the annotation list is empty.

    Returns
    -------
    str
        JSON text with the label ``classes`` and the (not yet annotated)
        paragraphs under ``annotations``.
    """
    label_names = ('CHEMICAL', 'MATERIAL', 'STRUCTURE', 'PROPERTY', 'APPLICATION',
                   'PROCESS', 'EQUIPMENT', 'MEASUREMENT', 'ABBREVIATION')
    labels = [
        {'id': label_id, 'name': label_name, 'color': 'red-11'}
        for label_id, label_name in enumerate(label_names, start=1)
    ]

    # get_paragraph() needs the exported query result; without one there are
    # no paragraphs to offer for annotation.
    paragraphs = get_paragraph(exported_data) if exported_data is not None else []

    annotations = {'classes': labels, 'annotations': paragraphs}
    json_data = json.dumps(annotations)

    return json_data


# NAMED ENTITIES GENERATOR
def generate_entity(paragraphs):
    """Run the NER model over every paragraph and collect the found entities.

    Parameters
    ----------
    paragraphs : list
        Paragraph records whose first two items are the paragraph id and the
        paragraph text.

    Returns
    -------
    list
        One ``[paragraph_id, entities_in_paragraph]`` entry per paragraph.
        Each entity is ``[entity_id, text, label, offset, length, date_time]``
        where ``entity_id`` is ``<paragraph_id>-<n>`` (1-based), ``offset`` and
        ``length`` are in characters, and ``date_time`` is an ISO 8601
        timestamp with second precision (``YYYY-MM-DDTHH:MM:SS``).
    """
    entities_in_document = []

    for paragraph in paragraphs:
        paragraph_id = paragraph[0]
        paragraph_text = paragraph[1]

        doc = ner(paragraph_text)

        entities_in_paragraph = []

        for entity_number, ent in enumerate(doc.ents, start=1):
            # ISO format keeps the 'T' separator required by xsd:dateTime and
            # does not depend on the microsecond part being printed.
            date_time = datetime.now().isoformat(timespec='seconds')    # RECHECK THE APPROPRIATE PLACEMENT
            entity_id = paragraph_id + '-' + str(entity_number)
            length = ent.end_char - ent.start_char
            entities_in_paragraph.append(
                [entity_id, ent.text, ent.label_, ent.start_char, length, date_time]
            )

        entities_in_document.append([paragraph_id, entities_in_paragraph])

        print(f'Entity processed for {paragraph_id}')

    return entities_in_document


# Main File

In [None]:
# End-to-end run for one publication: XML -> document graph -> NER -> term graph.
import time
start_time = time.perf_counter()    # wall-clock timer for the whole pipeline

# Parse the TEI XML file into the XRI record list.
document = read_xml('/home/umayer/_div/OnNER/evaluation/example/input/Publication_10.1016_j.indcrop.2015.03.075.xml')
# Serialize the document structure and store it in the '.../document' named graph.
document_id, rdf_document = represent_document(document)
insert_graph(document_id, rdf_document, 'document')
# Read the paragraphs back from the database and run the NER model over them.
exported_data = get_document_content(document_id)
paragraphs = get_paragraph(exported_data)
entities = generate_entity(paragraphs)
# Serialize the labeled terms and store them in the '.../terms' named graph.
rdf_entities = represent_entity(entities)
insert_graph(document_id, rdf_entities, 'entities')

end_time = time.perf_counter()
execution_time = end_time - start_time
print(f'Execution time: {execution_time:.4f} seconds')


✔️ RDF (document) successfully inserted into - http://purl.org/spatialai/onner/data/10.1016_j.indcrop.2015.03.075/document


  from .autonotebook import tqdm as notebook_tqdm
  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(
