In [1]:
import xml.etree.ElementTree as ET
import re
import os
import spacy
import scispacy
from datetime import datetime

### XML Reader

In [19]:
# LOADING XML AND CREATING ROOT

# NOTE(review): absolute, machine-specific path -- consider moving it to a
# configuration cell so the notebook can run on another machine.
xml_file = 'D:\\Drive\\SISE\\CelloGraph\\Dev\\Data\\XML\\Rojas 2016 - Cellulose Chemistry And Properties - Chapter1.tei.xmlScitex.xml'
tree = ET.parse(xml_file)
root = tree.getroot()

# EXTRACTING METADATA (available data: title, publication_date and doi)
#
# Builds `metadata`: one ['0', 'Metadata', {...}] record per
# <teiHeader>/<fileDesc>, holding the keys 'Title', 'Publication Date' and
# 'DOI' for whichever of them the file provides.

# Elements are matched by their qualified TEI name instead of slicing a fixed
# number of characters off `elem.tag`, which silently matched nothing whenever
# the namespace URI had a different length.
TEI_NS = {'tei': 'http://www.tei-c.org/ns/1.0'}

metadata = []

for file_desc in root.findall('tei:teiHeader/tei:fileDesc', TEI_NS):
    publication_info = {}

    # When an element occurs more than once, the last occurrence wins.
    for title in file_desc.findall('tei:titleStmt/tei:title', TEI_NS):
        publication_info['Title'] = title.text

    for date in file_desc.findall('tei:publicationStmt/tei:date', TEI_NS):
        # A <date> without a `when` attribute is skipped rather than raising KeyError.
        when = date.get('when')
        if when is not None:
            publication_info['Publication Date'] = when

    # Only <idno> elements that are direct children of <biblStruct> are read.
    for idno in file_desc.findall('tei:sourceDesc/tei:biblStruct/tei:idno', TEI_NS):
        publication_info['DOI'] = idno.text

    metadata.append(['0', 'Metadata', publication_info])

print(metadata)

[['0', 'Metadata', {'Title': 'Cellulose: Structure and Properties', 'Publication Date': '2015-09-10', 'DOI': '10.1007/12_2015_319'}]]


In [12]:
# EXTRACTING ABSTRACT AND NUMBERING PARAGRAPHS
#
# Builds `abstract`: one ['0', 'Abstract', {paragraph_number: xml_string}]
# record per <div> found under <teiHeader>/<profileDesc>/<abstract>.
# Paragraph numbers start at 1 and follow document order; each value is the
# serialized <p> element, markup included.

# Elements are matched by their qualified TEI name instead of slicing a fixed
# number of characters off `elem.tag`, which silently matched nothing whenever
# the namespace URI had a different length.
TEI_NS = {'tei': 'http://www.tei-c.org/ns/1.0'}

abstract = []

for div in root.findall('tei:teiHeader/tei:profileDesc/tei:abstract/tei:div', TEI_NS):
    list_of_paragraphs = {
        paragraph_number: ET.tostring(paragraph, encoding='unicode')
        for paragraph_number, paragraph in enumerate(div.findall('tei:p', TEI_NS), start=1)
    }
    abstract.append(['0', 'Abstract', list_of_paragraphs])

print(abstract)

[['0', 'Abstract', {1: '<ns0:p xmlns:ns0="http://www.tei-c.org/ns/1.0">This work was aimed to evaluate the performance of two different surface modifications on cellulose nano-particles obtained from blue agave bagasse. Cellulose nanofibers were obtained from traditional mechanical methods, and cellulose nanocrystals were obtained via sulfuric acid hydrolysis. After cellulose extraction and size reduction, different composites were elaborated using poly(lactic acid) as matrix varying fiber concentrations and using cellulosic particles of diverse size and surface modification (3aminopropyl triethoxysilane silanized cellulose nanofibers and dodecanoyl chloride esterified cellulose nanocrystals). Non-cellulosic elimination and the effect of surface modifications were analyzed with infrared analysis, showing characteristic intensities for cellulosic species and new peaks for each modification (characteristic ester peak ∼1740 cm −1 , and amino peak ∼1550 cm −1 ) cellulose nanofibers and nan

In [20]:
# EXTRACTING OTHER SECTIONS
#
# Builds `list_of_sections`: one [section_number, section_name, paragraphs]
# record per <div> directly under <text>/<body>, where `paragraphs` is the
# list of serialized <p> children (markup included) in document order.

# TODO: add logic: if a paragraph does not have any labeled term
# TODO: \ issue in RDF, see Koshkava 2014 paper

# Elements are matched by their qualified TEI name instead of slicing a fixed
# number of characters off `elem.tag`, which silently matched nothing whenever
# the namespace URI had a different length.
TEI_NS = {'tei': 'http://www.tei-c.org/ns/1.0'}

list_of_sections = []

for div in root.findall('tei:text/tei:body/tei:div', TEI_NS):
    # A <div> without any <head> keeps these empty strings and is still recorded.
    section_number = ''
    section_name = ''

    # When a <div> carries several <head> elements, the last one wins.
    for head in div.findall('tei:head', TEI_NS):
        # The number is read from the `n` attribute directly; slicing the repr
        # of the attribute dict only worked when `n` was the sole attribute.
        numbering = head.get('n')
        if numbering is None:
            section_number = 'NO_SECTION_NUMBER'
        elif numbering.endswith('.'):
            section_number = numbering[:-1]    # '2.1.' -> '2.1'
        else:
            section_number = numbering
        section_name = head.text

    list_of_paragraphs = [
        ET.tostring(paragraph, encoding='unicode')
        for paragraph in div.findall('tei:p', TEI_NS)
    ]

    # Unnumbered divisions (i.e., Table 1, Table 2, .....) are dropped entirely.
    # A narrower alternative would drop them only when they have no paragraphs:
    #     if section_number == 'NO_SECTION_NUMBER' and not list_of_paragraphs
    if section_number != 'NO_SECTION_NUMBER':
        list_of_sections.append([section_number, section_name, list_of_paragraphs])

print(list_of_sections)

[['1', 'Introduction', ['<ns0:p xmlns:ns0="http://www.tei-c.org/ns/1.0">Cellulose is the most abundant natural polymer in the biosphere, with a global production (and decomposition) of ~1.5 Â 10 <super>12</super> tons per year, comparable to the planetary reserves of the main fossil and mineral sources <ns0:ref type="bibr" target="#b0">[1]</ns0:ref>. In addition to the long-standing scientific interest in cellulose, the use of cellulose as renewable and biodegradable raw material in various applications is a proposed solution to the recent industrial challenge to successfully meet environmental and recycling problems <ns0:ref type="bibr" target="#b1">[2]</ns0:ref>. Versatile structuring of cellulose by various routes of modification, including both physical and chemical methods, has enabled its use in a variety of applications (e.g., fillers, building and coating materials, laminates, papers, textiles, optical films, sorption media, viscosity regulators, and even advanced functional ma

In [21]:
# NUMBERING PARAGRAPHS OF SECTIONS
# Every record gains a 4th element: a dictionary mapping 1, 2, 3, ... to the
# entries of its 3rd element (the paragraph list), in their existing order.

for section in list_of_sections:
    numbered_paragraphs = dict(enumerate(section[2], start=1))
    section.append(numbered_paragraphs)

print(list_of_sections)

[['1', 'Introduction', ['<ns0:p xmlns:ns0="http://www.tei-c.org/ns/1.0">Cellulose is the most abundant natural polymer in the biosphere, with a global production (and decomposition) of ~1.5 Â 10 <super>12</super> tons per year, comparable to the planetary reserves of the main fossil and mineral sources <ns0:ref type="bibr" target="#b0">[1]</ns0:ref>. In addition to the long-standing scientific interest in cellulose, the use of cellulose as renewable and biodegradable raw material in various applications is a proposed solution to the recent industrial challenge to successfully meet environmental and recycling problems <ns0:ref type="bibr" target="#b1">[2]</ns0:ref>. Versatile structuring of cellulose by various routes of modification, including both physical and chemical methods, has enabled its use in a variety of applications (e.g., fillers, building and coating materials, laminates, papers, textiles, optical films, sorption media, viscosity regulators, and even advanced functional ma

In [22]:
# DELETING THE 3RD ELEMENT FROM EACH RECORD
#
# After the numbering step a record is [number, name, paragraph_list,
# paragraph_dict]; this step drops the raw list so the dict becomes element 2.

for section in list_of_sections:
    # Delete by position (list.remove() deletes the first *equal* element, not
    # index 2), and only while the raw list is still present next to its
    # numbered dict, so re-running the cell cannot delete the dict as well.
    if len(section) > 3 and isinstance(section[2], list):
        del section[2]

print(list_of_sections)

[['1', 'Introduction', {1: '<ns0:p xmlns:ns0="http://www.tei-c.org/ns/1.0">Cellulose is the most abundant natural polymer in the biosphere, with a global production (and decomposition) of ~1.5 Â 10 <super>12</super> tons per year, comparable to the planetary reserves of the main fossil and mineral sources <ns0:ref type="bibr" target="#b0">[1]</ns0:ref>. In addition to the long-standing scientific interest in cellulose, the use of cellulose as renewable and biodegradable raw material in various applications is a proposed solution to the recent industrial challenge to successfully meet environmental and recycling problems <ns0:ref type="bibr" target="#b1">[2]</ns0:ref>. Versatile structuring of cellulose by various routes of modification, including both physical and chemical methods, has enabled its use in a variety of applications (e.g., fillers, building and coating materials, laminates, papers, textiles, optical films, sorption media, viscosity regulators, and even advanced functional

In [23]:
# MERGING ABSTRACT WITH OTHER SECTIONS
# The new list holds the very same record objects as `abstract` and
# `list_of_sections` (records are not copied), abstract records first.

document = [*abstract, *list_of_sections]
print(document)

[['0', 'Abstract', {1: 'This work was aimed to evaluate the performance of two different surface modifications on cellulose nano-particles obtained from blue agave bagasse. Cellulose nanofibers were obtained from traditional mechanical methods, and cellulose nanocrystals were obtained via sulfuric acid hydrolysis. After cellulose extraction and size reduction, different composites were elaborated using poly(lactic acid) as matrix varying fiber concentrations and using cellulosic particles of diverse size and surface modification (3aminopropyl triethoxysilane silanized cellulose nanofibers and dodecanoyl chloride esterified cellulose nanocrystals). Non-cellulosic elimination and the effect of surface modifications were analyzed with infrared analysis, showing characteristic intensities for cellulosic species and new peaks for each modification (characteristic ester peak ∼1740 cm −1 , and amino peak ∼1550 cm −1 ) cellulose nanofibers and nanocrystals were observed with atom-force microsc

In [24]:
# PREPROCESSING TEXT
#
# Rewrites every paragraph of every record in `document` in place: the
# <ns0:p ...> wrapper tags are removed and each <ns0:ref ...>...</ns0:ref>
# citation is removed together with its text.

starting_p_tag_pattern = r'<ns0:p[^>]+>'
ending_p_tag_pattern = '</ns0:p>'
starting_ref_tag_pattern = r'<ns0:ref[^>]+>'
ending_ref_tag_pattern = '</ns0:ref>'
ref_pattern = r'<ref>.*?</ref>'          # temporary for removing ref tag

# (pattern, replacement) pairs, applied in exactly this order: ref tags are
# first normalised to a bare <ref>...</ref> pair, which the last rule deletes.
substitutions = (
    (starting_p_tag_pattern, ''),
    (ending_p_tag_pattern, ''),
    (starting_ref_tag_pattern, '<ref>'),
    (ending_ref_tag_pattern, '</ref>'),
    (ref_pattern, ''),
)

for record in document:
    paragraphs = record[2]
    for paragraph_number in paragraphs:
        cleaned = paragraphs[paragraph_number]
        for pattern, replacement in substitutions:
            cleaned = re.sub(pattern, replacement, cleaned)
        paragraphs[paragraph_number] = cleaned

print(document)

[['0', 'Abstract', {1: 'This work was aimed to evaluate the performance of two different surface modifications on cellulose nano-particles obtained from blue agave bagasse. Cellulose nanofibers were obtained from traditional mechanical methods, and cellulose nanocrystals were obtained via sulfuric acid hydrolysis. After cellulose extraction and size reduction, different composites were elaborated using poly(lactic acid) as matrix varying fiber concentrations and using cellulosic particles of diverse size and surface modification (3aminopropyl triethoxysilane silanized cellulose nanofibers and dodecanoyl chloride esterified cellulose nanocrystals). Non-cellulosic elimination and the effect of surface modifications were analyzed with infrared analysis, showing characteristic intensities for cellulosic species and new peaks for each modification (characteristic ester peak ∼1740 cm −1 , and amino peak ∼1550 cm −1 ) cellulose nanofibers and nanocrystals were observed with atom-force microsc

In [25]:
# MERGING METADATA WITH OTHER DOCUMENT PARTS
#
# Puts the metadata record(s) in front of the abstract and sections.
# Records already equal to a metadata record are filtered out first, so
# re-running this cell does not stack a second copy of the metadata in front.

document = metadata + [record for record in document if record not in metadata]
print(document)

[['0', 'Metadata', {'Title': 'Cellulose: Structure and Properties', 'Publication Date': '2015-09-10', 'DOI': '10.1007/12_2015_319'}], ['0', 'Abstract', {1: 'This work was aimed to evaluate the performance of two different surface modifications on cellulose nano-particles obtained from blue agave bagasse. Cellulose nanofibers were obtained from traditional mechanical methods, and cellulose nanocrystals were obtained via sulfuric acid hydrolysis. After cellulose extraction and size reduction, different composites were elaborated using poly(lactic acid) as matrix varying fiber concentrations and using cellulosic particles of diverse size and surface modification (3aminopropyl triethoxysilane silanized cellulose nanofibers and dodecanoyl chloride esterified cellulose nanocrystals). Non-cellulosic elimination and the effect of surface modifications were analyzed with infrared analysis, showing characteristic intensities for cellulosic species and new peaks for each modification (characteris

### XRI (XML-to-RDF-Intermediate) to RDF

In [10]:
# LOADING THE NER MODEL
#
# Loads the spaCy pipeline stored under model_path/model_name into `nlp`.
# If spaCy raises OSError, a message is printed and `nlp` is not assigned here.

model_path = 'C:/Users/umayer/_dev/experiment'
model_name = 'MULTI_LABEL_CELLULOSIC_MODEL_ACC/model-best'

try:
    nlp = spacy.load(model_path + '/' + model_name)
except OSError:
    print('ERROR: Model Not Found!')
else:
    print('Model loaded successfully.')

Model loaded successfully.


In [144]:
# WRITING RDF TRIPLES FOR DOCUMENT
#
# Prints `document` as Turtle on stdout: the publication metadata, the
# abstract, every section, their paragraphs, and one LabeledTerm plus one
# CandidateStatus per entity that the NER pipeline `nlp` finds in a paragraph.
# Diagnostics are printed as '# ERROR: ...' lines, i.e. as Turtle comments, so
# they cannot corrupt the statements around them.

# DECLARING VARIABLE
# [position_in_schema, label] pairs of every label referenced so far,
# in first-seen order; filled while the document is printed.
labels_in_doc = []

# DECLARING LABELING SCHEMA
# A label's 1-based position in this list is the N of its data:Label_N id.
labeling_schema = ['CHEMICAL',
                   'MATERIAL',
                   'MATERIAL_STRUCTURE',
                   'APPLICATION',
                   'PROCESS_OR_TECHNIQUE',
                   'PROPERTY',
                   'MEASUREMENT',
                   'ABBREVIATION']


def turtle_escape(text):
    """Return `text` escaped for use inside a single-quoted Turtle literal.

    Backslashes are doubled first so that the backslashes introduced for
    quotes and line breaks are not escaped a second time.
    """
    escaped = str(text).replace('\\', '\\\\')
    escaped = escaped.replace("'", "\\'")
    return escaped.replace('\n', '\\n').replace('\r', '\\r')


def schema_position(label):
    """Return the 1-based position of `label` in `labeling_schema`.

    Raises:
        ValueError: if the schema is empty or does not contain `label`.
    """
    if not labeling_schema:
        raise ValueError('Labeling schema is empty!')
    if label not in labeling_schema:
        raise ValueError(f"Label '{label}' is not in the labeling schema!")
    return labeling_schema.index(label) + 1


def print_statement(subject, rdf_type, properties=()):
    """Print one Turtle statement followed by a blank line.

    `properties` are 'predicate object' strings. Every line but the last is
    terminated with ' ;' and the last with ' .', so the statement is properly
    closed however many properties it ends up with.
    """
    lines = [f"{subject} rdf:type {rdf_type}", *properties]
    for line in lines[:-1]:
        print(f"{line} ;")
    print(f"{lines[-1]} .")
    print()


def print_labeled_term(term_id, paragraph_id, ent, assigned_at):
    """Print the LabeledTerm and CandidateStatus statements for one entity.

    `ent` must provide `text`, `label_`, `start_char` and `end_char`.
    A label missing from the schema is reported in a comment line and the
    status is printed without `onner:hasLabeledTermLabel`; such a label is
    not added to `labels_in_doc`.
    """
    # PRINTING LABELED TERMS    (TODO: DEAL WITH ATOMIC / COMPOUND)
    print_statement(f"data:{term_id}", "onner:LabeledTerm", [
        f"onner:labeledTermText '{turtle_escape(ent.text)}'^^xsd:string",
        f"onner:offset '{ent.start_char}'^^xsd:nonNegativeInteger",
        f"onner:length '{ent.end_char - ent.start_char}'^^xsd:nonNegativeInteger",
        f"onner:labeledTermDirectlyContainedBy data:{paragraph_id}",
        f"onner:hasLabeledTermStatus data:Candidate_{term_id}",
    ])

    # PRINTING LABELED TERMS STATUS
    status_properties = [
        f"onner:statusAssignmentDate '{assigned_at}'^^xsd:dateTime",
        "onner:statusAssignedBy data:Cellulosic_NER_Model",
    ]
    try:
        label_number = schema_position(ent.label_)
    except ValueError as e:
        print(f"# ERROR: {e}")
    else:
        status_properties.append(f"onner:hasLabeledTermLabel data:Label_{label_number}")
        # adding labels and their position in the schema
        if [label_number, ent.label_] not in labels_in_doc:
            labels_in_doc.append([label_number, ent.label_])
    print_statement(f"data:Candidate_{term_id}", "onner:CandidateStatus", status_properties)


def print_paragraphs(part_id, paragraphs, next_part_id, ner):
    """Print every paragraph of one document part together with its labeled terms.

    Args:
        part_id: id of the containing abstract/section (without 'data:').
        paragraphs: dict mapping paragraph numbers 1..n to paragraph text.
        next_part_id: id the last paragraph points to as its next part.
        ner: callable returning an object whose `ents` are the entities
            found in the text passed to it.
    """
    last_number = len(paragraphs)
    for paragraph_number, paragraph_text in paragraphs.items():
        paragraph_id = f"{part_id}-{paragraph_number}"

        # NER runs on the unescaped text, so offsets and lengths refer to the
        # literal as an RDF parser will read it back; escaping happens only
        # at print time.
        entities = list(ner(paragraph_text).ents)
        # One timestamp per paragraph, in the ISO form xsd:dateTime requires.
        assigned_at = datetime.now().replace(microsecond=0).isoformat()
        term_ids = [f"{paragraph_id}-{sequence}" for sequence in range(1, len(entities) + 1)]

        if paragraph_number == last_number:
            following_id = next_part_id
        else:
            following_id = f"{part_id}-{paragraph_number + 1}"

        # PRINTING PARAGRAPHS AND ITS LABELED TERM IDs
        properties = [
            f"onner:positionInParentDocumentPart '{paragraph_number}'^^xsd:nonNegativeInteger",
            f"onner:nextDocumentPart data:{following_id}",
            f"onner:paragraphText '{turtle_escape(paragraph_text)}'^^xsd:string",
        ]
        # A paragraph without entities gets no (empty) list of labeled terms.
        if term_ids:
            joined_ids = ', '.join('data:' + term_id for term_id in term_ids)
            properties.append(f"onner:directlyContainsLabeledTerm {joined_ids}")
        print_statement(f"data:{paragraph_id}", "onner:Paragraph", properties)

        # LABELED TERMS
        for term_id, ent in zip(term_ids, entities):
            print_labeled_term(term_id, paragraph_id, ent, assigned_at)


def print_document(records, doi, ner):
    """Print every record of `records` as Turtle statements.

    Args:
        records: list of [number, name, content] records. A record named
            'Metadata' carries a dict of publication data, any other record
            a dict mapping paragraph numbers to paragraph text.
        doi: DOI with '/' already replaced, used as prefix of every id.
        ner: callable passed on to `print_paragraphs`.
    """
    for index, record in enumerate(records):
        number, name, content = record[0], record[1], record[2]

        # METADATA
        if name == 'Metadata':
            properties = []
            for key, predicate, datatype in (('Title', 'onner:publicationTitle', 'xsd:string'),
                                             ('DOI', 'onner:doi', 'xsd:string'),
                                             ('Publication Date', 'onner:publicationDate', 'xsd:date')):
                value = content.get(key)
                if value is not None:
                    properties.append(f"{predicate} '{turtle_escape(value)}'^^{datatype}")
            print_statement(f"data:Publication_{doi}", "onner:ScholarlyPublication", properties)
            continue

        # The part after this one is the next record, or the end-of-document marker.
        if index + 1 < len(records):
            next_part_id = f"{doi}_{records[index + 1][0]}"
        else:
            next_part_id = f"{doi}_EndOfDocument"

        # ABSTRACT / SECTION HEADER
        if name == 'Abstract':
            part_id = f"{doi}_A"
            rdf_type = "onner:Abstract"
            properties = []
        else:
            part_id = f"{doi}_{number}"
            rdf_type = "onner:Section"
            properties = []
            if name is not None:
                properties.append(f"onner:sectionTitle '{turtle_escape(name)}'^^xsd:string")
            properties.append(f"onner:sectionNumber '{turtle_escape(number)}'^^xsd:string")

        if content:
            # PARAGRAPHS EXIST: the part contains, and continues with, its paragraphs
            child_ids = [f"{part_id}-{paragraph_number}" for paragraph_number in content]
            first_part_id = child_ids[0]
        else:
            # EMPTY PART (NO PARAGRAPHS BETWEEN A SECTION AND ITS IMMEDIATE SUBSECTION):
            # a section contains its direct subsections, i.e. records numbered
            # '<number>.<x>' with no further dot.
            child_ids = []
            if name != 'Abstract':
                subsection_pattern = re.compile(rf'^{re.escape(str(number))}\.[^.]+$')
                child_ids = [f"{doi}_{other[0]}" for other in records
                             if subsection_pattern.search(other[0])]
            first_part_id = next_part_id

        properties.append(f"onner:nextDocumentPart data:{first_part_id}")
        if child_ids:
            joined_ids = ', '.join('data:' + child_id for child_id in child_ids)
            properties.append(f"onner:directlyContainsDocumentPart {joined_ids}")
        print_statement(f"data:{part_id}", rdf_type, properties)

        # PARAGRAPHS AND THEIR DOCUMENT PARTS
        print_paragraphs(part_id, content, next_part_id, ner)

        print('#========================= SECTION DIVIDER =========================#')
        print()


# GETTING DOI
# `doi` stays None when it cannot be determined; nothing is then printed for
# the document itself.
try:
    doi = document[0][2]['DOI'].replace('/', '_')
except NameError:
    doi = None
    print('# ERROR: Document object is not defined!')
except (IndexError, KeyError):
    doi = None
    print('# ERROR: DOI not found in document metadata!')

# PRINTING PREFIXES
print("@prefix onner: <http://spatialai.org/cellograph/ontology/onner/v2.0#> .")
print("@prefix data: <http://spatialai.org/cellograph/ontology/onner/v2.0/data#> .")
print("@prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> .")
print("@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .")
print("@prefix xsd: <http://www.w3.org/2001/XMLSchema#> .")
print("@prefix owl: <http://www.w3.org/2002/07/owl#> .")
print()

# PRINTING DOCUMENT
if doi is not None:
    print_document(document, doi, nlp)

# PRINTING LABELS USED IN THE DOCUMENT
if labels_in_doc:
    for label_number, label_text in labels_in_doc:
        print_statement(f"data:Label_{label_number}", "onner:Label", [
            "onner:fromLabelingSchema data:Dev_Schema",
            f"onner:labelText '{turtle_escape(label_text)}'^^xsd:string",
        ])
else:
    print('# ERROR: List of labels found in document is empty!')

print_statement("data:Dev_Schema", "onner:LabelingSchema",
                ["onner:schemaName 'CelloGraph'^^xsd:string"])

# TODO: if/else required to identify system and human
print_statement("data:Cellulosic_NER_Model", "onner:NER_System",
                ["onner:systemVersion '1.0'^^xsd:string"])

if doi is not None:
    print(f"data:{doi}_EndOfDocument rdf:type onner:EndOfDocument .")

@prefix onner: <http://spatialai.org/cellograph/ontology/onner/v2.0#> .
@prefix data: <http://spatialai.org/cellograph/ontology/onner/v2.0/data#> .
@prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> .
@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .
@prefix xsd: <http://www.w3.org/2001/XMLSchema#> .
@prefix owl: <http://www.w3.org/2002/07/owl#> .

data:Publication_10.1016_j.indcrop.2015.03.075 rdf:type onner:ScholarlyPublication ;
onner:publicationTitle 'Industrial Crops and Products'^^xsd:string ;
onner:doi '10.1016/j.indcrop.2015.03.075'^^xsd:string ;
onner:publicationDate '2015-04-08'^^xsd:date .

data:10.1016_j.indcrop.2015.03.075_A rdf:type onner:Abstract ;
onner:nextDocumentPart data:10.1016_j.indcrop.2015.03.075_A-1 ;
onner:directlyContainsDocumentPart data:10.1016_j.indcrop.2015.03.075_A-1 .

data:10.1016_j.indcrop.2015.03.075_A-1 rdf:type onner:Paragraph ;
onner:positionInParentDocumentPart '1'^^xsd:nonNegativeInteger ;
onner:nextDocumentPart data:10.1016_j.indc

data:10.1016_j.indcrop.2015.03.075_1-5 rdf:type onner:Paragraph ;
onner:positionInParentDocumentPart '5'^^xsd:nonNegativeInteger ;
onner:nextDocumentPart data:10.1016_j.indcrop.2015.03.075_2 ;
onner:paragraphText 'The objective of this work is to develop new composites using PLA as matrix, by adding as fillers modified cellulose nanofibers and nanocrystals obtained from agricultural waste of a highly fibrous plant (Agave tequilana) in order to give value added applications to an undervalued material. On the other hand, PLA properties require improvements to compete with other plastics, making important the exploration of new surface modifications in polymer reinforcements in order to increase mechanical and hydrophobic properties of PLA and to enhance the interaction between nano fillers and the matrix, avoiding the formation of dislocations in the interaction surface. Two different surface modifications on two cellulose nanoparticles (CNF and CNC) were evaluated to determine the impro

data:10.1016_j.indcrop.2015.03.075_2.4-2 rdf:type onner:Paragraph ;
onner:positionInParentDocumentPart '2'^^xsd:nonNegativeInteger ;
onner:nextDocumentPart data:10.1016_j.indcrop.2015.03.075_2.5 ;
onner:paragraphText 'For the CNC, previous reported method by  was used with slight modifications. For this, one equivalent of Dodecanoyl Chloride (DDC) per gram of CNC was put inside a flask alongside 50 mL of toluene and 1 equivalent of pyridine as catalyst, then CNC were added and stirred with 400 rpm for 6 h at 110 • C using reflux. The slurry was centrifuged and then washed with toluene, acetone, and ethanol before setting it inside a soxhlet where the remains were extracted during 12 h using ethanol. After this, the modified CNC were oven dried at 50 • C for 24 h.'^^xsd:string ;
onner:directlyContainsLabeledTerm data:10.1016_j.indcrop.2015.03.075_2.4-2-1, data:10.1016_j.indcrop.2015.03.075_2.4-2-2, data:10.1016_j.indcrop.2015.03.075_2.4-2-3, data:10.1016_j.indcrop.2015.03.075_2.4-2-4, d

data:10.1016_j.indcrop.2015.03.075_2.6-7 rdf:type onner:Paragraph ;
onner:positionInParentDocumentPart '7'^^xsd:nonNegativeInteger ;
onner:nextDocumentPart data:10.1016_j.indcrop.2015.03.075_2.6-8 ;
onner:paragraphText 'Tensile tests of the composites were performed using MTS Insight 10 equipment provided with pneumatic clamps (Advantage Pneumatic Grips) and 250 N loading cell, with a speed of 5 mm min −1 . Samples were prepared 60 mm long, with an average width of 5 mm and thickness of 0.7-0.9 mm. The set distance between the clamps was 20 mm. The values quoted are the average of ten measurements.'^^xsd:string ;
onner:directlyContainsLabeledTerm data:10.1016_j.indcrop.2015.03.075_2.6-7-1, data:10.1016_j.indcrop.2015.03.075_2.6-7-2, data:10.1016_j.indcrop.2015.03.075_2.6-7-3, data:10.1016_j.indcrop.2015.03.075_2.6-7-4, data:10.1016_j.indcrop.2015.03.075_2.6-7-5, data:10.1016_j.indcrop.2015.03.075_2.6-7-6, data:10.1016_j.indcrop.2015.03.075_2.6-7-7, data:10.1016_j.indcrop.2015.03.075_2.

data:10.1016_j.indcrop.2015.03.075_3.2-1 rdf:type onner:Paragraph ;
onner:positionInParentDocumentPart '1'^^xsd:nonNegativeInteger ;
onner:nextDocumentPart data:10.1016_j.indcrop.2015.03.075_3.2-2 ;
onner:paragraphText 'After the elimination of non-cellulosic components of the agave bagasse, bleached pulp was analyzed with ATR-IR to determine the quality of the obtained cellulose, in Fig. , it can be observed the ATR-IR spectra of cellulose from agave bagasse (a) showing peaks associated to high purity cellulose. Signals between 3600 and 3000 cm <super>−1</super> correspond to OH vibration in cellulose; the band between 3000 and 2600 cm <super>−1</super> corresponds to asymmetric and symmetric C H stretching vibration present in cellulose (1-2), moreover, the peaks present at ∼1430, ∼1162 and ∼1111 cm <super>−1</super> (a-5 to a-6) are assigned to CH <sub>2</sub> symmetric bending, asymmetric C O C bridge stretching, anhydroglucose ring asymmetric stretching, respectively, this is asso

data:10.1016_j.indcrop.2015.03.075_3.2-6 rdf:type onner:Paragraph ;
onner:positionInParentDocumentPart '6'^^xsd:nonNegativeInteger ;
onner:nextDocumentPart data:10.1016_j.indcrop.2015.03.075_3.3 ;
onner:paragraphText 'In case of modified cellulose, the surface modification performed in each sample generated a new amorphous region around the crystalline cellulose by hydrogen bonds with the hydroxyl groups present in the exterior chains of the molecular wall. XRD results concur with previous AT-IR analysis respecting the formation of a new surface in cellulose. In the case of BANS, it can be appreciated a slight increase in the crystallite size as a result of the addition of amino chains to the crystalline region; in Fig. ,  the peak deconvolution for the BANS (lower right) signal shows three new low-signal broad peaks at 2Â = 24.97, 30.25 and 37.67 • with an average H of 4.1 ± 0.56 • . For the BADC sample, the addition of large aliphatic chains generates a whole new region around the cr

data:10.1016_j.indcrop.2015.03.075_3.3-3 rdf:type onner:Paragraph ;
onner:positionInParentDocumentPart '3'^^xsd:nonNegativeInteger ;
onner:nextDocumentPart data:10.1016_j.indcrop.2015.03.075_4 ;
onner:paragraphText 'In Table  glass transition temperature (T <sub>g</sub> ), melt temperature (T <sub>m</sub> ), crystallization temperature (T <sub>c</sub> ), enthalpy of melting (H <sub>m</sub> ), and enthalpy of crystallization (H <sub>c</sub> ) of the different composites are presented. Bulk PLA presents a T <sub>g</sub> of 63.22 • C and Tm 175.66 • C this results are similar to those reported in previous works . Changes in T <sub>g</sub> are not significant between different composites, in more crystalline reinforcements such as silanized nanofibers or cellulose nanocrystals it can be observed a slight variation in the T <sub>c</sub> (between 3 and 5 • C) this changes and the increase of the enthalpy of fusion show a good nucleation of the cellulose crystallites enhancing the chain flow 