In [1]:
import xml.etree.ElementTree as ET
import re
import os
import spacy
import scispacy
from datetime import datetime

### XML to Text

In [5]:
# THIS FUNCTION MUST BE ALIGNED WITH CORRESPONDING CODE CELLS

def xml_to_text_all():
    
    # CREATING A LIST OF FILE WITH AVAILABLE FILES
    
    directory_path = 'D:\\Drive\\SISE\\CelloGraph\\Dev\\Data\\XML'
    file_list = os.listdir(directory_path)
    
    
    for xml_file in file_list:
        
        # LOADING XML AND CREATING ROOT
        
        tree = ET.parse(directory_path + '\\' + xml_file)
        root = tree.getroot()


        # EXTRACTING METADATA (available data: title, publication_date and doi)

        metadata = []

        for elem in root:
            if elem.tag[29:] == 'teiHeader':
                for sub_elem1 in elem:
                    if sub_elem1.tag[29:] == 'fileDesc':
                        publication_info = {}
                        for sub_elem2 in sub_elem1:
                            if sub_elem2.tag[29:] == 'titleStmt':
                                for sub_elem3 in sub_elem2:
                                    if sub_elem3.tag[29:] == 'title':
                                        publication_info['Title'] = sub_elem3.text
                            if sub_elem2.tag[29:] == 'publicationStmt':
                                for sub_elem3 in sub_elem2:
                                    if sub_elem3.tag[29:] == 'date':
                                        publication_info['Publication Date'] = sub_elem3.attrib['when']
                            if sub_elem2.tag[29:] == 'sourceDesc':
                                for sub_elem3 in sub_elem2:
                                    if sub_elem3.tag[29:] == 'biblStruct':
                                        for sub_elem4 in sub_elem3:
                                            if sub_elem4.tag[29:] == 'idno':
                                                publication_info['DOI'] = sub_elem4.text
                        metadata.append([0, 'Metadata', publication_info])


        # EXTRACTING ABSTRACT AND NUMBERING PARAGRAPHS

        abstract = []

        for elem in root:
            if elem.tag[29:] == 'teiHeader':
                for sub_elem1 in elem:
                    if sub_elem1.tag[29:] == 'profileDesc':
                        for sub_elem2 in sub_elem1:
                            if sub_elem2.tag[29:] == 'abstract':
                                for sub_elem3 in sub_elem2:
                                    if sub_elem3.tag[29:] == 'div':
                                        list_of_paragraphs = {}
                                        paragraph_number = 0
                                        for sub_elem4 in sub_elem3:
                                            if sub_elem4.tag[29:] == 'p':
                                                paragraph_number += 1
                                                list_of_paragraphs[paragraph_number] = ET.tostring(sub_elem4, encoding='unicode')
                                        abstract.append([0, 'Abstract', list_of_paragraphs])


        # EXTRACTING OTHER SECTIONS

        list_of_sections = []

        for elem in root:
            if (elem.tag[29:] == 'text'):
                for sub_elem1 in elem:
                    if (sub_elem1.tag[29:] == 'body'):
                        for sub_elem2 in sub_elem1:
                            if sub_elem2.tag[29:] == 'div':
                                section_number = ''
                                section_name = ''
                                list_of_paragraphs = []
                                for sub_elem3 in sub_elem2:
                                    if sub_elem3.tag[29:] == 'head':
                                        if bool(sub_elem3.attrib):
                                            section_number = str(sub_elem3.attrib)[7:-3]
                                        else:
                                            section_number = 'NO_SECTION_NUMBER'
                                        section_name = sub_elem3.text
                                    if sub_elem3.tag[29:] == 'p':
                                        list_of_paragraphs.append(ET.tostring(sub_elem3, encoding='unicode'))
                                # commented logic skips NO_SECTION_NUMBER with no paragraphs, i.e., Table 1, Table 2, .....
                                # if section_number == 'NO_SECTION_NUMBER' and not bool(list_of_paragraphs):
                                if section_number == 'NO_SECTION_NUMBER':
                                    pass
                                else:
                                    list_of_sections.append([section_number, section_name, list_of_paragraphs])


        # NUMBERING PARAGRAPHS OF SECTIONS 
        # paragraphs are sequentially added in a dictionary
        # dictionary is added with each respective record as a 4th element

        for section in list_of_sections:
            list_of_paragraphs = {}
            for paragraph_number, paragraph_text in enumerate(section[2], start=1):
                list_of_paragraphs[paragraph_number] = paragraph_text
            section.append(list_of_paragraphs)


        # DELETING THE 3RD ELEMENT FROM EACH RECORD

        for section in list_of_sections:
            section.remove(section[2])


        # MERGING ABSTRACT WITH OTHER SECTIONS

        document = abstract + list_of_sections


        # PREPROCESSING TEXT

        starting_p_tag_pattern = r'<ns0:p[^>]+>'
        ending_p_tag_pattern = '</ns0:p>'
        starting_ref_tag_pattern = r'<ns0:ref[^>]+>'
        ending_ref_tag_pattern = '</ns0:ref>'
        ref_pattern = r'<ref>.*?</ref>'          # temporary for removing ref tag

        for record in document:
            for paragraph_number, paragraph_text in record[2].items():
                text = paragraph_text
                text = re.sub(starting_p_tag_pattern, '', text)
                text = re.sub(ending_p_tag_pattern, '', text)
                text = re.sub(starting_ref_tag_pattern, '<ref>', text)
                text = re.sub(ending_ref_tag_pattern, '</ref>', text)
                text = re.sub(ref_pattern, '', text)
                record[2][paragraph_number] = text


        # EXTRACTING TEXT AND WRITING IN FILE

        output_file_name = os.path.basename(xml_file)[:-4]

        for record in document:
            for _, paragraph in record[2].items():
                with open(f'D:\\Drive\\SISE\\CelloGraph\\Dev\\Data\\Text\\{output_file_name}.txt', 'a', encoding='utf-8') as output_file:
                    output_file.write(paragraph + '\n')
    

In [None]:
# xml_to_text_all()

### XML Structure
- teiHeader
    - fileDesc
        - titleStmt
            - title
        - publicationStmt
            - date
        - sourceDesc
            - biblStruct
                - idno
    - encodingDesc
    - profileDesc

### XRI (XML-to-RDF-Intermediate) Data Structure
- [
    - [
        - section_number,
        - section_title,
        - {
            - paragraph_number:
                - {
                    - sentence_number: sectence_text,
                - },
        - }
    - ],
- ]

### XML to XRI (XML-to-RDF-Intermediate)

In [2]:
# LOADING XML AND CREATING ROOT

xml_file = 'D:\\Drive\\SISE\\CelloGraph\\Dev\\Data\\XML\\Rojas 2016 - Cellulose Chemistry And Properties - Chapter1.tei.xmlScitex.xml'
tree = ET.parse(xml_file)
root = tree.getroot()

# EXTRACTING METADATA (available data: title, publication_date and doi)

metadata = []

for elem in root:
    if elem.tag[29:] == 'teiHeader':
        for sub_elem1 in elem:
            if sub_elem1.tag[29:] == 'fileDesc':
                publication_info = {}
                for sub_elem2 in sub_elem1:
                    if sub_elem2.tag[29:] == 'titleStmt':
                        for sub_elem3 in sub_elem2:
                            if sub_elem3.tag[29:] == 'title':
                                publication_info['Title'] = sub_elem3.text
                    if sub_elem2.tag[29:] == 'publicationStmt':
                        for sub_elem3 in sub_elem2:
                            if sub_elem3.tag[29:] == 'date':
                                publication_info['Publication Date'] = sub_elem3.attrib['when']
                    if sub_elem2.tag[29:] == 'sourceDesc':
                        for sub_elem3 in sub_elem2:
                            if sub_elem3.tag[29:] == 'biblStruct':
                                for sub_elem4 in sub_elem3:
                                    if sub_elem4.tag[29:] == 'idno':
                                        publication_info['DOI'] = sub_elem4.text
                metadata.append(['0', 'Metadata', publication_info])
                                      
print(metadata)

[['0', 'Metadata', {'Title': 'Cellulose: Structure and Properties', 'Publication Date': '2015-09-10', 'DOI': '10.1007/12_2015_319'}]]


In [3]:
# EXTRACTING ABSTRACT AND NUMBERING PARAGRAPHS

abstract = []

for elem in root:
    if elem.tag[29:] == 'teiHeader':
        for sub_elem1 in elem:
            if sub_elem1.tag[29:] == 'profileDesc':
                for sub_elem2 in sub_elem1:
                    if sub_elem2.tag[29:] == 'abstract':
                        for sub_elem3 in sub_elem2:
                            if sub_elem3.tag[29:] == 'div':
                                list_of_paragraphs = {}
                                paragraph_number = 0
                                for sub_elem4 in sub_elem3:
                                    if sub_elem4.tag[29:] == 'p':
                                        paragraph_number += 1
                                        list_of_paragraphs[paragraph_number] = ET.tostring(sub_elem4, encoding='unicode')
                                abstract.append(['0', 'Abstract', list_of_paragraphs])
                                
print(abstract)

[['0', 'Abstract', {1: '<ns0:p xmlns:ns0="http://www.tei-c.org/ns/1.0">Cellulose, a fascinating biopolymer and the most common organic compound on earth, is comprehensively reviewed. Details of its crystalline phases are given, starting with a description of molecular and supramolecular structures, including the hydrogen bond systems. Sources of this ubiquitous biopolymer are mentioned, with attention to the special properties of bacterially synthesized nanofibrous cellulose. Nanostructures obtained by disintegration of cellulose fibers (top-down approach) yielding nano-or microfibrillated cellulose and cellulose whiskers are the basis for novel materials with extraordinary properties. Moreover, nanofibers and nanoparticles can be made by special techniques applying the bottom-up approach. Efficient systems to dissolve cellulose by destruction of the hydrogen bond systems using ionic liquids and systems based on polar aprotic solvent and salt are described. Novel cellulose derivatives 

In [4]:
# EXTRACTING OTHER SECTIONS

# need to normalize the section number (line 17)
# compare Rojas and Wolf section number
# \ issue in RDF, see Koshkava 2014 paper

list_of_sections = []

for elem in root:
    if (elem.tag[29:] == 'text'):
        for sub_elem1 in elem:
            if (sub_elem1.tag[29:] == 'body'):
                for sub_elem2 in sub_elem1:
                    if sub_elem2.tag[29:] == 'div':
                        section_number = ''
                        section_name = ''
                        list_of_paragraphs = []
                        for sub_elem3 in sub_elem2:
                            if sub_elem3.tag[29:] == 'head':
                                if bool(sub_elem3.attrib):
                                    section_number = str(sub_elem3.attrib)
                                    if section_number[-3] == '.':
                                        section_number = section_number[7:-3]
                                    else:
                                        section_number = section_number[7:-2]
                                else:
                                    section_number = 'NO_SECTION_NUMBER'
                                section_name = sub_elem3.text
                            if sub_elem3.tag[29:] == 'p':
                                list_of_paragraphs.append(ET.tostring(sub_elem3, encoding='unicode'))
                        # commented logic skips NO_SECTION_NUMBER with no paragraphs, i.e., Table 1, Table 2, .....
                        # if section_number == 'NO_SECTION_NUMBER' and not bool(list_of_paragraphs):
                        if section_number == 'NO_SECTION_NUMBER':
                            pass
                        else:
                            list_of_sections.append([section_number, section_name, list_of_paragraphs])

print(list_of_sections)

[['1', 'Introduction', ['<ns0:p xmlns:ns0="http://www.tei-c.org/ns/1.0">Cellulose is the most abundant natural polymer in the biosphere, with a global production (and decomposition) of ~1.5 Â 10 <super>12</super> tons per year, comparable to the planetary reserves of the main fossil and mineral sources <ns0:ref type="bibr" target="#b0">[1]</ns0:ref>. In addition to the long-standing scientific interest in cellulose, the use of cellulose as renewable and biodegradable raw material in various applications is a proposed solution to the recent industrial challenge to successfully meet environmental and recycling problems <ns0:ref type="bibr" target="#b1">[2]</ns0:ref>. Versatile structuring of cellulose by various routes of modification, including both physical and chemical methods, has enabled its use in a variety of applications (e.g., fillers, building and coating materials, laminates, papers, textiles, optical films, sorption media, viscosity regulators, and even advanced functional ma

In [None]:
# # DON'T USE THIS PIECE OF CODE  (commented - 12 Nov 23)
# # HAVE TO ADJUST THIS CODE FOR TABLES/IMAGES
# # merging "paragraphs with no section number" to its previous section

# list_of_sections_length = len(list_of_sections)

# for i in range(list_of_sections_length-1,0,-1):
#     if list_of_sections[i][0] == 'NO_SECTION_NUMBER':
#         list_of_sections[i-1][2].extend(list_of_sections[i][2])
#         list_of_sections.remove(list_of_sections[i])
        
# list_of_sections

In [5]:
# NUMBERING PARAGRAPHS OF SECTIONS 
# paragraphs are sequentially added in a dictionary
# dictionary is added with each respective record as a 4th element

for section in list_of_sections:
    list_of_paragraphs = {}
    for paragraph_number, paragraph_text in enumerate(section[2], start=1):
        list_of_paragraphs[paragraph_number] = paragraph_text
    section.append(list_of_paragraphs)
    
print(list_of_sections)

[['1', 'Introduction', ['<ns0:p xmlns:ns0="http://www.tei-c.org/ns/1.0">Cellulose is the most abundant natural polymer in the biosphere, with a global production (and decomposition) of ~1.5 Â 10 <super>12</super> tons per year, comparable to the planetary reserves of the main fossil and mineral sources <ns0:ref type="bibr" target="#b0">[1]</ns0:ref>. In addition to the long-standing scientific interest in cellulose, the use of cellulose as renewable and biodegradable raw material in various applications is a proposed solution to the recent industrial challenge to successfully meet environmental and recycling problems <ns0:ref type="bibr" target="#b1">[2]</ns0:ref>. Versatile structuring of cellulose by various routes of modification, including both physical and chemical methods, has enabled its use in a variety of applications (e.g., fillers, building and coating materials, laminates, papers, textiles, optical films, sorption media, viscosity regulators, and even advanced functional ma

In [6]:
# DELETING THE 3RD ELEMENT FROM EACH RECORD

for section in list_of_sections:
    section.remove(section[2])
    
print(list_of_sections)

[['1', 'Introduction', {1: '<ns0:p xmlns:ns0="http://www.tei-c.org/ns/1.0">Cellulose is the most abundant natural polymer in the biosphere, with a global production (and decomposition) of ~1.5 Â 10 <super>12</super> tons per year, comparable to the planetary reserves of the main fossil and mineral sources <ns0:ref type="bibr" target="#b0">[1]</ns0:ref>. In addition to the long-standing scientific interest in cellulose, the use of cellulose as renewable and biodegradable raw material in various applications is a proposed solution to the recent industrial challenge to successfully meet environmental and recycling problems <ns0:ref type="bibr" target="#b1">[2]</ns0:ref>. Versatile structuring of cellulose by various routes of modification, including both physical and chemical methods, has enabled its use in a variety of applications (e.g., fillers, building and coating materials, laminates, papers, textiles, optical films, sorption media, viscosity regulators, and even advanced functional

In [7]:
# MERGING ABSTRACT WITH OTHER SECTIONS

document = abstract + list_of_sections
print(document)

[['0', 'Abstract', {1: '<ns0:p xmlns:ns0="http://www.tei-c.org/ns/1.0">Cellulose, a fascinating biopolymer and the most common organic compound on earth, is comprehensively reviewed. Details of its crystalline phases are given, starting with a description of molecular and supramolecular structures, including the hydrogen bond systems. Sources of this ubiquitous biopolymer are mentioned, with attention to the special properties of bacterially synthesized nanofibrous cellulose. Nanostructures obtained by disintegration of cellulose fibers (top-down approach) yielding nano-or microfibrillated cellulose and cellulose whiskers are the basis for novel materials with extraordinary properties. Moreover, nanofibers and nanoparticles can be made by special techniques applying the bottom-up approach. Efficient systems to dissolve cellulose by destruction of the hydrogen bond systems using ionic liquids and systems based on polar aprotic solvent and salt are described. Novel cellulose derivatives 

In [8]:
# PREPROCESSING TEXT

starting_p_tag_pattern = r'<ns0:p[^>]+>'
ending_p_tag_pattern = '</ns0:p>'
starting_ref_tag_pattern = r'<ns0:ref[^>]+>'
ending_ref_tag_pattern = '</ns0:ref>'
ref_pattern = r'<ref>.*?</ref>'          # temporary for removing ref tag

for record in document:
    for paragraph_number, paragraph_text in record[2].items():
        text = paragraph_text
        text = re.sub(starting_p_tag_pattern, '', text)
        text = re.sub(ending_p_tag_pattern, '', text)
        text = re.sub(starting_ref_tag_pattern, '<ref>', text)
        text = re.sub(ending_ref_tag_pattern, '</ref>', text)
        text = re.sub(ref_pattern, '', text)
        record[2][paragraph_number] = text
        
print(document)

[['0', 'Abstract', {1: 'Cellulose, a fascinating biopolymer and the most common organic compound on earth, is comprehensively reviewed. Details of its crystalline phases are given, starting with a description of molecular and supramolecular structures, including the hydrogen bond systems. Sources of this ubiquitous biopolymer are mentioned, with attention to the special properties of bacterially synthesized nanofibrous cellulose. Nanostructures obtained by disintegration of cellulose fibers (top-down approach) yielding nano-or microfibrillated cellulose and cellulose whiskers are the basis for novel materials with extraordinary properties. Moreover, nanofibers and nanoparticles can be made by special techniques applying the bottom-up approach. Efficient systems to dissolve cellulose by destruction of the hydrogen bond systems using ionic liquids and systems based on polar aprotic solvent and salt are described. Novel cellulose derivatives are available by chemical modification under he

In [None]:
# # EXTRACTING TEXT AND WRITING IN FILE

# file_name = os.path.basename(xml_file)[:-4]

# for record in document:
#     for _, paragraph in record[2].items():
#         with open(f'output\\{file_name}.txt', 'a', encoding='utf-8') as output_file:
#             output_file.write(paragraph + '\n')

In [None]:
# # NOT IN USE NOW! WILL BE DELETED!!
# # import spacy and load en_core_web_sm model

# import spacy
# nlp = spacy.load('en_core_web_sm')

In [None]:
# # NOT IN USE NOW! WILL BE DELETED!!
# # organizing sentences sequentially in a dictionary
# # adding dictionary with each respective record as a 4th element

# for record in document:
#     text_content = record[2]
#     list_of_paragraphs = {}
    
#     for paragraph_number, paragraph_text in text_content.items():
#         list_of_sentences = {}
#         doc = nlp(paragraph_text)
        
#         for sentence_number, sentence_text in enumerate(doc.sents, start=1):
#             list_of_sentences[sentence_number] = sentence_text.text            # .text added to convert into text
        
#         list_of_paragraphs[paragraph_number] = list_of_sentences
        
#     record.append(list_of_paragraphs)

In [None]:
# # NOT IN USE NOW! WILL BE DELETED!!
# # deleting 3rd element from each record of document

# for record in document:
#     record.remove(record[2])  

In [9]:
# MERGING METADATA WITH OTHER DOCUMENT PARTS

document = metadata + document
print(document)

[['0', 'Metadata', {'Title': 'Cellulose: Structure and Properties', 'Publication Date': '2015-09-10', 'DOI': '10.1007/12_2015_319'}], ['0', 'Abstract', {1: 'Cellulose, a fascinating biopolymer and the most common organic compound on earth, is comprehensively reviewed. Details of its crystalline phases are given, starting with a description of molecular and supramolecular structures, including the hydrogen bond systems. Sources of this ubiquitous biopolymer are mentioned, with attention to the special properties of bacterially synthesized nanofibrous cellulose. Nanostructures obtained by disintegration of cellulose fibers (top-down approach) yielding nano-or microfibrillated cellulose and cellulose whiskers are the basis for novel materials with extraordinary properties. Moreover, nanofibers and nanoparticles can be made by special techniques applying the bottom-up approach. Efficient systems to dissolve cellulose by destruction of the hydrogen bond systems using ionic liquids and syste

### XRI (XML-to-RDF-Intermediate) to RDF

In [10]:
# LOADING THE NER MODEL

try:
    model_path = 'C:/Users/umayer/_dev/experiment'
    model_name = 'MULTI_LABEL_CELLULOSIC_MODEL_ACC/model-best'
    nlp = spacy.load(f'{model_path}/{model_name}')
except OSError:
    print('ERROR: Model Not Found!')
else:
    print('Model loaded successfully.')

Model loaded successfully.


In [11]:
# WRITING RDF TRIPLES FOR DOCUMENT

# DECLARING VARIABLE
labels_in_doc = []

# DECLARING LABELING SCHEMA
# labeling_schema = ['ENTITY', 
#                    'CHEMICAL_ENTITY', 
#                    'MATERIAL_ENTITY', 
#                    'STRUCTURE_ENTITY',
#                    'APPLICATION',
#                    'PROCESS',
#                    'PROPERTY',
#                    'EQUIPMENT',
#                    'RELATIONSHIP',
#                    'MEASUREMENT', 
#                    'ABBREVIATION']

labeling_schema = ['CHEMICAL', 
                   'MATERIAL', 
                   'MATERIAL_STRUCTURE',
                   'APPLICATION',
                   'PROCESS_OR_TECHNIQUE',
                   'PROPERTY',
                   'MEASUREMENT', 
                   'ABBREVIATION']

# GETTING DOI
try:
    doi = document[0][2]['DOI']
    doi = doi.replace('/', '_')
except NameError:
    print('ERROR: Document object is not defined!')

# PRINTING PREFIXES
print("@prefix onner: <http://spatialai.org/cellograph/ontology/onner/v2.0#> .")
print("@prefix data: <http://spatialai.org/cellograph/ontology/onner/v2.0/data#> .")
print("@prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> .")
print("@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .")
print("@prefix xsd: <http://www.w3.org/2001/XMLSchema#> .")
print("@prefix owl: <http://www.w3.org/2002/07/owl#> .")
print()

try:
    for record in document:

        # METADATA
        if record[1] == 'Metadata':

            # PRINTING METADATA
            print(f"data:Publication_{doi} rdf:type onner:ScholarlyPublication ;")
            print(f"onner:publicationTitle '{record[2]['Title']}'^^xsd:string ;")
            print(f"onner:doi '{record[2]['DOI']}'^^xsd:string ;")
            print(f"onner:publicationDate '{record[2]['Publication Date']}'^^xsd:date .")
            print()

        # ABSTRACT AND IT'S DOCUMENT PARTS
        elif record[1] == 'Abstract':
            index = document.index(record) + 1
            next_section = document[index][0]
            paragraph_ids = []

            # CREATING PARAGRAPH ID FOR ABSTRACT'S PARAGRAPH 
            for paragraph_number, _ in record[2].items():
                paragraph_id = doi + '_A' + '-' + str(paragraph_number)
                paragraph_ids.append('data:'+paragraph_id)

            paragraph_ids_joined = ', '.join(paragraph_ids)

            # PRINTING ABSTRACT AND IT'S PARAGRAPH IDs 
            print(f"data:{doi}_A rdf:type onner:Abstract ;")
            print(f"onner:nextDocumentPart {paragraph_ids[0]} ;")    # NEXT DOC PART AFTER ABSTRACT
            print(f"onner:directlyContainsDocumentPart {paragraph_ids_joined} .")
            print()

            # PARAGRAPHS AND IT'S DOCUMENT PARTS
            for paragraph_number, paragraph_text in record[2].items():
                labeled_term_info_list = []
                labeled_term_number = 1

                # replacing ' with \' in text
                if "'" in paragraph_text:
                    paragraph_text = paragraph_text.replace("'", r"\'")

                # creating doc object
                doc = nlp(paragraph_text)

                for ent in doc.ents:
                    current_dateTime = str(datetime.now())[:-7]    # RECHECK THE APPROPRIATE PLACEMENT
                    labeled_term_id = doi + '_A' + '-' + str(paragraph_number) + '-' + str(labeled_term_number)
                    labeled_term = ent.text
                    label = ent.label_
                    offset = ent.start_char
                    length = ent.end_char - ent.start_char
                    labeled_term_info = [labeled_term_id, labeled_term, label, offset, length, current_dateTime]
                    labeled_term_info_list.append(labeled_term_info)
                    labeled_term_number += 1

                labeled_term_ids = ['data:'+i[0] for i in labeled_term_info_list]
                labeled_term_ids_joined = ', '.join(labeled_term_ids)

                # PRINTING PARAGRAPHS AND IT'S LABELED TERM IDs 
                print(f"data:{doi}_A-{paragraph_number} rdf:type onner:Paragraph ;")
                print(f"onner:positionInParentDocumentPart '{paragraph_number}'^^xsd:nonNegativeInteger ;")

                if paragraph_number == len(paragraph_ids):
                    print(f"onner:nextDocumentPart data:{doi}_{next_section} ;")
                else:
                    print(f"onner:nextDocumentPart data:{doi}_A-{paragraph_number+1} ;")

                print(f"onner:paragraphText '{paragraph_text}'^^xsd:string ;")
                print(f"onner:directlyContainsLabeledTerm {labeled_term_ids_joined} .")
                print()

                # LABELED TERMS
                for info in labeled_term_info_list:
                    # PRINTING LABELED TERMS
                    print(f"data:{info[0]} rdf:type onner:LabeledTerm ;")    # DEAL WITH ATOMIC / COMPOUND
                    print(f"onner:labeledTermText '{info[1]}'^^xsd:string ;")
                    print(f"onner:offset '{info[3]}'^^xsd:nonNegativeInteger ;")
                    print(f"onner:length '{info[4]}'^^xsd:nonNegativeInteger ;")
                    print(f"onner:labeledTermDirectlyContainedBy data:{doi}_A-{paragraph_number} ;")
                    print(f"onner:hasLabeledTermStatus data:Candidate_{info[0]} .")
                    print()

                    # PRINTING LABELED TERMS STATUS
                    print(f"data:Candidate_{info[0]} rdf:type onner:CandidateStatus ;")
                    print(f"onner:statusAssignmentDate '{info[5]}'^^xsd:dateTime ;")
                    print(f"onner:statusAssignedBy data:Cellulosic_NER_Model ;")

                    try:
                        if not bool(labeling_schema):
                            raise Exception('Labeling schema is empty!')
                            
                        label_number_in_schema = labeling_schema.index(info[2]) + 1
                    except Exception as e:
                        print(f'ERROR: {e}')
                    else:
                        print(f"onner:hasLabeledTermLabel data:Label_{label_number_in_schema} .")

                    # adding lebels and their position in the schema
                    if [label_number_in_schema, info[2]] not in labels_in_doc:
                        labels_in_doc.append([label_number_in_schema, info[2]])
                    print()
                    
            print('#========================= SECTION DIVIDER =========================#')
            print()

        # SECTION AND IT'S DOCUMENT PARTS
        else:
            section_number = record[0]
            section_name = record[1]
            index = document.index(record) + 1
            paragraph_ids = []

            if index == len(document):
                next_section = 'EndOfDocument'
            else:
                next_section = document[index][0]
            
            # EMPTY SECTION CHECK (N0 PARAGRAPHS BETWEEN A SECTION AND ITS IMMEDIATE SUBSECTION)
            if not bool(record[2]):
                section_ids = []

                for record in document:
                    pattern = rf'^{section_number}\.[^.]+$'
                    if re.search(pattern, record[0]):
                        section_id = doi + '_' + str(record[0])
                        section_ids.append('data:'+section_id)
                
                section_ids_joined = ', '.join(section_ids)
                
                print(f"data:{doi}_{section_number} rdf:type onner:Section ;")
                print(f"onner:sectionTitle '{section_number}'^^xsd:string ;")
                print(f"onner:sectionNumber '{section_name}'^^xsd:string ;")
                print(f"onner:nextDocumentPart data:{doi}_{next_section} ;")
                print(f"onner:directlyContainsDocumentPart {section_ids_joined} .")
                
                print()
                print('#========================= SECTION DIVIDER =========================#')
                print()
            
            # NON-EMPTY SECTION CHECK (PARAGRAPHS EXIST BETWEEN A SECTION AND ITS IMMEDIATE SUBSECTION)
            else:
                
                # CREATING PARAGRAPH ID FOR SECTION'S PARAGRAPH 
                for paragraph_number, _ in record[2].items():
                    paragraph_id = doi + '_' + str(section_number) + '-' + str(paragraph_number)
                    paragraph_ids.append('data:'+paragraph_id)

                paragraph_ids_joined = ', '.join(paragraph_ids)

                # PRINTING SECTION AND IT'S PARAGRAPH IDs 
                print(f"data:{doi}_{section_number} rdf:type onner:Section ;")
                print(f"onner:sectionTitle '{record[1]}'^^xsd:string ;")
                print(f"onner:sectionNumber '{record[0]}'^^xsd:string ;")
                print(f"onner:nextDocumentPart {paragraph_ids[0]} ;")    # NEXT DOC PART AFTER SECTION
                print(f"onner:directlyContainsDocumentPart {paragraph_ids_joined} .")
                print()

                # PARAGRAPHS AND IT'S DOCUMENT PARTS
                for paragraph_number, paragraph_text in record[2].items():
                    labeled_term_info_list = []
                    labeled_term_sequence = 1

                    # replacing ' with \' in text
                    if "'" in paragraph_text:
                        paragraph_text = paragraph_text.replace("'", r"\'")

                    # creating doc object
                    doc = nlp(paragraph_text)

                    for ent in doc.ents:
                        current_dateTime = str(datetime.now())[:-7]    # RECHECK THE APPROPRIATE PLACEMENT
                        labeled_term_id = doi + '_' + str(section_number) + '-' + str(paragraph_number) + '-' + str(labeled_term_sequence)
                        labeled_term = ent.text
                        label = ent.label_
                        offset = ent.start_char
                        length = ent.end_char - ent.start_char
                        labeled_term_info = [labeled_term_id, labeled_term, label, offset, length, current_dateTime]
                        labeled_term_info_list.append(labeled_term_info)
                        labeled_term_sequence += 1

                    labeled_term_ids = ['data:'+i[0] for i in labeled_term_info_list]
                    labeled_term_ids_joined = ', '.join(labeled_term_ids)

                    # PRINTING PARAGRAPHS AND IT'S LABELED TERM IDs 
                    print(f"data:{doi}_{section_number}-{paragraph_number} rdf:type onner:Paragraph ;")
                    print(f"onner:positionInParentDocumentPart '{paragraph_number}'^^xsd:nonNegativeInteger ;")

                    if paragraph_number == len(paragraph_ids):
                        print(f"onner:nextDocumentPart data:{doi}_{next_section} ;")
                    else:
                        print(f"onner:nextDocumentPart data:{doi}_{section_number}-{paragraph_number+1} ;")

                    print(f"onner:paragraphText '{paragraph_text}'^^xsd:string ;")
                    print(f"onner:directlyContainsLabeledTerm {labeled_term_ids_joined} .")
                    print()

                    # LABELED TERMS
                    for info in labeled_term_info_list:
                        # PRINTING LABELED TERMS
                        print(f"data:{info[0]} rdf:type onner:LabeledTerm ;")    # DEAL WITH ATOMIC / COMPOUND
                        print(f"onner:labeledTermText '{info[1]}'^^xsd:string ;")
                        print(f"onner:offset '{info[3]}'^^xsd:nonNegativeInteger ;")
                        print(f"onner:length '{info[4]}'^^xsd:nonNegativeInteger ;")
                        print(f"onner:labeledTermDirectlyContainedBy data:{doi}_{section_number}-{paragraph_number} ;")
                        print(f"onner:hasLabeledTermStatus data:Candidate_{info[0]} .")
                        print()

                        # PRINTING LABELED TERMS STATUS
                        print(f"data:Candidate_{info[0]} rdf:type onner:CandidateStatus ;")
                        print(f"onner:statusAssignmentDate '{info[5]}'^^xsd:dateTime ;")
                        print(f"onner:statusAssignedBy data:Cellulosic_NER_Model ;")

                        try:
                            if not bool(labeling_schema):
                                raise Exception('Labeling schema is empty!')

                            label_number_in_schema = labeling_schema.index(info[2]) + 1
                        except Exception as e:
                            print(f'ERROR: {e}')
                        else:
                            print(f"onner:hasLabeledTermLabel data:Label_{label_number_in_schema} .")

                        # adding lebels and their position in the schema
                        if [label_number_in_schema, info[2]] not in labels_in_doc:
                            labels_in_doc.append([label_number_in_schema, info[2]])

                        print()

                print('#========================= SECTION DIVIDER =========================#')
                print()

except NameError:
    print('ERROR: Document object is not defined!')

try:
    if not bool(labels_in_doc):
        raise Exception('List of labels found in document is empty!')
        
    for label in labels_in_doc:
        print(f"data:Label_{label[0]} rdf:type onner:Label ;")
        print(f"onner:fromLabelingSchema data:Dev_Schema ;")
        print(f"onner:labelText '{label[1]}'^^xsd:string .")
        print()       
except Exception as e:
    print(f'ERROR: {e}')

print(f"data:Dev_Schema rdf:type onner:LabelingSchema ;")
print(f"onner:schemaName 'CelloGraph'^^xsd:string .")
print()
            
print(f"data:Cellulosic_NER_Model rdf:type onner:NER_System ;")    # if/else required to identify system and human
print(f"onner:systemVersion '1.0'^^xsd:string .")
print()

print(f"data:{doi}_EndOfDocument rdf:type onner:EndOfDocument .")

@prefix onner: <http://spatialai.org/cellograph/ontology/onner/v2.0#> .
@prefix data: <http://spatialai.org/cellograph/ontology/onner/v2.0/data#> .
@prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> .
@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .
@prefix xsd: <http://www.w3.org/2001/XMLSchema#> .
@prefix owl: <http://www.w3.org/2002/07/owl#> .

data:Publication_10.1007_12_2015_319 rdf:type onner:ScholarlyPublication ;
onner:publicationTitle 'Cellulose: Structure and Properties'^^xsd:string ;
onner:doi '10.1007/12_2015_319'^^xsd:string ;
onner:publicationDate '2015-09-10'^^xsd:date .

data:10.1007_12_2015_319_A rdf:type onner:Abstract ;
onner:nextDocumentPart data:10.1007_12_2015_319_A-1 ;
onner:directlyContainsDocumentPart data:10.1007_12_2015_319_A-1 .

data:10.1007_12_2015_319_A-1 rdf:type onner:Paragraph ;
onner:positionInParentDocumentPart '1'^^xsd:nonNegativeInteger ;
onner:nextDocumentPart data:10.1007_12_2015_319_1 ;
onner:paragraphText 'Cellulose, a fascinatin

data:10.1007_12_2015_319_1-1 rdf:type onner:Paragraph ;
onner:positionInParentDocumentPart '1'^^xsd:nonNegativeInteger ;
onner:nextDocumentPart data:10.1007_12_2015_319_1-2 ;
onner:paragraphText 'Cellulose is the most abundant natural polymer in the biosphere, with a global production (and decomposition) of ~1.5 Â 10 <super>12</super> tons per year, comparable to the planetary reserves of the main fossil and mineral sources . In addition to the long-standing scientific interest in cellulose, the use of cellulose as renewable and biodegradable raw material in various applications is a proposed solution to the recent industrial challenge to successfully meet environmental and recycling problems . Versatile structuring of cellulose by various routes of modification, including both physical and chemical methods, has enabled its use in a variety of applications (e.g., fillers, building and coating materials, laminates, papers, textiles, optical films, sorption media, viscosity regulators, a

data:10.1007_12_2015_319_2-1 rdf:type onner:Paragraph ;
onner:positionInParentDocumentPart '1'^^xsd:nonNegativeInteger ;
onner:nextDocumentPart data:10.1007_12_2015_319_2-2 ;
onner:paragraphText 'Cellulose is distributed throughout nature in plants, animals, algae, fungi, and minerals (Fig. ). However, the major source of cellulose is plant fiber. Cellulose contributes approximately 40% to the carbon fraction in plants, serving as structuring element within the complex architecture of their cell walls. Cellulose can occur in pure form in plants but it is usually accompanied by hemicelluloses, lignins, and comparably small amounts of extractives. Wood contains about 40-50 wt% cellulose. Comparable amounts can be found in bagasse , bamboo , straw (40-50 wt%), and even higher in flax (70-80 wt%), hemp , jute , kapok , and ramie (70-75 wt %). Cotton is a fairly pure cellulose source, containing more than 90 wt% . An impressive amount of cellulose is produced each year, not only in wood fib

data:10.1007_12_2015_319_3.1-3 rdf:type onner:Paragraph ;
onner:positionInParentDocumentPart '3'^^xsd:nonNegativeInteger ;
onner:nextDocumentPart data:10.1007_12_2015_319_3.2 ;
onner:paragraphText 'The DP of native cellulose of various origins is in the range of 1,000-30,000, which corresponds to chain lengths of 500-15,000 nm. The cellulose samples that are obtained by isolation methods possess DP values ranging between 800 and 3,000 . Cellulose samples are polydisperse, thus, the DP is an average value. There are several techniques that can give information about the molar masses and their distribution, including viscosity measurements, size-exclusion chromatography, and light scattering.'^^xsd:string ;
onner:directlyContainsLabeledTerm data:10.1007_12_2015_319_3.1-3-1, data:10.1007_12_2015_319_3.1-3-2, data:10.1007_12_2015_319_3.1-3-3, data:10.1007_12_2015_319_3.1-3-4, data:10.1007_12_2015_319_3.1-3-5, data:10.1007_12_2015_319_3.1-3-6, data:10.1007_12_2015_319_3.1-3-7, data:10.1007_

data:10.1007_12_2015_319_3.2-4 rdf:type onner:Paragraph ;
onner:positionInParentDocumentPart '4'^^xsd:nonNegativeInteger ;
onner:nextDocumentPart data:10.1007_12_2015_319_3.2-5 ;
onner:paragraphText 'Intermolecular hydrogen bonding is responsible for the strong interaction between cellulose chains. The bonds are produced between adjacent cellulose macromolecules located along the (002) plane in the crystal lattice of cellulose I (native cellulose), mainly between the oxygen atom in C3 and the OH at C6 (see Sect. 3.3) . Together, the hydrogen bonding, weak C-H-O bonds, and hydrophobic interactions are responsible for the assembly of cellulose in layers, as elucidated by synchrotron X-ray and neutron diffraction experiments .'^^xsd:string ;
onner:directlyContainsLabeledTerm data:10.1007_12_2015_319_3.2-4-1, data:10.1007_12_2015_319_3.2-4-2, data:10.1007_12_2015_319_3.2-4-3, data:10.1007_12_2015_319_3.2-4-4, data:10.1007_12_2015_319_3.2-4-5, data:10.1007_12_2015_319_3.2-4-6, data:10.1007_

data:10.1007_12_2015_319_3.3-2 rdf:type onner:Paragraph ;
onner:positionInParentDocumentPart '2'^^xsd:nonNegativeInteger ;
onner:nextDocumentPart data:10.1007_12_2015_319_3.3-3 ;
onner:paragraphText 'Celluloses from different sources possess comparable crystallinity (i.e., modifications of cellulose I). However, solid state <super>13</super> C-NMR studies revealed that cellulose can crystallize with varying proportions of two different phases, named cellulose I <sub>α</sub> and I β . Plant cellulose mainly consists of cellulose I <sub>β</sub> , whereas cellulose produced by primitive organisms crystallizes in the I <sub>α</sub> phase. The monoclinic unit cell of cellulose I <sub>α</sub> with a space group P2 <sub>1</sub> consists of two cellulose molecules, each containing a cellobiose unit in the 002 corner plane and 002 center plane in a parallel fashion . Cellulose I <sub>β</sub> corresponds to a triclinic symmetry with space group P <sub>1</sub> containing one chain in the unit cel

data:10.1007_12_2015_319_3.3-5 rdf:type onner:Paragraph ;
onner:positionInParentDocumentPart '5'^^xsd:nonNegativeInteger ;
onner:nextDocumentPart data:10.1007_12_2015_319_3.3-6 ;
onner:paragraphText 'In addition to the crystalline domains, there are also amorphous or noncrystalline regions in cellulose, which influence the physical and chemical properties of celluloses . Interactions between solid cellulose and water, enzymes, and reactive or adsorptive substances occur first at the noncrystalline, amorphous domains or at the surface of cellulose crystals. Entire amorphous cellulose samples can be prepared by ball-milling of cellulose , deacetylation of cellulose acetate under nonaqueous alkaline conditions , or precipitation from nonaqueous cellulose solutions into nonaqueous media avoiding stress . However, the amorphous structures are usually unstable in the presence of water and form partly crystalline cellulose II. Interestingly, it was found that Raman and solid state <super>13</

data:10.1007_12_2015_319_3.4.1-2 rdf:type onner:Paragraph ;
onner:positionInParentDocumentPart '2'^^xsd:nonNegativeInteger ;
onner:nextDocumentPart data:10.1007_12_2015_319_3.4.1-3 ;
onner:paragraphText 'Fibers from different sources display different morphologies and dimensions. For example, cotton fibers are twisted (Fig. ) whereas those from spruce wood are generally untwisted (Fig. ). In contrast, fibers from bast plants are straight and round (Fig. ). Interestingly, they all share an internal structure made up of multiple cell wall layers. During the growth period, plant fibers develop a primary Cellulose: Structure and Properties cell wall layer (P) that is much thinner than the secondary wall (S), which is formed on its inner side. Further inside, the tertiary cell wall (T) is exposed to an open, hollow area or lumen resulting in typical hollow, cylinder-like plant cells. The cell wall thickness and length of plant fibers are about 4-6 30 μm and 15-30 μm, respectively. The P and

data:10.1007_12_2015_319_3.4.2-3 rdf:type onner:Paragraph ;
onner:positionInParentDocumentPart '3'^^xsd:nonNegativeInteger ;
onner:nextDocumentPart data:10.1007_12_2015_319_3.4.2-4 ;
onner:paragraphText 'Under static culture conditions, layers (sheets) of BC of up to several centimeters thickness are formed on the surface of the culture medium. It is important to control the pH because the accumulation of gluconic-, acetic-, or lactic acids in the culture broth decreases the pH far below the optimum for growth and cellulose production . In the 1980s, Johnson &amp; Johnson (New Brunswick, USA) started to commercialize sheets of BC on large scale for the treatment of different wounds . Independently, a Brazilian company, BioFill Produtos Biotecnologicos (Curitiba, PR Brazil), created a new wound healing system based on BC . At present, commercial products such as Suprasorb X <super>®</super> are distributed by Lohmann &amp; Rauscher (Neuwied, Germany).'^^xsd:string ;
onner:directlyContai

data:10.1007_12_2015_319_4.1-1 rdf:type onner:Paragraph ;
onner:positionInParentDocumentPart '1'^^xsd:nonNegativeInteger ;
onner:nextDocumentPart data:10.1007_12_2015_319_4.1-2 ;
onner:paragraphText 'Microcrystalline cellulose (MC) is a fine, white, and odorless crystalline powder (commercial products include Avicel <super>®</super> , Heweten <super>®</super> , Microcel <super>®</super> , Nilyn <super>®</super> , and Novagel <super>®</super> ) used in pharmaceutical (tablet binder), food (rheology control), and paper applications as well as in composite manufacturing . MC is commercially produced by treatment of biomass with aqueous sodium hydroxide to remove other constituents , followed by acidic hydrolysis. During hydrolysis, the DP of cellulose decreases with hydrolysis time until reaching a plateau value called "level off DP" (LODP), which ranges between 25 and 300 depending on the cellulose source . The hydrolysis takes place in the less crystalline regions, leaving a solid resid

data:10.1007_12_2015_319_4.2-2 rdf:type onner:Paragraph ;
onner:positionInParentDocumentPart '2'^^xsd:nonNegativeInteger ;
onner:nextDocumentPart data:10.1007_12_2015_319_4.2-3 ;
onner:paragraphText 'The stability of cellulose whiskers is strongly influenced by the size polydispersity, the dimensions of the particles, and their surface charge. Suspensions of Cellulose: Structure and Properties whiskers prepared with H <sub>2</sub> SO <sub>4</sub> (negatively charged) are more stable as a result of electrostatic repulsion  than whiskers obtained by hydrolysis with HCl (neutral particles).'^^xsd:string ;
onner:directlyContainsLabeledTerm data:10.1007_12_2015_319_4.2-2-1, data:10.1007_12_2015_319_4.2-2-2, data:10.1007_12_2015_319_4.2-2-3, data:10.1007_12_2015_319_4.2-2-4, data:10.1007_12_2015_319_4.2-2-5, data:10.1007_12_2015_319_4.2-2-6, data:10.1007_12_2015_319_4.2-2-7, data:10.1007_12_2015_319_4.2-2-8, data:10.1007_12_2015_319_4.2-2-9, data:10.1007_12_2015_319_4.2-2-10, data:10.1007_12

data:10.1007_12_2015_319_4.2-4 rdf:type onner:Paragraph ;
onner:positionInParentDocumentPart '4'^^xsd:nonNegativeInteger ;
onner:nextDocumentPart data:10.1007_12_2015_319_4.2-5 ;
onner:paragraphText 'In cellulose-based nanocomposites, whiskers give excellent properties because their regular and precise rigid-rod shape improves the mechanical characteristics of a variety of natural and synthetic materials. The nanocomposites show significantly enhanced mechanical properties as a result of formation of a rigid whiskers network, even when the whiskers content is only a few percent . Cellulose nanocrystals can be dispersed in polar aprotic solvents such as dimethyl sulfoxide (DMSO) and N,N-dimethylformamide (DMF), for example, for the preparation of films displaying birefringence . Dispersions in dichloromethane allow film-casting with poly(ε-caprolactone) leading to completely biobased composites that possess higher melting and crystallization temperatures, as well as higher glass transit

data:10.1007_12_2015_319_4.3-2 rdf:type onner:Paragraph ;
onner:positionInParentDocumentPart '2'^^xsd:nonNegativeInteger ;
onner:nextDocumentPart data:10.1007_12_2015_319_4.3-3 ;
onner:paragraphText 'MFC can be used to produce patterned surfaces using lithographic techniques . In these cases, MFC improves homogeneity and stability, which is important in various applications. Microcontact printing of oppositely charged poly(ethylene imine) (PEI) on a surface of PEI/poly(styrene sulfonate) followed by MFC treatment (Fig. ), or on a PEI-coated poly(dimethyl siloxane) stamp, produces geometric patterns (Fig. ). Such surfaces can be used in the development of membranes and filters because the pore geometry and size can be controlled by selection of the appropriate microstamp pattern.'^^xsd:string ;
onner:directlyContainsLabeledTerm data:10.1007_12_2015_319_4.3-2-1, data:10.1007_12_2015_319_4.3-2-2, data:10.1007_12_2015_319_4.3-2-3, data:10.1007_12_2015_319_4.3-2-4, data:10.1007_12_2015_319_

data:10.1007_12_2015_319_5.1-1 rdf:type onner:Paragraph ;
onner:positionInParentDocumentPart '1'^^xsd:nonNegativeInteger ;
onner:nextDocumentPart data:10.1007_12_2015_319_5.1-2 ;
onner:paragraphText 'The electrospinning technique is widely used for the production of nanofibers, which opens a route for production of materials with high effective surface areas . Nanofibers can be produced from different polymers and have applications in various fields, namely biomedicine, composites, filters, catalysts, and textiles . Nanofibers regulate water vapor and wind permeability and can improve the thermal isolation of textiles. Moreover, they can possess special properties such as aerosol-filtration, binding of chemical and biological contaminants, or improved surfactant release . Air cleaning of contaminated environments is a typical example of their application .'^^xsd:string ;
onner:directlyContainsLabeledTerm data:10.1007_12_2015_319_5.1-1-1, data:10.1007_12_2015_319_5.1-1-2, data:10.1007_1

data:10.1007_12_2015_319_5.2-1 rdf:type onner:Paragraph ;
onner:positionInParentDocumentPart '1'^^xsd:nonNegativeInteger ;
onner:nextDocumentPart data:10.1007_12_2015_319_5.2-2 ;
onner:paragraphText 'Nanoscaled particles can be obtained from different cellulose esters, including commercially available cellulose acetates, cellulose acetate propionate, and cellulose acetate butyrate, and also from some organo-soluble cellulose ethers. Methods commonly used are emulsification solvent evaporation and the low-energy method of solvent displacement by dialysis, inducing nanoprecipitation . Comparing the methods, a large amount of small and uniform nanoparticles can be obtained by the emulsification solvent evaporation procedure, whereas solvent displacement yields narrowly distributed particles. Typical particles obtained from cellulose acetate are shown in Fig.   . Dialysis is easy to use and therefore appropriate for laboratory-scale studies. Moreover, very pure suspensions of the nanoparti

data:10.1007_12_2015_319_5.2-3 rdf:type onner:Paragraph ;
onner:positionInParentDocumentPart '3'^^xsd:nonNegativeInteger ;
onner:nextDocumentPart data:10.1007_12_2015_319_5.2-4 ;
onner:paragraphText 'Although an organo-soluble cellulose derivative must be used for the technique of nanoprecipitation, even pure cellulose nanoparticles can be prepared. Using trimethylsilyl cellulose (TMSC), the formation of nanoparticles by dialysis of the organic solvent against water is accompanied by complete removal of the TMS functions. Analysis of particle size distribution shows that cellulose particles with a size of 80-260 nm are accessible in this simple manner . Aqueous suspensions of the pure, spherical cellulose nanoparticles are storable for several months without any demixing. Covalent labeling of the cellulose nanoparticles with FITC has no influence on particle size, shape, and stability. The particles can be sterilized and suspended in biological media without structural changes. As can 

data:10.1007_12_2015_319_5.2-5 rdf:type onner:Paragraph ;
onner:positionInParentDocumentPart '5'^^xsd:nonNegativeInteger ;
onner:nextDocumentPart data:10.1007_12_2015_319_5.2-6 ;
onner:paragraphText '1. Solubility of a sufficient amount of electrolyte in the organic liquid 2. Adequate stability of the electrolyte/solvent complex 3. Cooperative action of the solvated ion-pair on cellulose hydrogen bonds 4. Sufficient basicity of the anion  For example, to obtain a 3 wt% solution of cellulose in DMAc requires about 4 wt% LiCl, whereas 10 wt% LiCl is needed to dissolve it in DMF. This agrees with the fact that LiCl forms a stronger complex with the former solvent . By contrast, NaCl is not appropriate because it is insoluble in DMAc and DMF. The strength of cation-solvent association of alkali metal chlorides in DMAc and DMF is in the order Li <super>+</super> &gt; Na <super>+</super> &gt; K <super>+</super> &gt; Cs <super>+</super> (as determined by electrospray ionization mass spectrosc

data:10.1007_12_2015_319_5.2-7 rdf:type onner:Paragraph ;
onner:positionInParentDocumentPart '7'^^xsd:nonNegativeInteger ;
onner:nextDocumentPart data:10.1007_12_2015_319_5.2-8 ;
onner:paragraphText 'Highly surprising is the finding that cellulose dissolves quickly even in a mixture of acetone/triethyloctylammonium chloride containing 9 parts of the salt and 20 parts of the organic liquid. No pretreatment or activation of the cellulose is necessary. This has not yet been reported for binary acetone/salt mixtures, including ILs, where acetone has been found to cause immediate cellulose precipitation . Further increase in the amount of triethyloctylammonium chloride does not have an adverse effect on the solution. The <super>13</super> C-NMR spectrum measured for cellulose dissolved in acetone/triethyloctylammonium chloride verifies that the biopolymer is dissolved without being chemically modified (nonderivatizing solvent) as is the case for all solvents of this class (Fig. ). Neverthel

data:10.1007_12_2015_319_5.2-10 rdf:type onner:Paragraph ;
onner:positionInParentDocumentPart '10'^^xsd:nonNegativeInteger ;
onner:nextDocumentPart data:10.1007_12_2015_319_5.2-11 ;
onner:paragraphText 'Complete dehydration of TBAF Â 3H <sub>2</sub> O, resulting in the water-free salt, is impossible because anhydrous TBAF is unstable and undergoes rapid E2 elimination, resulting in the formation of hydrogen difluoride anions . However, preparation of anhydrous TBAF in situ by reacting tetra-n-butylammonium cyanide with hexafluorobenzene in dry DMSO has been described . Freshly prepared water-free DMSO/TBAF solution, even in the presence of the by-product hexacyanobenzene, dissolves cellulose very easily. In the water-free solvent, dissolution of bleached cotton fibers with very high DP of 3,743 occurs within a short time, as visualized by optical microscopy (Fig. , ).'^^xsd:string ;
onner:directlyContainsLabeledTerm data:10.1007_12_2015_319_5.2-10-1, data:10.1007_12_2015_319_5.2-10-2, 

data:10.1007_12_2015_319_5.2-12 rdf:type onner:Paragraph ;
onner:positionInParentDocumentPart '12'^^xsd:nonNegativeInteger ;
onner:nextDocumentPart data:10.1007_12_2015_319_5.2-13 ;
onner:paragraphText 'These results again substantiate the simple approach mentioned above for creation of new solvents for cellulose. Following such an approach, another solvent was found very recently; almost anhydrous dibenzyldimethylammonium (BMAF Â 0.1H <sub>2</sub> O) in DMSO dissolves microcrystalline and fibrous celluloses .'^^xsd:string ;
onner:directlyContainsLabeledTerm data:10.1007_12_2015_319_5.2-12-1, data:10.1007_12_2015_319_5.2-12-2, data:10.1007_12_2015_319_5.2-12-3, data:10.1007_12_2015_319_5.2-12-4, data:10.1007_12_2015_319_5.2-12-5, data:10.1007_12_2015_319_5.2-12-6, data:10.1007_12_2015_319_5.2-12-7, data:10.1007_12_2015_319_5.2-12-8, data:10.1007_12_2015_319_5.2-12-9 .

data:10.1007_12_2015_319_5.2-12-1 rdf:type onner:LabeledTerm ;
onner:labeledTermText 'solvents'^^xsd:string ;
onner:of

data:10.1007_12_2015_319_6.2-1 rdf:type onner:Paragraph ;
onner:positionInParentDocumentPart '1'^^xsd:nonNegativeInteger ;
onner:nextDocumentPart data:10.1007_12_2015_319_6.2-2 ;
onner:paragraphText 'The first ionic liquids (ILs) used for esterification of cellulose were Nalkylpyridinium halides, especially N-ethylpyridinium chloride (EPyCl) and Nbenzylpyridinium chloride (BPyCl) . Nevertheless, the most promising ILs for the modification of cellulose are the salts of 1-alkyl-3-methylimidazolium. In 2002, it was shown that such ILs could open new paths for the shaping of polysaccharides . Additionally, they could lead to commercially relevant routes toward homogeneous cellulose chemistry, which would significantly broaden the number of tailored cellulose derivatives. Meanwhile, a huge number of cellulose-dissolving ILs are now known and discussed in various recent reviews (e.g.,  and references cited therein), and the number of reported low melting organic salts is growing rapidly (Fig

data:10.1007_12_2015_319_6.3-2 rdf:type onner:Paragraph ;
onner:positionInParentDocumentPart '2'^^xsd:nonNegativeInteger ;
onner:nextDocumentPart data:10.1007_12_2015_319_6.3-3 ;
onner:paragraphText 'Complete dissolution of microcrystalline cellulose in aqueous NaOH is possible . However, linters cellulose had limited solubility (26-37%) applying the same procedure. Kamide and coworkers have applied steam explosion treatments in order to dissolve pulp directly in NaOH . In technical papers, they claim that a solution of 5% of steam-exploded cellulose in 9.1% NaOH at 4 C, spun into 20% H <sub>2</sub> SO <sub>4</sub> at 5 C, yielded fibers but of poor quality.'^^xsd:string ;
onner:directlyContainsLabeledTerm data:10.1007_12_2015_319_6.3-2-1, data:10.1007_12_2015_319_6.3-2-2, data:10.1007_12_2015_319_6.3-2-3, data:10.1007_12_2015_319_6.3-2-4, data:10.1007_12_2015_319_6.3-2-5, data:10.1007_12_2015_319_6.3-2-6, data:10.1007_12_2015_319_6.3-2-7, data:10.1007_12_2015_319_6.3-2-8, data:10.1007

data:10.1007_12_2015_319_6.3-3 rdf:type onner:Paragraph ;
onner:positionInParentDocumentPart '3'^^xsd:nonNegativeInteger ;
onner:nextDocumentPart data:10.1007_12_2015_319_6.3-4 ;
onner:paragraphText 'Recently, the dissolution and modification of cellulose in mixtures of an aqueous base with urea and thiourea has been the focus of interest . Cellulose can be dissolved in an aqueous solution of NaOH (7 wt%)/urea (12 wt%). Starting from a precooled mixture at À12 C, cellulose dissolves within 2 min. The urea hydrates could possibly be self-assembled at the surface of the NaOH hydrogen-bonded cellulose . The solutions are rather unstable and sensitive to temperature, polymer concentration, and storage time . Alternatives include LiOH/ urea  and NaOH/thiourea . TEM images and wide-angle X-ray diffraction (WAXD) provide experimental evidence for the formation of a wormlike cellulose inclusion complex surrounded by urea (Fig. ). Glucan cellulose was used as a precursor for chemical modificati

data:10.1007_12_2015_319_6.3-5 rdf:type onner:Paragraph ;
onner:positionInParentDocumentPart '5'^^xsd:nonNegativeInteger ;
onner:nextDocumentPart data:10.1007_12_2015_319_7.1 ;
onner:paragraphText 'However, the dissolution of cellulose prior to chemical reaction offers a great opportunity for the design of novel and unconventional cellulose derivatives by homogeneous phase chemistry. For homogeneous phase chemistry, either nonderivatizing or derivatizing solvents can be used. In the case of derivatizing solvents, both conversion of the soluble intermediate formed during dissolution and modification of the isolated intermediate (which is re-dissolved in an organic solvent such as DMSO or DMF) are considered homogeneous reactions. By contrast, neither chemical modification of soluble but "stable" cellulose derivatives such as cellulose acetate in DMSO nor chemical modification of cellulose under dissolution of the cellulose derivative formed (as a result of the conversion) are included i

data:10.1007_12_2015_319_7.1.1-1 rdf:type onner:Paragraph ;
onner:positionInParentDocumentPart '1'^^xsd:nonNegativeInteger ;
onner:nextDocumentPart data:10.1007_12_2015_319_7.1.2 ;
onner:paragraphText 'Although a wide variety of solvents for cellulose have been developed and investigated in recent years, only a few have shown the potential for controlled and homogeneous functionalization of the polysaccharide (Table ) . Limitations to the application of solvents result from high toxicity, high reactivity of the solvents leading to undesired side reactions, and loss of solubility during reactions. The latter results in inhomogeneous mixtures through formation of gels and pastes, which are difficult to mix, and even through formation of de-swollen particles of low reactivity, which settle out in the reaction medium. Homogeneous reaction conditions give the opportunity for esterification with state of the art reagents, for example, after in situ activation of carboxylic acids, which is ch

data:10.1007_12_2015_319_7.1.2-1 rdf:type onner:Paragraph ;
onner:positionInParentDocumentPart '1'^^xsd:nonNegativeInteger ;
onner:nextDocumentPart data:10.1007_12_2015_319_7.1.2-2 ;
onner:paragraphText 'Although studied for decades, sulfation of cellulose is still of interest because the products show pronounced bioactivity and can be used for self-assembly systems such as polyelectrolyte complexes. A very elegant method offers the sulfation of cellulose dissolved in ILs. Cellulose dissolved in BMIMCl/co-solvent mixtures can be easily converted into cellulose sulfate (CS) by using SO <sub>3</sub> -Py, SO <sub>3</sub> -DMF, or ClSO <sub>3</sub> H . Highly substituted CS with DS values up to 3 has been reported for sulfation in BMIMCl at 30 C ; however, it should be noted that cellulose/IL solutions slowly turned solid upon cooling to room temperature, depending on the cellulose and moisture content. Synthesis of CS with an even distribution of sulfate groups along the polymer chains re

data:10.1007_12_2015_319_7.1.3-1 rdf:type onner:Paragraph ;
onner:positionInParentDocumentPart '1'^^xsd:nonNegativeInteger ;
onner:nextDocumentPart data:10.1007_12_2015_319_7.2 ;
onner:paragraphText 'In addition to typical modification of the hydroxyl groups of cellulose, chemical modification can be carried out by reaction at the C atoms of the AGU. Nucleophilic displacement (S <sub>N</sub> ) reactions with cellulose are based on the transformation of hydroxyl groups of the biopolymer to a good leaving group, mainly by tosylation . A broad variety of cellulose derivatives are accessible, as summarized in Table . The S <sub>N</sub> reaction occurs almost exclusively at the primary position of the repeating unit, most probably for steric reasons. The S <sub>N</sub> of a tosylate moiety occurs via a S <sub>N</sub> 2 mechanism (i.e., a transition state appears containing five atoms that is hardly formed at the secondary positions of the modified AGU).'^^xsd:string ;
onner:directlyContains

data:10.1007_12_2015_319_7.2-1 rdf:type onner:Paragraph ;
onner:positionInParentDocumentPart '1'^^xsd:nonNegativeInteger ;
onner:nextDocumentPart data:10.1007_12_2015_319_7.2-2 ;
onner:paragraphText 'Conversion of cellulose tosylate with diamines or oligoamines yields polymers of the type P-CH <sub>2</sub> -NH-(X)-NH <sub>2</sub> (P ¼ cellulose; X ¼ alkylene, aryl, aralkylene, or oligoamine) at position 6 (Fig. ). These cellulose derivatives can form transparent films and can be used for the immobilization of enzymes such as glucose oxidase, peroxidase, and lactate oxidase. The products are useful as biosensors. Soluble and film-forming cellulose derivatives with redox-chromogenic and enzymeimmobilizing 1,4-phenylenediamine groups have been reported . Thus, it is possible to design amino celluloses with properties that differ in, for example, the distance of the terminal NH <sub>2</sub> groups from the cellulose backbone (spacer effect), basicity, and reactivity. Moreover, di-and oligo

data:10.1007_12_2015_319_7.2-3 rdf:type onner:Paragraph ;
onner:positionInParentDocumentPart '3'^^xsd:nonNegativeInteger ;
onner:nextDocumentPart data:10.1007_12_2015_319_7.2-4 ;
onner:paragraphText '6-Deoxy-6-amino cellulose forms multiple oligomeric species that were discovered using the hydrodynamic technique of analytical ultracentrifugation as a probe. For every amino cellulose studied, the sedimentation coefficient distributions indicate 4 or 5 discrete species, with a stepwise increase in sedimentation coefficient. This was found in every case across a range of six different solute loading concentrations (from 0.125 to 2.0 mg/mL). For example, the lowest sedimentation coefficient of 6-Deoxy-6-(2-(bis(2-aminoethyl)aminoethyl)amino) cellulose was 1.8 Svedberg (S). Additional species sedimenting at peak maxima of 2.8, 4.0, 5.1 and 6.5 S were also clearly found (Fig. ). It is obvious that even a fully reversible self-association (tetramerization) within this family of 6-deoxy-6-amin

data:10.1007_12_2015_319_7.2-5 rdf:type onner:Paragraph ;
onner:positionInParentDocumentPart '5'^^xsd:nonNegativeInteger ;
onner:nextDocumentPart data:10.1007_12_2015_319_7.2-6 ;
onner:paragraphText 'The chemoselective introduction of dendrons into cellulose is achieved by homogeneous reaction of 6-deoxy-6-azido cellulose with propargylpolyamidoamine (PAMAM) dendrons in DMSO and ILs or heterogeneously in methanol in the presence of CuSO 4'^^xsd:string ;
onner:directlyContainsLabeledTerm data:10.1007_12_2015_319_7.2-5-1, data:10.1007_12_2015_319_7.2-5-2, data:10.1007_12_2015_319_7.2-5-3, data:10.1007_12_2015_319_7.2-5-4, data:10.1007_12_2015_319_7.2-5-5, data:10.1007_12_2015_319_7.2-5-6, data:10.1007_12_2015_319_7.2-5-7, data:10.1007_12_2015_319_7.2-5-8 .

data:10.1007_12_2015_319_7.2-5-1 rdf:type onner:LabeledTerm ;
onner:labeledTermText 'cellulose'^^xsd:string ;
onner:offset '49'^^xsd:nonNegativeInteger ;
onner:length '9'^^xsd:nonNegativeInteger ;
onner:labeledTermDirectlyContainedBy 

data:10.1007_12_2015_319_7.2-8 rdf:type onner:Paragraph ;
onner:positionInParentDocumentPart '8'^^xsd:nonNegativeInteger ;
onner:nextDocumentPart data:10.1007_12_2015_319_7.2-9 ;
onner:paragraphText 'Even water-soluble deoxy-azido cellulose derivatives are accessible by carboxymethylation, applying 2-propanol/aqueous NaOH as medium . The carboxymethyl deoxy-azido cellulose provides a convenient starting material for the selective conversion by Huisgen reaction, yielding water-soluble carboxymethyl 6-deoxy-(1-N-(1,2,3-triazolo)-4-PAMAM) cellulose derivatives of first to third generation (Fig. ).'^^xsd:string ;
onner:directlyContainsLabeledTerm data:10.1007_12_2015_319_7.2-8-1, data:10.1007_12_2015_319_7.2-8-2, data:10.1007_12_2015_319_7.2-8-3, data:10.1007_12_2015_319_7.2-8-4, data:10.1007_12_2015_319_7.2-8-5, data:10.1007_12_2015_319_7.2-8-6, data:10.1007_12_2015_319_7.2-8-7, data:10.1007_12_2015_319_7.2-8-8, data:10.1007_12_2015_319_7.2-8-9, data:10.1007_12_2015_319_7.2-8-10 .

data:1

data:10.1007_12_2015_319_7.4-1 rdf:type onner:Paragraph ;
onner:positionInParentDocumentPart '1'^^xsd:nonNegativeInteger ;
onner:nextDocumentPart data:10.1007_12_2015_319_7.4-2 ;
onner:paragraphText 'Polysaccharide aryl carbonates are easily accessible reactive derivatives useful for a variety of reactions . Easily soluble cellulose aryl carbonates can be synthesized by applying phenyl chloroformate, phenyl fluoroformate, and p-NO <sub>2</sub> -phenyl chloroformate under homogeneous reaction conditions with DMAc/LiCl as reaction medium. Pyridine should be used instead of triethylamine to reduce the nucleophilicity of the hydroxyl groups of the polymer and to exclude formation of cyclic or intermolecular carbonates .'^^xsd:string ;
onner:directlyContainsLabeledTerm data:10.1007_12_2015_319_7.4-1-1, data:10.1007_12_2015_319_7.4-1-2, data:10.1007_12_2015_319_7.4-1-3, data:10.1007_12_2015_319_7.4-1-4, data:10.1007_12_2015_319_7.4-1-5, data:10.1007_12_2015_319_7.4-1-6, data:10.1007_12_2015_

data:10.1007_12_2015_319_7.4-3 rdf:type onner:Paragraph ;
onner:positionInParentDocumentPart '3'^^xsd:nonNegativeInteger ;
onner:nextDocumentPart data:10.1007_12_2015_319_8 ;
onner:paragraphText 'A variety of novel cellulose derivatives are accessible based on cellulose phenyl carbonate. For instance, poly-zwitterions can be produced (Fig. ). Cellulose phenyl carbonate can be allowed to react with equimolar amounts of β-alanine ethyl ester and N-tert-butoxycarbonyl-1,2-ethanediamine. The aminolysis produces (3-ethoxy-3-oxopropyl-N-Boc-2-aminoethyl) cellulose carbamate with a DS <sub>alanineester</sub> of 0.88 and a DS <sub>BocÀEDA</sub> of 0.95. Thus, there is indication of similar reactivity of the amines, together with a very high conversion (95% of the carbonate moieties into carbamate). Fig.  Reaction scheme for the synthesis of anionic, cationic, and ampholytic cellulose carbamate A polyanion, polycation, and poly-zwitterion can be obtained from cellulose carbamate because of the 

data:10.1007_12_2015_319_8-1 rdf:type onner:Paragraph ;
onner:positionInParentDocumentPart '1'^^xsd:nonNegativeInteger ;
onner:nextDocumentPart data:10.1007_12_2015_319_8-2 ;
onner:paragraphText 'Cellulose is the most important renewable resource and a unique polymer in terms of its structure and properties. Because of its unique properties, cellulose can serve as starting material for various products and processes for a sustainable world and the development of a country\'s bioeconomy. Physical and chemical modification reactions yielding fibers, film, sponges, and cellulose ethers and esters are of high commercial importance today. However, research and development in the field of nanostructuring of cellulose and cellulose derivatives, homogeneous chemistry with cellulose applying various solvents (including molten salts, ionic liquids, and water-based systems) can open new avenues for product design with modern organic chemistry. It can be expected that homogeneous phase chemistry w

# Backup code

In [None]:
# WRITING RDF TRIPLES FOR DOCUMENT

# DECLARING VARIABLE
labels_in_doc = []

# DECLARING LABELING SCHEMA
labeling_schema = ['ENTITY', 
                   'CHEMICAL_ENTITY', 
                   'MATERIAL_ENTITY', 
                   'STRUCTURE_ENTITY',
                   'APPLICATION',
                   'PROCESS',
                   'PROPERTY',
                   'EQUIPMENT',
                   'RELATIONSHIP',
                   'MEASUREMENT', 
                   'ABBREVIATION']

# GETTING DOI
try:
    doi = document[0][2]['DOI']
    doi = doi.replace('/', '_')
except NameError:
    print('ERROR: Document object is not defined!')

# PRINTING PREFIXES
print("@prefix onner: <http://spatialai.org/cellograph/ontology/onner/v2.0#> .")
print("@prefix data: <http://spatialai.org/cellograph/ontology/onner/v2.0/data#> .")
print("@prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> .")
print("@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .")
print("@prefix xsd: <http://www.w3.org/2001/XMLSchema#> .")
print("@prefix owl: <http://www.w3.org/2002/07/owl#> .")
print()

try:
    for record in document:

        # METADATA
        if record[1] == 'Metadata':

            # PRINTING METADATA
            print(f"data:Publication_{doi} rdf:type onner:ScholarlyPublication ;")
            print(f"onner:title '{record[2]['Title']}'^^xsd:string ;")
            print(f"onner:doi '{record[2]['DOI']}'^^xsd:string ;")
            print(f"onner:publicationDate '{record[2]['Publication Date']}'^^xsd:date .")
            print()

        # ABSTRACT AND IT'S DOCUMENT PARTS
        elif record[1] == 'Abstract':
            index = document.index(record) + 1
            next_section = document[index][0]
            paragraph_ids = []

            # CREATING PARAGRAPH ID FOR ABSTRACT'S PARAGRAPH 
            for paragraph_number, _ in record[2].items():
                paragraph_id = doi + '_A' + '-' + str(paragraph_number)
                paragraph_ids.append('data:'+paragraph_id)

            paragraph_ids_joined = ', '.join(paragraph_ids)

            # PRINTING ABSTRACT AND IT'S PARAGRAPH IDs 
            print(f"data:{doi}_A rdf:type onner:Abstract ;")
#             print(f"onner:nextDocumentPart {paragraph_ids[0]} ;")    # NEXT DOC PART AFTER ABSTRACT
            print(f"onner:directlyContainsDocumentPart {paragraph_ids_joined} .")
            print()

            # PARAGRAPHS AND IT'S DOCUMENT PARTS
            for paragraph_number, paragraph_text in record[2].items():
                labeled_term_info_list = []
                labeled_term_number = 1

                # replacing ' with \' in text
                if "'" in paragraph_text:
                    paragraph_text = paragraph_text.replace("'", r"\'")

                # creating doc object
                doc = nlp(paragraph_text)

                for ent in doc.ents:
                    current_dateTime = str(datetime.now())[:-7]    # RECHECK THE APPROPRIATE PLACEMENT
                    labeled_term_id = doi + '_A' + '-' + str(paragraph_number) + '-' + str(labeled_term_number)
                    labeled_term = ent.text
                    label = ent.label_
                    offset = ent.start_char
                    length = ent.end_char - ent.start_char
                    labeled_term_info = [labeled_term_id, labeled_term, label, offset, length, current_dateTime]
                    labeled_term_info_list.append(labeled_term_info)
                    labeled_term_number += 1

                labeled_term_ids = ['data:'+i[0] for i in labeled_term_info_list]
                labeled_term_ids_joined = ', '.join(labeled_term_ids)

                # PRINTING PARAGRAPHS AND IT'S LABELED TERM IDs 
                print(f"data:{doi}_A-{paragraph_number} rdf:type onner:Paragraph ;")
                print(f"onner:positionInParentDocumentPart '{paragraph_number}'^^xsd:nonNegativeInteger ;")

                if paragraph_number == len(paragraph_ids):
                    print(f"onner:nextDocumentPart data:{doi}_{next_section} ;")
                else:
                    print(f"onner:nextDocumentPart data:{doi}_A-{paragraph_number+1} ;")

                print(f"onner:paragraphText '{paragraph_text}'^^xsd:string ;")
                print(f"onner:directlyContainsLabeledTerm {labeled_term_ids_joined} .")
                print()

                # LABELED TERMS
                for info in labeled_term_info_list:
                    # PRINTING LABELED TERMS
                    print(f"data:{info[0]} rdf:type onner:LabeledTerm ;")    # DEAL WITH ATOMIC / COMPOUND
                    print(f"onner:documentPartIsDirectlyContainedBy data:{doi}_A-{paragraph_number} ;")
                    print(f"onner:labeledTermText '{info[1]}'^^xsd:string ;")
                    print(f"onner:offset '{info[3]}'^^xsd:nonNegativeInteger ;")
                    print(f"onner:length '{info[4]}'^^xsd:nonNegativeInteger ;")
                    print(f"onner:hasLabeledTermStatus data:Candidate_{info[0]} .")
                    print()

                    # PRINTING LABELED TERMS STATUS
                    print(f"data:Candidate_{info[0]} rdf:type onner:CandidateStatus ;")
                    print(f"onner:statusCreatedBy data:System_127_0_0_1 ;")
                    print(f"onner:statusCreatedDate '{info[5]}'^^xsd:dateTime ;")

                    try:
                        if not bool(labeling_schema):
                            raise Exception('Labeling schema is empty!')
                            
                        label_number_in_schema = labeling_schema.index(info[2]) + 1
                    except Exception as e:
                        print(f'ERROR: {e}')
                    else:
                        print(f"onner:hasTermLabel data:Label{label_number_in_schema} .")

                    # adding lebels and their position in the schema
                    if [label_number_in_schema, info[2]] not in labels_in_doc:
                        labels_in_doc.append([label_number_in_schema, info[2]])
                    print()

            print('#========================= SECTION DIVIDER =========================#')
            print()

        # SECTION AND IT'S DOCUMENT PARTS
        else:
            section_number = record[0]
            section_name = record[1]
            index = document.index(record) + 1
            paragraph_ids = []

            if index == len(document):
                next_section = 'EndOfDocument'
            else:
                next_section = document[index][0]

            # CREATING PARAGRAPH ID FOR SECTION'S PARAGRAPH 
            for paragraph_number, _ in record[2].items():
                paragraph_id = doi + '_' + str(section_number) + '-' + str(paragraph_number)
                paragraph_ids.append('data:'+paragraph_id)

            paragraph_ids_joined = ', '.join(paragraph_ids)
            
            # PRINTING SECTION AND IT'S PARAGRAPH IDs 
            print(f"data:{doi}_{section_number} rdf:type onner:Section ;")
            print(f"onner:headerText '{record[1]}'^^xsd:string ;")
#             print(f"onner:nextDocumentPart {paragraph_ids[0]} ;")    # NEXT DOC PART AFTER SECTION
            print(f"onner:directlyContainsDocumentPart {paragraph_ids_joined} .")
            print()
            
            # PARAGRAPHS AND IT'S DOCUMENT PARTS
            for paragraph_number, paragraph_text in record[2].items():
                labeled_term_info_list = []
                labeled_term_sequence = 1

                # replacing ' with \' in text
                if "'" in paragraph_text:
                    paragraph_text = paragraph_text.replace("'", r"\'")

                # creating doc object
                doc = nlp(paragraph_text)

                for ent in doc.ents:
                    current_dateTime = str(datetime.now())[:-7]    # RECHECK THE APPROPRIATE PLACEMENT
                    labeled_term_id = doi + '_' + str(section_number) + '-' + str(paragraph_number) + '-' + str(labeled_term_sequence)
                    labeled_term = ent.text
                    label = ent.label_
                    offset = ent.start_char
                    length = ent.end_char - ent.start_char
                    labeled_term_info = [labeled_term_id, labeled_term, label, offset, length, current_dateTime]
                    labeled_term_info_list.append(labeled_term_info)
                    labeled_term_sequence += 1

                labeled_term_ids = ['data:'+i[0] for i in labeled_term_info_list]
                labeled_term_ids_joined = ', '.join(labeled_term_ids)

                # PRINTING PARAGRAPHS AND IT'S LABELED TERM IDs 
                print(f"data:{doi}_{section_number}-{paragraph_number} rdf:type onner:Paragraph ;")
                print(f"onner:positionInParentDocumentPart '{paragraph_number}'^^xsd:nonNegativeInteger ;")

                if paragraph_number == len(paragraph_ids):
                    print(f"onner:nextDocumentPart data:{doi}_{next_section} ;")
                else:
                    print(f"onner:nextDocumentPart data:{doi}_{section_number}-{paragraph_number+1} ;")

                print(f"onner:paragraphText '{paragraph_text}'^^xsd:string ;")
                print(f"onner:directlyContainsLabeledTerm {labeled_term_ids_joined} .")
                print()

                # LABELED TERMS
                for info in labeled_term_info_list:
                    # PRINTING LABELED TERMS
                    print(f"data:{info[0]} rdf:type onner:LabeledTerm ;")    # DEAL WITH ATOMIC / COMPOUND
                    print(f"onner:documentPartIsDirectlyContainedBy data:{doi}_{section_number}-{paragraph_number} ;")
                    print(f"onner:labeledTermText '{info[1]}'^^xsd:string ;")
                    print(f"onner:offset '{info[3]}'^^xsd:nonNegativeInteger ;")
                    print(f"onner:length '{info[4]}'^^xsd:nonNegativeInteger ;")
                    print(f"onner:hasLabeledTermStatus data:Candidate_{info[0]} .")
                    print()

                    # PRINTING LABELED TERMS STATUS
                    print(f"data:Candidate_{info[0]} rdf:type onner:CandidateStatus ;")
                    print(f"onner:statusCreatedBy data:System_127_0_0_1 ;")
                    print(f"onner:statusCreatedDate '{info[5]}'^^xsd:dateTime ;")

                    try:
                        if not bool(labeling_schema):
                            raise Exception('Labeling schema is empty!')
                            
                        label_number_in_schema = labeling_schema.index(info[2]) + 1
                    except Exception as e:
                        print(f'ERROR: {e}')
                    else:
                        print(f"onner:hasTermLabel data:Label{label_number_in_schema} .")

                    # adding lebels and their position in the schema
                    if [label_number_in_schema, info[2]] not in labels_in_doc:
                        labels_in_doc.append([label_number_in_schema, info[2]])

                    print()

            print('#========================= SECTION DIVIDER =========================#')
            print()

except NameError:
    print('ERROR: Document object is not defined!')

try:
    if not bool(labels_in_doc):
        raise Exception('List of labels found in document is empty!')
        
    for label in labels_in_doc:
        print(f"data:Label{label[0]} rdf:type onner:Label ;")
        print(f"onner:fromLabelingSchema data:DevSchema ;")
        print(f"onner:labelText '{label[1]}'^^xsd:string .")
        print()       
except Exception as e:
    print(f'ERROR: {e}')
            
print(f"data:System_127_0_0_1 rdf:type onner:Agent .")
print(f"data:DevSchema rdf:type onner:LabelingSchema .")

# Connect to GraphDB

In [51]:
from SPARQLWrapper import SPARQLWrapper, JSON
import pandas as pd
import json

In [52]:
# connect and query graph database
# ADD TRY CATCH TO AVOID DATABASE CONNECTIVITY ERROR

# specify the repository
sparql = SPARQLWrapper("http://LAPTOP-S7TVD5I4:7200/repositories/SciPub")

# SPARQL query
sparql.setQuery("""
    PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
    PREFIX : <http://spatialai.org/SciPub/v2.0#>
    PREFIX SciPub: <http://spatialai.org/SciPub/v2.0#>

    SELECT ?sentenceID ?sentenceText ?labeledTermText ?offset ?length ?labelText (NOW() AS ?systemDateTime)
    WHERE {
        ?paragraphID rdf:type SciPub:Paragraph ;
                     SciPub:directlyContainsDocumentPart ?sentenceID .

        ?sentenceID SciPub:sentenceText ?sentenceText .

        OPTIONAL {
            ?labeledTermID SciPub:documentPartIsDirectlyContainedBy ?sentenceID ;
                           SciPub:labeledTermText ?labeledTermText ;
                           SciPub:offset ?offset ;
                           SciPub:length ?length ;
                           SciPub:hasLabel ?labelID .

            ?labelID rdf:type SciPub:Label ;
                     SciPub:labelText ?labelText .
        }
    }
""")

# convert results to JSON
sparql.setReturnFormat(JSON)
fetched_data = sparql.query().convert()


URLError: <urlopen error [WinError 10061] No connection could be made because the target machine actively refused it>

In [None]:
# convert query result from JSON to dataframe

column_header = ['sentence_id', 'sentence_text', 'labeled_term_text', 'offset', 'length', 'label_text', 'system_datetime']
fetched_data_df = pd.DataFrame(columns=column_header)
sentence_id = []
sentence_text = []
labeled_term_text = []
offset = []
length = []
label_text = []
system_datetime = []

for record in fetched_data['results']['bindings']:
    if len(record) == 7:
        for key, value in record.items():
            if key == 'sentenceID':
                sentence_id.append(value['value'])
            if key == 'sentenceText':
                sentence_text.append(value['value'])
            if key == 'labeledTermText':
                labeled_term_text.append(value['value'])
            if key == 'offset':
                offset.append(value['value'])
            if key == 'length':
                length.append(value['value'])
            if key == 'labelText':
                label_text.append(value['value'])
            if key == 'systemDateTime':
                format_system_datetime = value['value'][:-10]
                format_system_datetime = format_system_datetime.replace('T', ' ')
                system_datetime.append(format_system_datetime)
    
    elif len(record) == 3:
        for key, value in record.items():
            if key == 'sentenceID':
                sentence_id.append(value['value'])
            if key == 'sentenceText':
                sentence_text.append(value['value'])
            if key == 'systemDateTime':
                format_system_datetime = value['value'][:-10]
                format_system_datetime = format_system_datetime.replace('T', ' ')
                system_datetime.append(format_system_datetime)
        
        labeled_term_text.append('')
        offset.append('')
        length.append('')
        label_text.append('')
    
    else:
        print('Different record length (should be 7 or 3) found in SPARQL result (JSON format).')
        
if len(sentence_id) == len(sentence_text) == len(labeled_term_text) == len(offset) == len(length) == len(label_text) == len(system_datetime):
    fetched_data_df['sentence_id'] = sentence_id
    fetched_data_df['sentence_text'] = sentence_text
    fetched_data_df['labeled_term_text'] = labeled_term_text
    fetched_data_df['offset'] = offset
    fetched_data_df['length'] = length
    fetched_data_df['label_text'] = label_text
    fetched_data_df['system_datetime'] = system_datetime 
else:
    print('Different length found for dataframe columns.')

In [None]:
fetched_data_df

In [None]:
# create JSON from dataframe

distinct_sentence_id = list(fetched_data_df['sentence_id'].unique())
fetched_data_json = {'classes': [], 'annotations': []}

for sentence_id in distinct_sentence_id:
    single_sentence_annotation = []
    fetched_data_sub_df = fetched_data_df.loc[fetched_data_df['sentence_id'] == sentence_id]
    first_index = list(fetched_data_sub_df.index)[0]
    single_sentence_annotation.append(sentence_id)
    single_sentence_annotation.append(fetched_data_sub_df['sentence_text'][first_index])
    single_sentence_annotation.append({'entities': []})

    for idx in fetched_data_sub_df.index:          
        if offset[idx] == '' or length[idx] == '' or label_text[idx] == '':
            ll = ['', '', ['', [['', '', '']]]]
            single_sentence_annotation[2]['entities'].append(ll)
        else:
            ll = [int(offset[idx]), int(offset[idx])+int(length[idx]), [label_text[idx], [['Candidate', system_datetime[idx], 'NER Model']]]]
            single_sentence_annotation[2]['entities'].append(ll)
        
    fetched_data_json['annotations'].append(single_sentence_annotation)


In [None]:
json.dumps(fetched_data_json)

In [None]:
a = "To explain this deviation of the \'ideal behavior\', this paper discusses extensively the impact of the nanoparticle shape on the nanocomposite permeability along with structural aspects, related to both the particle nature and size, and the nanocomposite processing routes."

In [None]:
a.replace("\'", "'")

In [None]:
a = "It is well known that inclusion of homogeneously dispersed and oriented impermeable fillers with high aspect ratio, such as platelets or elongated particles, should significantly increase the diffusion path of gas and vapors and yield to improve barrier properties."

In [None]:
a[218:224]