In [1]:
import xml.etree.ElementTree as ET
import re
import os
import spacy
import scispacy
from datetime import datetime

### XML to Text

In [5]:
# THIS FUNCTION MUST BE ALIGNED WITH CORRESPONDING CODE CELLS

def xml_to_text_all():
    
    # CREATING A LIST OF FILE WITH AVAILABLE FILES
    
    directory_path = 'D:\\Drive\\SISE\\CelloGraph\\Dev\\Data\\XML'
    file_list = os.listdir(directory_path)
    
    
    for xml_file in file_list:
        
        # LOADING XML AND CREATING ROOT
        
        tree = ET.parse(directory_path + '\\' + xml_file)
        root = tree.getroot()


        # EXTRACTING METADATA (available data: title, publication_date and doi)

        metadata = []

        for elem in root:
            if elem.tag[29:] == 'teiHeader':
                for sub_elem1 in elem:
                    if sub_elem1.tag[29:] == 'fileDesc':
                        publication_info = {}
                        for sub_elem2 in sub_elem1:
                            if sub_elem2.tag[29:] == 'titleStmt':
                                for sub_elem3 in sub_elem2:
                                    if sub_elem3.tag[29:] == 'title':
                                        publication_info['Title'] = sub_elem3.text
                            if sub_elem2.tag[29:] == 'publicationStmt':
                                for sub_elem3 in sub_elem2:
                                    if sub_elem3.tag[29:] == 'date':
                                        publication_info['Publication Date'] = sub_elem3.attrib['when']
                            if sub_elem2.tag[29:] == 'sourceDesc':
                                for sub_elem3 in sub_elem2:
                                    if sub_elem3.tag[29:] == 'biblStruct':
                                        for sub_elem4 in sub_elem3:
                                            if sub_elem4.tag[29:] == 'idno':
                                                publication_info['DOI'] = sub_elem4.text
                        metadata.append([0, 'Metadata', publication_info])


        # EXTRACTING ABSTRACT AND NUMBERING PARAGRAPHS

        abstract = []

        for elem in root:
            if elem.tag[29:] == 'teiHeader':
                for sub_elem1 in elem:
                    if sub_elem1.tag[29:] == 'profileDesc':
                        for sub_elem2 in sub_elem1:
                            if sub_elem2.tag[29:] == 'abstract':
                                for sub_elem3 in sub_elem2:
                                    if sub_elem3.tag[29:] == 'div':
                                        list_of_paragraphs = {}
                                        paragraph_number = 0
                                        for sub_elem4 in sub_elem3:
                                            if sub_elem4.tag[29:] == 'p':
                                                paragraph_number += 1
                                                list_of_paragraphs[paragraph_number] = ET.tostring(sub_elem4, encoding='unicode')
                                        abstract.append([0, 'Abstract', list_of_paragraphs])


        # EXTRACTING OTHER SECTIONS

        list_of_sections = []

        for elem in root:
            if (elem.tag[29:] == 'text'):
                for sub_elem1 in elem:
                    if (sub_elem1.tag[29:] == 'body'):
                        for sub_elem2 in sub_elem1:
                            if sub_elem2.tag[29:] == 'div':
                                section_number = ''
                                section_name = ''
                                list_of_paragraphs = []
                                for sub_elem3 in sub_elem2:
                                    if sub_elem3.tag[29:] == 'head':
                                        if bool(sub_elem3.attrib):
                                            section_number = str(sub_elem3.attrib)[7:-3]
                                        else:
                                            section_number = 'NO_SECTION_NUMBER'
                                        section_name = sub_elem3.text
                                    if sub_elem3.tag[29:] == 'p':
                                        list_of_paragraphs.append(ET.tostring(sub_elem3, encoding='unicode'))
                                # commented logic skips NO_SECTION_NUMBER with no paragraphs, i.e., Table 1, Table 2, .....
                                # if section_number == 'NO_SECTION_NUMBER' and not bool(list_of_paragraphs):
                                if section_number == 'NO_SECTION_NUMBER':
                                    pass
                                else:
                                    list_of_sections.append([section_number, section_name, list_of_paragraphs])


        # NUMBERING PARAGRAPHS OF SECTIONS 
        # paragraphs are sequentially added in a dictionary
        # dictionary is added with each respective record as a 4th element

        for section in list_of_sections:
            list_of_paragraphs = {}
            for paragraph_number, paragraph_text in enumerate(section[2], start=1):
                list_of_paragraphs[paragraph_number] = paragraph_text
            section.append(list_of_paragraphs)


        # DELETING THE 3RD ELEMENT FROM EACH RECORD

        for section in list_of_sections:
            section.remove(section[2])


        # MERGING ABSTRACT WITH OTHER SECTIONS

        document = abstract + list_of_sections


        # PREPROCESSING TEXT

        starting_p_tag_pattern = r'<ns0:p[^>]+>'
        ending_p_tag_pattern = '</ns0:p>'
        starting_ref_tag_pattern = r'<ns0:ref[^>]+>'
        ending_ref_tag_pattern = '</ns0:ref>'
        ref_pattern = r'<ref>.*?</ref>'          # temporary for removing ref tag

        for record in document:
            for paragraph_number, paragraph_text in record[2].items():
                text = paragraph_text
                text = re.sub(starting_p_tag_pattern, '', text)
                text = re.sub(ending_p_tag_pattern, '', text)
                text = re.sub(starting_ref_tag_pattern, '<ref>', text)
                text = re.sub(ending_ref_tag_pattern, '</ref>', text)
                text = re.sub(ref_pattern, '', text)
                record[2][paragraph_number] = text


        # EXTRACTING TEXT AND WRITING IN FILE

        output_file_name = os.path.basename(xml_file)[:-4]

        for record in document:
            for _, paragraph in record[2].items():
                with open(f'D:\\Drive\\SISE\\CelloGraph\\Dev\\Data\\Text\\{output_file_name}.txt', 'a', encoding='utf-8') as output_file:
                    output_file.write(paragraph + '\n')
    

In [None]:
# xml_to_text_all()

### XML Structure
- teiHeader
    - fileDesc
        - titleStmt
            - title
        - publicationStmt
            - date
        - sourceDesc
            - biblStruct
                - idno
    - encodingDesc
    - profileDesc

### XRI (XML-to-RDF-Intermediate) Data Structure
- [
    - [
        - section_number,
        - section_title,
        - {
            - paragraph_number:
                - {
                    - sentence_number: sectence_text,
                - },
        - }
    - ],
- ]

### XML to XRI (XML-to-RDF-Intermediate)

In [31]:
# LOADING XML AND CREATING ROOT

xml_file = 'D:\\Drive\\SISE\\CelloGraph\\Dev\\Data\\XML\\Wolf et al. - How the shape of fillers affects properties of nanocomposites.tei.xmlScitex.xml'
tree = ET.parse(xml_file)
root = tree.getroot()

# EXTRACTING METADATA (available data: title, publication_date and doi)

metadata = []

for elem in root:
    if elem.tag[29:] == 'teiHeader':
        for sub_elem1 in elem:
            if sub_elem1.tag[29:] == 'fileDesc':
                publication_info = {}
                for sub_elem2 in sub_elem1:
                    if sub_elem2.tag[29:] == 'titleStmt':
                        for sub_elem3 in sub_elem2:
                            if sub_elem3.tag[29:] == 'title':
                                publication_info['Title'] = sub_elem3.text
                    if sub_elem2.tag[29:] == 'publicationStmt':
                        for sub_elem3 in sub_elem2:
                            if sub_elem3.tag[29:] == 'date':
                                publication_info['Publication Date'] = sub_elem3.attrib['when']
                    if sub_elem2.tag[29:] == 'sourceDesc':
                        for sub_elem3 in sub_elem2:
                            if sub_elem3.tag[29:] == 'biblStruct':
                                for sub_elem4 in sub_elem3:
                                    if sub_elem4.tag[29:] == 'idno':
                                        publication_info['DOI'] = sub_elem4.text
                metadata.append([0, 'Metadata', publication_info])
                                      
print(metadata)

[[0, 'Metadata', {'Title': 'An Update on Overview of Cellulose, Its Structure and Applications', 'Publication Date': '2019-12-18', 'DOI': '10.5772/intechopen.84727'}]]


In [32]:
# EXTRACTING ABSTRACT AND NUMBERING PARAGRAPHS

abstract = []

for elem in root:
    if elem.tag[29:] == 'teiHeader':
        for sub_elem1 in elem:
            if sub_elem1.tag[29:] == 'profileDesc':
                for sub_elem2 in sub_elem1:
                    if sub_elem2.tag[29:] == 'abstract':
                        for sub_elem3 in sub_elem2:
                            if sub_elem3.tag[29:] == 'div':
                                list_of_paragraphs = {}
                                paragraph_number = 0
                                for sub_elem4 in sub_elem3:
                                    if sub_elem4.tag[29:] == 'p':
                                        paragraph_number += 1
                                        list_of_paragraphs[paragraph_number] = ET.tostring(sub_elem4, encoding='unicode')
                                abstract.append([0, 'Abstract', list_of_paragraphs])
                                
print(abstract)

[[0, 'Abstract', {1: '<ns0:p xmlns:ns0="http://www.tei-c.org/ns/1.0"><ns0:s>Cellulose (C 6 H 10 O 5 ) n is one of the most ubiquitous organic polymers on the planet.</ns0:s><ns0:s>It is a significant structural component of the primary cell wall of green plants, various forms of algae and oomycetes.</ns0:s><ns0:s>It is a polysaccharide consisting of a linear chain of several hundred to many thousands of β(1 → 4) linked d-glucose units.</ns0:s><ns0:s>There are various extraction procedures for cellulose developed by using different processes like oxidation, etherification and esterification which convert the prepared celluloses in to cellulose derivatives.</ns0:s><ns0:s>Since it is a non-toxic, bio-degradable polymer with high tensile and compressive strength, it has widespread use in various fields such as nanotechnology, pharmaceutical industry, food industry, cosmetics, textile and paper industry, drug-delivery systems in treating cancer and other diseases.</ns0:s><ns0:s>Micro-crysta

In [33]:
# EXTRACTING OTHER SECTIONS

list_of_sections = []

for elem in root:
    if (elem.tag[29:] == 'text'):
        for sub_elem1 in elem:
            if (sub_elem1.tag[29:] == 'body'):
                for sub_elem2 in sub_elem1:
                    if sub_elem2.tag[29:] == 'div':
                        section_number = ''
                        section_name = ''
                        list_of_paragraphs = []
                        for sub_elem3 in sub_elem2:
                            if sub_elem3.tag[29:] == 'head':
                                if bool(sub_elem3.attrib):
                                    section_number = str(sub_elem3.attrib)[7:-3]
                                else:
                                    section_number = 'NO_SECTION_NUMBER'
                                section_name = sub_elem3.text
                            if sub_elem3.tag[29:] == 'p':
                                list_of_paragraphs.append(ET.tostring(sub_elem3, encoding='unicode'))
                        # commented logic skips NO_SECTION_NUMBER with no paragraphs, i.e., Table 1, Table 2, .....
                        # if section_number == 'NO_SECTION_NUMBER' and not bool(list_of_paragraphs):
                        if section_number == 'NO_SECTION_NUMBER':
                            pass
                        else:
                            list_of_sections.append([section_number, section_name, list_of_paragraphs])

print(list_of_sections)

[['1', 'Introduction', ['<ns0:p xmlns:ns0="http://www.tei-c.org/ns/1.0"><ns0:s>Cellulose is the most abundant biopolymer available in nature, since it is one of the major components of the cell walls of most of the plants <ns0:ref type="bibr" target="#b0">[1]</ns0:ref>.</ns0:s><ns0:s>It is a homopolymer of anhydroglucose, with the glucose residues linked in a ß-1,4 fashion <ns0:ref type="bibr" target="#b1">[2]</ns0:ref>.</ns0:s><ns0:s>Cell walls of plant cells attribute their mechanical strength to cellulose.</ns0:s><ns0:s>Cellulose owes its Cellulose structural properties to the fact that it can retain a semi-crystalline state of aggregation even in an aqueous environment, which is unusual for a polysaccharide <ns0:ref type="bibr" target="#b2">[3,</ns0:ref><ns0:ref type="bibr" target="#b3">4]</ns0:ref>.</ns0:s></ns0:p>', '<ns0:p xmlns:ns0="http://www.tei-c.org/ns/1.0"><ns0:s>As far as cellulose based products are concerned, paperboard and paper are the most commonly used ones <ns0:ref

In [None]:
# # DON'T USE THIS PIECE OF CODE  (commented - 12 Nov 23)
# # HAVE TO ADJUST THIS CODE FOR TABLES/IMAGES
# # merging "paragraphs with no section number" to its previous section

# list_of_sections_length = len(list_of_sections)

# for i in range(list_of_sections_length-1,0,-1):
#     if list_of_sections[i][0] == 'NO_SECTION_NUMBER':
#         list_of_sections[i-1][2].extend(list_of_sections[i][2])
#         list_of_sections.remove(list_of_sections[i])
        
# list_of_sections

In [34]:
# NUMBERING PARAGRAPHS OF SECTIONS 
# paragraphs are sequentially added in a dictionary
# dictionary is added with each respective record as a 4th element

for section in list_of_sections:
    list_of_paragraphs = {}
    for paragraph_number, paragraph_text in enumerate(section[2], start=1):
        list_of_paragraphs[paragraph_number] = paragraph_text
    section.append(list_of_paragraphs)
    
print(list_of_sections)

[['1', 'Introduction', ['<ns0:p xmlns:ns0="http://www.tei-c.org/ns/1.0"><ns0:s>Cellulose is the most abundant biopolymer available in nature, since it is one of the major components of the cell walls of most of the plants <ns0:ref type="bibr" target="#b0">[1]</ns0:ref>.</ns0:s><ns0:s>It is a homopolymer of anhydroglucose, with the glucose residues linked in a ß-1,4 fashion <ns0:ref type="bibr" target="#b1">[2]</ns0:ref>.</ns0:s><ns0:s>Cell walls of plant cells attribute their mechanical strength to cellulose.</ns0:s><ns0:s>Cellulose owes its Cellulose structural properties to the fact that it can retain a semi-crystalline state of aggregation even in an aqueous environment, which is unusual for a polysaccharide <ns0:ref type="bibr" target="#b2">[3,</ns0:ref><ns0:ref type="bibr" target="#b3">4]</ns0:ref>.</ns0:s></ns0:p>', '<ns0:p xmlns:ns0="http://www.tei-c.org/ns/1.0"><ns0:s>As far as cellulose based products are concerned, paperboard and paper are the most commonly used ones <ns0:ref

In [35]:
# DELETING THE 3RD ELEMENT FROM EACH RECORD

for section in list_of_sections:
    section.remove(section[2])
    
print(list_of_sections)

[['1', 'Introduction', {1: '<ns0:p xmlns:ns0="http://www.tei-c.org/ns/1.0"><ns0:s>Cellulose is the most abundant biopolymer available in nature, since it is one of the major components of the cell walls of most of the plants <ns0:ref type="bibr" target="#b0">[1]</ns0:ref>.</ns0:s><ns0:s>It is a homopolymer of anhydroglucose, with the glucose residues linked in a ß-1,4 fashion <ns0:ref type="bibr" target="#b1">[2]</ns0:ref>.</ns0:s><ns0:s>Cell walls of plant cells attribute their mechanical strength to cellulose.</ns0:s><ns0:s>Cellulose owes its Cellulose structural properties to the fact that it can retain a semi-crystalline state of aggregation even in an aqueous environment, which is unusual for a polysaccharide <ns0:ref type="bibr" target="#b2">[3,</ns0:ref><ns0:ref type="bibr" target="#b3">4]</ns0:ref>.</ns0:s></ns0:p>', 2: '<ns0:p xmlns:ns0="http://www.tei-c.org/ns/1.0"><ns0:s>As far as cellulose based products are concerned, paperboard and paper are the most commonly used ones <n

In [36]:
# MERGING ABSTRACT WITH OTHER SECTIONS

document = abstract + list_of_sections
print(document)

[[0, 'Abstract', {1: '<ns0:p xmlns:ns0="http://www.tei-c.org/ns/1.0"><ns0:s>Cellulose (C 6 H 10 O 5 ) n is one of the most ubiquitous organic polymers on the planet.</ns0:s><ns0:s>It is a significant structural component of the primary cell wall of green plants, various forms of algae and oomycetes.</ns0:s><ns0:s>It is a polysaccharide consisting of a linear chain of several hundred to many thousands of β(1 → 4) linked d-glucose units.</ns0:s><ns0:s>There are various extraction procedures for cellulose developed by using different processes like oxidation, etherification and esterification which convert the prepared celluloses in to cellulose derivatives.</ns0:s><ns0:s>Since it is a non-toxic, bio-degradable polymer with high tensile and compressive strength, it has widespread use in various fields such as nanotechnology, pharmaceutical industry, food industry, cosmetics, textile and paper industry, drug-delivery systems in treating cancer and other diseases.</ns0:s><ns0:s>Micro-crysta

In [37]:
# PREPROCESSING TEXT

starting_p_tag_pattern = r'<ns0:p[^>]+>'
ending_p_tag_pattern = '</ns0:p>'
starting_ref_tag_pattern = r'<ns0:ref[^>]+>'
ending_ref_tag_pattern = '</ns0:ref>'
ref_pattern = r'<ref>.*?</ref>'          # temporary for removing ref tag

for record in document:
    for paragraph_number, paragraph_text in record[2].items():
        text = paragraph_text
        text = re.sub(starting_p_tag_pattern, '', text)
        text = re.sub(ending_p_tag_pattern, '', text)
        text = re.sub(starting_ref_tag_pattern, '<ref>', text)
        text = re.sub(ending_ref_tag_pattern, '</ref>', text)
        text = re.sub(ref_pattern, '', text)
        record[2][paragraph_number] = text
        
print(document)

[[0, 'Abstract', {1: "<ns0:s>Cellulose (C 6 H 10 O 5 ) n is one of the most ubiquitous organic polymers on the planet.</ns0:s><ns0:s>It is a significant structural component of the primary cell wall of green plants, various forms of algae and oomycetes.</ns0:s><ns0:s>It is a polysaccharide consisting of a linear chain of several hundred to many thousands of β(1 → 4) linked d-glucose units.</ns0:s><ns0:s>There are various extraction procedures for cellulose developed by using different processes like oxidation, etherification and esterification which convert the prepared celluloses in to cellulose derivatives.</ns0:s><ns0:s>Since it is a non-toxic, bio-degradable polymer with high tensile and compressive strength, it has widespread use in various fields such as nanotechnology, pharmaceutical industry, food industry, cosmetics, textile and paper industry, drug-delivery systems in treating cancer and other diseases.</ns0:s><ns0:s>Micro-crystalline cellulose in particular is among the most

In [None]:
# # EXTRACTING TEXT AND WRITING IN FILE

# file_name = os.path.basename(xml_file)[:-4]

# for record in document:
#     for _, paragraph in record[2].items():
#         with open(f'output\\{file_name}.txt', 'a', encoding='utf-8') as output_file:
#             output_file.write(paragraph + '\n')

In [None]:
# # NOT IN USE NOW! WILL BE DELETED!!
# # import spacy and load en_core_web_sm model

# import spacy
# nlp = spacy.load('en_core_web_sm')

In [None]:
# # NOT IN USE NOW! WILL BE DELETED!!
# # organizing sentences sequentially in a dictionary
# # adding dictionary with each respective record as a 4th element

# for record in document:
#     text_content = record[2]
#     list_of_paragraphs = {}
    
#     for paragraph_number, paragraph_text in text_content.items():
#         list_of_sentences = {}
#         doc = nlp(paragraph_text)
        
#         for sentence_number, sentence_text in enumerate(doc.sents, start=1):
#             list_of_sentences[sentence_number] = sentence_text.text            # .text added to convert into text
        
#         list_of_paragraphs[paragraph_number] = list_of_sentences
        
#     record.append(list_of_paragraphs)

In [None]:
# # NOT IN USE NOW! WILL BE DELETED!!
# # deleting 3rd element from each record of document

# for record in document:
#     record.remove(record[2])  

In [38]:
# MERGING METADATA WITH OTHER DOCUMENT PARTS

document = metadata + document
print(document)

[[0, 'Metadata', {'Title': 'An Update on Overview of Cellulose, Its Structure and Applications', 'Publication Date': '2019-12-18', 'DOI': '10.5772/intechopen.84727'}], [0, 'Abstract', {1: "<ns0:s>Cellulose (C 6 H 10 O 5 ) n is one of the most ubiquitous organic polymers on the planet.</ns0:s><ns0:s>It is a significant structural component of the primary cell wall of green plants, various forms of algae and oomycetes.</ns0:s><ns0:s>It is a polysaccharide consisting of a linear chain of several hundred to many thousands of β(1 → 4) linked d-glucose units.</ns0:s><ns0:s>There are various extraction procedures for cellulose developed by using different processes like oxidation, etherification and esterification which convert the prepared celluloses in to cellulose derivatives.</ns0:s><ns0:s>Since it is a non-toxic, bio-degradable polymer with high tensile and compressive strength, it has widespread use in various fields such as nanotechnology, pharmaceutical industry, food industry, cosmet

### XRI (XML-to-RDF-Intermediate) to RDF

In [39]:
# LOADING THE NER MODEL

try:
    model_path = 'C:/Users/umayer/_dev/experiment'
    model_name = 'SINGLE_LABEL_CELLULOSIC_MODEL_ACC/model-best'
    nlp = spacy.load(f'{model_path}/{model_name}')
except OSError:
    print('ERROR: Model Not Found!')
else:
    print('Model loaded successfully.')

Model loaded successfully.


In [40]:
# WRITING RDF TRIPLES FOR DOCUMENT

# DECLARING VARIABLE
labels_in_doc = []

# DECLARING LABELING SCHEMA
labeling_schema = ['ENTITY', 
                   'CHEMICAL_ENTITY', 
                   'MATERIAL_ENTITY', 
                   'STRUCTURE_ENTITY',
                   'APPLICATION',
                   'PROCESS',
                   'PROPERTY',
                   'EQUIPMENT',
                   'RELATIONSHIP',
                   'MEASUREMENT', 
                   'ABBREVIATION']

# GETTING DOI
try:
    doi = document[0][2]['DOI']
    doi = doi.replace('/', '_')
except NameError:
    print('ERROR: Document object is not defined!')

# PRINTING PREFIXES
print("@prefix scipub: <http://spatialai.org/scipub/v2.0#> .")
print("@prefix data: <http://spatialai.org/scipub/v2.0/data#> .")
print("@prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> .")
print("@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .")
print("@prefix xsd: <http://www.w3.org/2001/XMLSchema#> .")
print("@prefix owl: <http://www.w3.org/2002/07/owl#> .")
print()

try:
    for record in document:

        # METADATA
        if record[1] == 'Metadata':

            # PRINTING METADATA
            print(f"data:Publication_{doi} rdf:type scipub:ScientificPublication ;")
            print(f"scipub:title '{record[2]['Title']}'^^xsd:string ;")
            print(f"scipub:doi '{record[2]['DOI']}'^^xsd:string ;")
            print(f"scipub:publicationDate '{record[2]['Publication Date']}'^^xsd:date .")
            print()

        # ABSTRACT AND IT'S DOCUMENT PARTS
        elif record[1] == 'Abstract':
            index = document.index(record) + 1
            next_section = document[index][0]
            paragraph_ids = []

            # CREATING PARAGRAPH ID FOR ABSTRACT'S PARAGRAPH 
            for paragraph_number, _ in record[2].items():
                paragraph_id = doi + '_A' + '-' + str(paragraph_number)
                paragraph_ids.append('data:'+paragraph_id)

            paragraph_ids_joined = ', '.join(paragraph_ids)

            # PRINTING ABSTRACT AND IT'S PARAGRAPH IDs 
            print(f"data:{doi}_A rdf:type scipub:Abstract ;")
            print(f"scipub:directlyContainsDocumentPart {paragraph_ids_joined} .")
            print()

            # PARAGRAPHS AND IT'S DOCUMENT PARTS
            for paragraph_number, paragraph_text in record[2].items():
                labeled_term_info_list = []
                labeled_term_number = 1

                # replacing ' with \' in text
                if "'" in paragraph_text:
                    paragraph_text = paragraph_text.replace("'", r"\'")

                # creating doc object
                doc = nlp(paragraph_text)

                for ent in doc.ents:
                    current_dateTime = str(datetime.now())[:-7]    # RECHECK THE APPROPRIATE PLACEMENT
                    labeled_term_id = doi + '_A' + '-' + str(paragraph_number) + '-' + str(labeled_term_number)
                    labeled_term = ent.text
                    label = ent.label_
                    offset = ent.start_char
                    length = ent.end_char - ent.start_char
                    labeled_term_info = [labeled_term_id, labeled_term, label, offset, length, current_dateTime]
                    labeled_term_info_list.append(labeled_term_info)
                    labeled_term_number += 1

                labeled_term_ids = ['data:'+i[0] for i in labeled_term_info_list]
                labeled_term_ids_joined = ', '.join(labeled_term_ids)

                # PRINTING PARAGRAPHS AND IT'S LABELED TERM IDs 
                print(f"data:{doi}_A-{paragraph_number} rdf:type scipub:Paragraph ;")
                print(f"scipub:positionInParentDocumentPart '{paragraph_number}'^^xsd:nonNegativeInteger ;")

                if paragraph_number == len(paragraph_ids):
                    print(f"scipub:nextDocumentPart data:{doi}_{next_section} ;")
                else:
                    print(f"scipub:nextDocumentPart data:{doi}_A-{paragraph_number+1} ;")

                print(f"scipub:paragraphText '{paragraph_text}'^^xsd:string ;")
                print(f"scipub:directlyContainsDocumentPart {labeled_term_ids_joined} .")
                print()

                # LABELED TERMS
                for info in labeled_term_info_list:
                    # PRINTING LABELED TERMS
                    print(f"data:{info[0]} rdf:type scipub:LabeledTerm ;")    # DEAL WITH ATOMIC / COMPOUND
                    print(f"scipub:documentPartIsDirectlyContainedBy data:{doi}_A-{paragraph_number} ;")
                    print(f"scipub:labeledTermText '{info[1]}'^^xsd:string ;")
                    print(f"scipub:offset '{info[3]}'^^xsd:nonNegativeInteger ;")
                    print(f"scipub:length '{info[4]}'^^xsd:nonNegativeInteger ;")
                    print(f"scipub:hasStatus data:Candidate_{info[0]} .")
                    print()

                    # PRINTING LABELED TERMS STATUS
                    print(f"data:Candidate_{info[0]} rdf:type scipub:CandidateStatus ;")
                    print(f"scipub:statusCreatedBy data:System_127_0_0_1 ;")
                    print(f"scipub:statusCreatedDate '{info[5]}'^^xsd:dateTime ;")

                    try:
                        if not bool(labeling_schema):
                            raise Exception('Labeling schema is empty!')
                            
                        label_number_in_schema = labeling_schema.index(info[2]) + 1
                    except Exception as e:
                        print(f'ERROR: {e}')
                    else:
                        print(f"scipub:hasLabel data:Label{label_number_in_schema} .")

                    # adding lebels and their position in the schema
                    if [label_number_in_schema, info[2]] not in labels_in_doc:
                        labels_in_doc.append([label_number_in_schema, info[2]])
                    print()

            print('#========================= SECTION DIVIDER =========================#')
            print()

        # SECTION AND IT'S DOCUMENT PARTS
        else:
            section_number = record[0]
            section_name = record[1]
            index = document.index(record) + 1
            paragraph_ids = []

            if index == len(document):
                next_section = 'EndOfDocument'
            else:
                next_section = document[index][0]

            # CREATING PARAGRAPH ID FOR SECTION'S PARAGRAPH 
            for paragraph_number, _ in record[2].items():
                paragraph_id = doi + '_' + str(section_number) + '-' + str(paragraph_number)
                paragraph_ids.append('data:'+paragraph_id)

            paragraph_ids_joined = ', '.join(paragraph_ids)
            
            # PRINTING SECTION AND IT'S PARAGRAPH IDs 
            print(f"data:{doi}_{section_number} rdf:type scipub:Section ;")
            print(f"scipub:headerText '{record[1]}'^^xsd:string ;")
            print(f"scipub:directlyContainsDocumentPart {paragraph_ids_joined} .")
            print()
            
            # PARAGRAPHS AND IT'S DOCUMENT PARTS
            for paragraph_number, paragraph_text in record[2].items():
                labeled_term_info_list = []
                labeled_term_sequence = 1

                # replacing ' with \' in text
                if "'" in paragraph_text:
                    paragraph_text = paragraph_text.replace("'", r"\'")

                # creating doc object
                doc = nlp(paragraph_text)

                for ent in doc.ents:
                    current_dateTime = str(datetime.now())[:-7]    # RECHECK THE APPROPRIATE PLACEMENT
                    labeled_term_id = doi + '_' + str(section_number) + '-' + str(paragraph_number) + '-' + str(labeled_term_sequence)
                    labeled_term = ent.text
                    label = ent.label_
                    offset = ent.start_char
                    length = ent.end_char - ent.start_char
                    labeled_term_info = [labeled_term_id, labeled_term, label, offset, length, current_dateTime]
                    labeled_term_info_list.append(labeled_term_info)
                    labeled_term_sequence += 1

                labeled_term_ids = ['data:'+i[0] for i in labeled_term_info_list]
                labeled_term_ids_joined = ', '.join(labeled_term_ids)

                # PRINTING PARAGRAPHS AND IT'S LABELED TERM IDs 
                print(f"data:{doi}_{section_number}-{paragraph_number} rdf:type scipub:Paragraph ;")
                print(f"scipub:positionInParentDocumentPart '{paragraph_number}'^^xsd:nonNegativeInteger ;")

                if paragraph_number == len(paragraph_ids):
                    print(f"scipub:nextDocumentPart data:{doi}_{next_section} ;")
                else:
                    print(f"scipub:nextDocumentPart data:{doi}_{section_number}-{paragraph_number+1} ;")

                print(f"scipub:paragraphText '{paragraph_text}'^^xsd:string ;")
                print(f"scipub:directlyContainsDocumentPart {labeled_term_ids_joined} .")
                print()

                # LABELED TERMS
                for info in labeled_term_info_list:
                    # PRINTING LABELED TERMS
                    print(f"data:{info[0]} rdf:type scipub:LabeledTerm ;")    # DEAL WITH ATOMIC / COMPOUND
                    print(f"scipub:documentPartIsDirectlyContainedBy data:{doi}_{section_number}-{paragraph_number} ;")
                    print(f"scipub:labeledTermText '{info[1]}'^^xsd:string ;")
                    print(f"scipub:offset '{info[3]}'^^xsd:nonNegativeInteger ;")
                    print(f"scipub:length '{info[4]}'^^xsd:nonNegativeInteger ;")
                    print(f"scipub:hasStatus data:Candidate_{info[0]} .")
                    print()

                    # PRINTING LABELED TERMS STATUS
                    print(f"data:Candidate_{info[0]} rdf:type scipub:CandidateStatus ;")
                    print(f"scipub:statusCreatedBy data:System_127_0_0_1 ;")
                    print(f"scipub:statusCreatedDate '{info[5]}'^^xsd:dateTime ;")

                    try:
                        if not bool(labeling_schema):
                            raise Exception('Labeling schema is empty!')
                            
                        label_number_in_schema = labeling_schema.index(info[2]) + 1
                    except Exception as e:
                        print(f'ERROR: {e}')
                    else:
                        print(f"scipub:hasLabel data:Label{label_number_in_schema} .")

                    # adding lebels and their position in the schema
                    if [label_number_in_schema, info[2]] not in labels_in_doc:
                        labels_in_doc.append([label_number_in_schema, info[2]])

                    print()

            print('#========================= SECTION DIVIDER =========================#')
            print()

except NameError:
    print('ERROR: Document object is not defined!')

try:
    if not bool(labels_in_doc):
        raise Exception('List of labels found in document is empty!')
        
    for label in labels_in_doc:
        print(f"data:Label{label[0]} rdf:type scipub:Label ;")
        print(f"scipub:fromLabelingSchema data:DevSchema ;")
        print(f"scipub:labelText '{label[1]}'^^xsd:string .")
        print()       
except Exception as e:
    print(f'ERROR: {e}')
            
print(f"data:System_127_0_0_1 rdf:type scipub:Agent .")
print(f"data:DevSchema rdf:type scipub:LabelingSchema .")

@prefix scipub: <http://spatialai.org/scipub/v2.0#> .
@prefix data: <http://spatialai.org/scipub/v2.0/data#> .
@prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> .
@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .
@prefix xsd: <http://www.w3.org/2001/XMLSchema#> .
@prefix owl: <http://www.w3.org/2002/07/owl#> .

data:Publication_10.5772_intechopen.84727 rdf:type scipub:ScientificPublication ;
scipub:title 'An Update on Overview of Cellulose, Its Structure and Applications'^^xsd:string ;
scipub:doi '10.5772/intechopen.84727'^^xsd:string ;
scipub:publicationDate '2019-12-18'^^xsd:date .

data:10.5772_intechopen.84727_A rdf:type scipub:Abstract ;
scipub:directlyContainsDocumentPart data:10.5772_intechopen.84727_A-1 .

data:10.5772_intechopen.84727_A-1 rdf:type scipub:Paragraph ;
scipub:positionInParentDocumentPart '1'^^xsd:nonNegativeInteger ;
scipub:nextDocumentPart data:10.5772_intechopen.84727_1 ;
scipub:paragraphText '<ns0:s>Cellulose (C 6 H 10 O 5 ) n is one of the most

data:10.5772_intechopen.84727_4-1 rdf:type scipub:Paragraph ;
scipub:positionInParentDocumentPart '1'^^xsd:nonNegativeInteger ;
scipub:nextDocumentPart data:10.5772_intechopen.84727_5 ;
scipub:paragraphText '<ns0:s>The structure of cellulose has been constantly a subject requiring intensive research as it is formed by the hydrogen bonds between the network of hydroxy groups .</ns0:s><ns0:s>The progress was for more than a 100 years of intensive development on structure analysis methods like electron microscopy, X-ray diffraction and high Major pathways of formation of cellulose .</ns0:s>'^^xsd:string ;
scipub:directlyContainsDocumentPart data:10.5772_intechopen.84727_4-1-1, data:10.5772_intechopen.84727_4-1-2, data:10.5772_intechopen.84727_4-1-3, data:10.5772_intechopen.84727_4-1-4, data:10.5772_intechopen.84727_4-1-5, data:10.5772_intechopen.84727_4-1-6, data:10.5772_intechopen.84727_4-1-7 .

data:10.5772_intechopen.84727_4-1-1 rdf:type scipub:LabeledTerm ;
scipub:documentPartIsDirect

data:10.5772_intechopen.84727_6.1.1.-1 rdf:type scipub:Paragraph ;
scipub:positionInParentDocumentPart '1'^^xsd:nonNegativeInteger ;
scipub:nextDocumentPart data:10.5772_intechopen.84727_6.1. ;
scipub:paragraphText '<ns0:s>Cellulose is extracted from SCB by using steam explosion and xylanase pretreatment and bleaching process.</ns0:s><ns0:s>The dried SCB treated with steam explosion at a pressure 13 bar (195°C) for 15 minutes to obtain steam exploded SCB fibres.</ns0:s><ns0:s>Then, the steam exploded SCB is treated with 20 μg of xylanase using fibre to liquor ratio of 1:10 for 1 hour at 50°C under constant agitation.</ns0:s><ns0:s>Then, dried steam exploded SCB is treated with xylanase (fibre to liquor ratio of 1:10) and then bleached with 0.7% sodium chlorite (NaClO 2 ) adjusted to a pH of 4 by the addition of weak acetic acid at 70°C for 1 hour.</ns0:s><ns0:s>Sodium chlorite and acetic acid at the same loading were added to the reaction every 1 hour till the cellulose turns white.</n

data:10.5772_intechopen.84727_6.2.-2 rdf:type scipub:Paragraph ;
scipub:positionInParentDocumentPart '2'^^xsd:nonNegativeInteger ;
scipub:nextDocumentPart data:10.5772_intechopen.84727_6.2.-3 ;
scipub:paragraphText '<ns0:s>2. Preparation of cellulose from bamboo fibre and de-waxing of bamboo fibre: The 400 mL toluene and 200 mL of ethyl alcohol are filled into a round flask to produce toluene-ethanol of ratio 2:1.</ns0:s><ns0:s>The round flask is placed on a heating element.</ns0:s><ns0:s>A Soxhlet extractor is placed on top of the boiling flask and fixed firmly using a retort stand.</ns0:s><ns0:s>About 10 grams of GBF is scooped into a membrane tube and placed into the extraction thimble.</ns0:s><ns0:s>A Liebig condenser is placed on top of the extractor and then fixed firmly.</ns0:s><ns0:s>The temperature of the heating element is observed using a digital thermometer and it is maintained at 250°C.</ns0:s><ns0:s>The extraction process is continued till the colour mixture disappears.</

data:10.5772_intechopen.84727_4-1 rdf:type scipub:Paragraph ;
scipub:positionInParentDocumentPart '1'^^xsd:nonNegativeInteger ;
scipub:nextDocumentPart data:10.5772_intechopen.84727_6.1.2. ;
scipub:paragraphText '<ns0:s>Dynamic thermogravimetric measurements were performed using a Shimadzu DTG 60H instrument.</ns0:s><ns0:s>The temperature programs for dynamic tests was run from ambient temperature 25-700°C.</ns0:s><ns0:s>All measurements was made under a nitrogen flow (20 mL/min), while keeping a constant heating rate of 10°C min −1 and using an aluminium crucible with a pinhole (Figure ).</ns0:s>'^^xsd:string ;
scipub:directlyContainsDocumentPart data:10.5772_intechopen.84727_4-1-1, data:10.5772_intechopen.84727_4-1-2, data:10.5772_intechopen.84727_4-1-3, data:10.5772_intechopen.84727_4-1-4, data:10.5772_intechopen.84727_4-1-5 .

data:10.5772_intechopen.84727_4-1-1 rdf:type scipub:LabeledTerm ;
scipub:documentPartIsDirectlyContainedBy data:10.5772_intechopen.84727_4-1 ;
scipub:labeled

data:10.5772_intechopen.84727_6.1.2.-1 rdf:type scipub:Paragraph ;
scipub:positionInParentDocumentPart '1'^^xsd:nonNegativeInteger ;
scipub:nextDocumentPart data:10.5772_intechopen.84727_6.1.2. ;
scipub:paragraphText '<ns0:s>X-ray diffraction is applied to identify the crystallinity of rice husk grits after numerous extraction methods.</ns0:s><ns0:s>Each sample/material in the arrangement of milled powder is set aside on the sample vessel and levelled to attain complete and unvarying X-ray exposure.</ns0:s><ns0:s>The trials are examined with the assistance of an X-ray diffractometer at room temperature (RT) by means of a monochromatic CuKα energy source (λ = 0.1539 nm) in the step-scan approach with a 2θ angle extending from 10 to 50°C with a stage of 0.04 and scanning period of 5 minutes.</ns0:s><ns0:s>To characterize the crystallinity of the several samples, the crystallinity index CrI, is created based on the mirrored intensity data (Figure ).</ns0:s>'^^xsd:string ;
scipub:directlyC

data:10.5772_intechopen.84727_7.2.-1 rdf:type scipub:Paragraph ;
scipub:positionInParentDocumentPart '1'^^xsd:nonNegativeInteger ;
scipub:nextDocumentPart data:10.5772_intechopen.84727_7.2. ;
scipub:paragraphText '<ns0:s>HPMC is widely utilized in the pharmaceutical industry not only because it is safe and nontoxic but also because it does not get engrossed orally and does not upsurge the energy of It is utilized as a film-forming agent, thickener, blocker, sustainedrelease agent, blending agent and suspending agent in many dosage forms, thus forming the numerous pharmaceutical preparation consistently discrete, tough short of being wrecked due to sustained release effects or steady emulsion without stratification.</ns0:s><ns0:s>It is regularly used as a matrix, adhesives, frame ingredients, the film creating material or in the creation of sustained or controlled release microcapsules and pellets .</ns0:s>'^^xsd:string ;
scipub:directlyContainsDocumentPart data:10.5772_intechopen.84727

data:10.5772_intechopen.84727_7.3.-1 rdf:type scipub:Paragraph ;
scipub:positionInParentDocumentPart '1'^^xsd:nonNegativeInteger ;
scipub:nextDocumentPart data:10.5772_intechopen.84727_7.3.-2 ;
scipub:paragraphText '<ns0:s>From the advent of novel drug delivery systems, cellulose based models seemed like strong candidates due to their projected benefits.</ns0:s><ns0:s>Since then various advances have been made with the aim to bring its use to common practice.</ns0:s><ns0:s>There are still many hurdles to cross before this becomes a reality.</ns0:s><ns0:s>Cellulose based drug delivery is an important step in green and sustainable pharmacy which focuses on toxicity reduction, biodegradability and less hazardous synthesis with respect to drugs and drug delivery systems.</ns0:s><ns0:s>A very brief overview of the primary ways in which it is used is provided here.</ns0:s><ns0:s>Cellulose nanocrystals (CNCs) have the potential to acquire a negative charge during hydrolysis.</ns0:s><ns0:s>Thi

<module 'builtins' (built-in)>

In [13]:
# printing abstract and sections

doi = document[0][2]['DOI']
doi = doi.replace('/', '_')
term_number = 1
labeling_schema = {0: 'ENTITY', 
                   1: 'CHEMICAL_ENTITY', 
                   2: 'MATERIAL_ENTITY', 
                   3: 'STRUCTURE_ENTITY', 
                   4: 'RELATIONSHIP', 
                   5: 'PROPERTY', 
                   6: 'PROCESS_OR_TECHNIQUE', 
                   7: 'APPLICATION', 
                   8: 'MEASUREMENT', 
                   9: 'ABBREVIATION'}

print("@prefix SciPub: <http://spatialai.org/SciPub/v2.0#> .")
print("@prefix : <http://spatialai.org/SciPub/v2.0/data#> .")
print("@prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> .")
print("@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .")
print("@prefix xsd: <http://www.w3.org/2001/XMLSchema#> .")
print("@prefix owl: <http://www.w3.org/2002/07/owl#> .")
print()

for record in document:
    if record[1] == 'Metadata':
        print(f":Publication rdf:type SciPub:ScientificPublication ;")
        print(f"SciPub:title '{record[2]['Title']}'^^xsd:string ;")
        print(f"SciPub:doi '{record[2]['DOI']}'^^xsd:string ;")
        print(f"SciPub:publicationDate '{record[2]['Publication Date']}'^^xsd:date .")
        print()
    
    elif record[1] == 'Abstract':
        index = document.index(record) + 1
        next_section = document[index][0]
        paragraph_positions = []
        
        # RDF>> ABSTRACT rdf:type Abstract
        print(f":{doi}_A rdf:type SciPub:Abstract ;")
        
        # RDF>> ABSTRACT sp:directlyContainsDocumentPart PARAGRAPH
        for paragraph_number, paragraph_object in record[2].items():
            position = ':' + doi + '_A' + '-' + str(paragraph_number)
            paragraph_positions.append(position)
            
        joined_paragraph_positions = ', '.join(paragraph_positions)
        print(f"SciPub:directlyContainsDocumentPart {joined_paragraph_positions} .")
        print()
            
        # RDF>> PARAGRAPH rdf:type Paragraph
        for element in paragraph_positions:
            print(f"{element} rdf:type SciPub:Paragraph ;")
            
        for paragraph_number, paragraph_object in record[2].items():
            sentence_positions = []
            
            # RDF>> PARAGRAPH sp:directlyContainsDocumentPart SENTENCE
            for sentence_number, sentence_text in paragraph_object.items():
                position = ':' + doi + '_A' + '-' + str(paragraph_number) + '-' + str(sentence_number)
                sentence_positions.append(position)
            
            joined_sentence_positions = ', '.join(sentence_positions)
            print(f"SciPub:directlyContainsDocumentPart {joined_sentence_positions} .")
            print()
            
            # RDF>> SENTENCE rdf:type Sectence
            # RDF>> SENTENCE sp:positionInParentDocumentPart 'POSITION'^^xsd:nonNegativeInteger
            # RDF>> SENTENCE sp:nextDocumentPart SENTENCE/PARAGRAPH/SECTION
            # RDF>> SENTENCE sp:sentenceText 'TEXT_OF_SENTENCE'^^xsd:string
            for sentence_number, sentence_text in paragraph_object.items():
                
                # replacing ' with \' in text
                if "'" in sentence_text:
                    sentence_text = sentence_text.replace("'", r"\'")
                    
                print(f":{doi}_A-{paragraph_number}-{sentence_number} rdf:type SciPub:Sectence ;")
                print(f"SciPub:positionInParentDocumentPart '{sentence_number}'^^xsd:nonNegativeInteger ;")
                
                if sentence_number == len(sentence_positions):
                    if paragraph_number == len(paragraph_positions):
                        print(f"SciPub:nextDocumentPart :{doi}_{next_section} ;")
                    else:
                        print(f"SciPub:nextDocumentPart :{doi}_A-{paragraph_number+1} ;")
                else:
                    print(f"SciPub:nextDocumentPart :{doi}_A-{paragraph_number}-{sentence_number+1} ;")
                    
                print(f"SciPub:sentenceText '{sentence_text}'^^xsd:string .")
                print()
                
                # RDF>> LABELED_TERM rdf:type AtomicLabeledTerm/CompoundLabeledTerm
                # RDF>> LABELED_TERM sp:labeledTermText 'TEXT_OF_TERM'^^xsd:string
                # RDF>> LABELED_TERM sp:labeledTermIsContainedBy SENTENCE
                # RDF>> LABELED_TERM sp:offset 'STARTING_POSITION'^^xsd:nonNegativeInteger
                # RDF>> LABELED_TERM sp:length 'LENGTH'^^xsd:nonNegativeInteger
                # RDF>> LABELED_TERM sp:hasLabel LABEL
                doc = nlp(sentence_text)
                for ent in doc.ents:
                    print(f":LabeledTerm{term_number} rdf:type SciPub:AtomicLabeledTerm ;")  ## DEAL WITH ATOMIC / COMPOUND
                    print(f"SciPub:labeledTermText '{ent.text}'^^xsd:string ;")
                    print(f"SciPub:documentPartIsDirectlyContainedBy :{doi}_A-{paragraph_number}-{sentence_number} ;")
                    print(f"SciPub:offset '{ent.start_char}'^^xsd:nonNegativeInteger ;")
                    print(f"SciPub:length '{ent.end_char - ent.start_char}'^^xsd:nonNegativeInteger ;")
                    
                    for label_number, label_text in labeling_schema.items():
                        if label_text == ent.label_:
                            print(f"SciPub:hasLabel :Label{label_number} .")
                            break
                        else:
                            print(f"SciPub:hasLabel :Label_0 .")
                            break
                            
                    term_number += 1
                    print()
            
        print('#=================== TEMPORARY SECTION DIVIDER ===================#')
        print()
    
    else:
        section_number = record[0]
        section_name = record[1]
        index = document.index(record) + 1
        paragraph_positions = []
        
        if index == len(document):
            next_section = 'EndOfDocument'
        else:
            next_section = document[index][0]
        
        # RDF>> 
        # RDF>> 
        print(f":{doi}_{section_number} rdf:type SciPub:Section ;")
        print(f"SciPub:headerText '{record[1]}'^^xsd:string ;")

        # RDF>> SECTION sp:directlyContainsDocumentPart PARAGRAPH
        for paragraph_number, paragraph_object in record[2].items():
            position = ':' + doi + '_' + str(section_number) + '-' + str(paragraph_number)
            paragraph_positions.append(position)
        
        joined_paragraph_positions = ', '.join(paragraph_positions)
        print(f"SciPub:directlyContainsDocumentPart {joined_paragraph_positions} .")
        print()  
        
        for paragraph_number, paragraph_object in record[2].items():
            sentence_positions = []
            
            # RDF>> PARAGRAPH rdf:type Paragraph
            # RDF>> PARAGRAPH sp:directlyContainsDocumentPart SENTENCE
            for sentence_number, sentence_text in paragraph_object.items():
                position = ':' + doi + '_' + str(section_number) + '-' + str(paragraph_number) + '-' + str(sentence_number)
                sentence_positions.append(position)
            
            joined_sentence_positions = ', '.join(sentence_positions)
            print(f":{doi}_{section_number}-{paragraph_number} rdf:type SciPub:Paragraph ;")
            print(f"SciPub:directlyContainsDocumentPart {joined_sentence_positions} .")
            print()
            
            # RDF>> SENTENCE rdf:type Sectence
            # RDF>> SENTENCE sp:positionInParentDocumentPart 'POSITION'^^xsd:nonNegativeInteger
            # RDF>> SENTENCE sp:nextDocumentPart SENTENCE/PARAGRAPH/SECTION
            # RDF>> SENTENCE sp:sentenceText 'TEXT_OF_SENTENCE'^^xsd:string
            for sentence_number, sentence_text in paragraph_object.items():
                
                # replacing ' with \' in text
                if "'" in sentence_text:
                    sentence_text = sentence_text.replace("'", r"\'")
                    
                print(f":{doi}_{section_number}-{paragraph_number}-{sentence_number} rdf:type SciPub:Sentence ;")
                print(f"SciPub:positionInParentDocumentPart '{sentence_number}'^^xsd:nonNegativeInteger ;")
                
                if sentence_number == len(sentence_positions):
                    if paragraph_number == len(paragraph_positions):
                        print(f"SciPub:nextDocumentPart :{doi}_{next_section} ;")
                    else:
                        print(f"SciPub:nextDocumentPart :{doi}_{section_number}-{paragraph_number+1} ;")
                else:
                    print(f"SciPub:nextDocumentPart :{doi}_{section_number}-{paragraph_number}-{sentence_number+1} ;")
                    
                print(f"SciPub:sentenceText '{sentence_text}'^^xsd:string .")
                print()
                
                # RDF>> LABELED_TERM rdf:type AtomicLabeledTerm/CompoundLabeledTerm
                # RDF>> LABELED_TERM sp:labeledTermText 'TEXT_OF_TERM'^^xsd:string
                # RDF>> LABELED_TERM sp:labeledTermIsContainedBy SENTENCE
                # RDF>> LABELED_TERM sp:offset 'STARTING_POSITION'^^xsd:nonNegativeInteger
                # RDF>> LABELED_TERM sp:length 'LENGTH'^^xsd:nonNegativeInteger
                # RDF>> LABELED_TERM sp:hasLabel LABEL
                doc = nlp(sentence_text)
                for ent in doc.ents:
                    print(f":LabeledTerm{term_number} rdf:type SciPub:AtomicLabeledTerm ;")  ## DEAL WITH ATOMIC / COMPOUND
                    print(f"SciPub:labeledTermText '{ent.text}'^^xsd:string ;")
                    print(f"SciPub:documentPartIsDirectlyContainedBy :{doi}_{section_number}-{paragraph_number}-{sentence_number} ;")
                    print(f"SciPub:offset '{ent.start_char}'^^xsd:nonNegativeInteger ;")
                    print(f"SciPub:length '{ent.end_char - ent.start_char}'^^xsd:nonNegativeInteger ;")
                    
                    for label_number, label_text in labeling_schema.items():
                        if label_text == ent.label_:
                            print(f"SciPub:hasLabel :Label{label_number} .")
                            break
                        else:
                            print(f"SciPub:hasLabel :Label_0 .")
                            break
                            
                    term_number += 1
                    print()
            
        print('#=================== TEMPORARY SECTION DIVIDER ===================#')
        print()
        
for label_number, label_text in labeling_schema.items():
    print(f":Label{label_number} rdf:type SciPub:Label ;")
    print(f"SciPub:fromLabelingSchema :LabelingSchema{label_number} ;")
    print(f"SciPub:labelText '{label_text}'^^xsd:string .")
    print()
            
for label_number, _ in labeling_schema.items():
    print(f":LabelingSchema{label_number} rdf:type SciPub:LabelingSchema .")
    

# Connect to GraphDB

In [51]:
from SPARQLWrapper import SPARQLWrapper, JSON
import pandas as pd
import json

In [52]:
# connect and query graph database
# ADD TRY CATCH TO AVOID DATABASE CONNECTIVITY ERROR

# specify the repository
sparql = SPARQLWrapper("http://LAPTOP-S7TVD5I4:7200/repositories/SciPub")

# SPARQL query
sparql.setQuery("""
    PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
    PREFIX : <http://spatialai.org/SciPub/v2.0#>
    PREFIX SciPub: <http://spatialai.org/SciPub/v2.0#>

    SELECT ?sentenceID ?sentenceText ?labeledTermText ?offset ?length ?labelText (NOW() AS ?systemDateTime)
    WHERE {
        ?paragraphID rdf:type SciPub:Paragraph ;
                     SciPub:directlyContainsDocumentPart ?sentenceID .

        ?sentenceID SciPub:sentenceText ?sentenceText .

        OPTIONAL {
            ?labeledTermID SciPub:documentPartIsDirectlyContainedBy ?sentenceID ;
                           SciPub:labeledTermText ?labeledTermText ;
                           SciPub:offset ?offset ;
                           SciPub:length ?length ;
                           SciPub:hasLabel ?labelID .

            ?labelID rdf:type SciPub:Label ;
                     SciPub:labelText ?labelText .
        }
    }
""")

# convert results to JSON
sparql.setReturnFormat(JSON)
fetched_data = sparql.query().convert()


URLError: <urlopen error [WinError 10061] No connection could be made because the target machine actively refused it>

In [None]:
# convert query result from JSON to dataframe

column_header = ['sentence_id', 'sentence_text', 'labeled_term_text', 'offset', 'length', 'label_text', 'system_datetime']
fetched_data_df = pd.DataFrame(columns=column_header)
sentence_id = []
sentence_text = []
labeled_term_text = []
offset = []
length = []
label_text = []
system_datetime = []

for record in fetched_data['results']['bindings']:
    if len(record) == 7:
        for key, value in record.items():
            if key == 'sentenceID':
                sentence_id.append(value['value'])
            if key == 'sentenceText':
                sentence_text.append(value['value'])
            if key == 'labeledTermText':
                labeled_term_text.append(value['value'])
            if key == 'offset':
                offset.append(value['value'])
            if key == 'length':
                length.append(value['value'])
            if key == 'labelText':
                label_text.append(value['value'])
            if key == 'systemDateTime':
                format_system_datetime = value['value'][:-10]
                format_system_datetime = format_system_datetime.replace('T', ' ')
                system_datetime.append(format_system_datetime)
    
    elif len(record) == 3:
        for key, value in record.items():
            if key == 'sentenceID':
                sentence_id.append(value['value'])
            if key == 'sentenceText':
                sentence_text.append(value['value'])
            if key == 'systemDateTime':
                format_system_datetime = value['value'][:-10]
                format_system_datetime = format_system_datetime.replace('T', ' ')
                system_datetime.append(format_system_datetime)
        
        labeled_term_text.append('')
        offset.append('')
        length.append('')
        label_text.append('')
    
    else:
        print('Different record length (should be 7 or 3) found in SPARQL result (JSON format).')
        
if len(sentence_id) == len(sentence_text) == len(labeled_term_text) == len(offset) == len(length) == len(label_text) == len(system_datetime):
    fetched_data_df['sentence_id'] = sentence_id
    fetched_data_df['sentence_text'] = sentence_text
    fetched_data_df['labeled_term_text'] = labeled_term_text
    fetched_data_df['offset'] = offset
    fetched_data_df['length'] = length
    fetched_data_df['label_text'] = label_text
    fetched_data_df['system_datetime'] = system_datetime 
else:
    print('Different length found for dataframe columns.')

In [None]:
fetched_data_df

In [None]:
# create JSON from dataframe

distinct_sentence_id = list(fetched_data_df['sentence_id'].unique())
fetched_data_json = {'classes': [], 'annotations': []}

for sentence_id in distinct_sentence_id:
    single_sentence_annotation = []
    fetched_data_sub_df = fetched_data_df.loc[fetched_data_df['sentence_id'] == sentence_id]
    first_index = list(fetched_data_sub_df.index)[0]
    single_sentence_annotation.append(sentence_id)
    single_sentence_annotation.append(fetched_data_sub_df['sentence_text'][first_index])
    single_sentence_annotation.append({'entities': []})

    for idx in fetched_data_sub_df.index:          
        if offset[idx] == '' or length[idx] == '' or label_text[idx] == '':
            ll = ['', '', ['', [['', '', '']]]]
            single_sentence_annotation[2]['entities'].append(ll)
        else:
            ll = [int(offset[idx]), int(offset[idx])+int(length[idx]), [label_text[idx], [['Candidate', system_datetime[idx], 'NER Model']]]]
            single_sentence_annotation[2]['entities'].append(ll)
        
    fetched_data_json['annotations'].append(single_sentence_annotation)


In [None]:
json.dumps(fetched_data_json)

In [None]:
a = "To explain this deviation of the \'ideal behavior\', this paper discusses extensively the impact of the nanoparticle shape on the nanocomposite permeability along with structural aspects, related to both the particle nature and size, and the nanocomposite processing routes."

In [None]:
a.replace("\'", "'")

In [None]:
a = "It is well known that inclusion of homogeneously dispersed and oriented impermeable fillers with high aspect ratio, such as platelets or elongated particles, should significantly increase the diffusion path of gas and vapors and yield to improve barrier properties."

In [None]:
a[218:224]