In [1]:
import xml.etree.ElementTree as ET
import re
import os
import spacy
import scispacy
from datetime import datetime

### XML to Text

In [5]:
# THIS FUNCTION MUST BE ALIGNED WITH CORRESPONDING CODE CELLS

def xml_to_text_all():
    
    # CREATING A LIST OF FILE WITH AVAILABLE FILES
    
    directory_path = 'D:\\Drive\\SISE\\CelloGraph\\Dev\\Data\\XML'
    file_list = os.listdir(directory_path)
    
    
    for xml_file in file_list:
        
        # LOADING XML AND CREATING ROOT
        
        tree = ET.parse(directory_path + '\\' + xml_file)
        root = tree.getroot()


        # EXTRACTING METADATA (available data: title, publication_date and doi)

        metadata = []

        for elem in root:
            if elem.tag[29:] == 'teiHeader':
                for sub_elem1 in elem:
                    if sub_elem1.tag[29:] == 'fileDesc':
                        publication_info = {}
                        for sub_elem2 in sub_elem1:
                            if sub_elem2.tag[29:] == 'titleStmt':
                                for sub_elem3 in sub_elem2:
                                    if sub_elem3.tag[29:] == 'title':
                                        publication_info['Title'] = sub_elem3.text
                            if sub_elem2.tag[29:] == 'publicationStmt':
                                for sub_elem3 in sub_elem2:
                                    if sub_elem3.tag[29:] == 'date':
                                        publication_info['Publication Date'] = sub_elem3.attrib['when']
                            if sub_elem2.tag[29:] == 'sourceDesc':
                                for sub_elem3 in sub_elem2:
                                    if sub_elem3.tag[29:] == 'biblStruct':
                                        for sub_elem4 in sub_elem3:
                                            if sub_elem4.tag[29:] == 'idno':
                                                publication_info['DOI'] = sub_elem4.text
                        metadata.append([0, 'Metadata', publication_info])


        # EXTRACTING ABSTRACT AND NUMBERING PARAGRAPHS

        abstract = []

        for elem in root:
            if elem.tag[29:] == 'teiHeader':
                for sub_elem1 in elem:
                    if sub_elem1.tag[29:] == 'profileDesc':
                        for sub_elem2 in sub_elem1:
                            if sub_elem2.tag[29:] == 'abstract':
                                for sub_elem3 in sub_elem2:
                                    if sub_elem3.tag[29:] == 'div':
                                        list_of_paragraphs = {}
                                        paragraph_number = 0
                                        for sub_elem4 in sub_elem3:
                                            if sub_elem4.tag[29:] == 'p':
                                                paragraph_number += 1
                                                list_of_paragraphs[paragraph_number] = ET.tostring(sub_elem4, encoding='unicode')
                                        abstract.append([0, 'Abstract', list_of_paragraphs])


        # EXTRACTING OTHER SECTIONS

        list_of_sections = []

        for elem in root:
            if (elem.tag[29:] == 'text'):
                for sub_elem1 in elem:
                    if (sub_elem1.tag[29:] == 'body'):
                        for sub_elem2 in sub_elem1:
                            if sub_elem2.tag[29:] == 'div':
                                section_number = ''
                                section_name = ''
                                list_of_paragraphs = []
                                for sub_elem3 in sub_elem2:
                                    if sub_elem3.tag[29:] == 'head':
                                        if bool(sub_elem3.attrib):
                                            section_number = str(sub_elem3.attrib)[7:-3]
                                        else:
                                            section_number = 'NO_SECTION_NUMBER'
                                        section_name = sub_elem3.text
                                    if sub_elem3.tag[29:] == 'p':
                                        list_of_paragraphs.append(ET.tostring(sub_elem3, encoding='unicode'))
                                # commented logic skips NO_SECTION_NUMBER with no paragraphs, i.e., Table 1, Table 2, .....
                                # if section_number == 'NO_SECTION_NUMBER' and not bool(list_of_paragraphs):
                                if section_number == 'NO_SECTION_NUMBER':
                                    pass
                                else:
                                    list_of_sections.append([section_number, section_name, list_of_paragraphs])


        # NUMBERING PARAGRAPHS OF SECTIONS 
        # paragraphs are sequentially added in a dictionary
        # dictionary is added with each respective record as a 4th element

        for section in list_of_sections:
            list_of_paragraphs = {}
            for paragraph_number, paragraph_text in enumerate(section[2], start=1):
                list_of_paragraphs[paragraph_number] = paragraph_text
            section.append(list_of_paragraphs)


        # DELETING THE 3RD ELEMENT FROM EACH RECORD

        for section in list_of_sections:
            section.remove(section[2])


        # MERGING ABSTRACT WITH OTHER SECTIONS

        document = abstract + list_of_sections


        # PREPROCESSING TEXT

        starting_p_tag_pattern = r'<ns0:p[^>]+>'
        ending_p_tag_pattern = '</ns0:p>'
        starting_ref_tag_pattern = r'<ns0:ref[^>]+>'
        ending_ref_tag_pattern = '</ns0:ref>'
        ref_pattern = r'<ref>.*?</ref>'          # temporary for removing ref tag

        for record in document:
            for paragraph_number, paragraph_text in record[2].items():
                text = paragraph_text
                text = re.sub(starting_p_tag_pattern, '', text)
                text = re.sub(ending_p_tag_pattern, '', text)
                text = re.sub(starting_ref_tag_pattern, '<ref>', text)
                text = re.sub(ending_ref_tag_pattern, '</ref>', text)
                text = re.sub(ref_pattern, '', text)
                record[2][paragraph_number] = text


        # EXTRACTING TEXT AND WRITING IN FILE

        output_file_name = os.path.basename(xml_file)[:-4]

        for record in document:
            for _, paragraph in record[2].items():
                with open(f'D:\\Drive\\SISE\\CelloGraph\\Dev\\Data\\Text\\{output_file_name}.txt', 'a', encoding='utf-8') as output_file:
                    output_file.write(paragraph + '\n')
    

In [None]:
# xml_to_text_all()

### XML Structure
- teiHeader
    - fileDesc
        - titleStmt
            - title
        - publicationStmt
            - date
        - sourceDesc
            - biblStruct
                - idno
    - encodingDesc
    - profileDesc

### XRI (XML-to-RDF-Intermediate) Data Structure
- [
    - [
        - section_number,
        - section_title,
        - {
            - paragraph_number:
                - {
                    - sentence_number: sectence_text,
                - },
        - }
    - ],
- ]

### XML to XRI (XML-to-RDF-Intermediate)

In [2]:
# LOADING XML AND CREATING ROOT

xml_file = 'D:\\Drive\\SISE\\CelloGraph\\Dev\\Data\\XML\\Wolf_2018_modified.xml'
tree = ET.parse(xml_file)
root = tree.getroot()

# EXTRACTING METADATA (available data: title, publication_date and doi)

metadata = []

for elem in root:
    if elem.tag[29:] == 'teiHeader':
        for sub_elem1 in elem:
            if sub_elem1.tag[29:] == 'fileDesc':
                publication_info = {}
                for sub_elem2 in sub_elem1:
                    if sub_elem2.tag[29:] == 'titleStmt':
                        for sub_elem3 in sub_elem2:
                            if sub_elem3.tag[29:] == 'title':
                                publication_info['Title'] = sub_elem3.text
                    if sub_elem2.tag[29:] == 'publicationStmt':
                        for sub_elem3 in sub_elem2:
                            if sub_elem3.tag[29:] == 'date':
                                publication_info['Publication Date'] = sub_elem3.attrib['when']
                    if sub_elem2.tag[29:] == 'sourceDesc':
                        for sub_elem3 in sub_elem2:
                            if sub_elem3.tag[29:] == 'biblStruct':
                                for sub_elem4 in sub_elem3:
                                    if sub_elem4.tag[29:] == 'idno':
                                        publication_info['DOI'] = sub_elem4.text
                metadata.append(['0', 'Metadata', publication_info])
                                      
print(metadata)

[['0', 'Metadata', {'Title': 'How the shape of fillers affects the barrier properties of polymer/non-porous particles nanocomposites: A review', 'Publication Date': '2018-04-03', 'DOI': '10.1016/j.memsci.2018.03.085'}]]


In [3]:
# EXTRACTING ABSTRACT AND NUMBERING PARAGRAPHS

abstract = []

for elem in root:
    if elem.tag[29:] == 'teiHeader':
        for sub_elem1 in elem:
            if sub_elem1.tag[29:] == 'profileDesc':
                for sub_elem2 in sub_elem1:
                    if sub_elem2.tag[29:] == 'abstract':
                        for sub_elem3 in sub_elem2:
                            if sub_elem3.tag[29:] == 'div':
                                list_of_paragraphs = {}
                                paragraph_number = 0
                                for sub_elem4 in sub_elem3:
                                    if sub_elem4.tag[29:] == 'p':
                                        paragraph_number += 1
                                        list_of_paragraphs[paragraph_number] = ET.tostring(sub_elem4, encoding='unicode')
                                abstract.append(['0', 'Abstract', list_of_paragraphs])
                                
print(abstract)

[['0', 'Abstract', {1: '<ns0:p xmlns:ns0="http://www.tei-c.org/ns/1.0">More than 1000 published experimental data of gas (O 2 and CO 2 ) and vapor (H 2 O) permeability in nanocomposites containing either spherical, elongated or platelet particles were collected, assorted and compared in order to decipher the role of particle shape on the reduction of the relative permeability of the nanocomposite. It is well known that inclusion of homogeneously dispersed and oriented impermeable fillers with high aspect ratio, such as platelets or elongated particles, should significantly increase the diffusion path of gas and vapors and yield to improve barrier properties. Results revealed that this expected impact was not systematically achieved, even for impermeable lamellar fillers that usually displayed the highest aspect ratio. More specifically, an unexpected increase of the permeability in the nanocomposite was often observed. To explain this deviation of the \'ideal behavior\', this paper dis

In [4]:
# EXTRACTING OTHER SECTIONS

# need to normalize the section number (line 17)
# compare Rojas and Wolf section number
# \ issue in RDF, see Koshkava 2014 paper

list_of_sections = []

for elem in root:
    if (elem.tag[29:] == 'text'):
        for sub_elem1 in elem:
            if (sub_elem1.tag[29:] == 'body'):
                for sub_elem2 in sub_elem1:
                    if sub_elem2.tag[29:] == 'div':
                        section_number = ''
                        section_name = ''
                        list_of_paragraphs = []
                        for sub_elem3 in sub_elem2:
                            if sub_elem3.tag[29:] == 'head':
                                if bool(sub_elem3.attrib):
                                    section_number = str(sub_elem3.attrib)
                                    if section_number[-3] == '.':
                                        section_number = section_number[7:-3]
                                    else:
                                        section_number = section_number[7:-2]
                                else:
                                    section_number = 'NO_SECTION_NUMBER'
                                section_name = sub_elem3.text
                            if sub_elem3.tag[29:] == 'p':
                                list_of_paragraphs.append(ET.tostring(sub_elem3, encoding='unicode'))
                        # commented logic skips NO_SECTION_NUMBER with no paragraphs, i.e., Table 1, Table 2, .....
                        # if section_number == 'NO_SECTION_NUMBER' and not bool(list_of_paragraphs):
                        if section_number == 'NO_SECTION_NUMBER':
                            pass
                        else:
                            list_of_sections.append([section_number, section_name, list_of_paragraphs])

print(list_of_sections)

[['1', 'Introduction', ['<ns0:p xmlns:ns0="http://www.tei-c.org/ns/1.0">In the objective of developing efficient and optimal packaging, one of the main challenges is to design and provide food packaging materials able to protect the food from the external environment and to maintain food quality and safety throughout its shelf life <ns0:ref type="bibr" target="#b0">[1,</ns0:ref><ns0:ref type="bibr" target="#b1">2]</ns0:ref>. The mass transfers are thus at the heart of the feature of the food packaging, especially transfers of water vapor, oxygen and/or carbon dioxide, which condition the rates of numerous reactions of food degradation (oxidation, microbial development, physiological reactions, etc.). The development of bulk nanocomposite structures by introducing nanoparticles, i.e. fillers having at least one dimension lower than 100 nm, in polymeric matrices appeared as one of the most promising directions in the development of packaging materials with advanced mass transfer properti

In [None]:
# # DON'T USE THIS PIECE OF CODE  (commented - 12 Nov 23)
# # HAVE TO ADJUST THIS CODE FOR TABLES/IMAGES
# # merging "paragraphs with no section number" to its previous section

# list_of_sections_length = len(list_of_sections)

# for i in range(list_of_sections_length-1,0,-1):
#     if list_of_sections[i][0] == 'NO_SECTION_NUMBER':
#         list_of_sections[i-1][2].extend(list_of_sections[i][2])
#         list_of_sections.remove(list_of_sections[i])
        
# list_of_sections

In [5]:
# NUMBERING PARAGRAPHS OF SECTIONS 
# paragraphs are sequentially added in a dictionary
# dictionary is added with each respective record as a 4th element

for section in list_of_sections:
    list_of_paragraphs = {}
    for paragraph_number, paragraph_text in enumerate(section[2], start=1):
        list_of_paragraphs[paragraph_number] = paragraph_text
    section.append(list_of_paragraphs)
    
print(list_of_sections)

[['1', 'Introduction', ['<ns0:p xmlns:ns0="http://www.tei-c.org/ns/1.0">In the objective of developing efficient and optimal packaging, one of the main challenges is to design and provide food packaging materials able to protect the food from the external environment and to maintain food quality and safety throughout its shelf life <ns0:ref type="bibr" target="#b0">[1,</ns0:ref><ns0:ref type="bibr" target="#b1">2]</ns0:ref>. The mass transfers are thus at the heart of the feature of the food packaging, especially transfers of water vapor, oxygen and/or carbon dioxide, which condition the rates of numerous reactions of food degradation (oxidation, microbial development, physiological reactions, etc.). The development of bulk nanocomposite structures by introducing nanoparticles, i.e. fillers having at least one dimension lower than 100 nm, in polymeric matrices appeared as one of the most promising directions in the development of packaging materials with advanced mass transfer properti

In [6]:
# DELETING THE 3RD ELEMENT FROM EACH RECORD

for section in list_of_sections:
    section.remove(section[2])
    
print(list_of_sections)

[['1', 'Introduction', {1: '<ns0:p xmlns:ns0="http://www.tei-c.org/ns/1.0">In the objective of developing efficient and optimal packaging, one of the main challenges is to design and provide food packaging materials able to protect the food from the external environment and to maintain food quality and safety throughout its shelf life <ns0:ref type="bibr" target="#b0">[1,</ns0:ref><ns0:ref type="bibr" target="#b1">2]</ns0:ref>. The mass transfers are thus at the heart of the feature of the food packaging, especially transfers of water vapor, oxygen and/or carbon dioxide, which condition the rates of numerous reactions of food degradation (oxidation, microbial development, physiological reactions, etc.). The development of bulk nanocomposite structures by introducing nanoparticles, i.e. fillers having at least one dimension lower than 100 nm, in polymeric matrices appeared as one of the most promising directions in the development of packaging materials with advanced mass transfer prope

In [7]:
# MERGING ABSTRACT WITH OTHER SECTIONS

document = abstract + list_of_sections
print(document)

[['0', 'Abstract', {1: '<ns0:p xmlns:ns0="http://www.tei-c.org/ns/1.0">More than 1000 published experimental data of gas (O 2 and CO 2 ) and vapor (H 2 O) permeability in nanocomposites containing either spherical, elongated or platelet particles were collected, assorted and compared in order to decipher the role of particle shape on the reduction of the relative permeability of the nanocomposite. It is well known that inclusion of homogeneously dispersed and oriented impermeable fillers with high aspect ratio, such as platelets or elongated particles, should significantly increase the diffusion path of gas and vapors and yield to improve barrier properties. Results revealed that this expected impact was not systematically achieved, even for impermeable lamellar fillers that usually displayed the highest aspect ratio. More specifically, an unexpected increase of the permeability in the nanocomposite was often observed. To explain this deviation of the \'ideal behavior\', this paper dis

In [8]:
# PREPROCESSING TEXT

starting_p_tag_pattern = r'<ns0:p[^>]+>'
ending_p_tag_pattern = '</ns0:p>'
starting_ref_tag_pattern = r'<ns0:ref[^>]+>'
ending_ref_tag_pattern = '</ns0:ref>'
ref_pattern = r'<ref>.*?</ref>'          # temporary for removing ref tag

for record in document:
    for paragraph_number, paragraph_text in record[2].items():
        text = paragraph_text
        text = re.sub(starting_p_tag_pattern, '', text)
        text = re.sub(ending_p_tag_pattern, '', text)
        text = re.sub(starting_ref_tag_pattern, '<ref>', text)
        text = re.sub(ending_ref_tag_pattern, '</ref>', text)
        text = re.sub(ref_pattern, '', text)
        record[2][paragraph_number] = text
        
print(document)

[['0', 'Abstract', {1: "More than 1000 published experimental data of gas (O 2 and CO 2 ) and vapor (H 2 O) permeability in nanocomposites containing either spherical, elongated or platelet particles were collected, assorted and compared in order to decipher the role of particle shape on the reduction of the relative permeability of the nanocomposite. It is well known that inclusion of homogeneously dispersed and oriented impermeable fillers with high aspect ratio, such as platelets or elongated particles, should significantly increase the diffusion path of gas and vapors and yield to improve barrier properties. Results revealed that this expected impact was not systematically achieved, even for impermeable lamellar fillers that usually displayed the highest aspect ratio. More specifically, an unexpected increase of the permeability in the nanocomposite was often observed. To explain this deviation of the 'ideal behavior', this paper discusses extensively the impact of the nanoparticle

In [None]:
# # EXTRACTING TEXT AND WRITING IN FILE

# file_name = os.path.basename(xml_file)[:-4]

# for record in document:
#     for _, paragraph in record[2].items():
#         with open(f'output\\{file_name}.txt', 'a', encoding='utf-8') as output_file:
#             output_file.write(paragraph + '\n')

In [None]:
# # NOT IN USE NOW! WILL BE DELETED!!
# # import spacy and load en_core_web_sm model

# import spacy
# nlp = spacy.load('en_core_web_sm')

In [None]:
# # NOT IN USE NOW! WILL BE DELETED!!
# # organizing sentences sequentially in a dictionary
# # adding dictionary with each respective record as a 4th element

# for record in document:
#     text_content = record[2]
#     list_of_paragraphs = {}
    
#     for paragraph_number, paragraph_text in text_content.items():
#         list_of_sentences = {}
#         doc = nlp(paragraph_text)
        
#         for sentence_number, sentence_text in enumerate(doc.sents, start=1):
#             list_of_sentences[sentence_number] = sentence_text.text            # .text added to convert into text
        
#         list_of_paragraphs[paragraph_number] = list_of_sentences
        
#     record.append(list_of_paragraphs)

In [None]:
# # NOT IN USE NOW! WILL BE DELETED!!
# # deleting 3rd element from each record of document

# for record in document:
#     record.remove(record[2])  

In [9]:
# MERGING METADATA WITH OTHER DOCUMENT PARTS

document = metadata + document
print(document)

[['0', 'Metadata', {'Title': 'How the shape of fillers affects the barrier properties of polymer/non-porous particles nanocomposites: A review', 'Publication Date': '2018-04-03', 'DOI': '10.1016/j.memsci.2018.03.085'}], ['0', 'Abstract', {1: "More than 1000 published experimental data of gas (O 2 and CO 2 ) and vapor (H 2 O) permeability in nanocomposites containing either spherical, elongated or platelet particles were collected, assorted and compared in order to decipher the role of particle shape on the reduction of the relative permeability of the nanocomposite. It is well known that inclusion of homogeneously dispersed and oriented impermeable fillers with high aspect ratio, such as platelets or elongated particles, should significantly increase the diffusion path of gas and vapors and yield to improve barrier properties. Results revealed that this expected impact was not systematically achieved, even for impermeable lamellar fillers that usually displayed the highest aspect ratio

### XRI (XML-to-RDF-Intermediate) to RDF

In [10]:
# LOADING THE NER MODEL

try:
    model_path = 'C:/Users/umayer/_dev/experiment'
    model_name = 'MULTI_LABEL_CELLULOSIC_MODEL_ACC/model-best'
    nlp = spacy.load(f'{model_path}/{model_name}')
except OSError:
    print('ERROR: Model Not Found!')
else:
    print('Model loaded successfully.')

Model loaded successfully.


In [48]:
def nested_section_check(section_number):

    section_ids = []

    for record in document:
        pattern = rf'^{section_number}\.[^.]+$'
        if re.search(pattern, record[0]):
            section_id = doi + '_' + str(record[0])
            section_ids.append('data:'+section_id)

    section_ids_joined = ', '.join(section_ids)

    return section_ids_joined

In [54]:
# WRITING RDF TRIPLES FOR DOCUMENT

# DECLARING VARIABLE
labels_in_doc = []

# DECLARING LABELING SCHEMA
# labeling_schema = ['ENTITY', 
#                    'CHEMICAL_ENTITY', 
#                    'MATERIAL_ENTITY', 
#                    'STRUCTURE_ENTITY',
#                    'APPLICATION',
#                    'PROCESS',
#                    'PROPERTY',
#                    'EQUIPMENT',
#                    'RELATIONSHIP',
#                    'MEASUREMENT', 
#                    'ABBREVIATION']

labeling_schema = ['CHEMICAL', 
                   'MATERIAL', 
                   'MATERIAL_STRUCTURE',
                   'APPLICATION',
                   'PROCESS_OR_TECHNIQUE',
                   'PROPERTY',
                   'MEASUREMENT', 
                   'ABBREVIATION']

# GETTING DOI
try:
    doi = document[0][2]['DOI']
    doi = doi.replace('/', '_')
except NameError:
    print('ERROR: Document object is not defined!')

# PRINTING PREFIXES
print("@prefix onner: <http://spatialai.org/cellograph/ontology/onner/v2.0#> .")
print("@prefix data: <http://spatialai.org/cellograph/ontology/onner/v2.0/data#> .")
print("@prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> .")
print("@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .")
print("@prefix xsd: <http://www.w3.org/2001/XMLSchema#> .")
print("@prefix owl: <http://www.w3.org/2002/07/owl#> .")
print()

try:
    for record in document:

        # METADATA
        if record[1] == 'Metadata':
            
            abstract_id = 'data:' + doi + '_A'
            section_ids = [abstract_id]
            
            for section_number in document:
                if section_number[0] != '0' and '.' not in section_number[0]:
                    section_id = doi + '_' + section_number[0]
                    section_ids.append('data:'+section_id)
            
            section_ids_joined = ', '.join(section_ids)
            
            # PRINTING METADATA
            print(f"data:Publication_{doi} rdf:type onner:ScholarlyPublication ;")
            print(f"onner:publicationTitle '{record[2]['Title']}'^^xsd:string ;")
            print(f"onner:doi '{record[2]['DOI']}'^^xsd:string ;")
            print(f"onner:publicationDate '{record[2]['Publication Date']}'^^xsd:date ;")
            print(f"onner:directlyContainsDocumentPart {section_ids_joined} .")
            print()

        # ABSTRACT AND IT'S DOCUMENT PARTS
        elif record[1] == 'Abstract':
            
            next_index = document.index(record) + 1
            next_section = document[next_index][0]
            paragraph_ids = []

            # CREATING PARAGRAPH ID FOR ABSTRACT'S PARAGRAPH 
            for paragraph_number, _ in record[2].items():
                paragraph_id = doi + '_A' + '-' + str(paragraph_number)
                paragraph_ids.append('data:'+paragraph_id)

            paragraph_ids_joined = ', '.join(paragraph_ids)

            # PRINTING ABSTRACT AND IT'S PARAGRAPH IDs 
            print(f"data:{doi}_A rdf:type onner:Abstract ;")
            print(f"onner:nextDocumentPart {paragraph_ids[0]} ;")    # NEXT DOC PART AFTER ABSTRACT
            print(f"onner:directlyContainsDocumentPart {paragraph_ids_joined} .")
            print()

            # PARAGRAPHS AND IT'S DOCUMENT PARTS
            for paragraph_number, paragraph_text in record[2].items():
                labeled_term_info_list = []
                labeled_term_number = 1

                # replacing ' with \' in text
                if "'" in paragraph_text:
                    paragraph_text = paragraph_text.replace("'", r"\'")

                # creating doc object
                doc = nlp(paragraph_text)

                for ent in doc.ents:
                    current_dateTime = str(datetime.now())[:-7]    # RECHECK THE APPROPRIATE PLACEMENT
                    labeled_term_id = doi + '_A' + '-' + str(paragraph_number) + '-' + str(labeled_term_number)
                    labeled_term = ent.text
                    label = ent.label_
                    offset = ent.start_char
                    length = ent.end_char - ent.start_char
                    labeled_term_info = [labeled_term_id, labeled_term, label, offset, length, current_dateTime]
                    labeled_term_info_list.append(labeled_term_info)
                    labeled_term_number += 1

                labeled_term_ids = ['data:'+i[0] for i in labeled_term_info_list]
                labeled_term_ids_joined = ', '.join(labeled_term_ids)

                # PRINTING PARAGRAPHS AND IT'S LABELED TERM IDs 
                print(f"data:{doi}_A-{paragraph_number} rdf:type onner:Paragraph ;")
                print(f"onner:positionInParentDocumentPart '{paragraph_number}'^^xsd:nonNegativeInteger ;")

                if paragraph_number == len(paragraph_ids):
                    print(f"onner:nextDocumentPart data:{doi}_{next_section} ;")
                else:
                    print(f"onner:nextDocumentPart data:{doi}_A-{paragraph_number+1} ;")

                print(f"onner:paragraphText '{paragraph_text}'^^xsd:string ;")
                
                if bool(labeled_term_ids_joined):    # <= added logic for no labeled terms
                    print(f"onner:directlyContainsLabeledTerm {labeled_term_ids_joined} .")
                else:
                    print(f"onner:directlyContainsLabeledTerm data:NoLabeledTerm .")
                
                print()

                # LABELED TERMS
                for info in labeled_term_info_list:
                    # PRINTING LABELED TERMS
                    print(f"data:{info[0]} rdf:type onner:LabeledTerm ;")    # DEAL WITH ATOMIC / COMPOUND
                    print(f"onner:labeledTermText '{info[1]}'^^xsd:string ;")
                    print(f"onner:offset '{info[3]}'^^xsd:nonNegativeInteger ;")
                    print(f"onner:length '{info[4]}'^^xsd:nonNegativeInteger ;")
                    print(f"onner:labeledTermDirectlyContainedBy data:{doi}_A-{paragraph_number} ;")
                    print(f"onner:hasLabeledTermStatus data:Candidate_{info[0]} .")
                    print()

                    # PRINTING LABELED TERMS STATUS
                    print(f"data:Candidate_{info[0]} rdf:type onner:CandidateStatus ;")
                    print(f"onner:statusAssignmentDate '{info[5]}'^^xsd:dateTime ;")
                    print(f"onner:statusAssignedBy data:Cellulosic_NER_Model ;")

                    try:
                        if not bool(labeling_schema):
                            raise Exception('Labeling schema is empty!')
                            
                        label_number_in_schema = labeling_schema.index(info[2]) + 1
                    except Exception as e:
                        print(f'ERROR: {e}')
                    else:
                        print(f"onner:hasLabeledTermLabel data:Label_{label_number_in_schema} .")

                    # adding lebels and their position in the schema
                    if [label_number_in_schema, info[2]] not in labels_in_doc:
                        labels_in_doc.append([label_number_in_schema, info[2]])
                    print()
                    
            print('#========================= SECTION DIVIDER =========================#')
            print()

        # SECTION AND IT'S DOCUMENT PARTS
        else:
            section_number = record[0]
            section_name = record[1]
            next_index = document.index(record) + 1
            paragraph_ids = []

            if next_index == len(document):
                next_section = 'EndOfDocument'
            else:
                next_section = document[next_index][0]
            
            # IF N0 PARAGRAPHS EXIST BETWEEN A SECTION AND ITS IMMEDIATE SUBSECTION
            if not bool(record[2]):
                
                directly_contained_sections = nested_section_check(section_number)
                
                print(f"data:{doi}_{section_number} rdf:type onner:Section ;")
                print(f"onner:sectionTitle '{section_name}'^^xsd:string ;")
                print(f"onner:sectionNumber '{section_number}'^^xsd:string ;")
                print(f"onner:nextDocumentPart data:{doi}_{next_section} ;")
                print(f"onner:directlyContainsDocumentPart {directly_contained_sections} .")
                
                print()
                print('#========================= SECTION DIVIDER =========================#')
                print()
            
            # IF PARAGRAPHS EXIST BETWEEN A SECTION AND ITS IMMEDIATE SUBSECTION
            else:
                
                directly_contained_sections = nested_section_check(section_number)
                
                # CREATING PARAGRAPH ID FOR SECTION'S PARAGRAPH 
                for paragraph_number, _ in record[2].items():
                    paragraph_id = doi + '_' + str(section_number) + '-' + str(paragraph_number)
                    paragraph_ids.append('data:'+paragraph_id)

                paragraph_ids_joined = ', '.join(paragraph_ids)

                # PRINTING SECTION AND IT'S PARAGRAPH IDs 
                print(f"data:{doi}_{section_number} rdf:type onner:Section ;")
                print(f"onner:sectionTitle '{section_name}'^^xsd:string ;")
                print(f"onner:sectionNumber '{section_number}'^^xsd:string ;")
                print(f"onner:nextDocumentPart {paragraph_ids[0]} ;")    # NEXT DOC PART AFTER SECTION
                
                if bool(directly_contained_sections):
                    print(f"onner:directlyContainsDocumentPart {paragraph_ids_joined}, {directly_contained_sections} .")
                else:
                    print(f"onner:directlyContainsDocumentPart {paragraph_ids_joined} .")
                
                print()

                # PARAGRAPHS AND IT'S DOCUMENT PARTS
                for paragraph_number, paragraph_text in record[2].items():
                    labeled_term_info_list = []
                    labeled_term_sequence = 1

                    # replacing ' with \' in text
                    if "'" in paragraph_text:
                        paragraph_text = paragraph_text.replace("'", r"\'")

                    # creating doc object
                    doc = nlp(paragraph_text)

                    for ent in doc.ents:
                        current_dateTime = str(datetime.now())[:-7]    # RECHECK THE APPROPRIATE PLACEMENT
                        labeled_term_id = doi + '_' + str(section_number) + '-' + str(paragraph_number) + '-' + str(labeled_term_sequence)
                        labeled_term = ent.text
                        label = ent.label_
                        offset = ent.start_char
                        length = ent.end_char - ent.start_char
                        labeled_term_info = [labeled_term_id, labeled_term, label, offset, length, current_dateTime]
                        labeled_term_info_list.append(labeled_term_info)
                        labeled_term_sequence += 1

                    labeled_term_ids = ['data:'+i[0] for i in labeled_term_info_list]
                    labeled_term_ids_joined = ', '.join(labeled_term_ids)

                    # PRINTING PARAGRAPHS AND IT'S LABELED TERM IDs 
                    print(f"data:{doi}_{section_number}-{paragraph_number} rdf:type onner:Paragraph ;")
                    print(f"onner:positionInParentDocumentPart '{paragraph_number}'^^xsd:nonNegativeInteger ;")

                    if paragraph_number == len(paragraph_ids):
                        print(f"onner:nextDocumentPart data:{doi}_{next_section} ;")
                    else:
                        print(f"onner:nextDocumentPart data:{doi}_{section_number}-{paragraph_number+1} ;")

                    print(f"onner:paragraphText '{paragraph_text}'^^xsd:string ;")
                    
                    if bool(labeled_term_ids_joined):    # <= added logic for no labeled terms
                        print(f"onner:directlyContainsLabeledTerm {labeled_term_ids_joined} .")
                    else:
                        print(f"onner:directlyContainsLabeledTerm data:NoLabeledTerm .")
                        
                    print()

                    # LABELED TERMS
                    for info in labeled_term_info_list:
                        # PRINTING LABELED TERMS
                        print(f"data:{info[0]} rdf:type onner:LabeledTerm ;")    # DEAL WITH ATOMIC / COMPOUND
                        print(f"onner:labeledTermText '{info[1]}'^^xsd:string ;")
                        print(f"onner:offset '{info[3]}'^^xsd:nonNegativeInteger ;")
                        print(f"onner:length '{info[4]}'^^xsd:nonNegativeInteger ;")
                        print(f"onner:labeledTermDirectlyContainedBy data:{doi}_{section_number}-{paragraph_number} ;")
                        print(f"onner:hasLabeledTermStatus data:Candidate_{info[0]} .")
                        print()

                        # PRINTING LABELED TERMS STATUS
                        print(f"data:Candidate_{info[0]} rdf:type onner:CandidateStatus ;")
                        print(f"onner:statusAssignmentDate '{info[5]}'^^xsd:dateTime ;")
                        print(f"onner:statusAssignedBy data:Cellulosic_NER_Model ;")

                        try:
                            if not bool(labeling_schema):
                                raise Exception('Labeling schema is empty!')

                            label_number_in_schema = labeling_schema.index(info[2]) + 1
                        except Exception as e:
                            print(f'ERROR: {e}')
                        else:
                            print(f"onner:hasLabeledTermLabel data:Label_{label_number_in_schema} .")

                        # adding lebels and their position in the schema
                        if [label_number_in_schema, info[2]] not in labels_in_doc:
                            labels_in_doc.append([label_number_in_schema, info[2]])

                        print()

                print('#========================= SECTION DIVIDER =========================#')
                print()

except NameError:
    print('ERROR: Document object is not defined!')

try:
    if not bool(labels_in_doc):
        raise Exception('List of labels found in document is empty!')
        
    for label in labels_in_doc:
        print(f"data:Label_{label[0]} rdf:type onner:Label ;")
        print(f"onner:fromLabelingSchema data:Labeling_Schema ;")
        print(f"onner:labelText '{label[1]}'^^xsd:string .")
        print()       
except Exception as e:
    print(f'ERROR: {e}')

print(f"data:Labeling_Schema rdf:type onner:LabelingSchema ;")
print(f"onner:schemaName 'CelloGraph'^^xsd:string .")
print()
            
print(f"data:Cellulosic_NER_Model rdf:type onner:NER_System ;")    # if/else required to identify system and human
print(f"onner:systemVersion '1.0'^^xsd:string .")
print()

print(f"data:{doi}_EndOfDocument rdf:type onner:EndOfDocument .")

IndentationError: unindent does not match any outer indentation level (<tokenize>, line 127)

# Try-Error code

In [121]:
# WRITING RDF TRIPLES FOR DOCUMENT

# DECLARING VARIABLE
labels_in_doc = []

# DECLARING LABELING SCHEMA
# labeling_schema = ['ENTITY', 
#                    'CHEMICAL_ENTITY', 
#                    'MATERIAL_ENTITY', 
#                    'STRUCTURE_ENTITY',
#                    'APPLICATION',
#                    'PROCESS',
#                    'PROPERTY',
#                    'EQUIPMENT',
#                    'RELATIONSHIP',
#                    'MEASUREMENT', 
#                    'ABBREVIATION']

labeling_schema = ['CHEMICAL', 
                   'MATERIAL', 
                   'MATERIAL_STRUCTURE',
                   'APPLICATION',
                   'PROCESS_OR_TECHNIQUE',
                   'PROPERTY',
                   'MEASUREMENT', 
                   'ABBREVIATION']

# GETTING DOI
try:
    doi = document[0][2]['DOI']
    doi = doi.replace('/', '_')
except NameError:
    print('ERROR: Document object is not defined!')

# PRINTING PREFIXES
print("@prefix onner: <http://spatialai.org/cellograph/ontology/onner/v2.0#> .")
print("@prefix data: <http://spatialai.org/cellograph/ontology/onner/v2.0/data#> .")
print("@prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> .")
print("@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .")
print("@prefix xsd: <http://www.w3.org/2001/XMLSchema#> .")
print("@prefix owl: <http://www.w3.org/2002/07/owl#> .")
print()

try:
    for record in document:

        # METADATA
        if record[1] == 'Metadata':
            
            abstract_id = 'data:' + doi + '_A'
            global section_ids
            section_ids = [abstract_id]
            
            for section_number in document:
                if section_number[0] != '0' and '.' not in section_number[0]:
                    section_id = doi + '_' + section_number[0]
                    section_ids.append('data:'+section_id)
            
            section_ids_joined = ', '.join(section_ids)
            
            # PRINTING METADATA
            print(f"data:Publication_{doi} rdf:type onner:ScholarlyPublication ;")
            print(f"onner:publicationTitle '{record[2]['Title']}'^^xsd:string ;")
            print(f"onner:doi '{record[2]['DOI']}'^^xsd:string ;")
            print(f"onner:publicationDate '{record[2]['Publication Date']}'^^xsd:date ;")
            print(f"onner:directlyContainsDocumentPart {section_ids_joined} .")
            print()
            
except NameError:
    print('ERROR: Document object is not defined!')


# for record in document:
#     if record[1] == 'Abstract':
#         next_index = document.index(record) + 1
#         next_section = document[next_index][0]
#         paragraph_ids = []

#         # CREATING PARAGRAPH ID FOR ABSTRACT'S PARAGRAPH 
#         for paragraph_number, _ in record[2].items():
#             paragraph_id = doi + '_A' + '-' + str(paragraph_number)
#             paragraph_ids.append('data:'+paragraph_id)

#         paragraph_ids_joined = ', '.join(paragraph_ids)

#         # PRINTING ABSTRACT AND IT'S PARAGRAPH IDs 
#         print(f"{abstract_id} rdf:type onner:Abstract ;")
#         print(f"onner:nextDocumentPart {paragraph_ids[0]} ;")    # NEXT DOC PART AFTER ABSTRACT
#         print(f"onner:directlyContainsDocumentPart {paragraph_ids_joined} .")
#         print()

#         # PARAGRAPHS AND IT'S DOCUMENT PARTS
#         for paragraph_number, paragraph_text in record[2].items():
#             labeled_term_info_list = []
#             labeled_term_number = 1

#             # replacing ' with \' in text
#             if "'" in paragraph_text:
#                 paragraph_text = paragraph_text.replace("'", r"\'")

#             # creating doc object
#             doc = nlp(paragraph_text)

#             for ent in doc.ents:
#                 current_dateTime = str(datetime.now())[:-7]    # RECHECK THE APPROPRIATE PLACEMENT
#                 labeled_term_id = doi + '_A' + '-' + str(paragraph_number) + '-' + str(labeled_term_number)
#                 labeled_term = ent.text
#                 label = ent.label_
#                 offset = ent.start_char
#                 length = ent.end_char - ent.start_char
#                 labeled_term_info = [labeled_term_id, labeled_term, label, offset, length, current_dateTime]
#                 labeled_term_info_list.append(labeled_term_info)
#                 labeled_term_number += 1

#             labeled_term_ids = ['data:'+i[0] for i in labeled_term_info_list]
#             labeled_term_ids_joined = ', '.join(labeled_term_ids)

#             # PRINTING PARAGRAPHS AND IT'S LABELED TERM IDs 
#             print(f"data:{doi}_A-{paragraph_number} rdf:type onner:Paragraph ;")
#             print(f"onner:positionInParentDocumentPart '{paragraph_number}'^^xsd:nonNegativeInteger ;")

#             if paragraph_number == len(paragraph_ids):
#                 print(f"onner:nextDocumentPart data:{doi}_{next_section} ;")
#             else:
#                 print(f"onner:nextDocumentPart data:{doi}_A-{paragraph_number+1} ;")

#             print(f"onner:paragraphText '{paragraph_text}'^^xsd:string ;")
#             print(f"onner:directlyContainsLabeledTerm {labeled_term_ids_joined} .")
#             print()

#             # LABELED TERMS
#             for info in labeled_term_info_list:
#                 # PRINTING LABELED TERMS
#                 print(f"data:{info[0]} rdf:type onner:LabeledTerm ;")    # DEAL WITH ATOMIC / COMPOUND
#                 print(f"onner:labeledTermText '{info[1]}'^^xsd:string ;")
#                 print(f"onner:offset '{info[3]}'^^xsd:nonNegativeInteger ;")
#                 print(f"onner:length '{info[4]}'^^xsd:nonNegativeInteger ;")
#                 print(f"onner:labeledTermDirectlyContainedBy data:{doi}_A-{paragraph_number} ;")
#                 print(f"onner:hasLabeledTermStatus data:Candidate_{info[0]} .")
#                 print()

#                 # PRINTING LABELED TERMS STATUS
#                 print(f"data:Candidate_{info[0]} rdf:type onner:CandidateStatus ;")
#                 print(f"onner:statusAssignmentDate '{info[5]}'^^xsd:dateTime ;")
#                 print(f"onner:statusAssignedBy data:Cellulosic_NER_Model ;")

#                 try:
#                     if not bool(labeling_schema):
#                         raise Exception('Labeling schema is empty!')

#                     label_number_in_schema = labeling_schema.index(info[2]) + 1
#                 except Exception as e:
#                     print(f'ERROR: {e}')
#                 else:
#                     print(f"onner:hasLabeledTermLabel data:Label_{label_number_in_schema} .")

#                 # adding lebels and their position in the schema
#                 if [label_number_in_schema, info[2]] not in labels_in_doc:
#                     labels_in_doc.append([label_number_in_schema, info[2]])
#                 print()

#         print('#========================= SECTION DIVIDER =========================#')
#         print()

    
for section_id in section_ids:
    
    for record in document:
        section_number = section_id[-1]
        section_name = record[1]
        next_index = document.index(record) + 1
        
        if record[0] == section_number:
            paragraph_ids = []

            if next_index == len(document):
                next_section = 'EndOfDocument'
            else:
                next_section = document[next_index][0]
                
            # EMPTY SECTION CHECK (N0 PARAGRAPHS BETWEEN A SECTION AND ITS IMMEDIATE SUBSECTION)
            if not bool(record[2]):
                subsection_ids = []

                for record in document:
                    pattern = rf'^{section_number}\.[^.]+$'
                    if re.search(pattern, record[0]):
                        subsection_id = doi + '_' + str(record[0])
                        subsection_ids.append('data:'+subsection_id)
                
                subsection_ids_joined = ', '.join(subsection_ids)
                
                print(f"{section_id} rdf:type onner:Section ;")
                print(f"onner:sectionTitle '{section_number}'^^xsd:string ;")
                print(f"onner:sectionNumber '{section_name}'^^xsd:string ;")
                print(f"onner:nextDocumentPart data:{doi}_{next_section} ;")
                print(f"onner:directlyContainsDocumentPart {subsection_ids_joined} .")
                
                print()
                print('#========================= SECTION DIVIDER =========================#')
                print()
                
            # NON-EMPTY SECTION CHECK (PARAGRAPHS EXIST BETWEEN A SECTION AND ITS IMMEDIATE SUBSECTION)
            else:
                
                # CREATING PARAGRAPH ID FOR SECTION'S PARAGRAPH 
                for paragraph_number, _ in record[2].items():
                    paragraph_id = doi + '_' + str(section_number) + '-' + str(paragraph_number)
                    paragraph_ids.append('data:'+paragraph_id)

                paragraph_ids_joined = ', '.join(paragraph_ids)

                # PRINTING SECTION AND IT'S PARAGRAPH IDs 
                print(f"{section_id} rdf:type onner:Section ;")
                print(f"onner:sectionTitle '{record[1]}'^^xsd:string ;")
                print(f"onner:sectionNumber '{record[0]}'^^xsd:string ;")
                print(f"onner:nextDocumentPart {paragraph_ids[0]} ;")    # NEXT DOC PART AFTER SECTION
                print(f"onner:directlyContainsDocumentPart {paragraph_ids_joined} .")
                print()
                
                # PARAGRAPHS AND IT'S DOCUMENT PARTS
                for paragraph_number, paragraph_text in record[2].items():
                    labeled_term_info_list = []
                    labeled_term_sequence = 1

                    # replacing ' with \' in text
                    if "'" in paragraph_text:
                        paragraph_text = paragraph_text.replace("'", r"\'")

                    # creating doc object
                    doc = nlp(paragraph_text)

                    for ent in doc.ents:
                        current_dateTime = str(datetime.now())[:-7]    # RECHECK THE APPROPRIATE PLACEMENT
                        labeled_term_id = doi + '_' + str(section_number) + '-' + str(paragraph_number) + '-' + str(labeled_term_sequence)
                        labeled_term = ent.text
                        label = ent.label_
                        offset = ent.start_char
                        length = ent.end_char - ent.start_char
                        labeled_term_info = [labeled_term_id, labeled_term, label, offset, length, current_dateTime]
                        labeled_term_info_list.append(labeled_term_info)
                        labeled_term_sequence += 1

                    labeled_term_ids = ['data:'+i[0] for i in labeled_term_info_list]
                    labeled_term_ids_joined = ', '.join(labeled_term_ids)

                    # PRINTING PARAGRAPHS AND IT'S LABELED TERM IDs 
                    print(f"{section_id}-{paragraph_number} rdf:type onner:Paragraph ;")
                    print(f"onner:positionInParentDocumentPart '{paragraph_number}'^^xsd:nonNegativeInteger ;")

                    if paragraph_number == len(paragraph_ids):
                        print(f"onner:nextDocumentPart data:{doi}_{next_section} ;")
                    else:
                        print(f"onner:nextDocumentPart data:{doi}_{section_number}-{paragraph_number+1} ;")

                    print(f"onner:paragraphText '{paragraph_text}'^^xsd:string ;")
                    print(f"onner:directlyContainsLabeledTerm {labeled_term_ids_joined} .")
                    print()

                    # LABELED TERMS
                    for info in labeled_term_info_list:
                        # PRINTING LABELED TERMS
                        print(f"data:{info[0]} rdf:type onner:LabeledTerm ;")    # DEAL WITH ATOMIC / COMPOUND
                        print(f"onner:labeledTermText '{info[1]}'^^xsd:string ;")
                        print(f"onner:offset '{info[3]}'^^xsd:nonNegativeInteger ;")
                        print(f"onner:length '{info[4]}'^^xsd:nonNegativeInteger ;")
                        print(f"onner:labeledTermDirectlyContainedBy data:{doi}_{section_number}-{paragraph_number} ;")
                        print(f"onner:hasLabeledTermStatus data:Candidate_{info[0]} .")
                        print()

                        # PRINTING LABELED TERMS STATUS
                        print(f"data:Candidate_{info[0]} rdf:type onner:CandidateStatus ;")
                        print(f"onner:statusAssignmentDate '{info[5]}'^^xsd:dateTime ;")
                        print(f"onner:statusAssignedBy data:Cellulosic_NER_Model ;")

                        try:
                            if not bool(labeling_schema):
                                raise Exception('Labeling schema is empty!')

                            label_number_in_schema = labeling_schema.index(info[2]) + 1
                        except Exception as e:
                            print(f'ERROR: {e}')
                        else:
                            print(f"onner:hasLabeledTermLabel data:Label_{label_number_in_schema} .")

                        # adding lebels and their position in the schema
                        if [label_number_in_schema, info[2]] not in labels_in_doc:
                            labels_in_doc.append([label_number_in_schema, info[2]])

                        print()

                print('#========================= SECTION DIVIDER =========================#')
                print()
                
try:
    if not bool(labels_in_doc):
        raise Exception('List of labels found in document is empty!')
        
    for label in labels_in_doc:
        print(f"data:Label_{label[0]} rdf:type onner:Label ;")
        print(f"onner:fromLabelingSchema data:Labeling_Schema ;")
        print(f"onner:labelText '{label[1]}'^^xsd:string .")
        print()       
except Exception as e:
    print(f'ERROR: {e}')

print(f"data:Labeling_Schema rdf:type onner:LabelingSchema ;")
print(f"onner:schemaName 'CelloGraph'^^xsd:string .")
print()
            
print(f"data:Cellulosic_NER_Model rdf:type onner:NER_System ;")    # if/else required to identify system and human
print(f"onner:systemVersion '1.0'^^xsd:string .")
print()

print(f"data:{doi}_EndOfDocument rdf:type onner:EndOfDocument .")




@prefix onner: <http://spatialai.org/cellograph/ontology/onner/v2.0#> .
@prefix data: <http://spatialai.org/cellograph/ontology/onner/v2.0/data#> .
@prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> .
@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .
@prefix xsd: <http://www.w3.org/2001/XMLSchema#> .
@prefix owl: <http://www.w3.org/2002/07/owl#> .

data:Publication_10.1016_j.memsci.2018.03.085 rdf:type onner:ScholarlyPublication ;
onner:publicationTitle 'How the shape of fillers affects the barrier properties of polymer/non-porous particles nanocomposites: A review'^^xsd:string ;
onner:doi '10.1016/j.memsci.2018.03.085'^^xsd:string ;
onner:publicationDate '2018-04-03'^^xsd:date ;
onner:directlyContainsDocumentPart data:10.1016_j.memsci.2018.03.085_A, data:10.1016_j.memsci.2018.03.085_1, data:10.1016_j.memsci.2018.03.085_2, data:10.1016_j.memsci.2018.03.085_3, data:10.1016_j.memsci.2018.03.085_4, data:10.1016_j.memsci.2018.03.085_5 .

data:10.1016_j.memsci.2018.03.085_1 rd

data:10.1016_j.memsci.2018.03.085_2-1 rdf:type onner:Paragraph ;
onner:positionInParentDocumentPart '1'^^xsd:nonNegativeInteger ;
onner:nextDocumentPart data:10.1016_j.memsci.2018.03.085_2-2 ;
onner:paragraphText 'All the processes used to prepare polymer-based nanocomposites have already been extensively detailed in dedicated reviews and are not further detailed here. Examples of reference publications for layered silicates-based nanocomposites are those of Alexandre &amp; Dubois , Ray et al. , Pavlidou et al. , Mittal et al.  and Cui et al. , for natural fibre-based nanocomposites, those of Saheb et al.  and Siqueira et al.  and for inorganic spherical particle-based nanocomposites, those of Cong et al.  and Chung et al. . Due to the large range of polymer matrices and nanoparticles, different processing routes have been proposed to produce nanocomposites based on their convenience according to the nature of the raw constituents and their field of application (Fig. ). At laboratory s

# Connect to GraphDB

In [51]:
from SPARQLWrapper import SPARQLWrapper, JSON
import pandas as pd
import json

In [52]:
# connect and query graph database
# ADD TRY CATCH TO AVOID DATABASE CONNECTIVITY ERROR

# specify the repository
sparql = SPARQLWrapper("http://LAPTOP-S7TVD5I4:7200/repositories/SciPub")

# SPARQL query
sparql.setQuery("""
    PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
    PREFIX : <http://spatialai.org/SciPub/v2.0#>
    PREFIX SciPub: <http://spatialai.org/SciPub/v2.0#>

    SELECT ?sentenceID ?sentenceText ?labeledTermText ?offset ?length ?labelText (NOW() AS ?systemDateTime)
    WHERE {
        ?paragraphID rdf:type SciPub:Paragraph ;
                     SciPub:directlyContainsDocumentPart ?sentenceID .

        ?sentenceID SciPub:sentenceText ?sentenceText .

        OPTIONAL {
            ?labeledTermID SciPub:documentPartIsDirectlyContainedBy ?sentenceID ;
                           SciPub:labeledTermText ?labeledTermText ;
                           SciPub:offset ?offset ;
                           SciPub:length ?length ;
                           SciPub:hasLabel ?labelID .

            ?labelID rdf:type SciPub:Label ;
                     SciPub:labelText ?labelText .
        }
    }
""")

# convert results to JSON
sparql.setReturnFormat(JSON)
fetched_data = sparql.query().convert()


URLError: <urlopen error [WinError 10061] No connection could be made because the target machine actively refused it>

In [None]:
# convert query result from JSON to dataframe

column_header = ['sentence_id', 'sentence_text', 'labeled_term_text', 'offset', 'length', 'label_text', 'system_datetime']
fetched_data_df = pd.DataFrame(columns=column_header)
sentence_id = []
sentence_text = []
labeled_term_text = []
offset = []
length = []
label_text = []
system_datetime = []

for record in fetched_data['results']['bindings']:
    if len(record) == 7:
        for key, value in record.items():
            if key == 'sentenceID':
                sentence_id.append(value['value'])
            if key == 'sentenceText':
                sentence_text.append(value['value'])
            if key == 'labeledTermText':
                labeled_term_text.append(value['value'])
            if key == 'offset':
                offset.append(value['value'])
            if key == 'length':
                length.append(value['value'])
            if key == 'labelText':
                label_text.append(value['value'])
            if key == 'systemDateTime':
                format_system_datetime = value['value'][:-10]
                format_system_datetime = format_system_datetime.replace('T', ' ')
                system_datetime.append(format_system_datetime)
    
    elif len(record) == 3:
        for key, value in record.items():
            if key == 'sentenceID':
                sentence_id.append(value['value'])
            if key == 'sentenceText':
                sentence_text.append(value['value'])
            if key == 'systemDateTime':
                format_system_datetime = value['value'][:-10]
                format_system_datetime = format_system_datetime.replace('T', ' ')
                system_datetime.append(format_system_datetime)
        
        labeled_term_text.append('')
        offset.append('')
        length.append('')
        label_text.append('')
    
    else:
        print('Different record length (should be 7 or 3) found in SPARQL result (JSON format).')
        
if len(sentence_id) == len(sentence_text) == len(labeled_term_text) == len(offset) == len(length) == len(label_text) == len(system_datetime):
    fetched_data_df['sentence_id'] = sentence_id
    fetched_data_df['sentence_text'] = sentence_text
    fetched_data_df['labeled_term_text'] = labeled_term_text
    fetched_data_df['offset'] = offset
    fetched_data_df['length'] = length
    fetched_data_df['label_text'] = label_text
    fetched_data_df['system_datetime'] = system_datetime 
else:
    print('Different length found for dataframe columns.')

In [None]:
fetched_data_df

In [None]:
# create JSON from dataframe

distinct_sentence_id = list(fetched_data_df['sentence_id'].unique())
fetched_data_json = {'classes': [], 'annotations': []}

for sentence_id in distinct_sentence_id:
    single_sentence_annotation = []
    fetched_data_sub_df = fetched_data_df.loc[fetched_data_df['sentence_id'] == sentence_id]
    first_index = list(fetched_data_sub_df.index)[0]
    single_sentence_annotation.append(sentence_id)
    single_sentence_annotation.append(fetched_data_sub_df['sentence_text'][first_index])
    single_sentence_annotation.append({'entities': []})

    for idx in fetched_data_sub_df.index:          
        if offset[idx] == '' or length[idx] == '' or label_text[idx] == '':
            ll = ['', '', ['', [['', '', '']]]]
            single_sentence_annotation[2]['entities'].append(ll)
        else:
            ll = [int(offset[idx]), int(offset[idx])+int(length[idx]), [label_text[idx], [['Candidate', system_datetime[idx], 'NER Model']]]]
            single_sentence_annotation[2]['entities'].append(ll)
        
    fetched_data_json['annotations'].append(single_sentence_annotation)


In [None]:
json.dumps(fetched_data_json)

In [None]:
a = "To explain this deviation of the \'ideal behavior\', this paper discusses extensively the impact of the nanoparticle shape on the nanocomposite permeability along with structural aspects, related to both the particle nature and size, and the nanocomposite processing routes."

In [None]:
a.replace("\'", "'")

In [None]:
a = "It is well known that inclusion of homogeneously dispersed and oriented impermeable fillers with high aspect ratio, such as platelets or elongated particles, should significantly increase the diffusion path of gas and vapors and yield to improve barrier properties."

In [None]:
a[218:224]