In [1]:
import xml.etree.ElementTree as ET
import re
import spacy
import scispacy

### XML Structure
- teiHeader
    - fileDesc
        - titleStmt
            - title
        - publicationStmt
            - date
        - sourceDesc
            - biblStruct
                - idno
    - encodingDesc
    - profileDesc

### List Structure
- [
    - [
        - section_number,
        - section_title,
        - {
            - paragraph_number:
                - {
                    - sentence_number: sectence_text,
                - },
        - }
    - ],
- ]

### XML to List

In [2]:
# loading xml and creating root

tree = ET.parse('D:\\Drive\\SISE\\CelloGraph\\Dev\\data\\xml_output\\wolf_et_al.xml')
root = tree.getroot()

# getting metadata (title, publication_date and doi)

title = ''
publication_date = ''
doi = ''

for elem in root:
    if elem.tag[29:] == 'teiHeader':
        for sub_elem1 in elem:
            if sub_elem1.tag[29:] == 'fileDesc':
                for sub_elem2 in sub_elem1:
                    if sub_elem2.tag[29:] == 'titleStmt':
                        for sub_elem3 in sub_elem2:
                            if sub_elem3.tag[29:] == 'title':
                                title = sub_elem3.text
                    if sub_elem2.tag[29:] == 'publicationStmt':
                        for sub_elem3 in sub_elem2:
                            if sub_elem3.tag[29:] == 'date':
                                publication_date = sub_elem3.text
                    if sub_elem2.tag[29:] == 'sourceDesc':
                        for sub_elem3 in sub_elem2:
                            if sub_elem3.tag[29:] == 'biblStruct':
                                for sub_elem4 in sub_elem3:
                                    if sub_elem4.tag[29:] == 'idno':
                                        doi = sub_elem4.text
                                        
print(title)
print(publication_date)
print(doi)

How the shape of fillers affects the barrier properties of polymer/non-porous particles nanocomposites: A review
03 April 2018
10.1016/j.memsci.2018.03.085


In [3]:
# getting abstract

abstract = []

for elem in root:
    if elem.tag[29:] == 'teiHeader':
        for sub_elem1 in elem:
            if sub_elem1.tag[29:] == 'profileDesc':
                for sub_elem2 in sub_elem1:
                    if sub_elem2.tag[29:] == 'abstract':
                        for sub_elem3 in sub_elem2:
                            if sub_elem3.tag[29:] == 'div':
                                list_of_paragraphs = {}
                                paragraph_number = 0
                                for sub_elem4 in sub_elem3:
                                    if sub_elem4.tag[29:] == 'p':
                                        paragraph_number += 1
                                        list_of_paragraphs[paragraph_number] = ET.tostring(sub_elem4, encoding='unicode')
                                abstract.append(['Abstract', 'Abstract', list_of_paragraphs])
                                
abstract

[['Abstract',
  'Abstract',
  {1: '<ns0:p xmlns:ns0="http://www.tei-c.org/ns/1.0">More than 1000 published experimental data of gas (O 2 and CO 2 ) and vapor (H 2 O) permeability in nanocomposites containing either spherical, elongated or platelet particles were collected, assorted and compared in order to decipher the role of particle shape on the reduction of the relative permeability of the nanocomposite. It is well known that inclusion of homogeneously dispersed and oriented impermeable fillers with high aspect ratio, such as platelets or elongated particles, should significantly increase the diffusion path of gas and vapors and yield to improve barrier properties. Results revealed that this expected impact was not systematically achieved, even for impermeable lamellar fillers that usually displayed the highest aspect ratio. More specifically, an unexpected increase of the permeability in the nanocomposite was often observed. To explain this deviation of the \'ideal behavior\', thi

In [4]:
# getting other sections

list_of_sections = []

for elem in root:
    if (elem.tag[29:] == 'text'):
        for sub_elem1 in elem:
            if (sub_elem1.tag[29:] == 'body'):
                for sub_elem2 in sub_elem1:
                    if sub_elem2.tag[29:] == 'div':
                        section_number = ''
                        section_name = ''
                        list_of_paragraphs = []
                        for sub_elem3 in sub_elem2:
                            if sub_elem3.tag[29:] == 'head':
                                if bool(sub_elem3.attrib):
                                    section_number = str(sub_elem3.attrib)[7:-3]
                                else:
                                    section_number = 'NO_SECTION_NUMBER'
                                section_name = sub_elem3.text
                            if sub_elem3.tag[29:] == 'p':
                                list_of_paragraphs.append(ET.tostring(sub_elem3, encoding='unicode'))
                        if section_number == 'NO_SECTION_NUMBER' and not bool(list_of_paragraphs):
                            pass
                        else:
                            list_of_sections.append([section_number, section_name, list_of_paragraphs])

list_of_sections

[['1',
  'Introduction',
  ['<ns0:p xmlns:ns0="http://www.tei-c.org/ns/1.0">In the objective of developing efficient and optimal packaging, one of the main challenges is to design and provide food packaging materials able to protect the food from the external environment and to maintain food quality and safety throughout its shelf life <ns0:ref type="bibr" target="#b0">[1,</ns0:ref><ns0:ref type="bibr" target="#b1">2]</ns0:ref>. The mass transfers are thus at the heart of the feature of the food packaging, especially transfers of water vapor, oxygen and/or carbon dioxide, which condition the rates of numerous reactions of food degradation (oxidation, microbial development, physiological reactions, etc.). The development of bulk nanocomposite structures by introducing nanoparticles, i.e. fillers having at least one dimension lower than 100 nm, in polymeric matrices appeared as one of the most promising directions in the development of packaging materials with advanced mass transfer prop

In [5]:
# merging "paragraphs with no section number" to its previous section
# HAVE TO ADJUST THIS SECTION FOR TABLES/IMAGES

list_of_sections_length = len(list_of_sections)

for i in range(list_of_sections_length-1,0,-1):
    if list_of_sections[i][0] == 'NO_SECTION_NUMBER':
        list_of_sections[i-1][2].extend(list_of_sections[i][2])
        list_of_sections.remove(list_of_sections[i])
        
list_of_sections

[['1',
  'Introduction',
  ['<ns0:p xmlns:ns0="http://www.tei-c.org/ns/1.0">In the objective of developing efficient and optimal packaging, one of the main challenges is to design and provide food packaging materials able to protect the food from the external environment and to maintain food quality and safety throughout its shelf life <ns0:ref type="bibr" target="#b0">[1,</ns0:ref><ns0:ref type="bibr" target="#b1">2]</ns0:ref>. The mass transfers are thus at the heart of the feature of the food packaging, especially transfers of water vapor, oxygen and/or carbon dioxide, which condition the rates of numerous reactions of food degradation (oxidation, microbial development, physiological reactions, etc.). The development of bulk nanocomposite structures by introducing nanoparticles, i.e. fillers having at least one dimension lower than 100 nm, in polymeric matrices appeared as one of the most promising directions in the development of packaging materials with advanced mass transfer prop

In [6]:
# organizing paragraph(s) sequentially in a dictionary
# adding dictionary with each respective record as a 4th element

for section in list_of_sections:
    list_of_paragraphs = {}
    for paragraph_number, paragraph_text in enumerate(section[2], start=1):
        list_of_paragraphs[paragraph_number] = paragraph_text
    section.append(list_of_paragraphs)
    
list_of_sections

[['1',
  'Introduction',
  ['<ns0:p xmlns:ns0="http://www.tei-c.org/ns/1.0">In the objective of developing efficient and optimal packaging, one of the main challenges is to design and provide food packaging materials able to protect the food from the external environment and to maintain food quality and safety throughout its shelf life <ns0:ref type="bibr" target="#b0">[1,</ns0:ref><ns0:ref type="bibr" target="#b1">2]</ns0:ref>. The mass transfers are thus at the heart of the feature of the food packaging, especially transfers of water vapor, oxygen and/or carbon dioxide, which condition the rates of numerous reactions of food degradation (oxidation, microbial development, physiological reactions, etc.). The development of bulk nanocomposite structures by introducing nanoparticles, i.e. fillers having at least one dimension lower than 100 nm, in polymeric matrices appeared as one of the most promising directions in the development of packaging materials with advanced mass transfer prop

In [7]:
# deleting 3rd element from each record of document

for section in list_of_sections:
    section.remove(section[2])
    
list_of_sections

[['1',
  'Introduction',
  {1: '<ns0:p xmlns:ns0="http://www.tei-c.org/ns/1.0">In the objective of developing efficient and optimal packaging, one of the main challenges is to design and provide food packaging materials able to protect the food from the external environment and to maintain food quality and safety throughout its shelf life <ns0:ref type="bibr" target="#b0">[1,</ns0:ref><ns0:ref type="bibr" target="#b1">2]</ns0:ref>. The mass transfers are thus at the heart of the feature of the food packaging, especially transfers of water vapor, oxygen and/or carbon dioxide, which condition the rates of numerous reactions of food degradation (oxidation, microbial development, physiological reactions, etc.). The development of bulk nanocomposite structures by introducing nanoparticles, i.e. fillers having at least one dimension lower than 100 nm, in polymeric matrices appeared as one of the most promising directions in the development of packaging materials with advanced mass transfer p

In [8]:
# merging abstract with other sections

document = abstract + list_of_sections
document

[['Abstract',
  'Abstract',
  {1: '<ns0:p xmlns:ns0="http://www.tei-c.org/ns/1.0">More than 1000 published experimental data of gas (O 2 and CO 2 ) and vapor (H 2 O) permeability in nanocomposites containing either spherical, elongated or platelet particles were collected, assorted and compared in order to decipher the role of particle shape on the reduction of the relative permeability of the nanocomposite. It is well known that inclusion of homogeneously dispersed and oriented impermeable fillers with high aspect ratio, such as platelets or elongated particles, should significantly increase the diffusion path of gas and vapors and yield to improve barrier properties. Results revealed that this expected impact was not systematically achieved, even for impermeable lamellar fillers that usually displayed the highest aspect ratio. More specifically, an unexpected increase of the permeability in the nanocomposite was often observed. To explain this deviation of the \'ideal behavior\', thi

In [9]:
# text preprocessing
# HAVE TO WORK WITH SUBSCRIPT AND SUPERSCRIPT

pattern_starting_p_tag = r'<ns0:p[^>]+>'
pattern_ending_p_tag = '</ns0:p>'
pattern_starting_ref_tag = r'<ns0:ref[^>]+>'
pattern_ending_ref_tag = '</ns0:ref>'
pattern_ref = r'<ref>.*?</ref>'    # temporary for removing ref tag

for record in document:
    for paragraph_number, paragraph_text in record[2].items():
        text = paragraph_text
        text = re.sub(pattern_starting_p_tag, '', text)
        text = re.sub(pattern_ending_p_tag, '', text)
        text = re.sub(pattern_starting_ref_tag, '<ref>', text)
        text = re.sub(pattern_ending_ref_tag, '</ref>', text)
        text = re.sub(pattern_ref, '', text)
        record[2][paragraph_number] = text
        
document

[['Abstract',
  'Abstract',
  {1: "More than 1000 published experimental data of gas (O 2 and CO 2 ) and vapor (H 2 O) permeability in nanocomposites containing either spherical, elongated or platelet particles were collected, assorted and compared in order to decipher the role of particle shape on the reduction of the relative permeability of the nanocomposite. It is well known that inclusion of homogeneously dispersed and oriented impermeable fillers with high aspect ratio, such as platelets or elongated particles, should significantly increase the diffusion path of gas and vapors and yield to improve barrier properties. Results revealed that this expected impact was not systematically achieved, even for impermeable lamellar fillers that usually displayed the highest aspect ratio. More specifically, an unexpected increase of the permeability in the nanocomposite was often observed. To explain this deviation of the 'ideal behavior', this paper discusses extensively the impact of the n

In [10]:
# import spacy and load en_core_web_sm model

import spacy
nlp = spacy.load('en_core_web_sm')

In [11]:
# organizing sentences sequentially in a dictionary
# adding dictionary with each respective record as a 4th element

for record in document:
    text_content = record[2]
    list_of_paragraphs = {}
    
    for paragraph_number, paragraph_text in text_content.items():
        list_of_sentences = {}
        doc = nlp(paragraph_text)
        
        for sentence_number, sentence_text in enumerate(doc.sents, start=1):
            list_of_sentences[sentence_number] = sentence_text
        
        list_of_paragraphs[paragraph_number] = list_of_sentences
        
    record.append(list_of_paragraphs)

In [12]:
# deleting 3rd element from each record of document

for record in document:
    record.remove(record[2])  

In [13]:
document

[['Abstract',
  'Abstract',
  {1: {1: More than 1000 published experimental data of gas (O 2 and CO 2 ) and vapor (H 2 O) permeability in nanocomposites containing either spherical, elongated or platelet particles were collected, assorted and compared in order to decipher the role of particle shape on the reduction of the relative permeability of the nanocomposite.,
    2: It is well known that inclusion of homogeneously dispersed and oriented impermeable fillers with high aspect ratio, such as platelets or elongated particles, should significantly increase the diffusion path of gas and vapors and yield to improve barrier properties.,
    3: Results revealed that this expected impact was not systematically achieved, even for impermeable lamellar fillers that usually displayed the highest aspect ratio.,
    4: More specifically, an unexpected increase of the permeability in the nanocomposite was often observed.,
    5: To explain this deviation of the 'ideal behavior', this paper discus

### List to RDF

In [21]:
# printing metadata (title, publication_date and doi)

print(f":Pub1 rdf:type :ScholarlyPublication .")
print(f":Pub1 sp:title '{title}'^^xsd:string .")
print(f":Pub1 sp:doi '{doi}'^^xsd:string .")
print(f":Pub1 sp:publicationDate '{publication_date}'^^xsd:date .")

:Pub1 rdf:type :ScholarlyPublication .
:Pub1 sp:title 'How the shape of fillers affects the barrier properties of polymer/non-porous particles nanocomposites: A review'^^xsd:string .
:Pub1 sp:doi '10.1016/j.memsci.2018.03.085'^^xsd:string .
:Pub1 sp:publicationDate '03 April 2018'^^xsd:date .


In [37]:
# printing abstract and sections

# Comments:
# Location data not available in xml

for record in document:
    if record[0] == 'Abstract':
        for paragraph_number, paragraph_object in record[2].items():
            print(f":Abstract{paragraph_number} rdf:type :Abstract .")
            print('\n')
            for sentence_number, sentence_text in paragraph_object.items():
                print(f":Abstract{paragraph_number} sp:containsSentence :Abstract{paragraph_number}Sentence{sentence_number} .")
                print(f":Abstract{paragraph_number}Sentence{sentence_number} rdf:type :Sentence .")
                print(f":Abstract{paragraph_number}Sentence{sentence_number} sp:hasLocationInDocument :Abstract{paragraph_number}Sentence{sentence_number}Location .")
                print(f":Abstract{paragraph_number}Sentence{sentence_number} sp:sentenceText '{sentence_text}'^^xsd:string .")
                print('\n')
    else:
        section_number = record[0]
        section_name = record[1]
        
        print(f":Section{record[0]} rdf:type :Section .")
        print(f":Section{record[0]} sp:headerText '{record[1]}'^^xsd:string .")
        print('\n')
        
        for paragraph_number, paragraph_object in record[2].items():
            print(f":Section{section_number} sp:containsParagraph :Section{section_number}Paragraph{paragraph_number} .")
            print(f":Section{section_number}Paragraph{paragraph_number} rdf:type :Paragraph .")
            print('\n')
            for sentence_number, sentence_text in paragraph_object.items():
                print(f":Section{section_number}Paragraph{paragraph_number} sp:containsSentence :Section{section_number}Paragraph{paragraph_number}Sectence{sentence_number} .")
                print(f":Section{section_number}Paragraph{paragraph_number}Sectence{sentence_number} rdf:type :Sectence .")
                print(f":Section{section_number}Paragraph{paragraph_number}Sectence{sentence_number} sp:hasLocationInDocument :Section{section_number}Paragraph{paragraph_number}Sectence{sentence_number}Location .")
                print(f":Section{section_number}Paragraph{paragraph_number}Sectence{sentence_number} sp:sentenceText '{sentence_text}'^^xsd:string .")
                print('\n')

:Abstract1 rdf:type :Abstract .


:Abstract1 sp:containsSentence :Abstract1Sentence1 .
:Abstract1Sentence1 rdf:type :Sentence .
:Abstract1Sentence1 sp:hasLocationInDocument :Abstract1Sentence1Location .
:Abstract1Sentence1 sp:sentenceText 'More than 1000 published experimental data of gas (O 2 and CO 2 ) and vapor (H 2 O) permeability in nanocomposites containing either spherical, elongated or platelet particles were collected, assorted and compared in order to decipher the role of particle shape on the reduction of the relative permeability of the nanocomposite.'^^xsd:string .


:Abstract1 sp:containsSentence :Abstract1Sentence2 .
:Abstract1Sentence2 rdf:type :Sentence .
:Abstract1Sentence2 sp:hasLocationInDocument :Abstract1Sentence2Location .
:Abstract1Sentence2 sp:sentenceText 'It is well known that inclusion of homogeneously dispersed and oriented impermeable fillers with high aspect ratio, such as platelets or elongated particles, should significantly increase the diffusion path 

# Simple Demo

In [3]:
# READING ABSTRACT

tree = ET.parse('Wolf et al.xml')
root = tree.getroot()
text = ''

for elem in root:
#     print(elem[0].tag)
    if elem.tag[29:] == 'teiHeader':
        for sub_elem1 in elem:
            if sub_elem1.tag[29:] == 'profileDesc':
                for sub_elem2 in sub_elem1:
                    if sub_elem2.tag[29:] == 'abstract':
                        for sub_elem3 in sub_elem2:
                            if sub_elem3.tag[29:] == 'div':
                                for sub_elem4 in sub_elem3:
                                    text = sub_elem4.text
                                    
text

"More than 1000 published experimental data of gas (O 2 and CO 2 ) and vapor (H 2 O) permeability in nanocomposites containing either spherical, elongated or platelet particles were collected, assorted and compared in order to decipher the role of particle shape on the reduction of the relative permeability of the nanocomposite. It is well known that inclusion of homogeneously dispersed and oriented impermeable fillers with high aspect ratio, such as platelets or elongated particles, should significantly increase the diffusion path of gas and vapors and yield to improve barrier properties. Results revealed that this expected impact was not systematically achieved, even for impermeable lamellar fillers that usually displayed the highest aspect ratio. More specifically, an unexpected increase of the permeability in the nanocomposite was often observed. To explain this deviation of the 'ideal behavior', this paper discusses extensively the impact of the nanoparticle shape on the nanocompo

In [4]:
# LOADING MODEL AND DOC OBJECT

nlp = spacy.load('experiment/SINGLE_LABEL_CELLULOSIC_MODEL_ACC/model-best/')
nlp.add_pipe('sentencizer')
doc = nlp(text)

In [5]:
# CREATE UNIQUE ENTITY LIST

entity_list = []

# for ent in doc.ents:
#     if ent.text not in entity_list:
#         entity_list.append(ent.text)
        
# entity_list
previous_sentence = ''
l = []

for ent in doc.ents:
    
    current_sentence = ent.sent.text
    
    if previous_sentence == '':
        entity_list.append(ent.text)
    elif previous_sentence == current_sentence :
        entity_list.append(ent.text)
    else:
        l.append((previous_sentence, entity_list))
        entity_list.clear()
        entity_list.append(ent.text)
        
    previous_sentence = current_sentence



    

In [9]:
# TEMP CELL

for ent in doc.ents:
    print(ent.text, '===>>', ent.label_)

gas ===>> ENTITY
O 2 ===>> ENTITY
CO ===>> ENTITY
H 2 O ===>> ENTITY
nanocomposites ===>> ENTITY
platelet particles ===>> ENTITY
particle shape ===>> ENTITY
nanocomposite ===>> ENTITY
inclusion of ===>> ENTITY
particles ===>> ENTITY
gas ===>> ENTITY
vapors ===>> ENTITY
nanocomposite ===>> ENTITY
paper ===>> ENTITY
nanoparticle shape ===>> ENTITY
nanocomposite permeability ===>> ENTITY
size ===>> ENTITY
nanocomposite ===>> ENTITY
gas selectivity ===>> ENTITY
O 2 /CO ===>> ENTITY


In [17]:
para_data = []

for sent_no, sent in enumerate(doc.sents, start=1):
    ent_from_sent = []
    for ent in sent.ents:
        ent_with_position = (ent.text, ent.start_char, ent.end_char)
        ent_from_sent.append(ent_with_position)
#         print(sent_no, '=====>>', sent.text, '=====>>', ent.text)

    if bool(ent_from_sent):
        sent_data = (sent_no, sent, ent_from_sent)
        para_data.append(sent_data)
    
para_data
        
        

[(1,
  More than 1000 published experimental data of gas (O 2 and CO 2 ) and vapor (H 2 O) permeability in nanocomposites containing either spherical, elongated or platelet particles were collected, assorted and compared in order to decipher the role of particle shape on the reduction of the relative permeability of the nanocomposite.,
  [('gas', 46, 49),
   ('O 2', 51, 54),
   ('CO', 59, 61),
   ('H 2 O', 77, 82),
   ('nanocomposites', 100, 114),
   ('platelet particles', 157, 175),
   ('particle shape', 247, 261),
   ('nanocomposite', 315, 328)]),
 (2,
  It is well known that inclusion of homogeneously dispersed and oriented impermeable fillers with high aspect ratio, such as platelets or elongated particles, should significantly increase the diffusion path of gas and vapors and yield to improve barrier properties.,
  [('inclusion of', 352, 364),
   ('particles', 477, 486),
   ('gas', 540, 543),
   ('vapors', 548, 554)]),
 (4,
  More specifically, an unexpected increase of the permea

In [16]:
print(f':LabeledTerm{} rdf:type :AtomicLabeledTerm ;')
print(f"sp:labeledTermText '{}'^^xsd:string ;")
print(f'sp:labeledTermIsContainedBy :{} ;')
print(f"sp:offset '{}'^^xsd:nonNegativeInteger ;")
print(f"sp:length '{}'^^xsd:nonNegativeInteger ;")
print(f'sp:hasLabel :Label1??? .')

SyntaxError: f-string: empty expression not allowed (1440621099.py, line 1)

In [46]:
labeled_term_serial = 0

for sent_data in para_data:
    for ent in sent_data[2]:
#         print(ent)
        labeled_term_serial += 1
        print(f':LabeledTerm{labeled_term_serial} rdf:type :AtomicLabeledTerm ;')
        print(f"sp:labeledTermText '{ent[0]}'^^xsd:string ;")
        print(f'sp:labeledTermIsContainedBy :Abstract1Sentence{sent_data[0]} ;')
        print(f"sp:offset '{ent[1]}'^^xsd:nonNegativeInteger ;")
        print(f"sp:length '{ent[2] - ent[1]}'^^xsd:nonNegativeInteger ;")
        print(f'sp:hasLabel :Label1??? .')
        print('\n')

:LabeledTerm1 rdf:type :AtomicLabeledTerm ;
sp:labeledTermText 'gas'^^xsd:string ;
sp:labeledTermIsContainedBy :Abstract1Sentence1 ;
sp:offset '46'^^xsd:nonNegativeInteger ;
sp:length '3'^^xsd:nonNegativeInteger ;
sp:hasLabel :Label1??? .


:LabeledTerm2 rdf:type :AtomicLabeledTerm ;
sp:labeledTermText 'O 2'^^xsd:string ;
sp:labeledTermIsContainedBy :Abstract1Sentence1 ;
sp:offset '51'^^xsd:nonNegativeInteger ;
sp:length '3'^^xsd:nonNegativeInteger ;
sp:hasLabel :Label1??? .


:LabeledTerm3 rdf:type :AtomicLabeledTerm ;
sp:labeledTermText 'CO'^^xsd:string ;
sp:labeledTermIsContainedBy :Abstract1Sentence1 ;
sp:offset '59'^^xsd:nonNegativeInteger ;
sp:length '2'^^xsd:nonNegativeInteger ;
sp:hasLabel :Label1??? .


:LabeledTerm4 rdf:type :AtomicLabeledTerm ;
sp:labeledTermText 'H 2 O'^^xsd:string ;
sp:labeledTermIsContainedBy :Abstract1Sentence1 ;
sp:offset '77'^^xsd:nonNegativeInteger ;
sp:length '5'^^xsd:nonNegativeInteger ;
sp:hasLabel :Label1??? .


:LabeledTerm5 rdf:type :AtomicLabel