In [1]:
import pandas as pd
import ast
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import os
import re
import spacy
from spacy import displacy
df = pd.read_csv('22perm.csv')

In [2]:
# Load the spaCy English model
nlp = spacy.load("en_core_web_sm")

In [3]:
document = df['doc'].unique()

In [4]:
document

array(['Antiplasticisation and oxygen permeability of starch–sorbitol films',
       'Application of bioplastics for food packaging',
       'Barrier and mechanical properties of carrot puree films',
       'Barrier and surface properties of chitosan-coated greaseproof paper',
       'Barrier properties of chitosan coated polyethylene',
       'Design of biodegradable bio-based equilibrium modified atmosphere packaging (EMAP) for fresh fruits and vegetables by using micro-perforated poly-lactic acid (PLA) films',
       'Edible oxygen barrier bilayer film pouches from corn zein and soy protein isolate for olive oil packaging',
       'Effect of clay content, homogenization RPM, pH, and ultrasonication on mechanical and barrier properties of fish gelatin montmorillonite nanocomposite films',
       'Effect of plasticizers and fatty acids on mechanical and permeability characteristics of chitosan films',
       'Exploring the potentialities of using lignocellulosic fibres derived from th

In [5]:

# c_doc = []
# for title in document:
#     if "of" in title:
#         title = title.split("of", 1)[1].strip()
#     if "by" in title:
#         title = title.split("by", 1)[1].strip()
#     if "from" in title:
#         title = title.split("from", 1)[1].strip()
#     else:
#         title = title
#     c_doc.append(title)



In [6]:
def clean(text):
    if 'derived from' in text:
        text = text.split("derived from", 1)[0].strip()
        print('derived from -> ', text)
    if "using" in text:
        text = text.split("using", 1)[1].strip()
        print('using ->' , text)
    if 'adapted' in text:
        text = text.split("adapted", 1)[0].strip()
        print('using ->' , text)
    elif "from" in text:
        text = text.split("from", 1)[1].strip()
        print('from -> ', text)

    if "of" in text:
        text = text.split("of", 1)[1].strip()
        print('of -> ' ,text)

    return text

# Function to pre-filter specific cases before dependency parsing
def pre_filter(text):
    text = text.replace('(', '').replace(')', '').replace('-', ' ').replace('nanocomposites', 'nanocomposite').replace('-',' ')
    # text = clean(text)

    base_material = None
    material_type = None
    secondary_material = None
    continue_parsing = True
    # print ('Before : base:', base_material, 'type:', material_type, 'second: ', secondary_material)

    # If "coated" or "coating" is in text
    if "coated" in text or "coating" in text:
        material_type = "coated"
        parts = text.split("coated" if "coated" in text else "coating")
        if len(parts) == 2:
            secondary_material = parts[0].strip()  # Words before "coated/coating"
            base_material = parts[1].strip()       # Words after "coated/coating"
            continue_parsing  = True

    # If "nano-", "nanoclay", or "nanocomposite" in text
    if any(nano in text for nano in ["nano-", "nanoclay", "nanocomposite"]):
        material_type = "nanocomposite"
        continue_parsing  = True
    
    if 'montmorillonite' in text:
        secondary_material = 'montmorillonite'
        continue_parsing  = True
    
    if base_material and material_type and secondary_material:
        print(text)
        return base_material, material_type, secondary_material, False
    
    # print ('After: base:', base_material, 'type:', material_type, 'second: ', secondary_material)
    return base_material, material_type, secondary_material, continue_parsing



In [71]:

# Function to extract components based on dependency parsing rules
def extract_components(title):
    title = clean(title)
    # Apply pre-filters for known cases
    base_material, material_type, secondary_material, continue_parsing = pre_filter(title)
    # If pre-filter identified all components, return them directly
    
    if not continue_parsing:
        print ('Done before extract!! Doc:', title, 'Base: ', base_material, ' |type:', material_type, ' |secondary: ', secondary_material )
        return base_material, material_type, secondary_material
    else:
        print('Contining Parsing')
        

    # clean of again
    if "of" in title:
        title = title.split("of", 1)[1].strip()
        print('second of -> ' ,title)
        
    doc = nlp(title)
    print('Processing: ' , doc)
    # Dependency parsing logic
    for i, token in enumerate(doc):       
        # Priority 1: Identify Base Material from known keywords
        if not base_material and (token.pos_ == "NOUN" or token.pos_ == "PROPN"):
            # Check for known base material keywords
            if any(keyword in token.text.lower() for keyword in ["starch", "fish gelatin", "chitosan", "acid", "polyethylene", "polypropylene", "zein", "corn", "pla", "soybean", "protein", "methylcellulose", "fibres", "polysaccharide"]):
                base_material = " ".join([child.text for child in token.children if child.dep_ in ("amod", "compound")]) + " " + token.text
                print('Base Material identified:', base_material)
         
        # Priority 2: Check for 'film' or 'films' as base material if no keyword match was found
        if not base_material and token.text.lower() in ["film", "films"]:
            print('text has film(s)')
            # Check if there's a noun before 'film(s)' to use as base material
            for child in token.children:
                print('yes',child.text, ':', child.dep_)
                if child.dep_ == "dobj":
                    base_material = child.text
                    print('Base Material (dobj of film):', base_material)

            if not base_material and i > 0 and doc[i - 1].pos_ == "NOUN":
                base_material = doc[i - 1].text
                for child in doc[i - 1].children:
                    if child.dep_ == "amod":
                        base_material = child.text
                        print('adj of film:', child.text)
            
            # Handle cases like 'chitosan films' where there is no preceding noun
            elif not base_material and token.pos_ == "NOUN":
                base_material = token.text
                for child in token.children:
                    if child.dep_ == "amod":
                        base_material = child.text
                        print('adj of film:', child.text)
        
        # Material Type: Nanocomposite, Biocomposite, etc.
        if token.text.lower() in ["nanocomposite"]:
            material_type = token.text
            for child in token.children:
                if child.dep_ == "amod":
                    base_material = child.text
                    print('adj of nanocomposite: ' , child.text)
                # else:
                #     base_material = None
        
        if token.text.lower() == "coated" or token.text.lower() == "coatings":
            material_type = "coated"
            # If 'coated', get the adjective modifier of 'coated' as the secondary material
            for child in token.children:
                if child.dep_ == "amod":
                    secondary_material = child.text
                    print('adj of coating: ' , child.text)

        

        # Check if "based" is in the title and use its adverbial modifier as base material
        if token.text.lower() == "based":
            for child in token.children:
                if child.dep_ == "npadvmod" and child.text.lower() != "bio":
                    base_material = child.text
                if child.dep_ == 'advmod':
                    base_material = child.text
                    print('adv of base: ' , child.text)
            
            # Collect nouns before "based" as base material if not assigned
            # base_material = " ".join([doc[j].text for j in range(i) if doc[j].pos_ == "NOUN"]).strip()
       
        # Identify Base Material based on common material keywords or structure
        
        
        # Secondary Material: Detecting specific materials or modifiers
        if token.pos_ == "NOUN":
            if "montmorillonite" in token.text.lower():
                secondary_material = "montmorillonite"
                print('adv of base:', secondary_material)
            elif "clay" in token.text.lower():
                secondary_material = "clay"
                print('adv of base:', secondary_material)
            elif "silica" in token.text.lower():
                secondary_material = "silica"
                print('adv of base:', secondary_material)

        
    options = {
        'compact': True,     # Compact mode for clearer display
        'bg': '#f0f0f0',     # Background color
        'color': 'black',    # Arrow color
        'font': 'Arial',     # Font style
        'distance': 100,     # Distance between tokens
        'offset_x': 50,      # Horizontal offset
        'arrow_stroke': 2,   # Width of arrow strokes
        'arrow_width': 8     # Size of arrow heads
    }

    # Visualize the dependency tree with custom options
    displacy.render(doc, style='dep', jupyter=True, options=options)
    
    print ('After: base:', base_material, 'type:', material_type, 'second: ', secondary_material)

    return base_material.strip() if base_material else None, material_type, secondary_material

In [73]:

# Apply the function to each title and collect results
results = [extract_components(title) for title in document]

# Convert results to DataFrame for better readability, including the title as a column
df = pd.DataFrame(results, columns=["Base Material", "Material Type", "Secondary Material"])
df["Title"] = document  # Add titles for reference

# Reorder columns to place Title first
df = df[["Title", "Base Material", "Material Type", "Secondary Material"]]

# # Display the DataFrame
# # print(df)


of ->  starch–sorbitol films
Contining Parsing
Processing:  starch–sorbitol films
Base Material identified:  starch


After: base:  starch type: None second:  None
of ->  bioplastics for food packaging
Contining Parsing
Processing:  bioplastics for food packaging
Base Material identified:  bioplastics


After: base:  bioplastics type: None second:  None
of ->  carrot puree films
Contining Parsing
Processing:  carrot puree films
text has film(s)
yes carrot : nmod
yes puree : amod
adj of film: puree


After: base: puree type: None second:  None
of ->  chitosan-coated greaseproof paper
chitosan coated greaseproof paper
Done before extract!! Doc: chitosan-coated greaseproof paper Base:  greaseproof paper  |type: coated  |secondary:  chitosan
of ->  chitosan coated polyethylene
chitosan coated polyethylene
Done before extract!! Doc: chitosan coated polyethylene Base:  polyethylene  |type: coated  |secondary:  chitosan
using -> micro-perforated poly-lactic acid (PLA) films
Contining Parsing
Processing:  micro-perforated poly-lactic acid (PLA) films
Base Material identified: micro - perforated lactic acid


After: base: micro - perforated lactic acid type: None second:  None
from ->  corn zein and soy protein isolate for olive oil packaging
Contining Parsing
Processing:  corn zein and soy protein isolate for olive oil packaging
Base Material identified: zein corn


After: base: zein corn type: None second:  None
of ->  clay content, homogenization RPM, pH, and ultrasonication on mechanical and barrier properties of fish gelatin montmorillonite nanocomposite films
Contining Parsing
second of ->  fish gelatin montmorillonite nanocomposite films
Processing:  fish gelatin montmorillonite nanocomposite films
text has film(s)
yes gelatin : amod
yes montmorillonite : amod
yes nanocomposite : amod
adj of film: gelatin
adj of film: montmorillonite
adj of film: nanocomposite


After: base: nanocomposite type: nanocomposite second:  montmorillonite
of ->  plasticizers and fatty acids on mechanical and permeability characteristics of chitosan films
Contining Parsing
second of ->  chitosan films
Processing:  chitosan films
text has film(s)
yes chitosan : amod
adj of film: chitosan


After: base: chitosan type: None second:  None
derived from ->  Exploring the potentialities of using lignocellulosic fibres
using -> lignocellulosic fibres
Contining Parsing
Processing:  lignocellulosic fibres
Base Material identified: lignocellulosic fibres


After: base: lignocellulosic fibres type: None second:  None
of ->  nanocomposites based on in situ polymerized poly(n-butyl methacrylate) in the presence of surface modified montmorillonite
Contining Parsing
second of ->  surface modified montmorillonite
Processing:  surface modified montmorillonite


After: base: None type: nanocomposite second:  montmorillonite
of ->  polypropylene nanocomposites with thermally-stable imidazolium modified clay
Contining Parsing
Processing:  polypropylene nanocomposites with thermally-stable imidazolium modified clay
adv of base: clay


After: base: None type: nanocomposite second:  clay
of ->  poly(lactic acid)
Contining Parsing
Processing:  poly(lactic acid)
Base Material identified: poly(lactic acid


After: base: poly(lactic acid type: None second:  None
of ->  polyether-based polyurethane–silica nanocomposite membranes
Contining Parsing
Processing:  polyether-based polyurethane–silica nanocomposite membranes
adv of base: silica


After: base: polyether type: nanocomposite second:  silica
using -> Gas transfer properties of wheat gluten coated paper
of ->  wheat gluten coated paper
wheat gluten coated paper
Done before extract!! Doc: wheat gluten coated paper Base:  paper  |type: coated  |secondary:  wheat gluten
of ->  polypropylene clay composite membranes
Contining Parsing
Processing:  polypropylene clay composite membranes
Base Material identified:  polypropylene
adv of base: clay


After: base:  polypropylene type: None second:  clay
Contining Parsing
Processing:  Soluble soybean polysaccharide A new carbohydrate to make a biodegradable film for sustainable green packaging
Base Material identified: Soluble soybean


After: base: Soluble soybean type: None second:  None
of ->  novel galactomannans as edible coatings for tropical fruits
novel galactomannans as edible coatings for tropical fruits
Done before extract!! Doc: novel galactomannans as edible coatings for tropical fruits Base:  s for tropical fruits  |type: coated  |secondary:  novel galactomannans as edible
of ->  PLA nanoclay composite films
Contining Parsing
Processing:  PLA nanoclay composite films
Base Material identified:  PLA
adv of base: clay


After: base:  PLA type: nanocomposite second:  clay
of ->  a predictive model coupling gas transfer and microbial growth in fresh food packed under modified atmosphere
Contining Parsing
Processing:  a predictive model coupling gas transfer and microbial growth in fresh food packed under modified atmosphere


After: base: None type: None second:  None
of ->  corn–zein coated polypropylene films
corn–zein coated polypropylene films
Done before extract!! Doc: corn–zein coated polypropylene films Base:  polypropylene films  |type: coated  |secondary:  corn–zein
of ->  methylcellulose-based edible films
Contining Parsing
Processing:  methylcellulose-based edible films
adv of base:  methylcellulose


After: base: methylcellulose type: None second:  None


In [74]:
extract_components('Suitability of novel galactomannans as edible coatings for tropical fruits')

of ->  novel galactomannans as edible coatings for tropical fruits
novel galactomannans as edible coatings for tropical fruits
Done before extract!! Doc: novel galactomannans as edible coatings for tropical fruits Base:  s for tropical fruits  |type: coated  |secondary:  novel galactomannans as edible


('s for tropical fruits', 'coated', 'novel galactomannans as edible')

In [41]:
extract_components('Effect of plasticizers and fatty acids on mechanical and permeability characteristics of chitosan films')

of ->  plasticizers and fatty acids on mechanical and permeability characteristics of chitosan films
Contining Parsing
second of ->  chitosan films
Processing:  chitosan films
text has film(s)
adj of film: chitosan


After: base: chitosan type: None second:  None


('chitosan', None, None)