## This notebook explores and implements the construction of new property parsers for the `ChemDataExtractor` python toolkit.

These first cells are directly taken from the tutorial .ipynb at https://github.com/CambridgeMolecularEngineering/chemdataextractor/blob/master/examples/extracting_a_custom_property.ipynb

Any comments were added by Wesley Tatum for clarity

--------------------

In [1]:
import re
from chemdataextractor import Document
from chemdataextractor.model import Compound
from chemdataextractor.doc import Paragraph, Heading
from chemdataextractor.model import BaseModel, StringType, ListType, ModelType
from chemdataextractor.parse import R, I, W, Optional, merge
from chemdataextractor.parse.base import BaseParser
from chemdataextractor.utils import first

In [29]:
#create a sample document that emulates the kind of sentence in which the
#property might be reported in an article: a heading that names the compound,
#followed by a paragraph that gives the boiling point in parentheses
d = Document(
    Heading(u'Synthesis of 2,4,6-trinitrotoluene (3a)'),
    Paragraph(u'The procedure was followed to yield a pale yellow solid (b.p. 240 °C)')
)

#bare last expression so the notebook displays the document
d

In [30]:
#CDE base classes do not include boiling point extraction, so at this point
#the serialized records only contain the compound name, its label, and its
#role (see the output below) -- the "b.p. 240 °C" text is not picked up yet
d.records.serialize()

[{'names': ['2,4,6-trinitrotoluene'], 'labels': ['3a'], 'roles': ['product']}]

In [31]:
#define a new "schema" for the new property and add it to the Compound model
#(for a permanent install, this class definition would be added to
#cde/cde/model.py)

#name of property class
class BoilingPoint(BaseModel):
    """Schema for one reported boiling point: a value and its units, both kept as strings."""
    #the type of the values that will be assigned to the class's fields
    #as far as I know, it should always be `StringType()`
    value = StringType()
    units = StringType()

#attach the new property to Compound as a list of BoilingPoint models, so a
#compound record can carry `boiling_points`
Compound.boiling_points = ListType(ModelType(BoilingPoint))

In [93]:
#Define the new property model's text matching using the cde parse elements

#from cde/cde/parse/elements.py:
#R = Regex: match token text with regex
#I = IWord: case insensitive match to token text
#W = Word: match token exactly
#Optional: not necessary for regex match to occur

#from cde/cde/parse/actions.py
#merge: join tokens into a single string w/o spaces b/w them

#NOTE: every regex pattern below is a raw string (r'...'). In a normal string
#literal, sequences such as `\.` and `\d` are invalid escapes that Python
#warns about (DeprecationWarning, SyntaxWarning on newer versions); the raw
#form passes exactly the same pattern text to the regex engine.

#keyword matching for text that triggers value scraping: either a single
#"b.p."-style token (dots optional, any case) or the two tokens "boiling point".
#`+` binds tighter than `|`, so this reads as  R(...) | (I(...) + I(...)).
#.hide() keeps the trigger words out of the parse result.
bprefix = (R(r'^b\.?p\.?$', re.I) | I(u'boiling') + I(u'point')).hide()

#for the reported units and values, make sure to add (u'units') and (u'value')
#units: a degree sign optionally followed by C, F or K; merge joins the two
#tokens into one string such as '°C'
bunits = (W(u'°') + Optional(R(r'^[CFK]\.?$')))(u'units').add_action(merge)
#value: an unsigned integer or decimal number occupying the whole token
bvalue = R(r'^\d+(\.\d+)?$')(u'value')

#combine everything into a single, labeled regex-matching pattern
bp = (bprefix + bvalue + bunits)(u'bp')

In [94]:

class BpParser(BaseParser):
    """Parser that converts a match of the `bp` pattern into a Compound record.

    Each match produces one Compound whose `boiling_points` list holds a single
    BoilingPoint built from the matched value and units.
    """
    #`root` holds the pattern this parser matches against
    #(presumably the attribute BaseParser reads -- confirm in cde/parse/base.py)
    root = bp

    def interpret(self, result, start, end):
        """Yield a Compound carrying the boiling point found in `result`.

        `result` is queried with XPath: './value/text()' and './units/text()'
        select the text of the children labelled 'value' and 'units' in the
        `bp` pattern. `first` is applied to each query result to pick a single
        item (presumably None when nothing matched -- confirm in cde/utils.py).
        `start` and `end` are accepted but not used here.
        """
        value_text = first(result.xpath('./value/text()'))
        units_text = first(result.xpath('./units/text()'))
        boiling_point = BoilingPoint(value=value_text, units=units_text)
        yield Compound(boiling_points=[boiling_point])

In [95]:
#register the new parser on Paragraph elements
#NOTE: this is an assignment, not an append -- it replaces whatever list was
#previously stored in Paragraph.parsers with a list holding only BpParser
Paragraph.parsers = [BpParser()]

In [96]:
#re-initialize the same document and serialize it, which applies all of the
#registered parsers and reports entities, properties, roles, etc.
#(the document is rebuilt rather than reused so that it is parsed with the
#parser list that was just assigned)

d = Document(
    Heading(u'Synthesis of 2,4,6-trinitrotoluene (3a)'),
    Paragraph(u'The procedure was followed to yield a pale yellow solid (b.p. 240 °C)')
)

#now that the BP classes have been added, new properties can be parsed
d.records.serialize()

[{'names': ['2,4,6-trinitrotoluene'],
  'labels': ['3a'],
  'roles': ['product'],
  'boiling_points': [{'value': '240', 'units': '°C'}]}]

## Notes:


 - Adding parsers and property classes this way is only temporary and lasts only until the kernel is re-initialized. In order to permanently add these properties to CDE, they need to be added to the package's actual source files
   - Property class is added to `cde/cde/model.py` and also initialized within the `Compound()` class at the bottom of the same `model.py`
   - Property parser class is added as a separate .py file within `cde/cde/parse/`. This means that it will have to be imported in `cde/cde/doc/text.py`
   - Property parser class needs to be initialized in `cde/cde/doc/text.py` in the `Paragraph()` class definition
   - For contextual information, such as the apparatus used to take a measurement, modify `cde/cde/parse/context.py` (make sure to mark contextual = True in parser definition)
   
   
 - If I want to extract these properties from tables, they need to be added to `cde/cde/parse/table.py`
     - For table extraction, HTML files are needed, not text or pdf
   
-------------------------------

## Now to practice defining another simple property class and parser

It would be most useful to practice with properties that are pertinent to corrosion inhibitors, especially properties that show up in a lot of our different interests. I'll practice with one of them:

 - HOMO (IP)

In [99]:
class HOMOLevel(BaseModel):
    """Schema for one reported HOMO energy level: a value and its units, both kept as strings."""
    value = StringType()
    units = StringType()

#attach the new property to Compound as a list of HOMOLevel models
Compound.HOMO_level = ListType(ModelType(HOMOLevel))

#keyword matching for text that triggers value scraping
#
#I() compares its argument with ONE token, case-insensitively. It is not a
#regex and it cannot span several words, so patterns such as I('^HOMO$') or
#I('HOMO level') can never match. Multi-word phrases are written here as a
#sequence of single-token matches joined with `+`.
#
#The keyword itself is REQUIRED: if every part of the prefix were optional,
#any "<number> eV" in the text (LUMO levels, bandgaps, ...) would be reported
#as a HOMO level.
homo_keyword = (
    I('EHOMO')
    | (I('HOMO') + Optional(I('energy')) + Optional(I('level')))
)

#optional linking words between the keyword and the number; the longer
#alternative "(is) equal to" is listed before the bare "is" so that it gets
#the first chance to match
homo_link = Optional(
    I('of')
    | W('=')
    | (Optional(I('is')) + I('equal') + I('to'))
    | I('is')
)

#.hide() keeps the trigger words out of the parse result, like `bprefix` does
prefix = (Optional(W('a')) + homo_keyword + homo_link).hide()

#for the reported units and values, make sure to add (u'units') and (u'value')
#units: anchored so that only a token that is exactly "eV" / "eV." is accepted
#(an unanchored pattern would also accept tokens such as "meV")
units = R(r'^eV\.?$')('units')

#value: a signed integer or decimal number. The sign may arrive attached to
#the number or as its own token (depends on the tokenizer -- TODO confirm on
#real text), and may be a hyphen, en dash or Unicode minus; merge joins a
#separate sign token and the number back into a single string.
value = (
    Optional(R(r'^[\-–−]$')) + R(r'^[\+\-–−]?\d+(\.\d+)?$')
)('value').add_action(merge)

#combine everything into a single, labeled regex-matching pattern
homo = (prefix + value + units)('homo')


class HOMOParser(BaseParser):
    """Parser that converts a match of the `homo` pattern into a Compound record.

    Each match produces one Compound whose `HOMO_level` list holds a single
    HOMOLevel built from the matched value and units.
    """
    #pattern this parser matches against
    root = homo

    def interpret(self, result, start, end):
        """Yield a Compound carrying the HOMO level found in `result`.

        The XPath queries select the text of the children labelled 'value' and
        'units' in the `homo` pattern; `first` picks a single item from each
        query result. `start` and `end` are accepted but not used here.
        """
        level_value = first(result.xpath('./value/text()'))
        level_units = first(result.xpath('./units/text()'))
        homo_level = HOMOLevel(value=level_value, units=level_units)
        yield Compound(HOMO_level=[homo_level])
        
#register the HOMO parser alongside the boiling-point parser. Assigning
#[HOMOParser()] alone would replace the list and drop BpParser, so the
#"b.p. 240 °C" paragraph in the test document would no longer be parsed.
Paragraph.parsers = [BpParser(), HOMOParser()]

In [2]:
#test document for the HOMO parser: two paragraphs that report a HOMO level
#in different phrasings (the first also mentions a LUMO level and a bandgap,
#which should NOT be reported as HOMO levels), followed by the boiling point
#example from earlier in the notebook
#NOTE: this cell needs the import cell and the parser cells above to have been
#run in the current kernel
d = Document(
    Heading(u'Molecular energy levels of poly(3-hexylthiophene) (P3HT)'),
    Paragraph('P3HT has a HOMO level of -4.9 eV, while the LUMO level is -3.0 eV. This corresponds to a bandgap of 1.9 eV.'),
    Paragraph(u'For P3HT, EHOMO is -4.9 eV'),
    Heading(u'Synthesis of 2,4,6-trinitrotoluene (3a)'),
    Paragraph(u'The procedure was followed to yield a pale yellow solid (b.p. 240 °C)')
)
d

NameError: name 'Document' is not defined

In [1]:
#serialize the records extracted from the test document
#(requires the cell that builds `d` to have run in the current kernel)
d.records.serialize()

NameError: name 'd' is not defined

In [3]:
#scratch check: confirm that the pattern '/' matches a literal slash at the
#start of a string
test_str = '/'

slash_pattern = re.compile('/')
print(slash_pattern.match(test_str))

<re.Match object; span=(0, 1), match='/'>
