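# Extracting Solubility

This notebook first runs ChemDataExtractor's stock extraction on a sample sentence and two saved HTML files, then defines a custom `Solubility` model, grammar and parser and re-runs the extraction with that parser installed.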

In [1]:
from chemdataextractor import Document
from chemdataextractor.model import Compound
from chemdataextractor.doc import Paragraph, Heading

In [2]:
# Sanity check before any custom parser is installed: wrap one sentence that
# mentions a compound and a melting point in a Document and serialize the
# records extracted from it.
melting_point_paragraph = Paragraph(u'The procedure was followed to yield a pale yellow solid Hippeastrine Hydrobromide. ( melting point 137 °C)')
d = Document(melting_point_paragraph)
d.records.serialize()

[{u'names': [u'Hippeastrine Hydrobromide']},
 {u'melting_points': [{u'units': u'\xb0C', u'value': u'137'}]}]

In [3]:
# Extract records from a saved HTML article.
# The file is opened here, in binary mode, and closed as soon as from_file has
# read it. Passing the bare path instead leaves opening the file to the library,
# with nothing in this cell that closes the handle afterwards.
with open("../../test_mp.htm", "rb") as html_file:
    d = Document.from_file(html_file)
d.records.serialize()

[{u'names': [u'aromatic phenyl']},
 {u'names': [u'metalloporphyrins']},
 {u'names': [u'Pd']},
 {u'names': [u'Pt octaethylporphyrin']},
 {u'names': [u'platinum']},
 {u'names': [u'quartz-halogen']},
 {u'names': [u'quartz']},
 {u'names': [u'thiophene-substitutents']},
 {u'names': [u'arylboronic acid']},
 {u'names': [u'2-Bu3Sn-thiophene']},
 {u'names': [u'Pd2(dba)3']},
 {u'names': [u'CF3-Ph-B(OR)2']},
 {u'names': [u'4-pyridine boronic acid pinacole ester']},
 {u'names': [u'Aliquat 336']},
 {u'names': [u'MeOH']},
 {u'names': [u'C25H17N']},
 {u'names': [u'1538 ( m ) 1519 ( w ) 1495 ( m ) 1438 ( s ) 1390 ( s ) 1324 ( m ), 1254 ( m ) 1212']},
 {u'names': [u'4-(trifluoromethyl)phenyl boronic acid']},
 {u'names': [u'C27H17F3']},
 {u'names': [u'4-cyanophenyl boronic acid']},
 {u'names': [u'ethanol']},
 {u'names': [u'C27H17N']},
 {u'names': [u'C27H20O']},
 {u'names': [u'C24H16S']},
 {u'names': [u'C22H14S2']},
 {u'names': [u'C25H15F3S']},
 {u'names': [u'C27H19F3O']},
 {u'names': [u'thiophene\u2013a

In [6]:
# Extract records from a second saved HTML article.
# The file is opened here, in binary mode, and closed as soon as from_file has
# read it. Passing the bare path instead leaves opening the file to the library,
# with nothing in this cell that closes the handle afterwards.
with open("../../test1.htm", "rb") as html_file:
    d = Document.from_file(html_file)
d.records.serialize()

[{u'names': [u'fluoroquinolone']},
 {u'names': [u'piperazine']},
 {u'names': [u'methanol']},
 {u'names': [u'propanol']},
 {u'names': [u'acetone']},
 {u'names': [u'chloroform']},
 {u'names': [u'lactate']},
 {u'names': [u'mesylic']},
 {u'names': [u'glycolic acid']},
 {u'names': [u'saccharin']},
 {u'names': [u'methapyrilene']},
 {u'names': [u'amine']},
 {u'names': [u'carboxyl']},
 {u'names': [u'sulfate']},
 {u'names': [u'Plasdone']},
 {u'names': [u'methacrylic acid methyl methacrylate copolymer']},
 {u'names': [u'methacrylic acid ethyl acrylate']},
 {u'names': [u'poly(acrylic acid )']},
 {u'names': [u'OH']},
 {u'names': [u'hydroxypropyl methylcellulose acetate succinate']},
 {u'names': [u'polyvinyl caprolactam']},
 {u'names': [u'polyvinyl acetate']},
 {u'names': [u'polyethylene glycol']},
 {u'names': [u'Ludwigshafen']},
 {u'names': [u'Dodecane, triethylamine']},
 {u'names': [u'l-\u03b1-phosphatidylcholine']},
 {u'names': [u'phosphate']},
 {u'names': [u'Powder X-ray Diffraction', u'PXRD']}

In [None]:
from chemdataextractor.model import BaseModel, StringType, ListType, ModelType

class Solubility(BaseModel):
    """A single solubility measurement to be attached to a compound record.

    Both fields are declared as StringType, so the number and its unit are
    stored as text rather than as numeric values.
    """
    # Numeric part of the measurement, kept as a string.
    value = StringType()
    # Unit that accompanies the value, kept as a string.
    units = StringType()
    
# Register a list-of-Solubility field on the imported Compound class. This is an
# assignment on the class itself, so it applies wherever Compound is used in
# this kernel session, not only in this cell.
Compound.solubility = ListType(ModelType(Solubility))

In [None]:
import re
from chemdataextractor.parse import R, I, W, Optional, merge

# Grammar for a solubility mention such as "of 28 mg" or "about 1.5 μM":
# an optional lead-in, a numeric value, then a unit. It follows the same
# prefix + value + units shape used for other ChemDataExtractor properties.

# Optional lead-in words. `+` binds tighter than `|`, so the second group reads
# ("in" "the" "range" ["of"]) | "about". Both groups are marked with .hide() so
# the lead-in words are matched but not reported in the parse result.
prefix = (
    Optional(W('=') | I('of') | I('was') | I('is') | I('at')).hide()
    + Optional(I('in') + I('the') + I('range') + Optional(I('of')) | I('about')).hide()
)

# Integer or decimal number occupying a whole token. Written as a raw string so
# that `\d` and `\.` reach the regex engine verbatim instead of being read as
# (invalid) string escape sequences, which newer Python versions warn about.
value = R(r'^\d+(\.\d+)?$')(u'value')

# Unit token, optionally extended by a per-volume part so that "mg/mL" is kept
# whole instead of being cut down to "mg". Both tokenizations are accepted:
# a single "mg/mL" token, or the three tokens "mg", "/", "mL". The bare units
# (nM, μM, mM, μg, mg) still match exactly as before. merge joins the matched
# tokens into one string with no separator.
units = (
    R(u'^(nM|μM|mM|μg|mg)(/(mL|ml|L|l))?$')
    + Optional(W(u'/') + R(u'^(mL|ml|L|l)$'))
)(u'units').add_action(merge)

# Full solubility phrase; the parser reads ./value and ./units from this node.
so = (prefix + value + units)(u'so')

In [None]:
from chemdataextractor.parse.base import BaseParser
from chemdataextractor.utils import first

class SoParser(BaseParser):
    """Parser that converts each match of the ``so`` grammar into a Compound."""

    # Grammar element whose matches are handed to interpret().
    root = so

    def interpret(self, result, start, end):
        """Yield one Compound holding the solubility read from ``result``.

        :param result: Parse result node; its ``value`` and ``units`` children
            supply the text for the measurement.
        :param start: Not used by this method.
        :param end: Not used by this method.
        """
        # first() is applied to each XPath hit list to pick a single entry
        # (presumably None when nothing matched -- confirm against the helper).
        number_text = first(result.xpath('./value/text()'))
        unit_text = first(result.xpath('./units/text()'))
        measurement = Solubility(value=number_text, units=unit_text)
        yield Compound(solubility=[measurement])


In [None]:
# Assign a new one-element parser list on the Paragraph class. This replaces
# whatever list was there before rather than extending it, and because it is
# set on the class it is not limited to a single Paragraph instance.
# NOTE(review): re-running the earlier extraction cells after this one will
# presumably produce different records than their saved outputs -- confirm, or
# restore the previous list when the experiment is done.
Paragraph.parsers = [SoParser()]

In [None]:
# Demo input for the solubility parser: a heading that names the compound,
# followed by a sentence that states a solubility.
demo_elements = [
    Heading(u'Synthesis of 2,4,6-trinitrotoluene (3a)'),
    Paragraph(u'The procedure was followed to yield a pale yellow solid (solubility of 28 mg/mL)'),
]
d = Document(*demo_elements)
d.records.serialize()


In [None]:
# Re-run the extraction on the saved HTML article now that Paragraph.parsers
# has been reassigned earlier in the notebook.
# The file is opened here, in binary mode, and closed as soon as from_file has
# read it. Passing the bare path instead leaves opening the file to the library,
# with nothing in this cell that closes the handle afterwards.
with open("../../test1.htm", "rb") as html_file:
    d = Document.from_file(html_file)
d.records.serialize()