In [1]:
import requests
from SPARQLWrapper import SPARQLWrapper, JSON
from pprint import pprint
import io
import re

from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage

In [2]:
# for entity recognition of compounds without cas numbers
def link_compound2chebi(compound):
    """
    used NCBO Annotator from BioPortal to return ChEBI IDS
    for substrates and products of reactions from Expasy enzyme
    """
    url = 'http://data.bioontology.org/annotator'
    params = dict(apikey=api_key, text=compound, ontologies='CHEBI', longest_only='true',
                  include='properties', exlude_numbers='false', exclude_synonyms='false', mappins='all')
    tm_results = requests.get(url=url, params=params)
    return tm_results.json()

In [18]:
def execute_query(query):
    endpoint = SPARQLWrapper('https://query.wikidata.org/sparql')
    endpoint.setQuery(query)
    endpoint.setReturnFormat(JSON)
    return endpoint.query().convert()

# for mapping the cas number to chebi id using wikidata's sparql endpoint
def map_cas_to_chebi_wd(cas_number):
    query = '''
    select ?compoundLabel ?compound ?chebi where {
      ?compound wdt:P231 '%s';
                wdt:P683 ?chebi.
      SERVICE wikibase:label {
            bd:serviceParam wikibase:language "en" .
      }
    }
    ''' % (cas_number)
    results = execute_query(query)
    results = results['results']['bindings']
    return_val = ''
    if len(results) == 1:
        return_val = results[0]
    else:
        print(cas, 'no wikidata mapping')

In [4]:
# uses pdfminer2 to extract text from a pdf
def convert_pdf_to_txt(path):
    rsrcmgr = PDFResourceManager()
    retstr = io.StringIO()
    codec = 'utf-8'
    laparams = LAParams()
    device = TextConverter(rsrcmgr, retstr, codec=codec, laparams=laparams)
    fp = open(path, 'rb')
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    password = ""
    maxpages = 0
    caching = True
    pagenos = set()

    for page in PDFPage.get_pages(fp, pagenos, maxpages=maxpages,
                                  password=password,
                                  caching=caching,
                                  check_extractable=True):
        interpreter.process_page(page)

    text = retstr.getvalue()

    fp.close()
    device.close()
    retstr.close()
    return text

In [5]:
# extract text to string from a 
document = convert_pdf_to_txt('../data/toc_headings.pdf')

In [6]:
document

"Complete Table of Contents - Volumes One and Two\n\nVolume One\n\nGeneral Information\n\n1 Introduction and Public Health Uses\n4 What’s New and Different?\n6 Data Sources and Data Analysis\n12 Interpretation of Report and Updated Tables Data: Important Factors\n14 Calculation of Urinary Inorganic-related Arsenic Species\n15 Calculation of PFOS and PFOA as the Sum of the Isomers\n17 Chemical and Toxicological Information\n\nAdducts of Hemoglobin\n\n19 Acrylamide\nCAS No. 79-06-1\n20 Glycidamide\nCAS No. 486-56-6\n\nTobacco Smoke\n21 Cotinine\n\nCAS No. 486-56-6\n\n24 Hydroxycotinine\n25 NNAL\n\nCAS No. 76014-81-8\n\nDisinfection By-Products\n\n29 Bromodichloromethane\n\nCAS No. 75-27-4\n\n31 Dibromochloromethane (Chlorodibromomethane)\n\nCAS No. 124-48-1\n\n33 Tribromomethane (Bromoform)\n\nCAS No. 75-25-2\n\n35 Trichloromethane (Chloroform)\n\nCAS No. 67-66-3\n\nPersonal Care and Consumer Product Chemicals and Metabolites\n\n37 Benzophenone-3\n\nCAS No. 131-57-7\n41 Bisphenol A\nCAS 

In [7]:
lines = document.split('\n')
lines = [x for x in lines if x]

In [8]:
lines

['Complete Table of Contents - Volumes One and Two',
 'Volume One',
 'General Information',
 '1 Introduction and Public Health Uses',
 '4 What’s New and Different?',
 '6 Data Sources and Data Analysis',
 '12 Interpretation of Report and Updated Tables Data: Important Factors',
 '14 Calculation of Urinary Inorganic-related Arsenic Species',
 '15 Calculation of PFOS and PFOA as the Sum of the Isomers',
 '17 Chemical and Toxicological Information',
 'Adducts of Hemoglobin',
 '19 Acrylamide',
 'CAS No. 79-06-1',
 '20 Glycidamide',
 'CAS No. 486-56-6',
 'Tobacco Smoke',
 '21 Cotinine',
 'CAS No. 486-56-6',
 '24 Hydroxycotinine',
 '25 NNAL',
 'CAS No. 76014-81-8',
 'Disinfection By-Products',
 '29 Bromodichloromethane',
 'CAS No. 75-27-4',
 '31 Dibromochloromethane (Chlorodibromomethane)',
 'CAS No. 124-48-1',
 '33 Tribromomethane (Bromoform)',
 'CAS No. 75-25-2',
 '35 Trichloromethane (Chloroform)',
 'CAS No. 67-66-3',
 'Personal Care and Consumer Product Chemicals and Metabolites',
 '37 Be

In [9]:
# start the parsing to filter out lines we don't care about
lines = [x for x in lines if '_____' not in x]
lines = [x for x in lines if 'Updated Tables' not in x]
lines = [x.lstrip('\x0c') for x in lines if '(cid:' not in x]

In [10]:
lines = lines[9:]

In [11]:
lines

['Adducts of Hemoglobin',
 '19 Acrylamide',
 'CAS No. 79-06-1',
 '20 Glycidamide',
 'CAS No. 486-56-6',
 'Tobacco Smoke',
 '21 Cotinine',
 'CAS No. 486-56-6',
 '24 Hydroxycotinine',
 '25 NNAL',
 'CAS No. 76014-81-8',
 'Disinfection By-Products',
 '29 Bromodichloromethane',
 'CAS No. 75-27-4',
 '31 Dibromochloromethane (Chlorodibromomethane)',
 'CAS No. 124-48-1',
 '33 Tribromomethane (Bromoform)',
 'CAS No. 75-25-2',
 '35 Trichloromethane (Chloroform)',
 'CAS No. 67-66-3',
 'Personal Care and Consumer Product Chemicals and Metabolites',
 '37 Benzophenone-3',
 'CAS No. 131-57-7',
 '41 Bisphenol A',
 'CAS No. 80-05-7',
 '45 Bisphenol F',
 '47 Bisphenol S',
 '49 4-tert-Octylphenol',
 'CAS No. 140-66-9',
 '51 Triclocarban',
 'CAS No. 101-20-2',
 '53 Triclosan',
 'CAS No. 3380-34-5',
 '57 Butyl paraben',
 'CAS No. 94-26-8',
 '61 Ethyl paraben',
 'CAS No. 120-47-8',
 '65 Methyl paraben',
 'CAS No. 99-76-3',
 '69 n-Propyl paraben',
 'CAS No. 94-13-3',
 '73 2,4-Dichlorophenol',
 'CAS No. 120-8

In [14]:
from collections import OrderedDict
heading_index = OrderedDict()
for index, line in enumerate(lines):
    try:
        if not line[0].isdigit() and not line.startswith('CAS'):
            heading_index[line] = {'index':index,
                                   'chemicals': [],
                                   'next_index': '',
                                  }
    except Exception as e:
        print(e)
for key, value in heading_index.items():
    next_key = list(heading_index.keys()).index(key) + 1
    try:
        this_index = value
        next_index = heading_index[list(heading_index.keys())[next_key]]['index']
        heading_index[key]['next_index'] = next_index
    except Exception as e:
        print(e)

string index out of range
list index out of range


In [15]:
heading_index

OrderedDict([('Adducts of Hemoglobin',
              {'index': 0, 'chemicals': [], 'next_index': 5}),
             ('Tobacco Smoke',
              {'index': 5, 'chemicals': [], 'next_index': 11}),
             ('Disinfection By-Products',
              {'index': 11, 'chemicals': [], 'next_index': 20}),
             ('Personal Care and Consumer Product Chemicals and Metabolites',
              {'index': 20, 'chemicals': [], 'next_index': 45}),
             ('Flame Retardant Metabolites',
              {'index': 45, 'chemicals': [], 'next_index': 55}),
             ('Fungicides and Metabolites',
              {'index': 55, 'chemicals': [], 'next_index': 64}),
             ('Herbicides and Metabolites',
              {'index': 64, 'chemicals': [], 'next_index': 80}),
             ('Sulfonyl Urea Herbicides',
              {'index': 80, 'chemicals': [], 'next_index': 115}),
             ('Insect Repellent and Metabolites',
              {'index': 115, 'chemicals': [], 'next_index': 122}),


In [21]:
# start the actual work
for key, value in heading_index.items():
    for index, line in enumerate(lines):
        try:
            if 'CAS' in line:
                if index > int(value['index']) and index < int(value['next_index']):
                    cas = line.lstrip('CAS No. ')
                    chebi_results = map_cas_to_chebi_wd(cas)
                    print(chebi_results)
                    value['chemicals'].append(line)
        except Exception as e:
            print(line, e)      
        
#         results = 
    

None
None
None
None
None
None
None
None
None
None
None
None
3380-34-5 no wikidata mapping
None
None
None
None
None
None
None
None
None
87-86-5 no wikidata mapping
None
2122-19-2 no wikidata mapping
None
None
138722-96-0 no wikidata mapping
None
None


KeyboardInterrupt: 