In [None]:
import requests
from SPARQLWrapper import SPARQLWrapper, JSON
from pprint import pprint
import io
import re

from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage

In [None]:
# for entity recognition of compounds without cas numbers
def link_compound2chebi(compound):
    """
    Use the NCBO Annotator from BioPortal to return ChEBI annotations
    for substrates and products of reactions from Expasy enzyme.

    Args:
        compound: text (e.g. a compound name) submitted to the annotator.

    Returns:
        The annotator response, decoded from JSON by ``requests``.

    Note:
        The BioPortal API key is read from the module-level name ``api_key``,
        which must be defined before this function is called.
    """
    url = 'http://data.bioontology.org/annotator'
    params = dict(
        apikey=api_key,
        text=compound,
        ontologies='CHEBI',
        longest_only='true',
        include='properties',
        # previously misspelled 'exlude_numbers', so the option never reached the service
        exclude_numbers='false',
        exclude_synonyms='false',
        # previously misspelled 'mappins'
        # NOTE(review): confirm against the NCBO Annotator parameter list that
        # 'mappings' is the option the service honours for mapping expansion.
        mappings='all',
    )
    tm_results = requests.get(url=url, params=params)
    return tm_results.json()

In [None]:
def execute_query(query):
    """Send a SPARQL query to the Wikidata endpoint and return the converted JSON result."""
    wikidata = SPARQLWrapper('https://query.wikidata.org/sparql')
    wikidata.setReturnFormat(JSON)
    wikidata.setQuery(query)
    response = wikidata.query()
    return response.convert()

# for mapping the cas number to chebi id using wikidata's sparql endpoint
def map_cas_to_chebi_wd(cas_number):
    """Look up the Wikidata item(s) carrying a CAS number and their ChEBI ids.

    Args:
        cas_number: CAS registry number, matched against property P231.

    Returns:
        Whatever ``execute_query`` returns for the query; each binding selects
        ``compoundLabel``, ``compound`` and ``chebi`` (property P683).
    """
    # The value is embedded in a single-quoted SPARQL string literal, so the
    # characters that would terminate or break that literal are escaped.
    # The backslash is handled first so the escapes added below stay intact.
    literal = str(cas_number)
    for raw, escaped in (('\\', '\\\\'), ("'", "\\'"), ('\n', '\\n'), ('\r', '\\r')):
        literal = literal.replace(raw, escaped)
    query = '''
    select ?compoundLabel ?compound ?chebi where {
      ?compound wdt:P231 '%s';
                wdt:P683 ?chebi.
      SERVICE wikibase:label {
            bd:serviceParam wikibase:language "en" .
      }
    }
    ''' % (literal,)
    return execute_query(query)

In [None]:
# uses pdfminer to extract text from a pdf
def convert_pdf_to_txt(path):
    """Extract the text of a PDF file as a single string.

    Args:
        path: filesystem path of the PDF to read.

    Returns:
        str: everything the pdfminer ``TextConverter`` wrote while the
        pages were processed.

    Raises:
        OSError: if the file cannot be opened.
        Any exception raised by pdfminer while parsing; the file handle,
        device and buffer are released before it propagates.
    """
    rsrcmgr = PDFResourceManager()
    retstr = io.StringIO()
    device = TextConverter(rsrcmgr, retstr, codec='utf-8', laparams=LAParams())
    try:
        # `with` releases the file handle even when a page fails to parse
        with open(path, 'rb') as fp:
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            # empty page-number set and maxpages=0: no page restriction requested
            for page in PDFPage.get_pages(fp, set(), maxpages=0,
                                          password="",
                                          caching=True,
                                          check_extractable=True):
                interpreter.process_page(page)
        # read the buffer before the finally clause closes it
        return retstr.getvalue()
    finally:
        device.close()
        retstr.close()

In [None]:
# extract the text of data/toc_headings.pdf into a single string
document = convert_pdf_to_txt('data/toc_headings.pdf')

In [None]:
# start the parsing to filter out lines we don't care about
# A line is dropped when it is empty or contains '_', 'Updated Tables' or
# '(cid:'. Leading form feeds ('\x0c') are stripped *before* the emptiness
# check, so a line holding nothing but a form feed no longer survives as ''.
# NOTE(review): '(cid:' presumably marks glyphs pdfminer could not map to
# text, and '_' ruled filler lines — confirm against the source PDF.
lines = [
    text
    for text in (raw.lstrip('\x0c') for raw in document.split('\n'))
    if text and not any(marker in text for marker in ('_', 'Updated Tables', '(cid:'))
]

In [None]:
# start the actual work
# For every line mentioning 'CAS', look the number up on Wikidata and print
# either the compound label with its ChEBI id, or why no single mapping exists.
for line in lines:
    if 'CAS' not in line:
        continue
    # the CAS number is taken to be the last whitespace-separated token
    cas = line.split()[-1]
    results = map_cas_to_chebi_wd(cas)['results']['bindings']
    if len(results) == 1:
        print(results[0]['compoundLabel']['value'], results[0]['chebi']['value'])
    elif results:
        # several bindings came back: report that instead of claiming there is none
        print(cas, 'ambiguous wikidata mapping (%d matches)' % len(results))
    else:
        print(cas, 'no wikidata mapping')
# TODO: unfinished follow-up for the non-CAS lines, kept from the original draft:
#     if not line.startswith('CAS') and not re.search('^\s*[0-9]', line):
    