In [1]:
import bioservices
import json
import re
import importlib
import pandas as pd
from pathlib import Path

In [2]:
import cache
from helpers import log 
from helpers import should_log
from helpers import print_percent_done
# importlib.reload(cache)
# importlib.reload(should_log)

In [3]:
# Shared KEGG client: used for enzyme searches (k.find) and entry downloads (k.get).
k = bioservices.kegg.KEGG()
# Shared parser that turns raw KEGG flat-file responses into dict-like entries.
parser = bioservices.kegg.KEGGParser()

# This file is still a work in progress (WIP):
### TODO:
- Convert the enzymes to SMILES strings
- Allow to fetch data from Kegg for ECs that are in Brenda (or other databases)
    - perhaps a master list of all flavoenzymes can be made first and then iterated over
        

### QUESTIONS:
- SMILES string
    - https://www.genome.jp/tools/simcomp/

### Configurables
Edit the following options before running the scraper.

In [None]:
# Forwarded to the cache layer by kegg_request.
useCaching = True # setting this parameter to False will fetch new data from server, instead of using cache

# Search terms for the KEGG enzyme database; the IDs found for each keyword are unioned.
keywords = ['FAD', 'FMN', 'flavin', 'flavoenzyme', 'flavo']
# Past results are read from import_file and the merged results are written to export_file.
# Both currently point at the same file, so a run that finds new IDs overwrites it
# with the previous entries plus the newly fetched ones.
import_file = "export/kegg.json"
export_file = "export/kegg.json"
# EC number used as the example entry in the playground cells below.
poster_child = '1.14.13.2'

# CSV files with an 'ec' column, read by scrape_kegg: IDs to add to the fetch set
# (whitelist) and IDs to exclude from it (blacklist).
whitelist_path = Path('modules/scrapers/whitelist.csv')
blacklist_path = Path('modules/scrapers/blacklist.csv')

### Helper functions

In [5]:
def read_past_data(path=import_file):
    """Load previously exported results from a JSON file.

    Args:
        path: Location of the JSON file to read. Defaults to ``import_file``.

    Returns:
        The deserialized JSON content, or an empty dict when the file cannot
        be opened or does not contain valid JSON (for example on a first run,
        before any export exists).
    """
    try:
        with open(path) as json_file:
            return json.load(json_file)
    except (OSError, ValueError):
        # OSError covers a missing or unreadable file; ValueError covers
        # malformed JSON (json.JSONDecodeError) and undecodable bytes.
        # Anything else (e.g. KeyboardInterrupt) is deliberately not swallowed.
        return {}

In [6]:
def get_ids(keyword):
    """Search the KEGG enzyme database and return the matching entry IDs.

    The response of ``k.find`` is treated as newline-separated rows whose
    first tab-separated column is the entry ID; empty rows are ignored.

    Args:
        keyword: Search term passed to KEGG as the query.

    Returns:
        A set with the ID of every row in the search result.
    """
    response = k.find(database='enzyme', query=keyword)
    return {row.split('\t')[0] for row in response.split('\n') if row}

In [7]:
def get_ids_for_keywords(keywords):
    """Collect the KEGG enzyme IDs found for any of the given keywords.

    Args:
        keywords: Iterable of search terms; each one is looked up with
            ``get_ids``.

    Returns:
        The union of the ID sets of all keywords (an empty set when
        ``keywords`` is empty).
    """
    return set().union(*(get_ids(term) for term in keywords))

In [8]:
def kegg_request(id):
    """Fetch and parse one KEGG enzyme entry and annotate its compounds.

    The raw entry is obtained through the cache layer (which falls back to
    ``k.get``) and parsed with ``parser``. The 'SUBSTRATE' and 'PRODUCT'
    fields are each replaced, when present, by the mapping produced by
    ``get_compounds_with_smiles``. The two fields are handled independently,
    so an entry that has only one of them no longer raises ``KeyError`` or
    leaves the other unconverted.

    Args:
        id: KEGG enzyme identifier, forwarded to the cache layer as ``ec``
            (the notebook uses both 'ec:1.14.13.2' and '1.14.13.2' forms).

    Returns:
        The parsed entry as returned by ``parser.parse``, with the compound
        fields converted as described above.
    """
    # "reqest" (sic) is the spelling of the helper in the cache module.
    resp = cache.kegg_cached_reqest(
        request_name='kegg_ec_request',
        ec=id,
        request_fn=k.get,
        useCaching=useCaching,
    )
    ec_parse = parser.parse(resp)
    for field in ('SUBSTRATE', 'PRODUCT'):
        if field in ec_parse.keys():
            ec_parse[field] = get_compounds_with_smiles(ec_parse[field])
    return ec_parse

In [9]:
def get_compounds_with_smiles(list_of_compounds):
    """Annotate KEGG compound names with their compound ID and SMILES string.

    Each input string looks like ``"4-hydroxybenzoate [CPD:C00156];"``. All
    semicolons are removed, and a ``[CPD:...]`` tag, when present, is split
    off to get the short name and the ``cpd:`` identifier used for the
    SMILES lookup.

    Args:
        list_of_compounds: Iterable of compound strings from a parsed KEGG
            entry.

    Returns:
        Dict keyed by short compound name. Each value holds 'name', 'smiles',
        'kegg_name' (the full string without semicolons) and 'kegg_id'.
        Compounds without a ``[CPD:...]`` tag are kept with 'smiles' and
        'kegg_id' set to None. Compounds that have a tag but whose SMILES
        lookup fails or returns nothing are left out.
    """
    new_compounds = {}
    # Raw string: "\[" in a plain literal is an invalid escape sequence.
    pattern = re.compile(r'(.+) \[CPD:(.+)\]')

    for compound in list_of_compounds:
        compound = compound.replace(';', '')  # this will be the full name
        name = compound  # this will be the short name
        smiles_str = None
        cpd_number = None

        # e.g. "4-hydroxybenzoate [CPD:C00156]" -> ('4-hydroxybenzoate', 'C00156').
        # A name that merely contains "CPD" without the bracketed tag does not
        # match and is treated like any other compound without a CPD number.
        match = pattern.search(compound)
        if match is None:
            log(f'⚠️ Compound "{compound}" is missing a CPD number', 'debug')
        else:
            name, cpd_id = match.groups()
            # the lookup key must look like "cpd:C00156"
            cpd_number = 'cpd:' + cpd_id
            try:
                smiles_str = cache.get_smile_string(cpd_number)
            except Exception as e:
                log(f"{e}", 'debug')
                continue
            log(f"compound:{name}, cpd_number:{cpd_number},name:{name},smiles: {smiles_str}\n\n", 'debug')
            if not smiles_str:
                log(f'Compound: {cpd_number} does not have a SMILES string.', 'debug')
                continue

        new_compounds[name] = {
            'name': name,
            'smiles': smiles_str,
            'kegg_name': compound,
            'kegg_id': cpd_number,
        }
    return new_compounds

In [10]:
def get_all_data(ids, previous_json, verbose=False):
    """Fetch the KEGG entry for every ID and merge it into the past results.

    Args:
        ids: Sized iterable of KEGG enzyme IDs (e.g. 'ec:1.14.13.2').
        previous_json: Dict of earlier results keyed by EC number. It is
            updated in place and is also the return value.
        verbose: Accepted for backward compatibility; currently unused
            (logging is controlled by the ``log`` / ``should_log`` helpers).

    Returns:
        ``previous_json`` with one entry per fetched ID, keyed by the EC
        number (the ID without its 'ec:' prefix). Each entry also gets the
        'KEGG_ID' and 'EC_NUMBER' fields.
    """
    data_dict = previous_json  # same object: the caller's dict is extended

    log(f'Getting data for following IDs - {ids}', 'debug')
    total = len(ids)
    for index, kegg_id in enumerate(ids):
        ec_data = kegg_request(kegg_id)
        ec_number = kegg_id.replace('ec:', '')

        ec_data['KEGG_ID'] = kegg_id
        ec_data['EC_NUMBER'] = ec_number
        data_dict[ec_number] = ec_data

        log(f'Getting following data - {data_dict[ec_number]}', 'silly')
        if should_log(message_verbosity='info'):
            print_percent_done(index=index, length=total)

    return data_dict

In [11]:
# Preview the whitelist: the set of values in the 'ec' column of the whitelist CSV.
white_list_ids = set(pd.read_csv(whitelist_path)['ec'])
white_list_ids

{'ec:1.14.13.2'}

In [12]:
def scrape_kegg():
    """Fetch all flavin-related KEGG enzymes that are missing from the export.

    Steps:
        1. Read the whitelist / blacklist CSVs and the previously exported
           results.
        2. Search KEGG for every configured keyword and work out which IDs
           still need fetching.
        3. Fetch those entries, merge them into the previous results and
           write everything to ``export_file``.

    Nothing is written when no new IDs are found.
    """
    # this needs to be read from the outside dir
    # TODO: add all the odd Brenda ECs to the blacklist
    white_list_ids = set(pd.read_csv(whitelist_path)['ec'])
    black_list_ids = set(pd.read_csv(blacklist_path)['ec'])

    log('Kegg scraping script started...', 'info')
    # Reading past results from KEGG
    previous_json = read_past_data()
    log(f'1. Successfully read previous json data, that has total of {len(previous_json)} records', 'success')
    prev_ids = {enzyme['KEGG_ID'] for enzyme in previous_json.values()}

    # Getting IDs of all entries that match the keywords
    new_ids = get_ids_for_keywords(keywords)
    # possible alternative: new_ids = set(brenda_blacklist) - set(benda_not_found)

    # Candidates (whitelist + search hits) minus everything already stored or
    # blacklisted. The parentheses are crucial for the order of operations.
    all_ids = (white_list_ids | new_ids) - (prev_ids | black_list_ids)

    if not all_ids:
        log("Doesn't look like there are any new flavins on KEGG!", 'info')
        return

    log('2. Following potential flavins are missing from past results:', 'info')
    if should_log('info'):
        for ec in all_ids:
            print(f'- {ec}')

    # Scraping the data
    log('\n3. Fetching the data', 'info')
    flavins = get_all_data(all_ids, previous_json, verbose=True)

    # Writing out the results to the file. The export directory may not exist
    # yet on a first run, so create it rather than lose the fetched data.
    Path(export_file).parent.mkdir(parents=True, exist_ok=True)
    with open(export_file, 'w') as outfile:
        json.dump(flavins, outfile)
    log(f'\nSuccessfully written out {len(all_ids)} results to "{export_file}"', 'success')

### Running the program

In [None]:
# Runs the full scrape; the export file is only written when new IDs are found.
scrape_kegg()

---------------
## The rest is a playground

In [16]:
# Fetch the example entry (poster_child) to inspect what kegg_request returns.
x = kegg_request(poster_child)

In [20]:
# Top-level fields of the parsed entry.
x.keys()

dict_keys(['ENTRY', 'NAME', 'CLASS', 'SYSNAME', 'REACTION', 'SUBSTRATE', 'PRODUCT', 'COMMENT', 'HISTORY', 'REFERENCE', 'ORTHOLOGY', 'GENES', 'DBLINKS'])

In [22]:
# Products after kegg_request has replaced them with the mapping from get_compounds_with_smiles.
x['PRODUCT']

{'1-acyl-2-[(3E)-hexadec-3-enoyl]-[glycerolipid]': {'name': '1-acyl-2-[(3E)-hexadec-3-enoyl]-[glycerolipid]',
  'smiles': None,
  'kegg_name': '1-acyl-2-[(3E)-hexadec-3-enoyl]-[glycerolipid]',
  'kegg_id': None},
 'H2O': {'name': 'H2O',
  'smiles': '[H]O[H]',
  'kegg_name': 'H2O [CPD:C00001]',
  'kegg_id': 'cpd:C00001'}}

In [None]:
# Lists the EC numbers of all stored entries that have no (or an empty) SYSNAME
kegg = read_past_data()
sysnames = [ec for ec, details in kegg.items() if not details.get('SYSNAME')]
sysnames

### Seeing how many brenda flavins are in Kegg

In [None]:
# Try every Brenda EC number against KEGG and collect the ones that fail.
brenda = read_past_data('export/brenda.json')
brenda_ecs = sorted(brenda.keys())
benda_not_found = []
for brenda_ec in brenda_ecs:
    try:
        kegg_request(brenda_ec)
    except Exception:
        # Any failure of the request or parsing counts as "not found".
        # Catching Exception (not a bare except) keeps kernel interrupts
        # working during this long-running loop.
        benda_not_found.append(brenda_ec)

In [None]:
# Number of Brenda EC numbers for which kegg_request raised an error.
len(benda_not_found)

In [None]:
# Download and parse the KEGG compound entry cpd:C00156 to inspect its fields.
CPD = k.parse(k.get("cpd:C00156"))

In [None]:
# Fields available on the parsed compound entry.
CPD.keys()

In [None]:
# The FORMULA field of the parsed compound entry.
CPD['FORMULA']

In [None]:
import requests

In [None]:

    
# Query the KEGG REST "conv" endpoint for the ID mapping between two databases.
target_db = 'uniprot'
source_db = 'compound'
cpd = 'cpd:C00156'  # not used in this URL; the whole database pair is requested
url = f'http://rest.kegg.jp/conv/{target_db}/{source_db}'
# A timeout keeps the cell from hanging forever if the server does not answer.
resp = requests.get(url, timeout=30)
print(resp.text)

In [None]:
# perhaps we can use openbabel or pybel?
# https://pypi.org/project/openbabel/3.0.0/
# to convert to smiles string?

### Converting Kegg entry to SMILES

In [None]:
# Request the compound entry from the KEGG DBGET web endpoint and print the raw response.
cpd = 'cpd:C00156'
url = f'https://www.genome.jp/dbget-bin/www_bget?-f+m+compound+{cpd}'
# A timeout keeps the cell from hanging forever if the server does not answer.
resp = requests.get(url, timeout=30)
print(resp.text)

In [None]:
# chebi_con = bioservices.ChEBI()

# Parse the KEGG compound entry and read its cross-references to other databases.
kegg_entry = parser.parse(k.get('cpd:C00156'))
# The ChEBI value is split on spaces (presumably one ID per token; verify against a real entry).
# A missing 'DBLINKS' or 'ChEBI' key raises KeyError here.
ChEBI_links = kegg_entry['DBLINKS']['ChEBI'].split(' ')
# ChEMBL is read with .get, so it is None when the entry has no ChEMBL link.
ChEMBL_links = kegg_entry['DBLINKS'].get('ChEMBL', None)
print(ChEBI_links)

# chebi_entry = chebi_con.getCompleteEntity('CHEBI:' + kegg_entry['DBLINKS']['ChEBI'])

# print (chebi_entry.smiles)
# print (chebi_entry.inchi)
# print (chebi_entry.inchiKey)


# >>> from bioservices import *
# >>> ch = ChEBI()
# >>> res = ch.getCompleteEntity("CHEBI:27732")
# >>> res.smiles
# CN1C(=O)N(C)c2ncn(C)c2C1=O


In [None]:
# NOTE(review): map_kegg_chebi is not defined in any cell visible in this notebook;
# on a fresh kernel this cell raises NameError. Define it or remove the cell.
map_kegg_chebi['cpd:C00156']

In [None]:
# ChEMBL cross-reference of the compound entry (None when the entry has none).
ChEMBL_links

In [None]:
# Full parsed compound entry for cpd:C00156.
kegg_entry

In [None]:
# Fetch the example enzyme entry (poster_child) again under a descriptive name.
PC_data = kegg_request(poster_child)

In [None]:
# Full parsed entry for the example enzyme.
PC_data

In [None]:
# Top-level fields of the example enzyme entry.
PC_data.keys()

In [None]:
# Short names of the products (keys of the mapping built by get_compounds_with_smiles).
PC_data['PRODUCT'].keys()

In [None]:
# kegg_request already replaces SUBSTRATE and PRODUCT with the SMILES-annotated
# mappings, so they can be inspected directly. The earlier call to
# get_updated_compounds referred to a function that is not defined in this notebook.
data = kegg_request(poster_child)
substrates = data['SUBSTRATE']
products = data['PRODUCT']

substrates