In [14]:
import bioservices
import json
import re

from helpers import print_percent_done

In [47]:
import importlib
# importlib.reload(cache)

import cache
# importlib.reload(cache)

import helpers
from helpers import log 
from helpers import should_log
importlib.reload(helpers)
importlib.reload(cache)


verbositydebug, VERBOSITYinfo


<module 'cache' from '/Users/oneoPk/Desktop/flavo/flavoenzymes/cache.py'>

In [22]:
k = bioservices.kegg.KEGG()
parser = bioservices.kegg.KEGGParser()

# This file is WIP still:
### TODO:
- Convert the enzymes to a SMILES strings
- Allow to fetch data from Kegg for ECs that are in Brenda (or other databases)
    - perhaps first, a master list of all FlavoEnzymes can be made, then itterated over
        

### QUESTIONS:
- SMILES string
    - https://www.genome.jp/tools/simcomp/

### Configurables
Edit the folling options before running the scraper

In [23]:
useCaching = True # setting this parameter to False will fetch new data from server, instead of using cache
# VERBOSITY = 'info' # set to 'none' to have almost no output

keywords = ['FAD', 'FMN', 'flavin', 'flavoenzyme', 'flavo']
import_file = "export/kegg.json"
export_file = "export/kegg.json"
poster_child = '1.14.13.2'

### Helper functions

In [24]:
def read_past_data(path=import_file):
    try:
        with open(path) as json_file:
            return json.load(json_file)
    except:
        return {}

In [25]:
def get_ids(keyword):
    results = k.find(database='enzyme', query=keyword)
    results_array = results.split('\n')
    ids_array = [i.split('\t')[0] for i in results_array if (i)]
    return set(ids_array)

In [26]:
def get_ids_for_keywords(keywords):
    all_ids_set = set()
    for keyword in keywords:
        kw_ids = get_ids(keyword);
        all_ids_set = all_ids_set | kw_ids
    return all_ids_set

In [27]:
def kegg_request(id):
    resp = cache.kegg_cached_reqest(request_name=f'kegg_ec_request', ec=id, request_fn=k.get, useCaching=useCaching)
    ec_parse = parser.parse(resp)
    ec_parse['SUBSTRATE'] = get_compounds_with_smiles(ec_parse['SUBSTRATE'])
    ec_parse['PRODUCT'] = get_compounds_with_smiles(ec_parse['PRODUCT'])
    return ec_parse

In [28]:
def get_compounds_with_smiles(list_of_compounds):
    new_compounds = {}
    pattern = '(.+) \[CPD:(.+)\]'

    for compound in list_of_compounds:
        if 'CPD' in compound:
            # this breaks the strings like "4-hydroxybenzoate [CPD:C00156];" into
            # an array of tuples like this: [('4-hydroxybenzoate', 'C00156')]
            match = re.findall(pattern, compound)[0]
            
            # constructing the cpd number, it must be like "cpd:C00156" 
            name = match[0]
            cpd_number = 'cpd:' + match[1]
            
            try:
                smiles_str = cache.get_smile_string(cpd_number)
            except Exception as e:
                new_compounds[name] = {
                    'name': name,
                }
                log(f"{e}", 'warning')
                continue
            
            # updating the entree
            new_compounds[name] = {
                'name': name,
                'smiles': smiles_str,
                'kegg_name': compound,
            }
            log(f"compound:{compound}, cpd_number:{cpd_number},name:{name},smiles: {smiles_str}\n\n", 'debug')
        else:
            name = compound
            # this will throw an exception that will need to be handled
            # if this is anoying, can convert it to a print statement
            log(f'⚠️ Compound "{compound}" is missing a CPD number', 'warning')
            new_compounds[name] = {
                'name': name,
            }
            
    return new_compounds

In [29]:
def get_all_data(ids, previous_json, verbose=False):    
    data_dict = previous_json
    
    log(f'Getting data for following IDs - {ids}', 'debug')
    for index,id in enumerate(ids):
        ec_data = kegg_request(id)
        name = ec_data.get('SYSNAME', id) # if sysname not found, will use ec_number
        ec_number = id.replace('ec:','')
        
        ec_data['KEGG_ID'] = id
        ec_data['EC_NUMBER'] = ec_number
        data_dict[ec_number] = ec_data
        
        log(f'Getting following data - {data_dict[ec_number]}', 'debug')
        if should_log(verbosity='info'):
            print_percent_done(index=index, length=len(ids))
            
    return data_dict

In [30]:
# this needs to be read from the outside dir
black_list_ids = set()
white_list_ids = set()


def scrape_kegg():
    log(f'Kegg scraping script started...','debug')
    # Reading past results from KEGG
    previous_json = read_past_data()
    log(f'1. Successfully read previous json data, that has total of {len(previous_json)} records', 'info')
    prev_ids = {enzyme['KEGG_ID'] for enzyme in list(previous_json.values())}
    
    # Getting IDs of all entries that are missing from past
    new_ids = get_ids_for_keywords(keywords)
    # new_ids = set(brenda_blacklist) - set(benda_not_found)
    
    # TODO: make sure this line of code is not crazy! 
    # note: the parentecies are crucial here for order of operations
    all_ids = (white_list_ids|new_ids)-(prev_ids|black_list_ids)
                
    # If new ids have been found, fetch the data
    if len(all_ids) > 0:
        log(f'2. Following potential flavins are missing from past results:','info')
        if should_log('info'):
            [print(f'- {ec}') for ec in all_ids]
    
        # Scraping the data
        log(f'\n3. Fetching the data','info')
        flavins = get_all_data(all_ids, previous_json, verbose=True)
        
        # Writing out the results to the file
        with open(export_file, 'w') as outfile:
            json.dump(flavins, outfile)
        log(f'\nSuccessfully written out {len(all_ids)} results to "{export_file}"','info')
    else:
        log("Doesn't look like there are any new flavins on KEGG!",'warning')

### Running the program

In [48]:
# VERBOSITY levels:         less_verbose <---- 'none' | 'error' | 'warning' | 'info' | 'debug' ----> more_verbose
x = kegg_request('1.14.19.43')

verbositydebug, VERBOSITYinfo
verbositydebug, VERBOSITYinfo
verbositydebug, VERBOSITYinfo
verbositydebug, VERBOSITYinfo
verbositydebug, VERBOSITYinfo
verbositydebug, VERBOSITYinfo
verbositydebug, VERBOSITYinfo
verbositydebug, VERBOSITYinfo
verbositydebug, VERBOSITYinfo


In [48]:
scrape_kegg()

18:28:49 [info]: 1. Successfully read previous json data, that has total of 0 records
18:28:52 [info]: 2. Following potential flavins are missing from past results:
- ec:3.1.3.102
- ec:2.1.1.75
- ec:2.8.2.28
- ec:1.14.14.20
- ec:1.14.13.32
- ec:1.14.20.6
- ec:1.3.8.11
- ec:3.5.99.1
- ec:1.3.8.14
- ec:1.14.14.34
- ec:1.14.19.33
- ec:1.14.19.43
- ec:1.5.1.41
- ec:2.1.1.155
- ec:2.1.1.83
- ec:1.5.1.39
- ec:2.4.1.115
- ec:6.2.1.59
- ec:1.14.19.3
- ec:2.8.2.26
- ec:1.7.1.17
- ec:1.14.14.8
- ec:1.12.98.1
- ec:1.14.14.27
- ec:1.14.14.3
- ec:2.1.1.231
- ec:1.5.1.20
- ec:2.7.10.2
- ec:2.1.1.343
- ec:1.14.14.82
- ec:2.1.1.148
- ec:2.7.1.180
- ec:2.7.1.26
- ec:2.1.1.150
- ec:1.5.1.40
- ec:1.14.20.5
- ec:2.4.2.35
- ec:6.2.1.41
- ec:1.1.5.4
- ec:3.6.1.18
- ec:1.19.1.1
- ec:1.14.19.31
- ec:1.14.19.35
- ec:1.5.1.38
- ec:2.7.8.28
- ec:1.14.14.9
- ec:1.3.1.45
- ec:1.14.19.34
- ec:1.14.19.30
- ec:3.2.1.161
- ec:1.4.3.4
- ec:1.14.14.5
- ec:1.14.19.22
- ec:2.3.1.116
- ec:2.1.1.74
- ec:2.7.7.2
- ec:1.14.13

KeyboardInterrupt: 

In [None]:
# prints all the ECs of items without SYSNAME
kegg = read_past_data()
sysnames = [ec for (ec, details) in list(kegg.items()) if not details.get('SYSNAME')]
sysnames

### Seeing how many brenda flavins are in Kegg

In [None]:
brenda = read_past_data('export/brenda.json')
brenda_ecs = sorted(brenda.keys())
# brenda_flavins = get_all_data(brenda_ecs, previous_json = {}, verbose=True)
benda_not_found=[]
for index,id in enumerate(brenda_ecs):
        try:
            ec_data = kegg_request(id)
        except:
            benda_not_found.append(id)

In [None]:
len(benda_not_found)

In [None]:
CPD = k.parse(k.get("cpd:C00156"))

In [None]:
CPD.keys()

In [None]:
CPD['FORMULA']

In [None]:
import requests

In [None]:

    
target_db = 'uniprot'
source_db = 'compound'
cpd = 'cpd:C00156'
url = f'http://rest.kegg.jp/conv/{target_db}/{source_db}'
resp = requests.get(url)
print(resp.text)

In [None]:
# perhaps we can use openbable or pybel?
# https://pypi.org/project/openbabel/3.0.0/
# to convert to smiles string?

### Converting Kegg entry to SMILES

In [None]:
cpd = 'cpd:C00156'
url = f'https://www.genome.jp/dbget-bin/www_bget?-f+m+compound+{cpd}'
resp = requests.get(url)
print(resp.text)

In [None]:
# chebi_con = bioservices.ChEBI()

kegg_entry = parser.parse(k.get('cpd:C00156'))
ChEBI_links = kegg_entry['DBLINKS']['ChEBI'].split(' ')
ChEMBL_links = kegg_entry['DBLINKS'].get('ChEMBL', None)
print(ChEBI_links)

# chebi_entry = chebi_con.getCompleteEntity('CHEBI:' + kegg_entry['DBLINKS']['ChEBI'])

# print (chebi_entry.smiles)
# print (chebi_entry.inchi)
# print (chebi_entry.inchiKey)


# >>> from bioservices import *
# >>> ch = ChEBI()
# >>> res = ch.getCompleteEntity("CHEBI:27732")
# >>> res.smiles
# CN1C(=O)N(C)c2ncn(C)c2C1=O


In [None]:
map_kegg_chebi['cpd:C00156']

In [None]:
ChEMBL_links

In [None]:
kegg_entry

In [None]:
PC_data = kegg_request(poster_child)

In [None]:
PC_data.keys()

In [None]:
PC_data['PRODUCT'].keys()

In [None]:
data = kegg_request(poster_child)
substrates = data['SUBSTRATE']
products = data['PRODUCT']

get_updated_compounds(substrates)
# get_updated_compounds(products)